From 5a42351cf1a708c9bacd54b5cc04321f25b0cead Mon Sep 17 00:00:00 2001 From: tr4umatic4i Date: Wed, 3 Jun 2020 20:20:05 -0700 Subject: [PATCH 0001/1025] add support for na_rep when using pd.NA in _format_strings See #33950 --- pandas/io/formats/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 02339f4344d4d..c1d40726a0993 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1207,7 +1207,7 @@ def _format(x): if x is None: return "None" elif x is NA: - return str(NA) + return self.na_rep elif x is NaT or np.isnat(x): return "NaT" except (TypeError, ValueError): From a61c396f79ec8cff7b30032b8902fa1299657d14 Mon Sep 17 00:00:00 2001 From: tr4umatic4i Date: Thu, 4 Jun 2020 09:55:15 -0700 Subject: [PATCH 0002/1025] Fix expressions to retain default behavior when na_rep is not set --- pandas/io/formats/format.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index c1d40726a0993..7d610deabea5d 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1206,8 +1206,10 @@ def _format(x): # determine na_rep if x is None or NaT-like if x is None: return "None" - elif x is NA: + elif x is NA and self.na_rep != 'NaN': return self.na_rep + elif x is NA: + return str(NA) elif x is NaT or np.isnat(x): return "NaT" except (TypeError, ValueError): From c46a567fa4a204815ebe894daa45ffc330f896ca Mon Sep 17 00:00:00 2001 From: tr4umatic4i Date: Fri, 5 Jun 2020 13:05:48 -0700 Subject: [PATCH 0003/1025] Add initial tests for na_rep --- pandas/tests/io/formats/test_format.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index c1850826926d8..f9457c5a7ef97 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2866,6 +2866,10 @@ def dtype(self): 
expected = "0 [False True]\n" + "1 [ True False]\n" + "dtype: DtypeStub" assert res == expected + def test_na_rep(self): + from pandas.io.formats.format import GenericArrayFormatter + assert GenericArrayFormatter(np.array([pd.NA,str(float('nan'))])).get_result() == [' '+str(pd.NA),' '+str((float('nan')))] + assert GenericArrayFormatter(np.array([pd.NA,float('nan')]),na_rep='test').get_result() == [' test',' test'] def _three_digit_exp(): return f"{1.7e8:.4g}" == "1.7e+008" From 2569deceb4310705721dea63ed88284bbdd50aab Mon Sep 17 00:00:00 2001 From: tr4umatic4i Date: Fri, 5 Jun 2020 14:15:25 -0700 Subject: [PATCH 0004/1025] cleanup of test code --- pandas/tests/io/formats/test_format.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index f9457c5a7ef97..d82c57daf70f1 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2868,8 +2868,9 @@ def dtype(self): def test_na_rep(self): from pandas.io.formats.format import GenericArrayFormatter - assert GenericArrayFormatter(np.array([pd.NA,str(float('nan'))])).get_result() == [' '+str(pd.NA),' '+str((float('nan')))] - assert GenericArrayFormatter(np.array([pd.NA,float('nan')]),na_rep='test').get_result() == [' test',' test'] + test_array = np.array([pd.NA, float('nan')]) + assert GenericArrayFormatter(test_array).get_result() == [' ' + str(pd.NA), ' ' + str('NaN')] + assert GenericArrayFormatter(test_array,na_rep='test').get_result() == [' test', ' test'] def _three_digit_exp(): return f"{1.7e8:.4g}" == "1.7e+008" @@ -2881,6 +2882,7 @@ def test_misc(self): result = obj.get_result() assert len(result) == 0 + def test_format(self): obj = fmt.FloatArrayFormatter(np.array([12, 0], dtype=np.float64)) result = obj.get_result() From 2b0392ffe332a0ab2e30a3315d19c78d4fbf7e38 Mon Sep 17 00:00:00 2001 From: tr4umatic4i Date: Fri, 5 Jun 2020 14:43:30 -0700 Subject: [PATCH 0005/1025] 
refactor leading whitespace note NaN still has leading whitespace. --- pandas/tests/io/formats/test_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index d82c57daf70f1..30cf2499cf560 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2869,8 +2869,8 @@ def dtype(self): def test_na_rep(self): from pandas.io.formats.format import GenericArrayFormatter test_array = np.array([pd.NA, float('nan')]) - assert GenericArrayFormatter(test_array).get_result() == [' ' + str(pd.NA), ' ' + str('NaN')] - assert GenericArrayFormatter(test_array,na_rep='test').get_result() == [' test', ' test'] + assert GenericArrayFormatter(test_array,leading_space=False).get_result() == [str(pd.NA), ' NaN'] + assert GenericArrayFormatter(test_array,na_rep='test',leading_space= False).get_result() == ['test', 'test'] def _three_digit_exp(): return f"{1.7e8:.4g}" == "1.7e+008" From ba5f901b852b47accd2c8dfd05d5cb8d855d9ff5 Mon Sep 17 00:00:00 2001 From: tr4umatic4i Date: Fri, 5 Jun 2020 14:45:52 -0700 Subject: [PATCH 0006/1025] more cleanup --- pandas/tests/io/formats/test_format.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 30cf2499cf560..6b0de0e657946 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2867,10 +2867,10 @@ def dtype(self): assert res == expected def test_na_rep(self): - from pandas.io.formats.format import GenericArrayFormatter + from pandas.io.formats.format import GenericArrayFormatter as gaf test_array = np.array([pd.NA, float('nan')]) - assert GenericArrayFormatter(test_array,leading_space=False).get_result() == [str(pd.NA), ' NaN'] - assert GenericArrayFormatter(test_array,na_rep='test',leading_space= False).get_result() == ['test', 'test'] + assert 
gaf(test_array, leading_space=False).get_result() == [str(pd.NA), ' NaN'] + assert gaf(test_array, na_rep='test', leading_space=False).get_result() == ['test', 'test'] def _three_digit_exp(): return f"{1.7e8:.4g}" == "1.7e+008" From 1494bea6cf22340fd470732b87d6abb09fadd56e Mon Sep 17 00:00:00 2001 From: tr4umatic4i Date: Fri, 5 Jun 2020 15:30:10 -0700 Subject: [PATCH 0007/1025] more cleanup #2 --- pandas/tests/io/formats/test_format.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 6b0de0e657946..b925e7266db6c 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2869,8 +2869,10 @@ def dtype(self): def test_na_rep(self): from pandas.io.formats.format import GenericArrayFormatter as gaf test_array = np.array([pd.NA, float('nan')]) + val_na = ['x', 'x'] assert gaf(test_array, leading_space=False).get_result() == [str(pd.NA), ' NaN'] - assert gaf(test_array, na_rep='test', leading_space=False).get_result() == ['test', 'test'] + assert gaf(test_array, na_rep='x', leading_space=False).get_result() == val_na + def _three_digit_exp(): return f"{1.7e8:.4g}" == "1.7e+008" @@ -2882,7 +2884,6 @@ def test_misc(self): result = obj.get_result() assert len(result) == 0 - def test_format(self): obj = fmt.FloatArrayFormatter(np.array([12, 0], dtype=np.float64)) result = obj.get_result() From 6b813e7537d060aada445af3f25121f207338d3d Mon Sep 17 00:00:00 2001 From: tr4umatic4i Date: Fri, 5 Jun 2020 15:52:28 -0700 Subject: [PATCH 0008/1025] fixing linting --- pandas/io/formats/format.py | 2 +- pandas/tests/io/formats/test_format.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 7d610deabea5d..b94b56da7b279 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1206,7 +1206,7 @@ def _format(x): # determine na_rep if x 
is None or NaT-like if x is None: return "None" - elif x is NA and self.na_rep != 'NaN': + elif x is NA and self.na_rep != "NaN": return self.na_rep elif x is NA: return str(NA) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index b925e7266db6c..60b9375dbef38 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2868,10 +2868,11 @@ def dtype(self): def test_na_rep(self): from pandas.io.formats.format import GenericArrayFormatter as gaf - test_array = np.array([pd.NA, float('nan')]) - val_na = ['x', 'x'] - assert gaf(test_array, leading_space=False).get_result() == [str(pd.NA), ' NaN'] - assert gaf(test_array, na_rep='x', leading_space=False).get_result() == val_na + + test_array = np.array([pd.NA, float("nan")]) + val_na = ["x", "x"] + assert gaf(test_array, leading_space=False).get_result() == [str(pd.NA), " NaN"] + assert gaf(test_array, na_rep="x", leading_space=False).get_result() == val_na def _three_digit_exp(): From dbc6feaf345d8fc66095cb83f7a5e4fdac722f89 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 3 Jun 2020 22:51:15 -0700 Subject: [PATCH 0009/1025] REF/PERF: PeriodDtype decouple from DateOffset (#34499) --- pandas/_libs/tslibs/dtypes.pxd | 56 +++++++++++++++ pandas/_libs/tslibs/dtypes.pyx | 108 ++++++++++++++++++++++++++++ pandas/_libs/tslibs/frequencies.pyx | 69 +----------------- pandas/_libs/tslibs/offsets.pyx | 31 +++++++- pandas/_libs/tslibs/period.pyx | 50 +++++++------ pandas/tests/tslibs/test_api.py | 1 + setup.py | 3 +- 7 files changed, 228 insertions(+), 90 deletions(-) create mode 100644 pandas/_libs/tslibs/dtypes.pxd create mode 100644 pandas/_libs/tslibs/dtypes.pyx diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd new file mode 100644 index 0000000000000..23c473726e5a9 --- /dev/null +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -0,0 +1,56 @@ + +cdef enum PeriodDtypeCode: + # Annual freqs with various fiscal year ends. 
+ # eg, 2005 for A_FEB runs Mar 1, 2004 to Feb 28, 2005 + A = 1000 # Default alias + A_DEC = 1000 # Annual - December year end + A_JAN = 1001 # Annual - January year end + A_FEB = 1002 # Annual - February year end + A_MAR = 1003 # Annual - March year end + A_APR = 1004 # Annual - April year end + A_MAY = 1005 # Annual - May year end + A_JUN = 1006 # Annual - June year end + A_JUL = 1007 # Annual - July year end + A_AUG = 1008 # Annual - August year end + A_SEP = 1009 # Annual - September year end + A_OCT = 1010 # Annual - October year end + A_NOV = 1011 # Annual - November year end + + # Quarterly frequencies with various fiscal year ends. + # eg, Q42005 for Q_OCT runs Aug 1, 2005 to Oct 31, 2005 + Q_DEC = 2000 # Quarterly - December year end + Q_JAN = 2001 # Quarterly - January year end + Q_FEB = 2002 # Quarterly - February year end + Q_MAR = 2003 # Quarterly - March year end + Q_APR = 2004 # Quarterly - April year end + Q_MAY = 2005 # Quarterly - May year end + Q_JUN = 2006 # Quarterly - June year end + Q_JUL = 2007 # Quarterly - July year end + Q_AUG = 2008 # Quarterly - August year end + Q_SEP = 2009 # Quarterly - September year end + Q_OCT = 2010 # Quarterly - October year end + Q_NOV = 2011 # Quarterly - November year end + + M = 3000 # Monthly + + W_SUN = 4000 # Weekly - Sunday end of week + W_MON = 4001 # Weekly - Monday end of week + W_TUE = 4002 # Weekly - Tuesday end of week + W_WED = 4003 # Weekly - Wednesday end of week + W_THU = 4004 # Weekly - Thursday end of week + W_FRI = 4005 # Weekly - Friday end of week + W_SAT = 4006 # Weekly - Saturday end of week + + B = 5000 # Business days + D = 6000 # Daily + H = 7000 # Hourly + T = 8000 # Minutely + S = 9000 # Secondly + L = 10000 # Millisecondly + U = 11000 # Microsecondly + N = 12000 # Nanosecondly + + +cdef class PeriodPseudoDtype: + cdef readonly: + PeriodDtypeCode dtype_code diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx new file mode 100644 index 
0000000000000..d0d4e579a456b --- /dev/null +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -0,0 +1,108 @@ +# period frequency constants corresponding to scikits timeseries +# originals + + +cdef class PeriodPseudoDtype: + """ + Similar to an actual dtype, this contains all of the information + describing a PeriodDtype in an integer code. + """ + # cdef readonly: + # PeriodDtypeCode dtype_code + + def __cinit__(self, PeriodDtypeCode code): + self.dtype_code = code + + def __eq__(self, other): + if not isinstance(other, PeriodPseudoDtype): + return False + if not isinstance(self, PeriodPseudoDtype): + # cython semantics, this is a reversed op + return False + return self.dtype_code == other.dtype_code + + @property + def date_offset(self): + """ + Corresponding DateOffset object. + + This mapping is mainly for backward-compatibility. + """ + from .offsets import to_offset + + freqstr = _reverse_period_code_map.get(self.dtype_code) + # equiv: freqstr = libfrequencies.get_freq_str(self.dtype_code) + + return to_offset(freqstr) + + @classmethod + def from_date_offset(cls, offset): + code = offset._period_dtype_code + return cls(code) + + +_period_code_map = { + # Annual freqs with various fiscal year ends. + # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005 + "A-DEC": 1000, # Annual - December year end + "A-JAN": 1001, # Annual - January year end + "A-FEB": 1002, # Annual - February year end + "A-MAR": 1003, # Annual - March year end + "A-APR": 1004, # Annual - April year end + "A-MAY": 1005, # Annual - May year end + "A-JUN": 1006, # Annual - June year end + "A-JUL": 1007, # Annual - July year end + "A-AUG": 1008, # Annual - August year end + "A-SEP": 1009, # Annual - September year end + "A-OCT": 1010, # Annual - October year end + "A-NOV": 1011, # Annual - November year end + + # Quarterly frequencies with various fiscal year ends. 
+ # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005 + "Q-DEC": 2000, # Quarterly - December year end + "Q-JAN": 2001, # Quarterly - January year end + "Q-FEB": 2002, # Quarterly - February year end + "Q-MAR": 2003, # Quarterly - March year end + "Q-APR": 2004, # Quarterly - April year end + "Q-MAY": 2005, # Quarterly - May year end + "Q-JUN": 2006, # Quarterly - June year end + "Q-JUL": 2007, # Quarterly - July year end + "Q-AUG": 2008, # Quarterly - August year end + "Q-SEP": 2009, # Quarterly - September year end + "Q-OCT": 2010, # Quarterly - October year end + "Q-NOV": 2011, # Quarterly - November year end + + "M": 3000, # Monthly + + "W-SUN": 4000, # Weekly - Sunday end of week + "W-MON": 4001, # Weekly - Monday end of week + "W-TUE": 4002, # Weekly - Tuesday end of week + "W-WED": 4003, # Weekly - Wednesday end of week + "W-THU": 4004, # Weekly - Thursday end of week + "W-FRI": 4005, # Weekly - Friday end of week + "W-SAT": 4006, # Weekly - Saturday end of week + + "B": 5000, # Business days + "D": 6000, # Daily + "H": 7000, # Hourly + "T": 8000, # Minutely + "S": 9000, # Secondly + "L": 10000, # Millisecondly + "U": 11000, # Microsecondly + "N": 12000, # Nanosecondly +} + +_reverse_period_code_map = { + _period_code_map[key]: key for key in _period_code_map} + +# Yearly aliases; careful not to put these in _reverse_period_code_map +_period_code_map.update({"Y" + key[1:]: _period_code_map[key] + for key in _period_code_map + if key.startswith("A-")}) + +_period_code_map.update({ + "Q": 2000, # Quarterly - December year end (default quarterly) + "A": 1000, # Annual + "W": 4000, # Weekly + "C": 5000, # Custom Business Day +}) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index 8246e24319dbd..8ca442de59f9f 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -12,6 +12,8 @@ from pandas._libs.tslibs.offsets import ( opattern, ) +from .dtypes import _period_code_map, 
_reverse_period_code_map + # --------------------------------------------------------------------- # Period codes @@ -31,73 +33,6 @@ class FreqGroup: FR_NS = 12000 -# period frequency constants corresponding to scikits timeseries -# originals -_period_code_map = { - # Annual freqs with various fiscal year ends. - # eg, 2005 for A-FEB runs Mar 1, 2004 to Feb 28, 2005 - "A-DEC": 1000, # Annual - December year end - "A-JAN": 1001, # Annual - January year end - "A-FEB": 1002, # Annual - February year end - "A-MAR": 1003, # Annual - March year end - "A-APR": 1004, # Annual - April year end - "A-MAY": 1005, # Annual - May year end - "A-JUN": 1006, # Annual - June year end - "A-JUL": 1007, # Annual - July year end - "A-AUG": 1008, # Annual - August year end - "A-SEP": 1009, # Annual - September year end - "A-OCT": 1010, # Annual - October year end - "A-NOV": 1011, # Annual - November year end - - # Quarterly frequencies with various fiscal year ends. - # eg, Q42005 for Q-OCT runs Aug 1, 2005 to Oct 31, 2005 - "Q-DEC": 2000, # Quarterly - December year end - "Q-JAN": 2001, # Quarterly - January year end - "Q-FEB": 2002, # Quarterly - February year end - "Q-MAR": 2003, # Quarterly - March year end - "Q-APR": 2004, # Quarterly - April year end - "Q-MAY": 2005, # Quarterly - May year end - "Q-JUN": 2006, # Quarterly - June year end - "Q-JUL": 2007, # Quarterly - July year end - "Q-AUG": 2008, # Quarterly - August year end - "Q-SEP": 2009, # Quarterly - September year end - "Q-OCT": 2010, # Quarterly - October year end - "Q-NOV": 2011, # Quarterly - November year end - - "M": 3000, # Monthly - - "W-SUN": 4000, # Weekly - Sunday end of week - "W-MON": 4001, # Weekly - Monday end of week - "W-TUE": 4002, # Weekly - Tuesday end of week - "W-WED": 4003, # Weekly - Wednesday end of week - "W-THU": 4004, # Weekly - Thursday end of week - "W-FRI": 4005, # Weekly - Friday end of week - "W-SAT": 4006, # Weekly - Saturday end of week - - "B": 5000, # Business days - "D": 6000, # Daily - 
"H": 7000, # Hourly - "T": 8000, # Minutely - "S": 9000, # Secondly - "L": 10000, # Millisecondly - "U": 11000, # Microsecondly - "N": 12000} # Nanosecondly - - -_reverse_period_code_map = { - _period_code_map[key]: key for key in _period_code_map} - -# Yearly aliases; careful not to put these in _reverse_period_code_map -_period_code_map.update({'Y' + key[1:]: _period_code_map[key] - for key in _period_code_map - if key.startswith('A-')}) - -_period_code_map.update({ - "Q": 2000, # Quarterly - December year end (default quarterly) - "A": 1000, # Annual - "W": 4000, # Weekly - "C": 5000}) # Custom Business Day - # Map attribute-name resolutions to resolution abbreviations _attrname_to_abbrevs = { "year": "A", diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 77b60d0c22322..63dc3407b4c55 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -48,6 +48,7 @@ from pandas._libs.tslibs.np_datetime cimport ( from pandas._libs.tslibs.timezones cimport utc_pytz as UTC from pandas._libs.tslibs.tzconversion cimport tz_convert_single +from .dtypes cimport PeriodDtypeCode from .timedeltas cimport delta_to_nanoseconds @@ -892,36 +893,43 @@ cdef class Tick(SingleConstructorOffset): cdef class Day(Tick): _nanos_inc = 24 * 3600 * 1_000_000_000 _prefix = "D" + _period_dtype_code = PeriodDtypeCode.D cdef class Hour(Tick): _nanos_inc = 3600 * 1_000_000_000 _prefix = "H" + _period_dtype_code = PeriodDtypeCode.H cdef class Minute(Tick): _nanos_inc = 60 * 1_000_000_000 _prefix = "T" + _period_dtype_code = PeriodDtypeCode.T cdef class Second(Tick): _nanos_inc = 1_000_000_000 _prefix = "S" + _period_dtype_code = PeriodDtypeCode.S cdef class Milli(Tick): _nanos_inc = 1_000_000 _prefix = "L" + _period_dtype_code = PeriodDtypeCode.L cdef class Micro(Tick): _nanos_inc = 1000 _prefix = "U" + _period_dtype_code = PeriodDtypeCode.U cdef class Nano(Tick): _nanos_inc = 1 _prefix = "N" + _period_dtype_code = PeriodDtypeCode.N def 
delta_to_tick(delta: timedelta) -> Tick: @@ -1281,7 +1289,7 @@ cdef class BusinessDay(BusinessMixin): """ DateOffset subclass representing possibly n business days. """ - + _period_dtype_code = PeriodDtypeCode.B _prefix = "B" _attributes = tuple(["n", "normalize", "offset"]) @@ -1945,6 +1953,15 @@ cdef class YearEnd(YearOffset): _prefix = "A" _day_opt = "end" + cdef readonly: + int _period_dtype_code + + def __init__(self, n=1, normalize=False, month=None): + # Because YearEnd can be the freq for a Period, define its + # _period_dtype_code at construction for performance + YearOffset.__init__(self, n, normalize, month) + self._period_dtype_code = PeriodDtypeCode.A + self.month % 12 + cdef class YearBegin(YearOffset): """ @@ -2099,6 +2116,14 @@ cdef class QuarterEnd(QuarterOffset): _prefix = "Q" _day_opt = "end" + cdef readonly: + int _period_dtype_code + + def __init__(self, n=1, normalize=False, startingMonth=None): + # Because QuarterEnd can be the freq for a Period, define its + # _period_dtype_code at construction for performance + QuarterOffset.__init__(self, n, normalize, startingMonth) + self._period_dtype_code = PeriodDtypeCode.Q_DEC + self.startingMonth % 12 cdef class QuarterBegin(QuarterOffset): """ @@ -2148,6 +2173,7 @@ cdef class MonthEnd(MonthOffset): """ DateOffset of one month end. 
""" + _period_dtype_code = PeriodDtypeCode.M _prefix = "M" _day_opt = "end" @@ -2452,6 +2478,7 @@ cdef class Week(SingleConstructorOffset): cdef readonly: object weekday # int or None + int _period_dtype_code def __init__(self, n=1, normalize=False, weekday=None): BaseOffset.__init__(self, n, normalize) @@ -2461,6 +2488,8 @@ cdef class Week(SingleConstructorOffset): if self.weekday < 0 or self.weekday > 6: raise ValueError(f"Day must be 0<=day<=6, got {self.weekday}") + self._period_dtype_code = PeriodDtypeCode.W_SUN + (weekday + 1) % 7 + cpdef __setstate__(self, state): self.n = state.pop("n") self.normalize = state.pop("normalize") diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 14cce1c000207..e88a20bc549bd 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -55,6 +55,9 @@ from pandas._libs.tslibs.ccalendar cimport ( get_days_in_month, ) from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS + +from pandas._libs.tslibs.dtypes cimport PeriodPseudoDtype + from pandas._libs.tslibs.frequencies cimport ( attrname_to_abbrevs, get_freq_code, @@ -1514,11 +1517,16 @@ cdef class _Period: cdef readonly: int64_t ordinal + PeriodPseudoDtype _dtype BaseOffset freq def __cinit__(self, int64_t ordinal, BaseOffset freq): self.ordinal = ordinal self.freq = freq + # Note: this is more performant than PeriodDtype.from_date_offset(freq) + # because from_date_offset cannot be made a cdef method (until cython + # supported cdef classmethods) + self._dtype = PeriodPseudoDtype(freq._period_dtype_code) @classmethod def _maybe_convert_freq(cls, object freq): @@ -1662,13 +1670,13 @@ cdef class _Period: """ freq = self._maybe_convert_freq(freq) how = validate_end_alias(how) - base1, mult1 = get_freq_code(self.freq) - base2, mult2 = get_freq_code(freq) + base1 = self._dtype.dtype_code + base2, _ = get_freq_code(freq) - # mult1 can't be negative or 0 + # self.n can't be negative or 0 end = how == 'E' if end: - ordinal = 
self.ordinal + mult1 - 1 + ordinal = self.ordinal + self.freq.n - 1 else: ordinal = self.ordinal ordinal = period_asfreq(ordinal, base1, base2, end) @@ -1751,12 +1759,12 @@ cdef class _Period: return endpoint - Timedelta(1, 'ns') if freq is None: - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code freq = get_to_timestamp_base(base) else: freq = self._maybe_convert_freq(freq) - base, mult = get_freq_code(freq) + base, _ = get_freq_code(freq) val = self.asfreq(freq, how) dt64 = period_ordinal_to_dt64(val.ordinal, base) @@ -1764,12 +1772,12 @@ cdef class _Period: @property def year(self) -> int: - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pyear(self.ordinal, base) @property def month(self) -> int: - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pmonth(self.ordinal, base) @property @@ -1792,7 +1800,7 @@ cdef class _Period: >>> p.day 11 """ - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pday(self.ordinal, base) @property @@ -1822,7 +1830,7 @@ cdef class _Period: >>> p.hour 0 """ - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return phour(self.ordinal, base) @property @@ -1846,7 +1854,7 @@ cdef class _Period: >>> p.minute 3 """ - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pminute(self.ordinal, base) @property @@ -1870,12 +1878,12 @@ cdef class _Period: >>> p.second 12 """ - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return psecond(self.ordinal, base) @property def weekofyear(self) -> int: - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pweek(self.ordinal, base) @property @@ -1956,7 +1964,7 @@ cdef class _Period: >>> per.end_time.dayofweek 2 """ - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pweekday(self.ordinal, base) @property @@ -2044,12 +2052,12 @@ cdef class _Period: >>> period.dayofyear 1 """ - base, 
mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pday_of_year(self.ordinal, base) @property def quarter(self) -> int: - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pquarter(self.ordinal, base) @property @@ -2093,7 +2101,7 @@ cdef class _Period: >>> per.year 2017 """ - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pqyear(self.ordinal, base) @property @@ -2127,7 +2135,7 @@ cdef class _Period: >>> p.days_in_month 29 """ - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return pdays_in_month(self.ordinal, base) @property @@ -2165,7 +2173,7 @@ cdef class _Period: return self.freq.freqstr def __repr__(self) -> str: - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code formatted = period_format(self.ordinal, base) return f"Period('{formatted}', '{self.freqstr}')" @@ -2173,7 +2181,7 @@ cdef class _Period: """ Return a string representation for a particular DataFrame """ - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code formatted = period_format(self.ordinal, base) value = str(formatted) return value @@ -2325,7 +2333,7 @@ cdef class _Period: >>> a.strftime('%b. %d, %Y was a %A') 'Jan. 
01, 2001 was a Monday' """ - base, mult = get_freq_code(self.freq) + base = self._dtype.dtype_code return period_format(self.ordinal, base, fmt) diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index bbabfed4cb976..b0c524a257684 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -9,6 +9,7 @@ def test_namespace(): "base", "ccalendar", "conversion", + "dtypes", "fields", "frequencies", "nattype", diff --git a/setup.py b/setup.py index 63510867f0dd7..9f411ec10cd80 100755 --- a/setup.py +++ b/setup.py @@ -308,8 +308,8 @@ class CheckSDist(sdist_class): "pandas/_libs/ops.pyx", "pandas/_libs/parsers.pyx", "pandas/_libs/tslibs/base.pyx", - "pandas/_libs/tslibs/c_timestamp.pyx", "pandas/_libs/tslibs/ccalendar.pyx", + "pandas/_libs/tslibs/dtypes.pyx", "pandas/_libs/tslibs/period.pyx", "pandas/_libs/tslibs/strptime.pyx", "pandas/_libs/tslibs/np_datetime.pyx", @@ -605,6 +605,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "_libs.tslib": {"pyxfile": "_libs/tslib", "depends": tseries_depends}, "_libs.tslibs.base": {"pyxfile": "_libs/tslibs/base"}, "_libs.tslibs.ccalendar": {"pyxfile": "_libs/tslibs/ccalendar"}, + "_libs.tslibs.dtypes": {"pyxfile": "_libs/tslibs/dtypes"}, "_libs.tslibs.conversion": { "pyxfile": "_libs/tslibs/conversion", "depends": tseries_depends, From 36d4a08c7a392664d8bb31e016e5024f9d5a523d Mon Sep 17 00:00:00 2001 From: willpeppo Date: Thu, 4 Jun 2020 03:04:56 -0400 Subject: [PATCH 0010/1025] DOC: Fixed PR06 (wrong parameter type) in pandas.Timestamp (#34561) --- pandas/_libs/tslibs/nattype.pyx | 4 ++-- pandas/_libs/tslibs/timestamps.pyx | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 6dc49914ef4b7..f079c5157eeb3 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -397,7 +397,7 @@ class NaTType(_NaT): Parameters ---------- - locale : string, default None 
(English locale) + locale : str, default None (English locale) Locale determining the language in which to return the month name. Returns @@ -414,7 +414,7 @@ class NaTType(_NaT): Parameters ---------- - locale : string, default None (English locale) + locale : str, default None (English locale) Locale determining the language in which to return the day name. Returns diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index fad87f9f910cb..471ed557f4327 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1053,7 +1053,7 @@ timedelta}, default 'raise' Parameters ---------- - locale : string, default None (English locale) + locale : str, default None (English locale) Locale determining the language in which to return the day name. Returns @@ -1070,7 +1070,7 @@ timedelta}, default 'raise' Parameters ---------- - locale : string, default None (English locale) + locale : str, default None (English locale) Locale determining the language in which to return the month name. Returns From 7aa9edecfd83f1876bd7ce50d3be653c3857269b Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 4 Jun 2020 12:38:47 +0100 Subject: [PATCH 0011/1025] BUG/ENH: Improve categorical construction when using the iterator in StataReader (#34128) * BUG/ENH: Correct categorical on iterators Return categoricals with the same categories if possible when reading data through an interator. Warn if not possible. 
closes #31544 --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/stata.py | 76 ++++++++++++++++-- .../stata/stata-dta-partially-labeled.dta | Bin 0 -> 1390 bytes pandas/tests/io/test_stata.py | 60 ++++++++++++++ 4 files changed, 129 insertions(+), 8 deletions(-) create mode 100644 pandas/tests/io/data/stata/stata-dta-partially-labeled.dta diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7834e1a5c4898..86d3e50493bd1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -949,6 +949,7 @@ I/O - Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`) - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`) - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) +- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`) Plotting ^^^^^^^^ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index fe8dcf1bdb9aa..e9adf5292ef6f 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -106,6 +106,14 @@ iterator : bool, default False Return StataReader object.""" +_reader_notes = """\ +Notes +----- +Categorical variables read through an iterator may not have the same +categories and dtype. This occurs when a variable stored in a DTA +file is associated to an incomplete set of value labels that only +label a strict subset of the values.""" + _read_stata_doc = f""" Read Stata file into DataFrame. @@ -135,6 +143,8 @@ io.stata.StataReader : Low-level reader for Stata data files. DataFrame.to_stata: Export Stata data files. 
+{_reader_notes} + Examples -------- Read a Stata dta file: @@ -176,6 +186,8 @@ {_statafile_processing_params1} {_statafile_processing_params2} {_chunksize_params} + +{_reader_notes} """ @@ -497,6 +509,21 @@ class InvalidColumnName(Warning): """ +class CategoricalConversionWarning(Warning): + pass + + +categorical_conversion_warning = """ +One or more series with value labels are not fully labeled. Reading this +dataset with an iterator results in categorical variable with different +categories. This occurs since it is not possible to know all possible values +until the entire dataset has been read. To avoid this warning, you can either +read dataset without an interator, or manually convert categorical data by +``convert_categoricals`` to False and then accessing the variable labels +through the value_labels method of the reader. +""" + + def _cast_to_stata_types(data: DataFrame) -> DataFrame: """ Checks the dtypes of the columns of a pandas DataFrame for @@ -1023,6 +1050,10 @@ def __init__( self._order_categoricals = order_categoricals self._encoding = "" self._chunksize = chunksize + if self._chunksize is not None and ( + not isinstance(chunksize, int) or chunksize <= 0 + ): + raise ValueError("chunksize must be a positive integer when set.") # State variables for the file self._has_string_data = False @@ -1488,6 +1519,10 @@ def _read_strls(self) -> None: self.GSO[str(v_o)] = decoded_va def __next__(self) -> DataFrame: + if self._chunksize is None: + raise ValueError( + "chunksize must be set to a positive integer to use as an iterator." 
+ ) return self.read(nrows=self._chunksize or 1) def get_chunk(self, size: Optional[int] = None) -> DataFrame: @@ -1753,8 +1788,8 @@ def _do_select_columns(self, data: DataFrame, columns: Sequence[str]) -> DataFra return data[columns] - @staticmethod def _do_convert_categoricals( + self, data: DataFrame, value_label_dict: Dict[str, Dict[Union[float, int], str]], lbllist: Sequence[str], @@ -1768,14 +1803,39 @@ def _do_convert_categoricals( for col, label in zip(data, lbllist): if label in value_labels: # Explicit call with ordered=True - cat_data = Categorical(data[col], ordered=order_categoricals) - categories = [] - for category in cat_data.categories: - if category in value_label_dict[label]: - categories.append(value_label_dict[label][category]) - else: - categories.append(category) # Partially labeled + vl = value_label_dict[label] + keys = np.array(list(vl.keys())) + column = data[col] + key_matches = column.isin(keys) + if self._chunksize is not None and key_matches.all(): + initial_categories = keys + # If all categories are in the keys and we are iterating, + # use the same keys for all chunks. If some are missing + # value labels, then we will fall back to the categories + # varying across chunks. 
+ else: + if self._chunksize is not None: + # warn is using an iterator + warnings.warn( + categorical_conversion_warning, CategoricalConversionWarning + ) + initial_categories = None + cat_data = Categorical( + column, categories=initial_categories, ordered=order_categoricals + ) + if initial_categories is None: + # If None here, then we need to match the cats in the Categorical + categories = [] + for category in cat_data.categories: + if category in vl: + categories.append(vl[category]) + else: + categories.append(category) + else: + # If all cats are matched, we can use the values + categories = list(vl.values()) try: + # Try to catch duplicate categories cat_data.categories = categories except ValueError as err: vc = Series(categories).value_counts() diff --git a/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta b/pandas/tests/io/data/stata/stata-dta-partially-labeled.dta new file mode 100644 index 0000000000000000000000000000000000000000..b9abdb8827432d59ab781a079f96cd466570d483 GIT binary patch literal 1390 zcmd5+y-yTD6yGx#E(Qw>%_SCB2+?4ey>0l&%XkTBb3j8wcg)@ma>*=rnHfl~($3D( z(u%^uKf*u2ih|A|9=A|LW2K9Byf9@Y7_81qnl3{M*0V?UIy-iUQ;H~s<~DfAA67CkGqTe?BdaA3DI zefP+?*}s3~^0!NDw}Ax{7^{_!M7*(HPOEaopSsBKo8e%dAX93bE(X;lp#OBmpBk=S k1P!K+{&b<_?;jP?`Lox16iMzMI>*#cXtIZ~CwgM90h9mgy8r+H literal 0 HcmV?d00001 diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 698b5417b471b..aa3aa61bbb984 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -20,6 +20,7 @@ from pandas.io.parsers import read_csv from pandas.io.stata import ( + CategoricalConversionWarning, InvalidColumnName, PossiblePrecisionLoss, StataMissingValue, @@ -1923,3 +1924,62 @@ def test_compression_dict(method, file_ext): fp = path reread = read_stata(fp, index_col="index") tm.assert_frame_equal(reread, df) + + +@pytest.mark.parametrize("version", [114, 117, 118, 119, None]) +def test_chunked_categorical(version): + df = DataFrame({"cats": Series(["a", 
"b", "a", "b", "c"], dtype="category")}) + df.index.name = "index" + with tm.ensure_clean() as path: + df.to_stata(path, version=version) + reader = StataReader(path, chunksize=2, order_categoricals=False) + for i, block in enumerate(reader): + block = block.set_index("index") + assert "cats" in block + tm.assert_series_equal(block.cats, df.cats.iloc[2 * i : 2 * (i + 1)]) + + +def test_chunked_categorical_partial(dirpath): + dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta") + values = ["a", "b", "a", "b", 3.0] + with StataReader(dta_file, chunksize=2) as reader: + with tm.assert_produces_warning(CategoricalConversionWarning): + for i, block in enumerate(reader): + assert list(block.cats) == values[2 * i : 2 * (i + 1)] + if i < 2: + idx = pd.Index(["a", "b"]) + else: + idx = pd.Float64Index([3.0]) + tm.assert_index_equal(block.cats.cat.categories, idx) + with tm.assert_produces_warning(CategoricalConversionWarning): + with StataReader(dta_file, chunksize=5) as reader: + large_chunk = reader.__next__() + direct = read_stata(dta_file) + tm.assert_frame_equal(direct, large_chunk) + + +def test_iterator_errors(dirpath): + dta_file = os.path.join(dirpath, "stata-dta-partially-labeled.dta") + with pytest.raises(ValueError, match="chunksize must be a positive"): + StataReader(dta_file, chunksize=-1) + with pytest.raises(ValueError, match="chunksize must be a positive"): + StataReader(dta_file, chunksize=0) + with pytest.raises(ValueError, match="chunksize must be a positive"): + StataReader(dta_file, chunksize="apple") + with pytest.raises(ValueError, match="chunksize must be set to a positive"): + with StataReader(dta_file) as reader: + reader.__next__() + + +def test_iterator_value_labels(): + # GH 31544 + values = ["c_label", "b_label"] + ["a_label"] * 500 + df = DataFrame({f"col{k}": pd.Categorical(values, ordered=True) for k in range(2)}) + with tm.ensure_clean() as path: + df.to_stata(path, write_index=False) + reader = pd.read_stata(path, 
chunksize=100) + expected = pd.Index(["a_label", "b_label", "c_label"], dtype="object") + for j, chunk in enumerate(reader): + for i in range(2): + tm.assert_index_equal(chunk.dtypes[i].categories, expected) + tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100]) From f202c95bb91095c8d7db0fcd5c20d01ffaf1f350 Mon Sep 17 00:00:00 2001 From: Sumanau Sareen Date: Thu, 4 Jun 2020 23:11:33 +0530 Subject: [PATCH 0012/1025] Send None parameter to pandas-gbq to set no progress bar (#33477) --- ci/deps/travis-36-locale.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.1.0.rst | 3 +++ pandas/compat/_optional.py | 2 +- pandas/io/gbq.py | 5 ++--- pandas/tests/io/test_gbq.py | 6 +----- 6 files changed, 9 insertions(+), 11 deletions(-) diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 3fc19f1bca084..2c8403acf6971 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -27,7 +27,7 @@ dependencies: - numexpr - numpy - openpyxl - - pandas-gbq=0.8.0 + - pandas-gbq=0.12.0 - psycopg2=2.6.2 - pymysql=0.7.11 - pytables diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index ba99aaa9f430c..da1161c8f68b4 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -274,7 +274,7 @@ lxml 3.8.0 HTML parser for read_html (see :ref matplotlib 2.2.2 Visualization numba 0.46.0 Alternative execution engine for rolling operations openpyxl 2.5.7 Reading / writing for xlsx files -pandas-gbq 0.8.0 Google Big Query access +pandas-gbq 0.12.0 Google Big Query access psycopg2 PostgreSQL engine for sqlalchemy pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and feather reading / writing pymysql 0.7.11 MySQL engine for sqlalchemy diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 86d3e50493bd1..17623b943bf87 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -289,6 
+289,7 @@ Other enhancements - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). +- :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). .. --------------------------------------------------------------------------- @@ -355,6 +356,8 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | xlwt | 1.2.0 | | +-----------------+-----------------+---------+ +| pandas-gbq | 1.2.0 | X | ++-----------------+-----------------+---------+ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index c5fd294699c45..0a5e0f5050040 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -15,7 +15,7 @@ "numexpr": "2.6.2", "odfpy": "1.3.0", "openpyxl": "2.5.7", - "pandas_gbq": "0.8.0", + "pandas_gbq": "0.12.0", "pyarrow": "0.13.0", "pytables": "3.4.3", "pytest": "5.0.1", diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 405bf27cac02d..9b46f970afc66 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -162,14 +162,13 @@ def read_gbq( """ pandas_gbq = _try_import() - kwargs: Dict[str, Union[str, bool]] = {} + kwargs: Dict[str, Union[str, bool, None]] = {} # START: new kwargs. Don't populate unless explicitly set. 
if use_bqstorage_api is not None: kwargs["use_bqstorage_api"] = use_bqstorage_api - if progress_bar_type is not None: - kwargs["progress_bar_type"] = progress_bar_type + kwargs["progress_bar_type"] = progress_bar_type # END: new kwargs return pandas_gbq.read_gbq( diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 7a5eba5264421..e9cefe3056130 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -142,11 +142,7 @@ def mock_read_gbq(sql, **kwargs): monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) pd.read_gbq("SELECT 1", progress_bar_type=progress_bar) - - if progress_bar: - assert "progress_bar_type" in captured_kwargs - else: - assert "progress_bar_type" not in captured_kwargs + assert "progress_bar_type" in captured_kwargs @pytest.mark.single From 21e02523cfd6e371f641ff817c2386628b84a08f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 4 Jun 2020 10:42:13 -0700 Subject: [PATCH 0013/1025] REF: avoid runtime imports in offsets (#34563) --- pandas/_libs/tslibs/offsets.pyx | 34 +++++----------------------- pandas/_libs/tslibs/tzconversion.pyx | 2 +- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 63dc3407b4c55..b1d09dbb26fca 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -31,8 +31,6 @@ from pandas._libs.tslibs.util cimport ( is_float_object, ) -from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.ccalendar import ( MONTH_ALIASES, MONTH_TO_CAL_NUM, weekday_to_int, int_to_weekday, ) @@ -50,7 +48,9 @@ from pandas._libs.tslibs.tzconversion cimport tz_convert_single from .dtypes cimport PeriodDtypeCode from .timedeltas cimport delta_to_nanoseconds - +from .timedeltas import Timedelta +from .timestamps cimport _Timestamp +from .timestamps import Timestamp # --------------------------------------------------------------------- # Misc Helpers @@ -64,7 +64,7 @@ cdef 
bint is_tick_object(object obj): cdef datetime _as_datetime(datetime obj): - if isinstance(obj, ABCTimestamp): + if isinstance(obj, _Timestamp): return obj.to_pydatetime() return obj @@ -73,7 +73,7 @@ cdef bint _is_normalized(datetime dt): if dt.hour != 0 or dt.minute != 0 or dt.second != 0 or dt.microsecond != 0: # Regardless of whether dt is datetime vs Timestamp return False - if isinstance(dt, ABCTimestamp): + if isinstance(dt, _Timestamp): return dt.nanosecond == 0 return True @@ -108,7 +108,6 @@ def apply_wraps(func): # not play nicely with cython class methods def wrapper(self, other): - from pandas import Timestamp if other is NaT: return NaT @@ -585,7 +584,6 @@ cdef class BaseOffset: TimeStamp Rolled timestamp if not on offset, otherwise unchanged timestamp. """ - from pandas import Timestamp dt = Timestamp(dt) if not self.is_on_offset(dt): dt = dt - type(self)(1, normalize=self.normalize, **self.kwds) @@ -600,7 +598,6 @@ cdef class BaseOffset: TimeStamp Rolled timestamp if not on offset, otherwise unchanged timestamp. 
""" - from pandas import Timestamp dt = Timestamp(dt) if not self.is_on_offset(dt): dt = dt + type(self)(1, normalize=self.normalize, **self.kwds) @@ -767,7 +764,6 @@ cdef class Tick(SingleConstructorOffset): @property def delta(self): - from .timedeltas import Timedelta return self.n * Timedelta(self._nanos_inc) @property @@ -854,7 +850,7 @@ cdef class Tick(SingleConstructorOffset): def apply(self, other): # Timestamp can handle tz and nano sec, thus no need to use apply_wraps - if isinstance(other, ABCTimestamp): + if isinstance(other, _Timestamp): # GH#15126 # in order to avoid a recursive @@ -869,7 +865,6 @@ cdef class Tick(SingleConstructorOffset): return NaT elif is_datetime64_object(other) or PyDate_Check(other): # PyDate_Check includes date, datetime - from pandas import Timestamp return Timestamp(other) + self if PyDelta_Check(other): @@ -1028,7 +1023,6 @@ cdef class RelativeDeltaOffset(BaseOffset): # bring tz back from UTC calculation other = localize_pydatetime(other, tzinfo) - from .timestamps import Timestamp return Timestamp(other) else: return other + timedelta(self.n) @@ -1077,7 +1071,6 @@ cdef class RelativeDeltaOffset(BaseOffset): if k in ["days", "hours", "minutes", "seconds", "microseconds"] } if timedelta_kwds: - from .timedeltas import Timedelta delta = Timedelta(**timedelta_kwds) index = index + (self.n * delta) return index @@ -2291,7 +2284,6 @@ cdef class SemiMonthOffset(SingleConstructorOffset): @apply_index_wraps def apply_index(self, dtindex): # determine how many days away from the 1st of the month we are - from pandas import Timedelta dti = dtindex i8other = dtindex.asi8 @@ -2394,8 +2386,6 @@ cdef class SemiMonthEnd(SemiMonthOffset): ------- result : DatetimeIndex """ - from pandas import Timedelta - nanos = (roll % 2) * Timedelta(days=self.day_of_month).value dtindex += nanos.astype("timedelta64[ns]") return dtindex + Timedelta(days=-1) @@ -2453,7 +2443,6 @@ cdef class SemiMonthBegin(SemiMonthOffset): ------- result : DatetimeIndex 
""" - from pandas import Timedelta nanos = (roll % 2) * Timedelta(days=self.day_of_month - 1).value return dtindex + nanos.astype("timedelta64[ns]") @@ -2545,7 +2534,6 @@ cdef class Week(SingleConstructorOffset): ------- result : DatetimeIndex """ - from pandas import Timedelta from .frequencies import get_freq_code # TODO: avoid circular import i8other = dtindex.asi8 @@ -2847,8 +2835,6 @@ cdef class FY5253(FY5253Mixin): @apply_wraps def apply(self, other): - from pandas import Timestamp - norm = Timestamp(other).normalize() n = self.n @@ -3069,8 +3055,6 @@ cdef class FY5253Quarter(FY5253Mixin): num_qtrs : int tdelta : Timedelta """ - from pandas import Timestamp, Timedelta - num_qtrs = 0 norm = Timestamp(other).tz_localize(None) @@ -3101,7 +3085,6 @@ cdef class FY5253Quarter(FY5253Mixin): @apply_wraps def apply(self, other): # Note: self.n == 0 is not allowed. - from pandas import Timedelta n = self.n @@ -3141,8 +3124,6 @@ cdef class FY5253Quarter(FY5253Mixin): def year_has_extra_week(self, dt: datetime) -> bool: # Avoid round-down errors --> normalize to get # e.g. 
'370D' instead of '360D23H' - from pandas import Timestamp - norm = Timestamp(dt).normalize().tz_localize(None) next_year_end = self._offset.rollforward(norm) @@ -3621,9 +3602,6 @@ cpdef to_offset(freq): >>> to_offset(Hour()) """ - # TODO: avoid runtime imports - from pandas._libs.tslibs.timedeltas import Timedelta - if freq is None: return None diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index a90d06fa53997..02fe203637d62 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -20,7 +20,6 @@ from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, HOUR_NANOS from pandas._libs.tslibs.nattype cimport NPY_NAT from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dt64_to_dtstruct) -from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds from pandas._libs.tslibs.timezones cimport ( get_dst_info, is_tzlocal, is_utc, get_timezone, get_utcoffset) @@ -123,6 +122,7 @@ timedelta-like} elif nonexistent == 'shift_backward': shift_backward = True elif PyDelta_Check(nonexistent): + from .timedeltas import delta_to_nanoseconds shift_delta = delta_to_nanoseconds(nonexistent) elif nonexistent not in ('raise', None): msg = ("nonexistent must be one of {'NaT', 'raise', 'shift_forward', " From f9c4ad01865b819acf304f991db762b0784c57fd Mon Sep 17 00:00:00 2001 From: Mohammad Hasnain Mohsin Rajan Date: Fri, 5 Jun 2020 00:34:32 +0530 Subject: [PATCH 0014/1025] solves ci issues with #34575 (#34579) --- asv_bench/benchmarks/arithmetic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index a9a8a4f3add92..3ef6ab6209ea7 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -466,7 +466,7 @@ def setup(self, offset): self.rng = rng def time_apply_index(self, offset): - offset.apply_index(self.rng) + self.rng + offset class BinaryOpsMultiIndex: From 
c1f0919bc98eb588e92d5ade71227151c898e769 Mon Sep 17 00:00:00 2001 From: willpeppo Date: Thu, 4 Jun 2020 15:06:39 -0400 Subject: [PATCH 0015/1025] DOC: fixed PR06 in pandas.Timedeltas (#34574) --- pandas/_libs/tslibs/timedeltas.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index eb04049f18e0c..a239804ea7bc2 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1093,7 +1093,7 @@ class Timedelta(_Timedelta): Parameters ---------- - value : Timedelta, timedelta, np.timedelta64, string, or integer + value : Timedelta, timedelta, np.timedelta64, str, or int unit : str, default 'ns' Denote the unit of the input, if input is an integer. From 90308b709e146af21d42f54f3495ce1bdf86ec14 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 4 Jun 2020 12:07:35 -0700 Subject: [PATCH 0016/1025] CLN: address FIXMEs in liboffsets (#34566) --- pandas/_libs/tslibs/offsets.pxd | 8 ++++---- pandas/_libs/tslibs/offsets.pyx | 17 +++-------------- 2 files changed, 7 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pxd b/pandas/_libs/tslibs/offsets.pxd index 2b8ad97b83917..9a9244db4a565 100644 --- a/pandas/_libs/tslibs/offsets.pxd +++ b/pandas/_libs/tslibs/offsets.pxd @@ -5,7 +5,7 @@ cdef bint is_offset_object(object obj) cdef bint is_tick_object(object obj) cdef class BaseOffset: - cdef readonly: - int64_t n - bint normalize - dict _cache + cdef readonly: + int64_t n + bint normalize + dict _cache diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b1d09dbb26fca..63136367a5b5c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1293,11 +1293,7 @@ cdef class BusinessDay(BusinessMixin): self._offset = state.pop("_offset") elif "offset" in state: self._offset = state.pop("offset") - - @property - def _params(self): - # FIXME: using cache_readonly breaks a pytables test 
- return BaseOffset._params.func(self) + self._cache = state.pop("_cache", {}) def _offset_str(self) -> str: def get_str(td): @@ -1384,8 +1380,6 @@ cdef class BusinessDay(BusinessMixin): if self.n > 0: shifted = (dtindex.to_perioddelta("B") - time).asi8 != 0 - # Integer-array addition is deprecated, so we use - # _time_shift directly roll = np.where(shifted, self.n - 1, self.n) shifted = asper._addsub_int_array(roll, operator.add) else: @@ -2483,12 +2477,7 @@ cdef class Week(SingleConstructorOffset): self.n = state.pop("n") self.normalize = state.pop("normalize") self.weekday = state.pop("weekday") - - @property - def _params(self): - # TODO: making this into a property shouldn't be necessary, but otherwise - # we unpickle legacy objects incorrectly - return BaseOffset._params.func(self) + self._cache = state.pop("_cache", {}) def is_anchored(self) -> bool: return self.n == 1 and self.weekday is not None @@ -2537,7 +2526,7 @@ cdef class Week(SingleConstructorOffset): from .frequencies import get_freq_code # TODO: avoid circular import i8other = dtindex.asi8 - off = (i8other % DAY_NANOS).view("timedelta64") + off = (i8other % DAY_NANOS).view("timedelta64[ns]") base, mult = get_freq_code(self.freqstr) base_period = dtindex.to_period(base) From e08679864f2b21a875ecf5444c29a91b2666973d Mon Sep 17 00:00:00 2001 From: KD-dev-lab <64783881+KD-dev-lab@users.noreply.github.com> Date: Thu, 4 Jun 2020 21:09:55 +0200 Subject: [PATCH 0017/1025] =?UTF-8?q?TST:=20Added=20test=20to=20check=20th?= =?UTF-8?q?at=20the=20freqstr=20attribute=20of=20the=20index=20is=20p?= =?UTF-8?q?=E2=80=A6=20(#34572)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/tests/series/methods/test_shift.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index 686e66162fe0b..f981e98100d31 100644 --- a/pandas/tests/series/methods/test_shift.py +++ 
b/pandas/tests/series/methods/test_shift.py @@ -276,3 +276,19 @@ def test_shift_dt64values_int_fill_deprecated(self): expected = pd.Series([pd.Timestamp(0), ser[0]]) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("periods", [1, 2, 3, 4]) + def test_shift_preserve_freqstr(self, periods): + # GH#21275 + ser = pd.Series( + range(periods), + index=pd.date_range("2016-1-1 00:00:00", periods=periods, freq="H"), + ) + + result = ser.shift(1, "2H") + + expected = pd.Series( + range(periods), + index=pd.date_range("2016-1-1 02:00:00", periods=periods, freq="H"), + ) + tm.assert_series_equal(result, expected) From 7bc3da673a12df314f21ebb8d1eda7a954dc3422 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 4 Jun 2020 20:37:08 +0100 Subject: [PATCH 0018/1025] TST, TYP: _use_dynamic_x (#34487) --- pandas/plotting/_matplotlib/timeseries.py | 3 ++- pandas/tests/plotting/test_datetimelike.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 9d8c26093296e..475452c71db58 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -7,6 +7,7 @@ from pandas._libs.tslibs import Period, to_offset from pandas._libs.tslibs.frequencies import FreqGroup, base_and_stride, get_freq_code +from pandas._typing import FrameOrSeriesUnion from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -192,7 +193,7 @@ def _get_freq(ax, series: "Series"): return freq, ax_freq -def _use_dynamic_x(ax, data): +def _use_dynamic_x(ax, data: "FrameOrSeriesUnion") -> bool: freq = _get_index_freq(data.index) ax_freq = _get_ax_freq(ax) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 7dcb692e29337..738df5244955a 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ 
b/pandas/tests/plotting/test_datetimelike.py @@ -16,7 +16,7 @@ from pandas.core.resample import DatetimeIndex from pandas.tests.plotting.common import TestPlotBase -from pandas.tseries.offsets import DateOffset +from pandas.tseries.offsets import DateOffset, WeekOfMonth @td.skip_if_no_mpl @@ -325,6 +325,18 @@ def test_business_freq_convert(self): idx = ax.get_lines()[0].get_xdata() assert PeriodIndex(data=idx).freqstr == "M" + def test_freq_with_no_period_alias(self): + # GH34487 + freq = WeekOfMonth() + bts = tm.makeTimeSeries(5).asfreq(freq) + _, ax = self.plt.subplots() + bts.plot(ax=ax) + assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].toordinal() + idx = ax.get_lines()[0].get_xdata() + msg = "freq not specified and cannot be inferred" + with pytest.raises(ValueError, match=msg): + PeriodIndex(data=idx) + def test_nonzero_base(self): # GH2571 idx = date_range("2012-12-20", periods=24, freq="H") + timedelta(minutes=30) From 807d02eb7d3220ba4985710960ac264d4cd5291e Mon Sep 17 00:00:00 2001 From: Mohammad Hasnain Mohsin Rajan Date: Fri, 5 Jun 2020 02:14:20 +0530 Subject: [PATCH 0019/1025] Add nrows to read json. 
(#33962) --- asv_bench/benchmarks/io/json.py | 6 +++ doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/json/_json.py | 61 +++++++++++++++++++------- pandas/tests/io/json/test_readlines.py | 40 +++++++++++++++++ 4 files changed, 91 insertions(+), 17 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index f478bf2aee0ba..a490e250943f5 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -53,12 +53,18 @@ def time_read_json_lines(self, index): def time_read_json_lines_concat(self, index): concat(read_json(self.fname, orient="records", lines=True, chunksize=25000)) + def time_read_json_lines_nrows(self, index): + read_json(self.fname, orient="records", lines=True, nrows=25000) + def peakmem_read_json_lines(self, index): read_json(self.fname, orient="records", lines=True) def peakmem_read_json_lines_concat(self, index): concat(read_json(self.fname, orient="records", lines=True, chunksize=25000)) + def peakmem_read_json_lines_nrows(self, index): + read_json(self.fname, orient="records", lines=True, nrows=15000) + class ToJSON(BaseIO): diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 17623b943bf87..2243790a663df 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -289,6 +289,7 @@ Other enhancements - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). +- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 72aa8fdd16e6d..b973553a767ba 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -355,14 +355,15 @@ def read_json( dtype=None, convert_axes=None, convert_dates=True, - keep_default_dates=True, - numpy=False, - precise_float=False, + keep_default_dates: bool = True, + numpy: bool = False, + precise_float: bool = False, date_unit=None, encoding=None, - lines=False, - chunksize=None, + lines: bool = False, + chunksize: Optional[int] = None, compression="infer", + nrows: Optional[int] = None, ): """ Convert a JSON string to pandas object. @@ -493,6 +494,7 @@ def read_json( for more information on ``chunksize``. This can only be passed if `lines=True`. If this is None, the file will be read into memory all at once. + compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip, bz2, zip or xz if path_or_buf is a string ending in @@ -500,6 +502,13 @@ def read_json( otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. + nrows : int, optional + The number of lines from the line-delimited jsonfile that has to be read. + This can only be passed if `lines=True`. + If this is None, all the rows will be returned. + + .. 
versionadded:: 1.1 + Returns ------- Series or DataFrame @@ -600,6 +609,7 @@ def read_json( lines=lines, chunksize=chunksize, compression=compression, + nrows=nrows, ) if chunksize: @@ -629,14 +639,15 @@ def __init__( dtype, convert_axes, convert_dates, - keep_default_dates, - numpy, - precise_float, + keep_default_dates: bool, + numpy: bool, + precise_float: bool, date_unit, encoding, - lines, - chunksize, + lines: bool, + chunksize: Optional[int], compression, + nrows: Optional[int], ): self.orient = orient @@ -654,11 +665,16 @@ def __init__( self.chunksize = chunksize self.nrows_seen = 0 self.should_close = False + self.nrows = nrows if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) if not self.lines: raise ValueError("chunksize can only be passed if lines=True") + if self.nrows is not None: + self.nrows = _validate_integer("nrows", self.nrows, 0) + if not self.lines: + raise ValueError("nrows can only be passed if lines=True") data = self._get_data_from_filepath(filepath_or_buffer) self.data = self._preprocess_data(data) @@ -671,9 +687,9 @@ def _preprocess_data(self, data): If self.chunksize, we prepare the data for the `__next__` method. Otherwise, we read it into memory for the `read` method. """ - if hasattr(data, "read") and not self.chunksize: + if hasattr(data, "read") and (not self.chunksize or not self.nrows): data = data.read() - if not hasattr(data, "read") and self.chunksize: + if not hasattr(data, "read") and (self.chunksize or self.nrows): data = StringIO(data) return data @@ -721,11 +737,17 @@ def read(self): """ Read the whole JSON input into a pandas object. 
""" - if self.lines and self.chunksize: - obj = concat(self) - elif self.lines: - data = ensure_str(self.data) - obj = self._get_object_parser(self._combine_lines(data.split("\n"))) + if self.lines: + if self.chunksize: + obj = concat(self) + elif self.nrows: + lines = list(islice(self.data, self.nrows)) + lines_json = self._combine_lines(lines) + obj = self._get_object_parser(lines_json) + else: + data = ensure_str(self.data) + data = data.split("\n") + obj = self._get_object_parser(self._combine_lines(data)) else: obj = self._get_object_parser(self.data) self.close() @@ -772,6 +794,11 @@ def close(self): pass def __next__(self): + if self.nrows: + if self.nrows_seen >= self.nrows: + self.close() + raise StopIteration + lines = list(islice(self.data, self.chunksize)) if lines: lines_json = self._combine_lines(lines) diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index e531457627342..53462eaaada8d 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize): lines=True, chunksize=chunksize, compression=None, + nrows=None, ) reader.read() assert ( @@ -179,3 +180,42 @@ def test_readjson_unicode(monkeypatch): result = read_json(path) expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows", [1, 2]) +def test_readjson_nrows(nrows): + # GH 33916 + # Test reading line-format JSON to Series with nrows param + jsonl = """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}""" + result = pd.read_json(jsonl, lines=True, nrows=nrows) + expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)]) +def test_readjson_nrows_chunks(nrows, chunksize): + # GH 33916 + # Test reading line-format JSON to Series with nrows 
and chunksize param + jsonl = """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}""" + reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize) + chunked = pd.concat(reader) + expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows] + tm.assert_frame_equal(chunked, expected) + + +def test_readjson_nrows_requires_lines(): + # GH 33916 + # Test ValuError raised if nrows is set without setting lines in read_json + jsonl = """{"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} + {"a": 7, "b": 8}""" + msg = "nrows can only be passed if lines=True" + with pytest.raises(ValueError, match=msg): + pd.read_json(jsonl, lines=False, nrows=2) From 1678f0b35725ae87709e157c84ed1c15551f56e5 Mon Sep 17 00:00:00 2001 From: mproszewska <38814059+mproszewska@users.noreply.github.com> Date: Thu, 4 Jun 2020 23:56:54 +0200 Subject: [PATCH 0020/1025] BUG: Handling columns from index_col in _is_potential_multi_index (#33982) --- pandas/io/parsers.py | 17 +++++++++++----- pandas/tests/io/data/excel/df_empty.xlsx | Bin 0 -> 5595 bytes pandas/tests/io/data/excel/df_equals.xlsx | Bin 0 -> 5595 bytes pandas/tests/io/excel/test_readers.py | 13 ++++++++++++ pandas/tests/io/parser/test_index_col.py | 23 ++++++++++++++++++++++ 5 files changed, 48 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/io/data/excel/df_empty.xlsx create mode 100644 pandas/tests/io/data/excel/df_equals.xlsx diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index aca2f9f5ac5bb..c54e264faedd2 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -10,7 +10,7 @@ import re import sys from textwrap import fill -from typing import Any, Dict, Iterable, List, Set +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set import warnings import numpy as np @@ -20,7 +20,7 @@ import pandas._libs.parsers as parsers from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing -from pandas._typing import 
FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, Union from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -1168,7 +1168,9 @@ def _is_index_col(col): return col is not None and col is not False -def _is_potential_multi_index(columns): +def _is_potential_multi_index( + columns, index_col: Optional[Union[bool, Sequence[int]]] = None +): """ Check whether or not the `columns` parameter could be converted into a MultiIndex. @@ -1177,15 +1179,20 @@ def _is_potential_multi_index(columns): ---------- columns : array-like Object which may or may not be convertible into a MultiIndex + index_col : None, bool or list, optional + Column or columns to use as the (possibly hierarchical) index Returns ------- boolean : Whether or not columns could become a MultiIndex """ + if index_col is None or isinstance(index_col, bool): + index_col = [] + return ( len(columns) and not isinstance(columns, MultiIndex) - and all(isinstance(c, tuple) for c in columns) + and all(isinstance(c, tuple) for c in columns if c not in list(index_col)) ) @@ -1570,7 +1577,7 @@ def _maybe_dedup_names(self, names): if self.mangle_dupe_cols: names = list(names) # so we can index counts = defaultdict(int) - is_potential_mi = _is_potential_multi_index(names) + is_potential_mi = _is_potential_multi_index(names, self.index_col) for i, col in enumerate(names): cur_count = counts[col] diff --git a/pandas/tests/io/data/excel/df_empty.xlsx b/pandas/tests/io/data/excel/df_empty.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..d65a92b10e2932a10438c1580972698bff213420 GIT binary patch literal 5595 zcmaJ_by(DGv!+2p8cC%Wkl2-4Km?>@>28o%8kUkqI;Esr2}$Yh6iJCix^xwkR6s)N z?D~G^$m?}JXRm92`^U~a*K^Nr?zv~w6fvD!si9pn0_1N~SEz#JN zN>1D>Kz2owC~f^Hj1G1S|Kk3UOBZiGp|Sj@1EgSi4VW~SYrHDtaP))JaG|ez0tEld zfwJNx`i8}}C7QTogsZW(=eM|!W zc??373jB(}?PNsIaD49-sDI1I>R$3q8baVBWzsaJ_1!@#cvBqHMe@kk^mOvpmUUNW z^Fy4mcCt{QwY#q}i~l2~-4~P${HBm+2D#@W>*Q*RxOici0-3u=5Z+;+p{e~>m`ITS 
zu!LEtxx$>?_{^N0dA%GRauQTkkyyRi^NZ$MXi)enCs-svj^~nz6%pL5&E(Sb@>~<4 zvfKm+tlt?O8z28U|I|-R%#hwpiTxyXX)2mxO?zTs&v1W6Jk4H%HQ7Xw?tc9ztSE;! zL;(!ap`=8G%;bJ{L+BtUYQ;@774}TW7dcEZD(0z(+%PjBer|9ky_e+^Z9zgStSYIr z?}V%Y{bQt81n;FTmeN+*G)M3Eh&fL}uzKcSZDT+dX>dR8J~$qamgu*MO-yGY6*~~= z9of4%Gr?v&nW^(E=R15U;B{7~KBX>;|K&#lidtxcS9Z*M=-9kqT%DrxM>-T#S7un- zJ4hh>(f?8?g2F#?mk^IkT}qPVc5s zkZu{CSTK+U3HcET@;}#nBwtBu%)g~!Z2sm1a2Flm;umIL{rYjL8H~EDhm?Vb;DKIQ zY^7*OutmvB99trG(wt5xJMN=VwN#DT&N>T8AwIBv#-$yPN#32p%79?g4dU_T zlDvec+VpA%_RL#-1sS)cyOEJVm;c{%X8xqCZ6-B5|z(I0n$1L->tC{p*hZBsqyApEg; z%)B<{IiaB>uH=B4^||K=Cm!Ku=Q$R}?>Cp$y2a{Yp$tv7Q*X;PpsvDeTt83S1h`HA*j%!@8N^EjZ%sB{^T3Mf5m% z44WE_-bz?h@;v{@a0pg0$0~xxstoXvSyn?(*5PwTH^- z^jLm_g^B79yNyi3@VA>u(%p;iYiXF6hhyhY?2Fr{b{{jpqw*G=Kqmq)8NA?sv zB2Yd*JI779(nhVicJt=6AMJJ=bunq3+~|OPIVn8SwqL-U7L*m~a@_PaYs%3vh1Rot z8oXGzZj;q7ATf1yB=0lE+2m5To~-T9Ccqb&;wkM3*X>iUx3hZ^C*drz9ayvFXumq% z_;yjoW1+!Ur7@y%txcSOBJj->Y(6up@gP^f&cD)?S!dTfqBPH@b#hef| z8P^Sd9{UvMVK~JY!f=PyEPw~Sy>2zFP$hP{B}IS&qwcYh>rFfQV_ubt8M9bVXSzK? 
z0`XwB&n0rUt8y8BS$;28l)99jD#O?%g7`N?n+$Z~FM+3`r=SagizP1ob$U)Ac&M+3Cy)OeYld zO3YYcFC40KsH%zP_AX?T6umgEgk~4P9~`0=c6`~MUq*Z$Ye`+(xQiQD{(v8lv-0ova___`L*_wuTk0iab$%YQo!u1CkT-&?Q_?eK<02k> zCdP|RR)7x9i{8f0-rn=F7wtE6?0=>+_?cJOU^qQb#H5^5o#M2Mxy?h3_nau~yM!zeRY2dtPl9`8mEG93Y@om38P;f3#$ z`)kQCIf&43NPG=uoC)=a?c&|AgGn+268jkOdZ})`JDJQEtI|l!+zl8~h#r*$Fc!0| zGsH&D#~;IOGDnLh84V%|q^O+$A;bmWEfbvJ$?RzkiQYex+&`<`I#d+Bq$2WXJyzT0 ztzUE-ZQP*$wBWMOS#jn$eQrWt{)>|ZGUJWTkeuxN*9J=QA5OM0bA?)Jy1UvsS^uKu z*aQ$kFOVe6GeEZ47%m!_%$gs(R{6}(W?X2($c~rt;#f{>x5a4-vKW%GrI0RBDc9?q0OmaBQK60gq z-r}xzqz&CJNz&fuS`M8lLwUwcpvF`+1}L#Dw)x7cV2&g z(Xg$K-fCY>M1XsAwG0_B57jwp!3m>O@Y|W z$bM-;oU(kZu%0&_cpX6(yE`9fbjI>m)@<0~tM8DNx0BBW-csN*s5=1b^P2Z2h#40S zK~zd-hgw5+)cHpC72w(EW0aZ|hM4tEch%b5datT2W{GZ^7WlpZ4ieoMbG`kXe^Mjf zeC;mZJ={ab)6e3cC72oJSca=YUT6q;-gs|nb$ic;0Cr=sLhb4HIqSiGQ-*9s)ylFT z1_~L~Z>K&}$a}N>wM!#u2Gylen7G+Mq3&*czaAhIrF7^j!xq6L9eY(I?p2*T4Va6v zi0MV_sn!xW&jcNhIu&X3bYe2Y*{tUyNbN_yZKh>K8b2Dr7{iEJ;SAq;%hq zf?u@W>234Q7mw+az}mBf?F`sp>x;O;qEXO~3ZBcitG!F;-$`D%qgJfCsX(XC<~<9G7)8x5R_b>$7U>*xAe z?n~VQ!Dbs!ZFP(x^e>(I1Ax%oAWR75{6m|HElADsFJDdx# z9BL;`?{@!z;lsG|CfkD)Ad$JmyNg)x@hybj{w-=YVQRKn7GuUAF7DzK+fT@5JHAvI z3i6Gbg}^5xaYGM6EX&ecSzrk=h!0=JrHZ*Hz5`y1ND^2OClJ3}otffDHMD*3k*8=4 zc5>*5_L%ZRSMLjD$?&Q3nc%{bM1y_hiTc!pr_!U*t&IBN^Rjv&`uVq5>Jxznx@8?y z4XMRMNnV(mFPdayX#2k~fBYuQI6!3XhYmU#=xdTE46&IAjKfWXh#fsFFkd~I7Wj$I zDUgNN=@$Am0=sEv?^}Ib{v5%0q~4y6UFsH1OC2GTOKgz@+Eid~+s}cycfnCz@6T!D zeYZ$|-0z*eeJ~?kjIHlpDkH3MvNuSPFYB*}<(utd~bOiZzEyp(n zJV%$8jQB4n8F+Q1(hYR17+vwHL5~x~;#ji~U{|}Kni8=eMhTeK_>&?^`g(_*q(CmN z#9Y{kN8`_!{%|y)ptR!J*O7;FwTrmT5aswCY^&6;jF9?1Mnd_XmuD_H*=|p~NA|2* zT7$)Edd+2Tew(C`&hbS+R6@0UI8HUv?8%23+)NKr2(sSp(z-D)cyJOH#O%#!2F;Vz z;`HEQC}gR-ZR!|t1mP`p+DA`q5ObfYle~{jRAE%H^=PjqDY%U5Fi-9^x9|E2KYwbD z>TIoo&f}bkGD(ZHO$T)*gD!We+FQ}8 zQtZo3#}fsdJKdGX&qZ6)8pL9+yr7E&HUu7p#siqdA7%p#-*NzU_{2|fP-(g&X?9|T z?3vTps5GHtP@w%@qg~$|q1I^swrfkZKOL{{QBaGe-v&S)_2-)DPw(qH0Mz^O+cc0Z z?Ek!fZ(jZkaD6UCtqOkIG*T7)4)CAF!JqEer8jDL{cS-=kp4Bw{^@#MeWA4ZZ(GLu 
zzyA8aO8rmg>k+?h>%r?jPPHC&!2Iw*DgvJ{5Aj* nkpCtV{)}?{)%tfHbCFs3f2v+h5gP|}6h873h)lKYFR%Xr|F=qU literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/df_equals.xlsx b/pandas/tests/io/data/excel/df_equals.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..d65a92b10e2932a10438c1580972698bff213420 GIT binary patch literal 5595 zcmaJ_by(DGv!+2p8cC%Wkl2-4Km?>@>28o%8kUkqI;Esr2}$Yh6iJCix^xwkR6s)N z?D~G^$m?}JXRm92`^U~a*K^Nr?zv~w6fvD!si9pn0_1N~SEz#JN zN>1D>Kz2owC~f^Hj1G1S|Kk3UOBZiGp|Sj@1EgSi4VW~SYrHDtaP))JaG|ez0tEld zfwJNx`i8}}C7QTogsZW(=eM|!W zc??373jB(}?PNsIaD49-sDI1I>R$3q8baVBWzsaJ_1!@#cvBqHMe@kk^mOvpmUUNW z^Fy4mcCt{QwY#q}i~l2~-4~P${HBm+2D#@W>*Q*RxOici0-3u=5Z+;+p{e~>m`ITS zu!LEtxx$>?_{^N0dA%GRauQTkkyyRi^NZ$MXi)enCs-svj^~nz6%pL5&E(Sb@>~<4 zvfKm+tlt?O8z28U|I|-R%#hwpiTxyXX)2mxO?zTs&v1W6Jk4H%HQ7Xw?tc9ztSE;! zL;(!ap`=8G%;bJ{L+BtUYQ;@774}TW7dcEZD(0z(+%PjBer|9ky_e+^Z9zgStSYIr z?}V%Y{bQt81n;FTmeN+*G)M3Eh&fL}uzKcSZDT+dX>dR8J~$qamgu*MO-yGY6*~~= z9of4%Gr?v&nW^(E=R15U;B{7~KBX>;|K&#lidtxcS9Z*M=-9kqT%DrxM>-T#S7un- zJ4hh>(f?8?g2F#?mk^IkT}qPVc5s zkZu{CSTK+U3HcET@;}#nBwtBu%)g~!Z2sm1a2Flm;umIL{rYjL8H~EDhm?Vb;DKIQ zY^7*OutmvB99trG(wt5xJMN=VwN#DT&N>T8AwIBv#-$yPN#32p%79?g4dU_T zlDvec+VpA%_RL#-1sS)cyOEJVm;c{%X8xqCZ6-B5|z(I0n$1L->tC{p*hZBsqyApEg; z%)B<{IiaB>uH=B4^||K=Cm!Ku=Q$R}?>Cp$y2a{Yp$tv7Q*X;PpsvDeTt83S1h`HA*j%!@8N^EjZ%sB{^T3Mf5m% z44WE_-bz?h@;v{@a0pg0$0~xxstoXvSyn?(*5PwTH^- z^jLm_g^B79yNyi3@VA>u(%p;iYiXF6hhyhY?2Fr{b{{jpqw*G=Kqmq)8NA?sv zB2Yd*JI779(nhVicJt=6AMJJ=bunq3+~|OPIVn8SwqL-U7L*m~a@_PaYs%3vh1Rot z8oXGzZj;q7ATf1yB=0lE+2m5To~-T9Ccqb&;wkM3*X>iUx3hZ^C*drz9ayvFXumq% z_;yjoW1+!Ur7@y%txcSOBJj->Y(6up@gP^f&cD)?S!dTfqBPH@b#hef| z8P^Sd9{UvMVK~JY!f=PyEPw~Sy>2zFP$hP{B}IS&qwcYh>rFfQV_ubt8M9bVXSzK? 
z0`XwB&n0rUt8y8BS$;28l)99jD#O?%g7`N?n+$Z~FM+3`r=SagizP1ob$U)Ac&M+3Cy)OeYld zO3YYcFC40KsH%zP_AX?T6umgEgk~4P9~`0=c6`~MUq*Z$Ye`+(xQiQD{(v8lv-0ova___`L*_wuTk0iab$%YQo!u1CkT-&?Q_?eK<02k> zCdP|RR)7x9i{8f0-rn=F7wtE6?0=>+_?cJOU^qQb#H5^5o#M2Mxy?h3_nau~yM!zeRY2dtPl9`8mEG93Y@om38P;f3#$ z`)kQCIf&43NPG=uoC)=a?c&|AgGn+268jkOdZ})`JDJQEtI|l!+zl8~h#r*$Fc!0| zGsH&D#~;IOGDnLh84V%|q^O+$A;bmWEfbvJ$?RzkiQYex+&`<`I#d+Bq$2WXJyzT0 ztzUE-ZQP*$wBWMOS#jn$eQrWt{)>|ZGUJWTkeuxN*9J=QA5OM0bA?)Jy1UvsS^uKu z*aQ$kFOVe6GeEZ47%m!_%$gs(R{6}(W?X2($c~rt;#f{>x5a4-vKW%GrI0RBDc9?q0OmaBQK60gq z-r}xzqz&CJNz&fuS`M8lLwUwcpvF`+1}L#Dw)x7cV2&g z(Xg$K-fCY>M1XsAwG0_B57jwp!3m>O@Y|W z$bM-;oU(kZu%0&_cpX6(yE`9fbjI>m)@<0~tM8DNx0BBW-csN*s5=1b^P2Z2h#40S zK~zd-hgw5+)cHpC72w(EW0aZ|hM4tEch%b5datT2W{GZ^7WlpZ4ieoMbG`kXe^Mjf zeC;mZJ={ab)6e3cC72oJSca=YUT6q;-gs|nb$ic;0Cr=sLhb4HIqSiGQ-*9s)ylFT z1_~L~Z>K&}$a}N>wM!#u2Gylen7G+Mq3&*czaAhIrF7^j!xq6L9eY(I?p2*T4Va6v zi0MV_sn!xW&jcNhIu&X3bYe2Y*{tUyNbN_yZKh>K8b2Dr7{iEJ;SAq;%hq zf?u@W>234Q7mw+az}mBf?F`sp>x;O;qEXO~3ZBcitG!F;-$`D%qgJfCsX(XC<~<9G7)8x5R_b>$7U>*xAe z?n~VQ!Dbs!ZFP(x^e>(I1Ax%oAWR75{6m|HElADsFJDdx# z9BL;`?{@!z;lsG|CfkD)Ad$JmyNg)x@hybj{w-=YVQRKn7GuUAF7DzK+fT@5JHAvI z3i6Gbg}^5xaYGM6EX&ecSzrk=h!0=JrHZ*Hz5`y1ND^2OClJ3}otffDHMD*3k*8=4 zc5>*5_L%ZRSMLjD$?&Q3nc%{bM1y_hiTc!pr_!U*t&IBN^Rjv&`uVq5>Jxznx@8?y z4XMRMNnV(mFPdayX#2k~fBYuQI6!3XhYmU#=xdTE46&IAjKfWXh#fsFFkd~I7Wj$I zDUgNN=@$Am0=sEv?^}Ib{v5%0q~4y6UFsH1OC2GTOKgz@+Eid~+s}cycfnCz@6T!D zeYZ$|-0z*eeJ~?kjIHlpDkH3MvNuSPFYB*}<(utd~bOiZzEyp(n zJV%$8jQB4n8F+Q1(hYR17+vwHL5~x~;#ji~U{|}Kni8=eMhTeK_>&?^`g(_*q(CmN z#9Y{kN8`_!{%|y)ptR!J*O7;FwTrmT5aswCY^&6;jF9?1Mnd_XmuD_H*=|p~NA|2* zT7$)Edd+2Tew(C`&hbS+R6@0UI8HUv?8%23+)NKr2(sSp(z-D)cyJOH#O%#!2F;Vz z;`HEQC}gR-ZR!|t1mP`p+DA`q5ObfYle~{jRAE%H^=PjqDY%U5Fi-9^x9|E2KYwbD z>TIoo&f}bkGD(ZHO$T)*gD!We+FQ}8 zQtZo3#}fsdJKdGX&qZ6)8pL9+yr7E&HUu7p#siqdA7%p#-*NzU_{2|fP-(g&X?9|T z?3vTps5GHtP@w%@qg~$|q1I^swrfkZKOL{{QBaGe-v&S)_2-)DPw(qH0Mz^O+cc0Z z?Ek!fZ(jZkaD6UCtqOkIG*T7)4)CAF!JqEer8jDL{cS-=kp4Bw{^@#MeWA4ZZ(GLu 
zzyA8aO8rmg>k+?h>%r?jPPHC&!2Iw*DgvJ{5Aj* nkpCtV{)}?{)%tfHbCFs3f2v+h5gP|}6h873h)lKYFR%Xr|F=qU literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 109da630f76a2..955db982f8300 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1130,3 +1130,16 @@ def test_excel_high_surrogate(self, engine): # should not produce a segmentation violation actual = pd.read_excel("high_surrogate.xlsx") tm.assert_frame_equal(expected, actual) + + @pytest.mark.parametrize("filename", ["df_empty.xlsx", "df_equals.xlsx"]) + def test_header_with_index_col(self, engine, filename): + # GH 33476 + idx = pd.Index(["Z"], name="I2") + cols = pd.MultiIndex.from_tuples( + [("A", "B"), ("A", "B.1")], names=["I11", "I12"] + ) + expected = pd.DataFrame([[1, 3]], index=idx, columns=cols, dtype="int64") + result = pd.read_excel( + filename, sheet_name="Sheet1", index_col=0, header=[0, 1] + ) + tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index f67a658cadfa2..9f425168540ba 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -184,3 +184,26 @@ def test_no_multi_index_level_names_empty(all_parsers): expected.to_csv(path) result = parser.read_csv(path, index_col=[0, 1, 2]) tm.assert_frame_equal(result, expected) + + +def test_header_with_index_col(all_parsers): + # GH 33476 + parser = all_parsers + data = """ +I11,A,A +I12,B,B +I2,1,3 +""" + midx = MultiIndex.from_tuples([("A", "B"), ("A", "B.1")], names=["I11", "I12"]) + idx = Index(["I2"]) + expected = DataFrame([[1, 3]], index=idx, columns=midx) + + result = parser.read_csv(StringIO(data), index_col=0, header=[0, 1]) + tm.assert_frame_equal(result, expected) + + col_idx = Index(["A", "A.1"]) + idx = Index(["I12", "I2"], name="I11") + expected = DataFrame([["B", "B"], ["1", "3"]], index=idx, 
columns=col_idx) + + result = parser.read_csv(StringIO(data), index_col="I11", header=0) + tm.assert_frame_equal(result, expected) From a4cf69239f2ea8be0914f5fcd0011375f634e167 Mon Sep 17 00:00:00 2001 From: Natalie Jann <47032941+najann@users.noreply.github.com> Date: Sat, 6 Jun 2020 01:04:23 +0200 Subject: [PATCH 0021/1025] CLN: Clean csv files in test data GH34427 (#34458) --- .../comparison/comparison_with_sas.rst | 4 +- .../comparison/comparison_with_sql.rst | 2 +- .../comparison/comparison_with_stata.rst | 4 +- doc/source/user_guide/visualization.rst | 4 +- pandas/conftest.py | 2 +- pandas/plotting/_misc.py | 4 +- pandas/tests/data/iris.csv | 151 ----------- pandas/tests/data/tips.csv | 245 ------------------ pandas/tests/io/parser/data/iris.csv | 151 ----------- pandas/tests/io/test_common.py | 4 +- pandas/tests/io/test_sql.py | 2 +- pandas/tests/util/test_util.py | 2 +- 12 files changed, 14 insertions(+), 561 deletions(-) delete mode 100644 pandas/tests/data/iris.csv delete mode 100644 pandas/tests/data/tips.csv delete mode 100644 pandas/tests/io/parser/data/iris.csv diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index f12d97d1d0fde..85c6ea2c31969 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -115,7 +115,7 @@ Reading external data Like SAS, pandas provides utilities for reading in data from many formats. The ``tips`` dataset, found within the pandas -tests (`csv `_) +tests (`csv `_) will be used in many of the following examples. SAS provides ``PROC IMPORT`` to read csv data into a data set. @@ -131,7 +131,7 @@ The pandas method is :func:`read_csv`, which works similarly. .. 
ipython:: python url = ('https://raw.github.com/pandas-dev/' - 'pandas/master/pandas/tests/data/tips.csv') + 'pandas/master/pandas/tests/io/data/csv/tips.csv') tips = pd.read_csv(url) tips.head() diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index c46ec9b3f7090..aa7218c3e4fad 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -25,7 +25,7 @@ structure. .. ipython:: python url = ('https://raw.github.com/pandas-dev' - '/pandas/master/pandas/tests/data/tips.csv') + '/pandas/master/pandas/tests/io/data/csv/tips.csv') tips = pd.read_csv(url) tips.head() diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index decf12db77af2..06f9e45466243 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -112,7 +112,7 @@ Reading external data Like Stata, pandas provides utilities for reading in data from many formats. The ``tips`` data set, found within the pandas -tests (`csv `_) +tests (`csv `_) will be used in many of the following examples. Stata provides ``import delimited`` to read csv data into a data set in memory. @@ -128,7 +128,7 @@ the data set if presented with a url. .. ipython:: python url = ('https://raw.github.com/pandas-dev' - '/pandas/master/pandas/tests/data/tips.csv') + '/pandas/master/pandas/tests/io/data/csv/tips.csv') tips = pd.read_csv(url) tips.head() diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 814627043cfc8..5dca9d4c900dc 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -865,7 +865,7 @@ for more information. 
By coloring these curves differently for each class it is possible to visualize data clustering. Curves belonging to samples of the same class will usually be closer together and form larger structures. -**Note**: The "Iris" dataset is available `here `__. +**Note**: The "Iris" dataset is available `here `__. .. ipython:: python @@ -1025,7 +1025,7 @@ be colored differently. See the R package `Radviz `__ for more information. -**Note**: The "Iris" dataset is available `here `__. +**Note**: The "Iris" dataset is available `here `__. .. ipython:: python diff --git a/pandas/conftest.py b/pandas/conftest.py index 1e7f1b769c856..e4cb3270b9acf 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -813,7 +813,7 @@ def iris(datapath): """ The iris dataset as a DataFrame. """ - return pd.read_csv(datapath("data", "iris.csv")) + return pd.read_csv(datapath("io", "data", "csv", "iris.csv")) # ---------------------------------------------------------------- diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 594b95d1937ea..3056977ec78ad 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -263,7 +263,7 @@ def andrews_curves( >>> df = pd.read_csv( ... 'https://raw.github.com/pandas-dev/' - ... 'pandas/master/pandas/tests/data/iris.csv' + ... 'pandas/master/pandas/tests/io/data/csv/iris.csv' ... ) >>> pd.plotting.andrews_curves(df, 'Name') """ @@ -387,7 +387,7 @@ def parallel_coordinates( >>> df = pd.read_csv( ... 'https://raw.github.com/pandas-dev/' - ... 'pandas/master/pandas/tests/data/iris.csv' + ... 'pandas/master/pandas/tests/io/data/csv/iris.csv' ... ) >>> pd.plotting.parallel_coordinates( ... 
df, 'Name', color=('#556270', '#4ECDC4', '#C7F464') diff --git a/pandas/tests/data/iris.csv b/pandas/tests/data/iris.csv deleted file mode 100644 index c19b9c3688515..0000000000000 --- a/pandas/tests/data/iris.csv +++ /dev/null @@ -1,151 +0,0 @@ -SepalLength,SepalWidth,PetalLength,PetalWidth,Name -5.1,3.5,1.4,0.2,Iris-setosa -4.9,3.0,1.4,0.2,Iris-setosa -4.7,3.2,1.3,0.2,Iris-setosa -4.6,3.1,1.5,0.2,Iris-setosa -5.0,3.6,1.4,0.2,Iris-setosa -5.4,3.9,1.7,0.4,Iris-setosa -4.6,3.4,1.4,0.3,Iris-setosa -5.0,3.4,1.5,0.2,Iris-setosa -4.4,2.9,1.4,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -5.4,3.7,1.5,0.2,Iris-setosa -4.8,3.4,1.6,0.2,Iris-setosa -4.8,3.0,1.4,0.1,Iris-setosa -4.3,3.0,1.1,0.1,Iris-setosa -5.8,4.0,1.2,0.2,Iris-setosa -5.7,4.4,1.5,0.4,Iris-setosa -5.4,3.9,1.3,0.4,Iris-setosa -5.1,3.5,1.4,0.3,Iris-setosa -5.7,3.8,1.7,0.3,Iris-setosa -5.1,3.8,1.5,0.3,Iris-setosa -5.4,3.4,1.7,0.2,Iris-setosa -5.1,3.7,1.5,0.4,Iris-setosa -4.6,3.6,1.0,0.2,Iris-setosa -5.1,3.3,1.7,0.5,Iris-setosa -4.8,3.4,1.9,0.2,Iris-setosa -5.0,3.0,1.6,0.2,Iris-setosa -5.0,3.4,1.6,0.4,Iris-setosa -5.2,3.5,1.5,0.2,Iris-setosa -5.2,3.4,1.4,0.2,Iris-setosa -4.7,3.2,1.6,0.2,Iris-setosa -4.8,3.1,1.6,0.2,Iris-setosa -5.4,3.4,1.5,0.4,Iris-setosa -5.2,4.1,1.5,0.1,Iris-setosa -5.5,4.2,1.4,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -5.0,3.2,1.2,0.2,Iris-setosa -5.5,3.5,1.3,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -4.4,3.0,1.3,0.2,Iris-setosa -5.1,3.4,1.5,0.2,Iris-setosa -5.0,3.5,1.3,0.3,Iris-setosa -4.5,2.3,1.3,0.3,Iris-setosa -4.4,3.2,1.3,0.2,Iris-setosa -5.0,3.5,1.6,0.6,Iris-setosa -5.1,3.8,1.9,0.4,Iris-setosa -4.8,3.0,1.4,0.3,Iris-setosa -5.1,3.8,1.6,0.2,Iris-setosa -4.6,3.2,1.4,0.2,Iris-setosa -5.3,3.7,1.5,0.2,Iris-setosa -5.0,3.3,1.4,0.2,Iris-setosa -7.0,3.2,4.7,1.4,Iris-versicolor -6.4,3.2,4.5,1.5,Iris-versicolor -6.9,3.1,4.9,1.5,Iris-versicolor -5.5,2.3,4.0,1.3,Iris-versicolor -6.5,2.8,4.6,1.5,Iris-versicolor -5.7,2.8,4.5,1.3,Iris-versicolor -6.3,3.3,4.7,1.6,Iris-versicolor 
-4.9,2.4,3.3,1.0,Iris-versicolor -6.6,2.9,4.6,1.3,Iris-versicolor -5.2,2.7,3.9,1.4,Iris-versicolor -5.0,2.0,3.5,1.0,Iris-versicolor -5.9,3.0,4.2,1.5,Iris-versicolor -6.0,2.2,4.0,1.0,Iris-versicolor -6.1,2.9,4.7,1.4,Iris-versicolor -5.6,2.9,3.6,1.3,Iris-versicolor -6.7,3.1,4.4,1.4,Iris-versicolor -5.6,3.0,4.5,1.5,Iris-versicolor -5.8,2.7,4.1,1.0,Iris-versicolor -6.2,2.2,4.5,1.5,Iris-versicolor -5.6,2.5,3.9,1.1,Iris-versicolor -5.9,3.2,4.8,1.8,Iris-versicolor -6.1,2.8,4.0,1.3,Iris-versicolor -6.3,2.5,4.9,1.5,Iris-versicolor -6.1,2.8,4.7,1.2,Iris-versicolor -6.4,2.9,4.3,1.3,Iris-versicolor -6.6,3.0,4.4,1.4,Iris-versicolor -6.8,2.8,4.8,1.4,Iris-versicolor -6.7,3.0,5.0,1.7,Iris-versicolor -6.0,2.9,4.5,1.5,Iris-versicolor -5.7,2.6,3.5,1.0,Iris-versicolor -5.5,2.4,3.8,1.1,Iris-versicolor -5.5,2.4,3.7,1.0,Iris-versicolor -5.8,2.7,3.9,1.2,Iris-versicolor -6.0,2.7,5.1,1.6,Iris-versicolor -5.4,3.0,4.5,1.5,Iris-versicolor -6.0,3.4,4.5,1.6,Iris-versicolor -6.7,3.1,4.7,1.5,Iris-versicolor -6.3,2.3,4.4,1.3,Iris-versicolor -5.6,3.0,4.1,1.3,Iris-versicolor -5.5,2.5,4.0,1.3,Iris-versicolor -5.5,2.6,4.4,1.2,Iris-versicolor -6.1,3.0,4.6,1.4,Iris-versicolor -5.8,2.6,4.0,1.2,Iris-versicolor -5.0,2.3,3.3,1.0,Iris-versicolor -5.6,2.7,4.2,1.3,Iris-versicolor -5.7,3.0,4.2,1.2,Iris-versicolor -5.7,2.9,4.2,1.3,Iris-versicolor -6.2,2.9,4.3,1.3,Iris-versicolor -5.1,2.5,3.0,1.1,Iris-versicolor -5.7,2.8,4.1,1.3,Iris-versicolor -6.3,3.3,6.0,2.5,Iris-virginica -5.8,2.7,5.1,1.9,Iris-virginica -7.1,3.0,5.9,2.1,Iris-virginica -6.3,2.9,5.6,1.8,Iris-virginica -6.5,3.0,5.8,2.2,Iris-virginica -7.6,3.0,6.6,2.1,Iris-virginica -4.9,2.5,4.5,1.7,Iris-virginica -7.3,2.9,6.3,1.8,Iris-virginica -6.7,2.5,5.8,1.8,Iris-virginica -7.2,3.6,6.1,2.5,Iris-virginica -6.5,3.2,5.1,2.0,Iris-virginica -6.4,2.7,5.3,1.9,Iris-virginica -6.8,3.0,5.5,2.1,Iris-virginica -5.7,2.5,5.0,2.0,Iris-virginica -5.8,2.8,5.1,2.4,Iris-virginica -6.4,3.2,5.3,2.3,Iris-virginica -6.5,3.0,5.5,1.8,Iris-virginica -7.7,3.8,6.7,2.2,Iris-virginica 
-7.7,2.6,6.9,2.3,Iris-virginica -6.0,2.2,5.0,1.5,Iris-virginica -6.9,3.2,5.7,2.3,Iris-virginica -5.6,2.8,4.9,2.0,Iris-virginica -7.7,2.8,6.7,2.0,Iris-virginica -6.3,2.7,4.9,1.8,Iris-virginica -6.7,3.3,5.7,2.1,Iris-virginica -7.2,3.2,6.0,1.8,Iris-virginica -6.2,2.8,4.8,1.8,Iris-virginica -6.1,3.0,4.9,1.8,Iris-virginica -6.4,2.8,5.6,2.1,Iris-virginica -7.2,3.0,5.8,1.6,Iris-virginica -7.4,2.8,6.1,1.9,Iris-virginica -7.9,3.8,6.4,2.0,Iris-virginica -6.4,2.8,5.6,2.2,Iris-virginica -6.3,2.8,5.1,1.5,Iris-virginica -6.1,2.6,5.6,1.4,Iris-virginica -7.7,3.0,6.1,2.3,Iris-virginica -6.3,3.4,5.6,2.4,Iris-virginica -6.4,3.1,5.5,1.8,Iris-virginica -6.0,3.0,4.8,1.8,Iris-virginica -6.9,3.1,5.4,2.1,Iris-virginica -6.7,3.1,5.6,2.4,Iris-virginica -6.9,3.1,5.1,2.3,Iris-virginica -5.8,2.7,5.1,1.9,Iris-virginica -6.8,3.2,5.9,2.3,Iris-virginica -6.7,3.3,5.7,2.5,Iris-virginica -6.7,3.0,5.2,2.3,Iris-virginica -6.3,2.5,5.0,1.9,Iris-virginica -6.5,3.0,5.2,2.0,Iris-virginica -6.2,3.4,5.4,2.3,Iris-virginica -5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/pandas/tests/data/tips.csv b/pandas/tests/data/tips.csv deleted file mode 100644 index 856a65a69e647..0000000000000 --- a/pandas/tests/data/tips.csv +++ /dev/null @@ -1,245 +0,0 @@ -total_bill,tip,sex,smoker,day,time,size -16.99,1.01,Female,No,Sun,Dinner,2 -10.34,1.66,Male,No,Sun,Dinner,3 -21.01,3.5,Male,No,Sun,Dinner,3 -23.68,3.31,Male,No,Sun,Dinner,2 -24.59,3.61,Female,No,Sun,Dinner,4 -25.29,4.71,Male,No,Sun,Dinner,4 -8.77,2.0,Male,No,Sun,Dinner,2 -26.88,3.12,Male,No,Sun,Dinner,4 -15.04,1.96,Male,No,Sun,Dinner,2 -14.78,3.23,Male,No,Sun,Dinner,2 -10.27,1.71,Male,No,Sun,Dinner,2 -35.26,5.0,Female,No,Sun,Dinner,4 -15.42,1.57,Male,No,Sun,Dinner,2 -18.43,3.0,Male,No,Sun,Dinner,4 -14.83,3.02,Female,No,Sun,Dinner,2 -21.58,3.92,Male,No,Sun,Dinner,2 -10.33,1.67,Female,No,Sun,Dinner,3 -16.29,3.71,Male,No,Sun,Dinner,3 -16.97,3.5,Female,No,Sun,Dinner,3 -20.65,3.35,Male,No,Sat,Dinner,3 -17.92,4.08,Male,No,Sat,Dinner,2 
-20.29,2.75,Female,No,Sat,Dinner,2 -15.77,2.23,Female,No,Sat,Dinner,2 -39.42,7.58,Male,No,Sat,Dinner,4 -19.82,3.18,Male,No,Sat,Dinner,2 -17.81,2.34,Male,No,Sat,Dinner,4 -13.37,2.0,Male,No,Sat,Dinner,2 -12.69,2.0,Male,No,Sat,Dinner,2 -21.7,4.3,Male,No,Sat,Dinner,2 -19.65,3.0,Female,No,Sat,Dinner,2 -9.55,1.45,Male,No,Sat,Dinner,2 -18.35,2.5,Male,No,Sat,Dinner,4 -15.06,3.0,Female,No,Sat,Dinner,2 -20.69,2.45,Female,No,Sat,Dinner,4 -17.78,3.27,Male,No,Sat,Dinner,2 -24.06,3.6,Male,No,Sat,Dinner,3 -16.31,2.0,Male,No,Sat,Dinner,3 -16.93,3.07,Female,No,Sat,Dinner,3 -18.69,2.31,Male,No,Sat,Dinner,3 -31.27,5.0,Male,No,Sat,Dinner,3 -16.04,2.24,Male,No,Sat,Dinner,3 -17.46,2.54,Male,No,Sun,Dinner,2 -13.94,3.06,Male,No,Sun,Dinner,2 -9.68,1.32,Male,No,Sun,Dinner,2 -30.4,5.6,Male,No,Sun,Dinner,4 -18.29,3.0,Male,No,Sun,Dinner,2 -22.23,5.0,Male,No,Sun,Dinner,2 -32.4,6.0,Male,No,Sun,Dinner,4 -28.55,2.05,Male,No,Sun,Dinner,3 -18.04,3.0,Male,No,Sun,Dinner,2 -12.54,2.5,Male,No,Sun,Dinner,2 -10.29,2.6,Female,No,Sun,Dinner,2 -34.81,5.2,Female,No,Sun,Dinner,4 -9.94,1.56,Male,No,Sun,Dinner,2 -25.56,4.34,Male,No,Sun,Dinner,4 -19.49,3.51,Male,No,Sun,Dinner,2 -38.01,3.0,Male,Yes,Sat,Dinner,4 -26.41,1.5,Female,No,Sat,Dinner,2 -11.24,1.76,Male,Yes,Sat,Dinner,2 -48.27,6.73,Male,No,Sat,Dinner,4 -20.29,3.21,Male,Yes,Sat,Dinner,2 -13.81,2.0,Male,Yes,Sat,Dinner,2 -11.02,1.98,Male,Yes,Sat,Dinner,2 -18.29,3.76,Male,Yes,Sat,Dinner,4 -17.59,2.64,Male,No,Sat,Dinner,3 -20.08,3.15,Male,No,Sat,Dinner,3 -16.45,2.47,Female,No,Sat,Dinner,2 -3.07,1.0,Female,Yes,Sat,Dinner,1 -20.23,2.01,Male,No,Sat,Dinner,2 -15.01,2.09,Male,Yes,Sat,Dinner,2 -12.02,1.97,Male,No,Sat,Dinner,2 -17.07,3.0,Female,No,Sat,Dinner,3 -26.86,3.14,Female,Yes,Sat,Dinner,2 -25.28,5.0,Female,Yes,Sat,Dinner,2 -14.73,2.2,Female,No,Sat,Dinner,2 -10.51,1.25,Male,No,Sat,Dinner,2 -17.92,3.08,Male,Yes,Sat,Dinner,2 -27.2,4.0,Male,No,Thur,Lunch,4 -22.76,3.0,Male,No,Thur,Lunch,2 -17.29,2.71,Male,No,Thur,Lunch,2 -19.44,3.0,Male,Yes,Thur,Lunch,2 
-16.66,3.4,Male,No,Thur,Lunch,2 -10.07,1.83,Female,No,Thur,Lunch,1 -32.68,5.0,Male,Yes,Thur,Lunch,2 -15.98,2.03,Male,No,Thur,Lunch,2 -34.83,5.17,Female,No,Thur,Lunch,4 -13.03,2.0,Male,No,Thur,Lunch,2 -18.28,4.0,Male,No,Thur,Lunch,2 -24.71,5.85,Male,No,Thur,Lunch,2 -21.16,3.0,Male,No,Thur,Lunch,2 -28.97,3.0,Male,Yes,Fri,Dinner,2 -22.49,3.5,Male,No,Fri,Dinner,2 -5.75,1.0,Female,Yes,Fri,Dinner,2 -16.32,4.3,Female,Yes,Fri,Dinner,2 -22.75,3.25,Female,No,Fri,Dinner,2 -40.17,4.73,Male,Yes,Fri,Dinner,4 -27.28,4.0,Male,Yes,Fri,Dinner,2 -12.03,1.5,Male,Yes,Fri,Dinner,2 -21.01,3.0,Male,Yes,Fri,Dinner,2 -12.46,1.5,Male,No,Fri,Dinner,2 -11.35,2.5,Female,Yes,Fri,Dinner,2 -15.38,3.0,Female,Yes,Fri,Dinner,2 -44.3,2.5,Female,Yes,Sat,Dinner,3 -22.42,3.48,Female,Yes,Sat,Dinner,2 -20.92,4.08,Female,No,Sat,Dinner,2 -15.36,1.64,Male,Yes,Sat,Dinner,2 -20.49,4.06,Male,Yes,Sat,Dinner,2 -25.21,4.29,Male,Yes,Sat,Dinner,2 -18.24,3.76,Male,No,Sat,Dinner,2 -14.31,4.0,Female,Yes,Sat,Dinner,2 -14.0,3.0,Male,No,Sat,Dinner,2 -7.25,1.0,Female,No,Sat,Dinner,1 -38.07,4.0,Male,No,Sun,Dinner,3 -23.95,2.55,Male,No,Sun,Dinner,2 -25.71,4.0,Female,No,Sun,Dinner,3 -17.31,3.5,Female,No,Sun,Dinner,2 -29.93,5.07,Male,No,Sun,Dinner,4 -10.65,1.5,Female,No,Thur,Lunch,2 -12.43,1.8,Female,No,Thur,Lunch,2 -24.08,2.92,Female,No,Thur,Lunch,4 -11.69,2.31,Male,No,Thur,Lunch,2 -13.42,1.68,Female,No,Thur,Lunch,2 -14.26,2.5,Male,No,Thur,Lunch,2 -15.95,2.0,Male,No,Thur,Lunch,2 -12.48,2.52,Female,No,Thur,Lunch,2 -29.8,4.2,Female,No,Thur,Lunch,6 -8.52,1.48,Male,No,Thur,Lunch,2 -14.52,2.0,Female,No,Thur,Lunch,2 -11.38,2.0,Female,No,Thur,Lunch,2 -22.82,2.18,Male,No,Thur,Lunch,3 -19.08,1.5,Male,No,Thur,Lunch,2 -20.27,2.83,Female,No,Thur,Lunch,2 -11.17,1.5,Female,No,Thur,Lunch,2 -12.26,2.0,Female,No,Thur,Lunch,2 -18.26,3.25,Female,No,Thur,Lunch,2 -8.51,1.25,Female,No,Thur,Lunch,2 -10.33,2.0,Female,No,Thur,Lunch,2 -14.15,2.0,Female,No,Thur,Lunch,2 -16.0,2.0,Male,Yes,Thur,Lunch,2 -13.16,2.75,Female,No,Thur,Lunch,2 
-17.47,3.5,Female,No,Thur,Lunch,2 -34.3,6.7,Male,No,Thur,Lunch,6 -41.19,5.0,Male,No,Thur,Lunch,5 -27.05,5.0,Female,No,Thur,Lunch,6 -16.43,2.3,Female,No,Thur,Lunch,2 -8.35,1.5,Female,No,Thur,Lunch,2 -18.64,1.36,Female,No,Thur,Lunch,3 -11.87,1.63,Female,No,Thur,Lunch,2 -9.78,1.73,Male,No,Thur,Lunch,2 -7.51,2.0,Male,No,Thur,Lunch,2 -14.07,2.5,Male,No,Sun,Dinner,2 -13.13,2.0,Male,No,Sun,Dinner,2 -17.26,2.74,Male,No,Sun,Dinner,3 -24.55,2.0,Male,No,Sun,Dinner,4 -19.77,2.0,Male,No,Sun,Dinner,4 -29.85,5.14,Female,No,Sun,Dinner,5 -48.17,5.0,Male,No,Sun,Dinner,6 -25.0,3.75,Female,No,Sun,Dinner,4 -13.39,2.61,Female,No,Sun,Dinner,2 -16.49,2.0,Male,No,Sun,Dinner,4 -21.5,3.5,Male,No,Sun,Dinner,4 -12.66,2.5,Male,No,Sun,Dinner,2 -16.21,2.0,Female,No,Sun,Dinner,3 -13.81,2.0,Male,No,Sun,Dinner,2 -17.51,3.0,Female,Yes,Sun,Dinner,2 -24.52,3.48,Male,No,Sun,Dinner,3 -20.76,2.24,Male,No,Sun,Dinner,2 -31.71,4.5,Male,No,Sun,Dinner,4 -10.59,1.61,Female,Yes,Sat,Dinner,2 -10.63,2.0,Female,Yes,Sat,Dinner,2 -50.81,10.0,Male,Yes,Sat,Dinner,3 -15.81,3.16,Male,Yes,Sat,Dinner,2 -7.25,5.15,Male,Yes,Sun,Dinner,2 -31.85,3.18,Male,Yes,Sun,Dinner,2 -16.82,4.0,Male,Yes,Sun,Dinner,2 -32.9,3.11,Male,Yes,Sun,Dinner,2 -17.89,2.0,Male,Yes,Sun,Dinner,2 -14.48,2.0,Male,Yes,Sun,Dinner,2 -9.6,4.0,Female,Yes,Sun,Dinner,2 -34.63,3.55,Male,Yes,Sun,Dinner,2 -34.65,3.68,Male,Yes,Sun,Dinner,4 -23.33,5.65,Male,Yes,Sun,Dinner,2 -45.35,3.5,Male,Yes,Sun,Dinner,3 -23.17,6.5,Male,Yes,Sun,Dinner,4 -40.55,3.0,Male,Yes,Sun,Dinner,2 -20.69,5.0,Male,No,Sun,Dinner,5 -20.9,3.5,Female,Yes,Sun,Dinner,3 -30.46,2.0,Male,Yes,Sun,Dinner,5 -18.15,3.5,Female,Yes,Sun,Dinner,3 -23.1,4.0,Male,Yes,Sun,Dinner,3 -15.69,1.5,Male,Yes,Sun,Dinner,2 -19.81,4.19,Female,Yes,Thur,Lunch,2 -28.44,2.56,Male,Yes,Thur,Lunch,2 -15.48,2.02,Male,Yes,Thur,Lunch,2 -16.58,4.0,Male,Yes,Thur,Lunch,2 -7.56,1.44,Male,No,Thur,Lunch,2 -10.34,2.0,Male,Yes,Thur,Lunch,2 -43.11,5.0,Female,Yes,Thur,Lunch,4 -13.0,2.0,Female,Yes,Thur,Lunch,2 -13.51,2.0,Male,Yes,Thur,Lunch,2 
-18.71,4.0,Male,Yes,Thur,Lunch,3 -12.74,2.01,Female,Yes,Thur,Lunch,2 -13.0,2.0,Female,Yes,Thur,Lunch,2 -16.4,2.5,Female,Yes,Thur,Lunch,2 -20.53,4.0,Male,Yes,Thur,Lunch,4 -16.47,3.23,Female,Yes,Thur,Lunch,3 -26.59,3.41,Male,Yes,Sat,Dinner,3 -38.73,3.0,Male,Yes,Sat,Dinner,4 -24.27,2.03,Male,Yes,Sat,Dinner,2 -12.76,2.23,Female,Yes,Sat,Dinner,2 -30.06,2.0,Male,Yes,Sat,Dinner,3 -25.89,5.16,Male,Yes,Sat,Dinner,4 -48.33,9.0,Male,No,Sat,Dinner,4 -13.27,2.5,Female,Yes,Sat,Dinner,2 -28.17,6.5,Female,Yes,Sat,Dinner,3 -12.9,1.1,Female,Yes,Sat,Dinner,2 -28.15,3.0,Male,Yes,Sat,Dinner,5 -11.59,1.5,Male,Yes,Sat,Dinner,2 -7.74,1.44,Male,Yes,Sat,Dinner,2 -30.14,3.09,Female,Yes,Sat,Dinner,4 -12.16,2.2,Male,Yes,Fri,Lunch,2 -13.42,3.48,Female,Yes,Fri,Lunch,2 -8.58,1.92,Male,Yes,Fri,Lunch,1 -15.98,3.0,Female,No,Fri,Lunch,3 -13.42,1.58,Male,Yes,Fri,Lunch,2 -16.27,2.5,Female,Yes,Fri,Lunch,2 -10.09,2.0,Female,Yes,Fri,Lunch,2 -20.45,3.0,Male,No,Sat,Dinner,4 -13.28,2.72,Male,No,Sat,Dinner,2 -22.12,2.88,Female,Yes,Sat,Dinner,2 -24.01,2.0,Male,Yes,Sat,Dinner,4 -15.69,3.0,Male,Yes,Sat,Dinner,3 -11.61,3.39,Male,No,Sat,Dinner,2 -10.77,1.47,Male,No,Sat,Dinner,2 -15.53,3.0,Male,Yes,Sat,Dinner,2 -10.07,1.25,Male,No,Sat,Dinner,2 -12.6,1.0,Male,Yes,Sat,Dinner,2 -32.83,1.17,Male,Yes,Sat,Dinner,2 -35.83,4.67,Female,No,Sat,Dinner,3 -29.03,5.92,Male,No,Sat,Dinner,3 -27.18,2.0,Female,Yes,Sat,Dinner,2 -22.67,2.0,Male,Yes,Sat,Dinner,2 -17.82,1.75,Male,No,Sat,Dinner,2 -18.78,3.0,Female,No,Thur,Dinner,2 diff --git a/pandas/tests/io/parser/data/iris.csv b/pandas/tests/io/parser/data/iris.csv deleted file mode 100644 index c19b9c3688515..0000000000000 --- a/pandas/tests/io/parser/data/iris.csv +++ /dev/null @@ -1,151 +0,0 @@ -SepalLength,SepalWidth,PetalLength,PetalWidth,Name -5.1,3.5,1.4,0.2,Iris-setosa -4.9,3.0,1.4,0.2,Iris-setosa -4.7,3.2,1.3,0.2,Iris-setosa -4.6,3.1,1.5,0.2,Iris-setosa -5.0,3.6,1.4,0.2,Iris-setosa -5.4,3.9,1.7,0.4,Iris-setosa -4.6,3.4,1.4,0.3,Iris-setosa -5.0,3.4,1.5,0.2,Iris-setosa 
-4.4,2.9,1.4,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -5.4,3.7,1.5,0.2,Iris-setosa -4.8,3.4,1.6,0.2,Iris-setosa -4.8,3.0,1.4,0.1,Iris-setosa -4.3,3.0,1.1,0.1,Iris-setosa -5.8,4.0,1.2,0.2,Iris-setosa -5.7,4.4,1.5,0.4,Iris-setosa -5.4,3.9,1.3,0.4,Iris-setosa -5.1,3.5,1.4,0.3,Iris-setosa -5.7,3.8,1.7,0.3,Iris-setosa -5.1,3.8,1.5,0.3,Iris-setosa -5.4,3.4,1.7,0.2,Iris-setosa -5.1,3.7,1.5,0.4,Iris-setosa -4.6,3.6,1.0,0.2,Iris-setosa -5.1,3.3,1.7,0.5,Iris-setosa -4.8,3.4,1.9,0.2,Iris-setosa -5.0,3.0,1.6,0.2,Iris-setosa -5.0,3.4,1.6,0.4,Iris-setosa -5.2,3.5,1.5,0.2,Iris-setosa -5.2,3.4,1.4,0.2,Iris-setosa -4.7,3.2,1.6,0.2,Iris-setosa -4.8,3.1,1.6,0.2,Iris-setosa -5.4,3.4,1.5,0.4,Iris-setosa -5.2,4.1,1.5,0.1,Iris-setosa -5.5,4.2,1.4,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -5.0,3.2,1.2,0.2,Iris-setosa -5.5,3.5,1.3,0.2,Iris-setosa -4.9,3.1,1.5,0.1,Iris-setosa -4.4,3.0,1.3,0.2,Iris-setosa -5.1,3.4,1.5,0.2,Iris-setosa -5.0,3.5,1.3,0.3,Iris-setosa -4.5,2.3,1.3,0.3,Iris-setosa -4.4,3.2,1.3,0.2,Iris-setosa -5.0,3.5,1.6,0.6,Iris-setosa -5.1,3.8,1.9,0.4,Iris-setosa -4.8,3.0,1.4,0.3,Iris-setosa -5.1,3.8,1.6,0.2,Iris-setosa -4.6,3.2,1.4,0.2,Iris-setosa -5.3,3.7,1.5,0.2,Iris-setosa -5.0,3.3,1.4,0.2,Iris-setosa -7.0,3.2,4.7,1.4,Iris-versicolor -6.4,3.2,4.5,1.5,Iris-versicolor -6.9,3.1,4.9,1.5,Iris-versicolor -5.5,2.3,4.0,1.3,Iris-versicolor -6.5,2.8,4.6,1.5,Iris-versicolor -5.7,2.8,4.5,1.3,Iris-versicolor -6.3,3.3,4.7,1.6,Iris-versicolor -4.9,2.4,3.3,1.0,Iris-versicolor -6.6,2.9,4.6,1.3,Iris-versicolor -5.2,2.7,3.9,1.4,Iris-versicolor -5.0,2.0,3.5,1.0,Iris-versicolor -5.9,3.0,4.2,1.5,Iris-versicolor -6.0,2.2,4.0,1.0,Iris-versicolor -6.1,2.9,4.7,1.4,Iris-versicolor -5.6,2.9,3.6,1.3,Iris-versicolor -6.7,3.1,4.4,1.4,Iris-versicolor -5.6,3.0,4.5,1.5,Iris-versicolor -5.8,2.7,4.1,1.0,Iris-versicolor -6.2,2.2,4.5,1.5,Iris-versicolor -5.6,2.5,3.9,1.1,Iris-versicolor -5.9,3.2,4.8,1.8,Iris-versicolor -6.1,2.8,4.0,1.3,Iris-versicolor -6.3,2.5,4.9,1.5,Iris-versicolor 
-6.1,2.8,4.7,1.2,Iris-versicolor -6.4,2.9,4.3,1.3,Iris-versicolor -6.6,3.0,4.4,1.4,Iris-versicolor -6.8,2.8,4.8,1.4,Iris-versicolor -6.7,3.0,5.0,1.7,Iris-versicolor -6.0,2.9,4.5,1.5,Iris-versicolor -5.7,2.6,3.5,1.0,Iris-versicolor -5.5,2.4,3.8,1.1,Iris-versicolor -5.5,2.4,3.7,1.0,Iris-versicolor -5.8,2.7,3.9,1.2,Iris-versicolor -6.0,2.7,5.1,1.6,Iris-versicolor -5.4,3.0,4.5,1.5,Iris-versicolor -6.0,3.4,4.5,1.6,Iris-versicolor -6.7,3.1,4.7,1.5,Iris-versicolor -6.3,2.3,4.4,1.3,Iris-versicolor -5.6,3.0,4.1,1.3,Iris-versicolor -5.5,2.5,4.0,1.3,Iris-versicolor -5.5,2.6,4.4,1.2,Iris-versicolor -6.1,3.0,4.6,1.4,Iris-versicolor -5.8,2.6,4.0,1.2,Iris-versicolor -5.0,2.3,3.3,1.0,Iris-versicolor -5.6,2.7,4.2,1.3,Iris-versicolor -5.7,3.0,4.2,1.2,Iris-versicolor -5.7,2.9,4.2,1.3,Iris-versicolor -6.2,2.9,4.3,1.3,Iris-versicolor -5.1,2.5,3.0,1.1,Iris-versicolor -5.7,2.8,4.1,1.3,Iris-versicolor -6.3,3.3,6.0,2.5,Iris-virginica -5.8,2.7,5.1,1.9,Iris-virginica -7.1,3.0,5.9,2.1,Iris-virginica -6.3,2.9,5.6,1.8,Iris-virginica -6.5,3.0,5.8,2.2,Iris-virginica -7.6,3.0,6.6,2.1,Iris-virginica -4.9,2.5,4.5,1.7,Iris-virginica -7.3,2.9,6.3,1.8,Iris-virginica -6.7,2.5,5.8,1.8,Iris-virginica -7.2,3.6,6.1,2.5,Iris-virginica -6.5,3.2,5.1,2.0,Iris-virginica -6.4,2.7,5.3,1.9,Iris-virginica -6.8,3.0,5.5,2.1,Iris-virginica -5.7,2.5,5.0,2.0,Iris-virginica -5.8,2.8,5.1,2.4,Iris-virginica -6.4,3.2,5.3,2.3,Iris-virginica -6.5,3.0,5.5,1.8,Iris-virginica -7.7,3.8,6.7,2.2,Iris-virginica -7.7,2.6,6.9,2.3,Iris-virginica -6.0,2.2,5.0,1.5,Iris-virginica -6.9,3.2,5.7,2.3,Iris-virginica -5.6,2.8,4.9,2.0,Iris-virginica -7.7,2.8,6.7,2.0,Iris-virginica -6.3,2.7,4.9,1.8,Iris-virginica -6.7,3.3,5.7,2.1,Iris-virginica -7.2,3.2,6.0,1.8,Iris-virginica -6.2,2.8,4.8,1.8,Iris-virginica -6.1,3.0,4.9,1.8,Iris-virginica -6.4,2.8,5.6,2.1,Iris-virginica -7.2,3.0,5.8,1.6,Iris-virginica -7.4,2.8,6.1,1.9,Iris-virginica -7.9,3.8,6.4,2.0,Iris-virginica -6.4,2.8,5.6,2.2,Iris-virginica -6.3,2.8,5.1,1.5,Iris-virginica 
-6.1,2.6,5.6,1.4,Iris-virginica -7.7,3.0,6.1,2.3,Iris-virginica -6.3,3.4,5.6,2.4,Iris-virginica -6.4,3.1,5.5,1.8,Iris-virginica -6.0,3.0,4.8,1.8,Iris-virginica -6.9,3.1,5.4,2.1,Iris-virginica -6.7,3.1,5.6,2.4,Iris-virginica -6.9,3.1,5.1,2.3,Iris-virginica -5.8,2.7,5.1,1.9,Iris-virginica -6.8,3.2,5.9,2.3,Iris-virginica -6.7,3.3,5.7,2.5,Iris-virginica -6.7,3.0,5.2,2.3,Iris-virginica -6.3,2.5,5.0,1.9,Iris-virginica -6.5,3.0,5.2,2.0,Iris-virginica -6.2,3.4,5.4,2.3,Iris-virginica -5.9,3.0,5.1,1.8,Iris-virginica \ No newline at end of file diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b27b028694d20..6f1d4daeb39cb 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -207,8 +207,8 @@ def test_read_expands_user_home_dir( @pytest.mark.parametrize( "reader, module, path", [ - (pd.read_csv, "os", ("data", "iris.csv")), - (pd.read_table, "os", ("data", "iris.csv")), + (pd.read_csv, "os", ("io", "data", "csv", "iris.csv")), + (pd.read_table, "os", ("io", "data", "csv", "iris.csv")), ( pd.read_fwf, "os", diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index bd53785e89bfe..7d4716e1b7d0c 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -278,7 +278,7 @@ def _get_exec(self): else: return self.conn.cursor() - @pytest.fixture(params=[("data", "iris.csv")]) + @pytest.fixture(params=[("io", "data", "csv", "iris.csv")]) def load_iris_data(self, datapath, request): import io diff --git a/pandas/tests/util/test_util.py b/pandas/tests/util/test_util.py index 8860e6fe272ce..d73a789b876f4 100644 --- a/pandas/tests/util/test_util.py +++ b/pandas/tests/util/test_util.py @@ -58,7 +58,7 @@ def test_datapath_missing(datapath): def test_datapath(datapath): - args = ("data", "iris.csv") + args = ("io", "data", "csv", "iris.csv") result = datapath(*args) expected = os.path.join(os.path.dirname(os.path.dirname(__file__)), *args) From 073792e0630224f39efdd4745907804e679d0699 Mon Sep 17 
00:00:00 2001 From: Simon Hawkins Date: Sat, 6 Jun 2020 07:33:08 +0100 Subject: [PATCH 0022/1025] PERF: isinstance ABCIndexClass and ABCExtensionArray (#34607) --- pandas/core/dtypes/generic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 09b039e317424..36eff214fc314 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -38,7 +38,7 @@ def _check(cls, inst) -> bool: ABCIndexClass = create_pandas_abc_type( "ABCIndexClass", "_typ", - ( + { "index", "int64index", "rangeindex", @@ -50,7 +50,7 @@ def _check(cls, inst) -> bool: "periodindex", "categoricalindex", "intervalindex", - ), + }, ) ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) @@ -66,6 +66,6 @@ def _check(cls, inst) -> bool: "ABCExtensionArray", "_typ", # Note: IntervalArray and SparseArray are included bc they have _typ="extension" - ("extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"), + {"extension", "categorical", "periodarray", "datetimearray", "timedeltaarray"}, ) ABCPandasArray = create_pandas_abc_type("ABCPandasArray", "_typ", ("npy_extension",)) From 424109b0de8474045308acf011610ce4204f6cc0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 6 Jun 2020 10:29:19 +0200 Subject: [PATCH 0023/1025] TST/REF: refactor the arithmetic tests for IntegerArray (#34454) --- .../tests/arrays/integer/test_arithmetic.py | 577 ++++++++++-------- 1 file changed, 315 insertions(+), 262 deletions(-) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index 18f1dac3c13b2..a6c47f3192175 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -1,302 +1,355 @@ +import operator + import numpy as np import pytest import pandas as pd import pandas._testing as tm -from pandas.api.types import is_float, is_float_dtype, is_scalar -from 
pandas.core.arrays import IntegerArray, integer_array -from pandas.tests.extension.base import BaseOpsUtil - - -class TestArithmeticOps(BaseOpsUtil): - def _check_divmod_op(self, s, op, other, exc=None): - super()._check_divmod_op(s, op, other, None) - - def _check_op(self, s, op_name, other, exc=None): - op = self.get_op_from_name(op_name) - result = op(s, other) - - # compute expected - mask = s.isna() - - # if s is a DataFrame, squeeze to a Series - # for comparison - if isinstance(s, pd.DataFrame): - result = result.squeeze() - s = s.squeeze() - mask = mask.squeeze() - - # other array is an Integer - if isinstance(other, IntegerArray): - omask = getattr(other, "mask", None) - mask = getattr(other, "data", other) - if omask is not None: - mask |= omask - - # 1 ** na is na, so need to unmask those - if op_name == "__pow__": - mask = np.where(~s.isna() & (s == 1), False, mask) - - elif op_name == "__rpow__": - other_is_one = other == 1 - if isinstance(other_is_one, pd.Series): - other_is_one = other_is_one.fillna(False) - mask = np.where(other_is_one, False, mask) - - # float result type or float op - if ( - is_float_dtype(other) - or is_float(other) - or op_name in ["__rtruediv__", "__truediv__", "__rdiv__", "__div__"] - ): - rs = s.astype("float") - expected = op(rs, other) - self._check_op_float(result, expected, mask, s, op_name, other) - - # integer result type +from pandas.core.arrays import ExtensionArray, integer_array +import pandas.core.ops as ops + + +# TODO need to use existing utility function or move this somewhere central +def get_op_from_name(op_name): + short_opname = op_name.strip("_") + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op + + +# Basic test for the arithmetic array ops +# ----------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "opname, 
exp", + [("add", [1, 3, None, None, 9]), ("mul", [0, 2, None, None, 20])], + ids=["add", "mul"], +) +def test_add_mul(dtype, opname, exp): + a = pd.array([0, 1, None, 3, 4], dtype=dtype) + b = pd.array([1, 2, 3, None, 5], dtype=dtype) + + # array / array + expected = pd.array(exp, dtype=dtype) + + op = getattr(operator, opname) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + + op = getattr(ops, "r" + opname) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + + +def test_sub(dtype): + a = pd.array([1, 2, 3, None, 5], dtype=dtype) + b = pd.array([0, 1, None, 3, 4], dtype=dtype) + + result = a - b + expected = pd.array([1, 1, None, None, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_div(dtype): + # for now division gives a float numpy array + a = pd.array([1, 2, 3, None, 5], dtype=dtype) + b = pd.array([0, 1, None, 3, 4], dtype=dtype) + + result = a / b + expected = np.array([np.inf, 2, np.nan, np.nan, 1.25], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) +def test_divide_by_zero(zero, negative): + # https://github.com/pandas-dev/pandas/issues/27398 + a = pd.array([0, 1, -1, None], dtype="Int64") + result = a / zero + expected = np.array([np.nan, np.inf, -np.inf, np.nan]) + if negative: + expected *= -1 + tm.assert_numpy_array_equal(result, expected) + + +def test_floordiv(dtype): + a = pd.array([1, 2, 3, None, 5], dtype=dtype) + b = pd.array([0, 1, None, 3, 4], dtype=dtype) + + result = a // b + # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) + expected = pd.array([0, 2, None, None, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_mod(dtype): + a = pd.array([1, 2, 3, None, 5], dtype=dtype) + b = pd.array([0, 1, None, 3, 4], dtype=dtype) + + result = a % b + expected = pd.array([0, 0, None, None, 1], dtype=dtype) + 
tm.assert_extension_array_equal(result, expected) + + +def test_pow_scalar(): + a = pd.array([-1, 0, 1, None, 2], dtype="Int64") + result = a ** 0 + expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** 1 + expected = pd.array([-1, 0, 1, None, 2], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** pd.NA + expected = pd.array([None, None, 1, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = a ** np.nan + expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # reversed + a = a[1:] # Can't raise integers to negative powers. + + result = 0 ** a + expected = pd.array([1, 0, None, 0], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = 1 ** a + expected = pd.array([1, 1, 1, 1], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + result = np.nan ** a + expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + +def test_pow_array(): + a = integer_array([0, 0, 0, 1, 1, 1, None, None, None]) + b = integer_array([0, 1, None, 0, 1, None, 0, 1, None]) + result = a ** b + expected = integer_array([1, 0, None, 1, 1, 1, 1, None, None]) + tm.assert_extension_array_equal(result, expected) + + +def test_rpow_one_to_na(): + # https://github.com/pandas-dev/pandas/issues/22022 + # https://github.com/pandas-dev/pandas/issues/29997 + arr = integer_array([np.nan, np.nan]) + result = np.array([1.0, 2.0]) ** arr + expected = np.array([1.0, np.nan]) + tm.assert_numpy_array_equal(result, expected) + + +# Test equivalence of scalars, numpy arrays with array ops +# ----------------------------------------------------------------------------- + 
+ +def test_array_scalar_like_equivalence(data, all_arithmetic_operators): + op = get_op_from_name(all_arithmetic_operators) + + scalar = 2 + scalar_array = pd.array([2] * len(data), dtype=data.dtype) + + # TODO also add len-1 array (np.array([2], dtype=data.dtype.numpy_dtype)) + for scalar in [2, data.dtype.type(2)]: + result = op(data, scalar) + expected = op(data, scalar_array) + if isinstance(expected, ExtensionArray): + tm.assert_extension_array_equal(result, expected) else: - rs = pd.Series(s.values._data, name=s.name) - expected = op(rs, other) - self._check_op_integer(result, expected, mask, s, op_name, other) - - def _check_op_float(self, result, expected, mask, s, op_name, other): - # check comparisons that are resulting in float dtypes - - expected[mask] = np.nan - if "floordiv" in op_name: - # Series op sets 1//0 to np.inf, which IntegerArray does not do (yet) - mask2 = np.isinf(expected) & np.isnan(result) - expected[mask2] = np.nan - tm.assert_series_equal(result, expected) - - def _check_op_integer(self, result, expected, mask, s, op_name, other): - # check comparisons that are resulting in integer dtypes - - # to compare properly, we convert the expected - # to float, mask to nans and convert infs - # if we have uints then we process as uints - # then convert to float - # and we ultimately want to create a IntArray - # for comparisons - - fill_value = 0 - - # mod/rmod turn floating 0 into NaN while - # integer works as expected (no nan) - if op_name in ["__mod__", "__rmod__"]: - if is_scalar(other): - if other == 0: - expected[s.values == 0] = 0 - else: - expected = expected.fillna(0) - else: - expected[ - (s.values == 0).fillna(False) - & ((expected == 0).fillna(False) | expected.isna()) - ] = 0 - try: - expected[ - ((expected == np.inf) | (expected == -np.inf)).fillna(False) - ] = fill_value - original = expected - expected = expected.astype(s.dtype) - - except ValueError: - - expected = expected.astype(float) - expected[ - ((expected == np.inf) | 
(expected == -np.inf)).fillna(False) - ] = fill_value - original = expected - expected = expected.astype(s.dtype) - - expected[mask] = pd.NA - - # assert that the expected astype is ok - # (skip for unsigned as they have wrap around) - if not s.dtype.is_unsigned_integer: - original = pd.Series(original) - - # we need to fill with 0's to emulate what an astype('int') does - # (truncation) for certain ops - if op_name in ["__rtruediv__", "__rdiv__"]: - mask |= original.isna() - original = original.fillna(0).astype("int") - - original = original.astype("float") - original[mask] = np.nan - tm.assert_series_equal(original, expected.astype("float")) - - # assert our expected result - tm.assert_series_equal(result, expected) - - def test_arith_integer_array(self, data, all_arithmetic_operators): - # we operate with a rhs of an integer array - - op = all_arithmetic_operators + # TODO div still gives float ndarray -> remove this once we have Float EA + tm.assert_numpy_array_equal(result, expected) - s = pd.Series(data) - rhs = pd.Series([1] * len(data), dtype=data.dtype) - rhs.iloc[-1] = np.nan - self._check_op(s, op, rhs) +def test_array_NA(data, all_arithmetic_operators): + if "truediv" in all_arithmetic_operators: + pytest.skip("division with pd.NA raises") + op = get_op_from_name(all_arithmetic_operators) - def test_arith_series_with_scalar(self, data, all_arithmetic_operators): - # scalar - op = all_arithmetic_operators - s = pd.Series(data) - self._check_op(s, op, 1, exc=TypeError) + scalar = pd.NA + scalar_array = pd.array([pd.NA] * len(data), dtype=data.dtype) - def test_arith_frame_with_scalar(self, data, all_arithmetic_operators): - # frame & scalar - op = all_arithmetic_operators - df = pd.DataFrame({"A": data}) - self._check_op(df, op, 1, exc=TypeError) + result = op(data, scalar) + expected = op(data, scalar_array) + tm.assert_extension_array_equal(result, expected) - def test_arith_series_with_array(self, data, all_arithmetic_operators): - # ndarray & other 
series - op = all_arithmetic_operators - s = pd.Series(data) - other = np.ones(len(s), dtype=s.dtype.type) - self._check_op(s, op, other, exc=TypeError) - def test_arith_coerce_scalar(self, data, all_arithmetic_operators): +def test_numpy_array_equivalence(data, all_arithmetic_operators): + op = get_op_from_name(all_arithmetic_operators) - op = all_arithmetic_operators - s = pd.Series(data) + numpy_array = np.array([2] * len(data), dtype=data.dtype.numpy_dtype) + pd_array = pd.array(numpy_array, dtype=data.dtype) + + result = op(data, numpy_array) + expected = op(data, pd_array) + if isinstance(expected, ExtensionArray): + tm.assert_extension_array_equal(result, expected) + else: + # TODO div still gives float ndarray -> remove this once we have Float EA + tm.assert_numpy_array_equal(result, expected) - other = 0.01 - self._check_op(s, op, other) - @pytest.mark.parametrize("other", [1.0, np.array(1.0)]) - def test_arithmetic_conversion(self, all_arithmetic_operators, other): - # if we have a float operand we should have a float result - # if that is equal to an integer - op = self.get_op_from_name(all_arithmetic_operators) +@pytest.mark.parametrize("other", [0, 0.5]) +def test_numpy_zero_dim_ndarray(other): + arr = integer_array([1, None, 2]) + result = arr + np.array(other) + expected = arr + other + tm.assert_equal(result, expected) - s = pd.Series([1, 2, 3], dtype="Int64") - result = op(s, other) - assert result.dtype is np.dtype("float") - def test_arith_len_mismatch(self, all_arithmetic_operators): - # operating with a list-like with non-matching length raises - op = self.get_op_from_name(all_arithmetic_operators) - other = np.array([1.0]) +# Test equivalence with Series and DataFrame ops +# ----------------------------------------------------------------------------- - s = pd.Series([1, 2, 3], dtype="Int64") - with pytest.raises(ValueError, match="Lengths must match"): - op(s, other) - @pytest.mark.parametrize("other", [0, 0.5]) - def 
test_arith_zero_dim_ndarray(self, other): - arr = integer_array([1, None, 2]) - result = arr + np.array(other) - expected = arr + other - tm.assert_equal(result, expected) +def test_frame(data, all_arithmetic_operators): + op = get_op_from_name(all_arithmetic_operators) - def test_error(self, data, all_arithmetic_operators): - # invalid ops + # DataFrame with scalar + df = pd.DataFrame({"A": data}) + scalar = 2 - op = all_arithmetic_operators - s = pd.Series(data) - ops = getattr(s, op) - opa = getattr(data, op) + result = op(df, scalar) + expected = pd.DataFrame({"A": op(data, scalar)}) + tm.assert_frame_equal(result, expected) + + +def test_series(data, all_arithmetic_operators): + op = get_op_from_name(all_arithmetic_operators) + + s = pd.Series(data) + + # Series with scalar + scalar = 2 + result = op(s, scalar) + expected = pd.Series(op(data, scalar)) + tm.assert_series_equal(result, expected) + + # Series with np.ndarray + other = np.ones(len(data), dtype=data.dtype.type) + result = op(s, other) + expected = pd.Series(op(data, other)) + tm.assert_series_equal(result, expected) - # invalid scalars + # Series with pd.array + other = pd.array(np.ones(len(data)), dtype=data.dtype) + result = op(s, other) + expected = pd.Series(op(data, other)) + tm.assert_series_equal(result, expected) + + # Series with Series + other = pd.Series(np.ones(len(data)), dtype=data.dtype) + result = op(s, other) + expected = pd.Series(op(data, other.array)) + tm.assert_series_equal(result, expected) + + +# Test generic charachteristics / errors +# ----------------------------------------------------------------------------- + + +def test_error_invalid_values(data, all_arithmetic_operators): + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + + # invalid scalars + msg = ( + r"(:?can only perform ops with numeric values)" + r"|(:?IntegerArray cannot perform the operation mod)" + ) + with pytest.raises(TypeError, match=msg): + ops("foo") + with 
pytest.raises(TypeError, match=msg): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + with pytest.raises(TypeError, match=msg): + ops(pd.Series("foo", index=s.index)) + + if op != "__rpow__": + # TODO(extension) + # rpow with a datetimelike coerces the integer array incorrectly msg = ( - r"(:?can only perform ops with numeric values)" - r"|(:?IntegerArray cannot perform the operation mod)" + "can only perform ops with numeric values|" + "cannot perform .* with this index type: DatetimeArray|" + "Addition/subtraction of integers and integer-arrays " + "with DatetimeArray is no longer supported. *" ) with pytest.raises(TypeError, match=msg): - ops("foo") - with pytest.raises(TypeError, match=msg): - ops(pd.Timestamp("20180101")) + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) - # invalid array-likes - with pytest.raises(TypeError, match=msg): - ops(pd.Series("foo", index=s.index)) - - if op != "__rpow__": - # TODO(extension) - # rpow with a datetimelike coerces the integer array incorrectly - msg = ( - "can only perform ops with numeric values|" - "cannot perform .* with this index type: DatetimeArray|" - "Addition/subtraction of integers and integer-arrays " - "with DatetimeArray is no longer supported. 
*" - ) - with pytest.raises(TypeError, match=msg): - ops(pd.Series(pd.date_range("20180101", periods=len(s)))) - - # 2d - result = opa(pd.DataFrame({"A": s})) - assert result is NotImplemented - - msg = r"can only perform ops with 1-d structures" - with pytest.raises(NotImplementedError, match=msg): - opa(np.arange(len(s)).reshape(-1, len(s))) - - @pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) - def test_divide_by_zero(self, zero, negative): - # https://github.com/pandas-dev/pandas/issues/27398 - a = pd.array([0, 1, -1, None], dtype="Int64") - result = a / zero - expected = np.array([np.nan, np.inf, -np.inf, np.nan]) - if negative: - expected *= -1 - tm.assert_numpy_array_equal(result, expected) - def test_pow_scalar(self): - a = pd.array([-1, 0, 1, None, 2], dtype="Int64") - result = a ** 0 - expected = pd.array([1, 1, 1, 1, 1], dtype="Int64") - tm.assert_extension_array_equal(result, expected) +def test_error_invalid_object(data, all_arithmetic_operators): - result = a ** 1 - expected = pd.array([-1, 0, 1, None, 2], dtype="Int64") - tm.assert_extension_array_equal(result, expected) + op = all_arithmetic_operators + opa = getattr(data, op) - result = a ** pd.NA - expected = pd.array([None, None, 1, None, None], dtype="Int64") - tm.assert_extension_array_equal(result, expected) + # 2d -> return NotImplemented + result = opa(pd.DataFrame({"A": data})) + assert result is NotImplemented - result = a ** np.nan - expected = np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) + msg = r"can only perform ops with 1-d structures" + with pytest.raises(NotImplementedError, match=msg): + opa(np.arange(len(data)).reshape(-1, len(data))) - # reversed - a = a[1:] # Can't raise integers to negative powers. 
- result = 0 ** a - expected = pd.array([1, 0, None, 0], dtype="Int64") - tm.assert_extension_array_equal(result, expected) +def test_error_len_mismatch(all_arithmetic_operators): + # operating with a list-like with non-matching length raises + op = get_op_from_name(all_arithmetic_operators) - result = 1 ** a - expected = pd.array([1, 1, 1, 1], dtype="Int64") - tm.assert_extension_array_equal(result, expected) + data = pd.array([1, 2, 3], dtype="Int64") - result = pd.NA ** a - expected = pd.array([1, None, None, None], dtype="Int64") - tm.assert_extension_array_equal(result, expected) + for other in [[1, 2], np.array([1.0, 2.0])]: + with pytest.raises(ValueError, match="Lengths must match"): + op(data, other) - result = np.nan ** a - expected = np.array([1, np.nan, np.nan, np.nan], dtype="float64") - tm.assert_numpy_array_equal(result, expected) + s = pd.Series(data) + with pytest.raises(ValueError, match="Lengths must match"): + op(s, other) - def test_pow_array(self): - a = integer_array([0, 0, 0, 1, 1, 1, None, None, None]) - b = integer_array([0, 1, None, 0, 1, None, 0, 1, None]) - result = a ** b - expected = integer_array([1, 0, None, 1, 1, 1, 1, None, None]) - tm.assert_extension_array_equal(result, expected) - def test_rpow_one_to_na(self): - # https://github.com/pandas-dev/pandas/issues/22022 - # https://github.com/pandas-dev/pandas/issues/29997 - arr = integer_array([np.nan, np.nan]) - result = np.array([1.0, 2.0]) ** arr - expected = np.array([1.0, np.nan]) - tm.assert_numpy_array_equal(result, expected) +# Various +# ----------------------------------------------------------------------------- + + +# TODO test unsigned overflow + + +def test_arith_coerce_scalar(data, all_arithmetic_operators): + op = get_op_from_name(all_arithmetic_operators) + s = pd.Series(data) + other = 0.01 + + result = op(s, other) + expected = op(s.astype(float), other) + # rfloordiv results in nan instead of inf + if all_arithmetic_operators == "__rfloordiv__": + 
expected[(expected == np.inf) | (expected == -np.inf)] = np.nan + + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("other", [1.0, np.array(1.0)]) +def test_arithmetic_conversion(all_arithmetic_operators, other): + # if we have a float operand we should have a float result + # if that is equal to an integer + op = get_op_from_name(all_arithmetic_operators) + + s = pd.Series([1, 2, 3], dtype="Int64") + result = op(s, other) + assert result.dtype is np.dtype("float") def test_cross_type_arithmetic(): From d74fd67378b1fa40be78235a63b671ae9917244f Mon Sep 17 00:00:00 2001 From: willpeppo Date: Sat, 6 Jun 2020 18:23:52 -0400 Subject: [PATCH 0024/1025] DOC: updated core/arrays/base.py for PR08 errors (#34624) --- pandas/core/arrays/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b5e917bafca7e..79f0039a9df65 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -512,7 +512,7 @@ def argsort( kind : {'quicksort', 'mergesort', 'heapsort'}, optional Sorting algorithm. *args, **kwargs: - passed through to :func:`numpy.argsort`. + Passed through to :func:`numpy.argsort`. Returns ------- From 3cddcef3031406a04cdd26efff8e21af3ef57350 Mon Sep 17 00:00:00 2001 From: Marian Denes Date: Sun, 7 Jun 2020 12:58:54 +0200 Subject: [PATCH 0025/1025] DOC: Fixed docstring in Series .isin() method (#34600) * DOC: Fixed docstring in Series .isin() method --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 6b5ed86027806..ef47e52151961 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4513,7 +4513,7 @@ def memory_usage(self, index=True, deep=False): def isin(self, values) -> "Series": """ - Check whether `values` are contained in Series. + Whether elements in Series are contained in `values`. 
Return a boolean Series showing whether each element in the Series matches an element in the passed sequence of `values` exactly. From b2cd9c12cdfd1ebd2052f08cb0b7ce3e191b7fa1 Mon Sep 17 00:00:00 2001 From: OlivierLuG <59281854+OlivierLuG@users.noreply.github.com> Date: Sun, 7 Jun 2020 19:19:56 +0200 Subject: [PATCH 0026/1025] TST #24444 added tests (#34627) * TST #24444 added tests * TST #24444 added tests The test was modified according to review --- pandas/tests/scalar/period/test_period.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 42bd20fd9640b..3e769b577582a 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -649,6 +649,26 @@ def test_to_timestamp_business_end(self): expected = pd.Timestamp("1990-01-06") - pd.Timedelta(nanoseconds=1) assert result == expected + @pytest.mark.parametrize( + "ts, expected", + [ + ("1970-01-01 00:00:00", 0), + ("1970-01-01 00:00:00.000001", 1), + ("1970-01-01 00:00:00.00001", 10), + ("1970-01-01 00:00:00.499", 499000), + ("1999-12-31 23:59:59.999", 999000), + ("1999-12-31 23:59:59.999999", 999999), + ("2050-12-31 23:59:59.5", 500000), + ("2050-12-31 23:59:59.500001", 500001), + ("2050-12-31 23:59:59.123456", 123456), + ], + ) + @pytest.mark.parametrize("freq", [None, "us", "ns"]) + def test_to_timestamp_microsecond(self, ts, expected, freq): + # GH 24444 + result = Period(ts).to_timestamp(freq=freq).microsecond + assert result == expected + # -------------------------------------------------------------- # Rendering: __repr__, strftime, etc From 9b8bc79822df6878c8112164b8645af8b3df81f1 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 7 Jun 2020 21:07:21 +0100 Subject: [PATCH 0027/1025] TYP: some type annotations for interpolate (#34631) --- pandas/core/generic.py | 18 ++++----- pandas/core/internals/blocks.py | 66 ++++++++++++++++++--------------- 
pandas/core/missing.py | 23 +++++++----- 3 files changed, 58 insertions(+), 49 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4b074924baaf2..714a332be2196 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6863,16 +6863,16 @@ def replace( @Appender(_shared_docs["interpolate"] % _shared_doc_kwargs) def interpolate( - self, - method="linear", - axis=0, - limit=None, - inplace=False, - limit_direction="forward", - limit_area=None, - downcast=None, + self: FrameOrSeries, + method: str = "linear", + axis: Axis = 0, + limit: Optional[int] = None, + inplace: bool_t = False, + limit_direction: str = "forward", + limit_area: Optional[str] = None, + downcast: Optional[str] = None, **kwargs, - ): + ) -> Optional[FrameOrSeries]: """ Interpolate values according to different methods. """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e70c8f9d5f09a..a4a8d672895ce 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1,7 +1,7 @@ from datetime import datetime, timedelta import inspect import re -from typing import Any, List +from typing import TYPE_CHECKING, Any, List, Optional import warnings import numpy as np @@ -83,6 +83,9 @@ import pandas.core.missing as missing from pandas.core.nanops import nanpercentile +if TYPE_CHECKING: + from pandas import Index + class Block(PandasObject): """ @@ -1066,16 +1069,16 @@ def coerce_to_target_dtype(self, other): def interpolate( self, - method="pad", - axis=0, - index=None, - inplace=False, - limit=None, - limit_direction="forward", - limit_area=None, - fill_value=None, - coerce=False, - downcast=None, + method: str = "pad", + axis: int = 0, + index: Optional["Index"] = None, + inplace: bool = False, + limit: Optional[int] = None, + limit_direction: str = "forward", + limit_area: Optional[str] = None, + fill_value: Optional[Any] = None, + coerce: bool = False, + downcast: Optional[str] = None, **kwargs, ): @@ -1115,6 
+1118,9 @@ def check_int_bool(self, inplace): r = check_int_bool(self, inplace) if r is not None: return r + + assert index is not None # for mypy + return self._interpolate( method=m, index=index, @@ -1130,13 +1136,13 @@ def check_int_bool(self, inplace): def _interpolate_with_fill( self, - method="pad", - axis=0, - inplace=False, - limit=None, - fill_value=None, - coerce=False, - downcast=None, + method: str = "pad", + axis: int = 0, + inplace: bool = False, + limit: Optional[int] = None, + fill_value: Optional[Any] = None, + coerce: bool = False, + downcast: Optional[str] = None, ) -> List["Block"]: """ fillna but using the interpolate machinery """ inplace = validate_bool_kwarg(inplace, "inplace") @@ -1169,15 +1175,15 @@ def _interpolate_with_fill( def _interpolate( self, - method=None, - index=None, - fill_value=None, - axis=0, - limit=None, - limit_direction="forward", - limit_area=None, - inplace=False, - downcast=None, + method: str, + index: "Index", + fill_value: Optional[Any] = None, + axis: int = 0, + limit: Optional[int] = None, + limit_direction: str = "forward", + limit_area: Optional[str] = None, + inplace: bool = False, + downcast: Optional[str] = None, **kwargs, ) -> List["Block"]: """ interpolate using scipy wrappers """ @@ -1200,14 +1206,14 @@ def _interpolate( ) # process 1-d slices in the axis direction - def func(x): + def func(yvalues: np.ndarray) -> np.ndarray: # process a 1-d slice, returning it # should the axis argument be handled below in apply_along_axis? # i.e. not an arg to missing.interpolate_1d return missing.interpolate_1d( - index, - x, + xvalues=index, + yvalues=yvalues, method=method, limit=limit, limit_direction=limit_direction, diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 79bbef5fa5505..d8671616f944e 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -2,6 +2,8 @@ Routines for filling missing data. 
""" +from typing import Any, List, Optional, Set, Union + import numpy as np from pandas._libs import algos, lib @@ -92,7 +94,7 @@ def clean_fill_method(method, allow_nearest=False): return method -def clean_interp_method(method, **kwargs): +def clean_interp_method(method: str, **kwargs) -> str: order = kwargs.get("order") valid = [ "linear", @@ -160,15 +162,15 @@ def find_valid_index(values, how: str): def interpolate_1d( - xvalues, - yvalues, - method="linear", - limit=None, - limit_direction="forward", - limit_area=None, - fill_value=None, - bounds_error=False, - order=None, + xvalues: np.ndarray, + yvalues: np.ndarray, + method: Optional[str] = "linear", + limit: Optional[int] = None, + limit_direction: str = "forward", + limit_area: Optional[str] = None, + fill_value: Optional[Any] = None, + bounds_error: bool = False, + order: Optional[int] = None, **kwargs, ): """ @@ -238,6 +240,7 @@ def interpolate_1d( # are more than'limit' away from the prior non-NaN. # set preserve_nans based on direction using _interp_limit + preserve_nans: Union[List, Set] if limit_direction == "forward": preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) elif limit_direction == "backward": From 769403ed1eab3927621a28ad286f32aaca0099ad Mon Sep 17 00:00:00 2001 From: Ror <47309835+rorcores@users.noreply.github.com> Date: Sun, 7 Jun 2020 13:09:51 -0700 Subject: [PATCH 0028/1025] DOC: Added documentation for building using pyenv (#34490) --- doc/source/development/contributing.rst | 30 ++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 457aabcff0c17..163d345b4f829 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -270,7 +270,7 @@ Creating a Python environment (pip) If you aren't using conda for your development environment, follow these instructions. 
You'll need to have at least Python 3.6.1 installed on your system. -**Unix**/**Mac OS** +**Unix**/**Mac OS with virtualenv** .. code-block:: bash @@ -286,7 +286,31 @@ You'll need to have at least Python 3.6.1 installed on your system. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext --inplace -j 0 + python setup.py build_ext --inplace -j 4 + python -m pip install -e . --no-build-isolation --no-use-pep517 + +**Unix**/**Mac OS with pyenv** + +Consult the docs for setting up pyenv `here `__. + +.. code-block:: bash + + # Create a virtual environment + # Use an ENV_DIR of your choice. We'll use ~/Users//.pyenv/versions/pandas-dev + + pyenv virtualenv + + # For instance: + pyenv virtualenv 3.7.6 pandas-dev + + # Activate the virtualenv + pyenv activate pandas-dev + + # Now install the build dependencies in the cloned pandas repo + python -m pip install -r requirements-dev.txt + + # Build and install pandas + python setup.py build_ext --inplace -j 4 python -m pip install -e . --no-build-isolation --no-use-pep517 **Windows** @@ -312,7 +336,7 @@ should already exist. python -m pip install -r requirements-dev.txt # Build and install pandas - python setup.py build_ext --inplace -j 0 + python setup.py build_ext --inplace -j 4 python -m pip install -e . 
--no-build-isolation --no-use-pep517 Creating a branch From ad40cb3b9865ebab750e3486a44b9bb30e8aa91b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 7 Jun 2020 13:11:12 -0700 Subject: [PATCH 0029/1025] DOC: docstring, closes #23475 (#34619) --- pandas/_libs/tslibs/period.pyx | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e88a20bc549bd..5c890c7fbf59d 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1529,7 +1529,16 @@ cdef class _Period: self._dtype = PeriodPseudoDtype(freq._period_dtype_code) @classmethod - def _maybe_convert_freq(cls, object freq): + def _maybe_convert_freq(cls, object freq) -> BaseOffset: + """ + Internally we allow integer and tuple representations (for now) that + are not recognized by to_offset, so we convert them here. Also, a + Period's freq attribute must have `freq.n > 0`, which we check for here. + + Returns + ------- + DateOffset + """ if isinstance(freq, (int, tuple)): code, stride = get_freq_code(freq) freq = get_freq_str(code, stride) From 7d99c4a8e3433c9a5d1e3c60b1e80ed7806ac1ad Mon Sep 17 00:00:00 2001 From: Nick Newman <31258241+nnick14@users.noreply.github.com> Date: Sun, 7 Jun 2020 16:26:13 -0400 Subject: [PATCH 0030/1025] DOC: updating the `indicator` wording in `merge` doc (#34485) --- pandas/core/frame.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f8ab8966c1f0..873ece27e548a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -233,14 +233,13 @@ copy : bool, default True If False, avoid copy if possible. indicator : bool or str, default False - If True, adds a column to output DataFrame called "_merge" with - information on the source of each row. - If string, column with information on source of each row will be added to - output DataFrame, and column will be named value of string. 
- Information column is Categorical-type and takes on a value of "left_only" - for observations whose merge key only appears in 'left' DataFrame, - "right_only" for observations whose merge key only appears in 'right' - DataFrame, and "both" if the observation's merge key is found in both. + If True, adds a column to the output DataFrame called "_merge" with + information on the source of each row. The column can be given a different + name by providing a string argument. The column will have a Categorical + type with the value of "left_only" for observations whose merge key only + appears in the left DataFrame, "right_only" for observations + whose merge key only appears in the right DataFrame, and "both" + if the observation's merge key is found in both DataFrames. validate : str, optional If specified, checks if merge is of specified type. From 33ed872d9afc2ee99b390ad1d0e67dcada19269d Mon Sep 17 00:00:00 2001 From: Atsushi Nukariya Date: Mon, 8 Jun 2020 05:59:51 +0900 Subject: [PATCH 0031/1025] Improve document for **kwargs argument of pandas.DataFrame.to_markdown (#34594) --- pandas/core/frame.py | 11 +++++++++++ pandas/core/generic.py | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 873ece27e548a..b522920ec9f23 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2195,6 +2195,17 @@ def to_feather(self, path, **kwargs) -> None: |---:|:-----------|:-----------| | 0 | elk | dog | | 1 | pig | quetzal | + + Output markdown with a tabulate option. 
+ + >>> print(df.to_markdown(tablefmt="grid")) + +----+------------+------------+ + | | animal_1 | animal_2 | + +====+============+============+ + | 0 | elk | dog | + +----+------------+------------+ + | 1 | pig | quetzal | + +----+------------+------------+ """ ) @Substitution(klass="DataFrame") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 714a332be2196..8c57c2b8b851b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1917,7 +1917,8 @@ def _repr_data_resource_(self): mode : str, optional Mode in which file is opened. **kwargs - These parameters will be passed to `tabulate`. + These parameters will be passed to `tabulate \ + `_. Returns ------- From 5704a4eaacd6fb99455d8b0fc96355ff148be93d Mon Sep 17 00:00:00 2001 From: Natalie Jann <47032941+najann@users.noreply.github.com> Date: Sun, 7 Jun 2020 23:23:09 +0200 Subject: [PATCH 0032/1025] DOC: "Setting layouts" plots' label size GH34305 (#34394) --- doc/source/user_guide/visualization.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 5dca9d4c900dc..4cd7b9e8cecca 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1332,7 +1332,7 @@ otherwise you will see a warning. .. ipython:: python - fig, axes = plt.subplots(4, 4, figsize=(6, 6)) + fig, axes = plt.subplots(4, 4, figsize=(9, 9)) plt.subplots_adjust(wspace=0.5, hspace=0.5) target1 = [axes[0][0], axes[1][1], axes[2][2], axes[3][3]] target2 = [axes[3][0], axes[2][1], axes[1][2], axes[0][3]] @@ -1369,6 +1369,7 @@ Another option is passing an ``ax`` argument to :meth:`Series.plot` to plot on a .. 
ipython:: python fig, axes = plt.subplots(nrows=2, ncols=2) + plt.subplots_adjust(wspace=0.2, hspace=0.5) df['A'].plot(ax=axes[0, 0]); axes[0, 0].set_title('A'); df['B'].plot(ax=axes[0, 1]); From 872b30cfbdc976e581ee61b02b3c13a90d0e8f10 Mon Sep 17 00:00:00 2001 From: willpeppo Date: Sun, 7 Jun 2020 17:23:19 -0400 Subject: [PATCH 0033/1025] DOC: updated base.py and datetimes.py in core/indexes for PR08 (#34591) * DOC: updated base.py and datetimes.py in core/indexes for PR08 * DOC: updated datetimes.py to remove trailing whitespaces --- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/datetimes.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 746fd140e48a1..240882e561bc6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -517,7 +517,7 @@ def is_(self, other) -> bool: Parameters ---------- other : object - other object to compare against. + Other object to compare against. Returns ------- diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 68c55426294ef..e1f0221eaee65 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -721,9 +721,9 @@ def indexer_at_time(self, time, asof=False): Parameters ---------- time : datetime.time or str - datetime.time or string in appropriate format ("%H:%M", "%H%M", - "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", - "%I%M%S%p"). + Time passed in either as object (datetime.time) or as string in + appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", + "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p"). Returns ------- @@ -762,9 +762,9 @@ def indexer_between_time( Parameters ---------- start_time, end_time : datetime.time, str - datetime.time or string in appropriate format ("%H:%M", "%H%M", - "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", - "%I%M%S%p"). 
+ Time passed either as object (datetime.time) or as string in + appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", + "%H:%M:%S", "%H%M%S", "%I:%M:%S%p","%I%M%S%p"). include_start : bool, default True include_end : bool, default True From 13b52f14c435954d000bd4afd04d2ebb6ff3f6e3 Mon Sep 17 00:00:00 2001 From: Puneetha Pai <21996583+PuneethaPai@users.noreply.github.com> Date: Mon, 8 Jun 2020 07:06:51 +0530 Subject: [PATCH 0034/1025] Restrict Pandas merge suffixes param type to list/tuple to avoid interchange in right and left suffix order (#34208) --- doc/source/user_guide/merging.rst | 2 +- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/reshape/merge.py | 19 +++++++----- pandas/tests/reshape/merge/test_join.py | 4 +-- pandas/tests/reshape/merge/test_merge.py | 39 +++++++++++++++--------- 5 files changed, 41 insertions(+), 24 deletions(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 56ff8c1fc7c9b..0639e4a7bb5e4 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -1273,7 +1273,7 @@ columns: .. ipython:: python - result = pd.merge(left, right, on='k', suffixes=['_l', '_r']) + result = pd.merge(left, right, on='k', suffixes=('_l', '_r')) .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2243790a663df..197ffdc2ccef0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -401,6 +401,7 @@ Backwards incompatible API changes - :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. - :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) +- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 0c796c8f45a52..5e4eb89f0b45f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -667,10 +667,8 @@ def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix( - self.left._info_axis, lsuf, self.right._info_axis, rsuf + self.left._info_axis, self.right._info_axis, self.suffixes ) lindexers = {1: left_indexer} if left_indexer is not None else {} @@ -1484,10 +1482,8 @@ def __init__( def get_result(self): join_index, left_indexer, right_indexer = self._get_join_info() - lsuf, rsuf = self.suffixes - llabels, rlabels = _items_overlap_with_suffix( - self.left._info_axis, lsuf, self.right._info_axis, rsuf + self.left._info_axis, self.right._info_axis, self.suffixes ) if self.fill_method == "ffill": @@ -2067,17 +2063,26 @@ def _validate_operand(obj: FrameOrSeries) -> "DataFrame": ) -def _items_overlap_with_suffix(left: Index, lsuffix, right: Index, rsuffix): +def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, str]): """ + Suffixes type validation. + If two indices overlap, add suffixes to overlapping entries. If corresponding suffix is empty, the entry is simply converted to string. """ + if not isinstance(suffixes, tuple): + raise TypeError( + f"suffixes should be tuple of (str, str). 
But got {type(suffixes).__name__}" + ) + to_rename = left.intersection(right) if len(to_rename) == 0: return left, right + lsuffix, rsuffix = suffixes + if not lsuffix and not rsuffix: raise ValueError(f"columns overlap but no suffix specified: {to_rename}") diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index dc1efa46403be..c33443e24b268 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -162,7 +162,7 @@ def test_inner_join(self): _check_join(self.df, self.df2, joined_both, ["key1", "key2"], how="inner") def test_handle_overlap(self): - joined = merge(self.df, self.df2, on="key2", suffixes=[".foo", ".bar"]) + joined = merge(self.df, self.df2, on="key2", suffixes=(".foo", ".bar")) assert "key1.foo" in joined assert "key1.bar" in joined @@ -173,7 +173,7 @@ def test_handle_overlap_arbitrary_key(self): self.df2, left_on="key2", right_on="key1", - suffixes=[".foo", ".bar"], + suffixes=(".foo", ".bar"), ) assert "key1.foo" in joined assert "key2.bar" in joined diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4408aa0bbce4a..0a4d5f17a48cc 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2004,8 +2004,8 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), ("a", "a", dict(suffixes=("_x", None)), ["a_x", "a"]), ("a", "b", dict(suffixes=("_x", None)), ["a", "b"]), - ("a", "a", dict(suffixes=[None, "_x"]), ["a", "a_x"]), - (0, 0, dict(suffixes=["_a", None]), ["0_a", 0]), + ("a", "a", dict(suffixes=(None, "_x")), ["a", "a_x"]), + (0, 0, dict(suffixes=("_a", None)), ["0_a", 0]), ("a", "a", dict(), ["a_x", "a_y"]), (0, 0, dict(), ["0_x", "0_y"]), ], @@ -2056,13 +2056,7 @@ def test_merge_duplicate_suffix(how, expected): @pytest.mark.parametrize( "col1, col2, suffixes", - [ - ("a", "a", [None, 
None]), - ("a", "a", (None, None)), - ("a", "a", ("", None)), - (0, 0, [None, None]), - (0, 0, (None, "")), - ], + [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))], ) def test_merge_suffix_error(col1, col2, suffixes): # issue: 24782 @@ -2075,18 +2069,35 @@ def test_merge_suffix_error(col1, col2, suffixes): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) -@pytest.mark.parametrize("col1, col2, suffixes", [("a", "a", None), (0, 0, None)]) -def test_merge_suffix_none_error(col1, col2, suffixes): - # issue: 24782 +@pytest.mark.parametrize( + "col1, col2, suffixes", [("a", "a", {"a", "b"}), ("a", "a", None), (0, 0, None)], +) +def test_merge_suffix_type_error(col1, col2, suffixes): a = pd.DataFrame({col1: [1, 2, 3]}) b = pd.DataFrame({col2: [3, 4, 5]}) - # TODO: might reconsider current raise behaviour, see GH24782 - msg = "iterable" + msg = ( + f"suffixes should be tuple of \\(str, str\\). But got {type(suffixes).__name__}" + ) with pytest.raises(TypeError, match=msg): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) +@pytest.mark.parametrize( + "col1, col2, suffixes, msg", + [ + ("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"), + ("a", "a", tuple("a"), r"not enough values to unpack \(expected 2, got 1\)"), + ], +) +def test_merge_suffix_length_error(col1, col2, suffixes, msg): + a = pd.DataFrame({col1: [1, 2, 3]}) + b = pd.DataFrame({col2: [3, 4, 5]}) + + with pytest.raises(ValueError, match=msg): + pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) + + @pytest.mark.parametrize("cat_dtype", ["one", "two"]) @pytest.mark.parametrize("reverse", [True, False]) def test_merge_equal_cat_dtypes(cat_dtype, reverse): From feb910440775822958af39314d9266f957987138 Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Mon, 8 Jun 2020 02:45:22 +0100 Subject: [PATCH 0035/1025] CLN: Removed duplicated test data (#34477) --- pandas/tests/io/conftest.py | 2 +- .../{parser/data => 
data/csv}/test1.csv.bz2 | Bin .../io/{parser/data => data/csv}/test1.csv.gz | Bin .../io/{parser/data => data/csv}/tips.csv.bz2 | Bin .../io/{parser/data => data/csv}/tips.csv.gz | Bin pandas/tests/io/parser/conftest.py | 4 +- pandas/tests/io/parser/data/test1.csv | 8 - pandas/tests/io/parser/data/tips.csv | 245 ------------------ pandas/tests/io/parser/test_encoding.py | 14 +- pandas/tests/io/parser/test_network.py | 2 +- pandas/tests/io/parser/test_textreader.py | 3 +- 11 files changed, 14 insertions(+), 264 deletions(-) rename pandas/tests/io/{parser/data => data/csv}/test1.csv.bz2 (100%) rename pandas/tests/io/{parser/data => data/csv}/test1.csv.gz (100%) rename pandas/tests/io/{parser/data => data/csv}/tips.csv.bz2 (100%) rename pandas/tests/io/{parser/data => data/csv}/tips.csv.gz (100%) delete mode 100644 pandas/tests/io/parser/data/test1.csv delete mode 100644 pandas/tests/io/parser/data/tips.csv diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index f1de15dd34464..fcee25c258efa 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -10,7 +10,7 @@ @pytest.fixture def tips_file(datapath): """Path to the tips dataset""" - return datapath("io", "parser", "data", "tips.csv") + return datapath("io", "data", "csv", "tips.csv") @pytest.fixture diff --git a/pandas/tests/io/parser/data/test1.csv.bz2 b/pandas/tests/io/data/csv/test1.csv.bz2 similarity index 100% rename from pandas/tests/io/parser/data/test1.csv.bz2 rename to pandas/tests/io/data/csv/test1.csv.bz2 diff --git a/pandas/tests/io/parser/data/test1.csv.gz b/pandas/tests/io/data/csv/test1.csv.gz similarity index 100% rename from pandas/tests/io/parser/data/test1.csv.gz rename to pandas/tests/io/data/csv/test1.csv.gz diff --git a/pandas/tests/io/parser/data/tips.csv.bz2 b/pandas/tests/io/data/csv/tips.csv.bz2 similarity index 100% rename from pandas/tests/io/parser/data/tips.csv.bz2 rename to pandas/tests/io/data/csv/tips.csv.bz2 diff --git 
a/pandas/tests/io/parser/data/tips.csv.gz b/pandas/tests/io/data/csv/tips.csv.gz similarity index 100% rename from pandas/tests/io/parser/data/tips.csv.gz rename to pandas/tests/io/data/csv/tips.csv.gz diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index 15967e3be176a..d03c85f65ea8d 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -53,11 +53,11 @@ def csv_dir_path(datapath): @pytest.fixture -def csv1(csv_dir_path): +def csv1(datapath): """ The path to the data file "test1.csv" needed for parser tests. """ - return os.path.join(csv_dir_path, "test1.csv") + return os.path.join(datapath("io", "data", "csv"), "test1.csv") _cParserHighMemory = CParserHighMemory() diff --git a/pandas/tests/io/parser/data/test1.csv b/pandas/tests/io/parser/data/test1.csv deleted file mode 100644 index 4bdb62943c4c8..0000000000000 --- a/pandas/tests/io/parser/data/test1.csv +++ /dev/null @@ -1,8 +0,0 @@ -index,A,B,C,D -2000-01-03 00:00:00,0.980268513777,3.68573087906,-0.364216805298,-1.15973806169 -2000-01-04 00:00:00,1.04791624281,-0.0412318367011,-0.16181208307,0.212549316967 -2000-01-05 00:00:00,0.498580885705,0.731167677815,-0.537677223318,1.34627041952 -2000-01-06 00:00:00,1.12020151869,1.56762092543,0.00364077397681,0.67525259227 -2000-01-07 00:00:00,-0.487094399463,0.571454623474,-1.6116394093,0.103468562917 -2000-01-10 00:00:00,0.836648671666,0.246461918642,0.588542635376,1.0627820613 -2000-01-11 00:00:00,-0.157160753327,1.34030689438,1.19577795622,-1.09700699751 \ No newline at end of file diff --git a/pandas/tests/io/parser/data/tips.csv b/pandas/tests/io/parser/data/tips.csv deleted file mode 100644 index 856a65a69e647..0000000000000 --- a/pandas/tests/io/parser/data/tips.csv +++ /dev/null @@ -1,245 +0,0 @@ -total_bill,tip,sex,smoker,day,time,size -16.99,1.01,Female,No,Sun,Dinner,2 -10.34,1.66,Male,No,Sun,Dinner,3 -21.01,3.5,Male,No,Sun,Dinner,3 -23.68,3.31,Male,No,Sun,Dinner,2 
-24.59,3.61,Female,No,Sun,Dinner,4 -25.29,4.71,Male,No,Sun,Dinner,4 -8.77,2.0,Male,No,Sun,Dinner,2 -26.88,3.12,Male,No,Sun,Dinner,4 -15.04,1.96,Male,No,Sun,Dinner,2 -14.78,3.23,Male,No,Sun,Dinner,2 -10.27,1.71,Male,No,Sun,Dinner,2 -35.26,5.0,Female,No,Sun,Dinner,4 -15.42,1.57,Male,No,Sun,Dinner,2 -18.43,3.0,Male,No,Sun,Dinner,4 -14.83,3.02,Female,No,Sun,Dinner,2 -21.58,3.92,Male,No,Sun,Dinner,2 -10.33,1.67,Female,No,Sun,Dinner,3 -16.29,3.71,Male,No,Sun,Dinner,3 -16.97,3.5,Female,No,Sun,Dinner,3 -20.65,3.35,Male,No,Sat,Dinner,3 -17.92,4.08,Male,No,Sat,Dinner,2 -20.29,2.75,Female,No,Sat,Dinner,2 -15.77,2.23,Female,No,Sat,Dinner,2 -39.42,7.58,Male,No,Sat,Dinner,4 -19.82,3.18,Male,No,Sat,Dinner,2 -17.81,2.34,Male,No,Sat,Dinner,4 -13.37,2.0,Male,No,Sat,Dinner,2 -12.69,2.0,Male,No,Sat,Dinner,2 -21.7,4.3,Male,No,Sat,Dinner,2 -19.65,3.0,Female,No,Sat,Dinner,2 -9.55,1.45,Male,No,Sat,Dinner,2 -18.35,2.5,Male,No,Sat,Dinner,4 -15.06,3.0,Female,No,Sat,Dinner,2 -20.69,2.45,Female,No,Sat,Dinner,4 -17.78,3.27,Male,No,Sat,Dinner,2 -24.06,3.6,Male,No,Sat,Dinner,3 -16.31,2.0,Male,No,Sat,Dinner,3 -16.93,3.07,Female,No,Sat,Dinner,3 -18.69,2.31,Male,No,Sat,Dinner,3 -31.27,5.0,Male,No,Sat,Dinner,3 -16.04,2.24,Male,No,Sat,Dinner,3 -17.46,2.54,Male,No,Sun,Dinner,2 -13.94,3.06,Male,No,Sun,Dinner,2 -9.68,1.32,Male,No,Sun,Dinner,2 -30.4,5.6,Male,No,Sun,Dinner,4 -18.29,3.0,Male,No,Sun,Dinner,2 -22.23,5.0,Male,No,Sun,Dinner,2 -32.4,6.0,Male,No,Sun,Dinner,4 -28.55,2.05,Male,No,Sun,Dinner,3 -18.04,3.0,Male,No,Sun,Dinner,2 -12.54,2.5,Male,No,Sun,Dinner,2 -10.29,2.6,Female,No,Sun,Dinner,2 -34.81,5.2,Female,No,Sun,Dinner,4 -9.94,1.56,Male,No,Sun,Dinner,2 -25.56,4.34,Male,No,Sun,Dinner,4 -19.49,3.51,Male,No,Sun,Dinner,2 -38.01,3.0,Male,Yes,Sat,Dinner,4 -26.41,1.5,Female,No,Sat,Dinner,2 -11.24,1.76,Male,Yes,Sat,Dinner,2 -48.27,6.73,Male,No,Sat,Dinner,4 -20.29,3.21,Male,Yes,Sat,Dinner,2 -13.81,2.0,Male,Yes,Sat,Dinner,2 -11.02,1.98,Male,Yes,Sat,Dinner,2 -18.29,3.76,Male,Yes,Sat,Dinner,4 
-17.59,2.64,Male,No,Sat,Dinner,3 -20.08,3.15,Male,No,Sat,Dinner,3 -16.45,2.47,Female,No,Sat,Dinner,2 -3.07,1.0,Female,Yes,Sat,Dinner,1 -20.23,2.01,Male,No,Sat,Dinner,2 -15.01,2.09,Male,Yes,Sat,Dinner,2 -12.02,1.97,Male,No,Sat,Dinner,2 -17.07,3.0,Female,No,Sat,Dinner,3 -26.86,3.14,Female,Yes,Sat,Dinner,2 -25.28,5.0,Female,Yes,Sat,Dinner,2 -14.73,2.2,Female,No,Sat,Dinner,2 -10.51,1.25,Male,No,Sat,Dinner,2 -17.92,3.08,Male,Yes,Sat,Dinner,2 -27.2,4.0,Male,No,Thur,Lunch,4 -22.76,3.0,Male,No,Thur,Lunch,2 -17.29,2.71,Male,No,Thur,Lunch,2 -19.44,3.0,Male,Yes,Thur,Lunch,2 -16.66,3.4,Male,No,Thur,Lunch,2 -10.07,1.83,Female,No,Thur,Lunch,1 -32.68,5.0,Male,Yes,Thur,Lunch,2 -15.98,2.03,Male,No,Thur,Lunch,2 -34.83,5.17,Female,No,Thur,Lunch,4 -13.03,2.0,Male,No,Thur,Lunch,2 -18.28,4.0,Male,No,Thur,Lunch,2 -24.71,5.85,Male,No,Thur,Lunch,2 -21.16,3.0,Male,No,Thur,Lunch,2 -28.97,3.0,Male,Yes,Fri,Dinner,2 -22.49,3.5,Male,No,Fri,Dinner,2 -5.75,1.0,Female,Yes,Fri,Dinner,2 -16.32,4.3,Female,Yes,Fri,Dinner,2 -22.75,3.25,Female,No,Fri,Dinner,2 -40.17,4.73,Male,Yes,Fri,Dinner,4 -27.28,4.0,Male,Yes,Fri,Dinner,2 -12.03,1.5,Male,Yes,Fri,Dinner,2 -21.01,3.0,Male,Yes,Fri,Dinner,2 -12.46,1.5,Male,No,Fri,Dinner,2 -11.35,2.5,Female,Yes,Fri,Dinner,2 -15.38,3.0,Female,Yes,Fri,Dinner,2 -44.3,2.5,Female,Yes,Sat,Dinner,3 -22.42,3.48,Female,Yes,Sat,Dinner,2 -20.92,4.08,Female,No,Sat,Dinner,2 -15.36,1.64,Male,Yes,Sat,Dinner,2 -20.49,4.06,Male,Yes,Sat,Dinner,2 -25.21,4.29,Male,Yes,Sat,Dinner,2 -18.24,3.76,Male,No,Sat,Dinner,2 -14.31,4.0,Female,Yes,Sat,Dinner,2 -14.0,3.0,Male,No,Sat,Dinner,2 -7.25,1.0,Female,No,Sat,Dinner,1 -38.07,4.0,Male,No,Sun,Dinner,3 -23.95,2.55,Male,No,Sun,Dinner,2 -25.71,4.0,Female,No,Sun,Dinner,3 -17.31,3.5,Female,No,Sun,Dinner,2 -29.93,5.07,Male,No,Sun,Dinner,4 -10.65,1.5,Female,No,Thur,Lunch,2 -12.43,1.8,Female,No,Thur,Lunch,2 -24.08,2.92,Female,No,Thur,Lunch,4 -11.69,2.31,Male,No,Thur,Lunch,2 -13.42,1.68,Female,No,Thur,Lunch,2 -14.26,2.5,Male,No,Thur,Lunch,2 
-15.95,2.0,Male,No,Thur,Lunch,2 -12.48,2.52,Female,No,Thur,Lunch,2 -29.8,4.2,Female,No,Thur,Lunch,6 -8.52,1.48,Male,No,Thur,Lunch,2 -14.52,2.0,Female,No,Thur,Lunch,2 -11.38,2.0,Female,No,Thur,Lunch,2 -22.82,2.18,Male,No,Thur,Lunch,3 -19.08,1.5,Male,No,Thur,Lunch,2 -20.27,2.83,Female,No,Thur,Lunch,2 -11.17,1.5,Female,No,Thur,Lunch,2 -12.26,2.0,Female,No,Thur,Lunch,2 -18.26,3.25,Female,No,Thur,Lunch,2 -8.51,1.25,Female,No,Thur,Lunch,2 -10.33,2.0,Female,No,Thur,Lunch,2 -14.15,2.0,Female,No,Thur,Lunch,2 -16.0,2.0,Male,Yes,Thur,Lunch,2 -13.16,2.75,Female,No,Thur,Lunch,2 -17.47,3.5,Female,No,Thur,Lunch,2 -34.3,6.7,Male,No,Thur,Lunch,6 -41.19,5.0,Male,No,Thur,Lunch,5 -27.05,5.0,Female,No,Thur,Lunch,6 -16.43,2.3,Female,No,Thur,Lunch,2 -8.35,1.5,Female,No,Thur,Lunch,2 -18.64,1.36,Female,No,Thur,Lunch,3 -11.87,1.63,Female,No,Thur,Lunch,2 -9.78,1.73,Male,No,Thur,Lunch,2 -7.51,2.0,Male,No,Thur,Lunch,2 -14.07,2.5,Male,No,Sun,Dinner,2 -13.13,2.0,Male,No,Sun,Dinner,2 -17.26,2.74,Male,No,Sun,Dinner,3 -24.55,2.0,Male,No,Sun,Dinner,4 -19.77,2.0,Male,No,Sun,Dinner,4 -29.85,5.14,Female,No,Sun,Dinner,5 -48.17,5.0,Male,No,Sun,Dinner,6 -25.0,3.75,Female,No,Sun,Dinner,4 -13.39,2.61,Female,No,Sun,Dinner,2 -16.49,2.0,Male,No,Sun,Dinner,4 -21.5,3.5,Male,No,Sun,Dinner,4 -12.66,2.5,Male,No,Sun,Dinner,2 -16.21,2.0,Female,No,Sun,Dinner,3 -13.81,2.0,Male,No,Sun,Dinner,2 -17.51,3.0,Female,Yes,Sun,Dinner,2 -24.52,3.48,Male,No,Sun,Dinner,3 -20.76,2.24,Male,No,Sun,Dinner,2 -31.71,4.5,Male,No,Sun,Dinner,4 -10.59,1.61,Female,Yes,Sat,Dinner,2 -10.63,2.0,Female,Yes,Sat,Dinner,2 -50.81,10.0,Male,Yes,Sat,Dinner,3 -15.81,3.16,Male,Yes,Sat,Dinner,2 -7.25,5.15,Male,Yes,Sun,Dinner,2 -31.85,3.18,Male,Yes,Sun,Dinner,2 -16.82,4.0,Male,Yes,Sun,Dinner,2 -32.9,3.11,Male,Yes,Sun,Dinner,2 -17.89,2.0,Male,Yes,Sun,Dinner,2 -14.48,2.0,Male,Yes,Sun,Dinner,2 -9.6,4.0,Female,Yes,Sun,Dinner,2 -34.63,3.55,Male,Yes,Sun,Dinner,2 -34.65,3.68,Male,Yes,Sun,Dinner,4 -23.33,5.65,Male,Yes,Sun,Dinner,2 -45.35,3.5,Male,Yes,Sun,Dinner,3 
-23.17,6.5,Male,Yes,Sun,Dinner,4 -40.55,3.0,Male,Yes,Sun,Dinner,2 -20.69,5.0,Male,No,Sun,Dinner,5 -20.9,3.5,Female,Yes,Sun,Dinner,3 -30.46,2.0,Male,Yes,Sun,Dinner,5 -18.15,3.5,Female,Yes,Sun,Dinner,3 -23.1,4.0,Male,Yes,Sun,Dinner,3 -15.69,1.5,Male,Yes,Sun,Dinner,2 -19.81,4.19,Female,Yes,Thur,Lunch,2 -28.44,2.56,Male,Yes,Thur,Lunch,2 -15.48,2.02,Male,Yes,Thur,Lunch,2 -16.58,4.0,Male,Yes,Thur,Lunch,2 -7.56,1.44,Male,No,Thur,Lunch,2 -10.34,2.0,Male,Yes,Thur,Lunch,2 -43.11,5.0,Female,Yes,Thur,Lunch,4 -13.0,2.0,Female,Yes,Thur,Lunch,2 -13.51,2.0,Male,Yes,Thur,Lunch,2 -18.71,4.0,Male,Yes,Thur,Lunch,3 -12.74,2.01,Female,Yes,Thur,Lunch,2 -13.0,2.0,Female,Yes,Thur,Lunch,2 -16.4,2.5,Female,Yes,Thur,Lunch,2 -20.53,4.0,Male,Yes,Thur,Lunch,4 -16.47,3.23,Female,Yes,Thur,Lunch,3 -26.59,3.41,Male,Yes,Sat,Dinner,3 -38.73,3.0,Male,Yes,Sat,Dinner,4 -24.27,2.03,Male,Yes,Sat,Dinner,2 -12.76,2.23,Female,Yes,Sat,Dinner,2 -30.06,2.0,Male,Yes,Sat,Dinner,3 -25.89,5.16,Male,Yes,Sat,Dinner,4 -48.33,9.0,Male,No,Sat,Dinner,4 -13.27,2.5,Female,Yes,Sat,Dinner,2 -28.17,6.5,Female,Yes,Sat,Dinner,3 -12.9,1.1,Female,Yes,Sat,Dinner,2 -28.15,3.0,Male,Yes,Sat,Dinner,5 -11.59,1.5,Male,Yes,Sat,Dinner,2 -7.74,1.44,Male,Yes,Sat,Dinner,2 -30.14,3.09,Female,Yes,Sat,Dinner,4 -12.16,2.2,Male,Yes,Fri,Lunch,2 -13.42,3.48,Female,Yes,Fri,Lunch,2 -8.58,1.92,Male,Yes,Fri,Lunch,1 -15.98,3.0,Female,No,Fri,Lunch,3 -13.42,1.58,Male,Yes,Fri,Lunch,2 -16.27,2.5,Female,Yes,Fri,Lunch,2 -10.09,2.0,Female,Yes,Fri,Lunch,2 -20.45,3.0,Male,No,Sat,Dinner,4 -13.28,2.72,Male,No,Sat,Dinner,2 -22.12,2.88,Female,Yes,Sat,Dinner,2 -24.01,2.0,Male,Yes,Sat,Dinner,4 -15.69,3.0,Male,Yes,Sat,Dinner,3 -11.61,3.39,Male,No,Sat,Dinner,2 -10.77,1.47,Male,No,Sat,Dinner,2 -15.53,3.0,Male,Yes,Sat,Dinner,2 -10.07,1.25,Male,No,Sat,Dinner,2 -12.6,1.0,Male,Yes,Sat,Dinner,2 -32.83,1.17,Male,Yes,Sat,Dinner,2 -35.83,4.67,Female,No,Sat,Dinner,3 -29.03,5.92,Male,No,Sat,Dinner,3 -27.18,2.0,Female,Yes,Sat,Dinner,2 -22.67,2.0,Male,Yes,Sat,Dinner,2 
-17.82,1.75,Male,No,Sat,Dinner,2 -18.78,3.0,Female,No,Thur,Dinner,2 diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 13b74cf29f857..de7b3bed034c7 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -133,19 +133,21 @@ def test_read_csv_utf_aliases(all_parsers, utf_value, encoding_fmt): @pytest.mark.parametrize( - "fname,encoding", + "file_path,encoding", [ - ("test1.csv", "utf-8"), - ("unicode_series.csv", "latin-1"), - ("sauron.SHIFT_JIS.csv", "shiftjis"), + (("io", "data", "csv", "test1.csv"), "utf-8"), + (("io", "parser", "data", "unicode_series.csv"), "latin-1"), + (("io", "parser", "data", "sauron.SHIFT_JIS.csv"), "shiftjis"), ], ) -def test_binary_mode_file_buffers(all_parsers, csv_dir_path, fname, encoding): +def test_binary_mode_file_buffers( + all_parsers, csv_dir_path, file_path, encoding, datapath +): # gh-23779: Python csv engine shouldn't error on files opened in binary. # gh-31575: Python csv engine shouldn't error on files opened in raw binary. 
parser = all_parsers - fpath = os.path.join(csv_dir_path, fname) + fpath = datapath(*file_path) expected = parser.read_csv(fpath, encoding=encoding) with open(fpath, mode="r", encoding=encoding) as fa: diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index e0dee878006b8..509ae89909699 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -49,7 +49,7 @@ def check_compressed_urls(salaries_table, compression, extension, mode, engine): @pytest.fixture def tips_df(datapath): """DataFrame with the tips dataset.""" - return read_csv(datapath("io", "parser", "data", "tips.csv")) + return read_csv(datapath("io", "data", "csv", "tips.csv")) @pytest.mark.usefixtures("s3_resource") diff --git a/pandas/tests/io/parser/test_textreader.py b/pandas/tests/io/parser/test_textreader.py index 8d5af85c20d33..1c2518646bb29 100644 --- a/pandas/tests/io/parser/test_textreader.py +++ b/pandas/tests/io/parser/test_textreader.py @@ -21,7 +21,8 @@ class TestTextReader: @pytest.fixture(autouse=True) def setup_method(self, datapath): self.dirpath = datapath("io", "parser", "data") - self.csv1 = os.path.join(self.dirpath, "test1.csv") + csv1_dirpath = datapath("io", "data", "csv") + self.csv1 = os.path.join(csv1_dirpath, "test1.csv") self.csv2 = os.path.join(self.dirpath, "test2.csv") self.xls1 = os.path.join(self.dirpath, "test.xls") From 853100310da9ab76830425c7bd56cf2c43221614 Mon Sep 17 00:00:00 2001 From: willpeppo Date: Sun, 7 Jun 2020 21:52:29 -0400 Subject: [PATCH 0036/1025] DOC: updated io/pytables.py to fix PR08 (#34604) --- pandas/io/pytables.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 36cd61b6c3adb..497b25d73df3e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -997,12 +997,14 @@ def put( key : str value : {Series, DataFrame} format : 'fixed(f)|table(t)', default is 'fixed' - 
fixed(f) : Fixed format - Fast writing/reading. Not-appendable, nor searchable. - table(t) : Table format - Write as a PyTables Table structure which may perform - worse but allow more flexible operations like searching - / selecting subsets of the data. + Format to use when storing object in HDFStore. Value can be one of: + + ``'fixed'`` + Fixed format. Fast writing/reading. Not-appendable, nor searchable. + ``'table'`` + Table format. Write as a PyTables Table structure which may perform + worse but allow more flexible operations like searching / selecting + subsets of the data. append : bool, default False This will force Table format, append the input data to the existing. @@ -1126,10 +1128,12 @@ def append( key : str value : {Series, DataFrame} format : 'table' is the default - table(t) : table format - Write as a PyTables Table structure which may perform - worse but allow more flexible operations like searching - / selecting subsets of the data. + Format to use when storing object in HDFStore. Value can be one of: + + ``'table'`` + Table format. Write as a PyTables Table structure which may perform + worse but allow more flexible operations like searching / selecting + subsets of the data. append : bool, default True Append the input data to the existing. 
data_columns : list of columns, or True, default None From 8e75f2122fdfd658f68bfbac735bca1b071cfa60 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 7 Jun 2020 18:59:47 -0700 Subject: [PATCH 0037/1025] REF: make DateOffset apply_index methods operate on ndarrays where feasible (#34612) --- pandas/_libs/tslibs/offsets.pyx | 47 ++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 63136367a5b5c..28ead3593cf85 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -47,6 +47,7 @@ from pandas._libs.tslibs.timezones cimport utc_pytz as UTC from pandas._libs.tslibs.tzconversion cimport tz_convert_single from .dtypes cimport PeriodDtypeCode +from .fields import get_start_end_field from .timedeltas cimport delta_to_nanoseconds from .timedeltas import Timedelta from .timestamps cimport _Timestamp @@ -2291,7 +2292,7 @@ cdef class SemiMonthOffset(SingleConstructorOffset): after_day_of_month = days_from_start > delta # determine the correct n for each date in dtindex - roll = self._get_roll(dtindex, before_day_of_month, after_day_of_month) + roll = self._get_roll(i8other, before_day_of_month, after_day_of_month) # isolate the time since it will be striped away one the next line time = (i8other % DAY_NANOS).view("timedelta64[ns]") @@ -2304,24 +2305,26 @@ cdef class SemiMonthOffset(SingleConstructorOffset): shifted = asper._addsub_int_array(roll // 2, operator.add) dtindex = type(dti)(shifted.to_timestamp()) + dt64other = np.asarray(dtindex) # apply the correct day - dtindex = self._apply_index_days(dtindex, roll) + dt64result = self._apply_index_days(dt64other, roll) - return dtindex + time + return dt64result + time - def _get_roll(self, dtindex, before_day_of_month, after_day_of_month): + def _get_roll(self, i8other, before_day_of_month, after_day_of_month): """ Return an array with the correct n for each date in dtindex. 
The roll array is based on the fact that dtindex gets rolled back to the first day of the month. """ + # before_day_of_month and after_day_of_month are ndarray[bool] raise NotImplementedError - def _apply_index_days(self, dtindex, roll): + def _apply_index_days(self, dt64other, roll): """ - Apply the correct day for each date in dtindex. + Apply the correct day for each date in dt64other. """ raise NotImplementedError @@ -2352,9 +2355,10 @@ cdef class SemiMonthEnd(SemiMonthOffset): day = 31 if n % 2 else self.day_of_month return shift_month(other, months, day) - def _get_roll(self, dtindex, before_day_of_month, after_day_of_month): + def _get_roll(self, i8other, before_day_of_month, after_day_of_month): + # before_day_of_month and after_day_of_month are ndarray[bool] n = self.n - is_month_end = dtindex.is_month_end + is_month_end = get_start_end_field(i8other, "is_month_end") if n > 0: roll_end = np.where(is_month_end, 1, 0) roll_before = np.where(before_day_of_month, n, n + 1) @@ -2367,22 +2371,22 @@ cdef class SemiMonthEnd(SemiMonthOffset): roll = np.where(after_day_of_month, n + 2, n + 1) return roll - def _apply_index_days(self, dtindex, roll): + def _apply_index_days(self, dt64other, roll): """ - Add days portion of offset to DatetimeIndex dtindex. + Add days portion of offset to dt64other. 
Parameters ---------- - dtindex : DatetimeIndex + dt64other : ndarray[datetime64[ns]] roll : ndarray[int64_t] Returns ------- - result : DatetimeIndex + ndarray[datetime64[ns]] """ nanos = (roll % 2) * Timedelta(days=self.day_of_month).value - dtindex += nanos.astype("timedelta64[ns]") - return dtindex + Timedelta(days=-1) + dt64other += nanos.astype("timedelta64[ns]") + return dt64other + Timedelta(days=-1) cdef class SemiMonthBegin(SemiMonthOffset): @@ -2409,9 +2413,10 @@ cdef class SemiMonthBegin(SemiMonthOffset): day = 1 if n % 2 else self.day_of_month return shift_month(other, months, day) - def _get_roll(self, dtindex, before_day_of_month, after_day_of_month): + def _get_roll(self, i8other, before_day_of_month, after_day_of_month): + # before_day_of_month and after_day_of_month are ndarray[bool] n = self.n - is_month_start = dtindex.is_month_start + is_month_start = get_start_end_field(i8other, "is_month_start") if n > 0: roll = np.where(before_day_of_month, n, n + 1) elif n == 0: @@ -2424,21 +2429,21 @@ cdef class SemiMonthBegin(SemiMonthOffset): roll = roll_after + roll_start return roll - def _apply_index_days(self, dtindex, roll): + def _apply_index_days(self, dt64other, roll): """ - Add days portion of offset to DatetimeIndex dtindex. + Add days portion of offset to dt64other. 
Parameters ---------- - dtindex : DatetimeIndex + dt64other : ndarray[datetime64[ns]] roll : ndarray[int64_t] Returns ------- - result : DatetimeIndex + ndarray[datetime64[ns]] """ nanos = (roll % 2) * Timedelta(days=self.day_of_month - 1).value - return dtindex + nanos.astype("timedelta64[ns]") + return dt64other + nanos.astype("timedelta64[ns]") # --------------------------------------------------------------------- From edbb30509ac6b2a2a492cd6cf6d6dbea9770814a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 7 Jun 2020 19:01:40 -0700 Subject: [PATCH 0038/1025] TST: mark tzlocal tests as slow, closes #34413 (#34610) --- pandas/tests/resample/test_datetime_index.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 9909e554aa14d..8d7d45f54ad5f 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2,6 +2,7 @@ from functools import partial from io import StringIO +from dateutil.tz import tzlocal import numpy as np import pytest import pytz @@ -480,6 +481,11 @@ def test_upsample_with_limit(): @pytest.mark.parametrize("rule", ["Y", "3M", "15D", "30H", "15Min", "30S"]) def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule): # GH 33939 + tz = tz_aware_fixture + if str(tz) == "tzlocal()" and rule == "30S" and freq in ["Y", "10M"]: + # GH#34413 separate these so we can mark as slow, see + # test_nearest_upsample_with_limit_tzlocal + return rng = date_range("1/1/2000", periods=3, freq=freq, tz=tz_aware_fixture) ts = Series(np.random.randn(len(rng)), rng) @@ -488,6 +494,20 @@ def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule): tm.assert_series_equal(result, expected) +@pytest.mark.slow +@pytest.mark.parametrize("freq", ["Y", "10M"]) +def test_nearest_upsample_with_limit_tzlocal(freq): + # GH#33939, GH#34413 split off from test_nearest_upsample_with_limit + rule = 
"30S" + tz = tzlocal() + rng = date_range("1/1/2000", periods=3, freq=freq, tz=tz) + ts = Series(np.random.randn(len(rng)), rng) + + result = ts.resample(rule).nearest(limit=2) + expected = ts.reindex(result.index, method="nearest", limit=2) + tm.assert_series_equal(result, expected) + + def test_resample_ohlc(series): s = series From c40b904885522425d3d6888f2b292fe5c9f61997 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 8 Jun 2020 03:15:43 +0100 Subject: [PATCH 0039/1025] BUG: fix Series.where(cond) when cond is empty (#34595) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/generic.py | 19 ++++++++++--------- pandas/tests/series/indexing/test_where.py | 7 +++++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 197ffdc2ccef0..b2b55b7b503ec 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1013,6 +1013,7 @@ Reshaping - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) - Bug in :func:`Dataframe.aggregate` and :func:`Series.aggregate` was causing recursive loop in some cases (:issue:`34224`) - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) +- Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`) Sparse ^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8c57c2b8b851b..06be602c8090d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8831,16 +8831,17 @@ def _where( msg = "Boolean array expected for the condition, not {dtype}" - if not isinstance(cond, ABCDataFrame): - # This is a single-dimensional object. 
- if not is_bool_dtype(cond): - raise ValueError(msg.format(dtype=cond.dtype)) - elif not cond.empty: - for dt in cond.dtypes: - if not is_bool_dtype(dt): - raise ValueError(msg.format(dtype=dt)) + if not cond.empty: + if not isinstance(cond, ABCDataFrame): + # This is a single-dimensional object. + if not is_bool_dtype(cond): + raise ValueError(msg.format(dtype=cond.dtype)) + else: + for dt in cond.dtypes: + if not is_bool_dtype(dt): + raise ValueError(msg.format(dtype=dt)) else: - # GH#21947 we have an empty DataFrame, could be object-dtype + # GH#21947 we have an empty DataFrame/Series, could be object-dtype cond = cond.astype(bool) cond = -cond if inplace else cond diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 6765d9f9d8266..8daea84492871 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -443,3 +443,10 @@ def test_where_sparse(): result = ser.where(ser >= 2, 0) expected = pd.Series(pd.arrays.SparseArray([0, 2])) tm.assert_series_equal(result, expected) + + +def test_where_empty_series_and_empty_cond_having_non_bool_dtypes(): + # https://github.com/pandas-dev/pandas/issues/34592 + ser = Series([], dtype=float) + result = ser.where([]) + tm.assert_series_equal(result, ser) From 41469279b96820875f98e369711968f71e1777ec Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 7 Jun 2020 19:20:33 -0700 Subject: [PATCH 0040/1025] REF: ensure we have offset objects in plotting functions (#34585) --- pandas/plotting/_matplotlib/converter.py | 51 +++++++++++----------- pandas/tests/plotting/test_datetimelike.py | 13 +++--- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index b8be8a66a59fd..65f030223c7ca 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -11,7 +11,9 @@ import numpy as np from pandas._libs 
import lib, tslibs -from pandas._libs.tslibs.frequencies import FreqGroup, get_freq_code, get_freq_group +from pandas._libs.tslibs import to_offset +from pandas._libs.tslibs.frequencies import FreqGroup, get_freq_group +from pandas._libs.tslibs.offsets import BaseOffset from pandas.core.dtypes.common import ( is_datetime64_ns_dtype, @@ -522,34 +524,36 @@ def has_level_label(label_flags, vmin): return True -def _daily_finder(vmin, vmax, freq): +def _daily_finder(vmin, vmax, freq: BaseOffset): + dtype_code = freq._period_dtype_code + periodsperday = -1 - if freq >= FreqGroup.FR_HR: - if freq == FreqGroup.FR_NS: + if dtype_code >= FreqGroup.FR_HR: + if dtype_code == FreqGroup.FR_NS: periodsperday = 24 * 60 * 60 * 1000000000 - elif freq == FreqGroup.FR_US: + elif dtype_code == FreqGroup.FR_US: periodsperday = 24 * 60 * 60 * 1000000 - elif freq == FreqGroup.FR_MS: + elif dtype_code == FreqGroup.FR_MS: periodsperday = 24 * 60 * 60 * 1000 - elif freq == FreqGroup.FR_SEC: + elif dtype_code == FreqGroup.FR_SEC: periodsperday = 24 * 60 * 60 - elif freq == FreqGroup.FR_MIN: + elif dtype_code == FreqGroup.FR_MIN: periodsperday = 24 * 60 - elif freq == FreqGroup.FR_HR: + elif dtype_code == FreqGroup.FR_HR: periodsperday = 24 else: # pragma: no cover - raise ValueError(f"unexpected frequency: {freq}") + raise ValueError(f"unexpected frequency: {dtype_code}") periodsperyear = 365 * periodsperday periodspermonth = 28 * periodsperday - elif freq == FreqGroup.FR_BUS: + elif dtype_code == FreqGroup.FR_BUS: periodsperyear = 261 periodspermonth = 19 - elif freq == FreqGroup.FR_DAY: + elif dtype_code == FreqGroup.FR_DAY: periodsperyear = 365 periodspermonth = 28 - elif get_freq_group(freq) == FreqGroup.FR_WK: + elif get_freq_group(dtype_code) == FreqGroup.FR_WK: periodsperyear = 52 periodspermonth = 3 else: # pragma: no cover @@ -676,7 +680,7 @@ def _second_finder(label_interval): elif span <= periodsperyear // 4: month_start = period_break(dates_, "month") info_maj[month_start] = True 
- if freq < FreqGroup.FR_HR: + if dtype_code < FreqGroup.FR_HR: info["min"] = True else: day_start = period_break(dates_, "day") @@ -884,21 +888,20 @@ def _annual_finder(vmin, vmax, freq): return info -def get_finder(freq): - if isinstance(freq, str): - freq = get_freq_code(freq)[0] - fgroup = get_freq_group(freq) +def get_finder(freq: BaseOffset): + dtype_code = freq._period_dtype_code + fgroup = (dtype_code // 1000) * 1000 if fgroup == FreqGroup.FR_ANN: return _annual_finder elif fgroup == FreqGroup.FR_QTR: return _quarterly_finder - elif freq == FreqGroup.FR_MTH: + elif dtype_code == FreqGroup.FR_MTH: return _monthly_finder - elif (freq >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK: + elif (dtype_code >= FreqGroup.FR_BUS) or fgroup == FreqGroup.FR_WK: return _daily_finder else: # pragma: no cover - raise NotImplementedError(f"Unsupported frequency: {freq}") + raise NotImplementedError(f"Unsupported frequency: {dtype_code}") class TimeSeries_DateLocator(Locator): @@ -930,8 +933,7 @@ def __init__( day=1, plot_obj=None, ): - if isinstance(freq, str): - freq = get_freq_code(freq)[0] + freq = to_offset(freq) self.freq = freq self.base = base (self.quarter, self.month, self.day) = (quarter, month, day) @@ -1009,8 +1011,7 @@ class TimeSeries_DateFormatter(Formatter): """ def __init__(self, freq, minor_locator=False, dynamic_mode=True, plot_obj=None): - if isinstance(freq, str): - freq = get_freq_code(freq)[0] + freq = to_offset(freq) self.format = None self.freq = freq self.locs = [] diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 738df5244955a..fa129167a744f 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -6,6 +6,7 @@ import numpy as np import pytest +from pandas._libs.tslibs import to_offset import pandas.util._test_decorators as td from pandas import DataFrame, Index, NaT, Series, isna @@ -397,12 +398,12 @@ def _test(ax): def 
test_get_finder(self): import pandas.plotting._matplotlib.converter as conv - assert conv.get_finder("B") == conv._daily_finder - assert conv.get_finder("D") == conv._daily_finder - assert conv.get_finder("M") == conv._monthly_finder - assert conv.get_finder("Q") == conv._quarterly_finder - assert conv.get_finder("A") == conv._annual_finder - assert conv.get_finder("W") == conv._daily_finder + assert conv.get_finder(to_offset("B")) == conv._daily_finder + assert conv.get_finder(to_offset("D")) == conv._daily_finder + assert conv.get_finder(to_offset("M")) == conv._monthly_finder + assert conv.get_finder(to_offset("Q")) == conv._quarterly_finder + assert conv.get_finder(to_offset("A")) == conv._annual_finder + assert conv.get_finder(to_offset("W")) == conv._daily_finder @pytest.mark.slow def test_finder_daily(self): From ce3915d40b3ac486f2ef2abf5c33e9b9f28c3ca9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 7 Jun 2020 19:27:07 -0700 Subject: [PATCH 0041/1025] CLN: update tslibs/tseries test locations/imports (#34614) --- .../tseries/frequencies/test_freq_code.py | 11 +++----- .../tseries/frequencies/test_frequencies.py | 26 +++++++++++++++++++ pandas/tests/tslibs/test_libfrequencies.py | 22 ---------------- .../frequencies => tslibs}/test_to_offset.py | 0 4 files changed, 30 insertions(+), 29 deletions(-) create mode 100644 pandas/tests/tseries/frequencies/test_frequencies.py rename pandas/tests/{tseries/frequencies => tslibs}/test_to_offset.py (100%) diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index d4eb31168b20e..4df221913b805 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -1,6 +1,6 @@ import pytest -from pandas._libs.tslibs import to_offset +from pandas._libs.tslibs import Resolution, offsets, to_offset from pandas._libs.tslibs.frequencies import ( FreqGroup, _attrname_to_abbrevs, @@ -9,9 +9,6 @@ 
get_freq_group, get_to_timestamp_base, ) -from pandas._libs.tslibs.resolution import Resolution as _reso - -import pandas.tseries.offsets as offsets @pytest.fixture(params=list(_period_code_map.items())) @@ -103,19 +100,19 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): ], ) def test_get_attrname_from_abbrev(freqstr, expected): - assert _reso.get_reso_from_freq(freqstr).attrname == expected + assert Resolution.get_reso_from_freq(freqstr).attrname == expected @pytest.mark.parametrize("freq", ["A", "Q", "M"]) def test_get_freq_unsupported_(freq): # Lowest-frequency resolution is for Day with pytest.raises(KeyError, match=freq.lower()): - _reso.get_reso_from_freq(freq) + Resolution.get_reso_from_freq(freq) @pytest.mark.parametrize("freq", ["D", "H", "T", "S", "L", "U", "N"]) def test_get_freq_roundtrip2(freq): - obj = _reso.get_reso_from_freq(freq) + obj = Resolution.get_reso_from_freq(freq) result = _attrname_to_abbrevs[obj.attrname] assert freq == result diff --git a/pandas/tests/tseries/frequencies/test_frequencies.py b/pandas/tests/tseries/frequencies/test_frequencies.py new file mode 100644 index 0000000000000..0479de8e8e7c3 --- /dev/null +++ b/pandas/tests/tseries/frequencies/test_frequencies.py @@ -0,0 +1,26 @@ +import pytest + +from pandas._libs.tslibs import offsets + +from pandas.tseries.frequencies import is_subperiod, is_superperiod + + +@pytest.mark.parametrize( + "p1,p2,expected", + [ + # Input validation. 
+ (offsets.MonthEnd(), None, False), + (offsets.YearEnd(), None, False), + (None, offsets.YearEnd(), False), + (None, offsets.MonthEnd(), False), + (None, None, False), + (offsets.YearEnd(), offsets.MonthEnd(), True), + (offsets.Hour(), offsets.Minute(), True), + (offsets.Second(), offsets.Milli(), True), + (offsets.Milli(), offsets.Micro(), True), + (offsets.Micro(), offsets.Nano(), True), + ], +) +def test_super_sub_symmetry(p1, p2, expected): + assert is_superperiod(p1, p2) is expected + assert is_subperiod(p2, p1) is expected diff --git a/pandas/tests/tslibs/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py index 65d3b15bb3dac..feaaaf6adca6f 100644 --- a/pandas/tests/tslibs/test_libfrequencies.py +++ b/pandas/tests/tslibs/test_libfrequencies.py @@ -4,7 +4,6 @@ from pandas._libs.tslibs.parsing import get_rule_month from pandas.tseries import offsets -from pandas.tseries.frequencies import is_subperiod, is_superperiod # TODO: move tests @pytest.mark.parametrize( @@ -56,27 +55,6 @@ def test_period_str_to_code(obj, expected): assert _period_str_to_code(obj) == expected -@pytest.mark.parametrize( - "p1,p2,expected", - [ - # Input validation. 
- (offsets.MonthEnd(), None, False), - (offsets.YearEnd(), None, False), - (None, offsets.YearEnd(), False), - (None, offsets.MonthEnd(), False), - (None, None, False), - (offsets.YearEnd(), offsets.MonthEnd(), True), - (offsets.Hour(), offsets.Minute(), True), - (offsets.Second(), offsets.Milli(), True), - (offsets.Milli(), offsets.Micro(), True), - (offsets.Micro(), offsets.Nano(), True), - ], -) -def test_super_sub_symmetry(p1, p2, expected): - assert is_superperiod(p1, p2) is expected - assert is_subperiod(p2, p1) is expected - - @pytest.mark.parametrize( "freq,expected,aliases", [ diff --git a/pandas/tests/tseries/frequencies/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py similarity index 100% rename from pandas/tests/tseries/frequencies/test_to_offset.py rename to pandas/tests/tslibs/test_to_offset.py From 82483b583d6052c1d15913cf11f7809284d313e8 Mon Sep 17 00:00:00 2001 From: Atsushi Nukariya Date: Mon, 8 Jun 2020 11:38:21 +0900 Subject: [PATCH 0042/1025] Improve document for **kwargs argument of pandas.Series.to_markdown (#34616) --- pandas/core/series.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index ef47e52151961..71ffdcbd40fe7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1422,6 +1422,21 @@ def to_string( | 1 | pig | | 2 | dog | | 3 | quetzal | + + Output markdown with a tabulate option. 
+ + >>> print(s.to_markdown(tablefmt="grid")) + +----+----------+ + | | animal | + +====+==========+ + | 0 | elk | + +----+----------+ + | 1 | pig | + +----+----------+ + | 2 | dog | + +----+----------+ + | 3 | quetzal | + +----+----------+ """ ) @Substitution(klass="Series") From d80d5ae8f8bacca1d109315bebed0392fcad27bc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 8 Jun 2020 07:25:29 -0700 Subject: [PATCH 0043/1025] CLN: EWMA cython code and function dispatch (#34636) Co-authored-by: Matt Roeschke --- pandas/_libs/window/aggregations.pyx | 28 ++++++++++++------------ pandas/core/window/ewm.py | 32 +++++++++++----------------- 2 files changed, 26 insertions(+), 34 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index afa0539014041..9e088062d7280 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1793,19 +1793,19 @@ def ewma(float64_t[:] vals, float64_t com, int adjust, bint ignore_na, int minp) new_wt = 1. if adjust else alpha weighted_avg = vals[0] - is_observation = (weighted_avg == weighted_avg) + is_observation = weighted_avg == weighted_avg nobs = int(is_observation) - output[0] = weighted_avg if (nobs >= minp) else NaN + output[0] = weighted_avg if nobs >= minp else NaN old_wt = 1. 
with nogil: for i in range(1, N): cur = vals[i] - is_observation = (cur == cur) + is_observation = cur == cur nobs += is_observation if weighted_avg == weighted_avg: - if is_observation or (not ignore_na): + if is_observation or not ignore_na: old_wt *= old_wt_factor if is_observation: @@ -1821,7 +1821,7 @@ def ewma(float64_t[:] vals, float64_t com, int adjust, bint ignore_na, int minp) elif is_observation: weighted_avg = cur - output[i] = weighted_avg if (nobs >= minp) else NaN + output[i] = weighted_avg if nobs >= minp else NaN return output @@ -1851,7 +1851,7 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, """ cdef: - Py_ssize_t N = len(input_x) + Py_ssize_t N = len(input_x), M = len(input_y) float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y float64_t numerator, denominator @@ -1859,8 +1859,8 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, ndarray[float64_t] output bint is_observation - if len(input_y) != N: - raise ValueError(f"arrays are of different lengths ({N} and {len(input_y)})") + if M != N: + raise ValueError(f"arrays are of different lengths ({N} and {M})") output = np.empty(N, dtype=float) if N == 0: @@ -1874,12 +1874,12 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, mean_x = input_x[0] mean_y = input_y[0] - is_observation = ((mean_x == mean_x) and (mean_y == mean_y)) + is_observation = (mean_x == mean_x) and (mean_y == mean_y) nobs = int(is_observation) if not is_observation: mean_x = NaN mean_y = NaN - output[0] = (0. if bias else NaN) if (nobs >= minp) else NaN + output[0] = (0. if bias else NaN) if nobs >= minp else NaN cov = 0. sum_wt = 1. sum_wt2 = 1. 
@@ -1890,10 +1890,10 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, for i in range(1, N): cur_x = input_x[i] cur_y = input_y[i] - is_observation = ((cur_x == cur_x) and (cur_y == cur_y)) + is_observation = (cur_x == cur_x) and (cur_y == cur_y) nobs += is_observation if mean_x == mean_x: - if is_observation or (not ignore_na): + if is_observation or not ignore_na: sum_wt *= old_wt_factor sum_wt2 *= (old_wt_factor * old_wt_factor) old_wt *= old_wt_factor @@ -1929,8 +1929,8 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, if not bias: numerator = sum_wt * sum_wt denominator = numerator - sum_wt2 - if (denominator > 0.): - output[i] = ((numerator / denominator) * cov) + if denominator > 0: + output[i] = (numerator / denominator) * cov else: output[i] = NaN else: diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index d5f2b67eeac2e..a5e30c900cae2 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -1,3 +1,4 @@ +from functools import partial from textwrap import dedent import numpy as np @@ -219,7 +220,7 @@ def aggregate(self, func, *args, **kwargs): agg = aggregate - def _apply(self, func, **kwargs): + def _apply(self, func): """ Rolling statistical measure using supplied function. Designed to be used with passed-in Cython array-based functions. 
@@ -253,23 +254,6 @@ def _apply(self, func, **kwargs): results.append(values.copy()) continue - # if we have a string function name, wrap it - if isinstance(func, str): - cfunc = getattr(window_aggregations, func, None) - if cfunc is None: - raise ValueError( - f"we do not support this function in window_aggregations.{func}" - ) - - def func(arg): - return cfunc( - arg, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - ) - results.append(np.apply_along_axis(func, self.axis, values)) return self._wrap_results(results, block_list, obj, exclude) @@ -286,7 +270,15 @@ def mean(self, *args, **kwargs): Arguments and keyword arguments to be passed into func. """ nv.validate_window_func("mean", args, kwargs) - return self._apply("ewma", **kwargs) + window_func = self._get_roll_func("ewma") + window_func = partial( + window_func, + com=self.com, + adjust=int(self.adjust), + ignore_na=self.ignore_na, + minp=int(self.min_periods), + ) + return self._apply(window_func) @Substitution(name="ewm", func_name="std") @Appender(_doc_template) @@ -320,7 +312,7 @@ def f(arg): int(bias), ) - return self._apply(f, **kwargs) + return self._apply(f) @Substitution(name="ewm", func_name="cov") @Appender(_doc_template) From de45dceeb558730a7a25874993302e50ddefb47c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 8 Jun 2020 09:09:37 -0700 Subject: [PATCH 0044/1025] REF: re-use existing conversion functions (#34625) --- pandas/_libs/tslibs/period.pyx | 67 ++++++++++++++++------------------ 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 5c890c7fbf59d..32acbcfb39b50 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -763,10 +763,9 @@ cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil: period_ordinal : int64_t """ cdef: - int64_t unix_date, seconds, delta - int64_t weeks - int64_t day_adj + int64_t unix_date int 
freq_group, fmonth, mdiff + NPY_DATETIMEUNIT unit freq_group = get_freq_group(freq) @@ -789,44 +788,42 @@ cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil: mdiff = dts.month - fmonth + 12 return (dts.year - 1970) * 4 + (mdiff - 1) // 3 - elif freq == FR_MTH: - return (dts.year - 1970) * 12 + dts.month - 1 - - unix_date = npy_datetimestruct_to_datetime(NPY_FR_D, dts) - - if freq >= FR_SEC: - seconds = unix_date * 86400 + dts.hour * 3600 + dts.min * 60 + dts.sec - - if freq == FR_MS: - return seconds * 1000 + dts.us // 1000 - - elif freq == FR_US: - return seconds * 1000000 + dts.us - - elif freq == FR_NS: - return (seconds * 1000000000 + - dts.us * 1000 + dts.ps // 1000) + elif freq_group == FR_WK: + unix_date = npy_datetimestruct_to_datetime(NPY_FR_D, dts) + return unix_date_to_week(unix_date, freq - FR_WK) - else: - return seconds + elif freq == FR_BUS: + unix_date = npy_datetimestruct_to_datetime(NPY_FR_D, dts) + return DtoB(dts, 0, unix_date) - elif freq == FR_MIN: - return unix_date * 1440 + dts.hour * 60 + dts.min + unit = get_unit(freq) + return npy_datetimestruct_to_datetime(unit, dts) - elif freq == FR_HR: - return unix_date * 24 + dts.hour +cdef NPY_DATETIMEUNIT get_unit(int freq) nogil: + """ + Convert the freq to the corresponding NPY_DATETIMEUNIT to pass + to npy_datetimestruct_to_datetime. 
+ """ + if freq == FR_MTH: + return NPY_DATETIMEUNIT.NPY_FR_M elif freq == FR_DAY: - return unix_date - + return NPY_DATETIMEUNIT.NPY_FR_D + elif freq == FR_HR: + return NPY_DATETIMEUNIT.NPY_FR_h + elif freq == FR_MIN: + return NPY_DATETIMEUNIT.NPY_FR_m + elif freq == FR_SEC: + return NPY_DATETIMEUNIT.NPY_FR_s + elif freq == FR_MS: + return NPY_DATETIMEUNIT.NPY_FR_ms + elif freq == FR_US: + return NPY_DATETIMEUNIT.NPY_FR_us + elif freq == FR_NS: + return NPY_DATETIMEUNIT.NPY_FR_ns elif freq == FR_UND: - return unix_date - - elif freq == FR_BUS: - return DtoB(dts, 0, unix_date) - - elif freq_group == FR_WK: - return unix_date_to_week(unix_date, freq - FR_WK) + # Default to Day + return NPY_DATETIMEUNIT.NPY_FR_D cdef void get_date_info(int64_t ordinal, int freq, npy_datetimestruct *dts) nogil: From 957c4a81fdc34539f11f11a1ca8a751688bacab6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 8 Jun 2020 09:15:52 -0700 Subject: [PATCH 0045/1025] REF: RelativeDeltaOffset.apply_index operate on ndarray (#34613) --- pandas/_libs/tslibs/offsets.pyx | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 28ead3593cf85..66d29e58a6966 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -557,7 +557,7 @@ cdef class BaseOffset: # ------------------------------------------------------------------ @apply_index_wraps - def apply_index(self, index): + def apply_index(self, dtindex): """ Vectorized apply of DateOffset to DatetimeIndex, raises NotImplementedError for offsets without a @@ -1029,7 +1029,7 @@ cdef class RelativeDeltaOffset(BaseOffset): return other + timedelta(self.n) @apply_index_wraps - def apply_index(self, index): + def apply_index(self, dtindex): """ Vectorized apply of DateOffset to DatetimeIndex, raises NotImplementedError for offsets without a @@ -1041,8 +1041,9 @@ cdef class RelativeDeltaOffset(BaseOffset): Returns ------- - 
DatetimeIndex + ndarray[datetime64[ns]] """ + dt64other = np.asarray(dtindex) kwds = self.kwds relativedelta_fast = { "years", @@ -1059,12 +1060,12 @@ cdef class RelativeDeltaOffset(BaseOffset): months = (kwds.get("years", 0) * 12 + kwds.get("months", 0)) * self.n if months: - shifted = shift_months(index.asi8, months) - index = type(index)(shifted, dtype=index.dtype) + shifted = shift_months(dt64other.view("i8"), months) + dt64other = shifted.view("datetime64[ns]") weeks = kwds.get("weeks", 0) * self.n if weeks: - index = index + timedelta(days=7 * weeks) + dt64other = dt64other + Timedelta(days=7 * weeks) timedelta_kwds = { k: v @@ -1073,11 +1074,11 @@ cdef class RelativeDeltaOffset(BaseOffset): } if timedelta_kwds: delta = Timedelta(**timedelta_kwds) - index = index + (self.n * delta) - return index + dt64other = dt64other + (self.n * delta) + return dt64other elif not self._use_relativedelta and hasattr(self, "_offset"): # timedelta - return index + (self._offset * self.n) + return dt64other + Timedelta(self._offset * self.n) else: # relativedelta with other keywords kwd = set(kwds) - relativedelta_fast From bba2359c57c4be6d685b54bab0e5dc8434566b82 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 8 Jun 2020 09:17:24 -0700 Subject: [PATCH 0046/1025] REF: mix PeriodPseudoDtype into PeriodDtype (#34590) --- pandas/_libs/tslibs/__init__.py | 3 ++- pandas/_libs/tslibs/dtypes.pxd | 4 +++- pandas/_libs/tslibs/dtypes.pyx | 6 +++--- pandas/_libs/tslibs/offsets.pyx | 4 +--- pandas/_libs/tslibs/period.pyx | 8 ++++---- pandas/core/arrays/period.py | 16 ++++++++-------- pandas/core/dtypes/dtypes.py | 15 ++++++++++----- pandas/plotting/_matplotlib/timeseries.py | 4 ++-- pandas/tests/dtypes/test_dtypes.py | 6 +++++- 9 files changed, 38 insertions(+), 28 deletions(-) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 25e2d8ba477e0..6dbb4ce7bc974 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -1,4 
+1,5 @@ __all__ = [ + "dtypes", "localize_pydatetime", "NaT", "NaTType", @@ -17,7 +18,7 @@ "to_offset", ] - +from . import dtypes # type: ignore from .conversion import localize_pydatetime from .nattype import NaT, NaTType, iNaT, is_null_datetimelike, nat_strings from .np_datetime import OutOfBoundsDatetime diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index 23c473726e5a9..b6373550b1c78 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -50,7 +50,9 @@ cdef enum PeriodDtypeCode: U = 11000 # Microsecondly N = 12000 # Nanosecondly + UNDEFINED = -10_000 -cdef class PeriodPseudoDtype: + +cdef class PeriodDtypeBase: cdef readonly: PeriodDtypeCode dtype_code diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index d0d4e579a456b..047f942178179 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -2,7 +2,7 @@ # originals -cdef class PeriodPseudoDtype: +cdef class PeriodDtypeBase: """ Similar to an actual dtype, this contains all of the information describing a PeriodDtype in an integer code. 
@@ -14,9 +14,9 @@ cdef class PeriodPseudoDtype: self.dtype_code = code def __eq__(self, other): - if not isinstance(other, PeriodPseudoDtype): + if not isinstance(other, PeriodDtypeBase): return False - if not isinstance(self, PeriodPseudoDtype): + if not isinstance(self, PeriodDtypeBase): # cython semantics, this is a reversed op return False return self.dtype_code == other.dtype_code diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 66d29e58a6966..4069d192d9e88 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -2529,12 +2529,10 @@ cdef class Week(SingleConstructorOffset): ------- result : DatetimeIndex """ - from .frequencies import get_freq_code # TODO: avoid circular import - i8other = dtindex.asi8 off = (i8other % DAY_NANOS).view("timedelta64[ns]") - base, mult = get_freq_code(self.freqstr) + base = self._period_dtype_code base_period = dtindex.to_period(base) if self.n > 0: diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 32acbcfb39b50..55148041c1718 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -56,7 +56,7 @@ from pandas._libs.tslibs.ccalendar cimport ( ) from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS -from pandas._libs.tslibs.dtypes cimport PeriodPseudoDtype +from pandas._libs.tslibs.dtypes cimport PeriodDtypeBase from pandas._libs.tslibs.frequencies cimport ( attrname_to_abbrevs, @@ -1514,7 +1514,7 @@ cdef class _Period: cdef readonly: int64_t ordinal - PeriodPseudoDtype _dtype + PeriodDtypeBase _dtype BaseOffset freq def __cinit__(self, int64_t ordinal, BaseOffset freq): @@ -1523,7 +1523,7 @@ cdef class _Period: # Note: this is more performant than PeriodDtype.from_date_offset(freq) # because from_date_offset cannot be made a cdef method (until cython # supported cdef classmethods) - self._dtype = PeriodPseudoDtype(freq._period_dtype_code) + self._dtype = PeriodDtypeBase(freq._period_dtype_code) 
@classmethod def _maybe_convert_freq(cls, object freq) -> BaseOffset: @@ -2460,7 +2460,7 @@ class Period(_Period): raise ValueError(msg) if ordinal is None: - base, mult = get_freq_code(freq) + base, _ = get_freq_code(freq) ordinal = period_ordinal(dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond, 0, base) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 1b8a0b2780a7d..b16a3df003512 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -52,7 +52,7 @@ def _field_accessor(name: str, docstring=None): def f(self): - base, _ = libfrequencies.get_freq_code(self.freq) + base = self.freq._period_dtype_code result = get_period_field_arr(name, self.asi8, base) return result @@ -440,12 +440,12 @@ def to_timestamp(self, freq=None, how="start"): return (self + self.freq).to_timestamp(how="start") - adjust if freq is None: - base, mult = libfrequencies.get_freq_code(self.freq) + base = self.freq._period_dtype_code freq = libfrequencies.get_to_timestamp_base(base) else: freq = Period._maybe_convert_freq(freq) - base, mult = libfrequencies.get_freq_code(freq) + base, _ = libfrequencies.get_freq_code(freq) new_data = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) @@ -523,14 +523,14 @@ def asfreq(self, freq=None, how: str = "E") -> "PeriodArray": freq = Period._maybe_convert_freq(freq) - base1, mult1 = libfrequencies.get_freq_code(self.freq) - base2, mult2 = libfrequencies.get_freq_code(freq) + base1 = self.freq._period_dtype_code + base2 = freq._period_dtype_code asi8 = self.asi8 - # mult1 can't be negative or 0 + # self.freq.n can't be negative or 0 end = how == "E" if end: - ordinal = asi8 + mult1 - 1 + ordinal = asi8 + self.freq.n - 1 else: ordinal = asi8 @@ -950,7 +950,7 @@ def dt64arr_to_periodarr(data, freq, tz=None): if isinstance(data, (ABCIndexClass, ABCSeries)): data = data._values - base, mult = libfrequencies.get_freq_code(freq) + base = 
freq._period_dtype_code return libperiod.dt64arr_to_periodarr(data.view("i8"), base, tz), freq diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 84284c581c9e5..b9d16ac5959e3 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -20,7 +20,7 @@ import pytz from pandas._libs.interval import Interval -from pandas._libs.tslibs import NaT, Period, Timestamp, timezones, to_offset +from pandas._libs.tslibs import NaT, Period, Timestamp, dtypes, timezones, to_offset from pandas._libs.tslibs.offsets import BaseOffset from pandas._typing import DtypeObj, Ordered @@ -848,7 +848,7 @@ def __setstate__(self, state) -> None: @register_extension_dtype -class PeriodDtype(PandasExtensionDtype): +class PeriodDtype(dtypes.PeriodDtypeBase, PandasExtensionDtype): """ An ExtensionDtype for Period data. @@ -896,7 +896,8 @@ def __new__(cls, freq=None): elif freq is None: # empty constructor for pickle compat - u = object.__new__(cls) + # -10_000 corresponds to PeriodDtypeCode.UNDEFINED + u = dtypes.PeriodDtypeBase.__new__(cls, -10_000) u._freq = None return u @@ -906,11 +907,15 @@ def __new__(cls, freq=None): try: return cls._cache[freq.freqstr] except KeyError: - u = object.__new__(cls) + dtype_code = freq._period_dtype_code + u = dtypes.PeriodDtypeBase.__new__(cls, dtype_code) u._freq = freq cls._cache[freq.freqstr] = u return u + def __reduce__(self): + return type(self), (self.freq,) + @property def freq(self): """ @@ -977,7 +982,7 @@ def __eq__(self, other: Any) -> bool: return isinstance(other, PeriodDtype) and self.freq == other.freq def __setstate__(self, state): - # for pickle compat. __get_state__ is defined in the + # for pickle compat. 
__getstate__ is defined in the # PandasExtensionDtype superclass and uses the public properties to # pickle -> need to set the settable private ones here (see GH26067) self._freq = state["freq"] diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 475452c71db58..a9cca32271b9f 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -6,7 +6,7 @@ import numpy as np from pandas._libs.tslibs import Period, to_offset -from pandas._libs.tslibs.frequencies import FreqGroup, base_and_stride, get_freq_code +from pandas._libs.tslibs.frequencies import FreqGroup, base_and_stride from pandas._typing import FrameOrSeriesUnion from pandas.core.dtypes.generic import ( @@ -213,7 +213,7 @@ def _use_dynamic_x(ax, data: "FrameOrSeriesUnion") -> bool: # FIXME: hack this for 0.10.1, creating more technical debt...sigh if isinstance(data.index, ABCDatetimeIndex): - base = get_freq_code(freq)[0] + base = to_offset(freq)._period_dtype_code x = data.index if base <= FreqGroup.FR_DAY: return x[:1].is_normalized diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 2684aa2c590d9..3b9d3dc0b91f6 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -67,7 +67,11 @@ def test_pickle(self, dtype): # force back to the cache result = tm.round_trip_pickle(dtype) - assert not len(dtype._cache) + if not isinstance(dtype, PeriodDtype): + # Because PeriodDtype has a cython class as a base class, + # it has different pickle semantics, and its cache is re-populated + # on un-pickling. 
+ assert not len(dtype._cache) assert result == dtype From b702595b2ef3bc7ebd79a7ffaafb7f647788eda3 Mon Sep 17 00:00:00 2001 From: Phan Duc Nhat Minh Date: Tue, 9 Jun 2020 06:10:03 +0800 Subject: [PATCH 0047/1025] BUG/API: Disallow unit if input to Timedelta and to_timedelta is/contains str (#34634) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/tslibs/timedeltas.pyx | 15 ++++++++++++--- pandas/core/arrays/timedeltas.py | 11 +++++++---- pandas/core/computation/pytables.py | 5 ++++- pandas/core/tools/timedeltas.py | 11 ++++++++--- .../tests/scalar/timedelta/test_constructors.py | 14 ++++++++++++++ 6 files changed, 46 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b2b55b7b503ec..cf5a6976524de 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -827,6 +827,7 @@ Timedelta - Bug in :func:`timedelta_range` that produced an extra point on a edge case (:issue:`30353`, :issue:`33498`) - Bug in :meth:`DataFrame.resample` that produced an extra point on a edge case (:issue:`30353`, :issue:`13022`, :issue:`33498`) - Bug in :meth:`DataFrame.resample` that ignored the ``loffset`` argument when dealing with timedelta (:issue:`7687`, :issue:`33498`) +- Bug in :class:`Timedelta` and `pandas.to_timedelta` that ignored `unit`-argument for string input (:issue:`12136`) Timezones ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index a239804ea7bc2..a5b502f3f4071 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -218,7 +218,7 @@ cdef convert_to_timedelta64(object ts, object unit): @cython.boundscheck(False) @cython.wraparound(False) -def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): +def array_to_timedelta64(object[:] values, unit=None, errors='raise'): """ Convert an ndarray to an array of timedeltas. If errors == 'coerce', coerce non-convertible objects to NaT. 
Otherwise, raise. @@ -235,6 +235,13 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): result = np.empty(n, dtype='m8[ns]') iresult = result.view('i8') + if unit is not None: + for i in range(n): + if isinstance(values[i], str): + raise ValueError( + "unit must not be specified if the input contains a str" + ) + # Usually, we have all strings. If so, we hit the fast path. # If this path fails, we try conversion a different way, and # this is where all of the error handling will take place. @@ -247,10 +254,10 @@ def array_to_timedelta64(object[:] values, unit='ns', errors='raise'): else: result[i] = parse_timedelta_string(values[i]) except (TypeError, ValueError): - unit = parse_timedelta_unit(unit) + parsed_unit = parse_timedelta_unit(unit or 'ns') for i in range(n): try: - result[i] = convert_to_timedelta64(values[i], unit) + result[i] = convert_to_timedelta64(values[i], parsed_unit) except ValueError: if errors == 'coerce': result[i] = NPY_NAT @@ -1155,6 +1162,8 @@ class Timedelta(_Timedelta): elif isinstance(value, _Timedelta): value = value.value elif isinstance(value, str): + if unit is not None: + raise ValueError("unit must not be specified if the value is a str") if len(value) > 0 and value[0] == 'P': value = parse_iso_format_string(value) else: diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f439f07790274..d0657994dd81c 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -876,7 +876,7 @@ def f(x): # Constructor Helpers -def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): +def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): """ Parameters ---------- @@ -884,6 +884,7 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): copy : bool, default False unit : str, default "ns" The timedelta unit to treat integers as multiples of. + Must be un-specifed if the data contains a str. 
errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. @@ -906,7 +907,8 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): higher level. """ inferred_freq = None - unit = parse_timedelta_unit(unit) + if unit is not None: + unit = parse_timedelta_unit(unit) # Unwrap whatever we have into a np.ndarray if not hasattr(data, "dtype"): @@ -936,7 +938,7 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): # cast the unit, multiply base/frac separately # to avoid precision issues from float -> int mask = np.isnan(data) - m, p = precision_from_unit(unit) + m, p = precision_from_unit(unit or "ns") base = data.astype(np.int64) frac = data - base if p: @@ -1002,7 +1004,7 @@ def ints_to_td64ns(data, unit="ns"): return data, copy_made -def objects_to_td64ns(data, unit="ns", errors="raise"): +def objects_to_td64ns(data, unit=None, errors="raise"): """ Convert a object-dtyped or string-dtyped array into an timedelta64[ns]-dtyped array. @@ -1012,6 +1014,7 @@ def objects_to_td64ns(data, unit="ns", errors="raise"): data : ndarray or Index unit : str, default "ns" The timedelta unit to treat integers as multiples of. + Must not be specified if the data contains a str. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. 
diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 15d9987310f18..001eb1789007f 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -200,7 +200,10 @@ def stringify(value): v = v.tz_convert("UTC") return TermValue(v, v.value, kind) elif kind == "timedelta64" or kind == "timedelta": - v = Timedelta(v, unit="s").value + if isinstance(v, str): + v = Timedelta(v).value + else: + v = Timedelta(v, unit="s").value return TermValue(int(v), v, kind) elif meta == "category": metadata = extract_array(self.metadata, extract_numpy=True) diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 51b404b46f321..87eac93a6072c 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -13,7 +13,7 @@ from pandas.core.arrays.timedeltas import sequence_to_td64ns -def to_timedelta(arg, unit="ns", errors="raise"): +def to_timedelta(arg, unit=None, errors="raise"): """ Convert argument to timedelta. @@ -27,6 +27,7 @@ def to_timedelta(arg, unit="ns", errors="raise"): arg : str, timedelta, list-like or Series The data to be converted to timedelta. unit : str, default 'ns' + Must not be specified if the arg is/contains a str. Denotes the unit of the arg. 
Possible values: ('W', 'D', 'days', 'day', 'hours', hour', 'hr', 'h', 'm', 'minute', 'min', 'minutes', 'T', 'S', 'seconds', @@ -76,7 +77,8 @@ def to_timedelta(arg, unit="ns", errors="raise"): TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], dtype='timedelta64[ns]', freq=None) """ - unit = parse_timedelta_unit(unit) + if unit is not None: + unit = parse_timedelta_unit(unit) if errors not in ("ignore", "raise", "coerce"): raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'}") @@ -104,6 +106,9 @@ def to_timedelta(arg, unit="ns", errors="raise"): "arg must be a string, timedelta, list, tuple, 1-d array, or Series" ) + if isinstance(arg, str) and unit is not None: + raise ValueError("unit must not be specified if the input is/contains a str") + # ...so it must be a scalar value. Return scalar. return _coerce_scalar_to_timedelta_type(arg, unit=unit, errors=errors) @@ -124,7 +129,7 @@ def _coerce_scalar_to_timedelta_type(r, unit="ns", errors="raise"): return result -def _convert_listlike(arg, unit="ns", errors="raise", name=None): +def _convert_listlike(arg, unit=None, errors="raise", name=None): """Convert a list of objects to a timedelta index object.""" if isinstance(arg, (list, tuple)) or not hasattr(arg, "dtype"): # This is needed only to ensure that in the case where we end up diff --git a/pandas/tests/scalar/timedelta/test_constructors.py b/pandas/tests/scalar/timedelta/test_constructors.py index c58994d738562..23fb25b838da6 100644 --- a/pandas/tests/scalar/timedelta/test_constructors.py +++ b/pandas/tests/scalar/timedelta/test_constructors.py @@ -289,3 +289,17 @@ def test_timedelta_constructor_identity(): expected = Timedelta(np.timedelta64(1, "s")) result = Timedelta(expected) assert result is expected + + +@pytest.mark.parametrize( + "constructor, value, unit, expectation", + [ + (Timedelta, "10s", "ms", (ValueError, "unit must not be specified")), + (to_timedelta, "10s", "ms", (ValueError, "unit must not be specified")), + 
(to_timedelta, ["1", 2, 3], "s", (ValueError, "unit must not be specified")), + ], +) +def test_string_with_unit(constructor, value, unit, expectation): + exp, match = expectation + with pytest.raises(exp, match=match): + _ = constructor(value, unit=unit) From 6a40178aaaf2205dffd02177253e0e803e87a684 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 8 Jun 2020 15:55:09 -0700 Subject: [PATCH 0048/1025] REF: simplify get_yq (#34649) --- pandas/_libs/tslibs/period.pyx | 51 ++++++++++++++++------------------ 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 55148041c1718..c628f5361acad 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -497,12 +497,11 @@ cdef int64_t asfreq_DTtoA(int64_t ordinal, asfreq_info *af_info) nogil: return (dts.year - 1970) -cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, int *year) nogil: +cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, npy_datetimestruct* dts) nogil: cdef: - npy_datetimestruct dts int quarter - pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, &dts) + pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, dts) if af_info.to_end != 12: dts.month -= af_info.to_end if dts.month <= 0: @@ -510,19 +509,19 @@ cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, int *year) nogil: else: dts.year += 1 - year[0] = dts.year quarter = month_to_quarter(dts.month) return quarter cdef int64_t asfreq_DTtoQ(int64_t ordinal, asfreq_info *af_info) nogil: cdef: - int year, quarter + int quarter + npy_datetimestruct dts ordinal = downsample_daytime(ordinal, af_info) - quarter = DtoQ_yq(ordinal, af_info, &year) - return ((year - 1970) * 4 + quarter - 1) + quarter = DtoQ_yq(ordinal, af_info, &dts) + return ((dts.year - 1970) * 4 + quarter - 1) cdef int64_t asfreq_DTtoM(int64_t ordinal, asfreq_info *af_info) nogil: @@ -919,7 +918,7 @@ cdef int64_t get_time_nanos(int freq, int64_t unix_date, int64_t ordinal) 
nogil: return sub * factor -cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): +cdef int get_yq(int64_t ordinal, int freq, npy_datetimestruct* dts): """ Find the year and quarter of a Period with the given ordinal and frequency @@ -927,22 +926,22 @@ cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): ---------- ordinal : int64_t freq : int - quarter : *int - year : *int + dts : *npy_datetimestruct Returns ------- - qtr_freq : int + quarter : int describes the implied quarterly frequency associated with `freq` Notes ----- - Sets quarter and year inplace + Sets dts.year in-place. """ cdef: asfreq_info af_info int qtr_freq int64_t unix_date + int quarter unix_date = get_unix_date(ordinal, freq) @@ -951,11 +950,10 @@ cdef int get_yq(int64_t ordinal, int freq, int *quarter, int *year): else: qtr_freq = FR_QTR - assert (qtr_freq % 1000) <= 12 get_asfreq_info(FR_DAY, qtr_freq, True, &af_info) - quarter[0] = DtoQ_yq(unix_date, &af_info, year) - return qtr_freq + quarter = DtoQ_yq(unix_date, &af_info, dts) + return quarter cdef inline int month_to_quarter(int month) nogil: @@ -1225,15 +1223,15 @@ cdef str _period_strftime(int64_t value, int freq, bytes fmt): for i in range(len(extra_fmts)): if found_pat[i]: - if get_yq(value, freq, &quarter, &year) < 0: - raise ValueError('Unable to get quarter and year') + + quarter = get_yq(value, freq, &dts) if i == 0: repl = str(quarter) elif i == 1: # %f, 2-digit year - repl = f"{(year % 100):02d}" + repl = f"{(dts.year % 100):02d}" elif i == 2: - repl = str(year) + repl = str(dts.year) elif i == 3: repl = f"{(value % 1_000):03d}" elif i == 4: @@ -1259,20 +1257,19 @@ cdef int pyear(int64_t ordinal, int freq): return dts.year -@cython.cdivision cdef int pqyear(int64_t ordinal, int freq): cdef: - int year = 0 - int quarter = 0 - get_yq(ordinal, freq, &quarter, &year) - return year + npy_datetimestruct dts + + get_yq(ordinal, freq, &dts) + return dts.year cdef int pquarter(int64_t ordinal, int freq): 
cdef: - int year = 0 - int quarter = 0 - get_yq(ordinal, freq, &quarter, &year) + int quarter + npy_datetimestruct dts + quarter = get_yq(ordinal, freq, &dts) return quarter From cb15406bb8b1e0beff06ab35ac0993a0175fa39c Mon Sep 17 00:00:00 2001 From: willpeppo Date: Mon, 8 Jun 2020 18:56:49 -0400 Subject: [PATCH 0049/1025] DOC: updated _testing.py for PR08 errors (#34653) --- pandas/_testing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 0180169973e0c..61eab6b8152e1 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1047,7 +1047,7 @@ def assert_extension_array_equal( check_exact : bool, default False Whether to compare number exactly. index_values : numpy.ndarray, default None - optional index (shared by both left and right), used in output. + Optional index (shared by both left and right), used in output. Notes ----- From b776dbd00b25e11b9530e841932b2f7510dcc080 Mon Sep 17 00:00:00 2001 From: willpeppo Date: Mon, 8 Jun 2020 18:58:11 -0400 Subject: [PATCH 0050/1025] DOC: updated io/sql.py for PR08 errors (#34646) --- pandas/io/sql.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 991d222bfae1f..b137608475b3d 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -332,11 +332,9 @@ def read_sql_query( ---------- sql : str SQL query or SQLAlchemy Selectable (select or text object) SQL query to be executed. - con : SQLAlchemy connectable(engine/connection), database str URI, - or sqlite3 DBAPI2 connection + con : SQLAlchemy connectable, str, or sqlite3 connection Using SQLAlchemy makes it possible to use any DB supported by that - library. - If a DBAPI2 object, only sqlite3 is supported. + library. If a DBAPI2 object, only sqlite3 is supported. index_col : str or list of str, optional, default: None Column(s) to set as index(MultiIndex). 
coerce_float : bool, default True @@ -438,9 +436,7 @@ def read_sql( ---------- sql : str or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name. - con : SQLAlchemy connectable (engine/connection) or database str URI - or DBAPI2 connection (fallback mode). - + con : SQLAlchemy connectable, str, or sqlite3 connection Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible for engine disposal and connection closure for the SQLAlchemy connectable. See From ff878c81abf4336c58fb0e28f5c55020ab34d5e7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 8 Jun 2020 16:00:24 -0700 Subject: [PATCH 0051/1025] REF: implement c_FreqGroup, FreqGroup in tslibs.dtypes (#34588) --- pandas/_libs/tslibs/dtypes.pxd | 17 +++++++ pandas/_libs/tslibs/dtypes.pyx | 17 +++++++ pandas/_libs/tslibs/frequencies.pyx | 17 +------ pandas/_libs/tslibs/period.pyx | 77 +++++++++++------------------ 4 files changed, 65 insertions(+), 63 deletions(-) diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index b6373550b1c78..bce071d45c12f 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -1,4 +1,21 @@ +cdef enum c_FreqGroup: + # Mirrors FreqGroup in the .pxy file + FR_ANN = 1000 + FR_QTR = 2000 + FR_MTH = 3000 + FR_WK = 4000 + FR_BUS = 5000 + FR_DAY = 6000 + FR_HR = 7000 + FR_MIN = 8000 + FR_SEC = 9000 + FR_MS = 10000 + FR_US = 11000 + FR_NS = 12000 + FR_UND = -10000 # undefined + + cdef enum PeriodDtypeCode: # Annual freqs with various fiscal year ends. 
# eg, 2005 for A_FEB runs Mar 1, 2004 to Feb 28, 2005 diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 047f942178179..e38cfe21a65cc 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -106,3 +106,20 @@ _period_code_map.update({ "W": 4000, # Weekly "C": 5000, # Custom Business Day }) + + +class FreqGroup: + # Mirrors c_FreqGroup in the .pxd file + FR_ANN = 1000 + FR_QTR = 2000 + FR_MTH = 3000 + FR_WK = 4000 + FR_BUS = 5000 + FR_DAY = 6000 + FR_HR = 7000 + FR_MIN = 8000 + FR_SEC = 9000 + FR_MS = 10000 + FR_US = 11000 + FR_NS = 12000 + FR_UND = -10000 # undefined diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index 8ca442de59f9f..9ff34ef0b6f89 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -12,27 +12,12 @@ from pandas._libs.tslibs.offsets import ( opattern, ) -from .dtypes import _period_code_map, _reverse_period_code_map +from .dtypes import FreqGroup, _period_code_map, _reverse_period_code_map # --------------------------------------------------------------------- # Period codes -class FreqGroup: - FR_ANN = 1000 - FR_QTR = 2000 - FR_MTH = 3000 - FR_WK = 4000 - FR_BUS = 5000 - FR_DAY = 6000 - FR_HR = 7000 - FR_MIN = 8000 - FR_SEC = 9000 - FR_MS = 10000 - FR_US = 11000 - FR_NS = 12000 - - # Map attribute-name resolutions to resolution abbreviations _attrname_to_abbrevs = { "year": "A", diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c628f5361acad..bbdcb63d18175 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -56,7 +56,22 @@ from pandas._libs.tslibs.ccalendar cimport ( ) from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS -from pandas._libs.tslibs.dtypes cimport PeriodDtypeBase +from pandas._libs.tslibs.dtypes cimport ( + PeriodDtypeBase, + FR_UND, + FR_ANN, + FR_QTR, + FR_MTH, + FR_WK, + FR_BUS, + FR_DAY, + FR_HR, + FR_MIN, + FR_SEC, + 
FR_MS, + FR_US, + FR_NS, +) from pandas._libs.tslibs.frequencies cimport ( attrname_to_abbrevs, @@ -98,23 +113,6 @@ ctypedef int64_t (*freq_conv_func)(int64_t, asfreq_info*) nogil cdef extern from *: """ - /*** FREQUENCY CONSTANTS ***/ - // See frequencies.pyx for more detailed variants - - #define FR_ANN 1000 /* Annual */ - #define FR_QTR 2000 /* Quarterly - December year end (default Q) */ - #define FR_MTH 3000 /* Monthly */ - #define FR_WK 4000 /* Weekly */ - #define FR_BUS 5000 /* Business days */ - #define FR_DAY 6000 /* Daily */ - #define FR_HR 7000 /* Hourly */ - #define FR_MIN 8000 /* Minutely */ - #define FR_SEC 9000 /* Secondly */ - #define FR_MS 10000 /* Millisecondly */ - #define FR_US 11000 /* Microsecondly */ - #define FR_NS 12000 /* Nanosecondly */ - #define FR_UND -10000 /* Undefined */ - // must use npy typedef b/c int64_t is aliased in cython-generated c // unclear why we need LL for that row. // see https://github.com/pandas-dev/pandas/pull/34416/ @@ -128,20 +126,6 @@ cdef extern from *: {0, 0, 0, 0, 0, 0, 1}}; """ int64_t daytime_conversion_factor_matrix[7][7] - # TODO: Can we get these frequencies from frequencies.FreqGroup? 
- int FR_ANN - int FR_QTR - int FR_MTH - int FR_WK - int FR_DAY - int FR_HR - int FR_MIN - int FR_SEC - int FR_MS - int FR_US - int FR_NS - int FR_BUS - int FR_UND cdef int max_value(int left, int right) nogil: @@ -1157,30 +1141,29 @@ cdef str period_format(int64_t value, int freq, object fmt=None): if fmt is None: freq_group = get_freq_group(freq) - if freq_group == 1000: # FR_ANN + if freq_group == FR_ANN: fmt = b'%Y' - elif freq_group == 2000: # FR_QTR + elif freq_group == FR_QTR: fmt = b'%FQ%q' - elif freq_group == 3000: # FR_MTH + elif freq_group == FR_MTH: fmt = b'%Y-%m' - elif freq_group == 4000: # WK - left = period_asfreq(value, freq, 6000, 0) - right = period_asfreq(value, freq, 6000, 1) - return f"{period_format(left, 6000)}/{period_format(right, 6000)}" - elif (freq_group == 5000 # BUS - or freq_group == 6000): # DAY + elif freq_group == FR_WK: + left = period_asfreq(value, freq, FR_DAY, 0) + right = period_asfreq(value, freq, FR_DAY, 1) + return f"{period_format(left, FR_DAY)}/{period_format(right, FR_DAY)}" + elif freq_group == FR_BUS or freq_group == FR_DAY: fmt = b'%Y-%m-%d' - elif freq_group == 7000: # HR + elif freq_group == FR_HR: fmt = b'%Y-%m-%d %H:00' - elif freq_group == 8000: # MIN + elif freq_group == FR_MIN: fmt = b'%Y-%m-%d %H:%M' - elif freq_group == 9000: # SEC + elif freq_group == FR_SEC: fmt = b'%Y-%m-%d %H:%M:%S' - elif freq_group == 10000: # MILLISEC + elif freq_group == FR_MS: fmt = b'%Y-%m-%d %H:%M:%S.%l' - elif freq_group == 11000: # MICROSEC + elif freq_group == FR_US: fmt = b'%Y-%m-%d %H:%M:%S.%u' - elif freq_group == 12000: # NANOSEC + elif freq_group == FR_NS: fmt = b'%Y-%m-%d %H:%M:%S.%n' else: raise ValueError(f"Unknown freq: {freq}") From 62bd33a90c62e1c9314c9c3c251e707f2a66a164 Mon Sep 17 00:00:00 2001 From: gabrielvf1 Date: Mon, 8 Jun 2020 20:22:19 -0300 Subject: [PATCH 0052/1025] Fix .isin Considers "1" and 1 equal (#34267) --- pandas/tests/test_algos.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) 
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6008b4f9f4e33..ff5f890cc41f8 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -944,6 +944,33 @@ def test_different_nans_as_float64(self): expected = np.array([True, True]) tm.assert_numpy_array_equal(result, expected) + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_int_df_string_search(self): + """Comparing df with int`s (1,2) with a string at isin() ("1") + -> should not match values because int 1 is not equal str 1""" + df = pd.DataFrame({"values": [1, 2]}) + result = df.isin(["1"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_nan_df_string_search(self): + """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") + -> should not match values because np.nan is not equal str NaN """ + df = pd.DataFrame({"values": [np.nan, 2]}) + result = df.isin(["NaN"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) + + @pytest.mark.xfail(reason="problem related with issue #34125") + def test_isin_float_df_string_search(self): + """Comparing df with floats (1.4245,2.32441) with a string at isin() ("1.4245") + -> should not match values because float 1.4245 is not equal str 1.4245""" + df = pd.DataFrame({"values": [1.4245, 2.32441]}) + result = df.isin(["1.4245"]) + expected_false = pd.DataFrame({"values": [False, False]}) + tm.assert_frame_equal(result, expected_false) + class TestValueCounts: def test_value_counts(self): From de199426dcc73e797869255249c348828e917ba3 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Tue, 9 Jun 2020 01:36:04 +0200 Subject: [PATCH 0053/1025] BUG: Add errors argument to to_csv() call to enable error handling for encoders (#32702) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/generic.py | 8 
++++++++ pandas/io/common.py | 11 +++++++++-- pandas/io/formats/csvs.py | 4 ++++ pandas/tests/io/formats/test_to_csv.py | 10 ++++++++++ 5 files changed, 32 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cf5a6976524de..c6ba196a8a985 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -288,6 +288,7 @@ Other enhancements - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). +- :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 06be602c8090d..6183638ab587e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3049,6 +3049,7 @@ def to_csv( doublequote: bool_t = True, escapechar: Optional[str] = None, decimal: Optional[str] = ".", + errors: str = "strict", ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3143,6 +3144,12 @@ def to_csv( decimal : str, default '.' Character recognized as decimal separator. E.g. use ',' for European data. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + + .. 
versionadded:: 1.1.0 Returns ------- @@ -3180,6 +3187,7 @@ def to_csv( line_terminator=line_terminator, sep=sep, encoding=encoding, + errors=errors, compression=compression, quoting=quoting, na_rep=na_rep, diff --git a/pandas/io/common.py b/pandas/io/common.py index 8349acafca1e3..055f84970e916 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -352,6 +352,7 @@ def get_handle( compression: Optional[Union[str, Mapping[str, Any]]] = None, memory_map: bool = False, is_text: bool = True, + errors=None, ): """ Get file handle for given path/buffer and mode. @@ -390,6 +391,12 @@ def get_handle( is_text : boolean, default True whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.). + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. + + .. versionadded:: 1.1.0 Returns ------- @@ -475,7 +482,7 @@ def get_handle( elif is_path: if encoding: # Encoding - f = open(path_or_buf, mode, encoding=encoding, newline="") + f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="") elif is_text: # No explicit encoding f = open(path_or_buf, mode, errors="replace", newline="") @@ -488,7 +495,7 @@ def get_handle( if is_text and (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper - g = TextIOWrapper(f, encoding=encoding, newline="") + g = TextIOWrapper(f, encoding=encoding, errors=errors, newline="") if not isinstance(f, (BufferedIOBase, RawIOBase)): handles.append(g) f = g diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index dcd764bec7426..5bd51dc8351f6 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -44,6 +44,7 @@ def __init__( index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None, mode: str = "w", encoding: Optional[str] = None, + errors: str = "strict", compression: Union[str, Mapping[str, str], None] = "infer", 
quoting: Optional[int] = None, line_terminator="\n", @@ -77,6 +78,7 @@ def __init__( if encoding is None: encoding = "utf-8" self.encoding = encoding + self.errors = errors self.compression = infer_compression(self.path_or_buf, compression) if quoting is None: @@ -184,6 +186,7 @@ def save(self) -> None: self.path_or_buf, self.mode, encoding=self.encoding, + errors=self.errors, compression=dict(self.compression_args, method=self.compression), ) close = True @@ -215,6 +218,7 @@ def save(self) -> None: self.path_or_buf, self.mode, encoding=self.encoding, + errors=self.errors, compression=compression, ) f.write(buf) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index b3ee8da52dece..4c86e3a16b135 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -597,3 +597,13 @@ def test_na_rep_truncated(self): result = pd.Series([1.1, 2.2]).to_csv(na_rep=".") expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) assert result == expected + + @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"]) + def test_to_csv_errors(self, errors): + # GH 22610 + data = ["\ud800foo"] + ser = pd.Series(data, index=pd.Index(data)) + with tm.ensure_clean("test.csv") as path: + ser.to_csv(path, errors=errors) + # No use in reading back the data as it is not the same anymore + # due to the error handling From fb6b1da549bffc272b49c13ca7ebf514fd31c7c3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 8 Jun 2020 17:53:54 -0700 Subject: [PATCH 0054/1025] REF: de-duplicate month/year rolling in libperiod (#34648) * REF: implement helpers to de-duplicate year/month rolling * move func to better location --- pandas/_libs/tslibs/period.pyx | 73 +++++++++++++++------------------- 1 file changed, 32 insertions(+), 41 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index bbdcb63d18175..4e8da6504d1ea 100644 --- 
a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -329,56 +329,34 @@ cdef inline int64_t transform_via_day(int64_t ordinal, # -------------------------------------------------------------------- # Conversion _to_ Daily Freq -cdef void AtoD_ym(int64_t ordinal, int64_t *year, - int *month, asfreq_info *af_info) nogil: - year[0] = ordinal + 1970 - month[0] = 1 - - if af_info.from_end != 12: - month[0] += af_info.from_end - if month[0] > 12: - # This case is never reached, but is kept for symmetry - # with QtoD_ym - month[0] -= 12 - else: - year[0] -= 1 - - cdef int64_t asfreq_AtoDT(int64_t ordinal, asfreq_info *af_info) nogil: cdef: - int64_t unix_date, year - int month + int64_t unix_date + npy_datetimestruct dts ordinal += af_info.is_end - AtoD_ym(ordinal, &year, &month, af_info) - unix_date = unix_date_from_ymd(year, month, 1) + dts.year = ordinal + 1970 + dts.month = 1 + adjust_dts_for_month(&dts, af_info.from_end) + + unix_date = unix_date_from_ymd(dts.year, dts.month, 1) unix_date -= af_info.is_end return upsample_daytime(unix_date, af_info) -cdef void QtoD_ym(int64_t ordinal, int *year, - int *month, asfreq_info *af_info) nogil: - year[0] = ordinal // 4 + 1970 - month[0] = (ordinal % 4) * 3 + 1 - - if af_info.from_end != 12: - month[0] += af_info.from_end - if month[0] > 12: - month[0] -= 12 - else: - year[0] -= 1 - - cdef int64_t asfreq_QtoDT(int64_t ordinal, asfreq_info *af_info) nogil: cdef: int64_t unix_date - int year, month + npy_datetimestruct dts ordinal += af_info.is_end - QtoD_ym(ordinal, &year, &month, af_info) - unix_date = unix_date_from_ymd(year, month, 1) + dts.year = ordinal // 4 + 1970 + dts.month = (ordinal % 4) * 3 + 1 + adjust_dts_for_month(&dts, af_info.from_end) + + unix_date = unix_date_from_ymd(dts.year, dts.month, 1) unix_date -= af_info.is_end return upsample_daytime(unix_date, af_info) @@ -486,12 +464,7 @@ cdef int DtoQ_yq(int64_t ordinal, asfreq_info *af_info, npy_datetimestruct* dts) int quarter 
pandas_datetime_to_datetimestruct(ordinal, NPY_FR_D, dts) - if af_info.to_end != 12: - dts.month -= af_info.to_end - if dts.month <= 0: - dts.month += 12 - else: - dts.year += 1 + adjust_dts_for_qtr(dts, af_info.to_end) quarter = month_to_quarter(dts.month) return quarter @@ -712,6 +685,24 @@ cdef inline int get_freq_group_index(int freq) nogil: return freq // 1000 +cdef void adjust_dts_for_month(npy_datetimestruct* dts, int from_end) nogil: + if from_end != 12: + dts.month += from_end + if dts.month > 12: + dts.month -= 12 + else: + dts.year -= 1 + + +cdef void adjust_dts_for_qtr(npy_datetimestruct* dts, int to_end) nogil: + if to_end != 12: + dts.month -= to_end + if dts.month <= 0: + dts.month += 12 + else: + dts.year += 1 + + # Find the unix_date (days elapsed since datetime(1970, 1, 1) # for the given year/month/day. # Assumes GREGORIAN_CALENDAR */ From f46b7a63f38689682ae2b958ca58e828384a18b3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 8 Jun 2020 18:46:59 -0700 Subject: [PATCH 0055/1025] CLN: disallow passing tuples for Period freq (#34658) * CLN: disallow passing tuples for Period freq * whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/tslibs/period.pyx | 7 ++++--- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/period.py | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c6ba196a8a985..e5e0b2577d595 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -403,6 +403,7 @@ Backwards incompatible API changes - :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) - :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) +- :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 4e8da6504d1ea..d2c9ccc8ab4ae 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1507,9 +1507,10 @@ cdef class _Period: ------- DateOffset """ - if isinstance(freq, (int, tuple)): - code, stride = get_freq_code(freq) - freq = get_freq_str(code, stride) + if isinstance(freq, int): + # We already have a dtype code + dtype = PeriodDtypeBase(freq) + freq = dtype.date_offset freq = to_offset(freq) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e1f0221eaee65..d8654dee56319 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -501,7 +501,7 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime): raise KeyError grp = get_freq_group(reso) - per = Period(parsed, freq=(grp, 1)) + per = Period(parsed, freq=grp) start, end = per.start_time, per.end_time # GH 24076 diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 14922000c9707..0fafeef078d78 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -574,7 +574,7 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime): raise KeyError(reso) grp = get_freq_group(reso) - iv = Period(parsed, freq=(grp, 1)) + iv = Period(parsed, 
freq=grp) return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end")) def _validate_partial_date_slice(self, reso: str): From eecfdb80e80d580975829965e1b98557f549b040 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 9 Jun 2020 13:36:20 +0100 Subject: [PATCH 0056/1025] CLN: deduplicate in core.internals.blocks.interpolate (#34638) --- pandas/core/internals/blocks.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a4a8d672895ce..e2a778f729470 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1084,14 +1084,9 @@ def interpolate( inplace = validate_bool_kwarg(inplace, "inplace") - def check_int_bool(self, inplace): - # Only FloatBlocks will contain NaNs. - # timedelta subclasses IntBlock - if (self.is_bool or self.is_integer) and not self.is_timedelta: - if inplace: - return self - else: - return self.copy() + # Only FloatBlocks will contain NaNs. timedelta subclasses IntBlock + if (self.is_bool or self.is_integer) and not self.is_timedelta: + return self if inplace else self.copy() # a fill na type method try: @@ -1100,9 +1095,6 @@ def check_int_bool(self, inplace): m = None if m is not None: - r = check_int_bool(self, inplace) - if r is not None: - return r return self._interpolate_with_fill( method=m, axis=axis, @@ -1115,10 +1107,6 @@ def check_int_bool(self, inplace): # validate the interp method m = missing.clean_interp_method(method, **kwargs) - r = check_int_bool(self, inplace) - if r is not None: - return r - assert index is not None # for mypy return self._interpolate( From bbf3c0d0e291ae57f8c793f94c20d8be9ffa06f0 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Tue, 9 Jun 2020 17:37:44 +0200 Subject: [PATCH 0057/1025] BUG: Allow plain bools in ExtensionArray.equals (#34661) --- pandas/core/arrays/base.py | 2 +- pandas/tests/extension/arrow/arrays.py | 28 +++++++++++++++++++++-- pandas/tests/extension/arrow/test_bool.py | 5 ++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 79f0039a9df65..7f2c61ff7d955 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -738,7 +738,7 @@ def equals(self, other: "ExtensionArray") -> bool: # boolean array with NA -> fill with False equal_values = equal_values.fillna(False) equal_na = self.isna() & other.isna() - return (equal_values | equal_na).all().item() + return bool((equal_values | equal_na).all()) def _values_for_factorize(self) -> Tuple[np.ndarray, Any]: """ diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index ffebc9f8b3359..29cfe1e0fe606 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -8,6 +8,7 @@ """ import copy import itertools +import operator from typing import Type import numpy as np @@ -106,6 +107,27 @@ def astype(self, dtype, copy=True): def dtype(self): return self._dtype + def _boolean_op(self, other, op): + if not isinstance(other, type(self)): + raise NotImplementedError() + + result = op(np.array(self._data), np.array(other._data)) + return ArrowBoolArray( + pa.chunked_array([pa.array(result, mask=pd.isna(self._data.to_pandas()))]) + ) + + def __eq__(self, other): + if not isinstance(other, type(self)): + return False + + return self._boolean_op(other, operator.eq) + + def __and__(self, other): + return self._boolean_op(other, operator.and_) + + def __or__(self, other): + return self._boolean_op(other, operator.or_) + @property def nbytes(self): return sum( @@ -153,10 +175,12 @@ def _reduce(self, method, skipna=True, **kwargs): return op(**kwargs) def any(self, 
axis=0, out=None): - return self._data.to_pandas().any() + # Explicitly return a plain bool to reproduce GH-34660 + return bool(self._data.to_pandas().any()) def all(self, axis=0, out=None): - return self._data.to_pandas().all() + # Explicitly return a plain bool to reproduce GH-34660 + return bool(self._data.to_pandas().all()) class ArrowBoolArray(ArrowExtensionArray): diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 48f1c34764313..7841360e568ed 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -29,6 +29,11 @@ def data_missing(): return ArrowBoolArray.from_scalars([None, True]) +def test_basic_equals(data): + # https://github.com/pandas-dev/pandas/issues/34660 + assert pd.Series(data).equals(pd.Series(data)) + + class BaseArrowTests: pass From 1180fceb5fb498b44fdbbd066dd61c08df6b8efc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Jun 2020 10:14:51 -0700 Subject: [PATCH 0058/1025] REF: avoid get_freq_code (#34659) --- pandas/_libs/tslibs/period.pyx | 24 +++++++++++++++++------- pandas/core/arrays/period.py | 14 +++++++++----- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index d2c9ccc8ab4ae..c06f34e37ec49 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -94,6 +94,7 @@ from pandas._libs.tslibs.offsets cimport ( is_tick_object, is_offset_object, ) +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG from pandas._libs.tslibs.tzconversion cimport tz_convert_utc_to_tzlocal @@ -1649,7 +1650,7 @@ cdef class _Period: freq = self._maybe_convert_freq(freq) how = validate_end_alias(how) base1 = self._dtype.dtype_code - base2, _ = get_freq_code(freq) + base2 = freq_to_dtype_code(freq) # self.n can't be negative or 0 end = how == 'E' @@ -1739,10 +1740,11 @@ cdef class _Period: if freq is None: base = self._dtype.dtype_code freq = 
get_to_timestamp_base(base) + base = freq else: freq = self._maybe_convert_freq(freq) + base = freq._period_dtype_code - base, _ = get_freq_code(freq) val = self.asfreq(freq, how) dt64 = period_ordinal_to_dt64(val.ordinal, base) @@ -2386,8 +2388,7 @@ class Period(_Period): elif is_period_object(value): other = value - if freq is None or get_freq_code( - freq) == get_freq_code(other.freq): + if freq is None or freq._period_dtype_code == other.freq._period_dtype_code: ordinal = other.ordinal freq = other.freq else: @@ -2414,6 +2415,7 @@ class Period(_Period): except KeyError: raise ValueError(f"Invalid frequency or could not " f"infer: {reso}") + freq = to_offset(freq) elif PyDateTime_Check(value): dt = value @@ -2432,7 +2434,7 @@ class Period(_Period): raise ValueError(msg) if ordinal is None: - base, _ = get_freq_code(freq) + base = freq_to_dtype_code(freq) ordinal = period_ordinal(dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.microsecond, 0, base) @@ -2444,9 +2446,17 @@ cdef bint is_period_object(object obj): return isinstance(obj, _Period) +cpdef int freq_to_dtype_code(BaseOffset freq) except? 
-1: + try: + return freq._period_dtype_code + except AttributeError as err: + raise ValueError(INVALID_FREQ_ERR_MSG) from err + + cdef int64_t _ordinal_from_fields(int year, int month, quarter, int day, - int hour, int minute, int second, freq): - base, mult = get_freq_code(freq) + int hour, int minute, int second, + BaseOffset freq): + base = freq_to_dtype_code(freq) if quarter is not None: year, month = quarter_to_myear(year, quarter, freq) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index b16a3df003512..0d866aa7eae26 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -442,10 +442,11 @@ def to_timestamp(self, freq=None, how="start"): if freq is None: base = self.freq._period_dtype_code freq = libfrequencies.get_to_timestamp_base(base) + base = freq else: freq = Period._maybe_convert_freq(freq) + base = freq._period_dtype_code - base, _ = libfrequencies.get_freq_code(freq) new_data = self.asfreq(freq, how=how) new_data = libperiod.periodarr_to_dt64arr(new_data.asi8, base) @@ -962,7 +963,8 @@ def _get_ordinal_range(start, end, periods, freq, mult=1): ) if freq is not None: - _, mult = libfrequencies.get_freq_code(freq) + freq = to_offset(freq) + mult = freq.n if start is not None: start = Period(start, freq) @@ -1024,10 +1026,11 @@ def _range_from_fields( if quarter is not None: if freq is None: - freq = "Q" + freq = to_offset("Q") base = libfrequencies.FreqGroup.FR_QTR else: - base, mult = libfrequencies.get_freq_code(freq) + freq = to_offset(freq) + base = libperiod.freq_to_dtype_code(freq) if base != libfrequencies.FreqGroup.FR_QTR: raise AssertionError("base must equal FR_QTR") @@ -1037,7 +1040,8 @@ def _range_from_fields( val = libperiod.period_ordinal(y, m, 1, 1, 1, 1, 0, 0, base) ordinals.append(val) else: - base, mult = libfrequencies.get_freq_code(freq) + freq = to_offset(freq) + base = libperiod.freq_to_dtype_code(freq) arrays = _make_field_arrays(year, month, day, hour, minute, second) for y, mth, 
d, h, mn, s in zip(*arrays): ordinals.append(libperiod.period_ordinal(y, mth, d, h, mn, s, 0, 0, base)) From c7192748e3f8e47cf54030f32f4f119181cabd99 Mon Sep 17 00:00:00 2001 From: willpeppo Date: Tue, 9 Jun 2020 13:35:41 -0400 Subject: [PATCH 0059/1025] DOC: updated plotting/_misc.py for PR08 errors (#34652) --- pandas/plotting/_misc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 3056977ec78ad..22a2d7617fded 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -160,7 +160,7 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): Parameters ---------- frame : `DataFrame` - pandas object holding the data. + Object holding the data. class_column : str Column name containing the name of the data point category. ax : :class:`matplotlib.axes.Axes`, optional @@ -294,7 +294,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): Parameters ---------- series : pandas.Series - pandas Series from where to get the samplings for the bootstrapping. + Series from where to get the samplings for the bootstrapping. fig : matplotlib.figure.Figure, default None If given, it will use the `fig` reference for plotting instead of creating a new one with default parameters. 
From eaf3d751b6c69cdad28eace502482ee1b757a5b5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Jun 2020 10:59:26 -0700 Subject: [PATCH 0060/1025] PERF: normalize_i8_timestamps (#34672) --- pandas/_libs/tslibs/conversion.pyx | 35 +++++++++++++----------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index b0bad119d6a46..152e9a5ad7ddc 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -785,7 +785,6 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t int64_t[:] deltas str typ Py_ssize_t[:] pos - npy_datetimestruct dts int64_t delta, local_val if tz is None or is_utc(tz): @@ -795,16 +794,14 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result[i] = NPY_NAT continue local_val = stamps[i] - dt64_to_dtstruct(local_val, &dts) - result[i] = _normalized_stamp(&dts) + result[i] = _normalize_i8_stamp(local_val) elif is_tzlocal(tz): for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - result[i] = _normalized_stamp(&dts) + result[i] = _normalize_i8_stamp(local_val) else: # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) @@ -816,38 +813,36 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - dt64_to_dtstruct(stamps[i] + delta, &dts) - result[i] = _normalized_stamp(&dts) + local_val = stamps[i] + delta + result[i] = _normalize_i8_stamp(local_val) else: pos = trans.searchsorted(stamps, side='right') - 1 for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - result[i] = _normalized_stamp(&dts) + local_val = stamps[i] + deltas[pos[i]] + result[i] = 
_normalize_i8_stamp(local_val) return result.base # `.base` to access underlying ndarray -cdef inline int64_t _normalized_stamp(npy_datetimestruct *dts) nogil: +@cython.cdivision +cdef inline int64_t _normalize_i8_stamp(int64_t local_val) nogil: """ - Normalize the given datetimestruct to midnight, then convert to int64_t. + Round the localized nanosecond timestamp down to the previous midnight. Parameters ---------- - *dts : pointer to npy_datetimestruct + local_val : int64_t Returns ------- - stamp : int64 - """ - dts.hour = 0 - dts.min = 0 - dts.sec = 0 - dts.us = 0 - dts.ps = 0 - return dtstruct_to_dt64(dts) + int64_t + """ + cdef: + int64_t day_nanos = 24 * 3600 * 1_000_000_000 + return local_val - (local_val % day_nanos) @cython.wraparound(False) From 442c6c87baf11a955795b26d6545a77474027a22 Mon Sep 17 00:00:00 2001 From: John Paton Date: Wed, 10 Jun 2020 00:17:19 +0200 Subject: [PATCH 0061/1025] Add max_results kwarg to read_gbq (#34639) (#34641) Since max_results is a new kwarg (added in pandas-gbq 0.12.0), it is handled and tested in the same way as use_bqstorage_api, using the "new kwargs" mechanism to maintain backwards compatibility with older pandas-gbq versions. --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/gbq.py | 12 +++++++++++- pandas/tests/io/test_gbq.py | 4 +++- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index e5e0b2577d595..be355cb7a461c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -292,6 +292,7 @@ Other enhancements - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). +- :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 9b46f970afc66..3d0792357297f 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -30,6 +30,7 @@ def read_gbq( configuration: Optional[Dict[str, Any]] = None, credentials=None, use_bqstorage_api: Optional[bool] = None, + max_results: Optional[int] = None, private_key=None, verbose=None, progress_bar_type: Optional[str] = None, @@ -125,6 +126,13 @@ def read_gbq( ``fastavro`` packages. .. versionadded:: 0.25.0 + max_results : int, optional + If set, limit the maximum number of rows to fetch from the query + results. + + *New in version 0.12.0 of pandas-gbq*. + + .. versionadded:: 1.1.0 progress_bar_type : Optional, str If set, use the `tqdm `__ library to display a progress bar while the data downloads. Install the @@ -162,11 +170,13 @@ def read_gbq( """ pandas_gbq = _try_import() - kwargs: Dict[str, Union[str, bool, None]] = {} + kwargs: Dict[str, Union[str, bool, int, None]] = {} # START: new kwargs. Don't populate unless explicitly set. 
if use_bqstorage_api is not None: kwargs["use_bqstorage_api"] = use_bqstorage_api + if max_results is not None: + kwargs["max_results"] = max_results kwargs["progress_bar_type"] = progress_bar_type # END: new kwargs diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index e9cefe3056130..df107259d38cd 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -113,9 +113,10 @@ def mock_read_gbq(sql, **kwargs): return DataFrame([[1.0]]) monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) - pd.read_gbq("SELECT 1", use_bqstorage_api=True) + pd.read_gbq("SELECT 1", use_bqstorage_api=True, max_results=1) assert captured_kwargs["use_bqstorage_api"] + assert captured_kwargs["max_results"] def test_read_gbq_without_new_kwargs(monkeypatch): @@ -129,6 +130,7 @@ def mock_read_gbq(sql, **kwargs): pd.read_gbq("SELECT 1") assert "use_bqstorage_api" not in captured_kwargs + assert "max_results" not in captured_kwargs @pytest.mark.parametrize("progress_bar", [None, "foo"]) From 37a647eed1e7aa3c483f9b0c92512be4d355abb6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Jun 2020 15:21:37 -0700 Subject: [PATCH 0062/1025] CLN: dont consolidate in groupby (#34680) --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9838cff9b34f9..e385a78142ba5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -497,7 +497,6 @@ def __init__( self._selection = selection assert isinstance(obj, NDFrame), type(obj) - obj._consolidate_inplace() self.level = level From de0bd9a3e07ae6f56056b77dc8394726b81da4ff Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 9 Jun 2020 23:31:39 +0100 Subject: [PATCH 0063/1025] BUG: Pandas changes dtypes of columns when no float (or other) assignments are done to this column #34573 (#34599) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/indexing.py | 5 ++++- 
pandas/tests/frame/indexing/test_setitem.py | 24 +++++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index be355cb7a461c..92f7c0f6b59a3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -897,6 +897,7 @@ Indexing - Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` where index was assumed to be monotone increasing (:issue:`33756`) - Indexing with a list of strings representing datetimes failed on :class:`DatetimeIndex` or :class:`PeriodIndex`(:issue:`11278`) - Bug in :meth:`Series.at` when used with a :class:`MultiIndex` would raise an exception on valid inputs (:issue:`26989`) +- Bug in :meth:`DataFrame.loc` with dictionary of values changes columns with dtype of ``int`` to ``float`` (:issue:`34573`) - Bug in :meth:`Series.loc` when used with a :class:`MultiIndex` would raise an IndexingError when accessing a None value (:issue:`34318`) Missing diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ab515cb5e606b..326bd00270eca 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1836,7 +1836,10 @@ def _setitem_with_indexer_missing(self, indexer, value): # append a Series value = value.reindex(index=self.obj.columns, copy=True) value.name = indexer - + elif isinstance(value, dict): + value = Series( + value, index=self.obj.columns, name=indexer, dtype=object + ) else: # a list-list if is_list_like_indexer(value): diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index d53665539309c..8fcdae95fbab5 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -126,3 +126,27 @@ def test_setitem_with_unaligned_sparse_value(self): df["new_column"] = sp_series expected = Series(SparseArray([1, 0, 0]), name="new_column") tm.assert_series_equal(df["new_column"], expected) + + def 
test_setitem_dict_preserves_dtypes(self): + # https://github.com/pandas-dev/pandas/issues/34573 + expected = DataFrame( + { + "a": Series([0, 1, 2], dtype="int64"), + "b": Series([1, 2, 3], dtype=float), + "c": Series([1, 2, 3], dtype=float), + } + ) + df = DataFrame( + { + "a": Series([], dtype="int64"), + "b": Series([], dtype=float), + "c": Series([], dtype=float), + } + ) + for idx, b in enumerate([1, 2, 3]): + df.loc[df.shape[0]] = { + "a": int(idx), + "b": float(b), + "c": float(b), + } + tm.assert_frame_equal(df, expected) From 7f5b834ced13d7d329dc8e6317bcb02388bd0a52 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Jun 2020 15:32:54 -0700 Subject: [PATCH 0064/1025] ENH: Resolutions for month/qtr/year (#34587) --- pandas/_libs/tslibs/resolution.pyx | 52 ++++++++++++++++++- pandas/core/arrays/datetimelike.py | 5 -- pandas/core/indexes/datetimelike.py | 12 +++-- pandas/core/indexes/datetimes.py | 17 +++--- pandas/core/indexes/period.py | 20 +++---- pandas/tests/indexes/period/test_ops.py | 35 ++++++------- .../tseries/frequencies/test_freq_code.py | 10 ++-- 7 files changed, 98 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index c0baabdc98acd..7453933ddbb4f 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -8,9 +8,10 @@ from pandas._libs.tslibs.util cimport get_nat from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dt64_to_dtstruct) from pandas._libs.tslibs.frequencies cimport attrname_to_abbrevs +from pandas._libs.tslibs.frequencies import FreqGroup from pandas._libs.tslibs.timezones cimport ( is_utc, is_tzlocal, maybe_get_tz, get_dst_info) -from pandas._libs.tslibs.ccalendar cimport get_days_in_month +from pandas._libs.tslibs.ccalendar cimport get_days_in_month, c_MONTH_NUMBERS from pandas._libs.tslibs.tzconversion cimport tz_convert_utc_to_tzlocal # ---------------------------------------------------------------------- @@ 
-26,6 +27,9 @@ cdef: int RESO_MIN = 4 int RESO_HR = 5 int RESO_DAY = 6 + int RESO_MTH = 7 + int RESO_QTR = 8 + int RESO_YR = 9 _abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()} @@ -37,6 +41,9 @@ _reso_str_map = { RESO_MIN: "minute", RESO_HR: "hour", RESO_DAY: "day", + RESO_MTH: "month", + RESO_QTR: "quarter", + RESO_YR: "year", } _str_reso_map = {v: k for k, v in _reso_str_map.items()} @@ -126,6 +133,9 @@ class Resolution(Enum): RESO_MIN = 4 RESO_HR = 5 RESO_DAY = 6 + RESO_MTH = 7 + RESO_QTR = 8 + RESO_YR = 9 def __lt__(self, other): return self.value < other.value @@ -133,6 +143,32 @@ class Resolution(Enum): def __ge__(self, other): return self.value >= other.value + @property + def freq_group(self): + # TODO: annotate as returning FreqGroup once that is an enum + if self == Resolution.RESO_NS: + return FreqGroup.FR_NS + elif self == Resolution.RESO_US: + return FreqGroup.FR_US + elif self == Resolution.RESO_MS: + return FreqGroup.FR_MS + elif self == Resolution.RESO_SEC: + return FreqGroup.FR_SEC + elif self == Resolution.RESO_MIN: + return FreqGroup.FR_MIN + elif self == Resolution.RESO_HR: + return FreqGroup.FR_HR + elif self == Resolution.RESO_DAY: + return FreqGroup.FR_DAY + elif self == Resolution.RESO_MTH: + return FreqGroup.FR_MTH + elif self == Resolution.RESO_QTR: + return FreqGroup.FR_QTR + elif self == Resolution.RESO_YR: + return FreqGroup.FR_ANN + else: + raise ValueError(self) + @property def attrname(self) -> str: """ @@ -175,7 +211,19 @@ class Resolution(Enum): >>> Resolution.get_reso_from_freq('H') == Resolution.RESO_HR True """ - attr_name = _abbrev_to_attrnames[freq] + try: + attr_name = _abbrev_to_attrnames[freq] + except KeyError: + # For quarterly and yearly resolutions, we need to chop off + # a month string. + split_freq = freq.split("-") + if len(split_freq) != 2: + raise + if split_freq[1] not in c_MONTH_NUMBERS: + # i.e. we want e.g. 
"Q-DEC", not "Q-INVALID" + raise + attr_name = _abbrev_to_attrnames[split_freq[0]] + return cls.from_attrname(attr_name) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e2ecb6c343b7a..8af23815b54ef 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1122,11 +1122,6 @@ def resolution(self) -> str: """ Returns day, hour, minute, second, millisecond or microsecond """ - if self._resolution_obj is None: - if is_period_dtype(self.dtype): - # somewhere in the past it was decided we default to day - return "day" - # otherwise we fall through and will raise return self._resolution_obj.attrname # type: ignore @classmethod diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 21f4b3f8bb76a..ca6eb45e22c69 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -363,19 +363,23 @@ def _format_attrs(self): # -------------------------------------------------------------------- # Indexing Methods - def _validate_partial_date_slice(self, reso: str): + def _validate_partial_date_slice(self, reso: Resolution): raise NotImplementedError - def _parsed_string_to_bounds(self, reso: str, parsed: datetime): + def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): raise NotImplementedError def _partial_date_slice( - self, reso: str, parsed: datetime, use_lhs: bool = True, use_rhs: bool = True + self, + reso: Resolution, + parsed: datetime, + use_lhs: bool = True, + use_rhs: bool = True, ): """ Parameters ---------- - reso : str + reso : Resolution parsed : datetime use_lhs : bool, default True use_rhs : bool, default True diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index d8654dee56319..2919ef0f878a4 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -7,7 +7,6 @@ from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib 
from pandas._libs.tslibs import Resolution, fields, parsing, timezones, to_offset -from pandas._libs.tslibs.frequencies import get_freq_group from pandas._libs.tslibs.offsets import prefix_mapping from pandas._typing import DtypeObj, Label from pandas.util._decorators import cache_readonly @@ -470,7 +469,7 @@ def snap(self, freq="S"): dta = DatetimeArray(snapped, dtype=self.dtype) return DatetimeIndex._simple_new(dta, name=self.name) - def _parsed_string_to_bounds(self, reso: str, parsed: datetime): + def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): """ Calculate datetime bounds for parsed time string and its resolution. @@ -485,6 +484,7 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime): ------- lower, upper: pd.Timestamp """ + assert isinstance(reso, Resolution), (type(reso), reso) valid_resos = { "year", "month", @@ -497,10 +497,10 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime): "second", "microsecond", } - if reso not in valid_resos: + if reso.attrname not in valid_resos: raise KeyError - grp = get_freq_group(reso) + grp = reso.freq_group per = Period(parsed, freq=grp) start, end = per.start_time, per.end_time @@ -521,11 +521,12 @@ def _parsed_string_to_bounds(self, reso: str, parsed: datetime): end = end.tz_localize(self.tz) return start, end - def _validate_partial_date_slice(self, reso: str): + def _validate_partial_date_slice(self, reso: Resolution): + assert isinstance(reso, Resolution), (type(reso), reso) if ( self.is_monotonic - and reso in ["day", "hour", "minute", "second"] - and self._resolution_obj >= Resolution.from_attrname(reso) + and reso.attrname in ["day", "hour", "minute", "second"] + and self._resolution_obj >= reso ): # These resolution/monotonicity validations came from GH3931, # GH3452 and GH2369. 
@@ -625,6 +626,7 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): if isinstance(label, str): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) parsed, reso = parsing.parse_time_string(label, freq) + reso = Resolution.from_attrname(reso) lower, upper = self._parsed_string_to_bounds(reso, parsed) # lower, upper form the half-open interval: # [parsed, parsed + 1 freq) @@ -641,6 +643,7 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) parsed, reso = parsing.parse_time_string(key, freq) + reso = Resolution.from_attrname(reso) loc = self._partial_date_slice(reso, parsed, use_lhs=use_lhs, use_rhs=use_rhs) return loc diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 0fafeef078d78..43dfd94b49215 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -5,7 +5,7 @@ from pandas._libs import index as libindex from pandas._libs.lib import no_default -from pandas._libs.tslibs import Period +from pandas._libs.tslibs import Period, Resolution from pandas._libs.tslibs.frequencies import get_freq_group from pandas._libs.tslibs.parsing import DateParseError, parse_time_string from pandas._typing import DtypeObj, Label @@ -501,7 +501,8 @@ def get_loc(self, key, method=None, tolerance=None): # A string with invalid format raise KeyError(f"Cannot interpret '{key}' as period") from err - grp = get_freq_group(reso) + reso = Resolution.from_attrname(reso) + grp = reso.freq_group freqn = get_freq_group(self.freq) # _get_string_slice will handle cases where grp < freqn @@ -558,6 +559,7 @@ def _maybe_cast_slice_bound(self, label, side: str, kind: str): elif isinstance(label, str): try: parsed, reso = parse_time_string(label, self.freq) + reso = Resolution.from_attrname(reso) bounds = self._parsed_string_to_bounds(reso, parsed) return bounds[0 
if side == "left" else 1] except ValueError as err: @@ -569,16 +571,14 @@ def _maybe_cast_slice_bound(self, label, side: str, kind: str): return label - def _parsed_string_to_bounds(self, reso: str, parsed: datetime): - if reso not in ["year", "month", "quarter", "day", "hour", "minute", "second"]: - raise KeyError(reso) - - grp = get_freq_group(reso) + def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): + grp = reso.freq_group iv = Period(parsed, freq=grp) return (iv.asfreq(self.freq, how="start"), iv.asfreq(self.freq, how="end")) - def _validate_partial_date_slice(self, reso: str): - grp = get_freq_group(reso) + def _validate_partial_date_slice(self, reso: Resolution): + assert isinstance(reso, Resolution), (type(reso), reso) + grp = reso.freq_group freqn = get_freq_group(self.freq) if not grp < freqn: @@ -590,7 +590,7 @@ def _validate_partial_date_slice(self, reso: str): def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): # TODO: Check for non-True use_lhs/use_rhs parsed, reso = parse_time_string(key, self.freq) - + reso = Resolution.from_attrname(reso) try: return self._partial_date_slice(reso, parsed, use_lhs, use_rhs) except KeyError as err: diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index fc44226f9d72f..e7dd76584d780 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -7,24 +7,23 @@ class TestPeriodIndexOps: - def test_resolution(self): - for freq, expected in zip( - ["A", "Q", "M", "D", "H", "T", "S", "L", "U"], - [ - "day", - "day", - "day", - "day", - "hour", - "minute", - "second", - "millisecond", - "microsecond", - ], - ): - - idx = pd.period_range(start="2013-04-01", periods=30, freq=freq) - assert idx.resolution == expected + @pytest.mark.parametrize( + "freq,expected", + [ + ("A", "year"), + ("Q", "quarter"), + ("M", "month"), + ("D", "day"), + ("H", "hour"), + ("T", "minute"), + ("S", "second"), + 
("L", "millisecond"), + ("U", "microsecond"), + ], + ) + def test_resolution(self, freq, expected): + idx = pd.period_range(start="2013-04-01", periods=30, freq=freq) + assert idx.resolution == expected def test_value_counts_unique(self): # GH 7735 diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index 4df221913b805..f0ff449d902d0 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -90,6 +90,9 @@ def test_get_to_timestamp_base(freqstr, exp_freqstr): @pytest.mark.parametrize( "freqstr,expected", [ + ("A", "year"), + ("Q", "quarter"), + ("M", "month"), ("D", "day"), ("H", "hour"), ("T", "minute"), @@ -103,13 +106,6 @@ def test_get_attrname_from_abbrev(freqstr, expected): assert Resolution.get_reso_from_freq(freqstr).attrname == expected -@pytest.mark.parametrize("freq", ["A", "Q", "M"]) -def test_get_freq_unsupported_(freq): - # Lowest-frequency resolution is for Day - with pytest.raises(KeyError, match=freq.lower()): - Resolution.get_reso_from_freq(freq) - - @pytest.mark.parametrize("freq", ["D", "H", "T", "S", "L", "U", "N"]) def test_get_freq_roundtrip2(freq): obj = Resolution.get_reso_from_freq(freq) From c2534a08c81c5f930dc475b6406a44c7f9357389 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 9 Jun 2020 18:05:05 -0700 Subject: [PATCH 0065/1025] CLN: remove get_freq_code (#34674) * REF: operate on Resolution objs instead of ints * restore * ENH: Resolutions for month/qtr/year * revert comments * remove commented-out * blackify * REF: avoid get_freq_code * CLN: remove get_freq_code * mypy fixup --- pandas/_libs/tslibs/__init__.py | 2 +- pandas/_libs/tslibs/frequencies.pxd | 2 - pandas/_libs/tslibs/frequencies.pyx | 132 +----------------- pandas/_libs/tslibs/period.pyx | 2 - pandas/core/arrays/datetimes.py | 4 +- pandas/core/indexes/period.py | 4 +- pandas/plotting/_matplotlib/timeseries.py | 3 +- 
.../indexes/datetimes/test_scalar_compat.py | 3 +- .../tests/indexes/datetimes/test_to_period.py | 2 +- .../indexes/timedeltas/test_scalar_compat.py | 4 +- pandas/tests/scalar/period/test_asfreq.py | 3 +- pandas/tests/scalar/period/test_period.py | 3 +- .../tests/scalar/timestamp/test_unary_ops.py | 2 +- .../tseries/frequencies/test_freq_code.py | 123 +--------------- .../tseries/frequencies/test_inference.py | 2 +- pandas/tests/tseries/offsets/test_fiscal.py | 2 +- pandas/tests/tseries/offsets/test_offsets.py | 13 +- pandas/tests/tslibs/test_libfrequencies.py | 49 ------- pandas/tests/tslibs/test_period_asfreq.py | 18 ++- 19 files changed, 42 insertions(+), 331 deletions(-) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 6dbb4ce7bc974..6f173a4542bb0 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -18,7 +18,7 @@ "to_offset", ] -from . import dtypes # type: ignore +from . import dtypes from .conversion import localize_pydatetime from .nattype import NaT, NaTType, iNaT, is_null_datetimelike, nat_strings from .np_datetime import OutOfBoundsDatetime diff --git a/pandas/_libs/tslibs/frequencies.pxd b/pandas/_libs/tslibs/frequencies.pxd index 098944c965df0..896eec77ef4fe 100644 --- a/pandas/_libs/tslibs/frequencies.pxd +++ b/pandas/_libs/tslibs/frequencies.pxd @@ -1,5 +1,3 @@ cdef dict attrname_to_abbrevs -cpdef get_freq_code(freqstr) cpdef int get_to_timestamp_base(int base) -cpdef str get_freq_str(base, mult=*) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index 9ff34ef0b6f89..6e525500ec37a 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -1,18 +1,5 @@ -cimport numpy as cnp -cnp.import_array() -from pandas._libs.tslibs.util cimport is_integer_object - -from pandas._libs.tslibs.offsets cimport is_offset_object -from pandas._libs.tslibs.offsets import ( - INVALID_FREQ_ERR_MSG, - _dont_uppercase, - 
_lite_rule_alias, - base_and_stride, - opattern, -) - -from .dtypes import FreqGroup, _period_code_map, _reverse_period_code_map +from .dtypes import FreqGroup # --------------------------------------------------------------------- # Period codes @@ -36,131 +23,22 @@ cdef dict attrname_to_abbrevs = _attrname_to_abbrevs # ---------------------------------------------------------------------- -def get_freq_group(freq) -> int: +# TODO: this is now identical to the version in libperiod +def get_freq_group(freq: int) -> int: """ Return frequency code group of given frequency str or offset. Examples -------- - >>> get_freq_group('W-MON') + >>> get_freq_group(4001) 4000 - >>> get_freq_group('W-FRI') + >>> get_freq_group(4006) 4000 """ - if is_offset_object(freq): - freq = freq.rule_code - - if isinstance(freq, str): - freq = attrname_to_abbrevs.get(freq, freq) - base, mult = get_freq_code(freq) - freq = base - elif isinstance(freq, int): - pass - else: - raise ValueError('input must be str, offset or int') return (freq // 1000) * 1000 -cpdef get_freq_code(freqstr): - """ - Return freq str or tuple to freq code and stride (mult) - - Parameters - ---------- - freqstr : str or tuple - - Returns - ------- - return : tuple of base frequency code and stride (mult) - - Raises - ------ - TypeError : if passed a tuple witth incorrect types - - Examples - -------- - >>> get_freq_code('3D') - (6000, 3) - - >>> get_freq_code('D') - (6000, 1) - - >>> get_freq_code(('D', 3)) - (6000, 3) - """ - if is_offset_object(freqstr): - freqstr = (freqstr.rule_code, freqstr.n) - - if isinstance(freqstr, tuple): - if is_integer_object(freqstr[0]) and is_integer_object(freqstr[1]): - # e.g., freqstr = (2000, 1) - return freqstr - elif is_integer_object(freqstr[0]): - # Note: passing freqstr[1] below will raise TypeError if that - # is not a str - code = _period_str_to_code(freqstr[1]) - stride = freqstr[0] - return code, stride - else: - # e.g., freqstr = ('T', 5) - code = 
_period_str_to_code(freqstr[0]) - stride = freqstr[1] - return code, stride - - if is_integer_object(freqstr): - return freqstr, 1 - - base, stride = base_and_stride(freqstr) - code = _period_str_to_code(base) - - return code, stride - - -cpdef _period_str_to_code(str freqstr): - freqstr = _lite_rule_alias.get(freqstr, freqstr) - - if freqstr not in _dont_uppercase: - lower = freqstr.lower() - freqstr = _lite_rule_alias.get(lower, freqstr) - - if freqstr not in _dont_uppercase: - freqstr = freqstr.upper() - try: - return _period_code_map[freqstr] - except KeyError: - raise ValueError(INVALID_FREQ_ERR_MSG.format(freqstr)) - - -cpdef str get_freq_str(base, mult=1): - """ - Return the summary string associated with this offset code, possibly - adjusted by a multiplier. - - Parameters - ---------- - base : int (member of FreqGroup) - - Returns - ------- - freq_str : str - - Examples - -------- - >>> get_freq_str(1000) - 'A-DEC' - - >>> get_freq_str(2000, 2) - '2Q-DEC' - - >>> get_freq_str("foo") - """ - code = _reverse_period_code_map.get(base) - if mult == 1: - return code - return str(mult) + code - - cpdef int get_to_timestamp_base(int base): """ Return frequency code group used for base of to_timestamp against diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c06f34e37ec49..47ebf139ed496 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -75,8 +75,6 @@ from pandas._libs.tslibs.dtypes cimport ( from pandas._libs.tslibs.frequencies cimport ( attrname_to_abbrevs, - get_freq_code, - get_freq_str, get_to_timestamp_base, ) from pandas._libs.tslibs.parsing cimport get_rule_month diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8eb1bdadf9156..90513e355e732 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -11,8 +11,8 @@ Timestamp, conversion, fields, - frequencies as libfrequencies, iNaT, + offsets as liboffsets, resolution as 
libresolution, timezones, to_offset, @@ -1106,7 +1106,7 @@ def to_period(self, freq=None): # https://github.com/pandas-dev/pandas/issues/33358 if res is None: - base, stride = libfrequencies.base_and_stride(freq) + base, stride = liboffsets.base_and_stride(freq) res = f"{stride}{base}" freq = res diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 43dfd94b49215..49cb78340d104 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -503,7 +503,7 @@ def get_loc(self, key, method=None, tolerance=None): reso = Resolution.from_attrname(reso) grp = reso.freq_group - freqn = get_freq_group(self.freq) + freqn = get_freq_group(self.dtype.dtype_code) # _get_string_slice will handle cases where grp < freqn assert grp >= freqn @@ -579,7 +579,7 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): def _validate_partial_date_slice(self, reso: Resolution): assert isinstance(reso, Resolution), (type(reso), reso) grp = reso.freq_group - freqn = get_freq_group(self.freq) + freqn = get_freq_group(self.dtype.dtype_code) if not grp < freqn: # TODO: we used to also check for diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index a9cca32271b9f..99fc730e818c4 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -6,7 +6,8 @@ import numpy as np from pandas._libs.tslibs import Period, to_offset -from pandas._libs.tslibs.frequencies import FreqGroup, base_and_stride +from pandas._libs.tslibs.frequencies import FreqGroup +from pandas._libs.tslibs.offsets import base_and_stride from pandas._typing import FrameOrSeriesUnion from pandas.core.dtypes.generic import ( diff --git a/pandas/tests/indexes/datetimes/test_scalar_compat.py b/pandas/tests/indexes/datetimes/test_scalar_compat.py index 23dedf6f86a09..0d39e034905d2 100644 --- a/pandas/tests/indexes/datetimes/test_scalar_compat.py +++ 
b/pandas/tests/indexes/datetimes/test_scalar_compat.py @@ -7,6 +7,7 @@ import pytest from pandas._libs.tslibs import OutOfBoundsDatetime, to_offset +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG import pandas as pd from pandas import DatetimeIndex, Timestamp, date_range @@ -118,7 +119,7 @@ def test_round(self, tz_naive_fixture): tm.assert_index_equal(rng.round(freq="H"), expected_rng) assert elt.round(freq="H") == expected_elt - msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG + msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): rng.round(freq="foo") with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/datetimes/test_to_period.py b/pandas/tests/indexes/datetimes/test_to_period.py index d82fc1ef6743b..51cc6af2eed08 100644 --- a/pandas/tests/indexes/datetimes/test_to_period.py +++ b/pandas/tests/indexes/datetimes/test_to_period.py @@ -6,7 +6,7 @@ import pytz from pandas._libs.tslibs.ccalendar import MONTHS -from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG from pandas import ( DatetimeIndex, diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 1b86cd1df5a7a..16c19b8d00380 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG + import pandas as pd from pandas import Index, Series, Timedelta, TimedeltaIndex, timedelta_range import pandas._testing as tm @@ -58,7 +60,7 @@ def test_tdi_round(self): tm.assert_index_equal(td.round(freq="H"), expected_rng) assert elt.round(freq="H") == expected_elt - msg = pd._libs.tslibs.frequencies.INVALID_FREQ_ERR_MSG + msg = INVALID_FREQ_ERR_MSG with pytest.raises(ValueError, match=msg): td.round(freq="foo") with pytest.raises(ValueError, 
match=msg): diff --git a/pandas/tests/scalar/period/test_asfreq.py b/pandas/tests/scalar/period/test_asfreq.py index b9f637c178d53..56281521deb90 100644 --- a/pandas/tests/scalar/period/test_asfreq.py +++ b/pandas/tests/scalar/period/test_asfreq.py @@ -1,6 +1,7 @@ import pytest -from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG, _period_code_map +from pandas._libs.tslibs.dtypes import _period_code_map +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG from pandas.errors import OutOfBoundsDatetime from pandas import Period, Timestamp, offsets diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 3e769b577582a..702899f163e06 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -6,9 +6,8 @@ from pandas._libs.tslibs import iNaT, period as libperiod from pandas._libs.tslibs.ccalendar import DAYS, MONTHS -from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG from pandas._libs.tslibs.parsing import DateParseError -from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG, IncompatibleFrequency from pandas._libs.tslibs.timezones import dateutil_gettz, maybe_get_tz from pandas.compat.numpy import np_datetime64_compat diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 388ff4ea039be..8641bbd0a66f2 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -6,7 +6,7 @@ from pytz import utc from pandas._libs.tslibs import NaT, Timestamp, conversion, to_offset -from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG import pandas.util._test_decorators as td import pandas._testing as tm diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py 
b/pandas/tests/tseries/frequencies/test_freq_code.py index f0ff449d902d0..189a0cc2171ad 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -1,80 +1,7 @@ import pytest -from pandas._libs.tslibs import Resolution, offsets, to_offset -from pandas._libs.tslibs.frequencies import ( - FreqGroup, - _attrname_to_abbrevs, - _period_code_map, - get_freq_code, - get_freq_group, - get_to_timestamp_base, -) - - -@pytest.fixture(params=list(_period_code_map.items())) -def period_code_item(request): - return request.param - - -@pytest.mark.parametrize( - "freqstr,expected", - [ - ("A", 1000), - ("3A", 1000), - ("-1A", 1000), - ("Y", 1000), - ("3Y", 1000), - ("-1Y", 1000), - ("W", 4000), - ("W-MON", 4001), - ("W-FRI", 4005), - ], -) -def test_freq_code(freqstr, expected): - assert get_freq_code(freqstr)[0] == expected - - -def test_freq_code_match(period_code_item): - freqstr, code = period_code_item - assert get_freq_code(freqstr)[0] == code - - -@pytest.mark.parametrize( - "freqstr,expected", - [ - ("A", 1000), - ("3A", 1000), - ("-1A", 1000), - ("A-JAN", 1000), - ("A-MAY", 1000), - ("Y", 1000), - ("3Y", 1000), - ("-1Y", 1000), - ("Y-JAN", 1000), - ("Y-MAY", 1000), - (offsets.YearEnd(), 1000), - (offsets.YearEnd(month=1), 1000), - (offsets.YearEnd(month=5), 1000), - ("W", 4000), - ("W-MON", 4000), - ("W-FRI", 4000), - (offsets.Week(), 4000), - (offsets.Week(weekday=1), 4000), - (offsets.Week(weekday=5), 4000), - ("T", FreqGroup.FR_MIN), - ], -) -def test_freq_group(freqstr, expected): - assert get_freq_group(freqstr) == expected - - -def test_freq_group_match(period_code_item): - freqstr, code = period_code_item - - str_group = get_freq_group(freqstr) - code_group = get_freq_group(code) - - assert str_group == code_group == code // 1000 * 1000 +from pandas._libs.tslibs import Resolution, to_offset +from pandas._libs.tslibs.frequencies import _attrname_to_abbrevs, get_to_timestamp_base 
@pytest.mark.parametrize( @@ -82,9 +9,9 @@ def test_freq_group_match(period_code_item): [("D", "D"), ("W", "D"), ("M", "D"), ("S", "S"), ("T", "S"), ("H", "S")], ) def test_get_to_timestamp_base(freqstr, exp_freqstr): - tsb = get_to_timestamp_base - - assert tsb(get_freq_code(freqstr)[0]) == get_freq_code(exp_freqstr)[0] + left_code = to_offset(freqstr)._period_dtype_code + exp_code = to_offset(exp_freqstr)._period_dtype_code + assert get_to_timestamp_base(left_code) == exp_code @pytest.mark.parametrize( @@ -144,43 +71,3 @@ def test_cat(args): with pytest.raises(ValueError, match=msg): to_offset(str(args[0]) + args[1]) - - -@pytest.mark.parametrize( - "freq_input,expected", - [ - # Frequency string. - ("A", (get_freq_code("A")[0], 1)), - ("3D", (get_freq_code("D")[0], 3)), - ("-2M", (get_freq_code("M")[0], -2)), - # Tuple. - (("D", 1), (get_freq_code("D")[0], 1)), - (("A", 3), (get_freq_code("A")[0], 3)), - (("M", -2), (get_freq_code("M")[0], -2)), - ((5, "T"), (FreqGroup.FR_MIN, 5)), - # Numeric Tuple. - ((1000, 1), (1000, 1)), - # Offsets. - (offsets.Day(), (get_freq_code("D")[0], 1)), - (offsets.Day(3), (get_freq_code("D")[0], 3)), - (offsets.Day(-2), (get_freq_code("D")[0], -2)), - (offsets.MonthEnd(), (get_freq_code("M")[0], 1)), - (offsets.MonthEnd(3), (get_freq_code("M")[0], 3)), - (offsets.MonthEnd(-2), (get_freq_code("M")[0], -2)), - (offsets.Week(), (get_freq_code("W")[0], 1)), - (offsets.Week(3), (get_freq_code("W")[0], 3)), - (offsets.Week(-2), (get_freq_code("W")[0], -2)), - (offsets.Hour(), (FreqGroup.FR_HR, 1)), - # Monday is weekday=0. 
- (offsets.Week(weekday=1), (get_freq_code("W-TUE")[0], 1)), - (offsets.Week(3, weekday=0), (get_freq_code("W-MON")[0], 3)), - (offsets.Week(-2, weekday=4), (get_freq_code("W-FRI")[0], -2)), - ], -) -def test_get_freq_code(freq_input, expected): - assert get_freq_code(freq_input) == expected - - -def test_get_code_invalid(): - with pytest.raises(ValueError, match="Invalid frequency"): - get_freq_code((5, "baz")) diff --git a/pandas/tests/tseries/frequencies/test_inference.py b/pandas/tests/tseries/frequencies/test_inference.py index c32ad5087ab9e..95edd038dab9b 100644 --- a/pandas/tests/tseries/frequencies/test_inference.py +++ b/pandas/tests/tseries/frequencies/test_inference.py @@ -4,7 +4,7 @@ import pytest from pandas._libs.tslibs.ccalendar import DAYS, MONTHS -from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG from pandas.compat import is_platform_windows from pandas import DatetimeIndex, Index, Series, Timestamp, date_range, period_range diff --git a/pandas/tests/tseries/offsets/test_fiscal.py b/pandas/tests/tseries/offsets/test_fiscal.py index f0ce104a68e29..7713be67a7e05 100644 --- a/pandas/tests/tseries/offsets/test_fiscal.py +++ b/pandas/tests/tseries/offsets/test_fiscal.py @@ -6,7 +6,7 @@ from dateutil.relativedelta import relativedelta import pytest -from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG from pandas import Timestamp import pandas._testing as tm diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index e3a89d9ed57a6..784c04f225630 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -11,13 +11,9 @@ conversion, timezones, ) -from pandas._libs.tslibs.frequencies import ( - INVALID_FREQ_ERR_MSG, - get_freq_code, - get_freq_str, -) import pandas._libs.tslibs.offsets as liboffsets from 
pandas._libs.tslibs.offsets import ApplyTypeError, _get_offset, _offset_map +from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG import pandas.compat as compat from pandas.compat.numpy import np_datetime64_compat from pandas.errors import PerformanceWarning @@ -4112,13 +4108,6 @@ def test_rule_code(self): assert alias == _get_offset(alias).rule_code assert alias == (_get_offset(alias) * 5).rule_code - lst = ["M", "D", "B", "H", "T", "S", "L", "U"] - for k in lst: - code, stride = get_freq_code("3" + k) - assert isinstance(code, int) - assert stride == 3 - assert k == get_freq_str(code) - def test_dateoffset_misc(): oset = offsets.DateOffset(months=2, days=4) diff --git a/pandas/tests/tslibs/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py index feaaaf6adca6f..993f2f4c8ef10 100644 --- a/pandas/tests/tslibs/test_libfrequencies.py +++ b/pandas/tests/tslibs/test_libfrequencies.py @@ -1,6 +1,5 @@ import pytest -from pandas._libs.tslibs.frequencies import INVALID_FREQ_ERR_MSG, _period_str_to_code from pandas._libs.tslibs.parsing import get_rule_month from pandas.tseries import offsets @@ -28,51 +27,3 @@ def test_get_rule_month(obj, expected): result = get_rule_month(obj) assert result == expected - - -@pytest.mark.parametrize( - "obj,expected", - [ - ("A", 1000), - ("A-DEC", 1000), - ("A-JAN", 1001), - ("Y", 1000), - ("Y-DEC", 1000), - ("Y-JAN", 1001), - ("Q", 2000), - ("Q-DEC", 2000), - ("Q-FEB", 2002), - ("W", 4000), - ("W-SUN", 4000), - ("W-FRI", 4005), - ("Min", 8000), - ("ms", 10000), - ("US", 11000), - ("NS", 12000), - ], -) -def test_period_str_to_code(obj, expected): - assert _period_str_to_code(obj) == expected - - -@pytest.mark.parametrize( - "freq,expected,aliases", - [ - ("D", 6000, ["DAY", "DLY", "DAILY"]), - ("M", 3000, ["MTH", "MONTH", "MONTHLY"]), - ("N", 12000, ["NANOSECOND", "NANOSECONDLY"]), - ("H", 7000, ["HR", "HOUR", "HRLY", "HOURLY"]), - ("T", 8000, ["minute", "MINUTE", "MINUTELY"]), - ("L", 10000, ["MILLISECOND", 
"MILLISECONDLY"]), - ("U", 11000, ["MICROSECOND", "MICROSECONDLY"]), - ("S", 9000, ["sec", "SEC", "SECOND", "SECONDLY"]), - ("B", 5000, ["BUS", "BUSINESS", "BUSINESSLY", "WEEKDAY"]), - ], -) -def test_assert_aliases_deprecated(freq, expected, aliases): - assert isinstance(aliases, list) - assert _period_str_to_code(freq) == expected - - for alias in aliases: - with pytest.raises(ValueError, match=INVALID_FREQ_ERR_MSG): - _period_str_to_code(alias) diff --git a/pandas/tests/tslibs/test_period_asfreq.py b/pandas/tests/tslibs/test_period_asfreq.py index 7205c3cc676cf..63298b657e341 100644 --- a/pandas/tests/tslibs/test_period_asfreq.py +++ b/pandas/tests/tslibs/test_period_asfreq.py @@ -1,9 +1,15 @@ import pytest -from pandas._libs.tslibs.frequencies import get_freq_code +from pandas._libs.tslibs import to_offset from pandas._libs.tslibs.period import period_asfreq, period_ordinal +def get_freq_code(freqstr: str) -> int: + off = to_offset(freqstr) + code = off._period_dtype_code + return code + + @pytest.mark.parametrize( "freq1,freq2,expected", [ @@ -32,8 +38,7 @@ ) def test_intra_day_conversion_factors(freq1, freq2, expected): assert ( - period_asfreq(1, get_freq_code(freq1)[0], get_freq_code(freq2)[0], False) - == expected + period_asfreq(1, get_freq_code(freq1), get_freq_code(freq2), False) == expected ) @@ -42,7 +47,7 @@ def test_intra_day_conversion_factors(freq1, freq2, expected): ) def test_period_ordinal_start_values(freq, expected): # information for Jan. 1, 1970. 
- assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq_code(freq)[0]) == expected + assert period_ordinal(1970, 1, 1, 0, 0, 0, 0, 0, get_freq_code(freq)) == expected @pytest.mark.parametrize( @@ -55,7 +60,7 @@ def test_period_ordinal_start_values(freq, expected): ], ) def test_period_ordinal_week(dt, expected): - args = dt + (get_freq_code("W")[0],) + args = dt + (get_freq_code("W"),) assert period_ordinal(*args) == expected @@ -77,5 +82,6 @@ def test_period_ordinal_week(dt, expected): ], ) def test_period_ordinal_business_day(day, expected): - args = (2013, 10, day, 0, 0, 0, 0, 0, get_freq_code("B")[0]) + # 5000 is PeriodDtypeCode for BusinessDay + args = (2013, 10, day, 0, 0, 0, 0, 0, 5000) assert period_ordinal(*args) == expected From ee6af4a7e5a756507136b1af868c93388ec18873 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 10 Jun 2020 10:23:16 +0100 Subject: [PATCH 0066/1025] TYP: some type annotations in core\tools\datetimes.py (#34630) --- pandas/_typing.py | 3 +- pandas/core/indexes/datetimes.py | 25 +++++- pandas/core/series.py | 2 +- pandas/core/tools/datetimes.py | 127 +++++++++++++++++++++++-------- pandas/io/excel/_odfreader.py | 6 +- 5 files changed, 123 insertions(+), 40 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 71df27119bd96..4892abc5f6f51 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo from pathlib import Path from typing import ( IO, @@ -52,6 +52,7 @@ TimedeltaConvertibleTypes = Union[ "Timedelta", timedelta, np.timedelta64, int, np.int64, float, str ] +Timezone = Union[str, tzinfo] # other diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 2919ef0f878a4..6bcfb3bccf5c7 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -9,7 +9,7 @@ from pandas._libs.tslibs import Resolution, fields, parsing, timezones, to_offset from 
pandas._libs.tslibs.offsets import prefix_mapping from pandas._typing import DtypeObj, Label -from pandas.util._decorators import cache_readonly +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -63,9 +63,13 @@ def _new_DatetimeIndex(cls, d): @inherit_names( - ["to_period", "to_perioddelta", "to_julian_date", "strftime", "isocalendar"] + ["to_perioddelta", "to_julian_date", "strftime", "isocalendar"] + DatetimeArray._field_ops - + DatetimeArray._datetimelike_methods, + + [ + method + for method in DatetimeArray._datetimelike_methods + if method not in ("tz_localize",) + ], DatetimeArray, wrap=True, ) @@ -217,6 +221,21 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _data: DatetimeArray tz: Optional[tzinfo] + # -------------------------------------------------------------------- + # methods that dispatch to array and wrap result in DatetimeIndex + + @doc(DatetimeArray.tz_localize) + def tz_localize( + self, tz, ambiguous="raise", nonexistent="raise" + ) -> "DatetimeIndex": + arr = self._data.tz_localize(tz, ambiguous, nonexistent) + return type(self)._simple_new(arr, name=self.name) + + @doc(DatetimeArray.to_period) + def to_period(self, freq=None) -> "DatetimeIndex": + arr = self._data.to_period(freq) + return type(self)._simple_new(arr, name=self.name) + # -------------------------------------------------------------------- # Constructors diff --git a/pandas/core/series.py b/pandas/core/series.py index 71ffdcbd40fe7..b51c08fa592d5 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4846,7 +4846,7 @@ def to_period(self, freq=None, copy=True) -> "Series": if not isinstance(self.index, DatetimeIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - new_index = self.index.to_period(freq=freq) # type: ignore + new_index = self.index.to_period(freq=freq) return self._constructor(new_values, index=new_index).__finalize__( self, method="to_period" ) diff --git 
a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 42bffa0374472..0adab143f6052 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -2,7 +2,16 @@ from datetime import datetime from functools import partial from itertools import islice -from typing import TYPE_CHECKING, Optional, TypeVar, Union +from typing import ( + TYPE_CHECKING, + Callable, + List, + Optional, + Tuple, + TypeVar, + Union, + overload, +) import warnings import numpy as np @@ -15,7 +24,7 @@ _guess_datetime_format, ) from pandas._libs.tslibs.strptime import array_strptime -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, Label, Timezone from pandas.core.dtypes.common import ( ensure_object, @@ -45,16 +54,15 @@ if TYPE_CHECKING: from pandas import Series # noqa:F401 + from pandas._libs.tslibs.nattype import NaTType # noqa:F401 # --------------------------------------------------------------------- # types used in annotations -ArrayConvertible = Union[list, tuple, ArrayLike, "Series"] +ArrayConvertible = Union[List, Tuple, ArrayLike, "Series"] Scalar = Union[int, float, str] DatetimeScalar = TypeVar("DatetimeScalar", Scalar, datetime) -DatetimeScalarOrArrayConvertible = Union[ - DatetimeScalar, list, tuple, ArrayLike, "Series" -] +DatetimeScalarOrArrayConvertible = Union[DatetimeScalar, ArrayConvertible] # --------------------------------------------------------------------- @@ -123,7 +131,12 @@ def should_cache( return do_caching -def _maybe_cache(arg, format, cache, convert_listlike): +def _maybe_cache( + arg: ArrayConvertible, + format: Optional[str], + cache: bool, + convert_listlike: Callable, +) -> "Series": """ Create a cache of unique dates from an array of dates @@ -159,7 +172,7 @@ def _maybe_cache(arg, format, cache, convert_listlike): def _box_as_indexlike( - dt_array: ArrayLike, utc: Optional[bool] = None, name: Optional[str] = None + dt_array: ArrayLike, utc: Optional[bool] = None, name: Label = None ) -> 
Index: """ Properly boxes the ndarray of datetimes to DatetimeIndex @@ -244,15 +257,15 @@ def _return_parsed_timezone_results(result, timezones, tz, name): def _convert_listlike_datetimes( arg, - format, - name=None, - tz=None, - unit=None, - errors=None, - infer_datetime_format=None, - dayfirst=None, - yearfirst=None, - exact=None, + format: Optional[str], + name: Label = None, + tz: Optional[Timezone] = None, + unit: Optional[str] = None, + errors: Optional[str] = None, + infer_datetime_format: Optional[bool] = None, + dayfirst: Optional[bool] = None, + yearfirst: Optional[bool] = None, + exact: Optional[bool] = None, ): """ Helper function for to_datetime. Performs the conversions of 1D listlike @@ -306,9 +319,7 @@ def _convert_listlike_datetimes( pass elif tz: # DatetimeArray, DatetimeIndex - # error: Item "DatetimeIndex" of "Union[DatetimeArray, DatetimeIndex]" has - # no attribute "tz_localize" - return arg.tz_localize(tz) # type: ignore + return arg.tz_localize(tz) return arg @@ -539,19 +550,70 @@ def _adjust_to_origin(arg, origin, unit): return arg +@overload def to_datetime( - arg, - errors="raise", - dayfirst=False, - yearfirst=False, - utc=None, - format=None, - exact=True, - unit=None, - infer_datetime_format=False, + arg: DatetimeScalar, + errors: str = ..., + dayfirst: bool = ..., + yearfirst: bool = ..., + utc: Optional[bool] = ..., + format: Optional[str] = ..., + exact: bool = ..., + unit: Optional[str] = ..., + infer_datetime_format: bool = ..., + origin=..., + cache: bool = ..., +) -> Union[DatetimeScalar, "NaTType"]: + ... + + +@overload +def to_datetime( + arg: "Series", + errors: str = ..., + dayfirst: bool = ..., + yearfirst: bool = ..., + utc: Optional[bool] = ..., + format: Optional[str] = ..., + exact: bool = ..., + unit: Optional[str] = ..., + infer_datetime_format: bool = ..., + origin=..., + cache: bool = ..., +) -> "Series": + ... 
+ + +@overload +def to_datetime( + arg: Union[List, Tuple], + errors: str = ..., + dayfirst: bool = ..., + yearfirst: bool = ..., + utc: Optional[bool] = ..., + format: Optional[str] = ..., + exact: bool = ..., + unit: Optional[str] = ..., + infer_datetime_format: bool = ..., + origin=..., + cache: bool = ..., +) -> DatetimeIndex: + ... + + +def to_datetime( + arg: DatetimeScalarOrArrayConvertible, + errors: str = "raise", + dayfirst: bool = False, + yearfirst: bool = False, + utc: Optional[bool] = None, + format: Optional[str] = None, + exact: bool = True, + unit: Optional[str] = None, + infer_datetime_format: bool = False, origin="unix", - cache=True, -): + cache: bool = True, +) -> Union[DatetimeIndex, "Series", DatetimeScalar, "NaTType"]: """ Convert argument to datetime. @@ -746,8 +808,7 @@ def to_datetime( if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array, name=arg.name) else: - convert_listlike = partial(convert_listlike, name=arg.name) - result = convert_listlike(arg, format) + result = convert_listlike(arg, format, name=arg.name) elif is_list_like(arg): try: cache_array = _maybe_cache(arg, format, cache, convert_listlike) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 739c77d1c0b99..be86b57ca2066 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, cast from pandas._typing import FilePathOrBuffer, Scalar from pandas.compat._optional import import_optional_dependency @@ -179,7 +179,9 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: cell_value = cell.attributes.get((OFFICENS, "date-value")) return pd.to_datetime(cell_value) elif cell_type == "time": - return pd.to_datetime(str(cell)).time() + result = pd.to_datetime(str(cell)) + result = cast(pd.Timestamp, result) + return result.time() else: raise ValueError(f"Unrecognized type {cell_type}") From fb7b8a86d87aa7685cc5daac408938b91e471cc4 Mon 
Sep 17 00:00:00 2001 From: Giovanni Lanzani Date: Wed, 10 Jun 2020 16:54:14 +0200 Subject: [PATCH 0067/1025] DOC: Add note about shallow clones in contributing guide (#34690) Co-authored-by: Uwe L. Korn --- doc/source/development/contributing.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 163d345b4f829..b85e9403038ab 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -136,6 +136,10 @@ want to clone your fork to your machine:: This creates the directory `pandas-yourname` and connects your repository to the upstream (main project) *pandas* repository. +Note that performing a shallow clone (with ``--depth==N``, for some ``N`` greater +or equal to 1) might break some tests and features as ``pd.show_versions()`` +as the version number cannot be computed anymore. + .. _contributing.dev_env: Creating a development environment From acd69cbd919bf86f65f7d96b596fc58e879633ab Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 11 Jun 2020 10:26:57 -0500 Subject: [PATCH 0068/1025] BLD: Pin cython for 37-locale build (#34711) --- ci/deps/azure-37-locale.yaml | 3 ++- environment.yml | 2 +- requirements-dev.txt | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 31155ac93931a..25ee821afe7bd 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -5,7 +5,8 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 + # Cython pin for https://github.com/pandas-dev/pandas/issues/34704 + - cython==0.29.19 - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio diff --git a/environment.yml b/environment.yml index b81404094fa4c..bfe0e78c891cf 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - asv # building - - cython>=0.29.16 + - cython=0.29.19 # code checks - black=19.10b0 diff --git 
a/requirements-dev.txt b/requirements-dev.txt index 754ec7ae28748..791dc7cd79128 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,7 +5,7 @@ numpy>=1.15 python-dateutil>=2.7.3 pytz asv -cython>=0.29.16 +cython==0.29.19 black==19.10b0 cpplint flake8<3.8.0 From 9db0dd0566bc343605721a9457f37b2fcadc8c05 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 11 Jun 2020 10:29:01 -0500 Subject: [PATCH 0069/1025] BLD: pyproject.toml for Py38 (#34667) --- doc/source/whatsnew/v1.0.5.rst | 3 ++- pyproject.toml | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.5.rst b/doc/source/whatsnew/v1.0.5.rst index 1edc7e1cad72f..5dbc911407784 100644 --- a/doc/source/whatsnew/v1.0.5.rst +++ b/doc/source/whatsnew/v1.0.5.rst @@ -22,7 +22,8 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- + +- Fixed building from source with Python 3.8 fetching the wrong version of NumPy (:issue:`34666`) - Contributors diff --git a/pyproject.toml b/pyproject.toml index efeb24edbdeb1..aaebcff8e4c1e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,9 +6,11 @@ requires = [ "wheel", "Cython>=0.29.16", # Note: sync with setup.py "numpy==1.15.4; python_version=='3.6' and platform_system!='AIX'", - "numpy==1.15.4; python_version>='3.7' and platform_system!='AIX'", + "numpy==1.15.4; python_version=='3.7' and platform_system!='AIX'", + "numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'", "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'", - "numpy==1.16.0; python_version>='3.7' and platform_system=='AIX'", + "numpy==1.16.0; python_version=='3.7' and platform_system=='AIX'", + "numpy==1.17.3; python_version>='3.8' and platform_system=='AIX'", ] [tool.black] From 0512172519df45bfc33a9fe55e4e314b3181cfeb Mon Sep 17 00:00:00 2001 From: alimcmaster1 Date: Fri, 12 Jun 2020 19:17:06 +0100 Subject: [PATCH 0070/1025] REG: Fix read_parquet from file-like objects (#34500) Co-authored-by: Joris Van den Bossche --- 
pandas/io/parquet.py | 19 ++++++++++++++----- pandas/tests/io/data/parquet/simple.parquet | Bin 0 -> 2157 bytes pandas/tests/io/test_parquet.py | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/io/data/parquet/simple.parquet diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index cde7a98eb42ae..de9a14c82b3cb 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -122,11 +122,20 @@ def write( file_obj_or_path.close() def read(self, path, columns=None, **kwargs): - parquet_ds = self.api.parquet.ParquetDataset( - path, filesystem=get_fs_for_path(path), **kwargs - ) - kwargs["columns"] = columns - result = parquet_ds.read_pandas(**kwargs).to_pandas() + fs = get_fs_for_path(path) + should_close = None + # Avoid calling get_filepath_or_buffer for s3/gcs URLs since + # since it returns an S3File which doesn't support dir reads in arrow + if not fs: + path, _, _, should_close = get_filepath_or_buffer(path) + + kwargs["use_pandas_metadata"] = True + result = self.api.parquet.read_table( + path, columns=columns, filesystem=fs, **kwargs + ).to_pandas() + if should_close: + path.close() + return result diff --git a/pandas/tests/io/data/parquet/simple.parquet b/pandas/tests/io/data/parquet/simple.parquet new file mode 100644 index 0000000000000000000000000000000000000000..2862a91f508ea225ea9829d4843f330473977134 GIT binary patch literal 2157 zcmcJRUvJ_@5Wo#VBzKDX5S@{cc!;b!bW#b$B;?BJ`miPeOo0SQuyIvI{ug4fjn9AL zkSH&G?kA}Ff%+M$s!x6Fhv@7YXh`Y9>2xc>&d$uv{ARslI7fYsPSGX0*rwBTYLB9r zRDybu_>iRHv9*|Kihe~i1?&W$bdK3sT9}>0!Y4z$-Jk{I%hxzuI+5IZP38YemH#!Bqj#5VJYl=f{FKQ5ww3>7E4Q|BL;7U)Lxp0}zIJ2S z3Y`L;H%TawqIdDkzoFVW$d^fHbJ@ZdJciG!BJ(Fa5uk(l6-8wWOxjf(UGbTpQxB}^ z=*HcZHnT$8@!9wLKRmEg(1FByiZeFY!anOLwgCz!v@BTpf#LTy}YO&e9JyQfmx?u5#GKA5v!gtm`-qfM9yH1V5NQnfRE>~033^=SpL5$ zi0WC2Pb-sTAxG>PN-*>iZ{oFyK}|!W)V((Yk0mnbFj8l}G12iTdTNib@^ocC z?cx2fhC!kbyMSO5j}Xix2+T{T5lDPA5-Zb@)B!ny{ro$@wnDivJd(=YTeUe1RHtK} 
znY?PuTb=^G!ellUoYswM>y}_fW5KT6sNyB|33c(#Z`Y8i9rkZDrw2L|E!MYMO*j@# zI=)bA-Rq5EgyVE1PRI69x$5g87fR(gw=Au4h9i-)u&?S2eYJ-3$*wHkO{MaNyp1}r z*YBmneno3wCS>Yj#fA7&c_852z{o|$uv#)Cp2QXGs;St+iC!c2K$iM95Ti620B_Sb zWwA0*%Y9e1H4<+)>^IsMWApkMyEX^^dI0CQ_={u3|ghvk5N>EZr>fm??s z`$4+I{Q~{h#Qg(f>4g1iEv(^5sJP$o8vIX4pJ|8$Av$PZJh1C2as7^B;LYHD42|GG lbKU*L>tUZOR*Kxgpo`82&9jSa0e Date: Fri, 12 Jun 2020 13:54:55 -0700 Subject: [PATCH 0071/1025] Debug CI Issue (#34721) --- pandas/compat/numpy/__init__.py | 2 ++ pandas/tests/extension/base/dtype.py | 22 +++++++++++++--------- pandas/tests/plotting/test_misc.py | 6 +++++- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index a8f49d91f040e..789a4668b6fee 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -11,6 +11,8 @@ _np_version_under1p16 = _nlv < LooseVersion("1.16") _np_version_under1p17 = _nlv < LooseVersion("1.17") _np_version_under1p18 = _nlv < LooseVersion("1.18") +_np_version_under1p19 = _nlv < LooseVersion("1.19") +_np_version_under1p20 = _nlv < LooseVersion("1.20") _is_numpy_dev = ".dev" in str(_nlv) diff --git a/pandas/tests/extension/base/dtype.py b/pandas/tests/extension/base/dtype.py index 65e32d716a4db..154fcdc38826d 100644 --- a/pandas/tests/extension/base/dtype.py +++ b/pandas/tests/extension/base/dtype.py @@ -68,18 +68,22 @@ def test_check_dtype(self, data): {"A": pd.Series(data, dtype=dtype), "B": data, "C": "foo", "D": 1} ) - # np.dtype('int64') == 'Int64' == 'int64' - # so can't distinguish - if dtype.name == "Int64": - expected = pd.Series([True, True, False, True], index=list("ABCD")) - else: - expected = pd.Series([True, True, False, False], index=list("ABCD")) - - # FIXME: This should probably be *fixed* not ignored. 
- # See libops.scalar_compare + # TODO(numpy-1.20): This warnings filter and if block can be removed + # once we require numpy>=1.20 with warnings.catch_warnings(): warnings.simplefilter("ignore", DeprecationWarning) result = df.dtypes == str(dtype) + # NumPy>=1.20.0, but not pandas.compat.numpy till there + # is a wheel available with this change. + try: + new_numpy_behavior = np.dtype("int64") != "Int64" + except TypeError: + new_numpy_behavior = True + + if dtype.name == "Int64" and not new_numpy_behavior: + expected = pd.Series([True, True, False, True], index=list("ABCD")) + else: + expected = pd.Series([True, True, False, False], index=list("ABCD")) self.assert_series_equal(result, expected) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 27039948dfc16..0b0d23632e827 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -96,13 +96,17 @@ def test_bootstrap_plot(self): class TestDataFramePlots(TestPlotBase): @td.skip_if_no_scipy def test_scatter_matrix_axis(self): + from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 + scatter_matrix = plotting.scatter_matrix with tm.RNGContext(42): df = DataFrame(randn(100, 3)) # we are plotting multiples on a sub-plot - with tm.assert_produces_warning(UserWarning): + with tm.assert_produces_warning( + UserWarning, raise_on_extra_warnings=_mpl_ge_3_0_0() + ): axes = _check_plot_works( scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1 ) From ef71a0ef618c233f9d1ce874107100fe423a8960 Mon Sep 17 00:00:00 2001 From: ObliviousParadigm <47667852+ObliviousParadigm@users.noreply.github.com> Date: Sat, 13 Jun 2020 02:27:23 +0530 Subject: [PATCH 0072/1025] Changed the way a few sentences were written (#34729) --- README.md | 6 +++--- setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7edee8d3feeed..a72e8402e68a0 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ ## What 
is it? -**pandas** is a Python package providing fast, flexible, and expressive data +**pandas** is a Python package that provides fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, **real world** data analysis in Python. Additionally, it has @@ -154,11 +154,11 @@ For usage questions, the best place to go to is [StackOverflow](https://stackove Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata). ## Discussion and Development -Most development discussion is taking place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions. +Most development discussions take place on github in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Gitter channel](https://gitter.im/pydata/pandas) is available for quick development related questions. ## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas) -All contributions, bug reports, bug fixes, documentation improvements, enhancements and ideas are welcome. +All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. A detailed overview on how to contribute can be found in the **[contributing guide](https://pandas.pydata.org/docs/dev/development/contributing.html)**. There is also an [overview](.github/CONTRIBUTING.md) on GitHub. 
diff --git a/setup.py b/setup.py index 9f411ec10cd80..3caea5c5e79da 100755 --- a/setup.py +++ b/setup.py @@ -117,7 +117,7 @@ def build_extensions(self): DESCRIPTION = "Powerful data structures for data analysis, time series, and statistics" LONG_DESCRIPTION = """ -**pandas** is a Python package providing fast, flexible, and expressive data +**pandas** is a Python package that provides fast, flexible, and expressive data structures designed to make working with structured (tabular, multidimensional, potentially heterogeneous) and time series data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, From 6774fad0af3c760b0eb0de229c63c880809f347e Mon Sep 17 00:00:00 2001 From: willpeppo Date: Fri, 12 Jun 2020 16:58:27 -0400 Subject: [PATCH 0073/1025] DOC: updated pandas/core/series.py for SS06 errors (#34716) --- pandas/core/series.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index b51c08fa592d5..b32a4c36a8247 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2935,8 +2935,10 @@ def combine_first(self, other) -> "Series": def update(self, other) -> None: """ - Modify Series in place using non-NA values from passed - Series. Aligns on index. + Modify Series in place using values from passed Series. + + Uses non-NA values from passed Series to make updates. Aligns + on index. Parameters ---------- @@ -3451,6 +3453,8 @@ def sort_index( def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": """ + Return the integer indices that would sort the Series values. + Override ndarray.argsort. Argsorts the value, omitting NA/null values, and places the result in the same locations as the non-NA values. @@ -3733,8 +3737,7 @@ def reorder_levels(self, order) -> "Series": def explode(self) -> "Series": """ - Transform each element of a list-like to a row, replicating the - index values. 
+ Transform each element of a list-like to a row. .. versionadded:: 0.25.0 @@ -3792,6 +3795,7 @@ def explode(self) -> "Series": def unstack(self, level=-1, fill_value=None): """ Unstack, also known as pivot, Series with MultiIndex to produce DataFrame. + The level involved will automatically get sorted. Parameters @@ -4825,8 +4829,7 @@ def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": def to_period(self, freq=None, copy=True) -> "Series": """ - Convert Series from DatetimeIndex to PeriodIndex with desired - frequency (inferred from index if not passed). + Convert Series from DatetimeIndex to PeriodIndex. Parameters ---------- From 385eff970110afa40e504323fbaa839fb93f2142 Mon Sep 17 00:00:00 2001 From: willpeppo Date: Fri, 12 Jun 2020 16:59:55 -0400 Subject: [PATCH 0074/1025] DOC: updated core/groupby/generic.py for SS06 errors (#34715) --- pandas/core/groupby/generic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 3d07f90bf7f94..5894066dd33c8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1573,8 +1573,10 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: def filter(self, func, dropna=True, *args, **kwargs): """ - Return a copy of a DataFrame excluding elements from groups that - do not satisfy the boolean criterion specified by func. + Return a copy of a DataFrame excluding filtered elements. + + Elements from groups are filtered if they do not satisfy the + boolean criterion specified by func. Parameters ---------- @@ -1835,8 +1837,7 @@ def count(self): def nunique(self, dropna: bool = True): """ - Return DataFrame with number of distinct observations per group for - each column. + Return DataFrame with counts of unique elements in each position. 
Parameters ---------- From cfbb9c3adc79509812f6dbcb0d7864d7e42e4cef Mon Sep 17 00:00:00 2001 From: willpeppo Date: Fri, 12 Jun 2020 17:00:55 -0400 Subject: [PATCH 0075/1025] DOC: updated core/indexes/base.py for SS06 errors (#34713) --- pandas/core/indexes/base.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 240882e561bc6..4a99d2dfe339a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -609,9 +609,10 @@ def view(self, cls=None): def astype(self, dtype, copy=True): """ - Create an Index with values cast to dtypes. The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. + Create an Index with values cast to dtypes. + + The class of a new Index is determined by dtype. When conversion is + impossible, a ValueError exception is raised. Parameters ---------- @@ -2197,8 +2198,9 @@ def dropna(self, how="any"): def unique(self, level=None): """ - Return unique values in the index. Uniques are returned in order - of appearance, this does NOT sort. + Return unique values in the index. + + Unique values are returned in order of appearance, this does NOT sort. Parameters ---------- @@ -2675,8 +2677,7 @@ def intersection(self, other, sort=False): def difference(self, other, sort=None): """ - Return a new Index with elements from the index that are not in - `other`. + Return a new Index with elements of index not in `other`. This is the set difference of two Index objects. @@ -3271,8 +3272,7 @@ def _can_reindex(self, indexer): def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ - Create index with target's values (move/add/delete values - as necessary). + Create index with target's values. 
Parameters ---------- @@ -4253,8 +4253,7 @@ def equals(self, other: Any) -> bool: def identical(self, other) -> bool: """ - Similar to equals, but check that other comparable attributes are - also equal. + Similar to equals, but checks that object attributes and types are also equal. Returns ------- @@ -4340,8 +4339,7 @@ def asof(self, label): def asof_locs(self, where, mask): """ - Find the locations (indices) of the labels from the index for - every entry in the `where` argument. + Return the locations (indices) of labels in the index. As in the `asof` function, if the label (a particular entry in `where`) is not in the index, the latest index label up to the @@ -4551,8 +4549,9 @@ def argsort(self, *args, **kwargs) -> np.ndarray: def get_value(self, series: "Series", key): """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing. + Fast lookup of value from 1-dimensional ndarray. + + Only use this if you know what you're doing. Returns ------- @@ -4905,8 +4904,9 @@ def _get_string_slice(self, key: str_t, use_lhs: bool = True, use_rhs: bool = Tr def slice_indexer(self, start=None, end=None, step=None, kind=None): """ - For an ordered or unique index, compute the slice indexer for input - labels and step. + Compute the slice indexer for input labels and step. + + Index needs to be ordered and unique. 
Parameters ---------- From 005f4c01a518f83fb1a31787f6a433e6b4007fa2 Mon Sep 17 00:00:00 2001 From: willpeppo Date: Sat, 13 Jun 2020 13:04:13 -0400 Subject: [PATCH 0076/1025] DOC: updated strings.py for SS06 errors (#34745) --- pandas/core/strings.py | 56 +++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/pandas/core/strings.py b/pandas/core/strings.py index b27ad744dbdba..a1db7742916de 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -570,9 +570,9 @@ def str_endswith(arr, pat, na=np.nan): def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): r""" - Replace occurrences of pattern/regex in the Series/Index with - some other string. Equivalent to :meth:`str.replace` or - :func:`re.sub`, depending on the regex value. + Replace each occurrence of pattern/regex in the Series/Index. + + Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on the regex value. Parameters ---------- @@ -1063,6 +1063,8 @@ def str_extract(arr, pat, flags=0, expand=True): def str_extractall(arr, pat, flags=0): r""" + Extract capture groups in the regex `pat` as columns in DataFrame. + For each subject string in the Series, extract groups from all matches of regular expression pat. When each subject string in the Series has exactly one match, extractall(pat).xs(0, level='match') @@ -1174,7 +1176,9 @@ def str_extractall(arr, pat, flags=0): def str_get_dummies(arr, sep="|"): """ - Split each string in the Series by sep and return a DataFrame + Return DataFrame of dummy/indicator variables for Series. + + Each string in Series is split by sep and returned as a DataFrame of dummy/indicator variables. Parameters @@ -1743,8 +1747,7 @@ def str_strip(arr, to_strip=None, side="both"): def str_wrap(arr, width, **kwargs): r""" - Wrap long strings in the Series/Index to be formatted in - paragraphs with length less than a given width. + Wrap strings in Series/Index at specified line width. 
This method has the same keyword parameters and defaults as :class:`textwrap.TextWrapper`. @@ -1807,6 +1810,7 @@ def str_wrap(arr, width, **kwargs): def str_translate(arr, table): """ Map all characters in the string through the given mapping table. + Equivalent to standard :meth:`str.translate`. Parameters @@ -1889,6 +1893,7 @@ def f(x): def str_decode(arr, encoding, errors="strict"): """ Decode character string in the Series/Index using indicated encoding. + Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in python3. @@ -1913,6 +1918,7 @@ def str_decode(arr, encoding, errors="strict"): def str_encode(arr, encoding, errors="strict"): """ Encode character string in the Series/Index using indicated encoding. + Equivalent to :meth:`str.encode`. Parameters @@ -2068,9 +2074,11 @@ def do_copy(target): class StringMethods(NoNewAttributesMixin): """ - Vectorized string functions for Series and Index. NAs stay NA unless - handled otherwise by a particular method. Patterned after Python's string - methods, with some inspiration from R's stringr package. + Vectorized string functions for Series and Index. + + NAs stay NA unless handled otherwise by a particular method. + Patterned after Python's string methods, with some inspiration from + R's stringr package. Examples -------- @@ -2853,8 +2861,9 @@ def pad(self, width, side="left", fillchar=" "): _shared_docs[ "str_pad" ] = """ - Filling %(side)s side of strings in the Series/Index with an - additional character. Equivalent to :meth:`str.%(method)s`. + Pad %(side)s side of strings in the Series/Index. + + Equivalent to :meth:`str.%(method)s`. Parameters ---------- @@ -3117,9 +3126,11 @@ def extractall(self, pat, flags=0): _shared_docs[ "find" ] = """ - Return %(side)s indexes in each strings in the Series/Index - where the substring is fully contained between [start:end]. - Return -1 on failure. Equivalent to standard :meth:`str.%(method)s`. + Return %(side)s indexes in each strings in the Series/Index. 
+ + Each of returned indexes corresponds to the position where the + substring is fully contained between [start:end]. Return -1 on + failure. Equivalent to standard :meth:`str.%(method)s`. Parameters ---------- @@ -3169,6 +3180,7 @@ def rfind(self, sub, start=0, end=None): def normalize(self, form): """ Return the Unicode normal form for the strings in the Series/Index. + For more information on the forms, see the :func:`unicodedata.normalize`. @@ -3190,10 +3202,13 @@ def normalize(self, form): _shared_docs[ "index" ] = """ - Return %(side)s indexes in each strings where the substring is - fully contained between [start:end]. This is the same as - ``str.%(similar)s`` except instead of returning -1, it raises a ValueError - when the substring is not found. Equivalent to standard ``str.%(method)s``. + Return %(side)s indexes in each string in Series/Index. + + Each of the returned indexes corresponds to the position where the + substring is fully contained between [start:end]. This is the same + as ``str.%(similar)s`` except instead of returning -1, it raises a + ValueError when the substring is not found. Equivalent to standard + ``str.%(method)s``. Parameters ---------- @@ -3244,8 +3259,9 @@ def rindex(self, sub, start=0, end=None): _shared_docs[ "len" ] = """ - Compute the length of each element in the Series/Index. The element may be - a sequence (such as a string, tuple or list) or a collection + Compute the length of each element in the Series/Index. + + The element may be a sequence (such as a string, tuple or list) or a collection (such as a dictionary). 
Returns From 8773899c37e8f00be6a7c57be2be6a56852a47d2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 13 Jun 2020 19:06:51 +0200 Subject: [PATCH 0077/1025] CLN: remove the old 'nature_with_gtoc' sphinx doc theme (#34742) --- .../themes/nature_with_gtoc/layout.html | 108 ------ .../nature_with_gtoc/static/nature.css_t | 356 ------------------ doc/source/themes/nature_with_gtoc/theme.conf | 7 - 3 files changed, 471 deletions(-) delete mode 100644 doc/source/themes/nature_with_gtoc/layout.html delete mode 100644 doc/source/themes/nature_with_gtoc/static/nature.css_t delete mode 100644 doc/source/themes/nature_with_gtoc/theme.conf diff --git a/doc/source/themes/nature_with_gtoc/layout.html b/doc/source/themes/nature_with_gtoc/layout.html deleted file mode 100644 index 6e7d8ece35133..0000000000000 --- a/doc/source/themes/nature_with_gtoc/layout.html +++ /dev/null @@ -1,108 +0,0 @@ -{# - -Subset of agogo theme -agogo/layout.html - -Sphinx layout template for the agogo theme, originally written -by Andi Albrecht. - -:copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. -:license: BSD, see LICENSE for details. -#} -{% extends "basic/layout.html" %} - -{%- block content %} -
-
-
-
- {%- block sidebar1 %} - {%- block sidebartoc %} -

{{ _('Table Of Contents') }}

- {{ toctree(includehidden=True) }} - {%- endblock %} - {%- block sidebarsearch %} -

{{ _('Search') }}

- - -

- {{ _('Enter search terms or a module, class or function name.') }} -

- -
- {%- endblock %} - {# possible location for sidebar #} {% endblock %} - - - {%- block document %} -
- {%- if render_sidebar %} -
- {%- endif %} -
- {% block body %} {% endblock %} -
- {%- if render_sidebar %} -
- {%- endif %} -
- {%- endblock %} - - {%- block sidebar2 %} - - {% endblock %} -
-
-
-
-{%- endblock %} - -{%- block footer %} - -Scroll To Top - - - - - - - -{% endblock %} diff --git a/doc/source/themes/nature_with_gtoc/static/nature.css_t b/doc/source/themes/nature_with_gtoc/static/nature.css_t deleted file mode 100644 index 4571d97ec50ba..0000000000000 --- a/doc/source/themes/nature_with_gtoc/static/nature.css_t +++ /dev/null @@ -1,356 +0,0 @@ -/* - * nature.css_t - * ~~~~~~~~~~~~ - * - * Sphinx stylesheet -- nature theme. - * - * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * - */ - -@import url("basic.css"); - -/* -- page layout ----------------------------------------------------------- */ - -body { - font-family: Arial, sans-serif; - font-size: 100%; - background-color: #111; - color: #555; - margin: 0; - padding: 0; -} - - -div.documentwrapper { - width: 100%; -} - -div.bodywrapper { -/* ugly hack, probably not attractive with other font size for re*/ - margin: 0 0 0 {{ theme_sidebarwidth|toint}}px; - min-width: 540px; - max-width: 800px; -} - - -hr { - border: 1px solid #B1B4B6; -} - -div.document { - background-color: #eee; -} - -div.body { - background-color: #ffffff; - color: #3E4349; - padding: 0 30px 30px 30px; - font-size: 0.9em; -} - -div.footer { - color: #555; - width: 100%; - padding: 13px 0; - text-align: center; - font-size: 75%; -} - -div.footer a { - color: #444; - text-decoration: underline; -} - -div.related { - background-color: #6BA81E; - line-height: 32px; - color: #fff; - text-shadow: 0px 1px 0 #444; - font-size: 0.9em; -} - -div.related a { - color: #E2F3CC; -} - -div.sphinxsidebar { - font-size: 0.75em; - line-height: 1.5em; - width: {{ theme_sidebarwidth|toint }}px; - margin: 0 ; - float: left; - - background-color: #eee; -} -/* -div.sphinxsidebarwrapper{ - padding: 20px 0; -} -*/ -div.sphinxsidebar h3, -div.sphinxsidebar h4 { - font-family: Arial, sans-serif; - color: #222; - font-size: 1.2em; - font-weight: normal; - margin: 20px 0 0 0; - padding: 5px 
10px; - background-color: #ddd; - text-shadow: 1px 1px 0 white -} - -div.sphinxsidebar h4{ - font-size: 1.1em; -} - -div.sphinxsidebar h3 a { - color: #444; -} - - -div.sphinxsidebar p { - color: #888; -/* padding: 5px 20px;*/ -} - -div.sphinxsidebar p.searchtip { - color: #888; - padding: 5px 20px; -} - - -div.sphinxsidebar p.topless { -} - -div.sphinxsidebar ul { - margin: 10px 20px; - padding: 0; - color: #000; -} - -div.sphinxsidebar a { - color: #444; -} - -div.sphinxsidebar input { - border: 1px solid #ccc; - font-family: sans-serif; - font-size: 1em; -} - -div.sphinxsidebar input[type=text]{ - margin-left: 20px; -} - -/* -- body styles ----------------------------------------------------------- */ - -a { - color: #005B81; - text-decoration: none; -} - -a:hover { - color: #E32E00; - text-decoration: underline; -} - -div.body h1, -div.body h2, -div.body h3, -div.body h4, -div.body h5, -div.body h6 { - font-family: Arial, sans-serif; - background-color: #BED4EB; - font-weight: normal; - color: #212224; - margin: 30px 0px 10px 0px; - padding: 5px 0 5px 10px; - text-shadow: 0px 1px 0 white -} - -div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 200%; } -div.body h2 { font-size: 150%; background-color: #C8D5E3; } -div.body h3 { font-size: 120%; background-color: #D8DEE3; } -div.body h4 { font-size: 110%; background-color: #D8DEE3; } -div.body h5 { font-size: 100%; background-color: #D8DEE3; } -div.body h6 { font-size: 100%; background-color: #D8DEE3; } - -p.rubric { - border-bottom: 1px solid rgb(201, 201, 201); -} - -a.headerlink { - color: #c60f0f; - font-size: 0.8em; - padding: 0 4px 0 4px; - text-decoration: none; -} - -a.headerlink:hover { - background-color: #c60f0f; - color: white; -} - -div.body p, div.body dd, div.body li { - line-height: 1.5em; -} - -div.admonition p.admonition-title + p, div.deprecated p { - display: inline; -} - -div.deprecated { - margin-bottom: 10px; - margin-top: 10px; - padding: 7px; - background-color: #ffe4e4; 
- border: 1px solid #f66; -} - -div.highlight{ - background-color: white; -} - -div.note { - background-color: #eee; - border: 1px solid #ccc; -} - -div.seealso { - background-color: #ffc; - border: 1px solid #ff6; -} - -div.topic { - background-color: #eee; -} - -div.warning { - background-color: #ffe4e4; - border: 1px solid #f66; -} - -p.admonition-title { - display: inline; -} - -p.admonition-title:after { - content: ":"; -} - -pre { - padding: 10px; - background-color: rgb(250,250,250); - color: #222; - line-height: 1.2em; - border: 1px solid rgb(201,201,201); - font-size: 1.1em; - margin: 1.5em 0 1.5em 0; - -webkit-box-shadow: 1px 1px 1px #d8d8d8; - -moz-box-shadow: 1px 1px 1px #d8d8d8; -} - -tt { - background-color: #ecf0f3; - color: #222; - /* padding: 1px 2px; */ - font-size: 1.1em; - font-family: monospace; -} - -.viewcode-back { - font-family: Arial, sans-serif; -} - -div.viewcode-block:target { - background-color: #f4debf; - border-top: 1px solid #ac9; - border-bottom: 1px solid #ac9; -} - - -/** - * Styling for field lists - */ - - /* grey highlighting of 'parameter' and 'returns' field */ -table.field-list { - border-collapse: separate; - border-spacing: 10px; - margin-left: 1px; - /* border-left: 5px solid rgb(238, 238, 238) !important; */ -} - -table.field-list th.field-name { - /* display: inline-block; */ - padding: 1px 8px 1px 5px; - white-space: nowrap; - background-color: rgb(238, 238, 238); -} - -/* italic font for parameter types */ -table.field-list td.field-body > p { - font-style: italic; -} - -table.field-list td.field-body > p > strong { - font-style: normal; -} - -/* reduced space around parameter description */ -td.field-body blockquote { - border-left: none; - margin: 0em 0em 0.3em; - padding-left: 30px; -} - -// Adapted from the new Jupyter notebook style -// https://github.com/jupyter/notebook/blob/c8841b68c4c0739bbee1291e0214771f24194079/notebook/static/notebook/less/renderedhtml.less#L59 -table { - margin-left: auto; - 
margin-right: auto; - border: none; - border-collapse: collapse; - border-spacing: 0; - color: @rendered_html_border_color; - table-layout: fixed; -} -thead { - border-bottom: 1px solid @rendered_html_border_color; - vertical-align: bottom; -} -tr, th, td { - vertical-align: middle; - padding: 0.5em 0.5em; - line-height: normal; - white-space: normal; - max-width: none; - border: none; -} -th { - font-weight: bold; -} -th.col_heading { - text-align: right; -} -tbody tr:nth-child(odd) { - background: #f5f5f5; -} - -table td.data, table th.row_heading table th.col_heading { - font-family: monospace; - text-align: right; -} - - -/** - * See also - */ - -div.seealso dd { - margin-top: 0; - margin-bottom: 0; -} diff --git a/doc/source/themes/nature_with_gtoc/theme.conf b/doc/source/themes/nature_with_gtoc/theme.conf deleted file mode 100644 index 290a07bde8806..0000000000000 --- a/doc/source/themes/nature_with_gtoc/theme.conf +++ /dev/null @@ -1,7 +0,0 @@ -[theme] -inherit = basic -stylesheet = nature.css -pygments_style = tango - -[options] -sidebarwidth = 270 From e945868e4508991661e38b6ee6b5cc01b9e2d9a5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 13 Jun 2020 18:57:27 +0100 Subject: [PATCH 0078/1025] TYP: update setup.cfg (#34688) --- setup.cfg | 6 ------ 1 file changed, 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index ea3d4c67d9358..e31d6001e065d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -136,9 +136,6 @@ check_untyped_defs=False [mypy-pandas.conftest] ignore_errors=True -[mypy-pandas.tests.arithmetic.test_datetime64] -ignore_errors=True - [mypy-pandas.tests.tools.test_to_datetime] ignore_errors=True @@ -303,6 +300,3 @@ check_untyped_defs=False [mypy-pandas.tseries.holiday] check_untyped_defs=False - -[mypy-pandas.tseries.offsets] -check_untyped_defs=False From b8139ea1d5e239742dcbcf98e7763da4a3633c71 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 13 Jun 2020 19:01:46 +0100 Subject: [PATCH 0079/1025] TYP: type annotations for read_sas 
(#34697) --- pandas/io/sas/sasreader.py | 45 +++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index bd8c3be271505..291c9d1ee7f0c 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -1,11 +1,16 @@ """ Read SAS sas7bdat or xport files. """ - from abc import ABCMeta, abstractmethod +from typing import TYPE_CHECKING, Optional, Union, overload + +from pandas._typing import FilePathOrBuffer, Label from pandas.io.common import stringify_path +if TYPE_CHECKING: + from pandas import DataFrame # noqa: F401 + # TODO(PY38): replace with Protocol in Python 3.8 class ReaderBase(metaclass=ABCMeta): @@ -22,14 +27,38 @@ def close(self): pass +@overload +def read_sas( + filepath_or_buffer: FilePathOrBuffer, + format: Optional[str] = ..., + index: Optional[Label] = ..., + encoding: Optional[str] = ..., + chunksize: int = ..., + iterator: bool = ..., +) -> ReaderBase: + ... + + +@overload +def read_sas( + filepath_or_buffer: FilePathOrBuffer, + format: Optional[str] = ..., + index: Optional[Label] = ..., + encoding: Optional[str] = ..., + chunksize: None = ..., + iterator: bool = ..., +) -> Union["DataFrame", ReaderBase]: + ... + + def read_sas( - filepath_or_buffer, - format=None, - index=None, - encoding=None, - chunksize=None, - iterator=False, -): + filepath_or_buffer: FilePathOrBuffer, + format: Optional[str] = None, + index: Optional[Label] = None, + encoding: Optional[str] = None, + chunksize: Optional[int] = None, + iterator: bool = False, +) -> Union["DataFrame", ReaderBase]: """ Read SAS files stored as either XPORT or SAS7BDAT format files. 
From db9d93bdc1f1868899fc5d3baf4ba7d55a7d96a9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 13 Jun 2020 19:25:53 +0100 Subject: [PATCH 0080/1025] TYP: check_untyped_defs pandas.core.resample (#34692) --- pandas/core/indexes/period.py | 11 +++++++++-- pandas/core/resample.py | 12 +++++++----- setup.cfg | 3 --- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 49cb78340d104..fc29f786a1476 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -64,8 +64,7 @@ def _new_PeriodIndex(cls, **d): @inherit_names( - ["strftime", "to_timestamp", "asfreq", "start_time", "end_time"] - + PeriodArray._field_ops, + ["strftime", "to_timestamp", "start_time", "end_time"] + PeriodArray._field_ops, PeriodArray, wrap=True, ) @@ -152,6 +151,14 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True + # -------------------------------------------------------------------- + # methods that dispatch to array and wrap result in PeriodIndex + + @doc(PeriodArray.asfreq) + def asfreq(self, freq=None, how: str = "E") -> "PeriodIndex": + arr = self._data.asfreq(freq, how) + return type(self)._simple_new(arr, name=self.name) + # ------------------------------------------------------------------------ # Index Constructors diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 32e947dc414d2..5e363f2814d39 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -966,7 +966,8 @@ def __init__(self, obj, *args, **kwargs): for attr in self._attributes: setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) - super().__init__(None) + # error: Too many arguments for "__init__" of "object" + super().__init__(None) # type: ignore self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True @@ -1553,7 +1554,7 @@ def _get_time_delta_bins(self, ax): return 
binner, bins, labels - def _get_time_period_bins(self, ax): + def _get_time_period_bins(self, ax: DatetimeIndex): if not isinstance(ax, DatetimeIndex): raise TypeError( "axis must be a DatetimeIndex, but got " @@ -1569,13 +1570,13 @@ def _get_time_period_bins(self, ax): labels = binner = period_range(start=ax[0], end=ax[-1], freq=freq, name=ax.name) end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp() - if ax.tzinfo: - end_stamps = end_stamps.tz_localize(ax.tzinfo) + if ax.tz: + end_stamps = end_stamps.tz_localize(ax.tz) bins = ax.searchsorted(end_stamps, side="left") return binner, bins, labels - def _get_period_bins(self, ax): + def _get_period_bins(self, ax: PeriodIndex): if not isinstance(ax, PeriodIndex): raise TypeError( "axis must be a PeriodIndex, but got " @@ -1898,6 +1899,7 @@ def _asfreq_compat(index, freq): raise ValueError( "Can only set arbitrary freq for empty DatetimeIndex or TimedeltaIndex" ) + new_index: Index if isinstance(index, PeriodIndex): new_index = index.asfreq(freq=freq) else: diff --git a/setup.cfg b/setup.cfg index e31d6001e065d..65749941c3da5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -214,9 +214,6 @@ check_untyped_defs=False [mypy-pandas.core.ops.docstrings] check_untyped_defs=False -[mypy-pandas.core.resample] -check_untyped_defs=False - [mypy-pandas.core.reshape.merge] check_untyped_defs=False From 8b8bd96d48d611078ef5af07e8049a4dde54619a Mon Sep 17 00:00:00 2001 From: OlivierLuG <59281854+OlivierLuG@users.noreply.github.com> Date: Sat, 13 Jun 2020 21:58:20 +0200 Subject: [PATCH 0081/1025] TST: boolean indexing using .iloc #20627 (#34622) * added a test for issue #20627 * added a test for issue #20627 (review taken into account) * TST: boolean indexing using .iloc #20627 * TST #20627 added tests, and take review into account * Updated test_iloc.py to pass CI testing --- pandas/tests/indexing/test_iloc.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/indexing/test_iloc.py 
b/pandas/tests/indexing/test_iloc.py index c97cd81c84726..c5f40102874dd 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -705,6 +705,25 @@ def test_iloc_setitem_categorical_updates_inplace(self): expected = pd.Categorical(["C", "B", "A"]) tm.assert_categorical_equal(cat, expected) + def test_iloc_with_boolean_operation(self): + # GH 20627 + result = DataFrame([[0, 1], [2, 3], [4, 5], [6, np.nan]]) + result.iloc[result.index <= 2] *= 2 + expected = DataFrame([[0, 2], [4, 6], [8, 10], [6, np.nan]]) + tm.assert_frame_equal(result, expected) + + result.iloc[result.index > 2] *= 2 + expected = DataFrame([[0, 2], [4, 6], [8, 10], [12, np.nan]]) + tm.assert_frame_equal(result, expected) + + result.iloc[[True, True, False, False]] *= 2 + expected = DataFrame([[0, 4], [8, 12], [8, 10], [12, np.nan]]) + tm.assert_frame_equal(result, expected) + + result.iloc[[False, False, True, True]] /= 2 + expected = DataFrame([[0.0, 4.0], [8.0, 12.0], [4.0, 5.0], [6.0, np.nan]]) + tm.assert_frame_equal(result, expected) + class TestILocSetItemDuplicateColumns: def test_iloc_setitem_scalar_duplicate_columns(self): From 4295f8161042cfba9c34cef44014ea5a494a6436 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 13 Jun 2020 21:12:52 +0100 Subject: [PATCH 0082/1025] CLN: clean and deduplicate in core.missing.interpolate_1d (#34744) * interpolate_1d returns function * CLN: clean and deduplicate in core.missing.interpolate_1d --- pandas/core/missing.py | 102 ++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 57 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index d8671616f944e..7802c5cbdbfb3 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -94,30 +94,37 @@ def clean_fill_method(method, allow_nearest=False): return method +# interpolation methods that dispatch to np.interp + +NP_METHODS = ["linear", "time", "index", "values"] + +# interpolation methods that dispatch to 
_interpolate_scipy_wrapper + +SP_METHODS = [ + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "spline", + "polynomial", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + "cubicspline", +] + + def clean_interp_method(method: str, **kwargs) -> str: order = kwargs.get("order") - valid = [ - "linear", - "time", - "index", - "values", - "nearest", - "zero", - "slinear", - "quadratic", - "cubic", - "barycentric", - "polynomial", - "krogh", - "piecewise_polynomial", - "pchip", - "akima", - "spline", - "from_derivatives", - "cubicspline", - ] + if method in ("spline", "polynomial") and order is None: raise ValueError("You must specify the order of the spline or polynomial.") + + valid = NP_METHODS + SP_METHODS if method not in valid: raise ValueError(f"method must be one of {valid}. Got '{method}' instead.") @@ -180,8 +187,6 @@ def interpolate_1d( Bounds_error is currently hardcoded to False since non-scipy ones don't take it as an argument. """ - # Treat the original, non-scipy methods first. 
- invalid = isna(yvalues) valid = ~invalid @@ -261,50 +266,32 @@ def interpolate_1d( # sort preserve_nans and covert to list preserve_nans = sorted(preserve_nans) - xvalues = getattr(xvalues, "values", xvalues) yvalues = getattr(yvalues, "values", yvalues) result = yvalues.copy() - if method in ["linear", "time", "index", "values"]: + # xvalues to pass to NumPy/SciPy + + xvalues = getattr(xvalues, "values", xvalues) + if method == "linear": + inds = xvalues + else: + inds = np.asarray(xvalues) + + # hack for DatetimeIndex, #1646 + if needs_i8_conversion(inds.dtype): + inds = inds.view(np.int64) + if method in ("values", "index"): - inds = np.asarray(xvalues) - # hack for DatetimeIndex, #1646 - if needs_i8_conversion(inds.dtype): - inds = inds.view(np.int64) if inds.dtype == np.object_: inds = lib.maybe_convert_objects(inds) - else: - inds = xvalues + + if method in NP_METHODS: # np.interp requires sorted X values, #21037 indexer = np.argsort(inds[valid]) result[invalid] = np.interp( inds[invalid], inds[valid][indexer], yvalues[valid][indexer] ) - result[preserve_nans] = np.nan - return result - - sp_methods = [ - "nearest", - "zero", - "slinear", - "quadratic", - "cubic", - "barycentric", - "krogh", - "spline", - "polynomial", - "from_derivatives", - "piecewise_polynomial", - "pchip", - "akima", - "cubicspline", - ] - - if method in sp_methods: - inds = np.asarray(xvalues) - # hack for DatetimeIndex, #1646 - if issubclass(inds.dtype.type, np.datetime64): - inds = inds.view(np.int64) + else: result[invalid] = _interpolate_scipy_wrapper( inds[valid], yvalues[valid], @@ -315,8 +302,9 @@ def interpolate_1d( order=order, **kwargs, ) - result[preserve_nans] = np.nan - return result + + result[preserve_nans] = np.nan + return result def _interpolate_scipy_wrapper( From 116b8b8511dd10f6b87a18d1328949924046e57f Mon Sep 17 00:00:00 2001 From: George Hartzell Date: Sat, 13 Jun 2020 18:29:27 -0700 Subject: [PATCH 0083/1025] typo: pivot_table -> pivot (#34758) --- 
.../getting_started/intro_tutorials/07_reshape_table_layout.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst index a9652969ffc79..c16fec6aaba9f 100644 --- a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst +++ b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst @@ -196,7 +196,7 @@ I want the values for the three stations as separate columns next to each other no2_subset.pivot(columns="location", values="value") -The :meth:`~pandas.pivot_table` function is purely reshaping of the data: a single value +The :meth:`~pandas.pivot` function is purely reshaping of the data: a single value for each index/column combination is required. .. raw:: html From a776f912d773a377a0ddff7ce2747cdb3a014928 Mon Sep 17 00:00:00 2001 From: George Hartzell Date: Sat, 13 Jun 2020 18:30:26 -0700 Subject: [PATCH 0084/1025] typo: rows -> columns (#34757) --- .../getting_started/intro_tutorials/01_table_oriented.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index 9ee3bfc3b8e79..dc9bec2284aab 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -51,7 +51,7 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno df To manually store data in a table, create a ``DataFrame``. When using a Python dictionary of lists, the dictionary keys will be used as column headers and -the values in each list as rows of the ``DataFrame``. +the values in each list as columns of the ``DataFrame``. .. raw:: html @@ -215,4 +215,4 @@ A more extended explanation to ``DataFrame`` and ``Series`` is provided in the : .. 
raw:: html - \ No newline at end of file + From aeb13c53bb38608ef9f834249f17ee7a9df8f63d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 14 Jun 2020 10:51:39 +0100 Subject: [PATCH 0085/1025] TYP: check_untyped_defs pandas.io.json._table_schema (#34695) Co-authored-by: William Ayd --- pandas/io/json/_table_schema.py | 21 +++++++++++++++++---- setup.cfg | 3 --- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 239ff6241aab0..84146a5d732e1 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -3,10 +3,11 @@ https://specs.frictionlessdata.io/json-table-schema/ """ +from typing import TYPE_CHECKING, Any, Dict, Optional, cast import warnings import pandas._libs.json as json -from pandas._typing import DtypeObj +from pandas._typing import DtypeObj, FrameOrSeries, JSONSerializable from pandas.core.dtypes.common import ( is_bool_dtype, @@ -24,6 +25,9 @@ from pandas import DataFrame import pandas.core.common as com +if TYPE_CHECKING: + from pandas.core.indexes.multi import MultiIndex # noqa: F401 + loads = json.loads @@ -103,7 +107,10 @@ def convert_pandas_type_to_json_field(arr): name = "values" else: name = arr.name - field = {"name": name, "type": as_json_table_type(dtype)} + field: Dict[str, JSONSerializable] = { + "name": name, + "type": as_json_table_type(dtype), + } if is_categorical_dtype(dtype): cats = dtype.categories @@ -182,7 +189,12 @@ def convert_json_field_to_pandas_type(field): raise ValueError(f"Unsupported or invalid field type: {typ}") -def build_table_schema(data, index=True, primary_key=None, version=True): +def build_table_schema( + data: FrameOrSeries, + index: bool = True, + primary_key: Optional[bool] = None, + version: bool = True, +) -> Dict[str, JSONSerializable]: """ Create a Table schema from ``data``. 
@@ -233,11 +245,12 @@ def build_table_schema(data, index=True, primary_key=None, version=True): if index is True: data = set_default_names(data) - schema = {} + schema: Dict[str, Any] = {} fields = [] if index: if data.index.nlevels > 1: + data.index = cast("MultiIndex", data.index) for level, name in zip(data.index.levels, data.index.names): new_field = convert_pandas_type_to_json_field(level) new_field["name"] = name diff --git a/setup.cfg b/setup.cfg index 65749941c3da5..aaebff44139eb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -268,9 +268,6 @@ check_untyped_defs=False [mypy-pandas.io.json._json] check_untyped_defs=False -[mypy-pandas.io.json._table_schema] -check_untyped_defs=False - [mypy-pandas.io.parsers] check_untyped_defs=False From a296e95fead8f4ef343d4db561342df8cbea1c91 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 07:13:00 -0700 Subject: [PATCH 0086/1025] BUG: DataFrame.unstack with non-consolidated (#34709) --- pandas/core/reshape/reshape.py | 7 +++---- pandas/tests/frame/test_reshape.py | 11 +++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 133fba0246497..391313fbb5283 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -41,8 +41,7 @@ class _Unstacker: Parameters ---------- - index : object - Pandas ``Index`` + index : MultiIndex level : int or str, default last level Level to "unstack". Accepts a name for the level. 
fill_value : scalar, optional @@ -83,7 +82,7 @@ class _Unstacker: """ def __init__( - self, index, level=-1, constructor=None, + self, index: MultiIndex, level=-1, constructor=None, ): if constructor is None: @@ -415,7 +414,7 @@ def unstack(obj, level, fill_value=None): level = obj.index._get_level_number(level) if isinstance(obj, DataFrame): - if isinstance(obj.index, MultiIndex) or not obj._can_fast_transpose: + if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 2e707342a0793..a6c4089dc71e6 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -140,6 +140,17 @@ def test_stack_mixed_level(self): expected = expected[["a", "b"]] tm.assert_frame_equal(result, expected) + def test_unstack_not_consolidated(self): + # Gh#34708 + df = pd.DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]}) + df2 = df[["x"]] + df2["y"] = df["y"] + assert len(df2._mgr.blocks) == 2 + + res = df2.unstack() + expected = df.unstack() + tm.assert_series_equal(res, expected) + def test_unstack_fill(self): # GH #9746: fill_value keyword argument for Series From ecace5a6e0a61b0dcfa18bf40dd5f17b7b330668 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 07:13:46 -0700 Subject: [PATCH 0087/1025] PERF: is_date_array_normalized (#34707) --- pandas/_libs/tslibs/conversion.pyx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 152e9a5ad7ddc..40b2d44235d8b 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -445,7 +445,7 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, cdef _TSObject create_tsobject_tz_using_offset(npy_datetimestruct dts, - int tzoffset, object tz=None): + int tzoffset, tzinfo 
tz=None): """ Convert a datetimestruct `dts`, along with initial timezone offset `tzoffset` to a _TSObject (with timezone object `tz` - optional). @@ -847,7 +847,7 @@ cdef inline int64_t _normalize_i8_stamp(int64_t local_val) nogil: @cython.wraparound(False) @cython.boundscheck(False) -def is_date_array_normalized(const int64_t[:] stamps, object tz=None): +def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None): """ Check if all of the given (nanosecond) timestamps are normalized to midnight, i.e. hour == minute == second == 0. If the optional timezone @@ -867,20 +867,20 @@ def is_date_array_normalized(const int64_t[:] stamps, object tz=None): ndarray[int64_t] trans int64_t[:] deltas intp_t[:] pos - npy_datetimestruct dts int64_t local_val, delta str typ + int64_t day_nanos = 24 * 3600 * 1_000_000_000 if tz is None or is_utc(tz): for i in range(n): - dt64_to_dtstruct(stamps[i], &dts) - if (dts.hour + dts.min + dts.sec + dts.us) > 0: + local_val = stamps[i] + if local_val % day_nanos != 0: return False + elif is_tzlocal(tz): for i in range(n): local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - if (dts.hour + dts.min + dts.sec + dts.us) > 0: + if local_val % day_nanos != 0: return False else: trans, deltas, typ = get_dst_info(tz) @@ -890,16 +890,16 @@ def is_date_array_normalized(const int64_t[:] stamps, object tz=None): delta = deltas[0] for i in range(n): # Adjust datetime64 timestamp, recompute datetimestruct - dt64_to_dtstruct(stamps[i] + delta, &dts) - if (dts.hour + dts.min + dts.sec + dts.us) > 0: + local_val = stamps[i] + delta + if local_val % day_nanos != 0: return False else: pos = trans.searchsorted(stamps) - 1 for i in range(n): # Adjust datetime64 timestamp, recompute datetimestruct - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - if (dts.hour + dts.min + dts.sec + dts.us) > 0: + local_val = stamps[i] + deltas[pos[i]] + if local_val % day_nanos != 0: return False return True From 
4e4d610f1ddff8acc1952b161cd89bab1f3d0ea8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 07:15:11 -0700 Subject: [PATCH 0088/1025] CLN: remove usages of base_and_stride (#34700) --- pandas/_libs/tslibs/offsets.pyx | 6 +++--- pandas/core/arrays/datetimes.py | 4 +--- pandas/plotting/_matplotlib/timeseries.py | 8 ++------ 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4069d192d9e88..250ff608308d8 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3491,7 +3491,7 @@ INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" _offset_map = {} -cpdef base_and_stride(str freqstr): +cdef _base_and_stride(str freqstr): """ Return base freq and stride info from string representation @@ -3502,7 +3502,7 @@ cpdef base_and_stride(str freqstr): Examples -------- - _freq_and_stride('5Min') -> 'Min', 5 + _base_and_stride('5Min') -> 'Min', 5 """ groups = opattern.match(freqstr) @@ -3606,7 +3606,7 @@ cpdef to_offset(freq): stride = freq[1] if isinstance(stride, str): name, stride = stride, name - name, _ = base_and_stride(name) + name, _ = _base_and_stride(name) delta = _get_offset(name) * stride elif isinstance(freq, timedelta): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 90513e355e732..b6c27abc321e1 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -12,7 +12,6 @@ conversion, fields, iNaT, - offsets as liboffsets, resolution as libresolution, timezones, to_offset, @@ -1106,8 +1105,7 @@ def to_period(self, freq=None): # https://github.com/pandas-dev/pandas/issues/33358 if res is None: - base, stride = liboffsets.base_and_stride(freq) - res = f"{stride}{base}" + res = freq freq = res diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 99fc730e818c4..fa8051954e435 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ 
b/pandas/plotting/_matplotlib/timeseries.py @@ -7,7 +7,6 @@ from pandas._libs.tslibs import Period, to_offset from pandas._libs.tslibs.frequencies import FreqGroup -from pandas._libs.tslibs.offsets import base_and_stride from pandas._typing import FrameOrSeriesUnion from pandas.core.dtypes.generic import ( @@ -167,12 +166,9 @@ def _get_ax_freq(ax): def _get_period_alias(freq) -> Optional[str]: - if isinstance(freq, DateOffset): - freq = freq.rule_code - else: - freq = base_and_stride(freq)[0] + freqstr = to_offset(freq).rule_code - freq = get_period_alias(freq) + freq = get_period_alias(freqstr) return freq From 5b50c05c0e9565a8580df1f1852c61c399c8bd7c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 07:20:26 -0700 Subject: [PATCH 0089/1025] CLN: remove libfrequencies.get_freq_group (#34701) --- pandas/_libs/tslibs/dtypes.pxd | 1 + pandas/_libs/tslibs/dtypes.pyx | 26 ++++++++++++++ pandas/_libs/tslibs/frequencies.pxd | 2 -- pandas/_libs/tslibs/frequencies.pyx | 35 ------------------- pandas/_libs/tslibs/period.pyx | 6 ++-- pandas/_libs/tslibs/resolution.pyx | 2 +- pandas/core/indexes/period.py | 5 ++- pandas/plotting/_matplotlib/converter.py | 12 +++---- .../tseries/frequencies/test_freq_code.py | 3 +- 9 files changed, 40 insertions(+), 52 deletions(-) diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index bce071d45c12f..f43bc283d98c7 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -1,3 +1,4 @@ +cdef dict attrname_to_abbrevs cdef enum c_FreqGroup: # Mirrors FreqGroup in the .pxy file diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index e38cfe21a65cc..0752910317077 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -21,6 +21,11 @@ cdef class PeriodDtypeBase: return False return self.dtype_code == other.dtype_code + @property + def freq_group(self) -> int: + # See also: libperiod.get_freq_group + return (self.dtype_code // 
1000) * 1000 + @property def date_offset(self): """ @@ -108,6 +113,22 @@ _period_code_map.update({ }) +# Map attribute-name resolutions to resolution abbreviations +_attrname_to_abbrevs = { + "year": "A", + "quarter": "Q", + "month": "M", + "day": "D", + "hour": "H", + "minute": "T", + "second": "S", + "millisecond": "L", + "microsecond": "U", + "nanosecond": "N", +} +cdef dict attrname_to_abbrevs = _attrname_to_abbrevs + + class FreqGroup: # Mirrors c_FreqGroup in the .pxd file FR_ANN = 1000 @@ -123,3 +144,8 @@ class FreqGroup: FR_US = 11000 FR_NS = 12000 FR_UND = -10000 # undefined + + @staticmethod + def get_freq_group(code: int) -> int: + # See also: PeriodDtypeBase.freq_group + return (code // 1000) * 1000 diff --git a/pandas/_libs/tslibs/frequencies.pxd b/pandas/_libs/tslibs/frequencies.pxd index 896eec77ef4fe..b3ad6e6c19ee3 100644 --- a/pandas/_libs/tslibs/frequencies.pxd +++ b/pandas/_libs/tslibs/frequencies.pxd @@ -1,3 +1 @@ -cdef dict attrname_to_abbrevs - cpdef int get_to_timestamp_base(int base) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx index 6e525500ec37a..fd28240abd882 100644 --- a/pandas/_libs/tslibs/frequencies.pyx +++ b/pandas/_libs/tslibs/frequencies.pyx @@ -1,43 +1,8 @@ from .dtypes import FreqGroup -# --------------------------------------------------------------------- -# Period codes - - -# Map attribute-name resolutions to resolution abbreviations -_attrname_to_abbrevs = { - "year": "A", - "quarter": "Q", - "month": "M", - "day": "D", - "hour": "H", - "minute": "T", - "second": "S", - "millisecond": "L", - "microsecond": "U", - "nanosecond": "N", -} -cdef dict attrname_to_abbrevs = _attrname_to_abbrevs - - # ---------------------------------------------------------------------- -# TODO: this is now identical to the version in libperiod -def get_freq_group(freq: int) -> int: - """ - Return frequency code group of given frequency str or offset. 
- - Examples - -------- - >>> get_freq_group(4001) - 4000 - - >>> get_freq_group(4006) - 4000 - """ - return (freq // 1000) * 1000 - cpdef int get_to_timestamp_base(int base): """ diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 47ebf139ed496..d14f9d82eb5be 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -71,12 +71,10 @@ from pandas._libs.tslibs.dtypes cimport ( FR_MS, FR_US, FR_NS, -) - -from pandas._libs.tslibs.frequencies cimport ( attrname_to_abbrevs, - get_to_timestamp_base, ) + +from pandas._libs.tslibs.frequencies cimport get_to_timestamp_base from pandas._libs.tslibs.parsing cimport get_rule_month from pandas._libs.tslibs.parsing import parse_time_string from pandas._libs.tslibs.nattype cimport ( diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 7453933ddbb4f..55522e99459cb 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -5,9 +5,9 @@ from numpy cimport ndarray, int64_t, int32_t from pandas._libs.tslibs.util cimport get_nat +from pandas._libs.tslibs.dtypes cimport attrname_to_abbrevs from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dt64_to_dtstruct) -from pandas._libs.tslibs.frequencies cimport attrname_to_abbrevs from pandas._libs.tslibs.frequencies import FreqGroup from pandas._libs.tslibs.timezones cimport ( is_utc, is_tzlocal, maybe_get_tz, get_dst_info) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index fc29f786a1476..2022a4a563678 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -6,7 +6,6 @@ from pandas._libs import index as libindex from pandas._libs.lib import no_default from pandas._libs.tslibs import Period, Resolution -from pandas._libs.tslibs.frequencies import get_freq_group from pandas._libs.tslibs.parsing import DateParseError, parse_time_string from pandas._typing import DtypeObj, Label from 
pandas.util._decorators import Appender, cache_readonly, doc @@ -510,7 +509,7 @@ def get_loc(self, key, method=None, tolerance=None): reso = Resolution.from_attrname(reso) grp = reso.freq_group - freqn = get_freq_group(self.dtype.dtype_code) + freqn = self.dtype.freq_group # _get_string_slice will handle cases where grp < freqn assert grp >= freqn @@ -586,7 +585,7 @@ def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): def _validate_partial_date_slice(self, reso: Resolution): assert isinstance(reso, Resolution), (type(reso), reso) grp = reso.freq_group - freqn = get_freq_group(self.dtype.dtype_code) + freqn = self.dtype.freq_group if not grp < freqn: # TODO: we used to also check for diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 65f030223c7ca..05377e0c240b9 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -10,9 +10,9 @@ import matplotlib.units as units import numpy as np -from pandas._libs import lib, tslibs -from pandas._libs.tslibs import to_offset -from pandas._libs.tslibs.frequencies import FreqGroup, get_freq_group +from pandas._libs import lib +from pandas._libs.tslibs import Timestamp, to_offset +from pandas._libs.tslibs.dtypes import FreqGroup from pandas._libs.tslibs.offsets import BaseOffset from pandas.core.dtypes.common import ( @@ -45,7 +45,7 @@ def get_pairs(): pairs = [ - (tslibs.Timestamp, DatetimeConverter), + (Timestamp, DatetimeConverter), (Period, PeriodConverter), (pydt.datetime, DatetimeConverter), (pydt.date, DatetimeConverter), @@ -281,7 +281,7 @@ def try_parse(values): if isinstance(values, (datetime, pydt.date)): return _dt_to_float_ordinal(values) elif isinstance(values, np.datetime64): - return _dt_to_float_ordinal(tslibs.Timestamp(values)) + return _dt_to_float_ordinal(Timestamp(values)) elif isinstance(values, pydt.time): return dates.date2num(values) elif is_integer(values) or is_float(values): @@ -553,7 
+553,7 @@ def _daily_finder(vmin, vmax, freq: BaseOffset): elif dtype_code == FreqGroup.FR_DAY: periodsperyear = 365 periodspermonth = 28 - elif get_freq_group(dtype_code) == FreqGroup.FR_WK: + elif FreqGroup.get_freq_group(dtype_code) == FreqGroup.FR_WK: periodsperyear = 52 periodspermonth = 3 else: # pragma: no cover diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index 189a0cc2171ad..5383c1ff1c2c9 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -1,7 +1,8 @@ import pytest from pandas._libs.tslibs import Resolution, to_offset -from pandas._libs.tslibs.frequencies import _attrname_to_abbrevs, get_to_timestamp_base +from pandas._libs.tslibs.dtypes import _attrname_to_abbrevs +from pandas._libs.tslibs.frequencies import get_to_timestamp_base @pytest.mark.parametrize( From 1ea362edb811a4df3aeaacc2678d314c3cf8c3f3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 07:21:53 -0700 Subject: [PATCH 0090/1025] CLN: day->day_opt, remove unused case (#34762) --- pandas/_libs/tslibs/offsets.pyx | 37 ++++++++++++++------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 250ff608308d8..c9c2672c55be0 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3712,7 +3712,7 @@ cdef shift_quarters( const int64_t[:] dtindex, int quarters, int q1start_month, - object day, + object day_opt, int modby=3, ): """ @@ -3724,7 +3724,7 @@ cdef shift_quarters( dtindex : int64_t[:] timestamps for input dates quarters : int number of quarters to shift q1start_month : int month in which Q1 begins by convention - day : {'start', 'end', 'business_start', 'business_end'} + day_opt : {'start', 'end', 'business_start', 'business_end'} modby : int (3 for quarters, 12 for years) Returns @@ -3737,9 +3737,9 @@ cdef 
shift_quarters( int count = len(dtindex) int months_to_roll, months_since, n, compare_day bint roll_check - int64_t[:] out = np.empty(count, dtype='int64') + int64_t[:] out = np.empty(count, dtype="int64") - if day == 'start': + if day_opt == "start": with nogil: for i in range(count): if dtindex[i] == NPY_NAT: @@ -3763,7 +3763,7 @@ cdef shift_quarters( out[i] = dtstruct_to_dt64(&dts) - elif day == 'end': + elif day_opt == "end": with nogil: for i in range(count): if dtindex[i] == NPY_NAT: @@ -3792,7 +3792,7 @@ cdef shift_quarters( out[i] = dtstruct_to_dt64(&dts) - elif day == 'business_start': + elif day_opt == "business_start": with nogil: for i in range(count): if dtindex[i] == NPY_NAT: @@ -3823,7 +3823,7 @@ cdef shift_quarters( out[i] = dtstruct_to_dt64(&dts) - elif day == 'business_end': + elif day_opt == "business_end": with nogil: for i in range(count): if dtindex[i] == NPY_NAT: @@ -3863,12 +3863,12 @@ cdef shift_quarters( @cython.wraparound(False) @cython.boundscheck(False) -def shift_months(const int64_t[:] dtindex, int months, object day=None): +def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): """ Given an int64-based datetime index, shift all elements specified number of months using DateOffset semantics - day: {None, 'start', 'end'} + day_opt: {None, 'start', 'end', 'business_start', 'business_end'} * None: day of month * 'start' 1st day of month * 'end' last day of month @@ -3879,9 +3879,9 @@ def shift_months(const int64_t[:] dtindex, int months, object day=None): int count = len(dtindex) int months_to_roll bint roll_check - int64_t[:] out = np.empty(count, dtype='int64') + int64_t[:] out = np.empty(count, dtype="int64") - if day is None: + if day_opt is None: with nogil: for i in range(count): if dtindex[i] == NPY_NAT: @@ -3894,7 +3894,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day=None): dts.day = min(dts.day, get_days_in_month(dts.year, dts.month)) out[i] = dtstruct_to_dt64(&dts) - elif day == 
'start': + elif day_opt == "start": roll_check = False if months <= 0: months += 1 @@ -3918,7 +3918,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day=None): dts.day = 1 out[i] = dtstruct_to_dt64(&dts) - elif day == 'end': + elif day_opt == "end": roll_check = False if months > 0: months -= 1 @@ -3944,7 +3944,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day=None): dts.day = get_days_in_month(dts.year, dts.month) out[i] = dtstruct_to_dt64(&dts) - elif day == 'business_start': + elif day_opt == "business_start": with nogil: for i in range(count): if dtindex[i] == NPY_NAT: @@ -3964,7 +3964,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day=None): dts.day = get_firstbday(dts.year, dts.month) out[i] = dtstruct_to_dt64(&dts) - elif day == 'business_end': + elif day_opt == "business_end": with nogil: for i in range(count): if dtindex[i] == NPY_NAT: @@ -4060,13 +4060,11 @@ cdef int get_day_of_month(datetime other, day_opt) except? -1: Parameters ---------- other : datetime or Timestamp - day_opt : 'start', 'end', 'business_start', 'business_end', or int + day_opt : {'start', 'end', 'business_start', 'business_end'} 'start': returns 1 'end': returns last day of the month 'business_start': returns the first business day of the month 'business_end': returns the last business day of the month - int: returns the day in the month indicated by `other`, or the last of - day the month if the value exceeds in that month's number of days. Returns ------- @@ -4095,9 +4093,6 @@ cdef int get_day_of_month(datetime other, day_opt) except? 
-1: elif day_opt == 'business_end': # last business day of month return get_lastbday(other.year, other.month) - elif is_integer_object(day_opt): - days_in_month = get_days_in_month(other.year, other.month) - return min(day_opt, days_in_month) elif day_opt is None: # Note: unlike `shift_month`, get_day_of_month does not # allow day_opt = None From eda48bb59c1cd05581f978b25b5532174075107a Mon Sep 17 00:00:00 2001 From: Bharat Raghunathan Date: Sun, 14 Jun 2020 19:59:34 +0530 Subject: [PATCH 0091/1025] Bump up minimum numpy version in windows37 job (#34750) --- ci/azure/windows.yml | 2 +- ci/deps/azure-windows-37.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 187a5db99802f..87f1bfd2adb79 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -13,7 +13,7 @@ jobs: CONDA_PY: "36" PATTERN: "not slow and not network" - py37_np141: + py37_np18: ENV_FILE: ci/deps/azure-windows-37.yaml CONDA_PY: "37" PATTERN: "not slow and not network" diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index e491fd57b240b..889d5c1bcfcdd 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -22,7 +22,7 @@ dependencies: - matplotlib=2.2.* - moto - numexpr - - numpy=1.14.* + - numpy=1.18.* - openpyxl - pyarrow=0.14 - pytables From 965134d01eef1da3b1d48afa6df383123185bc75 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 14 Jun 2020 15:31:06 +0100 Subject: [PATCH 0092/1025] REF: refactor NDFrame.interpolate to avoid dispatching to fillna (#34752) --- pandas/core/generic.py | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6183638ab587e..823a0a6a35f9e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6888,42 +6888,33 @@ def interpolate( inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) - 
index = self._get_axis(axis) - if isinstance(self.index, MultiIndex) and method != "linear": + fillna_methods = ["ffill", "bfill", "pad", "backfill"] + should_transpose = axis == 1 and method not in fillna_methods + + obj = self.T if should_transpose else self + + if method not in fillna_methods: + axis = self._info_axis_number + + if isinstance(obj.index, MultiIndex) and method != "linear": raise ValueError( "Only `method=linear` interpolation is supported on MultiIndexes." ) - # for the methods backfill, bfill, pad, ffill limit_direction and limit_area - # are being ignored, see gh-26796 for more information - if method in ["backfill", "bfill", "pad", "ffill"]: - return self.fillna( - method=method, - axis=axis, - inplace=inplace, - limit=limit, - downcast=downcast, - ) - - # Currently we need this to call the axis correctly inside the various - # interpolation methods - if axis == 0: - df = self - else: - df = self.T - - if self.ndim == 2 and np.all(self.dtypes == np.dtype(object)): + if obj.ndim == 2 and np.all(obj.dtypes == np.dtype(object)): raise TypeError( "Cannot interpolate with all object-dtype columns " "in the DataFrame. Try setting at least one " "column to a numeric dtype." ) + # create/use the index if method == "linear": # prior default - index = np.arange(len(df.index)) + index = np.arange(len(obj.index)) else: + index = obj.index methods = {"index", "values", "nearest", "time"} is_numeric_or_datetime = ( is_numeric_dtype(index.dtype) @@ -6944,10 +6935,9 @@ def interpolate( "has not been implemented. Try filling " "those NaNs before interpolating." 
) - data = df._mgr - new_data = data.interpolate( + new_data = obj._mgr.interpolate( method=method, - axis=self._info_axis_number, + axis=axis, index=index, limit=limit, limit_direction=limit_direction, @@ -6958,7 +6948,7 @@ def interpolate( ) result = self._constructor(new_data) - if axis == 1: + if should_transpose: result = result.T if inplace: return self._update_inplace(result) From 2646f016566a59b4298142da3eb07626848585c5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 14 Jun 2020 09:34:35 -0500 Subject: [PATCH 0093/1025] PERF: avoid copy in replace (#34737) --- pandas/core/internals/blocks.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e2a778f729470..13b98279169fd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -906,8 +906,7 @@ def putmask( mask = _extract_bool_array(mask) assert not isinstance(new, (ABCIndexClass, ABCSeries, ABCDataFrame)) - new_values = self.values if inplace else self.values.copy() - + new_values = self.values # delay copy if possible. # if we are passed a scalar None, convert it here if not is_list_like(new) and isna(new) and not self.is_object: # FIXME: make sure we have compatible NA @@ -917,7 +916,7 @@ def putmask( # We only get here for non-Extension Blocks, so _try_coerce_args # is only relevant for DatetimeBlock and TimedeltaBlock if lib.is_scalar(new): - new = convert_scalar_for_putitemlike(new, new_values.dtype) + new = convert_scalar_for_putitemlike(new, self.values.dtype) if transpose: new_values = new_values.T @@ -929,6 +928,8 @@ def putmask( new = np.repeat(new, new_values.shape[-1]).reshape(self.shape) new = new.astype(new_values.dtype) + if new_values is self.values and not inplace: + new_values = new_values.copy() # we require exact matches between the len of the # values we are setting (or is compat). 
np.putmask # doesn't check this and will simply truncate / pad @@ -1000,6 +1001,8 @@ def f(mask, val, idx): return [self] if transpose: + if new_values is None: + new_values = self.values if inplace else self.values.copy() new_values = new_values.T return [self.make_block(new_values)] From b0fb30927e952a256d9c5edcc016089571ba449e Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 14 Jun 2020 15:46:34 +0100 Subject: [PATCH 0094/1025] BUG: don't plot colorbar if c is column containing colors (#34344) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/plotting/_matplotlib/core.py | 10 +++++++--- pandas/tests/plotting/test_frame.py | 11 +++++++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 92f7c0f6b59a3..b64edeff47a1e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -969,7 +969,7 @@ Plotting - Bug in :meth:`DataFrame.boxplot` and :meth:`DataFrame.plot.boxplot` lost color attributes of ``medianprops``, ``whiskerprops``, ``capprops`` and ``medianprops`` (:issue:`30346`) - Bug in :meth:`DataFrame.hist` where the order of ``column`` argument was ignored (:issue:`29235`) - Bug in :meth:`DataFrame.plot.scatter` that when adding multiple plots with different ``cmap``, colorbars alway use the first ``cmap`` (:issue:`33389`) - +- Bug in :meth:`DataFrame.plot.scatter` was adding a colorbar to the plot even if the argument `c` was assigned to a column containing color names (:issue:`34316`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 1d87c56ab959a..f3682e0a008a6 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -14,6 +14,7 @@ is_iterator, is_list_like, is_number, + is_numeric_dtype, ) from pandas.core.dtypes.generic import ( ABCDataFrame, @@ -952,9 +953,6 @@ def 
_make_plot(self): c_is_column = is_hashable(c) and c in self.data.columns - # plot a colorbar only if a colormap is provided or necessary - cb = self.kwds.pop("colorbar", self.colormap or c_is_column) - # pandas uses colormap, matplotlib uses cmap. cmap = self.colormap or "Greys" cmap = self.plt.cm.get_cmap(cmap) @@ -970,6 +968,12 @@ def _make_plot(self): else: c_values = c + # plot colorbar if + # 1. colormap is assigned, and + # 2.`c` is a column containing only numeric values + plot_colorbar = self.colormap or c_is_column + cb = self.kwds.pop("colorbar", is_numeric_dtype(c_values) and plot_colorbar) + if self.legend and hasattr(self, "label"): label = self.label else: diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index c84a09f21f46b..8992e27a78d6b 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1306,6 +1306,17 @@ def test_plot_scatter_with_c(self): float_array = np.array([0.0, 1.0]) df.plot.scatter(x="A", y="B", c=float_array, cmap="spring") + @pytest.mark.parametrize("cmap", [None, "Greys"]) + def test_scatter_with_c_column_name_with_colors(self, cmap): + # https://github.com/pandas-dev/pandas/issues/34316 + df = pd.DataFrame( + [[5.1, 3.5], [4.9, 3.0], [7.0, 3.2], [6.4, 3.2], [5.9, 3.0]], + columns=["length", "width"], + ) + df["species"] = ["r", "r", "g", "g", "b"] + ax = df.plot.scatter(x=0, y=1, c="species", cmap=cmap) + assert ax.collections[0].colorbar is None + def test_plot_scatter_with_s(self): # this refers to GH 32904 df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"],) From 9234a3de7fcf1ba04aa200507b4fa29891fe5140 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 14 Jun 2020 15:56:36 +0100 Subject: [PATCH 0095/1025] TYP: check_untyped_defs pandas.core.nanops (#34689) --- pandas/core/nanops.py | 20 +++++++++++++------- setup.cfg | 3 --- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/core/nanops.py 
b/pandas/core/nanops.py index 6b8518d8a47a0..e7e5e37bb7817 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -86,7 +86,7 @@ def __init__(self, name=None, **kwargs): self.name = name self.kwargs = kwargs - def __call__(self, alt): + def __call__(self, alt: F) -> F: bn_name = self.name or alt.__name__ try: @@ -130,7 +130,7 @@ def f( return result - return f + return cast(F, f) def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool: @@ -514,7 +514,12 @@ def nansum( @disallow(PeriodDtype) @bottleneck_switch() -def nanmean(values, axis=None, skipna=True, mask=None): +def nanmean( + values: np.ndarray, + axis: Optional[int] = None, + skipna: bool = True, + mask: Optional[np.ndarray] = None, +) -> float: """ Compute the mean of the element along an axis ignoring NaNs @@ -528,7 +533,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): Returns ------- - result : float + float Unless input is a float array, in which case use the same precision as the input array. @@ -558,6 +563,7 @@ def nanmean(values, axis=None, skipna=True, mask=None): the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum)) if axis is not None and getattr(the_sum, "ndim", False): + count = cast(np.ndarray, count) with np.errstate(all="ignore"): # suppress division by zero warnings the_mean = the_sum / count @@ -1205,17 +1211,17 @@ def _maybe_arg_null_out( def _get_counts( - values_shape: Tuple[int], + values_shape: Tuple[int, ...], mask: Optional[np.ndarray], axis: Optional[int], dtype: Dtype = float, -) -> Union[int, np.ndarray]: +) -> Union[int, float, np.ndarray]: """ Get the count of non-null values along an axis Parameters ---------- - values_shape : Tuple[int] + values_shape : tuple of int shape tuple from values ndarray, used if mask is None mask : Optional[ndarray[bool]] locations in values that should be considered missing diff --git a/setup.cfg b/setup.cfg index aaebff44139eb..49a57b7a525f0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -208,9 +208,6 @@ 
check_untyped_defs=False [mypy-pandas.core.missing] check_untyped_defs=False -[mypy-pandas.core.nanops] -check_untyped_defs=False - [mypy-pandas.core.ops.docstrings] check_untyped_defs=False From a71b1943bc21493d9fcb21bb41b7489a44825851 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 14 Jun 2020 16:02:04 +0100 Subject: [PATCH 0096/1025] BUG: Dataframe.groupby aggregations with categorical columns lead to incorrect results. (#32546) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/generic.py | 2 ++ .../tests/groupby/transform/test_transform.py | 33 +++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b64edeff47a1e..3f06b0c63a15e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -976,6 +976,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Bug in :meth:`Groupby.transform` was returning the wrong result when grouping by multiple keys of which some were categorical and others not (:issue:`32494`) - Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by column contains NaNs (:issue:`32841`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) - Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` where a large negative number would be returned when the number of non-null values was below ``min_count`` for nullable integer dtypes (:issue:`32861`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5894066dd33c8..db5df9818b0b0 100644 --- 
a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -546,6 +546,7 @@ def _transform_fast(self, result, func_nm: str) -> Series: builtin/cythonizable functions """ ids, _, ngroup = self.grouper.group_info + result = result.reindex(self.grouper.result_index, copy=False) cast = self._transform_should_cast(func_nm) out = algorithms.take_1d(result._values, ids) if cast: @@ -1496,6 +1497,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: # for each col, reshape to to size of original frame # by take operation ids, _, ngroup = self.grouper.group_info + result = result.reindex(self.grouper.result_index, copy=False) output = [] for i, _ in enumerate(result.columns): res = algorithms.take_1d(result.iloc[:, i].values, ids) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index e7bc3801a08a7..fd4ee2a81ebd8 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -1205,3 +1205,36 @@ def test_transform_lambda_indexing(): ), ) tm.assert_frame_equal(result, expected) + + +def test_categorical_and_not_categorical_key(observed): + # Checks that groupby-transform, when grouping by both a categorical + # and a non-categorical key, doesn't try to expand the output to include + # non-observed categories but instead matches the input shape. 
+ # GH 32494 + df_with_categorical = pd.DataFrame( + { + "A": pd.Categorical(["a", "b", "a"], categories=["a", "b", "c"]), + "B": [1, 2, 3], + "C": ["a", "b", "a"], + } + ) + df_without_categorical = pd.DataFrame( + {"A": ["a", "b", "a"], "B": [1, 2, 3], "C": ["a", "b", "a"]} + ) + + # DataFrame case + result = df_with_categorical.groupby(["A", "C"], observed=observed).transform("sum") + expected = df_without_categorical.groupby(["A", "C"]).transform("sum") + tm.assert_frame_equal(result, expected) + expected_explicit = pd.DataFrame({"B": [4, 2, 4]}) + tm.assert_frame_equal(result, expected_explicit) + + # Series case + result = df_with_categorical.groupby(["A", "C"], observed=observed)["B"].transform( + "sum" + ) + expected = df_without_categorical.groupby(["A", "C"])["B"].transform("sum") + tm.assert_series_equal(result, expected) + expected_explicit = pd.Series([4, 2, 4], name="B") + tm.assert_series_equal(result, expected_explicit) From 5ed02177db3b786ae4a56a7365c018a926029c01 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 14 Jun 2020 17:18:09 +0200 Subject: [PATCH 0097/1025] TST/REF: arithmetic tests for BooleanArray + consolidate with integer masked tests (#34623) --- pandas/_testing.py | 26 +++ pandas/core/arrays/boolean.py | 15 +- .../tests/arrays/boolean/test_arithmetic.py | 105 +++++++++--- .../tests/arrays/integer/test_arithmetic.py | 146 +--------------- pandas/tests/arrays/masked/test_arithmetic.py | 158 ++++++++++++++++++ pandas/tests/extension/base/ops.py | 12 +- 6 files changed, 285 insertions(+), 177 deletions(-) create mode 100644 pandas/tests/arrays/masked/test_arithmetic.py diff --git a/pandas/_testing.py b/pandas/_testing.py index 61eab6b8152e1..ebb53dd81682c 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -4,6 +4,7 @@ from datetime import datetime from functools import wraps import gzip +import operator import os from shutil import rmtree import string @@ -2758,3 +2759,28 @@ def get_cython_table_params(ndframe, 
func_names_and_expected): if name == func_name ] return results + + +def get_op_from_name(op_name: str) -> Callable: + """ + The operator function for a given op name. + + Parameters + ---------- + op_name : string + The op name, in form of "add" or "__add__". + + Returns + ------- + function + A function performing the operation. + """ + short_opname = op_name.strip("_") + try: + op = getattr(operator, short_opname) + except AttributeError: + # Assume it is the reverse operator + rop = getattr(operator, short_opname[1:]) + op = lambda x, y: rop(y, x) + + return op diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 5d791ffd20f01..9f1c2c6e668ad 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -717,11 +717,22 @@ def boolean_arithmetic_method(self, other): # nans propagate if mask is None: mask = self._mask + if other is libmissing.NA: + mask |= True else: mask = self._mask | mask - with np.errstate(all="ignore"): - result = op(self._data, other) + if other is libmissing.NA: + # if other is NA, the result will be all NA and we can't run the + # actual op, so we need to choose the resulting dtype manually + if op_name in {"floordiv", "rfloordiv", "mod", "rmod", "pow", "rpow"}: + dtype = "int8" + else: + dtype = "bool" + result = np.zeros(len(self._data), dtype=dtype) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) # divmod returns a tuple if op_name == "divmod": diff --git a/pandas/tests/arrays/boolean/test_arithmetic.py b/pandas/tests/arrays/boolean/test_arithmetic.py index df4c218cbf9bf..1a4ab9799e8e5 100644 --- a/pandas/tests/arrays/boolean/test_arithmetic.py +++ b/pandas/tests/arrays/boolean/test_arithmetic.py @@ -1,8 +1,10 @@ +import operator + import numpy as np import pytest import pandas as pd -from pandas.tests.extension.base import BaseOpsUtil +import pandas._testing as tm @pytest.fixture @@ -13,30 +15,87 @@ def data(): ) -class TestArithmeticOps(BaseOpsUtil): - def 
test_error(self, data, all_arithmetic_operators): - # invalid ops +@pytest.fixture +def left_array(): + return pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean") - op = all_arithmetic_operators - s = pd.Series(data) - ops = getattr(s, op) - opa = getattr(data, op) - # invalid scalars - with pytest.raises(TypeError): - ops("foo") - with pytest.raises(TypeError): - ops(pd.Timestamp("20180101")) +@pytest.fixture +def right_array(): + return pd.array([True, False, None] * 3, dtype="boolean") + - # invalid array-likes - if op not in ("__mul__", "__rmul__"): - # TODO(extension) numpy's mul with object array sees booleans as numbers - with pytest.raises(TypeError): - ops(pd.Series("foo", index=s.index)) +# Basic test for the arithmetic array ops +# ----------------------------------------------------------------------------- - # 2d - result = opa(pd.DataFrame({"A": s})) - assert result is NotImplemented - with pytest.raises(NotImplementedError): - opa(np.arange(len(s)).reshape(-1, len(s))) +@pytest.mark.parametrize( + "opname, exp", + [ + ("add", [True, True, None, True, False, None, None, None, None]), + ("mul", [True, False, None, False, False, None, None, None, None]), + ], + ids=["add", "mul"], +) +def test_add_mul(left_array, right_array, opname, exp): + op = getattr(operator, opname) + result = op(left_array, right_array) + expected = pd.array(exp, dtype="boolean") + tm.assert_extension_array_equal(result, expected) + + +def test_sub(left_array, right_array): + with pytest.raises(TypeError): + # numpy points to ^ operator or logical_xor function instead + left_array - right_array + + +def test_div(left_array, right_array): + # for now division gives a float numpy array + result = left_array / right_array + expected = np.array( + [1.0, np.inf, np.nan, 0.0, np.nan, np.nan, np.nan, np.nan, np.nan], + dtype="float64", + ) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize( + "opname", + [ + "floordiv", + "mod", + pytest.param( + 
"pow", marks=pytest.mark.xfail(reason="TODO follow int8 behaviour? GH34686") + ), + ], +) +def test_op_int8(left_array, right_array, opname): + op = getattr(operator, opname) + result = op(left_array, right_array) + expected = op(left_array.astype("Int8"), right_array.astype("Int8")) + tm.assert_extension_array_equal(result, expected) + + +# Test generic characteristics / errors +# ----------------------------------------------------------------------------- + + +def test_error_invalid_values(data, all_arithmetic_operators): + # invalid ops + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + + # invalid scalars + with pytest.raises(TypeError): + ops("foo") + with pytest.raises(TypeError): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + if op not in ("__mul__", "__rmul__"): + # TODO(extension) numpy's mul with object array sees booleans as numbers + with pytest.raises(TypeError): + ops(pd.Series("foo", index=s.index)) diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index a6c47f3192175..d309f6423e0c1 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -5,23 +5,9 @@ import pandas as pd import pandas._testing as tm -from pandas.core.arrays import ExtensionArray, integer_array +from pandas.core.arrays import integer_array import pandas.core.ops as ops - -# TODO need to use existing utility function or move this somewhere central -def get_op_from_name(op_name): - short_opname = op_name.strip("_") - try: - op = getattr(operator, short_opname) - except AttributeError: - # Assume it is the reverse operator - rop = getattr(operator, short_opname[1:]) - op = lambda x, y: rop(y, x) - - return op - - # Basic test for the arithmetic array ops # ----------------------------------------------------------------------------- @@ -151,55 +137,6 @@ def test_rpow_one_to_na(): tm.assert_numpy_array_equal(result, expected) -# 
Test equivalence of scalars, numpy arrays with array ops -# ----------------------------------------------------------------------------- - - -def test_array_scalar_like_equivalence(data, all_arithmetic_operators): - op = get_op_from_name(all_arithmetic_operators) - - scalar = 2 - scalar_array = pd.array([2] * len(data), dtype=data.dtype) - - # TODO also add len-1 array (np.array([2], dtype=data.dtype.numpy_dtype)) - for scalar in [2, data.dtype.type(2)]: - result = op(data, scalar) - expected = op(data, scalar_array) - if isinstance(expected, ExtensionArray): - tm.assert_extension_array_equal(result, expected) - else: - # TODO div still gives float ndarray -> remove this once we have Float EA - tm.assert_numpy_array_equal(result, expected) - - -def test_array_NA(data, all_arithmetic_operators): - if "truediv" in all_arithmetic_operators: - pytest.skip("division with pd.NA raises") - op = get_op_from_name(all_arithmetic_operators) - - scalar = pd.NA - scalar_array = pd.array([pd.NA] * len(data), dtype=data.dtype) - - result = op(data, scalar) - expected = op(data, scalar_array) - tm.assert_extension_array_equal(result, expected) - - -def test_numpy_array_equivalence(data, all_arithmetic_operators): - op = get_op_from_name(all_arithmetic_operators) - - numpy_array = np.array([2] * len(data), dtype=data.dtype.numpy_dtype) - pd_array = pd.array(numpy_array, dtype=data.dtype) - - result = op(data, numpy_array) - expected = op(data, pd_array) - if isinstance(expected, ExtensionArray): - tm.assert_extension_array_equal(result, expected) - else: - # TODO div still gives float ndarray -> remove this once we have Float EA - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("other", [0, 0.5]) def test_numpy_zero_dim_ndarray(other): arr = integer_array([1, None, 2]) @@ -208,53 +145,7 @@ def test_numpy_zero_dim_ndarray(other): tm.assert_equal(result, expected) -# Test equivalence with Series and DataFrame ops -# 
----------------------------------------------------------------------------- - - -def test_frame(data, all_arithmetic_operators): - op = get_op_from_name(all_arithmetic_operators) - - # DataFrame with scalar - df = pd.DataFrame({"A": data}) - scalar = 2 - - result = op(df, scalar) - expected = pd.DataFrame({"A": op(data, scalar)}) - tm.assert_frame_equal(result, expected) - - -def test_series(data, all_arithmetic_operators): - op = get_op_from_name(all_arithmetic_operators) - - s = pd.Series(data) - - # Series with scalar - scalar = 2 - result = op(s, scalar) - expected = pd.Series(op(data, scalar)) - tm.assert_series_equal(result, expected) - - # Series with np.ndarray - other = np.ones(len(data), dtype=data.dtype.type) - result = op(s, other) - expected = pd.Series(op(data, other)) - tm.assert_series_equal(result, expected) - - # Series with pd.array - other = pd.array(np.ones(len(data)), dtype=data.dtype) - result = op(s, other) - expected = pd.Series(op(data, other)) - tm.assert_series_equal(result, expected) - - # Series with Series - other = pd.Series(np.ones(len(data)), dtype=data.dtype) - result = op(s, other) - expected = pd.Series(op(data, other.array)) - tm.assert_series_equal(result, expected) - - -# Test generic charachteristics / errors +# Test generic characteristics / errors # ----------------------------------------------------------------------------- @@ -291,35 +182,6 @@ def test_error_invalid_values(data, all_arithmetic_operators): ops(pd.Series(pd.date_range("20180101", periods=len(s)))) -def test_error_invalid_object(data, all_arithmetic_operators): - - op = all_arithmetic_operators - opa = getattr(data, op) - - # 2d -> return NotImplemented - result = opa(pd.DataFrame({"A": data})) - assert result is NotImplemented - - msg = r"can only perform ops with 1-d structures" - with pytest.raises(NotImplementedError, match=msg): - opa(np.arange(len(data)).reshape(-1, len(data))) - - -def test_error_len_mismatch(all_arithmetic_operators): - # 
operating with a list-like with non-matching length raises - op = get_op_from_name(all_arithmetic_operators) - - data = pd.array([1, 2, 3], dtype="Int64") - - for other in [[1, 2], np.array([1.0, 2.0])]: - with pytest.raises(ValueError, match="Lengths must match"): - op(data, other) - - s = pd.Series(data) - with pytest.raises(ValueError, match="Lengths must match"): - op(s, other) - - # Various # ----------------------------------------------------------------------------- @@ -328,7 +190,7 @@ def test_error_len_mismatch(all_arithmetic_operators): def test_arith_coerce_scalar(data, all_arithmetic_operators): - op = get_op_from_name(all_arithmetic_operators) + op = tm.get_op_from_name(all_arithmetic_operators) s = pd.Series(data) other = 0.01 @@ -345,7 +207,7 @@ def test_arith_coerce_scalar(data, all_arithmetic_operators): def test_arithmetic_conversion(all_arithmetic_operators, other): # if we have a float operand we should have a float result # if that is equal to an integer - op = get_op_from_name(all_arithmetic_operators) + op = tm.get_op_from_name(all_arithmetic_operators) s = pd.Series([1, 2, 3], dtype="Int64") result = op(s, other) diff --git a/pandas/tests/arrays/masked/test_arithmetic.py b/pandas/tests/arrays/masked/test_arithmetic.py new file mode 100644 index 0000000000000..db938c36fe7ae --- /dev/null +++ b/pandas/tests/arrays/masked/test_arithmetic.py @@ -0,0 +1,158 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import ExtensionArray + +arrays = [pd.array([1, 2, 3, None], dtype=dtype) for dtype in tm.ALL_EA_INT_DTYPES] +scalars = [2] * len(arrays) +arrays += [pd.array([True, False, True, None], dtype="boolean")] +scalars += [False] + + +@pytest.fixture(params=zip(arrays, scalars), ids=[a.dtype.name for a in arrays]) +def data(request): + return request.param + + +def check_skip(data, op_name): + if isinstance(data.dtype, pd.BooleanDtype) and "sub" in op_name: + pytest.skip("subtract not 
implemented for boolean") + + +# Test equivalence of scalars, numpy arrays with array ops +# ----------------------------------------------------------------------------- + + +def test_array_scalar_like_equivalence(data, all_arithmetic_operators): + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + scalar_array = pd.array([scalar] * len(data), dtype=data.dtype) + + # TODO also add len-1 array (np.array([scalar], dtype=data.dtype.numpy_dtype)) + for scalar in [scalar, data.dtype.type(scalar)]: + result = op(data, scalar) + expected = op(data, scalar_array) + if isinstance(expected, ExtensionArray): + tm.assert_extension_array_equal(result, expected) + else: + # TODO div still gives float ndarray -> remove this once we have Float EA + tm.assert_numpy_array_equal(result, expected) + + +def test_array_NA(data, all_arithmetic_operators): + if "truediv" in all_arithmetic_operators: + pytest.skip("division with pd.NA raises") + data, _ = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + scalar = pd.NA + scalar_array = pd.array([pd.NA] * len(data), dtype=data.dtype) + + result = op(data, scalar) + expected = op(data, scalar_array) + tm.assert_extension_array_equal(result, expected) + + +def test_numpy_array_equivalence(data, all_arithmetic_operators): + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + numpy_array = np.array([scalar] * len(data), dtype=data.dtype.numpy_dtype) + pd_array = pd.array(numpy_array, dtype=data.dtype) + + result = op(data, numpy_array) + expected = op(data, pd_array) + if isinstance(expected, ExtensionArray): + tm.assert_extension_array_equal(result, expected) + else: + # TODO div still gives float ndarray -> remove this once we have Float EA + tm.assert_numpy_array_equal(result, expected) + + +# Test equivalence with Series and DataFrame ops +# 
----------------------------------------------------------------------------- + + +def test_frame(data, all_arithmetic_operators): + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + # DataFrame with scalar + df = pd.DataFrame({"A": data}) + + result = op(df, scalar) + expected = pd.DataFrame({"A": op(data, scalar)}) + tm.assert_frame_equal(result, expected) + + +def test_series(data, all_arithmetic_operators): + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + check_skip(data, all_arithmetic_operators) + + s = pd.Series(data) + + # Series with scalar + result = op(s, scalar) + expected = pd.Series(op(data, scalar)) + tm.assert_series_equal(result, expected) + + # Series with np.ndarray + other = np.array([scalar] * len(data), dtype=data.dtype.numpy_dtype) + result = op(s, other) + expected = pd.Series(op(data, other)) + tm.assert_series_equal(result, expected) + + # Series with pd.array + other = pd.array([scalar] * len(data), dtype=data.dtype) + result = op(s, other) + expected = pd.Series(op(data, other)) + tm.assert_series_equal(result, expected) + + # Series with Series + other = pd.Series([scalar] * len(data), dtype=data.dtype) + result = op(s, other) + expected = pd.Series(op(data, other.array)) + tm.assert_series_equal(result, expected) + + +# Test generic characteristics / errors +# ----------------------------------------------------------------------------- + + +def test_error_invalid_object(data, all_arithmetic_operators): + data, _ = data + + op = all_arithmetic_operators + opa = getattr(data, op) + + # 2d -> return NotImplemented + result = opa(pd.DataFrame({"A": data})) + assert result is NotImplemented + + msg = r"can only perform ops with 1-d structures" + with pytest.raises(NotImplementedError, match=msg): + opa(np.arange(len(data)).reshape(-1, len(data))) + + +def test_error_len_mismatch(data, all_arithmetic_operators): + # operating with a list-like 
with non-matching length raises + data, scalar = data + op = tm.get_op_from_name(all_arithmetic_operators) + + other = [scalar] * (len(data) - 1) + + for other in [other, np.array(other)]: + with pytest.raises(ValueError, match="Lengths must match"): + op(data, other) + + s = pd.Series(data) + with pytest.raises(ValueError, match="Lengths must match"): + op(s, other) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 188893c8b067c..359acf230ce14 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -1,9 +1,9 @@ -import operator from typing import Optional, Type import pytest import pandas as pd +import pandas._testing as tm from pandas.core import ops from .base import BaseExtensionTests @@ -11,15 +11,7 @@ class BaseOpsUtil(BaseExtensionTests): def get_op_from_name(self, op_name): - short_opname = op_name.strip("_") - try: - op = getattr(operator, short_opname) - except AttributeError: - # Assume it is the reverse operator - rop = getattr(operator, short_opname[1:]) - op = lambda x, y: rop(y, x) - - return op + return tm.get_op_from_name(op_name) def check_opname(self, s, op_name, other, exc=Exception): op = self.get_op_from_name(op_name) From 74454cc905e5d661fbba573afa1c46a0aa0defc6 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sun, 14 Jun 2020 10:22:15 -0500 Subject: [PATCH 0098/1025] ENH: Implement groupby.sample (#34069) --- doc/source/reference/groupby.rst | 1 + doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/generic.py | 4 + pandas/core/groupby/base.py | 1 + pandas/core/groupby/groupby.py | 113 ++++++++++++++++++++++ pandas/tests/groupby/test_sample.py | 125 +++++++++++++++++++++++++ pandas/tests/groupby/test_whitelist.py | 1 + 7 files changed, 246 insertions(+) create mode 100644 pandas/tests/groupby/test_sample.py diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 5f6bef2579d27..76cb53559f334 
100644 --- a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -116,6 +116,7 @@ application to columns of a specific data type. DataFrameGroupBy.quantile DataFrameGroupBy.rank DataFrameGroupBy.resample + DataFrameGroupBy.sample DataFrameGroupBy.shift DataFrameGroupBy.size DataFrameGroupBy.skew diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3f06b0c63a15e..e680c2db55a43 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -275,6 +275,7 @@ Other enhancements such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`) - :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) - :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`) +- :class:`~pandas.core.groupby.generic.DataFrameGroupBy` and :class:`~pandas.core.groupby.generic.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`) - :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`) - The ``ExtensionArray`` class has now an :meth:`~pandas.arrays.ExtensionArray.equals` method, similarly to :meth:`Series.equals` (:issue:`27081`). diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 823a0a6a35f9e..c340460857b9f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4868,6 +4868,10 @@ def sample( See Also -------- + DataFrameGroupBy.sample: Generates random samples from each group of a + DataFrame object. + SeriesGroupBy.sample: Generates random samples from each group of a + Series object. 
numpy.random.choice: Generates a random sample from a given 1-D numpy array. diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 363286704ba95..08352d737dee0 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -180,6 +180,7 @@ def _gotitem(self, key, ndim, subset=None): "tail", "take", "transform", + "sample", ] ) # Valid values of `name` for `groupby.transform(name)` diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e385a78142ba5..c2be8d96402df 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -23,6 +23,7 @@ class providing the base-class of operations. List, Mapping, Optional, + Sequence, Tuple, Type, TypeVar, @@ -2695,6 +2696,118 @@ def _reindex_output( return output.reset_index(drop=True) + def sample( + self, + n: Optional[int] = None, + frac: Optional[float] = None, + replace: bool = False, + weights: Optional[Union[Sequence, Series]] = None, + random_state=None, + ): + """ + Return a random sample of items from each group. + + You can use `random_state` for reproducibility. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + n : int, optional + Number of items to return for each group. Cannot be used with + `frac` and must be no larger than the smallest group unless + `replace` is True. Default is one if `frac` is None. + frac : float, optional + Fraction of items to return. Cannot be used with `n`. + replace : bool, default False + Allow or disallow sampling of the same row more than once. + weights : list-like, optional + Default None results in equal probability weighting. + If passed a list-like then values must have the same length as + the underlying DataFrame or Series object and will be used as + sampling probabilities after normalization within each group. + Values must be non-negative with at least one positive element + within each group. 
+ random_state : int, array-like, BitGenerator, np.random.RandomState, optional + If int, array-like, or BitGenerator (NumPy>=1.17), seed for + random number generator + If np.random.RandomState, use as numpy RandomState object. + + Returns + ------- + Series or DataFrame + A new object of same type as caller containing items randomly + sampled within each group from the caller object. + + See Also + -------- + DataFrame.sample: Generate random samples from a DataFrame object. + numpy.random.choice: Generate a random sample from a given 1-D numpy + array. + + Examples + -------- + >>> df = pd.DataFrame( + ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)} + ... ) + >>> df + a b + 0 red 0 + 1 red 1 + 2 blue 2 + 3 blue 3 + 4 black 4 + 5 black 5 + + Select one row at random for each distinct value in column a. The + `random_state` argument can be used to guarantee reproducibility: + + >>> df.groupby("a").sample(n=1, random_state=1) + a b + 4 black 4 + 2 blue 2 + 1 red 1 + + Set `frac` to sample fixed proportions rather than counts: + + >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2) + 5 5 + 2 2 + 0 0 + Name: b, dtype: int64 + + Control sample probabilities within groups by setting weights: + + >>> df.groupby("a").sample( + ... n=1, + ... weights=[1, 1, 1, 0, 0, 1], + ... random_state=1, + ... 
) + a b + 5 black 5 + 2 blue 2 + 0 red 0 + """ + from pandas.core.reshape.concat import concat + + if weights is not None: + weights = Series(weights, index=self._selected_obj.index) + ws = [weights[idx] for idx in self.indices.values()] + else: + ws = [None] * self.ngroups + + if random_state is not None: + random_state = com.random_state(random_state) + + samples = [ + obj.sample( + n=n, frac=frac, replace=replace, weights=w, random_state=random_state + ) + for (_, obj), w in zip(self, ws) + ] + + return concat(samples, axis=self.axis) + @doc(GroupBy) def get_groupby( diff --git a/pandas/tests/groupby/test_sample.py b/pandas/tests/groupby/test_sample.py new file mode 100644 index 0000000000000..412e3e8f732de --- /dev/null +++ b/pandas/tests/groupby/test_sample.py @@ -0,0 +1,125 @@ +import pytest + +from pandas import DataFrame, Index, Series +import pandas._testing as tm + + +@pytest.mark.parametrize("n, frac", [(2, None), (None, 0.2)]) +def test_groupby_sample_balanced_groups_shape(n, frac): + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(n=n, frac=frac) + values = [1] * 2 + [2] * 2 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=n, frac=frac) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_unbalanced_groups_shape(): + values = [1] * 10 + [2] * 20 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(n=5) + values = [1] * 5 + [2] * 5 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=5) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_index_value_spans_groups(): + values = [1] * 3 + [2] * 3 + df = 
DataFrame({"a": values, "b": values}, index=[1, 2, 2, 2, 2, 2]) + + result = df.groupby("a").sample(n=2) + values = [1] * 2 + [2] * 2 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=2) + expected = Series(values, name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_n_and_frac_raises(): + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + msg = "Please enter a value for `frac` OR `n`, not both" + + with pytest.raises(ValueError, match=msg): + df.groupby("a").sample(n=1, frac=1.0) + + with pytest.raises(ValueError, match=msg): + df.groupby("a")["b"].sample(n=1, frac=1.0) + + +def test_groupby_sample_frac_gt_one_without_replacement_raises(): + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + msg = "Replace has to be set to `True` when upsampling the population `frac` > 1." + + with pytest.raises(ValueError, match=msg): + df.groupby("a").sample(frac=1.5, replace=False) + + with pytest.raises(ValueError, match=msg): + df.groupby("a")["b"].sample(frac=1.5, replace=False) + + +@pytest.mark.parametrize("n", [-1, 1.5]) +def test_groupby_sample_invalid_n_raises(n): + df = DataFrame({"a": [1, 2], "b": [1, 2]}) + + if n < 0: + msg = "Please provide positive value" + else: + msg = "Only integers accepted as `n` values" + + with pytest.raises(ValueError, match=msg): + df.groupby("a").sample(n=n) + + with pytest.raises(ValueError, match=msg): + df.groupby("a")["b"].sample(n=n) + + +def test_groupby_sample_oversample(): + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(frac=2.0, replace=True) + values = [1] * 20 + [2] * 20 + expected = DataFrame({"a": values, "b": values}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(frac=2.0, replace=True) + expected = Series(values, name="b", index=result.index) + 
tm.assert_series_equal(result, expected) + + +def test_groupby_sample_without_n_or_frac(): + values = [1] * 10 + [2] * 10 + df = DataFrame({"a": values, "b": values}) + + result = df.groupby("a").sample(n=None, frac=None) + expected = DataFrame({"a": [1, 2], "b": [1, 2]}, index=result.index) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=None, frac=None) + expected = Series([1, 2], name="b", index=result.index) + tm.assert_series_equal(result, expected) + + +def test_groupby_sample_with_weights(): + values = [1] * 2 + [2] * 2 + df = DataFrame({"a": values, "b": values}, index=Index(["w", "x", "y", "z"])) + + result = df.groupby("a").sample(n=2, replace=True, weights=[1, 0, 1, 0]) + expected = DataFrame({"a": values, "b": values}, index=Index(["w", "w", "y", "y"])) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].sample(n=2, replace=True, weights=[1, 0, 1, 0]) + expected = Series(values, name="b", index=Index(["w", "w", "y", "y"])) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 6b33049a664de..1598cc24ba6fb 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -328,6 +328,7 @@ def test_tab_completion(mframe): "rolling", "expanding", "pipe", + "sample", } assert results == expected From 21abf8f93ad7d5f5a6c1bd41a92c0c68eb671959 Mon Sep 17 00:00:00 2001 From: Josh Dimarsky <24758845+yehoshuadimarsky@users.noreply.github.com> Date: Sun, 14 Jun 2020 11:51:16 -0400 Subject: [PATCH 0099/1025] DOC: Add bcpandas to Ecosystem in docs (#34558) --- doc/source/ecosystem.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 62065f016e438..72e24e34bc5c1 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -320,6 +320,20 @@ provide a pandas-like and pandas-compatible toolkit for analytics 
on multi- dimensional arrays, rather than the tabular data for which pandas excels. +.. _ecosystem.io: + +IO +-- + +`BCPandas `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +BCPandas provides high performance writes from pandas to Microsoft SQL Server, +far exceeding the performance of the native ``df.to_sql`` method. Internally, it uses +Microsoft's BCP utility, but the complexity is fully abstracted away from the end user. +Rigorously tested, it is a complete replacement for ``df.to_sql``. + + .. _ecosystem.out-of-core: Out-of-core From 8451a67260f8069d9b71b5f66291d5f691f35852 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Sun, 14 Jun 2020 08:57:49 -0700 Subject: [PATCH 0100/1025] Removed __div__ impls (#34718) --- ci/deps/azure-37-locale.yaml | 3 +-- environment.yml | 2 +- pandas/_libs/interval.pyx | 5 ----- pandas/_libs/tslibs/nattype.pyx | 3 --- requirements-dev.txt | 2 +- 5 files changed, 3 insertions(+), 12 deletions(-) diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 25ee821afe7bd..31155ac93931a 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -5,8 +5,7 @@ dependencies: - python=3.7.* # tools - # Cython pin for https://github.com/pandas-dev/pandas/issues/34704 - - cython==0.29.19 + - cython>=0.29.16 - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio diff --git a/environment.yml b/environment.yml index bfe0e78c891cf..b81404094fa4c 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - asv # building - - cython=0.29.19 + - cython>=0.29.16 # code checks - black=19.10b0 diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index b5f5ef0a3f593..95881ebf1385c 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -424,11 +424,6 @@ cdef class Interval(IntervalMixin): return Interval(y.left * self, y.right * self, closed=y.closed) return NotImplemented - def __div__(self, y): - if isinstance(y, numbers.Number): - return 
Interval(self.left / y, self.right / y, closed=self.closed) - return NotImplemented - def __truediv__(self, y): if isinstance(y, numbers.Number): return Interval(self.left / y, self.right / y, closed=self.closed) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index f079c5157eeb3..71f151e6eb876 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -221,9 +221,6 @@ cdef class _NaT(datetime): def __neg__(self): return NaT - def __div__(self, other): - return _nat_divide_op(self, other) - def __truediv__(self, other): return _nat_divide_op(self, other) diff --git a/requirements-dev.txt b/requirements-dev.txt index 791dc7cd79128..754ec7ae28748 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,7 +5,7 @@ numpy>=1.15 python-dateutil>=2.7.3 pytz asv -cython==0.29.19 +cython>=0.29.16 black==19.10b0 cpplint flake8<3.8.0 From 9ecb22c8b3300919d389985f511b2c029ba5aa4d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 09:39:02 -0700 Subject: [PATCH 0101/1025] REF: make get_day_of_month nogil (#34764) --- pandas/_libs/tslibs/np_datetime.pxd | 1 + pandas/_libs/tslibs/np_datetime.pyx | 6 +++- pandas/_libs/tslibs/offsets.pyx | 49 +++++++++++++++++++---------- 3 files changed, 38 insertions(+), 18 deletions(-) diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 038632e1575c3..eebdcb3ace507 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -63,6 +63,7 @@ cdef void td64_to_tdstruct(int64_t td64, pandas_timedeltastruct* out) nogil cdef int64_t pydatetime_to_dt64(datetime val, npy_datetimestruct *dts) cdef int64_t pydate_to_dt64(date val, npy_datetimestruct *dts) +cdef void pydate_to_dtstruct(date val, npy_datetimestruct *dts) cdef npy_datetime get_datetime64_value(object obj) nogil cdef npy_timedelta get_timedelta64_value(object obj) nogil diff --git a/pandas/_libs/tslibs/np_datetime.pyx 
b/pandas/_libs/tslibs/np_datetime.pyx index 5ac0e4fa44bee..31cc55ad981bb 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -152,12 +152,16 @@ cdef inline int64_t pydatetime_to_dt64(datetime val, return dtstruct_to_dt64(dts) -cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts): +cdef inline void pydate_to_dtstruct(date val, npy_datetimestruct *dts): dts.year = PyDateTime_GET_YEAR(val) dts.month = PyDateTime_GET_MONTH(val) dts.day = PyDateTime_GET_DAY(val) dts.hour = dts.min = dts.sec = dts.us = 0 dts.ps = dts.as = 0 + return + +cdef inline int64_t pydate_to_dt64(date val, npy_datetimestruct *dts): + pydate_to_dtstruct(val, dts) return dtstruct_to_dt64(dts) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index c9c2672c55be0..3d6a9c2310c2f 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -42,7 +42,11 @@ from pandas._libs.tslibs.conversion cimport ( ) from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, dtstruct_to_dt64, dt64_to_dtstruct) + npy_datetimestruct, + dtstruct_to_dt64, + dt64_to_dtstruct, + pydate_to_dtstruct, +) from pandas._libs.tslibs.timezones cimport utc_pytz as UTC from pandas._libs.tslibs.tzconversion cimport tz_convert_single @@ -607,7 +611,10 @@ cdef class BaseOffset: def _get_offset_day(self, datetime other): # subclass must implement `_day_opt`; calling from the base class # will raise NotImplementedError. 
- return get_day_of_month(other, self._day_opt) + cdef: + npy_datetimestruct dts + pydate_to_dtstruct(other, &dts) + return get_day_of_month(&dts, self._day_opt) def is_on_offset(self, dt) -> bool: if self.normalize and not _is_normalized(dt): @@ -1864,10 +1871,11 @@ cdef class YearOffset(SingleConstructorOffset): def _get_offset_day(self, other) -> int: # override BaseOffset method to use self.month instead of other.month - # TODO: there may be a more performant way to do this - return get_day_of_month( - other.replace(month=self.month), self._day_opt - ) + cdef: + npy_datetimestruct dts + pydate_to_dtstruct(other, &dts) + dts.month = self.month + return get_day_of_month(&dts, self._day_opt) @apply_wraps def apply(self, other): @@ -4052,14 +4060,14 @@ def shift_month(stamp: datetime, months: int, return stamp.replace(year=year, month=month, day=day) -cdef int get_day_of_month(datetime other, day_opt) except? -1: +cdef int get_day_of_month(npy_datetimestruct* dts, day_opt) nogil except? -1: """ Find the day in `other`'s month that satisfies a DateOffset's is_on_offset policy, as described by the `day_opt` argument. Parameters ---------- - other : datetime or Timestamp + dts : npy_datetimestruct* day_opt : {'start', 'end', 'business_start', 'business_end'} 'start': returns 1 'end': returns last day of the month @@ -4085,20 +4093,20 @@ cdef int get_day_of_month(datetime other, day_opt) except? 
-1: if day_opt == 'start': return 1 elif day_opt == 'end': - days_in_month = get_days_in_month(other.year, other.month) + days_in_month = get_days_in_month(dts.year, dts.month) return days_in_month elif day_opt == 'business_start': # first business day of month - return get_firstbday(other.year, other.month) + return get_firstbday(dts.year, dts.month) elif day_opt == 'business_end': # last business day of month - return get_lastbday(other.year, other.month) + return get_lastbday(dts.year, dts.month) + elif day_opt is not None: + raise ValueError(day_opt) elif day_opt is None: # Note: unlike `shift_month`, get_day_of_month does not # allow day_opt = None raise NotImplementedError - else: - raise ValueError(day_opt) cpdef int roll_convention(int other, int n, int compare) nogil: @@ -4151,6 +4159,9 @@ def roll_qtrday(other: datetime, n: int, month: int, """ cdef: int months_since + npy_datetimestruct dts + pydate_to_dtstruct(other, &dts) + # TODO: Merge this with roll_yearday by setting modby=12 there? # code de-duplication versus perf hit? # TODO: with small adjustments this could be used in shift_quarters @@ -4158,14 +4169,14 @@ def roll_qtrday(other: datetime, n: int, month: int, if n > 0: if months_since < 0 or (months_since == 0 and - other.day < get_day_of_month(other, + other.day < get_day_of_month(&dts, day_opt)): # pretend to roll back if on same month but # before compare_day n -= 1 else: if months_since > 0 or (months_since == 0 and - other.day > get_day_of_month(other, + other.day > get_day_of_month(&dts, day_opt)): # make sure to roll forward, so negate n += 1 @@ -4232,18 +4243,22 @@ def roll_yearday(other: datetime, n: int, month: int, day_opt: object) -> int: -6 """ + cdef: + npy_datetimestruct dts + pydate_to_dtstruct(other, &dts) + # Note: The other.day < ... condition will never hold when day_opt=='start' # and the other.day > ... condition will never hold when day_opt=='end'. # At some point these extra checks may need to be optimized away. 
# But that point isn't today. if n > 0: if other.month < month or (other.month == month and - other.day < get_day_of_month(other, + other.day < get_day_of_month(&dts, day_opt)): n -= 1 else: if other.month > month or (other.month == month and - other.day > get_day_of_month(other, + other.day > get_day_of_month(&dts, day_opt)): n += 1 return n From 0e6b2f09df5c2f53860cdca34e7a8c1aa7e2a117 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 14 Jun 2020 11:39:58 -0500 Subject: [PATCH 0102/1025] BUG: Fixed regression in PeriodIndex loc (#34736) Closes https://github.com/pandas-dev/pandas/issues/33964 --- pandas/core/indexes/period.py | 6 +++++- pandas/tests/indexes/period/test_indexing.py | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 2022a4a563678..31783f6dbaaf7 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -514,7 +514,11 @@ def get_loc(self, key, method=None, tolerance=None): # _get_string_slice will handle cases where grp < freqn assert grp >= freqn - if grp == freqn: + # BusinessDay is a bit strange. It has a *lower* code, but we never parse + # a string as "BusinessDay" resolution, just Day. 
+ if grp == freqn or ( + reso == Resolution.RESO_DAY and self.dtype.freq.name == "B" + ): key = Period(asdt, freq=self.freq) loc = self.get_loc(key, method=method, tolerance=tolerance) return loc diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index eaba0bb3793b2..12454c20d2bb4 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -693,6 +693,12 @@ def test_get_value(self): result2 = idx2.get_value(input2, p1) tm.assert_series_equal(result2, expected2) + def test_loc_str(self): + # https://github.com/pandas-dev/pandas/issues/33964 + index = pd.period_range(start="2000", periods=20, freq="B") + series = pd.Series(range(20), index=index) + assert series.loc["2000-01-14"] == 9 + @pytest.mark.parametrize("freq", ["H", "D"]) def test_get_value_datetime_hourly(self, freq): # get_loc and get_value should treat datetime objects symmetrically From c884eff1b6e2b73f1766c89b37d5328de0e98fa3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 09:52:39 -0700 Subject: [PATCH 0103/1025] REF: remove roll_check, use roll_convention (#34763) --- pandas/_libs/tslibs/offsets.pyx | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 3d6a9c2310c2f..6931360997420 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3744,7 +3744,6 @@ cdef shift_quarters( npy_datetimestruct dts int count = len(dtindex) int months_to_roll, months_since, n, compare_day - bint roll_check int64_t[:] out = np.empty(count, dtype="int64") if day_opt == "start": @@ -3886,7 +3885,6 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): npy_datetimestruct dts int count = len(dtindex) int months_to_roll - bint roll_check int64_t[:] out = np.empty(count, dtype="int64") if day_opt is None: @@ -3903,10 +3901,6 @@ def 
shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dts.day = min(dts.day, get_days_in_month(dts.year, dts.month)) out[i] = dtstruct_to_dt64(&dts) elif day_opt == "start": - roll_check = False - if months <= 0: - months += 1 - roll_check = True with nogil: for i in range(count): if dtindex[i] == NPY_NAT: @@ -3915,11 +3909,12 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dt64_to_dtstruct(dtindex[i], &dts) months_to_roll = months + compare_day = 1 # offset semantics - if on the anchor point and going backwards # shift to next - if roll_check and dts.day == 1: - months_to_roll -= 1 + months_to_roll = roll_convention(dts.day, months_to_roll, + compare_day) dts.year = year_add_months(dts, months_to_roll) dts.month = month_add_months(dts, months_to_roll) @@ -3927,10 +3922,6 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): out[i] = dtstruct_to_dt64(&dts) elif day_opt == "end": - roll_check = False - if months > 0: - months -= 1 - roll_check = True with nogil: for i in range(count): if dtindex[i] == NPY_NAT: @@ -3939,12 +3930,12 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dt64_to_dtstruct(dtindex[i], &dts) months_to_roll = months + compare_day = get_days_in_month(dts.year, dts.month) # similar semantics - when adding shift forward by one # month if already at an end of month - if roll_check and dts.day == get_days_in_month(dts.year, - dts.month): - months_to_roll += 1 + months_to_roll = roll_convention(dts.day, months_to_roll, + compare_day) dts.year = year_add_months(dts, months_to_roll) dts.month = month_add_months(dts, months_to_roll) From f5c4ce8d1bd7b6e8726640b709a0f77dc7bc3032 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Sun, 14 Jun 2020 20:04:53 +0200 Subject: [PATCH 0104/1025] =?UTF-8?q?BUG:=20Groupby=20lost=20index,=20when?= =?UTF-8?q?=20one=20of=20the=20agg=20keys=20had=20no=20function=20all?= 
=?UTF-8?q?=E2=80=A6=20(#33086)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/base.py | 8 ++++++- .../tests/groupby/aggregate/test_aggregate.py | 21 +++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index e680c2db55a43..df5e425950873 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -990,6 +990,7 @@ Groupby/resample/rolling indices. In particular, the result index shape might change if a copy of the input would be returned. The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`) - Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. (:issue:`34422`) +- Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/base.py b/pandas/core/base.py index a8a736b6aafdf..bb1afc8f8ef20 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -438,7 +438,13 @@ def is_any_frame() -> bool: # we have a dict of DataFrames # return a MI DataFrame - return concat([result[k] for k in keys], keys=keys, axis=1), True + keys_to_use = [k for k in keys if not result[k].empty] + # Have to check, if at least one DataFrame is not empty. 
+ keys_to_use = keys_to_use if keys_to_use != [] else keys + return ( + concat([result[k] for k in keys_to_use], keys=keys_to_use, axis=1), + True, + ) elif isinstance(self, ABCSeries) and is_any_series(): diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 371ec11cdba77..962288d5d59e1 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -732,6 +732,27 @@ def test_agg_relabel_multiindex_duplicates(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("kwargs", [{"c": ["min"]}, {"b": [], "c": ["min"]}]) +def test_groupby_aggregate_empty_key(kwargs): + # GH: 32580 + df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]}) + result = df.groupby("a").agg(kwargs) + expected = pd.DataFrame( + [1, 4], + index=pd.Index([1, 2], dtype="int64", name="a"), + columns=pd.MultiIndex.from_tuples([["c", "min"]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_aggregate_empty_key_empty_return(): + # GH: 32580 Check if everything works, when return is empty + df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]}) + result = df.groupby("a").agg({"b": []}) + expected = pd.DataFrame(columns=pd.MultiIndex(levels=[["b"], []], codes=[[], []])) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)] ) From 4992572fb9e4d5e78ccff2b8fe4647d1544fa2c8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 14 Jun 2020 19:06:40 +0100 Subject: [PATCH 0105/1025] API: validate `limit_direction` parameter of NDFrame.interpolate (#34746) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/generic.py | 38 +++++++++++++++++-- .../tests/series/methods/test_interpolate.py | 21 ++++++++++ 3 files changed, 56 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 
df5e425950873..2a02041244362 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -406,6 +406,7 @@ Backwards incompatible API changes (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) - :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) - :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) +- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c340460857b9f..bad61a440b8c5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6728,9 +6728,24 @@ def replace( 0. inplace : bool, default False Update the data in place if possible. - limit_direction : {'forward', 'backward', 'both'}, default 'forward' - If limit is specified, consecutive NaNs will be filled in this - direction. + limit_direction : {'forward', 'backward', 'both'}, Optional + Consecutive NaNs will be filled in this direction. + + If limit is specified: + * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'. + * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be + 'backward'. + + If 'limit' is not specified: + * If 'method' is 'backfill' or 'bfill', the default is 'backward' + * else the default is 'forward' + + .. versionchanged:: 1.1.0 + raises ValueError if `limit_direction` is 'forward' or 'both' and + method is 'backfill' or 'bfill'.
+ raises ValueError if `limit_direction` is 'backward' or 'both' and + method is 'pad' or 'ffill'. + limit_area : {`None`, 'inside', 'outside'}, default None If limit is specified, consecutive NaNs will be filled with this restriction. @@ -6881,7 +6896,7 @@ def interpolate( axis: Axis = 0, limit: Optional[int] = None, inplace: bool_t = False, - limit_direction: str = "forward", + limit_direction: Optional[str] = None, limit_area: Optional[str] = None, downcast: Optional[str] = None, **kwargs, @@ -6906,6 +6921,21 @@ def interpolate( "Only `method=linear` interpolation is supported on MultiIndexes." ) + # Set `limit_direction` depending on `method` + if limit_direction is None: + limit_direction = ( + "backward" if method in ("backfill", "bfill") else "forward" + ) + else: + if method in ("pad", "ffill") and limit_direction != "forward": + raise ValueError( + f"`limit_direction` must be 'forward' for method `{method}`" + ) + if method in ("backfill", "bfill") and limit_direction != "backward": + raise ValueError( + f"`limit_direction` must be 'backward' for method `{method}`" + ) + if obj.ndim == 2 and np.all(obj.dtypes == np.dtype(object)): raise TypeError( "Cannot interpolate with all object-dtype columns " diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index db1c07e1bd276..c4b10e0ccdc3e 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -429,6 +429,27 @@ def test_interp_limit_area(self): with pytest.raises(ValueError, match=msg): s.interpolate(method="linear", limit_area="abc") + @pytest.mark.parametrize( + "method, limit_direction, expected", + [ + ("pad", "backward", "forward"), + ("ffill", "backward", "forward"), + ("backfill", "forward", "backward"), + ("bfill", "forward", "backward"), + ("pad", "both", "forward"), + ("ffill", "both", "forward"), + ("backfill", "both", "backward"), + ("bfill", "both", "backward"), + ], + ) + def 
test_interp_limit_direction_raises(self, method, limit_direction, expected): + # https://github.com/pandas-dev/pandas/pull/34746 + s = Series([1, 2, 3]) + + msg = f"`limit_direction` must be '{expected}' for method `{method}`" + with pytest.raises(ValueError, match=msg): + s.interpolate(method=method, limit_direction=limit_direction) + def test_interp_limit_direction(self): # These tests are for issue #9218 -- fill NaNs in both directions. s = Series([1, 3, np.nan, np.nan, np.nan, 11]) From 79745a6f282e230db16ca0f02b46258dfadab1e6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 14 Jun 2020 11:36:22 -0700 Subject: [PATCH 0106/1025] CLN/TYPE: EWM (#34770) * Move min_periods validation to init * Type signatures * Undo unnecessary casting * consolidate some cython type declarations * tighten up typing and black Co-authored-by: Matt Roeschke --- pandas/_libs/window/aggregations.pyx | 14 ++--- pandas/core/window/ewm.py | 76 +++++++++++++++------------- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 9e088062d7280..646444d10e416 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1759,7 +1759,7 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights, # Exponentially weighted moving average -def ewma(float64_t[:] vals, float64_t com, int adjust, bint ignore_na, int minp): +def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp): """ Compute exponentially-weighted moving average using center-of-mass.
@@ -1777,17 +1777,14 @@ def ewma(float64_t[:] vals, float64_t com, int adjust, bint ignore_na, int minp) """ cdef: - Py_ssize_t N = len(vals) + Py_ssize_t i, nobs, N = len(vals) ndarray[float64_t] output = np.empty(N, dtype=float) float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur - Py_ssize_t i, nobs bint is_observation if N == 0: return output - minp = max(minp, 1) - alpha = 1. / (1. + com) old_wt_factor = 1. - alpha new_wt = 1. if adjust else alpha @@ -1831,7 +1828,7 @@ def ewma(float64_t[:] vals, float64_t com, int adjust, bint ignore_na, int minp) def ewmcov(float64_t[:] input_x, float64_t[:] input_y, - float64_t com, int adjust, bint ignore_na, int minp, int bias): + float64_t com, bint adjust, bint ignore_na, int minp, bint bias): """ Compute exponentially-weighted moving variance using center-of-mass. @@ -1851,11 +1848,10 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, """ cdef: - Py_ssize_t N = len(input_x), M = len(input_y) + Py_ssize_t i, nobs, N = len(input_x), M = len(input_y) float64_t alpha, old_wt_factor, new_wt, mean_x, mean_y, cov float64_t sum_wt, sum_wt2, old_wt, cur_x, cur_y, old_mean_x, old_mean_y float64_t numerator, denominator - Py_ssize_t i, nobs ndarray[float64_t] output bint is_observation @@ -1866,8 +1862,6 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, if N == 0: return output - minp = max(minp, 1) - alpha = 1. / (1. + com) old_wt_factor = 1. - alpha new_wt = 1. 
if adjust else alpha diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index a5e30c900cae2..0e39b94574a12 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -1,9 +1,11 @@ from functools import partial from textwrap import dedent +from typing import Optional, Union import numpy as np import pandas._libs.window.aggregations as window_aggregations +from pandas._typing import FrameOrSeries from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution @@ -24,7 +26,12 @@ """ -def get_center_of_mass(comass, span, halflife, alpha) -> float: +def get_center_of_mass( + comass: Optional[float], + span: Optional[float], + halflife: Optional[float], + alpha: Optional[float], +) -> float: valid_count = com.count_not_none(comass, span, halflife, alpha) if valid_count > 1: raise ValueError("comass, span, halflife, and alpha are mutually exclusive") @@ -114,7 +121,7 @@ class EWM(_Rolling): used in calculating the final weighted average of [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``. - axis : {0 or 'index', 1 or 'columns'}, default 0 + axis : {0, 1}, default 0 The axis to use. The value 0 identifies the rows, and 1 identifies the columns. 
@@ -159,18 +166,18 @@ class EWM(_Rolling): def __init__( self, obj, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, + com: Optional[float] = None, + span: Optional[float] = None, + halflife: Optional[float] = None, + alpha: Optional[float] = None, + min_periods: int = 0, + adjust: bool = True, + ignore_na: bool = False, + axis: int = 0, ): self.obj = obj self.com = get_center_of_mass(com, span, halflife, alpha) - self.min_periods = min_periods + self.min_periods = max(int(min_periods), 1) self.adjust = adjust self.ignore_na = ignore_na self.axis = axis @@ -274,16 +281,16 @@ def mean(self, *args, **kwargs): window_func = partial( window_func, com=self.com, - adjust=int(self.adjust), + adjust=self.adjust, ignore_na=self.ignore_na, - minp=int(self.min_periods), + minp=self.min_periods, ) return self._apply(window_func) @Substitution(name="ewm", func_name="std") @Appender(_doc_template) @Appender(_bias_template) - def std(self, bias=False, *args, **kwargs): + def std(self, bias: bool = False, *args, **kwargs): """ Exponential weighted moving stddev. """ @@ -295,7 +302,7 @@ def std(self, bias=False, *args, **kwargs): @Substitution(name="ewm", func_name="var") @Appender(_doc_template) @Appender(_bias_template) - def var(self, bias=False, *args, **kwargs): + def var(self, bias: bool = False, *args, **kwargs): """ Exponential weighted moving variance. 
""" @@ -303,20 +310,20 @@ def var(self, bias=False, *args, **kwargs): def f(arg): return window_aggregations.ewmcov( - arg, - arg, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - int(bias), + arg, arg, self.com, self.adjust, self.ignore_na, self.min_periods, bias, ) return self._apply(f) @Substitution(name="ewm", func_name="cov") @Appender(_doc_template) - def cov(self, other=None, pairwise=None, bias=False, **kwargs): + def cov( + self, + other: Optional[Union[np.ndarray, FrameOrSeries]] = None, + pairwise: Optional[bool] = None, + bias: bool = False, + **kwargs, + ): """ Exponential weighted sample covariance. @@ -350,10 +357,10 @@ def _get_cov(X, Y): X._prep_values(), Y._prep_values(), self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - int(bias), + self.adjust, + self.ignore_na, + self.min_periods, + bias, ) return X._wrap_result(cov) @@ -363,7 +370,12 @@ def _get_cov(X, Y): @Substitution(name="ewm", func_name="corr") @Appender(_doc_template) - def corr(self, other=None, pairwise=None, **kwargs): + def corr( + self, + other: Optional[Union[np.ndarray, FrameOrSeries]] = None, + pairwise: Optional[bool] = None, + **kwargs, + ): """ Exponential weighted sample correlation. 
@@ -394,13 +406,7 @@ def _get_corr(X, Y): def _cov(x, y): return window_aggregations.ewmcov( - x, - y, - self.com, - int(self.adjust), - int(self.ignore_na), - int(self.min_periods), - 1, + x, y, self.com, self.adjust, self.ignore_na, self.min_periods, 1, ) x_values = X._prep_values() From bcb7362fecec2752be4fdbadb137580887baef08 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 12:45:24 -0700 Subject: [PATCH 0107/1025] inline get_day_of_month (#34772) --- pandas/_libs/tslibs/offsets.pyx | 53 ++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 6931360997420..9e6356b55dcec 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3757,16 +3757,22 @@ cdef shift_quarters( n = quarters months_since = (dts.month - q1start_month) % modby + compare_day = get_day_of_month(&dts, day_opt) # offset semantics - if on the anchor point and going backwards # shift to next if n <= 0 and (months_since != 0 or - (months_since == 0 and dts.day > 1)): + (months_since == 0 and dts.day > compare_day)): + # make sure to roll forward, so negate n += 1 + elif n > 0 and (months_since == 0 and dts.day < compare_day): + # pretend to roll back if on same month but + # before compare_day + n -= 1 dts.year = year_add_months(dts, modby * n - months_since) dts.month = month_add_months(dts, modby * n - months_since) - dts.day = 1 + dts.day = get_day_of_month(&dts, day_opt) out[i] = dtstruct_to_dt64(&dts) @@ -3781,21 +3787,20 @@ cdef shift_quarters( n = quarters months_since = (dts.month - q1start_month) % modby + compare_day = get_day_of_month(&dts, day_opt) - if n <= 0 and months_since != 0: - # The general case of this condition would be - # `months_since != 0 or (months_since == 0 and - # dts.day > get_days_in_month(dts.year, dts.month))` - # but the get_days_in_month inequality would never hold. 
+ if n <= 0 and (months_since != 0 or + (months_since == 0 and dts.day > compare_day)): + # make sure to roll forward, so negate n += 1 - elif n > 0 and (months_since == 0 and - dts.day < get_days_in_month(dts.year, - dts.month)): + elif n > 0 and (months_since == 0 and dts.day < compare_day): + # pretend to roll back if on same month but + # before compare_day n -= 1 dts.year = year_add_months(dts, modby * n - months_since) dts.month = month_add_months(dts, modby * n - months_since) - dts.day = get_days_in_month(dts.year, dts.month) + dts.day = get_day_of_month(&dts, day_opt) out[i] = dtstruct_to_dt64(&dts) @@ -3812,7 +3817,7 @@ cdef shift_quarters( months_since = (dts.month - q1start_month) % modby # compare_day is only relevant for comparison in the case # where months_since == 0. - compare_day = get_firstbday(dts.year, dts.month) + compare_day = get_day_of_month(&dts, day_opt) if n <= 0 and (months_since != 0 or (months_since == 0 and dts.day > compare_day)): @@ -3826,7 +3831,7 @@ cdef shift_quarters( dts.year = year_add_months(dts, modby * n - months_since) dts.month = month_add_months(dts, modby * n - months_since) - dts.day = get_firstbday(dts.year, dts.month) + dts.day = get_day_of_month(&dts, day_opt) out[i] = dtstruct_to_dt64(&dts) @@ -3843,7 +3848,7 @@ cdef shift_quarters( months_since = (dts.month - q1start_month) % modby # compare_day is only relevant for comparison in the case # where months_since == 0. 
- compare_day = get_lastbday(dts.year, dts.month) + compare_day = get_day_of_month(&dts, day_opt) if n <= 0 and (months_since != 0 or (months_since == 0 and dts.day > compare_day)): @@ -3857,7 +3862,7 @@ cdef shift_quarters( dts.year = year_add_months(dts, modby * n - months_since) dts.month = month_add_months(dts, modby * n - months_since) - dts.day = get_lastbday(dts.year, dts.month) + dts.day = get_day_of_month(&dts, day_opt) out[i] = dtstruct_to_dt64(&dts) @@ -3909,7 +3914,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dt64_to_dtstruct(dtindex[i], &dts) months_to_roll = months - compare_day = 1 + compare_day = get_day_of_month(&dts, day_opt) # offset semantics - if on the anchor point and going backwards # shift to next @@ -3918,7 +3923,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dts.year = year_add_months(dts, months_to_roll) dts.month = month_add_months(dts, months_to_roll) - dts.day = 1 + dts.day = get_day_of_month(&dts, day_opt) out[i] = dtstruct_to_dt64(&dts) elif day_opt == "end": @@ -3930,7 +3935,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dt64_to_dtstruct(dtindex[i], &dts) months_to_roll = months - compare_day = get_days_in_month(dts.year, dts.month) + compare_day = get_day_of_month(&dts, day_opt) # similar semantics - when adding shift forward by one # month if already at an end of month @@ -3940,7 +3945,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dts.year = year_add_months(dts, months_to_roll) dts.month = month_add_months(dts, months_to_roll) - dts.day = get_days_in_month(dts.year, dts.month) + dts.day = get_day_of_month(&dts, day_opt) out[i] = dtstruct_to_dt64(&dts) elif day_opt == "business_start": @@ -3952,7 +3957,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dt64_to_dtstruct(dtindex[i], &dts) months_to_roll = months - compare_day = get_firstbday(dts.year, dts.month) 
+ compare_day = get_day_of_month(&dts, day_opt) months_to_roll = roll_convention(dts.day, months_to_roll, compare_day) @@ -3960,7 +3965,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dts.year = year_add_months(dts, months_to_roll) dts.month = month_add_months(dts, months_to_roll) - dts.day = get_firstbday(dts.year, dts.month) + dts.day = get_day_of_month(&dts, day_opt) out[i] = dtstruct_to_dt64(&dts) elif day_opt == "business_end": @@ -3972,7 +3977,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dt64_to_dtstruct(dtindex[i], &dts) months_to_roll = months - compare_day = get_lastbday(dts.year, dts.month) + compare_day = get_day_of_month(&dts, day_opt) months_to_roll = roll_convention(dts.day, months_to_roll, compare_day) @@ -3980,7 +3985,7 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dts.year = year_add_months(dts, months_to_roll) dts.month = month_add_months(dts, months_to_roll) - dts.day = get_lastbday(dts.year, dts.month) + dts.day = get_day_of_month(&dts, day_opt) out[i] = dtstruct_to_dt64(&dts) else: @@ -4051,7 +4056,7 @@ def shift_month(stamp: datetime, months: int, return stamp.replace(year=year, month=month, day=day) -cdef int get_day_of_month(npy_datetimestruct* dts, day_opt) nogil except? -1: +cdef inline int get_day_of_month(npy_datetimestruct* dts, day_opt) nogil except? -1: """ Find the day in `other`'s month that satisfies a DateOffset's is_on_offset policy, as described by the `day_opt` argument. 
From 0eb74908488d2b2862b20df5d44e4ae7947d31c9 Mon Sep 17 00:00:00 2001 From: Danilo Horta Date: Sun, 14 Jun 2020 21:37:29 +0100 Subject: [PATCH 0108/1025] Export InvalidIndexError (#34570) --- doc/source/reference/general_utility_functions.rst | 3 ++- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/generic.py | 10 ++-------- pandas/core/groupby/grouper.py | 2 +- pandas/core/index.py | 1 - pandas/core/indexes/api.py | 2 +- pandas/core/indexes/base.py | 5 +---- pandas/core/indexes/datetimes.py | 3 ++- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 9 ++------- pandas/core/indexes/period.py | 2 +- pandas/core/indexes/timedeltas.py | 3 ++- pandas/core/indexing.py | 4 ++-- pandas/core/series.py | 9 ++------- pandas/errors/__init__.py | 8 ++++++++ pandas/tests/indexes/common.py | 2 +- pandas/tests/indexes/datetimes/test_indexing.py | 3 ++- pandas/tests/indexes/interval/test_indexing.py | 3 ++- pandas/tests/indexes/interval/test_interval.py | 3 ++- pandas/tests/indexes/multi/test_indexing.py | 3 ++- pandas/tests/indexes/period/test_indexing.py | 2 +- pandas/tests/resample/test_period_index.py | 2 +- 22 files changed, 39 insertions(+), 44 deletions(-) diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 993107dc09756..72a84217323ab 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -38,10 +38,11 @@ Exceptions and warnings errors.AccessorRegistrationWarning errors.DtypeWarning errors.EmptyDataError - errors.OutOfBoundsDatetime + errors.InvalidIndexError errors.MergeError errors.NullFrequencyError errors.NumbaUtilError + errors.OutOfBoundsDatetime errors.ParserError errors.ParserWarning errors.PerformanceWarning diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2a02041244362..f087d969c7cd2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -382,7 
+382,7 @@ Other API changes - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) - Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. -- +- Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bad61a440b8c5..9dcdcaca2f689 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -47,7 +47,7 @@ from pandas.compat import set_function_name from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv -from pandas.errors import AbstractMethodError +from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import ( Appender, Substitution, @@ -90,13 +90,7 @@ from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype -from pandas.core.indexes.api import ( - Index, - InvalidIndexError, - MultiIndex, - RangeIndex, - ensure_index, -) +from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import Period, PeriodIndex import pandas.core.indexing as indexing diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 39892d87bfd69..67003dffb90bb 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -8,6 +8,7 @@ import numpy as np from pandas._typing 
import FrameOrSeries +from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -26,7 +27,6 @@ from pandas.core.groupby import ops from pandas.core.groupby.categorical import recode_for_groupby, recode_from_groupby from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex -from pandas.core.indexes.base import InvalidIndexError from pandas.core.series import Series from pandas.io.formats.printing import pprint_thing diff --git a/pandas/core/index.py b/pandas/core/index.py index 8cff53d7a8b74..a315b9619b0e7 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -7,7 +7,6 @@ Index, Int64Index, IntervalIndex, - InvalidIndexError, MultiIndex, NaT, NumericIndex, diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index fcce82e7a69db..4c5a70f4088ee 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -2,11 +2,11 @@ from typing import List, Set from pandas._libs import NaT, lib +from pandas.errors import InvalidIndexError import pandas.core.common as com from pandas.core.indexes.base import ( Index, - InvalidIndexError, _new_Index, ensure_index, ensure_index_from_sequences, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 4a99d2dfe339a..c046d6465ce67 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -16,6 +16,7 @@ from pandas._typing import DtypeObj, Label from pandas.compat import set_function_name from pandas.compat.numpy import function as nv +from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc from pandas.core.dtypes import concat as _concat @@ -153,10 +154,6 @@ def index_arithmetic_method(self, other): return set_function_name(index_arithmetic_method, name, cls) -class InvalidIndexError(Exception): - pass - - _o_dtype = np.dtype(object) _Identity = object diff --git a/pandas/core/indexes/datetimes.py 
b/pandas/core/indexes/datetimes.py index 6bcfb3bccf5c7..f3c96db0a8d6e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -9,6 +9,7 @@ from pandas._libs.tslibs import Resolution, fields, parsing, timezones, to_offset from pandas._libs.tslibs.offsets import prefix_mapping from pandas._typing import DtypeObj, Label +from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.common import ( @@ -24,7 +25,7 @@ from pandas.core.arrays.datetimes import DatetimeArray, tz_to_dtype import pandas.core.common as com -from pandas.core.indexes.base import Index, InvalidIndexError, maybe_extract_name +from pandas.core.indexes.base import Index, maybe_extract_name from pandas.core.indexes.datetimelike import DatetimeTimedeltaMixin from pandas.core.indexes.extension import inherit_names from pandas.core.tools.times import to_time diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 1a59e066879cc..3be2bcd4888cb 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -11,6 +11,7 @@ from pandas._libs.interval import Interval, IntervalMixin, IntervalTree from pandas._libs.tslibs import Timedelta, Timestamp, to_offset from pandas._typing import AnyArrayLike, Label +from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._exceptions import rewrite_exception @@ -44,7 +45,6 @@ import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( Index, - InvalidIndexError, _index_shared_docs, default_pprint, ensure_index, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index fc2d4cf4621c4..a09e5a657f9fb 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -20,7 +20,7 @@ from pandas._libs.hashtable import duplicated_int64 from pandas._typing import AnyArrayLike, Scalar from 
pandas.compat.numpy import function as nv -from pandas.errors import PerformanceWarning, UnsortedIndexError +from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.cast import coerce_indexer_dtype @@ -45,12 +45,7 @@ from pandas.core.arrays.categorical import factorize_from_iterables import pandas.core.common as com import pandas.core.indexes.base as ibase -from pandas.core.indexes.base import ( - Index, - InvalidIndexError, - _index_shared_docs, - ensure_index, -) +from pandas.core.indexes.base import Index, _index_shared_docs, ensure_index from pandas.core.indexes.frozen import FrozenList from pandas.core.indexes.numeric import Int64Index import pandas.core.missing as missing diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 31783f6dbaaf7..68c2b44b23964 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -8,6 +8,7 @@ from pandas._libs.tslibs import Period, Resolution from pandas._libs.tslibs.parsing import DateParseError, parse_time_string from pandas._typing import DtypeObj, Label +from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( @@ -32,7 +33,6 @@ import pandas.core.common as com import pandas.core.indexes.base as ibase from pandas.core.indexes.base import ( - InvalidIndexError, _index_shared_docs, ensure_index, maybe_extract_name, diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index ce3ff17814a25..a14994866c0f7 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -3,6 +3,7 @@ from pandas._libs import index as libindex, lib from pandas._libs.tslibs import Timedelta, to_offset from pandas._typing import DtypeObj, Label +from pandas.errors import InvalidIndexError from pandas.util._decorators import doc from 
pandas.core.dtypes.common import ( @@ -18,7 +19,7 @@ from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.timedeltas import TimedeltaArray import pandas.core.common as com -from pandas.core.indexes.base import Index, InvalidIndexError, maybe_extract_name +from pandas.core.indexes.base import Index, maybe_extract_name from pandas.core.indexes.datetimelike import ( DatetimeIndexOpsMixin, DatetimeTimedeltaMixin, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 326bd00270eca..9c8b01003bece 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -4,7 +4,7 @@ from pandas._libs.indexing import _NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim -from pandas.errors import AbstractMethodError +from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import doc from pandas.core.dtypes.common import ( @@ -29,7 +29,7 @@ is_list_like_indexer, length_of_indexer, ) -from pandas.core.indexes.api import Index, InvalidIndexError +from pandas.core.indexes.api import Index if TYPE_CHECKING: from pandas import DataFrame # noqa:F401 diff --git a/pandas/core/series.py b/pandas/core/series.py index b32a4c36a8247..a27e44efe1a97 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -34,6 +34,7 @@ ValueKeyFunc, ) from pandas.compat.numpy import function as nv +from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, Substitution, doc from pandas.util._validators import validate_bool_kwarg, validate_percentile @@ -79,13 +80,7 @@ from pandas.core.generic import NDFrame from pandas.core.indexers import unpack_1tuple from pandas.core.indexes.accessors import CombinedDatetimelikeProperties -from pandas.core.indexes.api import ( - Float64Index, - Index, - InvalidIndexError, - MultiIndex, - ensure_index, -) +from pandas.core.indexes.api import Float64Index, Index, MultiIndex, ensure_index import pandas.core.indexes.base as ibase from 
pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import PeriodIndex diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 4c4ce9df85543..e3427d93f3d84 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -200,3 +200,11 @@ class NumbaUtilError(Exception): """ Error raised for unsupported Numba engine routines. """ + + +class InvalidIndexError(Exception): + """ + Exception raised when attempting to use an invalid index key. + + .. versionadded:: 1.1.0 + """ diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 0f9509c372bdf..37ff97f028e81 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -5,6 +5,7 @@ import pytest from pandas._libs import iNaT +from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_datetime64tz_dtype from pandas.core.dtypes.dtypes import CategoricalDtype @@ -25,7 +26,6 @@ isna, ) import pandas._testing as tm -from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index f08472fe72631..b1faaa2115f55 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -3,10 +3,11 @@ import numpy as np import pytest +from pandas.errors import InvalidIndexError + import pandas as pd from pandas import DatetimeIndex, Index, Timestamp, date_range, notna import pandas._testing as tm -from pandas.core.indexes.base import InvalidIndexError from pandas.tseries.offsets import BDay, CDay diff --git a/pandas/tests/indexes/interval/test_indexing.py b/pandas/tests/indexes/interval/test_indexing.py index 718136fca6c80..3abc6e348748a 100644 --- a/pandas/tests/indexes/interval/test_indexing.py +++ b/pandas/tests/indexes/interval/test_indexing.py @@ -3,6 +3,8 @@ import numpy as np
import pytest +from pandas.errors import InvalidIndexError + from pandas import ( CategoricalIndex, Interval, @@ -12,7 +14,6 @@ timedelta_range, ) import pandas._testing as tm -from pandas.core.indexes.base import InvalidIndexError class TestGetLoc: diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 997887cc18d61..2755b186f3eae 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -4,6 +4,8 @@ import numpy as np import pytest +from pandas.errors import InvalidIndexError + import pandas as pd from pandas import ( Index, @@ -19,7 +21,6 @@ ) import pandas._testing as tm import pandas.core.common as com -from pandas.core.indexes.base import InvalidIndexError @pytest.fixture(scope="class", params=[None, "foo"]) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 4cc67986ad065..03ae2ae6a1f85 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -3,10 +3,11 @@ import numpy as np import pytest +from pandas.errors import InvalidIndexError + import pandas as pd from pandas import Categorical, Index, MultiIndex, date_range import pandas._testing as tm -from pandas.core.indexes.base import InvalidIndexError class TestSliceLocs: diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 12454c20d2bb4..b61d1d903f89a 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -5,6 +5,7 @@ import pytest from pandas._libs.tslibs import period as libperiod +from pandas.errors import InvalidIndexError import pandas as pd from pandas import ( @@ -19,7 +20,6 @@ period_range, ) import pandas._testing as tm -from pandas.core.indexes.base import InvalidIndexError class TestGetItem: diff --git a/pandas/tests/resample/test_period_index.py 
b/pandas/tests/resample/test_period_index.py index 3db9a91118ebc..fe02eaef8ba82 100644 --- a/pandas/tests/resample/test_period_index.py +++ b/pandas/tests/resample/test_period_index.py @@ -7,11 +7,11 @@ from pandas._libs.tslibs.ccalendar import DAYS, MONTHS from pandas._libs.tslibs.period import IncompatibleFrequency +from pandas.errors import InvalidIndexError import pandas as pd from pandas import DataFrame, Series, Timestamp import pandas._testing as tm -from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range from pandas.core.resample import _get_period_range_edges From 35add5ff6019f38668c016544857508ea571e01a Mon Sep 17 00:00:00 2001 From: OlivierLuG <59281854+OlivierLuG@users.noreply.github.com> Date: Sun, 14 Jun 2020 22:46:33 +0200 Subject: [PATCH 0109/1025] CI/TST #34131 fixed test_floordiv_axis0_numexpr_path (#34537) --- pandas/tests/frame/test_arithmetic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index d9f251a1b5304..a6b0ece58b095 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,6 +11,7 @@ from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm import pandas.core.common as com +from pandas.core.computation.expressions import _MIN_ELEMENTS, _NUMEXPR_INSTALLED from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int # ------------------------------------------------------------------- @@ -374,13 +375,13 @@ def test_floordiv_axis0(self): result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) - @pytest.mark.slow + @pytest.mark.skipif(not _NUMEXPR_INSTALLED, reason="numexpr not installed") @pytest.mark.parametrize("opname", ["floordiv", "pow"]) def test_floordiv_axis0_numexpr_path(self, opname): # case that goes 
through numexpr and has to fall back to masked_arith_op op = getattr(operator, opname) - arr = np.arange(10 ** 6).reshape(100, -1) + arr = np.arange(_MIN_ELEMENTS + 100).reshape(_MIN_ELEMENTS // 100 + 1, -1) * 100 df = pd.DataFrame(arr) df["C"] = 1.0 From 643b0a24ab835555eecbaa1975431824bb859c23 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 13:47:18 -0700 Subject: [PATCH 0110/1025] REF: De-duplicate roll_yearday/roll_qtrday (#34760) --- pandas/_libs/tslibs/offsets.pyx | 92 ++------------------------ pandas/tests/tslibs/test_liboffsets.py | 12 ++-- 2 files changed, 13 insertions(+), 91 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 9e6356b55dcec..95f3c6ae54640 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1879,7 +1879,7 @@ cdef class YearOffset(SingleConstructorOffset): @apply_wraps def apply(self, other): - years = roll_yearday(other, self.n, self.month, self._day_opt) + years = roll_qtrday(other, self.n, self.month, self._day_opt, modby=12) months = years * 12 + (self.month - other.month) return shift_month(other, months, self._day_opt) @@ -4158,10 +4158,13 @@ def roll_qtrday(other: datetime, n: int, month: int, npy_datetimestruct dts pydate_to_dtstruct(other, &dts) - # TODO: Merge this with roll_yearday by setting modby=12 there? - # code de-duplication versus perf hit? 
# TODO: with small adjustments this could be used in shift_quarters - months_since = other.month % modby - month % modby + + if modby == 12: + # We care about the month-of-year, not month-of-quarter, so skip mod + months_since = other.month - month + else: + months_since = other.month % modby - month % modby if n > 0: if months_since < 0 or (months_since == 0 and @@ -4177,84 +4180,3 @@ def roll_qtrday(other: datetime, n: int, month: int, # make sure to roll forward, so negate n += 1 return n - - -def roll_yearday(other: datetime, n: int, month: int, day_opt: object) -> int: - """ - Possibly increment or decrement the number of periods to shift - based on rollforward/rollbackward conventions. - - Parameters - ---------- - other : datetime or Timestamp - n : number of periods to increment, before adjusting for rolling - month : reference month giving the first month of the year - day_opt : 'start', 'end', 'business_start', 'business_end', or int - The day of the month to compare against that of `other` when - incrementing or decrementing the number of periods: - - 'start': 1 - 'end': last day of the month - 'business_start': first business day of the month - 'business_end': last business day of the month - int: day in the month indicated by `other`, or the last of day - the month if the value exceeds in that month's number of days. 
- - Returns - ------- - n : int number of periods to increment - - Notes - ----- - * Mirrors `roll_check` in shift_months - - Examples - ------- - >>> month = 3 - >>> day_opt = 'start' # `other` will be compared to March 1 - >>> other = datetime(2017, 2, 10) # before March 1 - >>> roll_yearday(other, 2, month, day_opt) - 1 - >>> roll_yearday(other, -7, month, day_opt) - -7 - >>> - >>> other = Timestamp('2014-03-15', tz='US/Eastern') # after March 1 - >>> roll_yearday(other, 2, month, day_opt) - 2 - >>> roll_yearday(other, -7, month, day_opt) - -6 - - >>> month = 6 - >>> day_opt = 'end' # `other` will be compared to June 30 - >>> other = datetime(1999, 6, 29) # before June 30 - >>> roll_yearday(other, 5, month, day_opt) - 4 - >>> roll_yearday(other, -7, month, day_opt) - -7 - >>> - >>> other = Timestamp(2072, 8, 24, 6, 17, 18) # after June 30 - >>> roll_yearday(other, 5, month, day_opt) - 5 - >>> roll_yearday(other, -7, month, day_opt) - -6 - - """ - cdef: - npy_datetimestruct dts - pydate_to_dtstruct(other, &dts) - - # Note: The other.day < ... condition will never hold when day_opt=='start' - # and the other.day > ... condition will never hold when day_opt=='end'. - # At some point these extra checks may need to be optimized away. - # But that point isn't today. 
- if n > 0: - if other.month < month or (other.month == month and - other.day < get_day_of_month(&dts, - day_opt)): - n -= 1 - else: - if other.month > month or (other.month == month and - other.day > get_day_of_month(&dts, - day_opt)): - n += 1 - return n diff --git a/pandas/tests/tslibs/test_liboffsets.py b/pandas/tests/tslibs/test_liboffsets.py index 6ff2ae669c8df..206a604788c7e 100644 --- a/pandas/tests/tslibs/test_liboffsets.py +++ b/pandas/tests/tslibs/test_liboffsets.py @@ -88,11 +88,11 @@ def test_shift_month_error(): ], ) @pytest.mark.parametrize("n", [2, -7, 0]) -def test_roll_yearday(other, expected, n): +def test_roll_qtrday_year(other, expected, n): month = 3 day_opt = "start" # `other` will be compared to March 1. - assert liboffsets.roll_yearday(other, n, month, day_opt) == expected[n] + assert roll_qtrday(other, n, month, day_opt, modby=12) == expected[n] @pytest.mark.parametrize( @@ -105,22 +105,22 @@ def test_roll_yearday(other, expected, n): ], ) @pytest.mark.parametrize("n", [5, -7, 0]) -def test_roll_yearday2(other, expected, n): +def test_roll_qtrday_year2(other, expected, n): month = 6 day_opt = "end" # `other` will be compared to June 30. - assert liboffsets.roll_yearday(other, n, month, day_opt) == expected[n] + assert roll_qtrday(other, n, month, day_opt, modby=12) == expected[n] def test_get_day_of_month_error(): # get_day_of_month is not directly exposed. - # We test it via roll_yearday. + # We test it via roll_qtrday. dt = datetime(2017, 11, 15) day_opt = "foo" with pytest.raises(ValueError, match=day_opt): # To hit the raising case we need month == dt.month and n > 0. 
- liboffsets.roll_yearday(dt, n=3, month=11, day_opt=day_opt) + roll_qtrday(dt, n=3, month=11, day_opt=day_opt, modby=12) @pytest.mark.parametrize( From 95eac70bd141153825384ce87a5002a4a6b92f85 Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Mon, 15 Jun 2020 00:20:18 +0200 Subject: [PATCH 0111/1025] BUG: Fix HDFStore empty keys on native HDF5 file by adding keyword include (#32723) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/pytables.py | 27 ++++++++++++++++++-- pandas/tests/io/pytables/test_store.py | 34 ++++++++++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f087d969c7cd2..8db3d7affc5a5 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -962,6 +962,7 @@ I/O - Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`) - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`) +- :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) Plotting ^^^^^^^^ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 497b25d73df3e..8aac8f9531512 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -580,16 +580,39 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): self.close() - def keys(self) -> List[str]: + def keys(self, include: str = "pandas") -> List[str]: """ Return a list of keys corresponding to objects stored in HDFStore. 
+ Parameters + ---------- + + include : str, default 'pandas' + When include equals 'pandas' return pandas objects + When include equals 'native' return native HDF5 Table objects + + .. versionadded:: 1.1.0 + Returns ------- list List of ABSOLUTE path-names (e.g. have the leading '/'). + + Raises + ------ + raises ValueError if include has an illegal value """ - return [n._v_pathname for n in self.groups()] + if include == "pandas": + return [n._v_pathname for n in self.groups()] + + elif include == "native": + assert self._handle is not None # mypy + return [ + n._v_pathname for n in self._handle.walk_nodes("/", classname="Table") + ] + raise ValueError( + f"`include` should be either 'pandas' or 'native' but is '{include}'" + ) def __iter__(self): return iter(self.keys()) diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index fe59b989bab7e..30b64b1750aa9 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -341,6 +341,40 @@ def create_h5_and_return_checksum(track_times): # checksums are NOT same if track_time = True assert checksum_0_tt_true != checksum_1_tt_true + def test_non_pandas_keys(self, setup_path): + class Table1(tables.IsDescription): + value1 = tables.Float32Col() + + class Table2(tables.IsDescription): + value2 = tables.Float32Col() + + class Table3(tables.IsDescription): + value3 = tables.Float32Col() + + with ensure_clean_path(setup_path) as path: + with tables.open_file(path, mode="w") as h5file: + group = h5file.create_group("/", "group") + h5file.create_table(group, "table1", Table1, "Table 1") + h5file.create_table(group, "table2", Table2, "Table 2") + h5file.create_table(group, "table3", Table3, "Table 3") + with HDFStore(path) as store: + assert len(store.keys(include="native")) == 3 + expected = {"/group/table1", "/group/table2", "/group/table3"} + assert set(store.keys(include="native")) == expected + assert set(store.keys(include="pandas")) == set() + for
name in expected: + df = store.get(name) + assert len(df.columns) == 1 + + def test_keys_illegal_include_keyword_value(self, setup_path): + with ensure_clean_store(setup_path) as store: + with pytest.raises( + ValueError, + match="`include` should be either 'pandas' or 'native' " + "but is 'illegal'", + ): + store.keys(include="illegal") + def test_keys_ignore_hdf_softlink(self, setup_path): # GH 20523 From f7a949b2160017d2773e07749e098c8e8777a787 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Sun, 14 Jun 2020 17:20:50 -0500 Subject: [PATCH 0112/1025] TST: groupby apply with indexing and column aggregation returns the column #7002 (#34647) --- pandas/tests/groupby/test_apply.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index bc8067212d60e..8468a21904bf8 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -961,3 +961,16 @@ def fn(x): name="col2", ) tm.assert_series_equal(result, expected) + + +def test_apply_function_with_indexing_return_column(): + # GH: 7002 + df = DataFrame( + { + "foo1": ["one", "two", "two", "three", "one", "two"], + "foo2": [1, 2, 4, 4, 5, 6], + } + ) + result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean()) + expected = DataFrame({"foo1": ["one", "three", "two"], "foo2": [3.0, 4.0, 4.0]}) + tm.assert_frame_equal(result, expected) From 7cc6de3a41017906a53bb2e6c61a1f2c494920f0 Mon Sep 17 00:00:00 2001 From: willpeppo Date: Sun, 14 Jun 2020 18:21:47 -0400 Subject: [PATCH 0113/1025] DOC: updated multi.py docstring for SS06 errors (#34775) --- pandas/core/indexes/multi.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a09e5a657f9fb..af70707bd3dfc 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -898,8 +898,7 @@ def _set_codes( def set_codes(self, 
codes, level=None, inplace=False, verify_integrity=True): """ - Set new codes on MultiIndex. Defaults to returning - new index. + Set new codes on MultiIndex. Defaults to returning new index. .. versionadded:: 0.24.0 @@ -1536,8 +1535,9 @@ def _get_level_values(self, level, unique=False): def get_level_values(self, level): """ - Return vector of label values for requested level, - equal to the length of the index. + Return vector of label values for requested level. + + Length of returned vector is equal to the length of the index. Parameters ---------- @@ -1792,12 +1792,12 @@ def _sort_levels_monotonic(self): def remove_unused_levels(self): """ - Create a new MultiIndex from the current that removes - unused levels, meaning that they are not expressed in the labels. + Create new MultiIndex from current that removes unused levels. - The resulting MultiIndex will have the same outward - appearance, meaning the same .values and ordering. It will also - be .equals() to the original. + Unused level(s) means levels that are not expressed in the + labels. The resulting MultiIndex will have the same outward + appearance, meaning the same .values and ordering. It will + also be .equals() to the original. Returns ------- @@ -2190,8 +2190,10 @@ def cats(level_codes): def sortlevel(self, level=0, ascending=True, sort_remaining=True): """ - Sort MultiIndex at the requested level. The result will respect the - original ordering of the associated factor at that level. + Sort MultiIndex at the requested level. + + The result will respect the original ordering of the associated + factor at that level. Parameters ---------- @@ -2629,8 +2631,10 @@ def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int: def get_loc(self, key, method=None): """ - Get location for a label or a tuple of labels as an integer, slice or - boolean mask. + Get location for a label or a tuple of labels. + + The location is returned as an integer/slice or boolean + mask. 
Parameters ---------- @@ -2738,8 +2742,7 @@ def _maybe_to_slice(loc): def get_loc_level(self, key, level=0, drop_level: bool = True): """ - Get both the location for the requested label(s) and the - resulting sliced index. + Get location and sliced index for requested label(s)/level(s). Parameters ---------- From 6e09cc09f710d534dc0a805f359acea17caea9d5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 15:55:40 -0700 Subject: [PATCH 0114/1025] REF: implement shift_bday (#34761) --- pandas/_libs/tslibs/offsets.pyx | 78 +++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 95f3c6ae54640..093d53db21dc1 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -18,7 +18,7 @@ from dateutil.easter import easter import numpy as np cimport numpy as cnp -from numpy cimport int64_t +from numpy cimport int64_t, ndarray cnp.import_array() # TODO: formalize having _libs.properties "above" tslibs in the dependency structure @@ -1380,24 +1380,7 @@ cdef class BusinessDay(BusinessMixin): @apply_index_wraps def apply_index(self, dtindex): i8other = dtindex.asi8 - time = (i8other % DAY_NANOS).view("timedelta64[ns]") - - # to_period rolls forward to next BDay; track and - # reduce n where it does when rolling forward - asper = dtindex.to_period("B") - - if self.n > 0: - shifted = (dtindex.to_perioddelta("B") - time).asi8 != 0 - - roll = np.where(shifted, self.n - 1, self.n) - shifted = asper._addsub_int_array(roll, operator.add) - else: - # Integer addition is deprecated, so we use _time_shift directly - roll = self.n - shifted = asper._time_shift(roll) - - result = shifted.to_timestamp() + time - return result + return shift_bdays(i8other, self.n) def is_on_offset(self, dt) -> bool: if self.normalize and not _is_normalized(dt): @@ -3995,6 +3978,63 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): 
return np.asarray(out) +cdef ndarray[int64_t] shift_bdays(const int64_t[:] i8other, int periods): + """ + Implementation of BusinessDay.apply_index. + + Parameters + ---------- + i8other : const int64_t[:] + periods : int + + Returns + ------- + ndarray[int64_t] + """ + cdef: + Py_ssize_t i, n = len(i8other) + int64_t[:] result = np.empty(n, dtype="i8") + int64_t val, res + int wday, nadj, days + npy_datetimestruct dts + + for i in range(n): + val = i8other[i] + if val == NPY_NAT: + result[i] = NPY_NAT + else: + # The rest of this is effectively a copy of BusinessDay.apply + nadj = periods + weeks = nadj // 5 + dt64_to_dtstruct(val, &dts) + wday = dayofweek(dts.year, dts.month, dts.day) + + if nadj <= 0 and wday > 4: + # roll forward + nadj += 1 + + nadj -= 5 * weeks + + # nadj is always >= 0 at this point + if nadj == 0 and wday > 4: + # roll back + days = 4 - wday + elif wday > 4: + # roll forward + days = (7 - wday) + (nadj - 1) + elif wday + nadj <= 4: + # shift by n days without leaving the current week + days = nadj + else: + # shift by nadj days plus 2 to get past the weekend + days = nadj + 2 + + res = val + (7 * weeks + days) * DAY_NANOS + result[i] = res + + return result.base + + def shift_month(stamp: datetime, months: int, day_opt: object=None) -> datetime: """ From 13a8502a85adba2645415d2791a1ffe6602be425 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 17:17:50 -0700 Subject: [PATCH 0115/1025] REF: de-duplicate code in liboffsets (#34778) --- pandas/_libs/tslibs/offsets.pyx | 286 +++++++++----------------------- 1 file changed, 79 insertions(+), 207 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 093d53db21dc1..37be1e7aeda40 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3723,136 +3723,14 @@ cdef shift_quarters( out : ndarray[int64_t] """ cdef: - Py_ssize_t i - npy_datetimestruct dts - int count = len(dtindex) - int months_to_roll,
months_since, n, compare_day + Py_ssize_t count = len(dtindex) int64_t[:] out = np.empty(count, dtype="int64") - if day_opt == "start": - with nogil: - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue - - dt64_to_dtstruct(dtindex[i], &dts) - n = quarters - - months_since = (dts.month - q1start_month) % modby - compare_day = get_day_of_month(&dts, day_opt) - - # offset semantics - if on the anchor point and going backwards - # shift to next - if n <= 0 and (months_since != 0 or - (months_since == 0 and dts.day > compare_day)): - # make sure to roll forward, so negate - n += 1 - elif n > 0 and (months_since == 0 and dts.day < compare_day): - # pretend to roll back if on same month but - # before compare_day - n -= 1 - - dts.year = year_add_months(dts, modby * n - months_since) - dts.month = month_add_months(dts, modby * n - months_since) - dts.day = get_day_of_month(&dts, day_opt) - - out[i] = dtstruct_to_dt64(&dts) - - elif day_opt == "end": - with nogil: - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue - - dt64_to_dtstruct(dtindex[i], &dts) - n = quarters - - months_since = (dts.month - q1start_month) % modby - compare_day = get_day_of_month(&dts, day_opt) - - if n <= 0 and (months_since != 0 or - (months_since == 0 and dts.day > compare_day)): - # make sure to roll forward, so negate - n += 1 - elif n > 0 and (months_since == 0 and dts.day < compare_day): - # pretend to roll back if on same month but - # before compare_day - n -= 1 - - dts.year = year_add_months(dts, modby * n - months_since) - dts.month = month_add_months(dts, modby * n - months_since) - dts.day = get_day_of_month(&dts, day_opt) - - out[i] = dtstruct_to_dt64(&dts) - - elif day_opt == "business_start": - with nogil: - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue - - dt64_to_dtstruct(dtindex[i], &dts) - n = quarters - - months_since = (dts.month - q1start_month) % modby - # compare_day is only 
relevant for comparison in the case - # where months_since == 0. - compare_day = get_day_of_month(&dts, day_opt) - - if n <= 0 and (months_since != 0 or - (months_since == 0 and dts.day > compare_day)): - # make sure to roll forward, so negate - n += 1 - elif n > 0 and (months_since == 0 and dts.day < compare_day): - # pretend to roll back if on same month but - # before compare_day - n -= 1 - - dts.year = year_add_months(dts, modby * n - months_since) - dts.month = month_add_months(dts, modby * n - months_since) - - dts.day = get_day_of_month(&dts, day_opt) - - out[i] = dtstruct_to_dt64(&dts) - - elif day_opt == "business_end": - with nogil: - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue - - dt64_to_dtstruct(dtindex[i], &dts) - n = quarters - - months_since = (dts.month - q1start_month) % modby - # compare_day is only relevant for comparison in the case - # where months_since == 0. - compare_day = get_day_of_month(&dts, day_opt) - - if n <= 0 and (months_since != 0 or - (months_since == 0 and dts.day > compare_day)): - # make sure to roll forward, so negate - n += 1 - elif n > 0 and (months_since == 0 and dts.day < compare_day): - # pretend to roll back if on same month but - # before compare_day - n -= 1 - - dts.year = year_add_months(dts, modby * n - months_since) - dts.month = month_add_months(dts, modby * n - months_since) - - dts.day = get_day_of_month(&dts, day_opt) - - out[i] = dtstruct_to_dt64(&dts) - - else: + if day_opt not in ["start", "end", "business_start", "business_end"]: raise ValueError("day must be None, 'start', 'end', " "'business_start', or 'business_end'") + _shift_quarters(dtindex, out, count, quarters, q1start_month, day_opt, modby) return np.asarray(out) @@ -3872,7 +3750,6 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): Py_ssize_t i npy_datetimestruct dts int count = len(dtindex) - int months_to_roll int64_t[:] out = np.empty(count, dtype="int64") if day_opt is None: @@ 
-3888,94 +3765,90 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): dts.day = min(dts.day, get_days_in_month(dts.year, dts.month)) out[i] = dtstruct_to_dt64(&dts) - elif day_opt == "start": - with nogil: - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue - - dt64_to_dtstruct(dtindex[i], &dts) - months_to_roll = months - compare_day = get_day_of_month(&dts, day_opt) + elif day_opt in ["start", "end", "business_start", "business_end"]: + _shift_months(dtindex, out, count, months, day_opt) - # offset semantics - if on the anchor point and going backwards - # shift to next - months_to_roll = roll_convention(dts.day, months_to_roll, - compare_day) - - dts.year = year_add_months(dts, months_to_roll) - dts.month = month_add_months(dts, months_to_roll) - dts.day = get_day_of_month(&dts, day_opt) - - out[i] = dtstruct_to_dt64(&dts) - elif day_opt == "end": - with nogil: - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue + else: + raise ValueError("day must be None, 'start', 'end', " + "'business_start', or 'business_end'") - dt64_to_dtstruct(dtindex[i], &dts) - months_to_roll = months - compare_day = get_day_of_month(&dts, day_opt) + return np.asarray(out) - # similar semantics - when adding shift forward by one - # month if already at an end of month - months_to_roll = roll_convention(dts.day, months_to_roll, - compare_day) - dts.year = year_add_months(dts, months_to_roll) - dts.month = month_add_months(dts, months_to_roll) +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline void _shift_months(const int64_t[:] dtindex, + int64_t[:] out, + Py_ssize_t count, + int months, + str day_opt) nogil: + """See shift_months.__doc__""" + cdef: + Py_ssize_t i + int months_to_roll, compare_day + npy_datetimestruct dts - dts.day = get_day_of_month(&dts, day_opt) - out[i] = dtstruct_to_dt64(&dts) + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue - 
elif day_opt == "business_start": - with nogil: - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue + dt64_to_dtstruct(dtindex[i], &dts) + months_to_roll = months + compare_day = get_day_of_month(&dts, day_opt) - dt64_to_dtstruct(dtindex[i], &dts) - months_to_roll = months - compare_day = get_day_of_month(&dts, day_opt) + months_to_roll = roll_convention(dts.day, months_to_roll, + compare_day) - months_to_roll = roll_convention(dts.day, months_to_roll, - compare_day) + dts.year = year_add_months(dts, months_to_roll) + dts.month = month_add_months(dts, months_to_roll) + dts.day = get_day_of_month(&dts, day_opt) - dts.year = year_add_months(dts, months_to_roll) - dts.month = month_add_months(dts, months_to_roll) + out[i] = dtstruct_to_dt64(&dts) - dts.day = get_day_of_month(&dts, day_opt) - out[i] = dtstruct_to_dt64(&dts) - elif day_opt == "business_end": - with nogil: - for i in range(count): - if dtindex[i] == NPY_NAT: - out[i] = NPY_NAT - continue +@cython.wraparound(False) +@cython.boundscheck(False) +cdef inline void _shift_quarters(const int64_t[:] dtindex, + int64_t[:] out, + Py_ssize_t count, + int quarters, + int q1start_month, + str day_opt, + int modby) nogil: + """See shift_quarters.__doc__""" + cdef: + Py_ssize_t i + int months_since, compare_day, n + npy_datetimestruct dts - dt64_to_dtstruct(dtindex[i], &dts) - months_to_roll = months - compare_day = get_day_of_month(&dts, day_opt) + for i in range(count): + if dtindex[i] == NPY_NAT: + out[i] = NPY_NAT + continue - months_to_roll = roll_convention(dts.day, months_to_roll, - compare_day) + dt64_to_dtstruct(dtindex[i], &dts) + n = quarters - dts.year = year_add_months(dts, months_to_roll) - dts.month = month_add_months(dts, months_to_roll) + months_since = (dts.month - q1start_month) % modby + compare_day = get_day_of_month(&dts, day_opt) - dts.day = get_day_of_month(&dts, day_opt) - out[i] = dtstruct_to_dt64(&dts) + # offset semantics - if on the anchor point and going 
backwards + # shift to next + if n <= 0 and (months_since != 0 or + (months_since == 0 and dts.day > compare_day)): + # make sure to roll forward, so negate + n += 1 + elif n > 0 and (months_since == 0 and dts.day < compare_day): + # pretend to roll back if on same month but + # before compare_day + n -= 1 - else: - raise ValueError("day must be None, 'start', 'end', " - "'business_start', or 'business_end'") + dts.year = year_add_months(dts, modby * n - months_since) + dts.month = month_add_months(dts, modby * n - months_since) + dts.day = get_day_of_month(&dts, day_opt) - return np.asarray(out) + out[i] = dtstruct_to_dt64(&dts) cdef ndarray[int64_t] shift_bdays(const int64_t[:] i8other, int periods): @@ -4035,8 +3908,7 @@ cdef ndarray[int64_t] shift_bdays(const int64_t[:] i8other, int periods): return result.base -def shift_month(stamp: datetime, months: int, - day_opt: object=None) -> datetime: +def shift_month(stamp: datetime, months: int, day_opt: object=None) -> datetime: """ Given a datetime (or Timestamp) `stamp`, an integer `months` and an option `day_opt`, return a new datetimelike that many months later, @@ -4078,14 +3950,14 @@ def shift_month(stamp: datetime, months: int, if day_opt is None: days_in_month = get_days_in_month(year, month) day = min(stamp.day, days_in_month) - elif day_opt == 'start': + elif day_opt == "start": day = 1 - elif day_opt == 'end': + elif day_opt == "end": day = get_days_in_month(year, month) - elif day_opt == 'business_start': + elif day_opt == "business_start": # first business day of month day = get_firstbday(year, month) - elif day_opt == 'business_end': + elif day_opt == "business_end": # last business day of month day = get_lastbday(year, month) elif is_integer_object(day_opt): @@ -4126,15 +3998,15 @@ cdef inline int get_day_of_month(npy_datetimestruct* dts, day_opt) nogil except? 
cdef: int days_in_month - if day_opt == 'start': + if day_opt == "start": return 1 - elif day_opt == 'end': + elif day_opt == "end": days_in_month = get_days_in_month(dts.year, dts.month) return days_in_month - elif day_opt == 'business_start': + elif day_opt == "business_start": # first business day of month return get_firstbday(dts.year, dts.month) - elif day_opt == 'business_end': + elif day_opt == "business_end": # last business day of month return get_lastbday(dts.year, dts.month) elif day_opt is not None: From aa6ac219ab23cbe545b9ca7c42b301ccc900c449 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 17:57:16 -0700 Subject: [PATCH 0116/1025] CLN: disallow tuple in to_offset (#34703) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 43 +++---------------- .../indexes/datetimes/test_constructors.py | 11 ++--- .../tests/indexes/period/test_constructors.py | 10 ++--- pandas/tests/indexes/period/test_period.py | 6 --- pandas/tests/scalar/period/test_period.py | 6 ++- pandas/tests/tslibs/test_to_offset.py | 9 +++- 7 files changed, 27 insertions(+), 59 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8db3d7affc5a5..5f8668f85c3b3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -820,6 +820,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.intersection` and :meth:`TimedeltaIndex.intersection` with results not having the correct ``name`` attribute (:issue:`33904`) - Bug in :meth:`DatetimeArray.__setitem__`, :meth:`TimedeltaArray.__setitem__`, :meth:`PeriodArray.__setitem__` incorrectly allowing values with ``int64`` dtype to be silently cast (:issue:`33717`) - Bug in subtracting :class:`TimedeltaIndex` from :class:`Period` incorrectly raising ``TypeError`` in some cases where it should succeed and ``IncompatibleFrequency`` in some cases where it should raise ``TypeError`` (:issue:`33883`) +- The ``freq`` keyword in :class:`Period`, 
:func:`date_range`, :func:`period_range`, :func:`pd.tseries.frequencies.to_offset` no longer allows tuples, pass as string instead (:issue:`34703`) Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 37be1e7aeda40..1dae34e1ac49c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3482,36 +3482,6 @@ INVALID_FREQ_ERR_MSG = "Invalid frequency: {0}" _offset_map = {} -cdef _base_and_stride(str freqstr): - """ - Return base freq and stride info from string representation - - Returns - ------- - base : str - stride : int - - Examples - -------- - _base_and_stride('5Min') -> 'Min', 5 - """ - groups = opattern.match(freqstr) - - if not groups: - raise ValueError(f"Could not evaluate {freqstr}") - - stride = groups.group(1) - - if len(stride): - stride = int(stride) - else: - stride = 1 - - base = groups.group(2) - - return base, stride - - # TODO: better name? def _get_offset(name: str) -> BaseOffset: """ @@ -3574,10 +3544,10 @@ cpdef to_offset(freq): >>> to_offset("1D1H") <25 * Hours> - >>> to_offset(("W", 2)) + >>> to_offset("2W") <2 * Weeks: weekday=6> - >>> to_offset((2, "B")) + >>> to_offset("2B") <2 * BusinessDays> >>> to_offset(pd.Timedelta(days=1)) @@ -3593,12 +3563,9 @@ cpdef to_offset(freq): return freq if isinstance(freq, tuple): - name = freq[0] - stride = freq[1] - if isinstance(stride, str): - name, stride = stride, name - name, _ = _base_and_stride(name) - delta = _get_offset(name) * stride + raise TypeError( + f"to_offset does not support tuples {freq}, pass as a string instead" + ) elif isinstance(freq, timedelta): return delta_to_tick(freq) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index b15549839de03..c150e7901c86a 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -946,11 +946,6 @@ def 
test_datetimeindex_constructor_misc(self): assert idx[0] == sdate + 0 * offsets.BDay() assert idx.freq == "B" - idx = date_range(end=edate, freq=("D", 5), periods=20) - assert len(idx) == 20 - assert idx[-1] == edate - assert idx.freq == "5D" - idx1 = date_range(start=sdate, end=edate, freq="W-SUN") idx2 = date_range(start=sdate, end=edate, freq=offsets.Week(weekday=6)) assert len(idx1) == len(idx2) @@ -979,6 +974,12 @@ def test_pass_datetimeindex_to_index(self): tm.assert_numpy_array_equal(idx.values, expected.values) + def test_date_range_tuple_freq_raises(self): + # GH#34703 + edate = datetime(2000, 1, 1) + with pytest.raises(TypeError, match="pass as a string instead"): + date_range(end=edate, freq=("D", 5), periods=20) + def test_timestamp_constructor_invalid_fold_raise(): # Test for #25057 diff --git a/pandas/tests/indexes/period/test_constructors.py b/pandas/tests/indexes/period/test_constructors.py index 4ec7ef64e2272..f85f37e4127c3 100644 --- a/pandas/tests/indexes/period/test_constructors.py +++ b/pandas/tests/indexes/period/test_constructors.py @@ -463,12 +463,6 @@ def test_constructor(self): assert (i1 == i2).all() assert i1.freq == i2.freq - end_intv = Period("2006-12-31", ("w", 1)) - i2 = period_range(end=end_intv, periods=10) - assert len(i1) == len(i2) - assert (i1 == i2).all() - assert i1.freq == i2.freq - end_intv = Period("2005-05-01", "B") i1 = period_range(start=start, end=end_intv) @@ -490,6 +484,10 @@ def test_constructor(self): with pytest.raises(IncompatibleFrequency, match=msg): PeriodIndex(vals) + # tuple freq disallowed GH#34703 + with pytest.raises(TypeError, match="pass as a string instead"): + Period("2006-12-31", ("w", 1)) + @pytest.mark.parametrize( "freq", ["M", "Q", "A", "D", "B", "T", "S", "L", "U", "N", "H"] ) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index d247d6571f5d0..47617802be11c 100644 --- a/pandas/tests/indexes/period/test_period.py +++ 
b/pandas/tests/indexes/period/test_period.py @@ -172,12 +172,6 @@ def test_period_index_length(self): assert (i1 == i2).all() assert i1.freq == i2.freq - end_intv = Period("2006-12-31", ("w", 1)) - i2 = period_range(end=end_intv, periods=10) - assert len(i1) == len(i2) - assert (i1 == i2).all() - assert i1.freq == i2.freq - msg = "start and end must have same freq" with pytest.raises(ValueError, match=msg): period_range(start=start, end=end_intv) diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index 702899f163e06..dcef0615121c1 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -48,8 +48,6 @@ def test_construction(self): i1 = Period("1982", freq="min") i2 = Period("1982", freq="MIN") assert i1 == i2 - i2 = Period("1982", freq=("Min", 1)) - assert i1 == i2 i1 = Period(year=2005, month=3, day=1, freq="D") i2 = Period("3/1/2005", freq="D") @@ -80,6 +78,10 @@ def test_construction(self): with pytest.raises(ValueError, match=msg): Period("2007-1-1", freq="X") + # GH#34703 tuple freq disallowed + with pytest.raises(TypeError, match="pass as a string instead"): + Period("1982", freq=("Min", 1)) + def test_construction_bday(self): # Biz day construction, roll forward if non-weekday diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py index 04be0e445a3b2..93e5e2c801c09 100644 --- a/pandas/tests/tslibs/test_to_offset.py +++ b/pandas/tests/tslibs/test_to_offset.py @@ -10,7 +10,6 @@ [ (to_offset("10us"), offsets.Micro(10)), (offsets.Hour(), offsets.Hour()), - ((5, "T"), offsets.Minute(5)), ("2h30min", offsets.Minute(150)), ("2h 30min", offsets.Minute(150)), ("2h30min15s", offsets.Second(150 * 60 + 15)), @@ -89,10 +88,16 @@ def test_to_offset_invalid(freqstr): def test_to_offset_no_evaluate(): - with pytest.raises(ValueError, match="Could not evaluate"): + msg = str(("", "")) + with pytest.raises(TypeError, match=msg): to_offset(("", 
"")) +def test_to_offset_tuple_unsupported(): + with pytest.raises(TypeError, match="pass as a string instead"): + to_offset((5, "T")) + + @pytest.mark.parametrize( "freqstr,expected", [ From 7052c7bafc33c722daab6fd80884191b75526e6d Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 14 Jun 2020 22:19:39 -0400 Subject: [PATCH 0117/1025] TST: remove super slow cases on upsample_nearest_limit (#34780) --- .pre-commit-config.yaml | 2 +- pandas/tests/io/parser/test_multi_thread.py | 3 +++ pandas/tests/resample/test_datetime_index.py | 22 +------------------- 3 files changed, 5 insertions(+), 22 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 896765722bf32..b7fd797fb7230 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: rev: 19.10b0 hooks: - id: black - language_version: python3.7 + language_version: python3 - repo: https://gitlab.com/pycqa/flake8 rev: 3.7.7 hooks: diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 458ff4da55ed3..d50560c684084 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -6,6 +6,7 @@ from multiprocessing.pool import ThreadPool import numpy as np +import pytest import pandas as pd from pandas import DataFrame @@ -34,6 +35,7 @@ def _construct_dataframe(num_rows): return df +@pytest.mark.slow def test_multi_thread_string_io_read_csv(all_parsers): # see gh-11786 parser = all_parsers @@ -126,6 +128,7 @@ def reader(arg): return final_dataframe +@pytest.mark.slow def test_multi_thread_path_multipart_read_csv(all_parsers): # see gh-11786 num_tasks = 4 diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 8d7d45f54ad5f..43d2bf80505db 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -2,7 +2,6 @@ from functools import partial from io import StringIO 
-from dateutil.tz import tzlocal import numpy as np import pytest import pytz @@ -477,15 +476,10 @@ def test_upsample_with_limit(): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize("freq", ["Y", "10M", "5D", "10H", "5Min", "10S"]) +@pytest.mark.parametrize("freq", ["5D", "10H", "5Min", "10S"]) @pytest.mark.parametrize("rule", ["Y", "3M", "15D", "30H", "15Min", "30S"]) def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule): # GH 33939 - tz = tz_aware_fixture - if str(tz) == "tzlocal()" and rule == "30S" and freq in ["Y", "10M"]: - # GH#34413 separate these so we can mark as slow, see - # test_nearest_upsample_with_limit_tzlocal - return rng = date_range("1/1/2000", periods=3, freq=freq, tz=tz_aware_fixture) ts = Series(np.random.randn(len(rng)), rng) @@ -494,20 +488,6 @@ def test_nearest_upsample_with_limit(tz_aware_fixture, freq, rule): tm.assert_series_equal(result, expected) -@pytest.mark.slow -@pytest.mark.parametrize("freq", ["Y", "10M"]) -def test_nearest_upsample_with_limit_tzlocal(freq): - # GH#33939, GH#34413 split off from test_nearest_upsample_with_limit - rule = "30S" - tz = tzlocal() - rng = date_range("1/1/2000", periods=3, freq=freq, tz=tz) - ts = Series(np.random.randn(len(rng)), rng) - - result = ts.resample(rule).nearest(limit=2) - expected = ts.reindex(result.index, method="nearest", limit=2) - tm.assert_series_equal(result, expected) - - def test_resample_ohlc(series): s = series From a0c37386a748eddb7b2d52443fb244a1aec0c3db Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 19:19:53 -0700 Subject: [PATCH 0118/1025] REF: implement _roll_qtrday (#34781) --- pandas/_libs/tslibs/offsets.pyx | 34 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 1dae34e1ac49c..1e002f4a1af88 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3786,7 +3786,7 @@ cdef inline 
void _shift_quarters(const int64_t[:] dtindex, """See shift_quarters.__doc__""" cdef: Py_ssize_t i - int months_since, compare_day, n + int months_since, n npy_datetimestruct dts for i in range(count): @@ -3798,18 +3798,7 @@ cdef inline void _shift_quarters(const int64_t[:] dtindex, n = quarters months_since = (dts.month - q1start_month) % modby - compare_day = get_day_of_month(&dts, day_opt) - - # offset semantics - if on the anchor point and going backwards - # shift to next - if n <= 0 and (months_since != 0 or - (months_since == 0 and dts.day > compare_day)): - # make sure to roll forward, so negate - n += 1 - elif n > 0 and (months_since == 0 and dts.day < compare_day): - # pretend to roll back if on same month but - # before compare_day - n -= 1 + n = _roll_qtrday(&dts, n, months_since, day_opt) dts.year = year_add_months(dts, modby * n - months_since) dts.month = month_add_months(dts, modby * n - months_since) @@ -4009,7 +3998,7 @@ cpdef int roll_convention(int other, int n, int compare) nogil: def roll_qtrday(other: datetime, n: int, month: int, - day_opt: object, modby: int=3) -> int: + day_opt: object, modby: int) -> int: """ Possibly increment or decrement the number of periods to shift based on rollforward/rollbackward conventions. @@ -4037,25 +4026,30 @@ def roll_qtrday(other: datetime, n: int, month: int, npy_datetimestruct dts pydate_to_dtstruct(other, &dts) - # TODO: with small adjustments this could be used in shift_quarters - if modby == 12: # We care about the month-of-year, not month-of-quarter, so skip mod months_since = other.month - month else: months_since = other.month % modby - month % modby + return _roll_qtrday(&dts, n, months_since, day_opt) + + +cdef inline int _roll_qtrday(npy_datetimestruct* dts, + int n, + int months_since, + str day_opt) nogil except? 
-1: + """See roll_qtrday.__doc__""" + if n > 0: if months_since < 0 or (months_since == 0 and - other.day < get_day_of_month(&dts, - day_opt)): + dts.day < get_day_of_month(dts, day_opt)): # pretend to roll back if on same month but # before compare_day n -= 1 else: if months_since > 0 or (months_since == 0 and - other.day > get_day_of_month(&dts, - day_opt)): + dts.day > get_day_of_month(dts, day_opt)): # make sure to roll forward, so negate n += 1 return n From 2ad8a867a47c760df0e2c588ed6de4eea57dd682 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 14 Jun 2020 19:20:07 -0700 Subject: [PATCH 0119/1025] REF: avoid using DTA/PA methods in apply_index (#34782) --- pandas/_libs/tslibs/offsets.pyx | 50 +++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 1e002f4a1af88..9234cfb295d65 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -2507,6 +2507,8 @@ cdef class Week(SingleConstructorOffset): else: return self._end_apply_index(dtindex) + @cython.wraparound(False) + @cython.boundscheck(False) def _end_apply_index(self, dtindex): """ Add self to the given DatetimeIndex, specialized for case where @@ -2518,31 +2520,37 @@ cdef class Week(SingleConstructorOffset): Returns ------- - result : DatetimeIndex + ndarray[int64_t] """ - i8other = dtindex.asi8 - off = (i8other % DAY_NANOS).view("timedelta64[ns]") + cdef: + int64_t[:] i8other = dtindex.view("i8") + Py_ssize_t i, count = len(i8other) + int64_t val + int64_t[:] out = np.empty(count, dtype="i8") + npy_datetimestruct dts + int wday, days, weeks, n = self.n + int anchor_weekday = self.weekday - base = self._period_dtype_code - base_period = dtindex.to_period(base) + with nogil: + for i in range(count): + val = i8other[i] + if val == NPY_NAT: + out[i] = NPY_NAT + continue - if self.n > 0: - # when adding, dates on end roll to next - normed = dtindex - off + 
Timedelta(1, "D") - Timedelta(1, "ns") - roll = np.where( - base_period.to_timestamp(how="end") == normed, self.n, self.n - 1 - ) - # integer-array addition on PeriodIndex is deprecated, - # so we use _addsub_int_array directly - shifted = base_period._addsub_int_array(roll, operator.add) - base = shifted.to_timestamp(how="end") - else: - # integer addition on PeriodIndex is deprecated, - # so we use _time_shift directly - roll = self.n - base = base_period._time_shift(roll).to_timestamp(how="end") + dt64_to_dtstruct(val, &dts) + wday = dayofweek(dts.year, dts.month, dts.day) + + days = 0 + weeks = n + if wday != anchor_weekday: + days = (anchor_weekday - wday) % 7 + if weeks > 0: + weeks -= 1 + + out[i] = val + (7 * weeks + days) * DAY_NANOS - return base + off + Timedelta(1, "ns") - Timedelta(1, "D") + return out.base def is_on_offset(self, dt) -> bool: if self.normalize and not _is_normalized(dt): From f87b480967efb7c9c631f95944bc866bb8f0bce7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 15 Jun 2020 05:23:05 -0700 Subject: [PATCH 0120/1025] REF: avoid DTA/PA methods in SemiMonthOffset.apply_index (#34783) --- pandas/_libs/tslibs/offsets.pyx | 157 ++++++++++---------------------- 1 file changed, 49 insertions(+), 108 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 9234cfb295d65..d22f2b9117326 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -2269,56 +2269,62 @@ cdef class SemiMonthOffset(SingleConstructorOffset): raise NotImplementedError(self) @apply_index_wraps + @cython.wraparound(False) + @cython.boundscheck(False) def apply_index(self, dtindex): - # determine how many days away from the 1st of the month we are - - dti = dtindex - i8other = dtindex.asi8 - days_from_start = dtindex.to_perioddelta("M").asi8 - delta = Timedelta(days=self.day_of_month - 1).value - - # get boolean array for each element before the day_of_month - before_day_of_month = days_from_start < 
delta - - # get boolean array for each element after the day_of_month - after_day_of_month = days_from_start > delta - - # determine the correct n for each date in dtindex - roll = self._get_roll(i8other, before_day_of_month, after_day_of_month) - - # isolate the time since it will be striped away one the next line - time = (i8other % DAY_NANOS).view("timedelta64[ns]") - - # apply the correct number of months - - # integer-array addition on PeriodIndex is deprecated, - # so we use _addsub_int_array directly - asper = dtindex.to_period("M") + cdef: + int64_t[:] i8other = dtindex.view("i8") + Py_ssize_t i, count = len(i8other) + int64_t val + int64_t[:] out = np.empty(count, dtype="i8") + npy_datetimestruct dts + int months, to_day, nadj, n = self.n + int days_in_month, day, anchor_dom = self.day_of_month + bint is_start = isinstance(self, SemiMonthBegin) - shifted = asper._addsub_int_array(roll // 2, operator.add) - dtindex = type(dti)(shifted.to_timestamp()) - dt64other = np.asarray(dtindex) + with nogil: + for i in range(count): + val = i8other[i] + if val == NPY_NAT: + out[i] = NPY_NAT + continue - # apply the correct day - dt64result = self._apply_index_days(dt64other, roll) + dt64_to_dtstruct(val, &dts) + day = dts.day + + # Adjust so that we are always looking at self.day_of_month, + # incrementing/decrementing n if necessary. + nadj = roll_convention(day, n, anchor_dom) + + days_in_month = get_days_in_month(dts.year, dts.month) + # For SemiMonthBegin on other.day == 1 and + # SemiMonthEnd on other.day == days_in_month, + # shifting `other` to `self.day_of_month` _always_ requires + # incrementing/decrementing `n`, regardless of whether it is + # initially positive. 
+ if is_start and (n <= 0 and day == 1): + nadj -= 1 + elif (not is_start) and (n > 0 and day == days_in_month): + nadj += 1 + + if is_start: + # See also: SemiMonthBegin._apply + months = nadj // 2 + nadj % 2 + to_day = 1 if nadj % 2 else anchor_dom - return dt64result + time + else: + # See also: SemiMonthEnd._apply + months = nadj // 2 + to_day = 31 if nadj % 2 else anchor_dom - def _get_roll(self, i8other, before_day_of_month, after_day_of_month): - """ - Return an array with the correct n for each date in dtindex. + dts.year = year_add_months(dts, months) + dts.month = month_add_months(dts, months) + days_in_month = get_days_in_month(dts.year, dts.month) + dts.day = min(to_day, days_in_month) - The roll array is based on the fact that dtindex gets rolled back to - the first day of the month. - """ - # before_day_of_month and after_day_of_month are ndarray[bool] - raise NotImplementedError + out[i] = dtstruct_to_dt64(&dts) - def _apply_index_days(self, dt64other, roll): - """ - Apply the correct day for each date in dt64other. - """ - raise NotImplementedError + return out.base cdef class SemiMonthEnd(SemiMonthOffset): @@ -2347,39 +2353,6 @@ cdef class SemiMonthEnd(SemiMonthOffset): day = 31 if n % 2 else self.day_of_month return shift_month(other, months, day) - def _get_roll(self, i8other, before_day_of_month, after_day_of_month): - # before_day_of_month and after_day_of_month are ndarray[bool] - n = self.n - is_month_end = get_start_end_field(i8other, "is_month_end") - if n > 0: - roll_end = np.where(is_month_end, 1, 0) - roll_before = np.where(before_day_of_month, n, n + 1) - roll = roll_end + roll_before - elif n == 0: - roll_after = np.where(after_day_of_month, 2, 0) - roll_before = np.where(~after_day_of_month, 1, 0) - roll = roll_before + roll_after - else: - roll = np.where(after_day_of_month, n + 2, n + 1) - return roll - - def _apply_index_days(self, dt64other, roll): - """ - Add days portion of offset to dt64other. 
- - Parameters - ---------- - dt64other : ndarray[datetime64[ns]] - roll : ndarray[int64_t] - - Returns - ------- - ndarray[datetime64[ns]] - """ - nanos = (roll % 2) * Timedelta(days=self.day_of_month).value - dt64other += nanos.astype("timedelta64[ns]") - return dt64other + Timedelta(days=-1) - cdef class SemiMonthBegin(SemiMonthOffset): """ @@ -2405,38 +2378,6 @@ cdef class SemiMonthBegin(SemiMonthOffset): day = 1 if n % 2 else self.day_of_month return shift_month(other, months, day) - def _get_roll(self, i8other, before_day_of_month, after_day_of_month): - # before_day_of_month and after_day_of_month are ndarray[bool] - n = self.n - is_month_start = get_start_end_field(i8other, "is_month_start") - if n > 0: - roll = np.where(before_day_of_month, n, n + 1) - elif n == 0: - roll_start = np.where(is_month_start, 0, 1) - roll_after = np.where(after_day_of_month, 1, 0) - roll = roll_start + roll_after - else: - roll_after = np.where(after_day_of_month, n + 2, n + 1) - roll_start = np.where(is_month_start, -1, 0) - roll = roll_after + roll_start - return roll - - def _apply_index_days(self, dt64other, roll): - """ - Add days portion of offset to dt64other. 
- - Parameters - ---------- - dt64other : ndarray[datetime64[ns]] - roll : ndarray[int64_t] - - Returns - ------- - ndarray[datetime64[ns]] - """ - nanos = (roll % 2) * Timedelta(days=self.day_of_month - 1).value - return dt64other + nanos.astype("timedelta64[ns]") - # --------------------------------------------------------------------- # Week-Based Offset Classes From b939acc1f7099cce9a48976ec32f1dd9d0848577 Mon Sep 17 00:00:00 2001 From: rhshadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 15 Jun 2020 08:29:18 -0400 Subject: [PATCH 0121/1025] BUG: Groupby with as_index=False raises error when type is Category (#34767) --- doc/source/whatsnew/v1.1.0.rst | 20 ++++++++++++++++++++ pandas/core/frame.py | 2 +- pandas/core/groupby/groupby.py | 17 +++++++++++------ pandas/tests/groupby/test_groupby.py | 7 +++++-- pandas/tests/groupby/test_size.py | 16 ++++++++++++++++ 5 files changed, 53 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f8668f85c3b3..f7e36de059e84 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -676,6 +676,25 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma df.groupby("a", as_index=False).nunique() +The method :meth:`core.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a `DataFrame` instead of a `Series`. (:issue:`32599`) + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.groupby("a", as_index=False).size() + Out[4]: + a + x 2 + y 2 + dtype: int64 + +*New behavior*: + +.. ipython:: python + + df.groupby("a", as_index=False).size() + .. 
_whatsnew_110.api_breaking.apply_applymap_first_once: apply and applymap on ``DataFrame`` evaluates first row/column only once @@ -995,6 +1014,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. (:issue:`34422`) - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) + Reshaping ^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b522920ec9f23..68c06715e1ea4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5450,7 +5450,7 @@ def value_counts( if subset is None: subset = self.columns.tolist() - counts = self.groupby(subset).size() + counts = self.groupby(subset).grouper.size() if sort: counts = counts.sort_values(ascending=ascending) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c2be8d96402df..904049923859d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -942,9 +942,9 @@ def _transform_should_cast(self, func_nm: str) -> bool: bool Whether transform should attempt to cast the result of aggregation """ - return (self.size().fillna(0) > 0).any() and ( - func_nm not in base.cython_cast_blacklist - ) + filled_series = self.grouper.size().fillna(0) + assert filled_series is not None + return filled_series.gt(0).any() and func_nm not in base.cython_cast_blacklist def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): output: Dict[base.OutputKey, np.ndarray] = {} @@ -1507,14 +1507,15 @@ def sem(self, ddof: int = 1): @Substitution(name="groupby") @Appender(_common_see_also) - def size(self): + def size(self) -> FrameOrSeriesUnion: """ Compute group sizes. Returns ------- - Series - Number of rows in each group. 
+ DataFrame or Series + Number of rows in each group as a Series if as_index is True + or a DataFrame if as_index is False. """ result = self.grouper.size() @@ -1523,6 +1524,10 @@ def size(self): result = self._obj_1d_constructor(result, name=self.obj.name) else: result = self._obj_1d_constructor(result) + + if not self.as_index: + result = result.rename("size").reset_index() + return self._reindex_output(result, fill_value=0) @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 80f34bb91cdfd..664c30e003632 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -668,11 +668,14 @@ def test_ops_not_as_index(reduction_func): if reduction_func in ("corrwith",): pytest.skip("Test not applicable") - if reduction_func in ("nth", "ngroup", "size",): + if reduction_func in ("nth", "ngroup",): pytest.skip("Skip until behavior is determined (GH #5755)") df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"]) - expected = getattr(df.groupby("a"), reduction_func)().reset_index() + expected = getattr(df.groupby("a"), reduction_func)() + if reduction_func == "size": + expected = expected.rename("size") + expected = expected.reset_index() g = df.groupby("a", as_index=False) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 42bccc67fe0f8..9cff8b966dad0 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -44,3 +44,19 @@ def test_size_period_index(): grp = ser.groupby(level="A") result = grp.size() tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize("as_index", [True, False]) +def test_size_on_categorical(as_index): + df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) + df["A"] = df["A"].astype("category") + result = df.groupby(["A", "B"], as_index=as_index).size() + + expected = DataFrame( + [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 
2, 1]], columns=["A", "B", "size"], + ) + expected["A"] = expected["A"].astype("category") + if as_index: + expected = expected.set_index(["A", "B"])["size"].rename(None) + + tm.assert_equal(result, expected) From 61b59dfb9a882e85274061f9d7ce6e467f5ea5a6 Mon Sep 17 00:00:00 2001 From: Giovanni Lanzani Date: Mon, 15 Jun 2020 14:53:17 +0200 Subject: [PATCH 0122/1025] TST Add test for rolling window, see GH 34605 (#34705) --- pandas/tests/window/test_rolling.py | 33 +++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f9b0e6856337b..8d72e2cb92ca9 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -663,3 +663,36 @@ def test_iter_rolling_datetime(expected, expected_index, window): for (expected, actual) in zip(expected, ser.rolling(window)): tm.assert_series_equal(actual, expected) + + +@pytest.mark.parametrize( + "grouping,_index", + [ + ( + {"level": 0}, + pd.MultiIndex.from_tuples( + [(0, 0), (0, 0), (1, 1), (1, 1), (1, 1)], names=[None, None] + ), + ), + ( + {"by": "X"}, + pd.MultiIndex.from_tuples( + [(0, 0), (1, 0), (2, 1), (3, 1), (4, 1)], names=["X", None] + ), + ), + ], +) +def test_rolling_positional_argument(grouping, _index, raw): + # GH 34605 + + def scaled_sum(*args): + if len(args) < 2: + raise ValueError("The function needs two arguments") + array, scale = args + return array.sum() / scale + + df = DataFrame(data={"X": range(5)}, index=[0, 0, 1, 1, 1]) + + expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index) + result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,)) + tm.assert_frame_equal(result, expected) From eae9be0c5d284ce5e5347e11bb6374dc89a7ccf1 Mon Sep 17 00:00:00 2001 From: Jiaxiang Date: Mon, 15 Jun 2020 21:20:33 +0800 Subject: [PATCH 0123/1025] DEPR: Deprecate tshift and integrate it to shift (#34545) --- doc/source/user_guide/timeseries.rst | 20 +-- 
doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/generic.py | 159 +++++++++++------- pandas/tests/frame/methods/test_shift.py | 59 ++++++- pandas/tests/generic/test_finalize.py | 20 ++- pandas/tests/groupby/test_groupby.py | 1 + pandas/tests/groupby/test_groupby_subclass.py | 1 + pandas/tests/groupby/test_whitelist.py | 1 + pandas/tests/resample/test_datetime_index.py | 4 +- pandas/tests/series/methods/test_shift.py | 54 +++++- 10 files changed, 239 insertions(+), 81 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 5351c3ee6b624..648d93a45d210 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -516,7 +516,7 @@ The ``DatetimeIndex`` class contains many time series related optimizations: * A large range of dates for various offsets are pre-computed and cached under the hood in order to make generating subsequent date ranges very fast (just have to grab a slice). -* Fast shifting using the ``shift`` and ``tshift`` method on pandas objects. +* Fast shifting using the ``shift`` method on pandas objects. * Unioning of overlapping ``DatetimeIndex`` objects with the same frequency is very fast (important for fast data alignment). * Quick access to date fields via properties such as ``year``, ``month``, etc. @@ -1462,23 +1462,19 @@ the pandas objects. The ``shift`` method accepts an ``freq`` argument which can accept a ``DateOffset`` class or other ``timedelta``-like object or also an -:ref:`offset alias `: +:ref:`offset alias `. + +When ``freq`` is specified, ``shift`` method changes all the dates in the index +rather than changing the alignment of the data and the index: .. 
ipython:: python + ts.shift(5, freq='D') ts.shift(5, freq=pd.offsets.BDay()) ts.shift(5, freq='BM') -Rather than changing the alignment of the data and the index, ``DataFrame`` and -``Series`` objects also have a :meth:`~Series.tshift` convenience method that -changes all the dates in the index by a specified number of offsets: - -.. ipython:: python - - ts.tshift(5, freq='D') - -Note that with ``tshift``, the leading entry is no longer NaN because the data -is not being realigned. +Note that when ``freq`` is specified, the leading entry is no longer NaN +because the data is not being realigned. Frequency conversion ~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f7e36de059e84..0c746b197c5b8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -767,6 +767,7 @@ Deprecations - :meth:`DatetimeIndex.week` and `DatetimeIndex.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeIndex.isocalendar().week` instead (:issue:`33595`) - :meth:`DatetimeArray.week` and `DatetimeArray.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeArray.isocalendar().week` instead (:issue:`33595`) - :meth:`DateOffset.__call__` is deprecated and will be removed in a future version, use ``offset + other`` instead (:issue:`34171`) +- :meth:`DataFrame.tshift` and :meth:`Series.tshift` are deprecated and will be removed in a future version, use :meth:`DataFrame.shift` and :meth:`Series.shift` instead (:issue:`11631`) - Indexing an :class:`Index` object with a float key is deprecated, and will raise an ``IndexError`` in the future. You can manually convert to an integer key instead (:issue:`34191`). 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9dcdcaca2f689..7c3e975c889e1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -182,7 +182,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() - _deprecations: FrozenSet[str] = frozenset(["get_values"]) + _deprecations: FrozenSet[str] = frozenset(["get_values", "tshift"]) _metadata: List[str] = [] _is_copy = None _mgr: BlockManager @@ -9162,7 +9162,9 @@ def shift( When `freq` is not passed, shift the index without realigning the data. If `freq` is passed (in this case, the index must be date or datetime, or it will raise a `NotImplementedError`), the index will be - increased using the periods and the `freq`. + increased using the periods and the `freq`. `freq` can be inferred + when specified as "infer" as long as either freq or inferred_freq + attribute is set in the index. Parameters ---------- @@ -9173,6 +9175,9 @@ def shift( If `freq` is specified then the index values are shifted but the data is not realigned. That is, use `freq` if you would like to extend the index when shifting and preserve the original data. + If `freq` is specified as "infer" then it will be inferred from + the freq or inferred_freq attributes of the index. If neither of + those attributes exist, a ValueError is thrown axis : {{0 or 'index', 1 or 'columns', None}}, default None Shift direction. fill_value : object, optional @@ -9182,7 +9187,7 @@ def shift( For datetime, timedelta, or period data, etc. :attr:`NaT` is used. For extension dtypes, ``self.dtype.na_value`` is used. - .. versionchanged:: 0.24.0 + .. versionchanged:: 1.1.0 Returns ------- @@ -9199,46 +9204,99 @@ def shift( Examples -------- - >>> df = pd.DataFrame({{'Col1': [10, 20, 15, 30, 45], - ... 'Col2': [13, 23, 18, 33, 48], - ... 'Col3': [17, 27, 22, 37, 52]}}) + >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45], + ... 
"Col2": [13, 23, 18, 33, 48], + ... "Col3": [17, 27, 22, 37, 52]}}, + ... index=pd.date_range("2020-01-01", "2020-01-05")) + >>> df + Col1 Col2 Col3 + 2020-01-01 10 13 17 + 2020-01-02 20 23 27 + 2020-01-03 15 18 22 + 2020-01-04 30 33 37 + 2020-01-05 45 48 52 >>> df.shift(periods=3) - Col1 Col2 Col3 - 0 NaN NaN NaN - 1 NaN NaN NaN - 2 NaN NaN NaN - 3 10.0 13.0 17.0 - 4 20.0 23.0 27.0 - - >>> df.shift(periods=1, axis='columns') - Col1 Col2 Col3 - 0 NaN 10.0 13.0 - 1 NaN 20.0 23.0 - 2 NaN 15.0 18.0 - 3 NaN 30.0 33.0 - 4 NaN 45.0 48.0 + Col1 Col2 Col3 + 2020-01-01 NaN NaN NaN + 2020-01-02 NaN NaN NaN + 2020-01-03 NaN NaN NaN + 2020-01-04 10.0 13.0 17.0 + 2020-01-05 20.0 23.0 27.0 + + >>> df.shift(periods=1, axis="columns") + Col1 Col2 Col3 + 2020-01-01 NaN 10.0 13.0 + 2020-01-02 NaN 20.0 23.0 + 2020-01-03 NaN 15.0 18.0 + 2020-01-04 NaN 30.0 33.0 + 2020-01-05 NaN 45.0 48.0 >>> df.shift(periods=3, fill_value=0) - Col1 Col2 Col3 - 0 0 0 0 - 1 0 0 0 - 2 0 0 0 - 3 10 13 17 - 4 20 23 27 + Col1 Col2 Col3 + 2020-01-01 0 0 0 + 2020-01-02 0 0 0 + 2020-01-03 0 0 0 + 2020-01-04 10 13 17 + 2020-01-05 20 23 27 + + >>> df.shift(periods=3, freq="D") + Col1 Col2 Col3 + 2020-01-04 10 13 17 + 2020-01-05 20 23 27 + 2020-01-06 15 18 22 + 2020-01-07 30 33 37 + 2020-01-08 45 48 52 + + >>> df.shift(periods=3, freq="infer") + Col1 Col2 Col3 + 2020-01-04 10 13 17 + 2020-01-05 20 23 27 + 2020-01-06 15 18 22 + 2020-01-07 30 33 37 + 2020-01-08 45 48 52 """ if periods == 0: return self.copy() - block_axis = self._get_block_manager_axis(axis) if freq is None: + # when freq is None, data is shifted, index is not + block_axis = self._get_block_manager_axis(axis) new_data = self._mgr.shift( periods=periods, axis=block_axis, fill_value=fill_value ) + return self._constructor(new_data).__finalize__(self, method="shift") + + # when freq is given, index is shifted, data is not + index = self._get_axis(axis) + + if freq == "infer": + freq = getattr(index, "freq", None) + + if freq is None: + freq = 
getattr(index, "inferred_freq", None) + + if freq is None: + msg = "Freq was not set in the index hence cannot be inferred" + raise ValueError(msg) + + elif isinstance(freq, str): + freq = to_offset(freq) + + if isinstance(index, PeriodIndex): + orig_freq = to_offset(index.freq) + if freq != orig_freq: + assert orig_freq is not None # for mypy + raise ValueError( + f"Given freq {freq.rule_code} does not match " + f"PeriodIndex freq {orig_freq.rule_code}" + ) + new_ax = index.shift(periods) else: - return self.tshift(periods, freq) + new_ax = index.shift(periods, freq) - return self._constructor(new_data).__finalize__(self, method="shift") + result = self.set_axis(new_ax, axis) + return result.__finalize__(self, method="shift") def slice_shift(self: FrameOrSeries, periods: int = 1, axis=0) -> FrameOrSeries: """ @@ -9283,6 +9341,9 @@ def tshift( """ Shift the time index, using the index's frequency if available. + .. deprecated:: 1.1.0 + Use `shift` instead. + Parameters ---------- periods : int @@ -9303,39 +9364,19 @@ def tshift( attributes of the index. If neither of those attributes exist, a ValueError is thrown """ - index = self._get_axis(axis) - if freq is None: - freq = getattr(index, "freq", None) - - if freq is None: - freq = getattr(index, "inferred_freq", None) + warnings.warn( + ( + "tshift is deprecated and will be removed in a future version. " + "Please use shift instead." 
+ ), + FutureWarning, + stacklevel=2, + ) if freq is None: - msg = "Freq was not given and was not set in the index" - raise ValueError(msg) - - if periods == 0: - return self - - if isinstance(freq, str): - freq = to_offset(freq) - - axis = self._get_axis_number(axis) - if isinstance(index, PeriodIndex): - orig_freq = to_offset(index.freq) - if freq != orig_freq: - assert orig_freq is not None # for mypy - raise ValueError( - f"Given freq {freq.rule_code} does not match " - f"PeriodIndex freq {orig_freq.rule_code}" - ) - new_ax = index.shift(periods) - else: - new_ax = index.shift(periods, freq) + freq = "infer" - result = self.copy() - result.set_axis(new_ax, axis, inplace=True) - return result.__finalize__(self, method="tshift") + return self.shift(periods, freq, axis) def truncate( self: FrameOrSeries, before=None, after=None, axis=None, copy: bool_t = True diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 95f9fd9d7caf3..9ec029a6c4304 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -145,7 +145,10 @@ def test_shift_duplicate_columns(self): tm.assert_frame_equal(shifted[0], shifted[1]) tm.assert_frame_equal(shifted[0], shifted[2]) + @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_tshift(self, datetime_frame): + # TODO: remove this test when tshift deprecation is enforced + # PeriodIndex ps = tm.makePeriodFrame() shifted = ps.tshift(1) @@ -159,7 +162,8 @@ def test_tshift(self, datetime_frame): shifted3 = ps.tshift(freq=offsets.BDay()) tm.assert_frame_equal(shifted, shifted3) - with pytest.raises(ValueError, match="does not match"): + msg = "Given freq M does not match PeriodIndex freq B" + with pytest.raises(ValueError, match=msg): ps.tshift(freq="M") # DatetimeIndex @@ -186,10 +190,61 @@ def test_tshift(self, datetime_frame): tm.assert_frame_equal(unshifted, inferred_ts) no_freq = datetime_frame.iloc[[0, 5, 7], :] - msg = 
"Freq was not given and was not set in the index" + msg = "Freq was not set in the index hence cannot be inferred" with pytest.raises(ValueError, match=msg): no_freq.tshift() + def test_tshift_deprecated(self, datetime_frame): + # GH#11631 + with tm.assert_produces_warning(FutureWarning): + datetime_frame.tshift() + + def test_period_index_frame_shift_with_freq(self): + ps = tm.makePeriodFrame() + + shifted = ps.shift(1, freq="infer") + unshifted = shifted.shift(-1, freq="infer") + tm.assert_frame_equal(unshifted, ps) + + shifted2 = ps.shift(freq="B") + tm.assert_frame_equal(shifted, shifted2) + + shifted3 = ps.shift(freq=offsets.BDay()) + tm.assert_frame_equal(shifted, shifted3) + + def test_datetime_frame_shift_with_freq(self, datetime_frame): + shifted = datetime_frame.shift(1, freq="infer") + unshifted = shifted.shift(-1, freq="infer") + tm.assert_frame_equal(datetime_frame, unshifted) + + shifted2 = datetime_frame.shift(freq=datetime_frame.index.freq) + tm.assert_frame_equal(shifted, shifted2) + + inferred_ts = DataFrame( + datetime_frame.values, + Index(np.asarray(datetime_frame.index)), + columns=datetime_frame.columns, + ) + shifted = inferred_ts.shift(1, freq="infer") + expected = datetime_frame.shift(1, freq="infer") + expected.index = expected.index._with_freq(None) + tm.assert_frame_equal(shifted, expected) + + unshifted = shifted.shift(-1, freq="infer") + tm.assert_frame_equal(unshifted, inferred_ts) + + def test_period_index_frame_shift_with_freq_error(self): + ps = tm.makePeriodFrame() + msg = "Given freq M does not match PeriodIndex freq B" + with pytest.raises(ValueError, match=msg): + ps.shift(freq="M") + + def test_datetime_frame_shift_with_freq_error(self, datetime_frame): + no_freq = datetime_frame.iloc[[0, 5, 7], :] + msg = "Freq was not set in the index hence cannot be inferred" + with pytest.raises(ValueError, match=msg): + no_freq.shift(freq="infer") + def test_shift_dt64values_int_fill_deprecated(self): # GH#31971 ser = 
pd.Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index d307eef8beb62..a152bc203721f 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -438,11 +438,21 @@ (pd.DataFrame, frame_data, operator.methodcaller("mask", np.array([[True]]))), (pd.Series, ([1, 2],), operator.methodcaller("slice_shift")), (pd.DataFrame, frame_data, operator.methodcaller("slice_shift")), - (pd.Series, (1, pd.date_range("2000", periods=4)), operator.methodcaller("tshift")), - ( - pd.DataFrame, - ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), - operator.methodcaller("tshift"), + pytest.param( + ( + pd.Series, + (1, pd.date_range("2000", periods=4)), + operator.methodcaller("tshift"), + ), + marks=pytest.mark.filterwarnings("ignore::FutureWarning"), + ), + pytest.param( + ( + pd.DataFrame, + ({"A": [1, 1, 1, 1]}, pd.date_range("2000", periods=4)), + operator.methodcaller("tshift"), + ), + marks=pytest.mark.filterwarnings("ignore::FutureWarning"), ), (pd.Series, ([1, 2],), operator.methodcaller("truncate", before=0)), (pd.DataFrame, frame_data, operator.methodcaller("truncate", before=0)), diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 664c30e003632..0d040b8e6955a 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1982,6 +1982,7 @@ def test_bool_aggs_dup_column_labels(bool_agg_func): @pytest.mark.parametrize( "idx", [pd.Index(["a", "a"]), pd.MultiIndex.from_tuples((("a", "a"), ("a", "a")))] ) +@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_dup_labels_output_shape(groupby_func, idx): if groupby_func in {"size", "ngroup", "cumcount"}: pytest.skip("Not applicable") diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 6adae19005c3a..7271911c5f80f 100644 
--- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -14,6 +14,7 @@ tm.SubclassedSeries(np.arange(0, 10), name="A"), ], ) +@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_groupby_preserves_subclass(obj, groupby_func): # GH28330 -- preserve subclass through groupby operations diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 1598cc24ba6fb..9b595328d9230 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -340,6 +340,7 @@ def test_groupby_function_rename(mframe): assert f.__name__ == name +@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_groupby_selection_with_methods(df): # some methods which require DatetimeIndex rng = date_range("2014", periods=len(df)) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 43d2bf80505db..e7637a598403f 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1067,7 +1067,7 @@ def test_resample_anchored_intraday(simple_date_range_series): tm.assert_frame_equal(result, expected) result = df.resample("M", closed="left").mean() - exp = df.tshift(1, freq="D").resample("M", kind="period").mean() + exp = df.shift(1, freq="D").resample("M", kind="period").mean() exp = exp.to_timestamp(how="end") exp.index = exp.index + Timedelta(1, "ns") - Timedelta(1, "D") @@ -1086,7 +1086,7 @@ def test_resample_anchored_intraday(simple_date_range_series): tm.assert_frame_equal(result, expected) result = df.resample("Q", closed="left").mean() - expected = df.tshift(1, freq="D").resample("Q", kind="period", closed="left").mean() + expected = df.shift(1, freq="D").resample("Q", kind="period", closed="left").mean() expected = expected.to_timestamp(how="end") expected.index += Timedelta(1, "ns") - Timedelta(1, "D") 
expected.index._data.freq = "Q" diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index f981e98100d31..6257eecf4fc08 100644 --- a/pandas/tests/series/methods/test_shift.py +++ b/pandas/tests/series/methods/test_shift.py @@ -181,7 +181,10 @@ def test_shift_dst(self): tm.assert_series_equal(res, exp) assert res.dtype == "datetime64[ns, US/Eastern]" + @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_tshift(self, datetime_series): + # TODO: remove this test when tshift deprecation is enforced + # PeriodIndex ps = tm.makePeriodSeries() shifted = ps.tshift(1) @@ -220,10 +223,59 @@ def test_tshift(self, datetime_series): tm.assert_series_equal(unshifted, inferred_ts) no_freq = datetime_series[[0, 5, 7]] - msg = "Freq was not given and was not set in the index" + msg = "Freq was not set in the index hence cannot be inferred" with pytest.raises(ValueError, match=msg): no_freq.tshift() + def test_tshift_deprecated(self, datetime_series): + # GH#11631 + with tm.assert_produces_warning(FutureWarning): + datetime_series.tshift() + + def test_period_index_series_shift_with_freq(self): + ps = tm.makePeriodSeries() + + shifted = ps.shift(1, freq="infer") + unshifted = shifted.shift(-1, freq="infer") + tm.assert_series_equal(unshifted, ps) + + shifted2 = ps.shift(freq="B") + tm.assert_series_equal(shifted, shifted2) + + shifted3 = ps.shift(freq=BDay()) + tm.assert_series_equal(shifted, shifted3) + + def test_datetime_series_shift_with_freq(self, datetime_series): + shifted = datetime_series.shift(1, freq="infer") + unshifted = shifted.shift(-1, freq="infer") + tm.assert_series_equal(datetime_series, unshifted) + + shifted2 = datetime_series.shift(freq=datetime_series.index.freq) + tm.assert_series_equal(shifted, shifted2) + + inferred_ts = Series( + datetime_series.values, Index(np.asarray(datetime_series.index)), name="ts" + ) + shifted = inferred_ts.shift(1, freq="infer") + expected = 
datetime_series.shift(1, freq="infer") + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(shifted, expected) + + unshifted = shifted.shift(-1, freq="infer") + tm.assert_series_equal(unshifted, inferred_ts) + + def test_period_index_series_shift_with_freq_error(self): + ps = tm.makePeriodSeries() + msg = "Given freq M does not match PeriodIndex freq B" + with pytest.raises(ValueError, match=msg): + ps.shift(freq="M") + + def test_datetime_series_shift_with_freq_error(self, datetime_series): + no_freq = datetime_series[[0, 5, 7]] + msg = "Freq was not set in the index hence cannot be inferred" + with pytest.raises(ValueError, match=msg): + no_freq.shift(freq="infer") + def test_shift_int(self, datetime_series): ts = datetime_series.astype(int) shifted = ts.shift(1) From 6822ae28696f7ce22fc9d2a06b5c0e392284755e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Jun 2020 15:36:12 +0200 Subject: [PATCH 0124/1025] DOC: add release note about revert for 1.0.5 (#34785) --- doc/source/whatsnew/v1.0.5.rst | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.0.5.rst b/doc/source/whatsnew/v1.0.5.rst index 5dbc911407784..7dfac54279e6f 100644 --- a/doc/source/whatsnew/v1.0.5.rst +++ b/doc/source/whatsnew/v1.0.5.rst @@ -15,8 +15,14 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- -- + +- Fix regression in :meth:`read_parquet` when reading from file-like objects + (:issue:`34467`). +- Fix regression in reading from public S3 buckets (:issue:`34626`). + +Note this disables the ability to read Parquet files from directories on S3 +again (:issue:`26388`, :issue:`34632`), which was added in the 1.0.4 release, +but is now targeted for pandas 1.1.0. .. 
_whatsnew_105.bug_fixes: @@ -24,7 +30,6 @@ Bug fixes ~~~~~~~~~ - Fixed building from source with Python 3.8 fetching the wrong version of NumPy (:issue:`34666`) -- Contributors ~~~~~~~~~~~~ From 8ab912da91fcb36fe179cfee22a143cc54ee61bd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jun 2020 09:24:11 -0500 Subject: [PATCH 0125/1025] API: Removed PeriodDtype.dtype_code from public API (#34796) --- pandas/_libs/tslibs/dtypes.pxd | 2 +- pandas/_libs/tslibs/dtypes.pyx | 12 ++++++------ pandas/_libs/tslibs/period.pyx | 34 +++++++++++++++++----------------- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd index f43bc283d98c7..71b4eeabbaaf5 100644 --- a/pandas/_libs/tslibs/dtypes.pxd +++ b/pandas/_libs/tslibs/dtypes.pxd @@ -73,4 +73,4 @@ cdef enum PeriodDtypeCode: cdef class PeriodDtypeBase: cdef readonly: - PeriodDtypeCode dtype_code + PeriodDtypeCode _dtype_code diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 0752910317077..143eac7f1ef6e 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -8,10 +8,10 @@ cdef class PeriodDtypeBase: describing a PeriodDtype in an integer code. 
""" # cdef readonly: - # PeriodDtypeCode dtype_code + # PeriodDtypeCode _dtype_code def __cinit__(self, PeriodDtypeCode code): - self.dtype_code = code + self._dtype_code = code def __eq__(self, other): if not isinstance(other, PeriodDtypeBase): @@ -19,12 +19,12 @@ cdef class PeriodDtypeBase: if not isinstance(self, PeriodDtypeBase): # cython semantics, this is a reversed op return False - return self.dtype_code == other.dtype_code + return self._dtype_code == other._dtype_code @property def freq_group(self) -> int: # See also: libperiod.get_freq_group - return (self.dtype_code // 1000) * 1000 + return (self._dtype_code // 1000) * 1000 @property def date_offset(self): @@ -35,8 +35,8 @@ cdef class PeriodDtypeBase: """ from .offsets import to_offset - freqstr = _reverse_period_code_map.get(self.dtype_code) - # equiv: freqstr = libfrequencies.get_freq_str(self.dtype_code) + freqstr = _reverse_period_code_map.get(self._dtype_code) + # equiv: freqstr = libfrequencies.get_freq_str(self._dtype_code) return to_offset(freqstr) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index d14f9d82eb5be..30caddf81b6e8 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1645,7 +1645,7 @@ cdef class _Period: """ freq = self._maybe_convert_freq(freq) how = validate_end_alias(how) - base1 = self._dtype.dtype_code + base1 = self._dtype._dtype_code base2 = freq_to_dtype_code(freq) # self.n can't be negative or 0 @@ -1734,7 +1734,7 @@ cdef class _Period: return endpoint - Timedelta(1, 'ns') if freq is None: - base = self._dtype.dtype_code + base = self._dtype._dtype_code freq = get_to_timestamp_base(base) base = freq else: @@ -1748,12 +1748,12 @@ cdef class _Period: @property def year(self) -> int: - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pyear(self.ordinal, base) @property def month(self) -> int: - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pmonth(self.ordinal, base) 
@property @@ -1776,7 +1776,7 @@ cdef class _Period: >>> p.day 11 """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pday(self.ordinal, base) @property @@ -1806,7 +1806,7 @@ cdef class _Period: >>> p.hour 0 """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code return phour(self.ordinal, base) @property @@ -1830,7 +1830,7 @@ cdef class _Period: >>> p.minute 3 """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pminute(self.ordinal, base) @property @@ -1854,12 +1854,12 @@ cdef class _Period: >>> p.second 12 """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code return psecond(self.ordinal, base) @property def weekofyear(self) -> int: - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pweek(self.ordinal, base) @property @@ -1940,7 +1940,7 @@ cdef class _Period: >>> per.end_time.dayofweek 2 """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pweekday(self.ordinal, base) @property @@ -2028,12 +2028,12 @@ cdef class _Period: >>> period.dayofyear 1 """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pday_of_year(self.ordinal, base) @property def quarter(self) -> int: - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pquarter(self.ordinal, base) @property @@ -2077,7 +2077,7 @@ cdef class _Period: >>> per.year 2017 """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pqyear(self.ordinal, base) @property @@ -2111,7 +2111,7 @@ cdef class _Period: >>> p.days_in_month 29 """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code return pdays_in_month(self.ordinal, base) @property @@ -2149,7 +2149,7 @@ cdef class _Period: return self.freq.freqstr def __repr__(self) -> str: - base = self._dtype.dtype_code + base = self._dtype._dtype_code formatted = period_format(self.ordinal, base) return f"Period('{formatted}', '{self.freqstr}')" @@ -2157,7 +2157,7 @@ cdef class _Period: """ 
Return a string representation for a particular DataFrame """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code formatted = period_format(self.ordinal, base) value = str(formatted) return value @@ -2309,7 +2309,7 @@ cdef class _Period: >>> a.strftime('%b. %d, %Y was a %A') 'Jan. 01, 2001 was a Monday' """ - base = self._dtype.dtype_code + base = self._dtype._dtype_code return period_format(self.ordinal, base, fmt) From 09adc986d64a09a7f6fcdb9f443c6f8dc9d150b4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jun 2020 09:38:53 -0500 Subject: [PATCH 0126/1025] PERF: Fixed perf regression in TimedeltaIndex.get_loc (#34734) --- pandas/core/arrays/datetimelike.py | 10 ++++++++-- pandas/core/indexes/timedeltas.py | 3 +-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 8af23815b54ef..1fea6ca1b8a3d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -776,15 +776,19 @@ def _validate_shift_value(self, fill_value): return self._unbox(fill_value) - def _validate_scalar(self, value, msg: str, cast_str: bool = False): + def _validate_scalar( + self, value, msg: Optional[str] = None, cast_str: bool = False + ): """ Validate that the input value can be cast to our scalar_type. Parameters ---------- value : object - msg : str + msg : str, optional. Message to raise in TypeError on invalid input. + If not provided, `value` is cast to a str and used + as the message. cast_str : bool, default False Whether to try to parse string input to scalar_type. 
@@ -807,6 +811,8 @@ def _validate_scalar(self, value, msg: str, cast_str: bool = False): value = self._scalar_type(value) # type: ignore else: + if msg is None: + msg = str(value) raise TypeError(msg) return value diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index a14994866c0f7..f6661c6b50dfb 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -213,9 +213,8 @@ def get_loc(self, key, method=None, tolerance=None): if not is_scalar(key): raise InvalidIndexError(key) - msg = str(key) try: - key = self._data._validate_scalar(key, msg, cast_str=True) + key = self._data._validate_scalar(key, cast_str=True) except TypeError as err: raise KeyError(key) from err From 79eefe6c35d42f65df16959db22c25e030652221 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Jun 2020 18:25:09 +0200 Subject: [PATCH 0127/1025] TST: ensure read_parquet filter argument is correctly passed though (pyarrow engine) (#34804) --- pandas/tests/io/test_parquet.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 7ee551194bf76..efd34c58d7d19 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -671,6 +671,17 @@ def test_timestamp_nanoseconds(self, pa): df = pd.DataFrame({"a": pd.date_range("2017-01-01", freq="1n", periods=10)}) check_round_trip(df, pa, write_kwargs={"version": "2.0"}) + @td.skip_if_no("pyarrow", min_version="0.17") + def test_filter_row_groups(self, pa): + # https://github.com/pandas-dev/pandas/issues/26551 + df = pd.DataFrame({"a": list(range(0, 3))}) + with tm.ensure_clean() as path: + df.to_parquet(path, pa) + result = read_parquet( + path, pa, filters=[("a", "==", 0)], use_legacy_dataset=False + ) + assert len(result) == 1 + class TestParquetFastParquet(Base): @td.skip_if_no("fastparquet", min_version="0.3.2") From bae26ffaa032b3a17742d6dcc759df05b5830ad0 Mon Sep 17 00:00:00 
2001 From: Tom Augspurger Date: Mon, 15 Jun 2020 12:27:28 -0500 Subject: [PATCH 0128/1025] Regression in to_timedelta with errors="coerce" and unit (#34806) --- pandas/_libs/tslibs/timedeltas.pyx | 2 +- pandas/core/arrays/timedeltas.py | 7 ++++--- pandas/core/tools/timedeltas.py | 27 ++++++++++++++++--------- pandas/tests/tools/test_to_timedelta.py | 11 ++++++++++ 4 files changed, 34 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index a5b502f3f4071..1c3e69e21aa18 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -237,7 +237,7 @@ def array_to_timedelta64(object[:] values, unit=None, errors='raise'): if unit is not None: for i in range(n): - if isinstance(values[i], str): + if isinstance(values[i], str) and errors != "coerce": raise ValueError( "unit must not be specified if the input contains a str" ) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index d0657994dd81c..f33b569b3d1f7 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -882,9 +882,10 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): ---------- data : list-like copy : bool, default False - unit : str, default "ns" - The timedelta unit to treat integers as multiples of. - Must be un-specifed if the data contains a str. + unit : str, optional + The timedelta unit to treat integers as multiples of. For numeric + data this defaults to ``'ns'``. + Must be un-specified if the data contains a str and ``errors=="raise"``. errors : {"raise", "coerce", "ignore"}, default "raise" How to handle elements that cannot be converted to timedelta64[ns]. See ``pandas.to_timedelta`` for details. 
diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 87eac93a6072c..a643c312ec358 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -26,15 +26,24 @@ def to_timedelta(arg, unit=None, errors="raise"): ---------- arg : str, timedelta, list-like or Series The data to be converted to timedelta. - unit : str, default 'ns' - Must not be specified if the arg is/contains a str. - Denotes the unit of the arg. Possible values: - ('W', 'D', 'days', 'day', 'hours', hour', 'hr', 'h', - 'm', 'minute', 'min', 'minutes', 'T', 'S', 'seconds', - 'sec', 'second', 'ms', 'milliseconds', 'millisecond', - 'milli', 'millis', 'L', 'us', 'microseconds', 'microsecond', - 'micro', 'micros', 'U', 'ns', 'nanoseconds', 'nano', 'nanos', - 'nanosecond', 'N'). + unit : str, optional + Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``. + + Possible values: + + * 'W' + * 'D' / 'days' / 'day' + * 'hours' / 'hour' / 'hr' / 'h' + * 'm' / 'minute' / 'min' / 'minutes' / 'T' + * 'S' / 'seconds' / 'sec' / 'second' + * 'ms' / 'milliseconds' / 'millisecond' / 'milli' / 'millis' / 'L' + * 'us' / 'microseconds' / 'microsecond' / 'micro' / 'micros' / 'U' + * 'ns' / 'nanoseconds' / 'nano' / 'nanos' / 'nanosecond' / 'N' + + .. versionchanged:: 1.1.0 + + Must not be specified when `arg` contains strings and + ``errors="raise"``. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. 
diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index e3cf3a7f16a82..1e193f22a6698 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -155,3 +155,14 @@ def test_to_timedelta_float(self): result = pd.to_timedelta(arr, unit="s") expected_asi8 = np.arange(999990000, int(1e9), 1000, dtype="int64") tm.assert_numpy_array_equal(result.asi8, expected_asi8) + + def test_to_timedelta_coerce_strings_unit(self): + arr = np.array([1, 2, "error"], dtype=object) + result = pd.to_timedelta(arr, unit="ns", errors="coerce") + expected = pd.to_timedelta([1, 2, pd.NaT], unit="ns") + tm.assert_index_equal(result, expected) + + def test_to_timedelta_ignore_strings_unit(self): + arr = np.array([1, 2, "error"], dtype=object) + result = pd.to_timedelta(arr, unit="ns", errors="ignore") + tm.assert_numpy_array_equal(result, arr) From 9845c3e471f7b3eddeb7e56771d5aa8b67486a38 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 15 Jun 2020 10:28:16 -0700 Subject: [PATCH 0129/1025] CLN: dont consolidate in NDFrame._is_numeric_mixed_type (#34678) --- pandas/core/generic.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7c3e975c889e1..9014e576eeb39 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5331,15 +5331,10 @@ def _is_mixed_type(self) -> bool_t: f = lambda: self._mgr.is_mixed_type return self._protect_consolidate(f) - @property - def _is_numeric_mixed_type(self) -> bool_t: - f = lambda: self._mgr.is_numeric_mixed_type - return self._protect_consolidate(f) - def _check_inplace_setting(self, value) -> bool_t: """ check whether we allow in-place setting with this type of value """ if self._is_mixed_type: - if not self._is_numeric_mixed_type: + if not self._mgr.is_numeric_mixed_type: # allow an actual np.nan thru if is_float(value) and np.isnan(value): From bfe39d7380ea438d21a452c169479d38fa5240a7 
Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 15 Jun 2020 12:19:54 -0700 Subject: [PATCH 0130/1025] CLN: liboffsets annotate, de-duplicate (#34808) --- pandas/_libs/tslibs/offsets.pyx | 324 +++++++++++++++----------------- 1 file changed, 156 insertions(+), 168 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index d22f2b9117326..bf2998bfcd9d1 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -491,18 +491,20 @@ cdef class BaseOffset: # Name and Rendering Methods def __repr__(self) -> str: - className = getattr(self, '_outputName', type(self).__name__) + # _output_name used by B(Year|Quarter)(End|Begin) to + # expand "B" -> "Business" + class_name = getattr(self, "_output_name", type(self).__name__) if abs(self.n) != 1: - plural = 's' + plural = "s" else: - plural = '' + plural = "" n_str = "" if self.n != 1: n_str = f"{self.n} * " - out = f'<{n_str}{className}{plural}{self._repr_attrs()}>' + out = f"<{n_str}{class_name}{plural}{self._repr_attrs()}>" return out def _repr_attrs(self) -> str: @@ -608,7 +610,7 @@ cdef class BaseOffset: dt = dt + type(self)(1, normalize=self.normalize, **self.kwds) return dt - def _get_offset_day(self, datetime other): + def _get_offset_day(self, other: datetime) -> int: # subclass must implement `_day_opt`; calling from the base class # will raise NotImplementedError. cdef: @@ -632,7 +634,7 @@ cdef class BaseOffset: # Staticmethod so we can call from Tick.__init__, will be unnecessary # once BaseOffset is a cdef class and is inherited by Tick @staticmethod - def _validate_n(n): + def _validate_n(n) -> int: """ Require that `n` be an integer. 
@@ -1010,7 +1012,7 @@ cdef class RelativeDeltaOffset(BaseOffset): self.__dict__.update(state) @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: if self._use_relativedelta: other = _as_datetime(other) @@ -1379,7 +1381,7 @@ cdef class BusinessDay(BusinessMixin): @apply_index_wraps def apply_index(self, dtindex): - i8other = dtindex.asi8 + i8other = dtindex.view("i8") return shift_bdays(i8other, self.n) def is_on_offset(self, dt) -> bool: @@ -1482,7 +1484,7 @@ cdef class BusinessHour(BusinessMixin): until = datetime(2014, 4, day, end.hour, end.minute) return int((until - dtstart).total_seconds()) - def _get_closing_time(self, dt): + def _get_closing_time(self, dt: datetime) -> datetime: """ Get the closing time of a business hour interval by its opening time. @@ -1582,7 +1584,7 @@ cdef class BusinessHour(BusinessMixin): return datetime(other.year, other.month, other.day, hour, minute) - def _prev_opening_time(self, other): + def _prev_opening_time(self, other: datetime) -> datetime: """ If n is positive, return the latest opening time earlier than or equal to current time. @@ -1602,7 +1604,7 @@ cdef class BusinessHour(BusinessMixin): return self._next_opening_time(other, sign=-1) @apply_wraps - def rollback(self, dt): + def rollback(self, dt: datetime) -> datetime: """ Roll provided date backward to next offset only if not on offset. """ @@ -1615,7 +1617,7 @@ cdef class BusinessHour(BusinessMixin): return dt @apply_wraps - def rollforward(self, dt): + def rollforward(self, dt: datetime) -> datetime: """ Roll provided date forward to next offset only if not on offset. 
""" @@ -1627,108 +1629,105 @@ cdef class BusinessHour(BusinessMixin): return dt @apply_wraps - def apply(self, other): - if PyDateTime_Check(other): - # used for detecting edge condition - nanosecond = getattr(other, "nanosecond", 0) - # reset timezone and nanosecond - # other may be a Timestamp, thus not use replace - other = datetime( - other.year, - other.month, - other.day, - other.hour, - other.minute, - other.second, - other.microsecond, - ) - n = self.n + def apply(self, other: datetime) -> datetime: + # used for detecting edge condition + nanosecond = getattr(other, "nanosecond", 0) + # reset timezone and nanosecond + # other may be a Timestamp, thus not use replace + other = datetime( + other.year, + other.month, + other.day, + other.hour, + other.minute, + other.second, + other.microsecond, + ) + n = self.n - # adjust other to reduce number of cases to handle - if n >= 0: - if other.time() in self.end or not self._is_on_offset(other): - other = self._next_opening_time(other) + # adjust other to reduce number of cases to handle + if n >= 0: + if other.time() in self.end or not self._is_on_offset(other): + other = self._next_opening_time(other) + else: + if other.time() in self.start: + # adjustment to move to previous business day + other = other - timedelta(seconds=1) + if not self._is_on_offset(other): + other = self._next_opening_time(other) + other = self._get_closing_time(other) + + # get total business hours by sec in one business day + businesshours = sum( + self._get_business_hours_by_sec(st, en) + for st, en in zip(self.start, self.end) + ) + + bd, r = divmod(abs(n * 60), businesshours // 60) + if n < 0: + bd, r = -bd, -r + + # adjust by business days first + if bd != 0: + if self._prefix.startswith("C"): + # GH#30593 this is a Custom offset + skip_bd = CustomBusinessDay( + n=bd, + weekmask=self.weekmask, + holidays=self.holidays, + calendar=self.calendar, + ) else: - if other.time() in self.start: - # adjustment to move to previous business day - 
other = other - timedelta(seconds=1) - if not self._is_on_offset(other): - other = self._next_opening_time(other) - other = self._get_closing_time(other) - - # get total business hours by sec in one business day - businesshours = sum( - self._get_business_hours_by_sec(st, en) - for st, en in zip(self.start, self.end) - ) + skip_bd = BusinessDay(n=bd) + # midnight business hour may not on BusinessDay + if not self.next_bday.is_on_offset(other): + prev_open = self._prev_opening_time(other) + remain = other - prev_open + other = prev_open + skip_bd + remain + else: + other = other + skip_bd - bd, r = divmod(abs(n * 60), businesshours // 60) - if n < 0: - bd, r = -bd, -r - - # adjust by business days first - if bd != 0: - if self._prefix.startswith("C"): - # GH#30593 this is a Custom offset - skip_bd = CustomBusinessDay( - n=bd, - weekmask=self.weekmask, - holidays=self.holidays, - calendar=self.calendar, - ) + # remaining business hours to adjust + bhour_remain = timedelta(minutes=r) + + if n >= 0: + while bhour_remain != timedelta(0): + # business hour left in this business time interval + bhour = ( + self._get_closing_time(self._prev_opening_time(other)) - other + ) + if bhour_remain < bhour: + # finish adjusting if possible + other += bhour_remain + bhour_remain = timedelta(0) else: - skip_bd = BusinessDay(n=bd) - # midnight business hour may not on BusinessDay - if not self.next_bday.is_on_offset(other): - prev_open = self._prev_opening_time(other) - remain = other - prev_open - other = prev_open + skip_bd + remain + # go to next business time interval + bhour_remain -= bhour + other = self._next_opening_time(other + bhour) + else: + while bhour_remain != timedelta(0): + # business hour left in this business time interval + bhour = self._next_opening_time(other) - other + if ( + bhour_remain > bhour + or bhour_remain == bhour + and nanosecond != 0 + ): + # finish adjusting if possible + other += bhour_remain + bhour_remain = timedelta(0) else: - other = other + 
skip_bd - - # remaining business hours to adjust - bhour_remain = timedelta(minutes=r) - - if n >= 0: - while bhour_remain != timedelta(0): - # business hour left in this business time interval - bhour = ( - self._get_closing_time(self._prev_opening_time(other)) - other - ) - if bhour_remain < bhour: - # finish adjusting if possible - other += bhour_remain - bhour_remain = timedelta(0) - else: - # go to next business time interval - bhour_remain -= bhour - other = self._next_opening_time(other + bhour) - else: - while bhour_remain != timedelta(0): - # business hour left in this business time interval - bhour = self._next_opening_time(other) - other - if ( - bhour_remain > bhour - or bhour_remain == bhour - and nanosecond != 0 - ): - # finish adjusting if possible - other += bhour_remain - bhour_remain = timedelta(0) - else: - # go to next business time interval - bhour_remain -= bhour - other = self._get_closing_time( - self._next_opening_time( - other + bhour - timedelta(seconds=1) - ) + # go to next business time interval + bhour_remain -= bhour + other = self._get_closing_time( + self._next_opening_time( + other + bhour - timedelta(seconds=1) ) + ) - return other - else: - raise ApplyTypeError("Only know how to combine business hour with datetime") + return other - def is_on_offset(self, dt): + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False @@ -1740,7 +1739,7 @@ cdef class BusinessHour(BusinessMixin): # Distinguish by the time spent from previous opening time return self._is_on_offset(dt) - def _is_on_offset(self, dt): + def _is_on_offset(self, dt: datetime) -> bool: """ Slight speedups using calculated values. 
""" @@ -1779,14 +1778,11 @@ cdef class WeekOfMonthMixin(SingleConstructorOffset): raise ValueError(f"Day must be 0<=day<=6, got {weekday}") @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: compare_day = self._get_offset_day(other) months = self.n - if months > 0 and compare_day > other.day: - months -= 1 - elif months <= 0 and compare_day < other.day: - months += 1 + months = roll_convention(other.day, months, compare_day) shifted = shift_month(other, months, "start") to_day = self._get_offset_day(shifted) @@ -1861,7 +1857,7 @@ cdef class YearOffset(SingleConstructorOffset): return get_day_of_month(&dts, self._day_opt) @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: years = roll_qtrday(other, self.n, self.month, self._day_opt, modby=12) months = years * 12 + (self.month - other.month) return shift_month(other, months, self._day_opt) @@ -1869,7 +1865,7 @@ cdef class YearOffset(SingleConstructorOffset): @apply_index_wraps def apply_index(self, dtindex): shifted = shift_quarters( - dtindex.asi8, self.n, self.month, self._day_opt, modby=12 + dtindex.view("i8"), self.n, self.month, self._day_opt, modby=12 ) return shifted @@ -1963,8 +1959,8 @@ cdef class QuarterOffset(SingleConstructorOffset): # startingMonth vs month attr names are resolved # FIXME: python annotations here breaks things - # _default_startingMonth: int - # _from_name_startingMonth: int + # _default_starting_month: int + # _from_name_starting_month: int cdef readonly: int startingMonth @@ -1973,7 +1969,7 @@ cdef class QuarterOffset(SingleConstructorOffset): BaseOffset.__init__(self, n, normalize) if startingMonth is None: - startingMonth = self._default_startingMonth + startingMonth = self._default_starting_month self.startingMonth = startingMonth cpdef __setstate__(self, state): @@ -1987,8 +1983,8 @@ cdef class QuarterOffset(SingleConstructorOffset): if suffix: kwargs["startingMonth"] = MONTH_TO_CAL_NUM[suffix] else: - if 
cls._from_name_startingMonth is not None: - kwargs["startingMonth"] = cls._from_name_startingMonth + if cls._from_name_starting_month is not None: + kwargs["startingMonth"] = cls._from_name_starting_month return cls(**kwargs) @property @@ -2006,7 +2002,7 @@ cdef class QuarterOffset(SingleConstructorOffset): return mod_month == 0 and dt.day == self._get_offset_day(dt) @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: # months_since: find the calendar quarter containing other.month, # e.g. if other.month == 8, the calendar quarter is [Jul, Aug, Sep]. # Then find the month in that quarter containing an is_on_offset date for @@ -2022,7 +2018,7 @@ cdef class QuarterOffset(SingleConstructorOffset): @apply_index_wraps def apply_index(self, dtindex): shifted = shift_quarters( - dtindex.asi8, self.n, self.startingMonth, self._day_opt + dtindex.view("i8"), self.n, self.startingMonth, self._day_opt ) return shifted @@ -2048,9 +2044,9 @@ cdef class BQuarterEnd(QuarterOffset): >>> ts + BQuarterEnd(startingMonth=2) Timestamp('2020-05-29 05:01:15') """ - _outputName = "BusinessQuarterEnd" - _default_startingMonth = 3 - _from_name_startingMonth = 12 + _output_name = "BusinessQuarterEnd" + _default_starting_month = 3 + _from_name_starting_month = 12 _prefix = "BQ" _day_opt = "business_end" @@ -2076,9 +2072,9 @@ cdef class BQuarterBegin(QuarterOffset): >>> ts + BQuarterBegin(-1) Timestamp('2020-03-02 05:01:15') """ - _outputName = "BusinessQuarterBegin" - _default_startingMonth = 3 - _from_name_startingMonth = 1 + _output_name = "BusinessQuarterBegin" + _default_starting_month = 3 + _from_name_starting_month = 1 _prefix = "BQS" _day_opt = "business_start" @@ -2091,8 +2087,7 @@ cdef class QuarterEnd(QuarterOffset): startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... startingMonth = 3 corresponds to dates like 3/31/2007, 6/30/2007, ... 
""" - _outputName = "QuarterEnd" - _default_startingMonth = 3 + _default_starting_month = 3 _prefix = "Q" _day_opt = "end" @@ -2105,6 +2100,7 @@ cdef class QuarterEnd(QuarterOffset): QuarterOffset.__init__(self, n, normalize, startingMonth) self._period_dtype_code = PeriodDtypeCode.Q_DEC + self.startingMonth % 12 + cdef class QuarterBegin(QuarterOffset): """ DateOffset increments between Quarter start dates. @@ -2113,9 +2109,8 @@ cdef class QuarterBegin(QuarterOffset): startingMonth = 2 corresponds to dates like 2/01/2007, 5/01/2007, ... startingMonth = 3 corresponds to dates like 3/01/2007, 6/01/2007, ... """ - _outputName = "QuarterBegin" - _default_startingMonth = 3 - _from_name_startingMonth = 1 + _default_starting_month = 3 + _from_name_starting_month = 1 _prefix = "QS" _day_opt = "start" @@ -2130,14 +2125,14 @@ cdef class MonthOffset(SingleConstructorOffset): return dt.day == self._get_offset_day(dt) @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: compare_day = self._get_offset_day(other) n = roll_convention(other.day, self.n, compare_day) return shift_month(other, n, self._day_opt) @apply_index_wraps def apply_index(self, dtindex): - shifted = shift_months(dtindex.asi8, self.n, self._day_opt) + shifted = shift_months(dtindex.view("i8"), self.n, self._day_opt) return shifted cpdef __setstate__(self, state): @@ -2244,29 +2239,31 @@ cdef class SemiMonthOffset(SingleConstructorOffset): return self._prefix + suffix @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: + is_start = isinstance(self, SemiMonthBegin) + # shift `other` to self.day_of_month, incrementing `n` if necessary n = roll_convention(other.day, self.n, self.day_of_month) days_in_month = get_days_in_month(other.year, other.month) - # For SemiMonthBegin on other.day == 1 and # SemiMonthEnd on other.day == days_in_month, # shifting `other` to `self.day_of_month` _always_ requires # incrementing/decrementing `n`, regardless of 
whether it is # initially positive. - if type(self) is SemiMonthBegin and (self.n <= 0 and other.day == 1): + if is_start and (self.n <= 0 and other.day == 1): n -= 1 - elif type(self) is SemiMonthEnd and (self.n > 0 and other.day == days_in_month): + elif (not is_start) and (self.n > 0 and other.day == days_in_month): n += 1 - return self._apply(n, other) + if is_start: + months = n // 2 + n % 2 + to_day = 1 if n % 2 else self.day_of_month + else: + months = n // 2 + to_day = 31 if n % 2 else self.day_of_month - def _apply(self, n, other): - """ - Handle specific apply logic for child classes. - """ - raise NotImplementedError(self) + return shift_month(other, months, to_day) @apply_index_wraps @cython.wraparound(False) @@ -2348,11 +2345,6 @@ cdef class SemiMonthEnd(SemiMonthOffset): days_in_month = get_days_in_month(dt.year, dt.month) return dt.day in (self.day_of_month, days_in_month) - def _apply(self, n, other): - months = n // 2 - day = 31 if n % 2 else self.day_of_month - return shift_month(other, months, day) - cdef class SemiMonthBegin(SemiMonthOffset): """ @@ -2373,11 +2365,6 @@ cdef class SemiMonthBegin(SemiMonthOffset): return False return dt.day in (1, self.day_of_month) - def _apply(self, n, other): - months = n // 2 + n % 2 - day = 1 if n % 2 else self.day_of_month - return shift_month(other, months, day) - # --------------------------------------------------------------------- # Week-Based Offset Classes @@ -2446,25 +2433,25 @@ cdef class Week(SingleConstructorOffset): td64 = np.timedelta64(td, "ns") return dtindex + td64 else: - return self._end_apply_index(dtindex) + i8other = dtindex.view("i8") + return self._end_apply_index(i8other) @cython.wraparound(False) @cython.boundscheck(False) - def _end_apply_index(self, dtindex): + cdef _end_apply_index(self, const int64_t[:] i8other): """ Add self to the given DatetimeIndex, specialized for case where self.weekday is non-null. 
Parameters ---------- - dtindex : DatetimeIndex + i8other : const int64_t[:] Returns ------- ndarray[int64_t] """ cdef: - int64_t[:] i8other = dtindex.view("i8") Py_ssize_t i, count = len(i8other) int64_t val int64_t[:] out = np.empty(count, dtype="i8") @@ -2493,7 +2480,7 @@ cdef class Week(SingleConstructorOffset): return out.base - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False elif self.weekday is None: @@ -2647,6 +2634,7 @@ cdef class LastWeekOfMonth(WeekOfMonthMixin): weekday = weekday_to_int[suffix] return cls(weekday=weekday) + # --------------------------------------------------------------------- # Special Offset Classes @@ -2767,7 +2755,7 @@ cdef class FY5253(FY5253Mixin): return year_end == dt @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: norm = Timestamp(other).normalize() n = self.n @@ -2822,7 +2810,7 @@ cdef class FY5253(FY5253Mixin): ) return result - def get_year_end(self, dt): + def get_year_end(self, dt: datetime) -> datetime: assert dt.tzinfo is None dim = get_days_in_month(dt.year, self.startingMonth) @@ -2968,7 +2956,7 @@ cdef class FY5253Quarter(FY5253Mixin): variation=self.variation, ) - def _rollback_to_year(self, other): + def _rollback_to_year(self, other: datetime): """ Roll `other` back to the most recent date that was on a fiscal year end. @@ -3016,7 +3004,7 @@ cdef class FY5253Quarter(FY5253Mixin): return start, num_qtrs, tdelta @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: # Note: self.n == 0 is not allowed. 
n = self.n @@ -3044,7 +3032,7 @@ cdef class FY5253Quarter(FY5253Mixin): return res - def get_weeks(self, dt): + def get_weeks(self, dt: datetime): ret = [13] * 4 year_has_extra_week = self.year_has_extra_week(dt) @@ -3107,7 +3095,7 @@ cdef class Easter(SingleConstructorOffset): self.normalize = state.pop("normalize") @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: current_easter = easter(other.year) current_easter = datetime( current_easter.year, current_easter.month, current_easter.day @@ -3329,7 +3317,7 @@ cdef class _CustomBusinessMonth(BusinessMixin): return roll_func @apply_wraps - def apply(self, other): + def apply(self, other: datetime) -> datetime: # First move to month offset cur_month_offset_date = self.month_roll(other) @@ -3947,7 +3935,7 @@ cpdef int roll_convention(int other, int n, int compare) nogil: def roll_qtrday(other: datetime, n: int, month: int, - day_opt: object, modby: int) -> int: + day_opt: str, modby: int) -> int: """ Possibly increment or decrement the number of periods to shift based on rollforward/rollbackward conventions. @@ -3957,7 +3945,7 @@ def roll_qtrday(other: datetime, n: int, month: int, other : datetime or Timestamp n : number of periods to increment, before adjusting for rolling month : int reference month giving the first month of the year - day_opt : 'start', 'end', 'business_start', 'business_end', or int + day_opt : {'start', 'end', 'business_start', 'business_end'} The convention to use in finding the day in a given month against which to compare for rollforward/rollbackward decisions. 
modby : int 3 for quarters, 12 for years From e150ddd2b2f4fef8a332bec3b7556c05899ebb7c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jun 2020 14:21:19 -0500 Subject: [PATCH 0131/1025] BUG: Fixed Series.replace for EA with casting (#34733) --- doc/source/whatsnew/v1.0.5.rst | 2 ++ pandas/core/internals/blocks.py | 6 +++++- pandas/tests/series/methods/test_replace.py | 5 +++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.5.rst b/doc/source/whatsnew/v1.0.5.rst index 7dfac54279e6f..fdf08dd381050 100644 --- a/doc/source/whatsnew/v1.0.5.rst +++ b/doc/source/whatsnew/v1.0.5.rst @@ -24,6 +24,8 @@ Note this disables the ability to read Parquet files from directories on S3 again (:issue:`26388`, :issue:`34632`), which was added in the 1.0.4 release, but is now targeted for pandas 1.1.0. +- Fixed regression in :meth:`~DataFrame.replace` raising an ``AssertionError`` when replacing values in an extension dtype with values of a different dtype (:issue:`34530`) + .. _whatsnew_105.bug_fixes: Bug fixes diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 13b98279169fd..38c495e1dd0f3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -745,7 +745,11 @@ def replace( if is_object_dtype(self): raise - assert not self._can_hold_element(value), value + if not self.is_extension: + # TODO: https://github.com/pandas-dev/pandas/issues/32586 + # Need an ExtensionArray._can_hold_element to indicate whether + # a scalar value can be placed in the array. 
+ assert not self._can_hold_element(value), value # try again with a compatible block block = self.astype(object) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 330c682216f53..8f57cf3191d5d 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -402,3 +402,8 @@ def test_replace_only_one_dictlike_arg(self): msg = "Series.replace cannot use dict-value and non-None to_replace" with pytest.raises(ValueError, match=msg): ser.replace(to_replace, value) + + def test_replace_extension_other(self): + # https://github.com/pandas-dev/pandas/issues/34530 + ser = pd.Series(pd.array([1, 2, 3], dtype="Int64")) + ser.replace("", "") # no exception From 4e48c26ff0d6dafc47d3fbeec7c51aa36420c185 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 15 Jun 2020 22:36:37 +0100 Subject: [PATCH 0132/1025] BUG: pd.NA.__format__ fails with format_specs (#34740) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/missing.pyx | 6 ++++++ pandas/tests/scalar/test_na_scalar.py | 11 +++++++++++ 3 files changed, 18 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0c746b197c5b8..f68135bf8cf9c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -931,6 +931,7 @@ Missing - Clarified documentation on interpolate with method =akima. The ``der`` parameter must be scalar or None (:issue:`33426`) - :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`fillna` (:issue:`12918`, :issue:`29146`) - Bug in :meth:`DataFrame.interpolate` when called on a DataFrame with column names of string type was throwing a ValueError. 
The method is no independing of the type of column names (:issue:`33956`) +- Passing :class:`NA` into a format string using format specs will now work. For example, ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``"<NA>"`` (:issue:`34740`) MultiIndex ^^^^^^^^^^ diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 6d4d1e95fe8c3..fdd06fe631b97 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -349,6 +349,12 @@ class NAType(C_NAType): def __repr__(self) -> str: return "<NA>" + def __format__(self, format_spec) -> str: + try: + return self.__repr__().__format__(format_spec) + except ValueError: + return self.__repr__() + def __bool__(self): raise TypeError("boolean value of NA is ambiguous") diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index a0e3f8984fbe4..dc5eb15348c1b 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -22,6 +22,17 @@ def test_repr(): assert str(NA) == "<NA>" +def test_format(): + # GH-34740 + assert format(NA) == "<NA>" + assert format(NA, ">10") == "      <NA>" + assert format(NA, "xxx") == "<NA>" # NA is flexible, accept any format spec + + assert "{}".format(NA) == "<NA>" + assert "{:>10}".format(NA) == "      <NA>" + assert "{:xxx}".format(NA) == "<NA>" + + def test_truthiness(): msg = "boolean value of NA is ambiguous" From 983de54505b6ae75bae6d1f9039c377ba2684d23 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Tue, 16 Jun 2020 00:53:26 +0200 Subject: [PATCH 0133/1025] TST: GroupBy(..., as_index=True).agg() drops index (#33098) --- pandas/tests/groupby/aggregate/test_aggregate.py | 12 +++++++++++- pandas/tests/groupby/test_categorical.py | 13 +++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 962288d5d59e1..1b726860eeb66 100644 --- 
a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -809,7 +809,17 @@ def test_aggregate_mixed_types(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="Not implemented.") +@pytest.mark.parametrize("func", ["min", "max"]) +def test_aggregate_categorical_lost_index(func: str): + # GH: 28641 groupby drops index, when grouping over categorical column with min/max + ds = pd.Series(["b"], dtype="category").cat.as_ordered() + df = pd.DataFrame({"A": [1997], "B": ds}) + result = df.groupby("A").agg({"B": func}) + expected = pd.DataFrame({"B": ["b"]}, index=pd.Index([1997], name="A")) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.xfail(reason="Not implemented;see GH 31256") def test_aggregate_udf_na_extension_type(): # https://github.com/pandas-dev/pandas/pull/31359 # This is currently failing to cast back to Int64Dtype. diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f9e89d36084c6..ff35ec04952b1 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1388,6 +1388,19 @@ def test_groupby_agg_non_numeric(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["first", "last"]) +def test_groupy_first_returned_categorical_instead_of_dataframe(func): + # GH 28641: groupby drops index, when grouping over categorical column with + # first/last. Returned Categorical instead of DataFrame previously. 
+ df = pd.DataFrame( + {"A": [1997], "B": pd.Series(["b"], dtype="category").cat.as_ordered()} + ) + df_grouped = df.groupby("A")["B"] + result = getattr(df_grouped, func)() + expected = pd.Series(["b"], index=pd.Index([1997], name="A"), name="B") + tm.assert_series_equal(result, expected) + + def test_read_only_category_no_sort(): # GH33410 cats = np.array([1, 2]) From a8456856841c35f4793e0a599ae9d60c7efc911e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 16 Jun 2020 14:44:28 +0200 Subject: [PATCH 0134/1025] CLN: remove unused args/kwargs in BlockManager.reduce (#34818) --- pandas/core/internals/managers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 8e16d31b49150..e496694ee7899 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -327,16 +327,16 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self, func, *args, **kwargs): + def reduce(self, func): # If 2D, we assume that we're operating column-wise if self.ndim == 1: # we'll be returning a scalar blk = self.blocks[0] - return func(blk.values, *args, **kwargs) + return func(blk.values) res = {} for blk in self.blocks: - bres = func(blk.values, *args, **kwargs) + bres = func(blk.values) if np.ndim(bres) == 0: # EA @@ -344,7 +344,7 @@ def reduce(self, func, *args, **kwargs): new_res = zip(blk.mgr_locs.as_array, [bres]) else: assert bres.ndim == 1, bres.shape - assert blk.shape[0] == len(bres), (blk.shape, bres.shape, args, kwargs) + assert blk.shape[0] == len(bres), (blk.shape, bres.shape) new_res = zip(blk.mgr_locs.as_array, bres) nr = dict(new_res) From 92825ee175c178664d4773b5d00634b1bfffc438 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 16 Jun 2020 05:47:56 -0700 Subject: [PATCH 0135/1025] BUG: Respect center=True in rolling.apply when numba engine is used (#34816) --- doc/source/whatsnew/v1.1.0.rst | 2 +- 
pandas/core/window/rolling.py | 13 ++++++++----- pandas/tests/window/test_numba.py | 8 +++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f68135bf8cf9c..7e04d8f906cb0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1015,7 +1015,7 @@ Groupby/resample/rolling The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`) - Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. (:issue:`34422`) - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - +- Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 92be2d056cfcb..ce0a2a9b95025 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -150,7 +150,7 @@ def __init__( obj, window=None, min_periods: Optional[int] = None, - center: Optional[bool] = False, + center: bool = False, win_type: Optional[str] = None, axis: Axis = 0, on: Optional[Union[str, Index]] = None, @@ -1353,17 +1353,20 @@ def apply( kwargs = {} kwargs.pop("_level", None) kwargs.pop("floor", None) - window = self._get_window() - offset = calculate_center_offset(window) if self.center else 0 if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") if engine == "cython": if engine_kwargs is not None: raise ValueError("cython engine does not accept engine_kwargs") + # Cython apply functions handle center, so don't need to use + # _apply's center handling + window = self._get_window() + offset = calculate_center_offset(window) if self.center else 0 
apply_func = self._generate_cython_apply_func( args, kwargs, raw, offset, func ) + center = False elif engine == "numba": if raw is False: raise ValueError("raw must be `True` when using the numba engine") @@ -1375,14 +1378,14 @@ def apply( apply_func = generate_numba_apply_func( args, kwargs, func, engine_kwargs ) + center = self.center else: raise ValueError("engine must be either 'numba' or 'cython'") - # TODO: Why do we always pass center=False? # name=func & raw=raw for WindowGroupByMixin._apply return self._apply( apply_func, - center=False, + center=center, floor=0, name=func, use_numba_cache=engine == "numba", diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 8ecf64b171df4..7e049af0ca1f8 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -13,7 +13,7 @@ # Filter warnings when parallel=True and the function can't be parallelized by Numba class TestApply: @pytest.mark.parametrize("jit", [True, False]) - def test_numba_vs_cython(self, jit, nogil, parallel, nopython): + def test_numba_vs_cython(self, jit, nogil, parallel, nopython, center): def f(x, *args): arg_sum = 0 for arg in args: @@ -29,10 +29,12 @@ def f(x, *args): args = (2,) s = Series(range(10)) - result = s.rolling(2).apply( + result = s.rolling(2, center=center).apply( f, args=args, engine="numba", engine_kwargs=engine_kwargs, raw=True ) - expected = s.rolling(2).apply(f, engine="cython", args=args, raw=True) + expected = s.rolling(2, center=center).apply( + f, engine="cython", args=args, raw=True + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("jit", [True, False]) From 8641780a77f7738038e9bebe338f23272910a808 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 16 Jun 2020 05:49:29 -0700 Subject: [PATCH 0136/1025] CLN: liboffsets annotations (#34815) --- pandas/_libs/tslibs/offsets.pyx | 47 ++++++++++++++------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git 
a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index bf2998bfcd9d1..df43ebcfd9df2 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -358,7 +358,6 @@ cdef class BaseOffset: Base class for DateOffset methods that are not overridden by subclasses and will (after pickle errors are resolved) go into a cdef class. """ - _typ = "dateoffset" _day_opt = None _attributes = tuple(["n", "normalize"]) _use_relativedelta = False @@ -394,7 +393,7 @@ cdef class BaseOffset: def __ne__(self, other): return not self == other - def __hash__(self): + def __hash__(self) -> int: return hash(self._params) @cache_readonly @@ -422,10 +421,10 @@ cdef class BaseOffset: return params @property - def kwds(self): + def kwds(self) -> dict: # for backwards-compatibility kwds = {name: getattr(self, name, None) for name in self._attributes - if name not in ['n', 'normalize']} + if name not in ["n", "normalize"]} return {name: kwds[name] for name in kwds if kwds[name] is not None} @property @@ -582,7 +581,7 @@ cdef class BaseOffset: "does not have a vectorized implementation" ) - def rollback(self, dt): + def rollback(self, dt) -> datetime: """ Roll provided date backward to next offset only if not on offset. @@ -596,7 +595,7 @@ cdef class BaseOffset: dt = dt - type(self)(1, normalize=self.normalize, **self.kwds) return dt - def rollforward(self, dt): + def rollforward(self, dt) -> datetime: """ Roll provided date forward to next offset only if not on offset. 
@@ -618,7 +617,7 @@ cdef class BaseOffset: pydate_to_dtstruct(other, &dts) return get_day_of_month(&dts, self._day_opt) - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False @@ -780,6 +779,8 @@ cdef class Tick(SingleConstructorOffset): def nanos(self) -> int64_t: return self.n * self._nanos_inc + # FIXME: This should be typed as datetime, but we DatetimeLikeIndex.insert + # checks self.freq.is_on_offset with a Timedelta sometimes. def is_on_offset(self, dt) -> bool: return True @@ -861,16 +862,8 @@ cdef class Tick(SingleConstructorOffset): def apply(self, other): # Timestamp can handle tz and nano sec, thus no need to use apply_wraps if isinstance(other, _Timestamp): - # GH#15126 - # in order to avoid a recursive - # call of __add__ and __radd__ if there is - # an exception, when we call using the + operator, - # we directly call the known method - result = other.__add__(self) - if result is NotImplemented: - raise OverflowError - return result + return other + self.delta elif other is NaT: return NaT elif is_datetime64_object(other) or PyDate_Check(other): @@ -1097,7 +1090,7 @@ cdef class RelativeDeltaOffset(BaseOffset): "applied vectorized" ) - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False # TODO: see GH#1395 @@ -1384,7 +1377,7 @@ cdef class BusinessDay(BusinessMixin): i8other = dtindex.view("i8") return shift_bdays(i8other, self.n) - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.weekday() < 5 @@ -1788,7 +1781,7 @@ cdef class WeekOfMonthMixin(SingleConstructorOffset): to_day = self._get_offset_day(shifted) return shift_day(shifted, to_day - shifted.day) - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not 
_is_normalized(dt): return False return dt.day == self._get_offset_day(dt) @@ -1843,12 +1836,12 @@ cdef class YearOffset(SingleConstructorOffset): month = MONTH_ALIASES[self.month] return f"{self._prefix}-{month}" - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.month == self.month and dt.day == self._get_offset_day(dt) - def _get_offset_day(self, other) -> int: + def _get_offset_day(self, other: datetime) -> int: # override BaseOffset method to use self.month instead of other.month cdef: npy_datetimestruct dts @@ -1995,7 +1988,7 @@ cdef class QuarterOffset(SingleConstructorOffset): def is_anchored(self) -> bool: return self.n == 1 and self.startingMonth is not None - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False mod_month = (dt.month - self.startingMonth) % 3 @@ -2119,7 +2112,7 @@ cdef class QuarterBegin(QuarterOffset): # Month-Based Offset Classes cdef class MonthOffset(SingleConstructorOffset): - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.day == self._get_offset_day(dt) @@ -2339,7 +2332,7 @@ cdef class SemiMonthEnd(SemiMonthOffset): _prefix = "SM" _min_day_of_month = 1 - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False days_in_month = get_days_in_month(dt.year, dt.month) @@ -2360,7 +2353,7 @@ cdef class SemiMonthBegin(SemiMonthOffset): _prefix = "SMS" - def is_on_offset(self, dt) -> bool: + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False return dt.day in (1, self.day_of_month) @@ -2375,8 +2368,8 @@ cdef class Week(SingleConstructorOffset): Weekly offset. 
Parameters - ----------f - weekday : int, default None + ---------- + weekday : int or None, default None Always generate specific day of week. 0 for Monday. """ From 7d433f77a7bfbca2486ba6e142bb79d3a9ecc928 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 16 Jun 2020 05:51:01 -0700 Subject: [PATCH 0137/1025] REF: put Resolution in tslibs.dtypes (#34813) --- pandas/_libs/tslibs/dtypes.pyx | 126 ++++++++++++++++++++++++++++ pandas/_libs/tslibs/resolution.pyx | 127 +---------------------------- 2 files changed, 128 insertions(+), 125 deletions(-) diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx index 143eac7f1ef6e..70acb42712201 100644 --- a/pandas/_libs/tslibs/dtypes.pyx +++ b/pandas/_libs/tslibs/dtypes.pyx @@ -1,5 +1,6 @@ # period frequency constants corresponding to scikits timeseries # originals +from enum import Enum cdef class PeriodDtypeBase: @@ -112,6 +113,9 @@ _period_code_map.update({ "C": 5000, # Custom Business Day }) +cdef set _month_names = { + x.split("-")[-1] for x in _period_code_map.keys() if x.startswith("A-") +} # Map attribute-name resolutions to resolution abbreviations _attrname_to_abbrevs = { @@ -127,6 +131,7 @@ _attrname_to_abbrevs = { "nanosecond": "N", } cdef dict attrname_to_abbrevs = _attrname_to_abbrevs +cdef dict _abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()} class FreqGroup: @@ -149,3 +154,124 @@ class FreqGroup: def get_freq_group(code: int) -> int: # See also: PeriodDtypeBase.freq_group return (code // 1000) * 1000 + + +class Resolution(Enum): + + # Note: cython won't allow us to reference the cdef versions at the + # module level + RESO_NS = 0 + RESO_US = 1 + RESO_MS = 2 + RESO_SEC = 3 + RESO_MIN = 4 + RESO_HR = 5 + RESO_DAY = 6 + RESO_MTH = 7 + RESO_QTR = 8 + RESO_YR = 9 + + def __lt__(self, other): + return self.value < other.value + + def __ge__(self, other): + return self.value >= other.value + + @property + def freq_group(self): + # TODO: annotate as returning FreqGroup 
once that is an enum + if self == Resolution.RESO_NS: + return FreqGroup.FR_NS + elif self == Resolution.RESO_US: + return FreqGroup.FR_US + elif self == Resolution.RESO_MS: + return FreqGroup.FR_MS + elif self == Resolution.RESO_SEC: + return FreqGroup.FR_SEC + elif self == Resolution.RESO_MIN: + return FreqGroup.FR_MIN + elif self == Resolution.RESO_HR: + return FreqGroup.FR_HR + elif self == Resolution.RESO_DAY: + return FreqGroup.FR_DAY + elif self == Resolution.RESO_MTH: + return FreqGroup.FR_MTH + elif self == Resolution.RESO_QTR: + return FreqGroup.FR_QTR + elif self == Resolution.RESO_YR: + return FreqGroup.FR_ANN + else: + raise ValueError(self) + + @property + def attrname(self) -> str: + """ + Return datetime attribute name corresponding to this Resolution. + + Examples + -------- + >>> Resolution.RESO_SEC.attrname + 'second' + """ + return _reso_str_map[self.value] + + @classmethod + def from_attrname(cls, attrname: str) -> "Resolution": + """ + Return resolution str against resolution code. + + Examples + -------- + >>> Resolution.from_attrname('second') + 2 + + >>> Resolution.from_attrname('second') == Resolution.RESO_SEC + True + """ + return cls(_str_reso_map[attrname]) + + @classmethod + def get_reso_from_freq(cls, freq: str) -> "Resolution": + """ + Return resolution code against frequency str. + + `freq` is given by the `offset.freqstr` for some DateOffset object. + + Examples + -------- + >>> Resolution.get_reso_from_freq('H') + 4 + + >>> Resolution.get_reso_from_freq('H') == Resolution.RESO_HR + True + """ + try: + attr_name = _abbrev_to_attrnames[freq] + except KeyError: + # For quarterly and yearly resolutions, we need to chop off + # a month string. + split_freq = freq.split("-") + if len(split_freq) != 2: + raise + if split_freq[1] not in _month_names: + # i.e. we want e.g. 
"Q-DEC", not "Q-INVALID" + raise + attr_name = _abbrev_to_attrnames[split_freq[0]] + + return cls.from_attrname(attr_name) + + +cdef dict _reso_str_map = { + Resolution.RESO_NS.value: "nanosecond", + Resolution.RESO_US.value: "microsecond", + Resolution.RESO_MS.value: "millisecond", + Resolution.RESO_SEC.value: "second", + Resolution.RESO_MIN.value: "minute", + Resolution.RESO_HR.value: "hour", + Resolution.RESO_DAY.value: "day", + Resolution.RESO_MTH.value: "month", + Resolution.RESO_QTR.value: "quarter", + Resolution.RESO_YR.value: "year", +} + +cdef dict _str_reso_map = {v: k for k, v in _reso_str_map.items()} diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 55522e99459cb..4dbecc76ad986 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -1,17 +1,15 @@ -from enum import Enum import numpy as np from numpy cimport ndarray, int64_t, int32_t from pandas._libs.tslibs.util cimport get_nat -from pandas._libs.tslibs.dtypes cimport attrname_to_abbrevs +from pandas._libs.tslibs.dtypes import Resolution from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dt64_to_dtstruct) -from pandas._libs.tslibs.frequencies import FreqGroup from pandas._libs.tslibs.timezones cimport ( is_utc, is_tzlocal, maybe_get_tz, get_dst_info) -from pandas._libs.tslibs.ccalendar cimport get_days_in_month, c_MONTH_NUMBERS +from pandas._libs.tslibs.ccalendar cimport get_days_in_month from pandas._libs.tslibs.tzconversion cimport tz_convert_utc_to_tzlocal # ---------------------------------------------------------------------- @@ -31,22 +29,6 @@ cdef: int RESO_QTR = 8 int RESO_YR = 9 -_abbrev_to_attrnames = {v: k for k, v in attrname_to_abbrevs.items()} - -_reso_str_map = { - RESO_NS: "nanosecond", - RESO_US: "microsecond", - RESO_MS: "millisecond", - RESO_SEC: "second", - RESO_MIN: "minute", - RESO_HR: "hour", - RESO_DAY: "day", - RESO_MTH: "month", - RESO_QTR: "quarter", - RESO_YR: "year", -} - 
-_str_reso_map = {v: k for k, v in _reso_str_map.items()} # ---------------------------------------------------------------------- @@ -122,111 +104,6 @@ cdef inline int _reso_stamp(npy_datetimestruct *dts): return RESO_DAY -class Resolution(Enum): - - # Note: cython won't allow us to reference the cdef versions at the - # module level - RESO_NS = 0 - RESO_US = 1 - RESO_MS = 2 - RESO_SEC = 3 - RESO_MIN = 4 - RESO_HR = 5 - RESO_DAY = 6 - RESO_MTH = 7 - RESO_QTR = 8 - RESO_YR = 9 - - def __lt__(self, other): - return self.value < other.value - - def __ge__(self, other): - return self.value >= other.value - - @property - def freq_group(self): - # TODO: annotate as returning FreqGroup once that is an enum - if self == Resolution.RESO_NS: - return FreqGroup.FR_NS - elif self == Resolution.RESO_US: - return FreqGroup.FR_US - elif self == Resolution.RESO_MS: - return FreqGroup.FR_MS - elif self == Resolution.RESO_SEC: - return FreqGroup.FR_SEC - elif self == Resolution.RESO_MIN: - return FreqGroup.FR_MIN - elif self == Resolution.RESO_HR: - return FreqGroup.FR_HR - elif self == Resolution.RESO_DAY: - return FreqGroup.FR_DAY - elif self == Resolution.RESO_MTH: - return FreqGroup.FR_MTH - elif self == Resolution.RESO_QTR: - return FreqGroup.FR_QTR - elif self == Resolution.RESO_YR: - return FreqGroup.FR_ANN - else: - raise ValueError(self) - - @property - def attrname(self) -> str: - """ - Return datetime attribute name corresponding to this Resolution. - - Examples - -------- - >>> Resolution.RESO_SEC.attrname - 'second' - """ - return _reso_str_map[self.value] - - @classmethod - def from_attrname(cls, attrname: str) -> "Resolution": - """ - Return resolution str against resolution code. 
- - Examples - -------- - >>> Resolution.from_attrname('second') - 2 - - >>> Resolution.from_attrname('second') == Resolution.RESO_SEC - True - """ - return cls(_str_reso_map[attrname]) - - @classmethod - def get_reso_from_freq(cls, freq: str) -> "Resolution": - """ - Return resolution code against frequency str. - - `freq` is given by the `offset.freqstr` for some DateOffset object. - - Examples - -------- - >>> Resolution.get_reso_from_freq('H') - 4 - - >>> Resolution.get_reso_from_freq('H') == Resolution.RESO_HR - True - """ - try: - attr_name = _abbrev_to_attrnames[freq] - except KeyError: - # For quarterly and yearly resolutions, we need to chop off - # a month string. - split_freq = freq.split("-") - if len(split_freq) != 2: - raise - if split_freq[1] not in c_MONTH_NUMBERS: - # i.e. we want e.g. "Q-DEC", not "Q-INVALID" - raise - attr_name = _abbrev_to_attrnames[split_freq[0]] - - return cls.from_attrname(attr_name) - - # ---------------------------------------------------------------------- # Frequency Inference From b4f484aac7deb8ce5e3932aaab6249522956ef73 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 16 Jun 2020 14:53:04 +0200 Subject: [PATCH 0138/1025] DOC: move 'Other API changes' under correct section (#34817) --- doc/source/whatsnew/v1.1.0.rst | 217 +++++++++++++++++---------------- 1 file changed, 110 insertions(+), 107 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7e04d8f906cb0..10522ff797c59 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -297,116 +297,10 @@ Other enhancements .. --------------------------------------------------------------------------- -Increased minimum versions for dependencies -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Some minimum supported versions of dependencies were updated (:issue:`33718`, :issue:`29766`, :issue:`29723`, pytables >= 3.4.3). 
-If installed, we now require: - -+-----------------+-----------------+----------+---------+ -| Package | Minimum Version | Required | Changed | -+=================+=================+==========+=========+ -| numpy | 1.15.4 | X | X | -+-----------------+-----------------+----------+---------+ -| pytz | 2015.4 | X | | -+-----------------+-----------------+----------+---------+ -| python-dateutil | 2.7.3 | X | X | -+-----------------+-----------------+----------+---------+ -| bottleneck | 1.2.1 | | | -+-----------------+-----------------+----------+---------+ -| numexpr | 2.6.2 | | | -+-----------------+-----------------+----------+---------+ -| pytest (dev) | 4.0.2 | | | -+-----------------+-----------------+----------+---------+ - -For `optional libraries `_ the general recommendation is to use the latest version. -The following table lists the lowest version per library that is currently being tested throughout the development of pandas. -Optional libraries below the lowest tested version may still work, but are not considered supported. 
- -+-----------------+-----------------+---------+ -| Package | Minimum Version | Changed | -+=================+=================+=========+ -| beautifulsoup4 | 4.6.0 | | -+-----------------+-----------------+---------+ -| fastparquet | 0.3.2 | | -+-----------------+-----------------+---------+ -| gcsfs | 0.2.2 | | -+-----------------+-----------------+---------+ -| lxml | 3.8.0 | | -+-----------------+-----------------+---------+ -| matplotlib | 2.2.2 | | -+-----------------+-----------------+---------+ -| numba | 0.46.0 | | -+-----------------+-----------------+---------+ -| openpyxl | 2.5.7 | | -+-----------------+-----------------+---------+ -| pyarrow | 0.13.0 | | -+-----------------+-----------------+---------+ -| pymysql | 0.7.1 | | -+-----------------+-----------------+---------+ -| pytables | 3.4.3 | X | -+-----------------+-----------------+---------+ -| s3fs | 0.3.0 | | -+-----------------+-----------------+---------+ -| scipy | 1.2.0 | X | -+-----------------+-----------------+---------+ -| sqlalchemy | 1.1.4 | | -+-----------------+-----------------+---------+ -| xarray | 0.8.2 | | -+-----------------+-----------------+---------+ -| xlrd | 1.1.0 | | -+-----------------+-----------------+---------+ -| xlsxwriter | 0.9.8 | | -+-----------------+-----------------+---------+ -| xlwt | 1.2.0 | | -+-----------------+-----------------+---------+ -| pandas-gbq | 1.2.0 | X | -+-----------------+-----------------+---------+ - -See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. - -Development Changes -^^^^^^^^^^^^^^^^^^^ - -- The minimum version of Cython is now the most recent bug-fix version (0.29.16) (:issue:`33334`). - -.. 
_whatsnew_110.api.other: - -Other API changes -^^^^^^^^^^^^^^^^^ - -- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` - will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) -- Added :meth:`DataFrame.value_counts` (:issue:`5377`) -- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) -- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) -- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) -- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. -- Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). +.. _whatsnew_110.api: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. - Previously an ``AttributeError`` was raised (:issue:`31126`) -- :meth:`DataFrame.xs` now raises a ``TypeError`` if a ``level`` keyword is supplied and the axis is not a :class:`MultiIndex`. - Previously an ``AttributeError`` was raised (:issue:`33610`) -- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) - now raise a ``TypeError`` if a not-accepted keyword argument is passed into it. 
- Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) -- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) -- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) -- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) -- Combining a ``Categorical`` with integer categories and which contains missing values - with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` - will now result in a float column instead of an object dtyped column (:issue:`33607`) -- :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) -- :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) -- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. -- :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` - (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) -- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) -- :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) -- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -733,6 +627,115 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once df.apply(func, axis=1) +.. 
_whatsnew_110.api.other: + +Other API changes +^^^^^^^^^^^^^^^^^ + +- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` + will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) +- Added :meth:`DataFrame.value_counts` (:issue:`5377`) +- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) +- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) +- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) +- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. +- Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). +- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. + Previously an ``AttributeError`` was raised (:issue:`31126`) +- :meth:`DataFrame.xs` now raises a ``TypeError`` if a ``level`` keyword is supplied and the axis is not a :class:`MultiIndex`. + Previously an ``AttributeError`` was raised (:issue:`33610`) +- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) + now raise a ``TypeError`` if a not-accepted keyword argument is passed into it. 
+ Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) +- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) +- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) +- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) +- Combining a ``Categorical`` with integer categories and which contains missing values + with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` + will now result in a float column instead of an object dtyped column (:issue:`33607`) +- :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) +- :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) +- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. +- :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` + (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) +- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) +- :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) +- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) + + +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some minimum supported versions of dependencies were updated (:issue:`33718`, :issue:`29766`, :issue:`29723`, pytables >= 3.4.3). +If installed, we now require: + ++-----------------+-----------------+----------+---------+ +| Package | Minimum Version | Required | Changed | ++=================+=================+==========+=========+ +| numpy | 1.15.4 | X | X | ++-----------------+-----------------+----------+---------+ +| pytz | 2015.4 | X | | ++-----------------+-----------------+----------+---------+ +| python-dateutil | 2.7.3 | X | X | ++-----------------+-----------------+----------+---------+ +| bottleneck | 1.2.1 | | | ++-----------------+-----------------+----------+---------+ +| numexpr | 2.6.2 | | | ++-----------------+-----------------+----------+---------+ +| pytest (dev) | 4.0.2 | | | ++-----------------+-----------------+----------+---------+ + +For `optional libraries `_ the general recommendation is to use the latest version. +The following table lists the lowest version per library that is currently being tested throughout the development of pandas. 
+Optional libraries below the lowest tested version may still work, but are not considered supported. + ++-----------------+-----------------+---------+ +| Package | Minimum Version | Changed | ++=================+=================+=========+ +| beautifulsoup4 | 4.6.0 | | ++-----------------+-----------------+---------+ +| fastparquet | 0.3.2 | | ++-----------------+-----------------+---------+ +| gcsfs | 0.2.2 | | ++-----------------+-----------------+---------+ +| lxml | 3.8.0 | | ++-----------------+-----------------+---------+ +| matplotlib | 2.2.2 | | ++-----------------+-----------------+---------+ +| numba | 0.46.0 | | ++-----------------+-----------------+---------+ +| openpyxl | 2.5.7 | | ++-----------------+-----------------+---------+ +| pyarrow | 0.13.0 | | ++-----------------+-----------------+---------+ +| pymysql | 0.7.1 | | ++-----------------+-----------------+---------+ +| pytables | 3.4.3 | X | ++-----------------+-----------------+---------+ +| s3fs | 0.3.0 | | ++-----------------+-----------------+---------+ +| scipy | 1.2.0 | X | ++-----------------+-----------------+---------+ +| sqlalchemy | 1.1.4 | | ++-----------------+-----------------+---------+ +| xarray | 0.8.2 | | ++-----------------+-----------------+---------+ +| xlrd | 1.1.0 | | ++-----------------+-----------------+---------+ +| xlsxwriter | 0.9.8 | | ++-----------------+-----------------+---------+ +| xlwt | 1.2.0 | | ++-----------------+-----------------+---------+ +| pandas-gbq | 1.2.0 | X | ++-----------------+-----------------+---------+ + +See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. + +Development Changes +^^^^^^^^^^^^^^^^^^^ + +- The minimum version of Cython is now the most recent bug-fix version (0.29.16) (:issue:`33334`). + .. 
_whatsnew_110.deprecations: From 5bf63a67e8e05ac375951f7c7ce297cbb1299139 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 16 Jun 2020 20:29:37 +0100 Subject: [PATCH 0139/1025] BUG, TST: fix-_check_ticks_props (#34768) --- pandas/tests/plotting/common.py | 4 ++-- pandas/tests/plotting/test_common.py | 24 ++++++++++++++++++++++++ pandas/tests/plotting/test_frame.py | 2 ++ 3 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/plotting/test_common.py diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index f2f7b37170ec9..896d3278cdde1 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -272,7 +272,7 @@ def _check_ticks_props( axes = self._flatten_visible(axes) for ax in axes: - if xlabelsize or xrot: + if xlabelsize is not None or xrot is not None: if isinstance(ax.xaxis.get_minor_formatter(), NullFormatter): # If minor ticks has NullFormatter, rot / fontsize are not # retained @@ -286,7 +286,7 @@ def _check_ticks_props( if xrot is not None: tm.assert_almost_equal(label.get_rotation(), xrot) - if ylabelsize or yrot: + if ylabelsize is not None or yrot is not None: if isinstance(ax.yaxis.get_minor_formatter(), NullFormatter): labels = ax.get_yticklabels() else: diff --git a/pandas/tests/plotting/test_common.py b/pandas/tests/plotting/test_common.py new file mode 100644 index 0000000000000..af67ed7ec215b --- /dev/null +++ b/pandas/tests/plotting/test_common.py @@ -0,0 +1,24 @@ +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +from pandas.tests.plotting.common import TestPlotBase, _check_plot_works + + +@td.skip_if_no_mpl +class TestCommon(TestPlotBase): + def test__check_ticks_props(self): + # GH 34768 + df = DataFrame({"b": [0, 1, 0], "a": [1, 2, 3]}) + ax = _check_plot_works(df.plot, rot=30) + ax.yaxis.set_tick_params(rotation=30) + msg = "expected 0.00000 but got " + with 
pytest.raises(AssertionError, match=msg): + self._check_ticks_props(ax, xrot=0) + with pytest.raises(AssertionError, match=msg): + self._check_ticks_props(ax, xlabelsize=0) + with pytest.raises(AssertionError, match=msg): + self._check_ticks_props(ax, yrot=0) + with pytest.raises(AssertionError, match=msg): + self._check_ticks_props(ax, ylabelsize=0) diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 8992e27a78d6b..e4299490e7601 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -48,6 +48,7 @@ def _assert_xtickslabels_visibility(self, axes, expected): for ax, exp in zip(axes, expected): self._check_visible(ax.get_xticklabels(), visible=exp) + @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True) @pytest.mark.slow def test_plot(self): from pandas.plotting._matplotlib.compat import _mpl_ge_3_1_0 @@ -467,6 +468,7 @@ def test_groupby_boxplot_sharex(self): expected = [False, False, True, True] self._assert_xtickslabels_visibility(axes, expected) + @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True) @pytest.mark.slow def test_subplots_timeseries(self): idx = date_range(start="2014-07-01", freq="M", periods=10) From 2e8af181f948e07089a2e9aaf5540239123793ab Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 16 Jun 2020 13:47:01 -0700 Subject: [PATCH 0140/1025] REF: remove libfrequencies (#34828) --- pandas/_libs/tslibs/frequencies.pxd | 1 - pandas/_libs/tslibs/frequencies.pyx | 40 ------------------- pandas/_libs/tslibs/period.pyx | 29 ++++++++++++-- pandas/core/arrays/period.py | 12 +++--- pandas/plotting/_matplotlib/timeseries.py | 2 +- .../tseries/frequencies/test_freq_code.py | 10 +++-- pandas/tests/tslibs/test_api.py | 1 - setup.py | 2 - 8 files changed, 38 insertions(+), 59 deletions(-) delete mode 100644 pandas/_libs/tslibs/frequencies.pxd delete mode 100644 pandas/_libs/tslibs/frequencies.pyx diff --git a/pandas/_libs/tslibs/frequencies.pxd 
b/pandas/_libs/tslibs/frequencies.pxd deleted file mode 100644 index b3ad6e6c19ee3..0000000000000 --- a/pandas/_libs/tslibs/frequencies.pxd +++ /dev/null @@ -1 +0,0 @@ -cpdef int get_to_timestamp_base(int base) diff --git a/pandas/_libs/tslibs/frequencies.pyx b/pandas/_libs/tslibs/frequencies.pyx deleted file mode 100644 index fd28240abd882..0000000000000 --- a/pandas/_libs/tslibs/frequencies.pyx +++ /dev/null @@ -1,40 +0,0 @@ - -from .dtypes import FreqGroup - -# ---------------------------------------------------------------------- - - -cpdef int get_to_timestamp_base(int base): - """ - Return frequency code group used for base of to_timestamp against - frequency code. - - Parameters - ---------- - base : int (member of FreqGroup) - - Returns - ------- - base : int - - Examples - -------- - # Return day freq code against longer freq than day - >>> get_to_timestamp_base(get_freq_code('D')[0]) - 6000 - >>> get_to_timestamp_base(get_freq_code('W')[0]) - 6000 - >>> get_to_timestamp_base(get_freq_code('M')[0]) - 6000 - - # Return second freq code against hour between second - >>> get_to_timestamp_base(get_freq_code('H')[0]) - 9000 - >>> get_to_timestamp_base(get_freq_code('S')[0]) - 9000 - """ - if base < FreqGroup.FR_BUS: - return FreqGroup.FR_DAY - elif FreqGroup.FR_HR <= base <= FreqGroup.FR_SEC: - return FreqGroup.FR_SEC - return base diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 30caddf81b6e8..a2250234dbd14 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -74,7 +74,6 @@ from pandas._libs.tslibs.dtypes cimport ( attrname_to_abbrevs, ) -from pandas._libs.tslibs.frequencies cimport get_to_timestamp_base from pandas._libs.tslibs.parsing cimport get_rule_month from pandas._libs.tslibs.parsing import parse_time_string from pandas._libs.tslibs.nattype cimport ( @@ -1478,7 +1477,30 @@ class IncompatibleFrequency(ValueError): pass -cdef class _Period: +cdef class PeriodMixin: + # Methods shared 
between Period and PeriodArray + + cpdef int _get_to_timestamp_base(self): + """ + Return frequency code group used for base of to_timestamp against + frequency code. + + Return day freq code against longer freq than day. + Return second freq code against hour between second. + + Returns + ------- + int + """ + base = self._dtype._dtype_code + if base < FR_BUS: + return FR_DAY + elif FR_HR <= base <= FR_SEC: + return FR_SEC + return base + + +cdef class _Period(PeriodMixin): cdef readonly: int64_t ordinal @@ -1734,8 +1756,7 @@ cdef class _Period: return endpoint - Timedelta(1, 'ns') if freq is None: - base = self._dtype._dtype_code - freq = get_to_timestamp_base(base) + freq = self._get_to_timestamp_base() base = freq else: freq = self._maybe_convert_freq(freq) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 0d866aa7eae26..7902dd0410910 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -9,17 +9,18 @@ NaTType, Timedelta, delta_to_nanoseconds, - frequencies as libfrequencies, iNaT, period as libperiod, to_offset, ) +from pandas._libs.tslibs.dtypes import FreqGroup from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.offsets import Tick, delta_to_tick from pandas._libs.tslibs.period import ( DIFFERENT_FREQ, IncompatibleFrequency, Period, + PeriodMixin, get_period_field_arr, period_asfreq_arr, ) @@ -61,7 +62,7 @@ def f(self): return property(f) -class PeriodArray(dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): +class PeriodArray(PeriodMixin, dtl.DatetimeLikeArrayMixin, dtl.DatelikeOps): """ Pandas ExtensionArray for storing Period data. 
@@ -440,8 +441,7 @@ def to_timestamp(self, freq=None, how="start"): return (self + self.freq).to_timestamp(how="start") - adjust if freq is None: - base = self.freq._period_dtype_code - freq = libfrequencies.get_to_timestamp_base(base) + freq = self._get_to_timestamp_base() base = freq else: freq = Period._maybe_convert_freq(freq) @@ -1027,11 +1027,11 @@ def _range_from_fields( if quarter is not None: if freq is None: freq = to_offset("Q") - base = libfrequencies.FreqGroup.FR_QTR + base = FreqGroup.FR_QTR else: freq = to_offset(freq) base = libperiod.freq_to_dtype_code(freq) - if base != libfrequencies.FreqGroup.FR_QTR: + if base != FreqGroup.FR_QTR: raise AssertionError("base must equal FR_QTR") year, quarter = _make_field_arrays(year, quarter) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index fa8051954e435..8ffd30567b9ac 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -6,7 +6,7 @@ import numpy as np from pandas._libs.tslibs import Period, to_offset -from pandas._libs.tslibs.frequencies import FreqGroup +from pandas._libs.tslibs.dtypes import FreqGroup from pandas._typing import FrameOrSeriesUnion from pandas.core.dtypes.generic import ( diff --git a/pandas/tests/tseries/frequencies/test_freq_code.py b/pandas/tests/tseries/frequencies/test_freq_code.py index 5383c1ff1c2c9..20cadde45e7a0 100644 --- a/pandas/tests/tseries/frequencies/test_freq_code.py +++ b/pandas/tests/tseries/frequencies/test_freq_code.py @@ -1,8 +1,7 @@ import pytest -from pandas._libs.tslibs import Resolution, to_offset +from pandas._libs.tslibs import Period, Resolution, to_offset from pandas._libs.tslibs.dtypes import _attrname_to_abbrevs -from pandas._libs.tslibs.frequencies import get_to_timestamp_base @pytest.mark.parametrize( @@ -10,9 +9,12 @@ [("D", "D"), ("W", "D"), ("M", "D"), ("S", "S"), ("T", "S"), ("H", "S")], ) def test_get_to_timestamp_base(freqstr, exp_freqstr): - 
left_code = to_offset(freqstr)._period_dtype_code + off = to_offset(freqstr) + per = Period._from_ordinal(1, off) exp_code = to_offset(exp_freqstr)._period_dtype_code - assert get_to_timestamp_base(left_code) == exp_code + + result_code = per._get_to_timestamp_base() + assert result_code == exp_code @pytest.mark.parametrize( diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index b0c524a257684..a119db6c68635 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -11,7 +11,6 @@ def test_namespace(): "conversion", "dtypes", "fields", - "frequencies", "nattype", "np_datetime", "offsets", diff --git a/setup.py b/setup.py index 3caea5c5e79da..e9d305d831653 100755 --- a/setup.py +++ b/setup.py @@ -319,7 +319,6 @@ class CheckSDist(sdist_class): "pandas/_libs/tslibs/conversion.pyx", "pandas/_libs/tslibs/fields.pyx", "pandas/_libs/tslibs/offsets.pyx", - "pandas/_libs/tslibs/frequencies.pyx", "pandas/_libs/tslibs/resolution.pyx", "pandas/_libs/tslibs/parsing.pyx", "pandas/_libs/tslibs/tzconversion.pyx", @@ -615,7 +614,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "pyxfile": "_libs/tslibs/fields", "depends": tseries_depends, }, - "_libs.tslibs.frequencies": {"pyxfile": "_libs/tslibs/frequencies"}, "_libs.tslibs.nattype": {"pyxfile": "_libs/tslibs/nattype"}, "_libs.tslibs.np_datetime": { "pyxfile": "_libs/tslibs/np_datetime", From c2a0e21fe89eab924ff1260b0704522922e50e8c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 17 Jun 2020 11:56:59 +0100 Subject: [PATCH 0141/1025] DOC: 1.0.5 release date (#34845) --- doc/source/whatsnew/v1.0.5.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.5.rst b/doc/source/whatsnew/v1.0.5.rst index fdf08dd381050..9a5128a07bbfd 100644 --- a/doc/source/whatsnew/v1.0.5.rst +++ b/doc/source/whatsnew/v1.0.5.rst @@ -1,7 +1,7 @@ .. 
_whatsnew_105: -What's new in 1.0.5 (June XX, 2020) +What's new in 1.0.5 (June 17, 2020) ----------------------------------- These are the changes in pandas 1.0.5. See :ref:`release` for a full changelog From e9e0f3cb5844d052efde75b51392e606c017945e Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 17 Jun 2020 10:26:56 -0700 Subject: [PATCH 0142/1025] Replaced numpy type aliases; fix CI failure (#34835) --- asv_bench/benchmarks/pandas_vb_common.py | 2 +- asv_bench/benchmarks/series_methods.py | 18 ++- asv_bench/benchmarks/sparse.py | 2 +- doc/source/user_guide/io.rst | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 2 +- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- pandas/_libs/parsers.pyx | 2 +- pandas/_libs/sparse.pyx | 2 +- pandas/_libs/testing.pyx | 2 +- pandas/core/algorithms.py | 2 +- pandas/core/arrays/sparse/array.py | 4 +- pandas/core/base.py | 2 +- pandas/core/dtypes/cast.py | 10 +- pandas/core/dtypes/common.py | 4 +- pandas/core/generic.py | 4 +- pandas/core/groupby/groupby.py | 6 +- pandas/core/indexes/base.py | 2 +- pandas/core/internals/managers.py | 2 +- pandas/core/util/hashing.py | 2 +- pandas/io/parsers.py | 6 +- pandas/io/sas/sas7bdat.py | 2 +- pandas/io/stata.py | 8 +- pandas/plotting/_matplotlib/tools.py | 2 +- pandas/tests/arithmetic/test_period.py | 26 ++-- pandas/tests/arrays/boolean/test_logical.py | 4 +- .../tests/arrays/categorical/test_dtypes.py | 4 +- .../tests/arrays/sparse/test_arithmetics.py | 18 +-- pandas/tests/arrays/sparse/test_array.py | 32 ++--- .../dtypes/cast/test_find_common_type.py | 44 +++---- pandas/tests/dtypes/cast/test_infer_dtype.py | 8 +- pandas/tests/dtypes/test_common.py | 7 +- pandas/tests/dtypes/test_dtypes.py | 2 +- pandas/tests/dtypes/test_inference.py | 7 +- pandas/tests/extension/base/setitem.py | 4 +- pandas/tests/frame/methods/test_duplicated.py | 2 +- pandas/tests/frame/methods/test_isin.py | 2 +- pandas/tests/frame/methods/test_to_records.py | 2 +- pandas/tests/frame/test_analytics.py | 2 +- 
pandas/tests/frame/test_dtypes.py | 10 +- pandas/tests/frame/test_reshape.py | 4 +- pandas/tests/frame/test_to_csv.py | 4 +- pandas/tests/generic/test_finalize.py | 2 - .../tests/groupby/aggregate/test_aggregate.py | 2 +- pandas/tests/groupby/test_apply.py | 2 +- .../indexes/categorical/test_category.py | 6 +- pandas/tests/indexes/common.py | 4 +- pandas/tests/indexes/multi/test_indexing.py | 4 +- pandas/tests/indexes/period/test_period.py | 2 +- pandas/tests/indexes/test_base.py | 4 +- pandas/tests/indexing/test_coercion.py | 120 ++++++++---------- pandas/tests/indexing/test_indexing.py | 4 +- pandas/tests/indexing/test_loc.py | 2 +- pandas/tests/io/formats/test_format.py | 2 +- .../tests/io/json/test_json_table_schema.py | 16 +-- pandas/tests/io/json/test_pandas.py | 4 +- pandas/tests/io/json/test_ujson.py | 12 +- pandas/tests/io/parser/test_c_parser_only.py | 4 +- pandas/tests/io/parser/test_common.py | 4 +- pandas/tests/io/parser/test_dtypes.py | 4 +- pandas/tests/io/pytables/test_store.py | 2 +- pandas/tests/io/test_sql.py | 2 +- pandas/tests/io/test_stata.py | 4 +- pandas/tests/plotting/test_hist_method.py | 2 +- pandas/tests/plotting/test_series.py | 2 +- pandas/tests/resample/test_base.py | 2 +- pandas/tests/reshape/test_concat.py | 4 +- pandas/tests/series/indexing/test_where.py | 2 +- pandas/tests/series/test_apply.py | 4 +- pandas/tests/series/test_combine_concat.py | 4 +- pandas/tests/test_algos.py | 16 +-- pandas/tests/test_take.py | 2 +- pandas/tests/tslibs/test_fields.py | 4 +- .../window/moments/test_moments_rolling.py | 4 +- 73 files changed, 247 insertions(+), 277 deletions(-) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index fd1770df8e5d3..23286343d7367 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -33,7 +33,7 @@ np.uint8, ] datetime_dtypes = [np.datetime64, np.timedelta64] -string_dtypes = [np.object] +string_dtypes = [object] try: 
extension_dtypes = [ pd.Int8Dtype, diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index d78419c12ce0d..258c29c145721 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -58,17 +58,15 @@ def time_isin_nan_values(self): class IsInForObjects: def setup(self): - self.s_nans = Series(np.full(10 ** 4, np.nan)).astype(np.object) - self.vals_nans = np.full(10 ** 4, np.nan).astype(np.object) - self.s_short = Series(np.arange(2)).astype(np.object) - self.s_long = Series(np.arange(10 ** 5)).astype(np.object) - self.vals_short = np.arange(2).astype(np.object) - self.vals_long = np.arange(10 ** 5).astype(np.object) + self.s_nans = Series(np.full(10 ** 4, np.nan)).astype(object) + self.vals_nans = np.full(10 ** 4, np.nan).astype(object) + self.s_short = Series(np.arange(2)).astype(object) + self.s_long = Series(np.arange(10 ** 5)).astype(object) + self.vals_short = np.arange(2).astype(object) + self.vals_long = np.arange(10 ** 5).astype(object) # because of nans floats are special: - self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype( - np.object - ) - self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(np.object) + self.s_long_floats = Series(np.arange(10 ** 5, dtype=np.float)).astype(object) + self.vals_long_floats = np.arange(10 ** 5, dtype=np.float).astype(object) def time_isin_nans(self): # if nan-objects are different objects, diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index d6aa41a7e0f32..28ceb25eebd96 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -32,7 +32,7 @@ def time_series_to_frame(self): class SparseArrayConstructor: - params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, np.object]) + params = ([0.1, 0.01], [0, np.nan], [np.int64, np.float64, object]) param_names = ["dense_proportion", "fill_value", "dtype"] def setup(self, dense_proportion, fill_value, dtype): 
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index df6b44ac654ce..d4be9d802d697 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1884,7 +1884,7 @@ Fallback behavior If the JSON serializer cannot handle the container contents directly it will fall back in the following manner: -* if the dtype is unsupported (e.g. ``np.complex``) then the ``default_handler``, if provided, will be called +* if the dtype is unsupported (e.g. ``np.complex_``) then the ``default_handler``, if provided, will be called for each value, otherwise an exception is raised. * if an object is unsupported it will attempt the following: diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index ad65f9707610b..e0e026fe7cb5e 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -178,7 +178,7 @@ cdef class StringVector: Py_ssize_t n object val - ao = np.empty(self.data.n, dtype=np.object) + ao = np.empty(self.data.n, dtype=object) for i in range(self.data.n): val = self.data.data[i] ao[i] = val diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 326ae36c6a12c..0cc0a6b192df5 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -94,7 +94,7 @@ cpdef value_count_{{dtype}}({{c_type}}[:] values, bint dropna): build_count_table_{{dtype}}(values, table, dropna) {{endif}} - result_keys = np.empty(table.n_occupied, dtype=np.{{dtype}}) + result_keys = np.empty(table.n_occupied, '{{dtype}}') result_counts = np.zeros(table.n_occupied, dtype=np.int64) {{if dtype == 'object'}} diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 461419239c730..6ffb036e01595 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2037,7 +2037,7 @@ def _concatenate_chunks(list chunks): numpy_dtypes = {x for x in dtypes if not 
is_categorical_dtype(x)} if len(numpy_dtypes) > 1: common_type = np.find_common_type(numpy_dtypes, []) - if common_type == np.object: + if common_type == object: warning_columns.append(str(name)) dtype = dtypes.pop() diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index d853ddf3de7d4..7c9575d921dc9 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -791,4 +791,4 @@ def make_mask_object_ndarray(ndarray[object, ndim=1] arr, object fill_value): if value == fill_value and type(value) == type(fill_value): mask[i] = 0 - return mask.view(dtype=np.bool) + return mask.view(dtype=bool) diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 9d3959d0a070a..ca18afebf410b 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -11,7 +11,7 @@ cdef NUMERIC_TYPES = ( bool, int, float, - np.bool, + np.bool_, np.int8, np.int16, np.int32, diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d270a6431be56..dcf2015245518 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -171,7 +171,7 @@ def _ensure_data( return values, dtype # we have failed, return object - values = np.asarray(values, dtype=np.object) + values = np.asarray(values, dtype=object) return ensure_object(values), np.dtype("object") diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 9b89ec99e8df6..4996a10002c63 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -150,7 +150,7 @@ def _sparse_array_op( # to make template simple, cast here left_sp_values = left.sp_values.view(np.uint8) right_sp_values = right.sp_values.view(np.uint8) - result_dtype = np.bool + result_dtype = bool else: opname = f"sparse_{name}_{dtype}" left_sp_values = left.sp_values @@ -183,7 +183,7 @@ def _wrap_result(name, data, sparse_index, fill_value, dtype=None): name = name[2:-2] if name in ("eq", "ne", "lt", "gt", "le", "ge"): - dtype = np.bool + dtype = bool 
fill_value = lib.item_from_zerodim(fill_value) diff --git a/pandas/core/base.py b/pandas/core/base.py index bb1afc8f8ef20..e790b1d7f106e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1520,7 +1520,7 @@ def drop_duplicates(self, keep="first"): def duplicated(self, keep="first"): if isinstance(self, ABCIndexClass): if self.is_unique: - return np.zeros(len(self), dtype=np.bool) + return np.zeros(len(self), dtype=bool) return duplicated(self, keep=keep) else: return self._constructor( diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2a47a03b8d387..e69e3bab10af8 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -225,7 +225,7 @@ def trans(x): # if we have any nulls, then we are done return result - elif not isinstance(r[0], (np.integer, np.floating, np.bool, int, float, bool)): + elif not isinstance(r[0], (np.integer, np.floating, int, float, bool)): # a comparable, e.g. a Decimal may slip in here return result @@ -315,7 +315,7 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj: from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype - if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(np.bool)): + if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(bool)): return np.dtype(np.int64) elif how in ["add", "cumsum", "sum"] and isinstance(dtype, BooleanDtype): return Int64Dtype() @@ -597,7 +597,7 @@ def _ensure_dtype_type(value, dtype): """ Ensure that the given value is an instance of the given dtype. - e.g. if out dtype is np.complex64, we should have an instance of that + e.g. if out dtype is np.complex64_, we should have an instance of that as opposed to a python complex object. 
Parameters @@ -1483,7 +1483,7 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: if has_bools: for t in types: if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t): - return np.object + return object return np.find_common_type(types, []) @@ -1742,7 +1742,7 @@ def validate_numeric_casting(dtype: np.dtype, value): if is_float(value) and np.isnan(value): raise ValueError("Cannot assign nan to integer series") - if issubclass(dtype.type, (np.integer, np.floating, np.complex)) and not issubclass( + if issubclass(dtype.type, (np.integer, np.floating, complex)) and not issubclass( dtype.type, np.bool_ ): if is_bool(value): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a4a5ae1bfefff..9e960375e9bf4 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1354,7 +1354,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: False >>> is_bool_dtype(bool) True - >>> is_bool_dtype(np.bool) + >>> is_bool_dtype(np.bool_) True >>> is_bool_dtype(np.array(['a', 'b'])) False @@ -1526,7 +1526,7 @@ def is_complex_dtype(arr_or_dtype) -> bool: False >>> is_complex_dtype(int) False - >>> is_complex_dtype(np.complex) + >>> is_complex_dtype(np.complex_) True >>> is_complex_dtype(np.array(['a', 'b'])) False diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9014e576eeb39..26770efb5c9f9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10024,7 +10024,7 @@ def describe( Including only string columns in a ``DataFrame`` description. - >>> df.describe(include=[np.object]) # doctest: +SKIP + >>> df.describe(include=[object]) # doctest: +SKIP object count 3 unique 3 @@ -10051,7 +10051,7 @@ def describe( Excluding object columns from a ``DataFrame`` description. 
- >>> df.describe(exclude=[np.object]) # doctest: +SKIP + >>> df.describe(exclude=[object]) # doctest: +SKIP categorical numeric count 3 3.0 unique 3 NaN diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 904049923859d..48fdb14ebe90c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1267,9 +1267,9 @@ def objs_to_bool(vals: np.ndarray) -> Tuple[np.ndarray, Type]: if is_object_dtype(vals): vals = np.array([bool(x) for x in vals]) else: - vals = vals.astype(np.bool) + vals = vals.astype(bool) - return vals.view(np.uint8), np.bool + return vals.view(np.uint8), bool def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: return result.astype(inference, copy=False) @@ -2059,7 +2059,7 @@ def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: vals = vals.to_numpy(dtype=float, na_value=np.nan) elif is_datetime64_dtype(vals.dtype): inference = "datetime64[ns]" - vals = np.asarray(vals).astype(np.float) + vals = np.asarray(vals).astype(float) return vals, inference diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index c046d6465ce67..057adceda7efd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -374,7 +374,7 @@ def __new__( return UInt64Index(data, copy=copy, dtype=dtype, name=name) elif is_float_dtype(data.dtype): return Float64Index(data, copy=copy, dtype=dtype, name=name) - elif issubclass(data.dtype.type, np.bool) or is_bool_dtype(data): + elif issubclass(data.dtype.type, bool) or is_bool_dtype(data): subarr = data.astype("object") else: subarr = com.asarray_tuplesafe(data, dtype=object) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index e496694ee7899..eaf59051205d6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1951,7 +1951,7 @@ def _check_comparison_types( if isinstance(result, np.ndarray): # The shape of the mask can differ to that of the 
result # since we may compare only a subset of a's or b's elements - tmp = np.zeros(mask.shape, dtype=np.bool) + tmp = np.zeros(mask.shape, dtype=np.bool_) tmp[mask] = result result = tmp diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 1d6e02254e44a..1b56b6d5a46fa 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -264,7 +264,7 @@ def hash_array( # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. - elif isinstance(dtype, np.bool): + elif isinstance(dtype, bool): vals = vals.astype("u8") elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view("i8").astype("u8", copy=False) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c54e264faedd2..679cf4c2d8929 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -3476,13 +3476,13 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): # This will enable us to write `dtype[col_name]` # without worrying about KeyError issues later on. if not isinstance(dtype, dict): - # if dtype == None, default will be np.object. - default_dtype = dtype or np.object + # if dtype == None, default will be object. + default_dtype = dtype or object dtype = defaultdict(lambda: default_dtype) else: # Save a copy of the dictionary. _dtype = dtype.copy() - dtype = defaultdict(lambda: np.object) + dtype = defaultdict(lambda: object) # Convert column indexes to column names. 
for k, v in _dtype.items(): diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index c8f1336bcec60..3d9be7c15726b 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -685,7 +685,7 @@ def read(self, nrows=None): nd = self._column_types.count(b"d") ns = self._column_types.count(b"s") - self._string_chunk = np.empty((ns, nrows), dtype=np.object) + self._string_chunk = np.empty((ns, nrows), dtype=object) self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8) self._current_row_in_chunk_index = 0 diff --git a/pandas/io/stata.py b/pandas/io/stata.py index e9adf5292ef6f..7677d8a94d521 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -322,7 +322,7 @@ def convert_delta_safe(base, deltas, unit) -> Series: elif fmt.startswith(("%tC", "tC")): warnings.warn("Encountered %tC format. Leaving in Stata Internal Format.") - conv_dates = Series(dates, dtype=np.object) + conv_dates = Series(dates, dtype=object) if has_bad_values: conv_dates[bad_locs] = NaT return conv_dates @@ -451,7 +451,7 @@ def g(x: datetime.datetime) -> int: conv_dates = 4 * (d.year - stata_epoch.year) + (d.month - 1) // 3 elif fmt in ["%th", "th"]: d = parse_dates_safe(dates, year=True) - conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(np.int) + conv_dates = 2 * (d.year - stata_epoch.year) + (d.month > 6).astype(int) elif fmt in ["%ty", "ty"]: d = parse_dates_safe(dates, year=True) conv_dates = d.year @@ -553,7 +553,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame: ws = "" # original, if small, if large conversion_data = ( - (np.bool, np.int8, np.int8), + (np.bool_, np.int8, np.int8), (np.uint8, np.int8, np.int16), (np.uint16, np.int16, np.int32), (np.uint32, np.int32, np.int64), @@ -1725,7 +1725,7 @@ def _do_convert_missing(self, data: DataFrame, convert_missing: bool) -> DataFra if convert_missing: # Replacement follows Stata notation missing_loc = np.nonzero(np.asarray(missing))[0] umissing, umissing_loc = 
np.unique(series[missing], return_inverse=True) - replacement = Series(series, dtype=np.object) + replacement = Series(series, dtype=object) for j, um in enumerate(umissing): missing_value = StataMissingValue(um) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index ef8376bfef8a9..caf2f27de9276 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -301,7 +301,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): try: # first find out the ax layout, # so that we can correctly handle 'gaps" - layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool) + layout = np.zeros((nrows + 1, ncols + 1), dtype=np.bool_) for ax in axarr: layout[row_num(ax), col_num(ax)] = ax.get_visible() diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index ccd03e841a40d..6c7b989bb9f2e 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -457,27 +457,27 @@ def test_pi_comp_period(self): ) f = lambda x: x == pd.Period("2011-03", freq="M") - exp = np.array([False, False, True, False], dtype=np.bool) + exp = np.array([False, False, True, False], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: pd.Period("2011-03", freq="M") == x self._check(idx, f, exp) f = lambda x: x != pd.Period("2011-03", freq="M") - exp = np.array([True, True, False, True], dtype=np.bool) + exp = np.array([True, True, False, True], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: pd.Period("2011-03", freq="M") != x self._check(idx, f, exp) f = lambda x: pd.Period("2011-03", freq="M") >= x - exp = np.array([True, True, True, False], dtype=np.bool) + exp = np.array([True, True, True, False], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: x > pd.Period("2011-03", freq="M") - exp = np.array([False, False, False, True], dtype=np.bool) + exp = np.array([False, False, False, True], dtype=np.bool_) self._check(idx, 
f, exp) f = lambda x: pd.Period("2011-03", freq="M") >= x - exp = np.array([True, True, True, False], dtype=np.bool) + exp = np.array([True, True, True, False], dtype=np.bool_) self._check(idx, f, exp) def test_pi_comp_period_nat(self): @@ -486,43 +486,43 @@ def test_pi_comp_period_nat(self): ) f = lambda x: x == pd.Period("2011-03", freq="M") - exp = np.array([False, False, True, False], dtype=np.bool) + exp = np.array([False, False, True, False], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: pd.Period("2011-03", freq="M") == x self._check(idx, f, exp) f = lambda x: x == pd.NaT - exp = np.array([False, False, False, False], dtype=np.bool) + exp = np.array([False, False, False, False], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: pd.NaT == x self._check(idx, f, exp) f = lambda x: x != pd.Period("2011-03", freq="M") - exp = np.array([True, True, False, True], dtype=np.bool) + exp = np.array([True, True, False, True], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: pd.Period("2011-03", freq="M") != x self._check(idx, f, exp) f = lambda x: x != pd.NaT - exp = np.array([True, True, True, True], dtype=np.bool) + exp = np.array([True, True, True, True], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: pd.NaT != x self._check(idx, f, exp) f = lambda x: pd.Period("2011-03", freq="M") >= x - exp = np.array([True, False, True, False], dtype=np.bool) + exp = np.array([True, False, True, False], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: x < pd.Period("2011-03", freq="M") - exp = np.array([True, False, False, False], dtype=np.bool) + exp = np.array([True, False, False, False], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: x > pd.NaT - exp = np.array([False, False, False, False], dtype=np.bool) + exp = np.array([False, False, False, False], dtype=np.bool_) self._check(idx, f, exp) f = lambda x: pd.NaT >= x - exp = np.array([False, False, False, False], dtype=np.bool) + exp = np.array([False, False, False, False], 
dtype=np.bool_) self._check(idx, f, exp) diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py index bf4775bbd7b32..e79262e1b7934 100644 --- a/pandas/tests/arrays/boolean/test_logical.py +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -14,8 +14,8 @@ def test_numpy_scalars_ok(self, all_logical_operators): a = pd.array([True, False, None], dtype="boolean") op = getattr(a, all_logical_operators) - tm.assert_extension_array_equal(op(True), op(np.bool(True))) - tm.assert_extension_array_equal(op(False), op(np.bool(False))) + tm.assert_extension_array_equal(op(True), op(np.bool_(True))) + tm.assert_extension_array_equal(op(False), op(np.bool_(False))) def get_op_from_name(self, op_name): short_opname = op_name.strip("_") diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 9922a8863ebc2..47ce9cb4089f9 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -127,11 +127,11 @@ def test_astype(self, ordered): tm.assert_numpy_array_equal(result, expected) result = cat.astype(int) - expected = np.array(cat, dtype=np.int) + expected = np.array(cat, dtype=int) tm.assert_numpy_array_equal(result, expected) result = cat.astype(float) - expected = np.array(cat, dtype=np.float) + expected = np.array(cat, dtype=float) tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize("dtype_ordered", [True, False]) diff --git a/pandas/tests/arrays/sparse/test_arithmetics.py b/pandas/tests/arrays/sparse/test_arithmetics.py index 4ae1c1e6b63ce..c9f1dd7f589fc 100644 --- a/pandas/tests/arrays/sparse/test_arithmetics.py +++ b/pandas/tests/arrays/sparse/test_arithmetics.py @@ -53,7 +53,7 @@ def _check_numeric_ops(self, a, b, a_dense, b_dense, mix, op): def _check_bool_result(self, res): assert isinstance(res, self._klass) assert isinstance(res.dtype, SparseDtype) - assert res.dtype.subtype == np.bool + assert 
res.dtype.subtype == np.bool_ assert isinstance(res.fill_value, bool) def _check_comparison_ops(self, a, b, a_dense, b_dense): @@ -306,22 +306,22 @@ def test_int_array_comparison(self, kind): def test_bool_same_index(self, kind, fill_value): # GH 14000 # when sp_index are the same - values = self._base([True, False, True, True], dtype=np.bool) - rvalues = self._base([True, False, True, True], dtype=np.bool) + values = self._base([True, False, True, True], dtype=np.bool_) + rvalues = self._base([True, False, True, True], dtype=np.bool_) - a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) - b = self._klass(rvalues, kind=kind, dtype=np.bool, fill_value=fill_value) + a = self._klass(values, kind=kind, dtype=np.bool_, fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) self._check_logical_ops(a, b, values, rvalues) @pytest.mark.parametrize("fill_value", [True, False, np.nan]) def test_bool_array_logical(self, kind, fill_value): # GH 14000 # when sp_index are the same - values = self._base([True, False, True, False, True, True], dtype=np.bool) - rvalues = self._base([True, False, False, True, False, True], dtype=np.bool) + values = self._base([True, False, True, False, True, True], dtype=np.bool_) + rvalues = self._base([True, False, False, True, False, True], dtype=np.bool_) - a = self._klass(values, kind=kind, dtype=np.bool, fill_value=fill_value) - b = self._klass(rvalues, kind=kind, dtype=np.bool, fill_value=fill_value) + a = self._klass(values, kind=kind, dtype=np.bool_, fill_value=fill_value) + b = self._klass(rvalues, kind=kind, dtype=np.bool_, fill_value=fill_value) self._check_logical_ops(a, b, values, rvalues) def test_mixed_array_float_int(self, kind, mix, all_arithmetic_functions): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 8450253f853c3..2f2907fbaaebc 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ 
b/pandas/tests/arrays/sparse/test_array.py @@ -74,22 +74,22 @@ def test_constructor_sparse_dtype_str(self): def test_constructor_object_dtype(self): # GH 11856 - arr = SparseArray(["A", "A", np.nan, "B"], dtype=np.object) - assert arr.dtype == SparseDtype(np.object) + arr = SparseArray(["A", "A", np.nan, "B"], dtype=object) + assert arr.dtype == SparseDtype(object) assert np.isnan(arr.fill_value) - arr = SparseArray(["A", "A", np.nan, "B"], dtype=np.object, fill_value="A") - assert arr.dtype == SparseDtype(np.object, "A") + arr = SparseArray(["A", "A", np.nan, "B"], dtype=object, fill_value="A") + assert arr.dtype == SparseDtype(object, "A") assert arr.fill_value == "A" # GH 17574 data = [False, 0, 100.0, 0.0] - arr = SparseArray(data, dtype=np.object, fill_value=False) - assert arr.dtype == SparseDtype(np.object, False) + arr = SparseArray(data, dtype=object, fill_value=False) + assert arr.dtype == SparseDtype(object, False) assert arr.fill_value is False - arr_expected = np.array(data, dtype=np.object) + arr_expected = np.array(data, dtype=object) it = (type(x) == type(y) and x == y for x, y in zip(arr, arr_expected)) - assert np.fromiter(it, dtype=np.bool).all() + assert np.fromiter(it, dtype=np.bool_).all() @pytest.mark.parametrize("dtype", [SparseDtype(int, 0), int]) def test_constructor_na_dtype(self, dtype): @@ -445,15 +445,15 @@ def test_constructor_bool(self): def test_constructor_bool_fill_value(self): arr = SparseArray([True, False, True], dtype=None) - assert arr.dtype == SparseDtype(np.bool) + assert arr.dtype == SparseDtype(np.bool_) assert not arr.fill_value - arr = SparseArray([True, False, True], dtype=np.bool) - assert arr.dtype == SparseDtype(np.bool) + arr = SparseArray([True, False, True], dtype=np.bool_) + assert arr.dtype == SparseDtype(np.bool_) assert not arr.fill_value - arr = SparseArray([True, False, True], dtype=np.bool, fill_value=True) - assert arr.dtype == SparseDtype(np.bool, True) + arr = SparseArray([True, False, True], 
dtype=np.bool_, fill_value=True) + assert arr.dtype == SparseDtype(np.bool_, True) assert arr.fill_value def test_constructor_float32(self): @@ -588,7 +588,7 @@ def test_set_fill_value(self): arr.fill_value = np.nan assert np.isnan(arr.fill_value) - arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) + arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_) arr.fill_value = True assert arr.fill_value @@ -605,7 +605,7 @@ def test_set_fill_value(self): @pytest.mark.parametrize("val", [[1, 2, 3], np.array([1, 2]), (1, 2, 3)]) def test_set_fill_invalid_non_scalar(self, val): - arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool) + arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_) msg = "fill_value must be a scalar" with pytest.raises(ValueError, match=msg): @@ -625,7 +625,7 @@ def test_values_asarray(self): ([0, 0, 0, 0, 0], (5,), None), ([], (0,), None), ([0], (1,), None), - (["A", "A", np.nan, "B"], (4,), np.object), + (["A", "A", np.nan, "B"], (4,), object), ], ) def test_shape(self, data, shape, dtype): diff --git a/pandas/tests/dtypes/cast/test_find_common_type.py b/pandas/tests/dtypes/cast/test_find_common_type.py index ac7a5221d3469..8dac92f469703 100644 --- a/pandas/tests/dtypes/cast/test_find_common_type.py +++ b/pandas/tests/dtypes/cast/test_find_common_type.py @@ -11,7 +11,7 @@ ((np.int64,), np.int64), ((np.uint64,), np.uint64), ((np.float32,), np.float32), - ((np.object,), np.object), + ((object,), object), # Into ints. ((np.int16, np.int64), np.int64), ((np.int32, np.uint32), np.int64), @@ -25,20 +25,20 @@ ((np.float16, np.int64), np.float64), # Into others. ((np.complex128, np.int32), np.complex128), - ((np.object, np.float32), np.object), - ((np.object, np.int16), np.object), + ((object, np.float32), object), + ((object, np.int16), object), # Bool with int. 
- ((np.dtype("bool"), np.int64), np.object), - ((np.dtype("bool"), np.int32), np.object), - ((np.dtype("bool"), np.int16), np.object), - ((np.dtype("bool"), np.int8), np.object), - ((np.dtype("bool"), np.uint64), np.object), - ((np.dtype("bool"), np.uint32), np.object), - ((np.dtype("bool"), np.uint16), np.object), - ((np.dtype("bool"), np.uint8), np.object), + ((np.dtype("bool"), np.int64), object), + ((np.dtype("bool"), np.int32), object), + ((np.dtype("bool"), np.int16), object), + ((np.dtype("bool"), np.int8), object), + ((np.dtype("bool"), np.uint64), object), + ((np.dtype("bool"), np.uint32), object), + ((np.dtype("bool"), np.uint16), object), + ((np.dtype("bool"), np.uint8), object), # Bool with float. - ((np.dtype("bool"), np.float64), np.object), - ((np.dtype("bool"), np.float32), np.object), + ((np.dtype("bool"), np.float64), object), + ((np.dtype("bool"), np.float32), object), ( (np.dtype("datetime64[ns]"), np.dtype("datetime64[ns]")), np.dtype("datetime64[ns]"), @@ -55,8 +55,8 @@ (np.dtype("timedelta64[ms]"), np.dtype("timedelta64[ns]")), np.dtype("timedelta64[ns]"), ), - ((np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")), np.object), - ((np.dtype("datetime64[ns]"), np.int64), np.object), + ((np.dtype("datetime64[ns]"), np.dtype("timedelta64[ns]")), object), + ((np.dtype("datetime64[ns]"), np.int64), object), ], ) def test_numpy_dtypes(source_dtypes, expected_common_dtype): @@ -72,7 +72,7 @@ def test_raises_empty_input(): "dtypes,exp_type", [ ([CategoricalDtype()], "category"), - ([np.object, CategoricalDtype()], np.object), + ([object, CategoricalDtype()], object), ([CategoricalDtype(), CategoricalDtype()], "category"), ], ) @@ -90,14 +90,14 @@ def test_datetimetz_dtype_match(): [ DatetimeTZDtype(unit="ns", tz="Asia/Tokyo"), np.dtype("datetime64[ns]"), - np.object, + object, np.int64, ], ) def test_datetimetz_dtype_mismatch(dtype2): dtype = DatetimeTZDtype(unit="ns", tz="US/Eastern") - assert find_common_type([dtype, dtype2]) == np.object - 
assert find_common_type([dtype2, dtype]) == np.object + assert find_common_type([dtype, dtype2]) == object + assert find_common_type([dtype2, dtype]) == object def test_period_dtype_match(): @@ -112,11 +112,11 @@ def test_period_dtype_match(): PeriodDtype(freq="2D"), PeriodDtype(freq="H"), np.dtype("datetime64[ns]"), - np.object, + object, np.int64, ], ) def test_period_dtype_mismatch(dtype2): dtype = PeriodDtype(freq="D") - assert find_common_type([dtype, dtype2]) == np.object - assert find_common_type([dtype2, dtype]) == np.object + assert find_common_type([dtype, dtype2]) == object + assert find_common_type([dtype2, dtype]) == object diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index 2744cfa8ddc62..70d38aad951cc 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -43,7 +43,9 @@ def test_infer_dtype_from_float_scalar(float_dtype): assert dtype == float_dtype -@pytest.mark.parametrize("data,exp_dtype", [(12, np.int64), (np.float(12), np.float64)]) +@pytest.mark.parametrize( + "data,exp_dtype", [(12, np.int64), (np.float_(12), np.float64)] +) def test_infer_dtype_from_python_scalar(data, exp_dtype): dtype, val = infer_dtype_from_scalar(data) assert dtype == exp_dtype @@ -184,8 +186,8 @@ def test_infer_dtype_from_array(arr, expected, pandas_dtype): (1, np.int64), (1.1, np.float64), (Timestamp("2011-01-01"), "datetime64[ns]"), - (Timestamp("2011-01-01", tz="US/Eastern"), np.object), - (Period("2011-01-01", freq="D"), np.object), + (Timestamp("2011-01-01", tz="US/Eastern"), object), + (Period("2011-01-01", freq="D"), object), ], ) def test_cast_scalar_to_array(obj, dtype): diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 1708139a397ab..ce12718e48d0d 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -112,7 +112,7 @@ def test_period_dtype(self, dtype): 
period=PeriodDtype("D"), integer=np.dtype(np.int64), float=np.dtype(np.float64), - object=np.dtype(np.object), + object=np.dtype(object), category=com.pandas_dtype("category"), ) @@ -547,7 +547,7 @@ def test_is_bool_dtype(): assert not com.is_bool_dtype(pd.Index(["a", "b"])) assert com.is_bool_dtype(bool) - assert com.is_bool_dtype(np.bool) + assert com.is_bool_dtype(np.bool_) assert com.is_bool_dtype(np.array([True, False])) assert com.is_bool_dtype(pd.Index([True, False])) @@ -615,7 +615,8 @@ def test_is_complex_dtype(): assert not com.is_complex_dtype(pd.Series([1, 2])) assert not com.is_complex_dtype(np.array(["a", "b"])) - assert com.is_complex_dtype(np.complex) + assert com.is_complex_dtype(np.complex_) + assert com.is_complex_dtype(complex) assert com.is_complex_dtype(np.array([1 + 1j, 5])) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 3b9d3dc0b91f6..b1fe673e9e2f1 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -951,7 +951,7 @@ def test_registry_find(dtype, expected): (str, False), (int, False), (bool, True), - (np.bool, True), + (np.bool_, True), (np.array(["a", "b"]), False), (pd.Series([1, 2]), False), (np.array([True, False]), True), diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index e97716f7a5e9c..e40a12f7bc8d1 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1246,7 +1246,6 @@ def test_is_number(self): assert is_number(1) assert is_number(1.1) assert is_number(1 + 3j) - assert is_number(np.bool(False)) assert is_number(np.int64(1)) assert is_number(np.float64(1.1)) assert is_number(np.complex128(1 + 3j)) @@ -1267,7 +1266,7 @@ def test_is_number(self): def test_is_bool(self): assert is_bool(True) - assert is_bool(np.bool(False)) + assert is_bool(False) assert is_bool(np.bool_(False)) assert not is_bool(1) @@ -1294,7 +1293,7 @@ def test_is_integer(self): assert not 
is_integer(True) assert not is_integer(1.1) assert not is_integer(1 + 3j) - assert not is_integer(np.bool(False)) + assert not is_integer(False) assert not is_integer(np.bool_(False)) assert not is_integer(np.float64(1.1)) assert not is_integer(np.complex128(1 + 3j)) @@ -1317,7 +1316,7 @@ def test_is_float(self): assert not is_float(True) assert not is_float(1) assert not is_float(1 + 3j) - assert not is_float(np.bool(False)) + assert not is_float(False) assert not is_float(np.bool_(False)) assert not is_float(np.int64(1)) assert not is_float(np.complex128(1 + 3j)) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index eed9a584cc030..bfa53ad02525b 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -319,13 +319,13 @@ def test_setitem_dataframe_column_without_index(self, data): def test_setitem_series_with_index(self, data): # https://github.com/pandas-dev/pandas/issues/32395 ser = expected = pd.Series(data, name="data") - result = pd.Series(index=ser.index, dtype=np.object, name="data") + result = pd.Series(index=ser.index, dtype=object, name="data") result.loc[ser.index] = ser self.assert_series_equal(result, expected) def test_setitem_series_without_index(self, data): # https://github.com/pandas-dev/pandas/issues/32395 ser = expected = pd.Series(data, name="data") - result = pd.Series(index=ser.index, dtype=np.object, name="data") + result = pd.Series(index=ser.index, dtype=object, name="data") result.loc[:] = ser self.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_duplicated.py b/pandas/tests/frame/methods/test_duplicated.py index 82fd6d88b82b9..7a1c16adc2a09 100644 --- a/pandas/tests/frame/methods/test_duplicated.py +++ b/pandas/tests/frame/methods/test_duplicated.py @@ -30,7 +30,7 @@ def test_duplicated_do_not_fail_on_wide_dataframes(): # calculation. 
Actual values doesn't matter here, though usually it's all # False in this case assert isinstance(result, Series) - assert result.dtype == np.bool + assert result.dtype == np.bool_ @pytest.mark.parametrize( diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index 6307738021f68..79ea70a38f145 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -164,7 +164,7 @@ def test_isin_multiIndex(self): tm.assert_frame_equal(result, expected) df2.index = idx - expected = df2.values.astype(np.bool) + expected = df2.values.astype(bool) expected[:, 1] = ~expected[:, 1] expected = DataFrame(expected, columns=["A", "B"], index=idx) diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index 34b323e55d8cd..d9c999c9119f4 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -163,7 +163,7 @@ def test_to_records_with_categorical(self): ), # Pass in a type instance. 
( - dict(column_dtypes=np.unicode), + dict(column_dtypes=str), np.rec.array( [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], dtype=[("index", "", "<"]) def test_bool_uint(self, byteorder, version): - s0 = Series([0, 1, True], dtype=np.bool) + s0 = Series([0, 1, True], dtype=np.bool_) s1 = Series([0, 1, 100], dtype=np.uint8) s2 = Series([0, 1, 255], dtype=np.uint8) s3 = Series([0, 1, 2 ** 15 - 100], dtype=np.uint16) @@ -855,7 +855,7 @@ def test_big_dates(self): expected[5][2] = expected[5][3] = expected[5][4] = datetime(1677, 10, 1) expected[5][5] = expected[5][6] = datetime(1678, 1, 1) - expected = DataFrame(expected, columns=columns, dtype=np.object) + expected = DataFrame(expected, columns=columns, dtype=object) parsed_115 = read_stata(self.dta18_115) parsed_117 = read_stata(self.dta18_117) tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 5a30e9fbb91c6..0d3425d001229 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -205,7 +205,7 @@ def test_hist_df_legacy(self): def test_hist_non_numerical_raises(self): # gh-10444 df = DataFrame(np.random.rand(10, 2)) - df_o = df.astype(np.object) + df_o = df.astype(object) msg = "hist method requires numerical columns, nothing to plot." 
with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 5341878d4986e..6da892c15f489 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -617,7 +617,7 @@ def test_kde_kwargs(self): sample_points = np.linspace(-100, 100, 20) _check_plot_works(self.ts.plot.kde, bw_method="scott", ind=20) _check_plot_works(self.ts.plot.kde, bw_method=None, ind=20) - _check_plot_works(self.ts.plot.kde, bw_method=None, ind=np.int(20)) + _check_plot_works(self.ts.plot.kde, bw_method=None, ind=np.int_(20)) _check_plot_works(self.ts.plot.kde, bw_method=0.5, ind=sample_points) _check_plot_works(self.ts.plot.density, bw_method=0.5, ind=sample_points) _, ax = self.plt.subplots() diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 485535bec20d0..28d33ebb23c20 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -180,7 +180,7 @@ def test_resample_size_empty_dataframe(freq, empty_frame_dti): @pytest.mark.parametrize("index", tm.all_timeseries_index_generator(0)) -@pytest.mark.parametrize("dtype", [np.float, np.int, np.object, "datetime64[ns]"]) +@pytest.mark.parametrize("dtype", [float, int, object, "datetime64[ns]"]) def test_resample_empty_dtypes(index, dtype, resample_method): # Empty series were sometimes causing a segfault (for the functions # with Cython bounds-checking disabled) or an IndexError. 
We just run diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 19fd8db5322ed..1c9d00a4b4c90 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2759,8 +2759,8 @@ def test_concat_sparse(): def test_concat_dense_sparse(): # GH 30668 - a = pd.Series(pd.arrays.SparseArray([1, None]), dtype=np.float) - b = pd.Series([1], dtype=np.float) + a = pd.Series(pd.arrays.SparseArray([1, None]), dtype=float) + b = pd.Series([1], dtype=float) expected = pd.Series(data=[1, None, 1], index=[0, 1, 0]).astype( pd.SparseDtype(np.float64, None) ) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 8daea84492871..3f85abb4b2817 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -278,7 +278,7 @@ def test_where_setitem_invalid(): "mask", [[True, False, False, False, False], [True, False], [False]] ) @pytest.mark.parametrize( - "item", [2.0, np.nan, np.finfo(np.float).max, np.finfo(np.float).min] + "item", [2.0, np.nan, np.finfo(float).max, np.finfo(float).min] ) # Test numpy arrays, lists and tuples as the input to be # broadcast diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index e6f86dda05893..d51dceae53a1c 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -180,7 +180,7 @@ def test_apply_categorical(self): result = ser.apply(lambda x: "A") exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - assert result.dtype == np.object + assert result.dtype == object @pytest.mark.parametrize("series", [["1-1", "1-1", np.NaN], ["1-1", "1-2", np.NaN]]) def test_apply_categorical_with_nan_values(self, series): @@ -717,7 +717,7 @@ def test_map_categorical(self): result = s.map(lambda x: "A") exp = pd.Series(["A"] * 7, name="XX", index=list("abcdefg")) tm.assert_series_equal(result, exp) - 
assert result.dtype == np.object + assert result.dtype == object with pytest.raises(NotImplementedError): s.map(lambda x: x, na_action="ignore") diff --git a/pandas/tests/series/test_combine_concat.py b/pandas/tests/series/test_combine_concat.py index 0766bfc37d7ca..95eba6ccc4df8 100644 --- a/pandas/tests/series/test_combine_concat.py +++ b/pandas/tests/series/test_combine_concat.py @@ -68,9 +68,9 @@ def get_result_type(dtype, dtype2): (np.bool_, np.int32, np.int32), (np.bool_, np.float32, np.object_), # datetime-like - ("m8[ns]", np.bool, np.object_), + ("m8[ns]", np.bool_, np.object_), ("m8[ns]", np.int64, np.object_), - ("M8[ns]", np.bool, np.object_), + ("M8[ns]", np.bool_, np.object_), ("M8[ns]", np.int64, np.object_), # categorical ("category", "category", "category"), diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ff5f890cc41f8..44a8452964f5a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -713,7 +713,7 @@ def test_first_nan_kept(self): NAN2 = struct.unpack("d", struct.pack("=Q", bits_for_nan2))[0] assert NAN1 != NAN1 assert NAN2 != NAN2 - for el_type in [np.float64, np.object]: + for el_type in [np.float64, object]: a = np.array([NAN1, NAN2], dtype=el_type) result = pd.unique(a) assert result.size == 1 @@ -725,7 +725,7 @@ def test_do_not_mangle_na_values(self, unique_nulls_fixture, unique_nulls_fixtur # GH 22295 if unique_nulls_fixture is unique_nulls_fixture2: return # skip it, values not unique - a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=np.object) + a = np.array([unique_nulls_fixture, unique_nulls_fixture2], dtype=object) result = pd.unique(a) assert result.size == 2 assert a[0] is unique_nulls_fixture @@ -886,7 +886,7 @@ def test_different_nans(self): # as object-array: result = algos.isin( - np.asarray(comps, dtype=np.object), np.asarray(values, dtype=np.object) + np.asarray(comps, dtype=object), np.asarray(values, dtype=object) ) 
tm.assert_numpy_array_equal(np.array([True]), result) @@ -916,8 +916,8 @@ def test_empty(self, empty): def test_different_nan_objects(self): # GH 22119 - comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=np.object) - vals = np.array([float("nan")], dtype=np.object) + comps = np.array(["nan", np.nan * 1j, float("nan")], dtype=object) + vals = np.array([float("nan")], dtype=object) expected = np.array([False, False, True]) result = algos.isin(comps, vals) tm.assert_numpy_array_equal(expected, result) @@ -1157,7 +1157,7 @@ def test_dropna(self): def test_value_counts_normalized(self): # GH12558 s = Series([1, 2, np.nan, np.nan, np.nan]) - dtypes = (np.float64, np.object, "M8[ns]") + dtypes = (np.float64, object, "M8[ns]") for t in dtypes: s_typed = s.astype(t) result = s_typed.value_counts(normalize=True, dropna=False) @@ -2290,10 +2290,10 @@ def test_mode_single(self): exp = Series(exp_multi, dtype=dt) tm.assert_series_equal(algos.mode(s), exp) - exp = Series([1], dtype=np.int) + exp = Series([1], dtype=int) tm.assert_series_equal(algos.mode([1]), exp) - exp = Series(["a", "b", "c"], dtype=np.object) + exp = Series(["a", "b", "c"], dtype=object) tm.assert_series_equal(algos.mode(["a", "b", "c"]), exp) def test_number_mode(self): diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index 2a42eb5d73136..9f0632917037c 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -31,7 +31,7 @@ def writeable(request): (np.int16, False), (np.int8, False), (np.object_, True), - (np.bool, False), + (np.bool_, False), ] ) def dtype_can_hold_na(request): diff --git a/pandas/tests/tslibs/test_fields.py b/pandas/tests/tslibs/test_fields.py index 943f4207df543..a45fcab56759f 100644 --- a/pandas/tests/tslibs/test_fields.py +++ b/pandas/tests/tslibs/test_fields.py @@ -12,9 +12,7 @@ def test_fields_readonly(): dtindex.flags.writeable = False result = fields.get_date_name_field(dtindex, "month_name") - expected = np.array( - ["January", "February", 
"March", "April", "May"], dtype=np.object - ) + expected = np.array(["January", "February", "March", "April", "May"], dtype=object) tm.assert_numpy_array_equal(result, expected) result = fields.get_date_field(dtindex, "Y") diff --git a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index 3e5475e6b274f..f6e2834965da3 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -515,7 +515,7 @@ def test_cmov_window_regular(win_types): @td.skip_if_no_scipy def test_cmov_window_regular_linear_range(win_types): # GH 8238 - vals = np.array(range(10), dtype=np.float) + vals = np.array(range(10), dtype=float) xp = vals.copy() xp[:2] = np.nan xp[-2:] = np.nan @@ -718,7 +718,7 @@ def test_cmov_window_special_linear_range(win_types_special): "exponential": {"tau": 10}, } - vals = np.array(range(10), dtype=np.float) + vals = np.array(range(10), dtype=float) xp = vals.copy() xp[:2] = np.nan xp[-2:] = np.nan From 2d8937e002fb6db7fee5b01c217010903666a966 Mon Sep 17 00:00:00 2001 From: Linda Chen Date: Wed, 17 Jun 2020 17:11:57 -0700 Subject: [PATCH 0143/1025] DOC: Clarify where to the additional arguments for some win_types (#34615) * DOC: Clarify some syntax when using win_types DOC: Clarify where to the additional arguments for some win_types For example, std needs to specify when win_types is gaussian. However, std should be specified in the operation argument, not as one of the rolling arguments. This change is to clarify this point. Closes: #34593 * DOC: Clarify where to the additional arguments for some win_types Edit: 711add5 First Commit Original issue: #34615 * DOC: Clarify where to the additional arguments for some win_types Edit: a3e38ac Second Commit What's new: I shortened line 935 so that each line is less than 88 characters. 
Original Issue: #34615 * Revert "DOC: Clarify where to the additional arguments for some win_types" This reverts commit 45be5386b656650443404fae1842f169a65a8919. * Revert "Revert "DOC: Clarify where to the additional arguments for some win_types"" This reverts commit 05ed0eb01dcc0a4ec4743f30ba411c9fa6ee402f. * DOC: Remove whitespace in docstring * DOC: Remove indent of line934 * Update pandas/core/window/rolling.py Co-authored-by: Tom Augspurger --- pandas/core/window/rolling.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ce0a2a9b95025..9cd750265133e 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -922,16 +922,19 @@ class Window(_Window): * ``blackmanharris`` * ``nuttall`` * ``barthann`` - * ``kaiser`` (needs beta) - * ``gaussian`` (needs std) - * ``general_gaussian`` (needs power, width) - * ``slepian`` (needs width) - * ``exponential`` (needs tau), center is set to None. + * ``kaiser`` (needs parameter: beta) + * ``gaussian`` (needs parameter: std) + * ``general_gaussian`` (needs parameters: power, width) + * ``slepian`` (needs parameter: width) + * ``exponential`` (needs parameter: tau), center is set to None. If ``win_type=None`` all points are evenly weighted. To learn more about different window types see `scipy.signal window functions `__. + Certain window types require additional parameters to be passed. Please see + the third example below on how to add the additional parameters. 
+ Examples -------- >>> df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) From c749052ff1f9fa75e47807adb9bc5b486ab69d24 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 18 Jun 2020 08:47:44 +0200 Subject: [PATCH 0144/1025] BUG: fix construction from read-only non-ns datetime64 numpy array (#34844) --- doc/source/whatsnew/v1.1.0.rst | 3 +++ pandas/_libs/tslibs/conversion.pyx | 3 ++- pandas/tests/base/test_constructors.py | 24 ++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 10522ff797c59..6a6c7ebd49db1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -843,6 +843,9 @@ Datetimelike - Bug in :meth:`DatetimeIndex.intersection` and :meth:`TimedeltaIndex.intersection` with results not having the correct ``name`` attribute (:issue:`33904`) - Bug in :meth:`DatetimeArray.__setitem__`, :meth:`TimedeltaArray.__setitem__`, :meth:`PeriodArray.__setitem__` incorrectly allowing values with ``int64`` dtype to be silently cast (:issue:`33717`) - Bug in subtracting :class:`TimedeltaIndex` from :class:`Period` incorrectly raising ``TypeError`` in some cases where it should succeed and ``IncompatibleFrequency`` in some cases where it should raise ``TypeError`` (:issue:`33883`) +- Bug in constructing a Series or Index from a read-only NumPy array with non-ns + resolution which converted to object dtype instead of coercing to ``datetime64[ns]`` + dtype when within the timestamp bounds (:issue:`34843`). 
- The ``freq`` keyword in :class:`Period`, :func:`date_range`, :func:`period_range`, :func:`pd.tseries.frequencies.to_offset` no longer allows tuples, pass as string instead (:issue:`34703`) Timedelta diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 40b2d44235d8b..0811ba22977fd 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -167,7 +167,8 @@ def ensure_datetime64ns(arr: ndarray, copy: bool=True): """ cdef: Py_ssize_t i, n = arr.size - int64_t[:] ivalues, iresult + const int64_t[:] ivalues + int64_t[:] iresult NPY_DATETIMEUNIT unit npy_datetimestruct dts diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index e27b5c307cd99..697364fc87175 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -13,6 +13,19 @@ from pandas.core.base import NoNewAttributesMixin, PandasObject +@pytest.fixture( + params=[ + Series, + lambda x, **kwargs: DataFrame({"a": x}, **kwargs)["a"], + lambda x, **kwargs: DataFrame(x, **kwargs)[0], + Index, + ], + ids=["Series", "DataFrame-dict", "DataFrame-array", "Index"], +) +def constructor(request): + return request.param + + class TestPandasDelegate: class Delegator: _properties = ["foo"] @@ -145,3 +158,14 @@ def test_constructor_datetime_outofbound(self, a, klass): msg = "Out of bounds" with pytest.raises(pd.errors.OutOfBoundsDatetime, match=msg): klass(a, dtype="datetime64[ns]") + + def test_constructor_datetime_nonns(self, constructor): + arr = np.array(["2020-01-01T00:00:00.000000"], dtype="datetime64[us]") + expected = constructor(pd.to_datetime(["2020-01-01"])) + result = constructor(arr) + tm.assert_equal(result, expected) + + # https://github.com/pandas-dev/pandas/issues/34843 + arr.flags.writeable = False + result = constructor(arr) + tm.assert_equal(result, expected) From 307e3a6e5a079811bd00b806ca0b806979a2d471 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Quang=20Nguy=E1=BB=85n?= <30631476+quangngd@users.noreply.github.com> Date: Thu, 18 Jun 2020 14:22:44 +0700 Subject: [PATCH 0145/1025] DOC: Fix syntax in 1.1.0 whatsnew (#34856) --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6a6c7ebd49db1..f216418c3a8b0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -292,7 +292,7 @@ Other enhancements - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). -- :meth `~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). +- :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). .. 
--------------------------------------------------------------------------- From 3aaf35b1f5027a6a21bc48b830de91bfac24cd88 Mon Sep 17 00:00:00 2001 From: Matteo Felici Date: Thu, 18 Jun 2020 15:59:00 +0200 Subject: [PATCH 0146/1025] CLN: GH29547 change string formatting with f-strings (6 files changed) (#34831) --- pandas/tests/io/parser/test_header.py | 3 +-- pandas/tests/io/test_sql.py | 9 +++++---- pandas/tests/reductions/test_reductions.py | 5 ++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/tests/io/parser/test_header.py b/pandas/tests/io/parser/test_header.py index 7dc106ef0c186..4cd110136d7b0 100644 --- a/pandas/tests/io/parser/test_header.py +++ b/pandas/tests/io/parser/test_header.py @@ -528,12 +528,11 @@ def test_multi_index_unnamed(all_parsers, index_col, columns): parser.read_csv(StringIO(data), header=header, index_col=index_col) else: result = parser.read_csv(StringIO(data), header=header, index_col=index_col) - template = "Unnamed: {i}_level_0" exp_columns = [] for i, col in enumerate(columns): if not col: # Unnamed. 
- col = template.format(i=i if index_col is None else i + 1) + col = f"Unnamed: {i if index_col is None else i + 1}_level_0" exp_columns.append(col) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index fa04eabb71627..70713768c8d1e 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1900,9 +1900,9 @@ class _TestMySQLAlchemy: @classmethod def connect(cls): - url = "mysql+{driver}://root@localhost/pandas_nosetest" return sqlalchemy.create_engine( - url.format(driver=cls.driver), connect_args=cls.connect_args + f"mysql+{cls.driver}://root@localhost/pandas_nosetest", + connect_args=cls.connect_args, ) @classmethod @@ -1969,8 +1969,9 @@ class _TestPostgreSQLAlchemy: @classmethod def connect(cls): - url = "postgresql+{driver}://postgres@localhost/pandas_nosetest" - return sqlalchemy.create_engine(url.format(driver=cls.driver)) + return sqlalchemy.create_engine( + f"postgresql+{cls.driver}://postgres@localhost/pandas_nosetest" + ) @classmethod def setup_driver(cls): diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index f6e0d2f0c1751..a112bc80b60b0 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -349,11 +349,10 @@ def test_invalid_td64_reductions(self, opname): msg = "|".join( [ - "reduction operation '{op}' not allowed for this dtype", - r"cannot perform {op} with type timedelta64\[ns\]", + f"reduction operation '{opname}' not allowed for this dtype", + rf"cannot perform {opname} with type timedelta64\[ns\]", ] ) - msg = msg.format(op=opname) with pytest.raises(TypeError, match=msg): getattr(td, opname)() From 8ca5ceb5bbe5537145b7d12181093aa62cda54bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quang=20Nguy=E1=BB=85n?= <30631476+quangngd@users.noreply.github.com> Date: Thu, 18 Jun 2020 22:21:37 +0700 Subject: [PATCH 0147/1025] EHN: to_{html, string} col_space col specific (#32903) --- doc/source/whatsnew/v1.1.0.rst 
| 1 + pandas/core/frame.py | 4 +-- pandas/io/formats/format.py | 47 ++++++++++++++++++------- pandas/io/formats/html.py | 13 ++++--- pandas/tests/io/formats/test_format.py | 27 ++++++++++++++ pandas/tests/io/formats/test_to_html.py | 34 ++++++++++++++++++ 6 files changed, 108 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f216418c3a8b0..bd47bef397aa7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -294,6 +294,7 @@ Other enhancements - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). - :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). +- :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list of dict to change only some specific columns' width (:issue:`28917`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 68c06715e1ea4..2c80f57e4ef5d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -776,7 +776,7 @@ def _repr_html_(self) -> Optional[str]: header="Write out the column names. If a list of strings " "is given, it is assumed to be aliases for the " "column names", - col_space_type="int", + col_space_type="int, list or dict of int", col_space="The minimum width of each column", ) @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) @@ -2328,7 +2328,7 @@ def to_parquet( @Substitution( header_type="bool", header="Whether to print column labels, default True", - col_space_type="str or int", + col_space_type="str or int, list or dict of int or str", col_space="The minimum width of each column in CSS length " "units. An int is assumed to be px units.\n\n" " .. 
versionadded:: 0.25.0\n" diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b94b56da7b279..9ea2f6510b253 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -38,7 +38,7 @@ from pandas._libs.tslib import format_array_from_datetime from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.nattype import NaTType -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, Label from pandas.errors import AbstractMethodError from pandas.core.dtypes.common import ( @@ -77,6 +77,10 @@ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] ] FloatFormatType = Union[str, Callable, "EngFormatter"] +ColspaceType = Mapping[Label, Union[str, int]] +ColspaceArgType = Union[ + str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]], +] common_docstring = """ Parameters @@ -530,11 +534,13 @@ class DataFrameFormatter(TableFormatter): __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring + col_space: ColspaceType + def __init__( self, frame: "DataFrame", columns: Optional[Sequence[str]] = None, - col_space: Optional[Union[str, int]] = None, + col_space: Optional[ColspaceArgType] = None, header: Union[bool, Sequence[str]] = True, index: bool = True, na_rep: str = "NaN", @@ -574,7 +580,27 @@ def __init__( ) self.na_rep = na_rep self.decimal = decimal - self.col_space = col_space + if col_space is None: + self.col_space = {} + elif isinstance(col_space, (int, str)): + self.col_space = {"": col_space} + self.col_space.update({column: col_space for column in self.frame.columns}) + elif isinstance(col_space, dict): + for column in col_space.keys(): + if column not in self.frame.columns and column != "": + raise ValueError( + f"Col_space is defined for an unknown column: {column}" + ) + self.col_space = col_space + else: + col_space = cast(Sequence, col_space) + if len(frame.columns) != len(col_space): + raise 
ValueError( + f"Col_space length({len(col_space)}) should match " + f"DataFrame number of columns({len(frame.columns)})" + ) + self.col_space = dict(zip(self.frame.columns, col_space)) + self.header = header self.index = index self.line_width = line_width @@ -702,7 +728,7 @@ def _to_str_columns(self) -> List[List[str]]: """ # this method is not used by to_html where self.col_space # could be a string so safe to cast - self.col_space = cast(int, self.col_space) + col_space = {k: cast(int, v) for k, v in self.col_space.items()} frame = self.tr_frame # may include levels names also @@ -714,10 +740,7 @@ def _to_str_columns(self) -> List[List[str]]: for i, c in enumerate(frame): fmt_values = self._format_col(i) fmt_values = _make_fixed_width( - fmt_values, - self.justify, - minimum=(self.col_space or 0), - adj=self.adj, + fmt_values, self.justify, minimum=col_space.get(c, 0), adj=self.adj, ) stringified.append(fmt_values) else: @@ -741,7 +764,7 @@ def _to_str_columns(self) -> List[List[str]]: for i, c in enumerate(frame): cheader = str_columns[i] header_colwidth = max( - self.col_space or 0, *(self.adj.len(x) for x in cheader) + col_space.get(c, 0), *(self.adj.len(x) for x in cheader) ) fmt_values = self._format_col(i) fmt_values = _make_fixed_width( @@ -932,7 +955,7 @@ def _format_col(self, i: int) -> List[str]: formatter, float_format=self.float_format, na_rep=self.na_rep, - space=self.col_space, + space=self.col_space.get(frame.columns[i]), decimal=self.decimal, ) @@ -1025,7 +1048,7 @@ def show_col_idx_names(self) -> bool: def _get_formatted_index(self, frame: "DataFrame") -> List[str]: # Note: this is only used by to_string() and to_latex(), not by # to_html(). so safe to cast col_space here. 
- self.col_space = cast(int, self.col_space) + col_space = {k: cast(int, v) for k, v in self.col_space.items()} index = frame.index columns = frame.columns fmt = self._get_formatter("__index__") @@ -1043,7 +1066,7 @@ def _get_formatted_index(self, frame: "DataFrame") -> List[str]: fmt_index = [ tuple( _make_fixed_width( - list(x), justify="left", minimum=(self.col_space or 0), adj=self.adj + list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj, ) ) for x in fmt_index diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index e31d977512f1e..7ea2417ceb24b 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -53,8 +53,11 @@ def __init__( self.border = border self.table_id = self.fmt.table_id self.render_links = self.fmt.render_links - if isinstance(self.fmt.col_space, int): - self.fmt.col_space = f"{self.fmt.col_space}px" + + self.col_space = { + column: f"{value}px" if isinstance(value, int) else value + for column, value in self.fmt.col_space.items() + } @property def show_row_idx_names(self) -> bool: @@ -120,9 +123,11 @@ def write_th( ------- A written cell. 
""" - if header and self.fmt.col_space is not None: + col_space = self.col_space.get(s, None) + + if header and col_space is not None: tags = tags or "" - tags += f'style="min-width: {self.fmt.col_space};"' + tags += f'style="min-width: {col_space};"' self._write_cell(s, kind="th", indent=indent, tags=tags) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 1bd9ab594408f..0a79f2321c432 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1047,6 +1047,33 @@ def test_to_string_with_col_space(self): no_header = df.to_string(col_space=20, header=False) assert len(with_header_row1) == len(no_header) + def test_to_string_with_column_specific_col_space_raises(self): + df = DataFrame(np.random.random(size=(3, 3)), columns=["a", "b", "c"]) + + msg = ( + "Col_space length\\(\\d+\\) should match " + "DataFrame number of columns\\(\\d+\\)" + ) + with pytest.raises(ValueError, match=msg): + df.to_string(col_space=[30, 40]) + + with pytest.raises(ValueError, match=msg): + df.to_string(col_space=[30, 40, 50, 60]) + + msg = "unknown column" + with pytest.raises(ValueError, match=msg): + df.to_string(col_space={"a": "foo", "b": 23, "d": 34}) + + def test_to_string_with_column_specific_col_space(self): + df = DataFrame(np.random.random(size=(3, 3)), columns=["a", "b", "c"]) + + result = df.to_string(col_space={"a": 10, "b": 11, "c": 12}) + # 3 separating space + each col_space for (id, a, b, c) + assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) + + result = df.to_string(col_space=[10, 11, 12]) + assert len(result.split("\n")[1]) == (3 + 1 + 10 + 11 + 12) + def test_to_string_truncate_indices(self): for index in [ tm.makeStringIndex, diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index 9a14022d6f776..e85fd398964d0 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -78,6 +78,40 @@ 
def test_to_html_with_col_space(col_space): assert str(col_space) in h +def test_to_html_with_column_specific_col_space_raises(): + df = DataFrame(np.random.random(size=(3, 3)), columns=["a", "b", "c"]) + + msg = ( + "Col_space length\\(\\d+\\) should match " + "DataFrame number of columns\\(\\d+\\)" + ) + with pytest.raises(ValueError, match=msg): + df.to_html(col_space=[30, 40]) + + with pytest.raises(ValueError, match=msg): + df.to_html(col_space=[30, 40, 50, 60]) + + msg = "unknown column" + with pytest.raises(ValueError, match=msg): + df.to_html(col_space={"a": "foo", "b": 23, "d": 34}) + + +def test_to_html_with_column_specific_col_space(): + df = DataFrame(np.random.random(size=(3, 3)), columns=["a", "b", "c"]) + + result = df.to_html(col_space={"a": "2em", "b": 23}) + hdrs = [x for x in result.split("\n") if re.search(r"\s]", x)] + assert 'min-width: 2em;">a' in hdrs[1] + assert 'min-width: 23px;">b' in hdrs[2] + assert "c" in hdrs[3] + + result = df.to_html(col_space=["1em", 2, 3]) + hdrs = [x for x in result.split("\n") if re.search(r"\s]", x)] + assert 'min-width: 1em;">a' in hdrs[1] + assert 'min-width: 2px;">b' in hdrs[2] + assert 'min-width: 3px;">c' in hdrs[3] + + def test_to_html_with_empty_string_label(): # GH 3547, to_html regards empty string labels as repeated labels data = {"c1": ["a", "b"], "c2": ["a", ""], "data": [1, 2]} From 0702055003d758ddec506007cba1cb335865cb50 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 18 Jun 2020 09:33:52 -0700 Subject: [PATCH 0148/1025] DEPR: to_perioddelta (#34853) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/arrays/datetimes.py | 9 ++++++++- pandas/tests/arrays/test_datetimelike.py | 9 +++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bd47bef397aa7..14d1e1b49a726 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -777,6 +777,7 @@ Deprecations instead (:issue:`34191`). 
- The ``squeeze`` keyword in the ``groupby`` function is deprecated and will be removed in a future version (:issue:`32380`) - The ``tz`` keyword in :meth:`Period.to_timestamp` is deprecated and will be removed in a future version; use `per.to_timestamp(...).tz_localize(tz)`` instead (:issue:`34522`) +- :meth:`DatetimeIndex.to_perioddelta` is deprecated and will be removed in a future version. Use ``index - index.to_period(freq).to_timestamp()`` instead (:issue:`34853`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b6c27abc321e1..461f71ff821fa 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1125,7 +1125,14 @@ def to_perioddelta(self, freq): ------- TimedeltaArray/Index """ - # TODO: consider privatizing (discussion in GH#23113) + # Deprecaation GH#34853 + warnings.warn( + "to_perioddelta is deprecated and will be removed in a " + "future version. 
" + "Use `dtindex - dtindex.to_period(freq).to_timestamp()` instead", + FutureWarning, + stacklevel=3, + ) from pandas.core.arrays.timedeltas import TimedeltaArray i8delta = self.asi8 - self.to_period(freq).to_timestamp().asi8 diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 1a61b379de943..b1ab700427c28 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -511,8 +511,13 @@ def test_to_perioddelta(self, datetime_index, freqstr): dti = datetime_index arr = DatetimeArray(dti) - expected = dti.to_perioddelta(freq=freqstr) - result = arr.to_perioddelta(freq=freqstr) + with tm.assert_produces_warning(FutureWarning): + # Deprecation GH#34853 + expected = dti.to_perioddelta(freq=freqstr) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + # stacklevel is chosen to be "correct" for DatetimeIndex, not + # DatetimeArray + result = arr.to_perioddelta(freq=freqstr) assert isinstance(result, TimedeltaArray) # placeholder until these become actual EA subclasses and we can use From e8c1bea9ea4233c139c8e2071d84da6d14e29c17 Mon Sep 17 00:00:00 2001 From: rhshadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 18 Jun 2020 18:59:07 -0400 Subject: [PATCH 0149/1025] CLN: Unify signatures in _libs.groupby (#34372) --- pandas/_libs/groupby.pyx | 11 +++--- pandas/core/groupby/generic.py | 3 +- pandas/core/groupby/groupby.py | 64 ++++++++++++++++++++++++++-------- 3 files changed, 58 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 4e792da31e1d5..7c57e6ee9dbfd 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -378,8 +378,8 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels, @cython.boundscheck(False) @cython.wraparound(False) def group_any_all(uint8_t[:] out, - const int64_t[:] labels, const uint8_t[:] values, + const int64_t[:] labels, const uint8_t[:] 
mask, object val_test, bint skipna): @@ -560,7 +560,8 @@ def _group_var(floating[:, :] out, int64_t[:] counts, floating[:, :] values, const int64_t[:] labels, - Py_ssize_t min_count=-1): + Py_ssize_t min_count=-1, + int64_t ddof=1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, ct, oldmean @@ -600,10 +601,10 @@ def _group_var(floating[:, :] out, for i in range(ncounts): for j in range(K): ct = nobs[i, j] - if ct < 2: + if ct <= ddof: out[i, j] = NAN else: - out[i, j] /= (ct - 1) + out[i, j] /= (ct - ddof) group_var_float32 = _group_var['float'] @@ -715,8 +716,8 @@ group_ohlc_float64 = _group_ohlc['double'] @cython.boundscheck(False) @cython.wraparound(False) def group_quantile(ndarray[float64_t] out, - ndarray[int64_t] labels, numeric[:] values, + ndarray[int64_t] labels, ndarray[uint8_t] mask, float64_t q, object interpolation): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index db5df9818b0b0..cec3d9711a8ca 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1736,7 +1736,8 @@ def _wrap_aggregated_output( DataFrame """ indexed_output = {key.position: val for key, val in output.items()} - columns = Index(key.label for key in output) + name = self._obj_with_exclusions._get_axis(1 - self.axis).name + columns = Index([key.label for key in output], name=name) result = self.obj._constructor(indexed_output) result.columns = columns diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 48fdb14ebe90c..b92e75f16e965 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1277,6 +1277,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray: return self._get_cythonized_result( "group_any_all", aggregate=True, + numeric_only=False, cython_dtype=np.dtype(np.uint8), needs_values=True, needs_mask=True, @@ -1433,18 +1434,16 @@ def std(self, ddof: int = 1): Series or DataFrame Standard deviation of values within each 
group. """ - result = self.var(ddof=ddof) - if result.ndim == 1: - result = np.sqrt(result) - else: - cols = result.columns.get_indexer_for( - result.columns.difference(self.exclusions).unique() - ) - # TODO(GH-22046) - setting with iloc broken if labels are not unique - # .values to remove labels - result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values - - return result + return self._get_cythonized_result( + "group_var_float64", + aggregate=True, + needs_counts=True, + needs_values=True, + needs_2d=True, + cython_dtype=np.dtype(np.float64), + post_processing=lambda vals, inference: np.sqrt(vals), + ddof=ddof, + ) @Substitution(name="groupby") @Appender(_common_see_also) @@ -1778,6 +1777,7 @@ def _fill(self, direction, limit=None): return self._get_cythonized_result( "group_fillna_indexer", + numeric_only=False, needs_mask=True, cython_dtype=np.dtype(np.int64), result_is_index=True, @@ -2078,6 +2078,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: return self._get_cythonized_result( "group_quantile", aggregate=True, + numeric_only=False, needs_values=True, needs_mask=True, cython_dtype=np.dtype(np.float64), @@ -2367,7 +2368,11 @@ def _get_cythonized_result( how: str, cython_dtype: np.dtype, aggregate: bool = False, + numeric_only: bool = True, + needs_counts: bool = False, needs_values: bool = False, + needs_2d: bool = False, + min_count: Optional[int] = None, needs_mask: bool = False, needs_ngroups: bool = False, result_is_index: bool = False, @@ -2386,9 +2391,18 @@ def _get_cythonized_result( aggregate : bool, default False Whether the result should be aggregated to match the number of groups + numeric_only : bool, default True + Whether only numeric datatypes should be computed + needs_counts : bool, default False + Whether the counts should be a part of the Cython call needs_values : bool, default False Whether the values should be a part of the Cython call signature + needs_2d : bool, default False + Whether the values 
and result of the Cython call signature + are at least 2-dimensional. + min_count : int, default None + When not None, min_count for the Cython call needs_mask : bool, default False Whether boolean mask needs to be part of the Cython call signature @@ -2418,7 +2432,7 @@ def _get_cythonized_result( if result_is_index and aggregate: raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") if post_processing: - if not callable(pre_processing): + if not callable(post_processing): raise ValueError("'post_processing' must be a callable!") if pre_processing: if not callable(pre_processing): @@ -2438,21 +2452,39 @@ def _get_cythonized_result( name = obj.name values = obj._values + if numeric_only and not is_numeric_dtype(values): + continue + if aggregate: result_sz = ngroups else: result_sz = len(values) result = np.zeros(result_sz, dtype=cython_dtype) - func = partial(base_func, result, labels) + if needs_2d: + result = result.reshape((-1, 1)) + func = partial(base_func, result) + inferences = None + if needs_counts: + counts = np.zeros(self.ngroups, dtype=np.int64) + func = partial(func, counts) + if needs_values: vals = values if pre_processing: vals, inferences = pre_processing(vals) + if needs_2d: + vals = vals.reshape((-1, 1)) + vals = vals.astype(cython_dtype, copy=False) func = partial(func, vals) + func = partial(func, labels) + + if min_count is not None: + func = partial(func, min_count) + if needs_mask: mask = isna(values).view(np.uint8) func = partial(func, mask) @@ -2462,6 +2494,9 @@ def _get_cythonized_result( func(**kwargs) # Call func to modify indexer values in place + if needs_2d: + result = result.reshape(-1) + if result_is_index: result = algorithms.take_nd(values, result) @@ -2512,6 +2547,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): return self._get_cythonized_result( "group_shift_indexer", + numeric_only=False, cython_dtype=np.dtype(np.int64), needs_ngroups=True, result_is_index=True, From 
e0e9ccb6abdad932f6867f1356d092d2ee0d20ee Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Thu, 18 Jun 2020 23:08:40 +0000 Subject: [PATCH 0150/1025] base/test_unique.py: regression test for bad unicode string (#34851) closes #34550 --- pandas/tests/base/test_unique.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 8cf234012d02f..e5592cef59592 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -105,3 +105,19 @@ def test_nunique_null(null_obj, index_or_series_obj): num_unique_values = len(obj.unique()) assert obj.nunique() == max(0, num_unique_values - 1) assert obj.nunique(dropna=False) == max(0, num_unique_values) + + +@pytest.mark.parametrize( + "idx_or_series_w_bad_unicode", [pd.Index(["\ud83d"] * 2), pd.Series(["\ud83d"] * 2)] +) +def test_unique_bad_unicode(idx_or_series_w_bad_unicode): + # regression test for #34550 + obj = idx_or_series_w_bad_unicode + result = obj.unique() + + if isinstance(obj, pd.Index): + expected = pd.Index(["\ud83d"], dtype=object) + tm.assert_index_equal(result, expected) + else: + expected = np.array(["\ud83d"], dtype=object) + tm.assert_numpy_array_equal(result, expected) From 01d35b0f1252ef6b2c68cdf998544ded1f9ccfaa Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 19 Jun 2020 11:16:10 +0200 Subject: [PATCH 0151/1025] BUG: Fix concat of frames with extension types (no reindexed columns) (#34339) --- doc/source/whatsnew/v1.1.0.rst | 3 +++ pandas/core/arrays/integer.py | 8 ++++++-- pandas/core/dtypes/concat.py | 6 +----- pandas/core/internals/concat.py | 11 ++++++++++- pandas/tests/extension/base/reshaping.py | 4 ++-- pandas/tests/indexing/test_indexing.py | 16 ++++++++++++++-- pandas/tests/reshape/test_concat.py | 14 ++++++++++++++ 7 files changed, 50 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 14d1e1b49a726..a27e6e8433779 
100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -292,6 +292,9 @@ Other enhancements - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). +- :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example + combining a nullable integer column with a numpy integer column will no longer + result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`). - :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list of dict to change only some specific columns' width (:issue:`28917`). 
diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index ac06f7cce88d5..df43b5d6115ba 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -93,10 +93,14 @@ def construct_array_type(cls) -> Type["IntegerArray"]: def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: # for now only handle other integer types - if not all(isinstance(t, _IntegerDtype) for t in dtypes): + if not all( + isinstance(t, _IntegerDtype) + or (isinstance(t, np.dtype) and np.issubdtype(t, np.integer)) + for t in dtypes + ): return None np_dtype = np.find_common_type( - [t.numpy_dtype for t in dtypes], [] # type: ignore + [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], [] ) if np.issubdtype(np_dtype, np.integer): return _dtypes[str(np_dtype)] diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index fb47b33ce9890..71686bfc313fb 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -147,7 +147,7 @@ def is_nonempty(x) -> bool: single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) - if any_ea and axis == 0: + if any_ea: if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] @@ -161,10 +161,6 @@ def is_nonempty(x) -> bool: elif _contains_datetime or "timedelta" in typs: return concat_datetime(to_concat, axis=axis, typs=typs) - elif any_ea and axis == 1: - to_concat = [np.atleast_2d(x.astype("object")) for x in to_concat] - return np.concatenate(to_concat, axis=axis) - elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index e25c4c2341217..fd8c5f5e27c02 100644 --- a/pandas/core/internals/concat.py +++ 
b/pandas/core/internals/concat.py @@ -319,6 +319,15 @@ def _concatenate_join_units(join_units, concat_axis, copy): concat_values = concat_values.copy() else: concat_values = concat_values.copy() + elif any(isinstance(t, ExtensionArray) for t in to_concat): + # concatting with at least one EA means we are concatting a single column + # the non-EA values are 2D arrays with shape (1, n) + to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] + concat_values = concat_compat(to_concat, axis=concat_axis) + if not isinstance(concat_values, ExtensionArray): + # if the result of concat is not an EA but an ndarray, reshape to + # 2D to put it a non-EA Block + concat_values = np.atleast_2d(concat_values) else: concat_values = concat_compat(to_concat, axis=concat_axis) @@ -443,7 +452,7 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: # cannot necessarily join return ( # all blocks need to have the same type - all(isinstance(ju.block, type(join_units[0].block)) for ju in join_units) + all(type(ju.block) is type(join_units[0].block) for ju in join_units) and # noqa # no blocks that would get missing values (can lead to type upcasts) # unless we're an extension dtype. 
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index c9445ceec2c77..cd932e842e00c 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -62,11 +62,11 @@ def test_concat_mixed_dtypes(self, data): self.assert_series_equal(result, expected) # simple test for just EA and one other - result = pd.concat([df1, df2]) + result = pd.concat([df1, df2.astype(object)]) expected = pd.concat([df1.astype("object"), df2.astype("object")]) self.assert_frame_equal(result, expected) - result = pd.concat([df1["A"], df2["A"]]) + result = pd.concat([df1["A"], df2["A"].astype(object)]) expected = pd.concat([df1["A"].astype("object"), df2["A"].astype("object")]) self.assert_series_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index dd63a26f139e9..5c0230e75021c 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1006,12 +1006,24 @@ def test_extension_array_cross_section(): def test_extension_array_cross_section_converts(): + # all numeric columns -> numeric series df = pd.DataFrame( - {"A": pd.core.arrays.integer_array([1, 2]), "B": np.array([1, 2])}, + {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"], + ) + result = df.loc["a"] + expected = pd.Series([1, 1], dtype="Int64", index=["A", "B"], name="a") + tm.assert_series_equal(result, expected) + + result = df.iloc[0] + tm.assert_series_equal(result, expected) + + # mixed columns -> object series + df = pd.DataFrame( + {"A": pd.array([1, 2], dtype="Int64"), "B": np.array(["a", "b"])}, index=["a", "b"], ) result = df.loc["a"] - expected = pd.Series([1, 1], dtype=object, index=["A", "B"], name="a") + expected = pd.Series([1, "a"], dtype=object, index=["A", "B"], name="a") tm.assert_series_equal(result, expected) result = df.iloc[0] diff --git a/pandas/tests/reshape/test_concat.py 
b/pandas/tests/reshape/test_concat.py index 1c9d00a4b4c90..ffeb5ff0f8aaa 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2843,3 +2843,17 @@ def test_concat_preserves_subclass(obj): result = concat([obj, obj]) assert isinstance(result, type(obj)) + + +def test_concat_frame_axis0_extension_dtypes(): + # preserve extension dtype (through common_dtype mechanism) + df1 = pd.DataFrame({"a": pd.array([1, 2, 3], dtype="Int64")}) + df2 = pd.DataFrame({"a": np.array([4, 5, 6])}) + + result = pd.concat([df1, df2], ignore_index=True) + expected = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6]}, dtype="Int64") + tm.assert_frame_equal(result, expected) + + result = pd.concat([df2, df1], ignore_index=True) + expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64") + tm.assert_frame_equal(result, expected) From eb2aafb76353c5120b09d5cf2204ab8b61f2464e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 19 Jun 2020 13:01:21 +0200 Subject: [PATCH 0152/1025] DOC: reduce API docs for offset aliases (#34433) --- doc/source/reference/offset_frequency.rst | 299 ++++------------------ 1 file changed, 54 insertions(+), 245 deletions(-) diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index ee89df3114048..1b63253cde2c5 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -42,11 +42,20 @@ Methods BusinessDay ----------- + .. autosummary:: :toctree: api/ BusinessDay +Alias: + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + BDay + Properties ~~~~~~~~~~ .. autosummary:: @@ -117,11 +126,20 @@ Methods CustomBusinessDay ----------------- + .. autosummary:: :toctree: api/ CustomBusinessDay +Alias: + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + CDay + Properties ~~~~~~~~~~ .. 
autosummary:: @@ -260,11 +278,20 @@ Methods BusinessMonthEnd ---------------- + .. autosummary:: :toctree: api/ BusinessMonthEnd +Alias: + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + BMonthEnd + Properties ~~~~~~~~~~ .. autosummary:: @@ -294,11 +321,20 @@ Methods BusinessMonthBegin ------------------ + .. autosummary:: :toctree: api/ BusinessMonthBegin +Alias: + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + BMonthBegin + Properties ~~~~~~~~~~ .. autosummary:: @@ -328,11 +364,20 @@ Methods CustomBusinessMonthEnd ---------------------- + .. autosummary:: :toctree: api/ CustomBusinessMonthEnd +Alias: + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + CBMonthEnd + Properties ~~~~~~~~~~ .. autosummary:: @@ -365,11 +410,20 @@ Methods CustomBusinessMonthBegin ------------------------ + .. autosummary:: :toctree: api/ CustomBusinessMonthBegin +Alias: + +.. autosummary:: + :toctree: api/ + :template: autosummary/class_without_autosummary.rst + + CBMonthBegin + Properties ~~~~~~~~~~ .. autosummary:: @@ -1238,251 +1292,6 @@ Methods Nano.__call__ Nano.apply -BDay ----- -.. autosummary:: - :toctree: api/ - - BDay - -Properties -~~~~~~~~~~ -.. autosummary:: - :toctree: api/ - - BDay.base - BDay.freqstr - BDay.kwds - BDay.name - BDay.nanos - BDay.normalize - BDay.offset - BDay.rule_code - BDay.n - BDay.weekmask - BDay.holidays - BDay.calendar - -Methods -~~~~~~~ -.. autosummary:: - :toctree: api/ - - BDay.apply - BDay.apply_index - BDay.copy - BDay.isAnchored - BDay.onOffset - BDay.is_anchored - BDay.is_on_offset - BDay.rollback - BDay.rollforward - BDay.__call__ - -BMonthEnd ---------- -.. autosummary:: - :toctree: api/ - - BMonthEnd - -Properties -~~~~~~~~~~ -.. 
autosummary:: - :toctree: api/ - - BMonthEnd.base - BMonthEnd.freqstr - BMonthEnd.kwds - BMonthEnd.name - BMonthEnd.nanos - BMonthEnd.normalize - BMonthEnd.rule_code - BMonthEnd.n - -Methods -~~~~~~~ -.. autosummary:: - :toctree: api/ - - BMonthEnd.apply - BMonthEnd.apply_index - BMonthEnd.copy - BMonthEnd.isAnchored - BMonthEnd.onOffset - BMonthEnd.is_anchored - BMonthEnd.is_on_offset - BMonthEnd.rollback - BMonthEnd.rollforward - BMonthEnd.__call__ - -BMonthBegin ------------ -.. autosummary:: - :toctree: api/ - - BMonthBegin - -Properties -~~~~~~~~~~ -.. autosummary:: - :toctree: api/ - - BMonthBegin.base - BMonthBegin.freqstr - BMonthBegin.kwds - BMonthBegin.name - BMonthBegin.nanos - BMonthBegin.normalize - BMonthBegin.rule_code - BMonthBegin.n - -Methods -~~~~~~~ -.. autosummary:: - :toctree: api/ - - BMonthBegin.apply - BMonthBegin.apply_index - BMonthBegin.copy - BMonthBegin.isAnchored - BMonthBegin.onOffset - BMonthBegin.is_anchored - BMonthBegin.is_on_offset - BMonthBegin.rollback - BMonthBegin.rollforward - BMonthBegin.__call__ - -CBMonthEnd ----------- -.. autosummary:: - :toctree: api/ - - CBMonthEnd - -Properties -~~~~~~~~~~ -.. autosummary:: - :toctree: api/ - - CBMonthEnd.base - CBMonthEnd.cbday_roll - CBMonthEnd.freqstr - CBMonthEnd.kwds - CBMonthEnd.m_offset - CBMonthEnd.month_roll - CBMonthEnd.name - CBMonthEnd.nanos - CBMonthEnd.normalize - CBMonthEnd.offset - CBMonthEnd.rule_code - CBMonthEnd.n - CBMonthEnd.weekmask - CBMonthEnd.holidays - CBMonthEnd.calendar - -Methods -~~~~~~~ -.. autosummary:: - :toctree: api/ - - CBMonthEnd.apply - CBMonthEnd.apply_index - CBMonthEnd.copy - CBMonthEnd.isAnchored - CBMonthEnd.onOffset - CBMonthEnd.is_anchored - CBMonthEnd.is_on_offset - CBMonthEnd.rollback - CBMonthEnd.rollforward - CBMonthEnd.__call__ - -CBMonthBegin ------------- -.. autosummary:: - :toctree: api/ - - CBMonthBegin - -Properties -~~~~~~~~~~ -.. 
autosummary:: - :toctree: api/ - - CBMonthBegin.base - CBMonthBegin.cbday_roll - CBMonthBegin.freqstr - CBMonthBegin.kwds - CBMonthBegin.m_offset - CBMonthBegin.month_roll - CBMonthBegin.name - CBMonthBegin.nanos - CBMonthBegin.normalize - CBMonthBegin.offset - CBMonthBegin.rule_code - CBMonthBegin.n - CBMonthBegin.weekmask - CBMonthBegin.holidays - CBMonthBegin.calendar - -Methods -~~~~~~~ -.. autosummary:: - :toctree: api/ - - CBMonthBegin.apply - CBMonthBegin.apply_index - CBMonthBegin.copy - CBMonthBegin.isAnchored - CBMonthBegin.onOffset - CBMonthBegin.is_anchored - CBMonthBegin.is_on_offset - CBMonthBegin.rollback - CBMonthBegin.rollforward - CBMonthBegin.__call__ - -CDay ----- -.. autosummary:: - :toctree: api/ - - CDay - -Properties -~~~~~~~~~~ -.. autosummary:: - :toctree: api/ - - CDay.base - CDay.freqstr - CDay.kwds - CDay.name - CDay.nanos - CDay.normalize - CDay.offset - CDay.rule_code - CDay.n - CDay.weekmask - CDay.calendar - CDay.holidays - -Methods -~~~~~~~ -.. autosummary:: - :toctree: api/ - - CDay.apply - CDay.apply_index - CDay.copy - CDay.isAnchored - CDay.onOffset - CDay.is_anchored - CDay.is_on_offset - CDay.rollback - CDay.rollforward - CDay.__call__ - - .. 
_api.frequencies: =========== From 41c79ada76ca7a05b21532da105966e6ccfc7f1f Mon Sep 17 00:00:00 2001 From: smartvinnetou <61093810+smartvinnetou@users.noreply.github.com> Date: Fri, 19 Jun 2020 13:01:13 +0100 Subject: [PATCH 0153/1025] CLN: Using doc decorator instead of Appender/Substitution in several Series/DataFrame methods (#33277) --- pandas/core/frame.py | 32 +- pandas/core/generic.py | 564 ++++++++++++++------------------ pandas/core/groupby/generic.py | 10 +- pandas/core/groupby/groupby.py | 14 +- pandas/core/resample.py | 4 +- pandas/core/series.py | 86 +++-- pandas/core/window/ewm.py | 6 +- pandas/core/window/expanding.py | 6 +- pandas/core/window/rolling.py | 10 +- 9 files changed, 360 insertions(+), 372 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2c80f57e4ef5d..39ca7ed47f7fa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2183,9 +2183,10 @@ def to_feather(self, path, **kwargs) -> None: to_feather(self, path, **kwargs) - @Appender( - """ - Examples + @doc( + Series.to_markdown, + klass=_shared_doc_kwargs["klass"], + examples="""Examples -------- >>> df = pd.DataFrame( ... 
data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} @@ -2206,10 +2207,8 @@ def to_feather(self, path, **kwargs) -> None: +----+------------+------------+ | 1 | pig | quetzal | +----+------------+------------+ - """ + """, ) - @Substitution(klass="DataFrame") - @Appender(_shared_docs["to_markdown"]) def to_markdown( self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs ) -> Optional[str]: @@ -4758,20 +4757,20 @@ def _maybe_casted_values(index, labels=None): # ---------------------------------------------------------------------- # Reindex-based selection methods - @Appender(_shared_docs["isna"] % _shared_doc_kwargs) + @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isna(self) -> "DataFrame": result = self._constructor(self._data.isna(func=isna)) return result.__finalize__(self, method="isna") - @Appender(_shared_docs["isna"] % _shared_doc_kwargs) + @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isnull(self) -> "DataFrame": return self.isna() - @Appender(_shared_docs["notna"] % _shared_doc_kwargs) + @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) def notna(self) -> "DataFrame": return ~self.isna() - @Appender(_shared_docs["notna"] % _shared_doc_kwargs) + @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) def notnull(self) -> "DataFrame": return ~self.isna() @@ -7330,13 +7329,14 @@ def _gotitem( """ ) - @Substitution( + @doc( + _shared_docs["aggregate"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], see_also=_agg_summary_and_see_also_doc, examples=_agg_examples_doc, versionadded="\n.. 
versionadded:: 0.20.0\n", - **_shared_doc_kwargs, ) - @Appender(_shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) @@ -7364,7 +7364,11 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): agg = aggregate - @Appender(_shared_docs["transform"] % _shared_doc_kwargs) + @doc( + NDFrame.transform, + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + ) def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": axis = self._get_axis_number(axis) if axis == 1: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 26770efb5c9f9..701909c9df857 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -847,7 +847,7 @@ def rename( copy : bool, default True Also copy underlying data. inplace : bool, default False - Whether to return a new %(klass)s. If True then value of copy is + Whether to return a new {klass}. If True then value of copy is ignored. level : int or level name, default None In case of a MultiIndex, only rename labels in the specified @@ -861,7 +861,7 @@ def rename( Returns ------- - renamed : %(klass)s (new object) + renamed : {klass} (new object) Raises ------ @@ -1897,29 +1897,6 @@ def _repr_data_resource_(self): # ---------------------------------------------------------------------- # I/O Methods - _shared_docs[ - "to_markdown" - ] = """ - Print %(klass)s in Markdown-friendly format. - - .. versionadded:: 1.0.0 - - Parameters - ---------- - buf : str, Path or StringIO-like, optional, default None - Buffer to write to. If None, the output is returned as a string. - mode : str, optional - Mode in which file is opened. - **kwargs - These parameters will be passed to `tabulate \ - `_. - - Returns - ------- - str - %(klass)s in Markdown-friendly format. 
- """ - @doc(klass="object") def to_excel( self, @@ -4236,9 +4213,15 @@ def sort_values( """ raise AbstractMethodError(self) + @doc( + klass=_shared_doc_kwargs["klass"], + axes=_shared_doc_kwargs["axes"], + optional_labels="", + optional_axis="", + ) def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: """ - Conform %(klass)s to new index with optional filling logic. + Conform {klass} to new index with optional filling logic. Places NA/NaN in locations having no value in the previous index. A new object is produced unless the new index is equivalent to the current one and @@ -4246,12 +4229,12 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: Parameters ---------- - %(optional_labels)s - %(axes)s : array-like, optional + {optional_labels} + {axes} : array-like, optional New labels / index to conform to, should be specified using keywords. Preferably an Index object to avoid duplicating data. - %(optional_axis)s - method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + {optional_axis} + method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}} Method to use for filling holes in reindexed DataFrame. Please note: this is only applicable to DataFrames/Series with a monotonically increasing/decreasing index. @@ -4285,7 +4268,7 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: Returns ------- - %(klass)s with changed index. + {klass} with changed index. See Also -------- @@ -4298,7 +4281,7 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: ``DataFrame.reindex`` supports two calling conventions * ``(index=index_labels, columns=column_labels, ...)`` - * ``(labels, axis={'index', 'columns'}, ...)`` + * ``(labels, axis={{'index', 'columns'}}, ...)`` We *highly* recommend using keyword arguments to clarify your intent. @@ -4306,8 +4289,8 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: Create a dataframe with some fictional data. 
>>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror'] - >>> df = pd.DataFrame({'http_status': [200, 200, 404, 404, 301], - ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}, + >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301], + ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}}, ... index=index) >>> df http_status response_time @@ -4378,7 +4361,7 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: of dates). >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D') - >>> df2 = pd.DataFrame({"prices": [100, 101, np.nan, 100, 89, 88]}, + >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}}, ... index=date_index) >>> df2 prices @@ -5012,19 +4995,19 @@ def sample( locs = rs.choice(axis_length, size=n, replace=replace, p=weights) return self.take(locs, axis=axis) - _shared_docs[ - "pipe" - ] = r""" + @doc(klass=_shared_doc_kwargs["klass"]) + def pipe(self, func, *args, **kwargs): + r""" Apply func(self, \*args, \*\*kwargs). Parameters ---------- func : function - Function to apply to the %(klass)s. + Function to apply to the {klass}. ``args``, and ``kwargs`` are passed into ``func``. Alternatively a ``(callable, data_keyword)`` tuple where ``data_keyword`` is a string indicating the keyword of - ``callable`` that expects the %(klass)s. + ``callable`` that expects the {klass}. args : iterable, optional Positional arguments passed into ``func``. kwargs : mapping, optional @@ -5064,121 +5047,49 @@ def sample( ... .pipe((func, 'arg2'), arg1=a, arg3=c) ... ) # doctest: +SKIP """ - - @Appender(_shared_docs["pipe"] % _shared_doc_kwargs) - def pipe(self, func, *args, **kwargs): return com.pipe(self, func, *args, **kwargs) _shared_docs["aggregate"] = dedent( """ - Aggregate using one or more operations over the specified axis. - %(versionadded)s - Parameters - ---------- - func : function, str, list or dict - Function to use for aggregating the data. 
If a function, must either - work when passed a %(klass)s or when passed to %(klass)s.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of axis labels -> functions, function names or list of such. - %(axis)s - *args - Positional arguments to pass to `func`. - **kwargs - Keyword arguments to pass to `func`. - - Returns - ------- - scalar, Series or DataFrame - - The return can be: - - * scalar : when Series.agg is called with single function - * Series : when DataFrame.agg is called with a single function - * DataFrame : when DataFrame.agg is called with several functions - - Return scalar, Series or DataFrame. - %(see_also)s - Notes - ----- - `agg` is an alias for `aggregate`. Use the alias. - - A passed user-defined-function will be passed a Series for evaluation. - %(examples)s""" - ) + Aggregate using one or more operations over the specified axis. + {versionadded} + Parameters + ---------- + func : function, str, list or dict + Function to use for aggregating the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. + {axis} + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. - _shared_docs[ - "transform" - ] = """ - Call ``func`` on self producing a %(klass)s with transformed values. + Returns + ------- + scalar, Series or DataFrame - Produced %(klass)s will have same axis length as self. + The return can be: - Parameters - ---------- - func : function, str, list or dict - Function to use for transforming the data. If a function, must either - work when passed a %(klass)s or when passed to %(klass)s.apply. 
- - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` - - dict of axis labels -> functions, function names or list of such. - %(axis)s - *args - Positional arguments to pass to `func`. - **kwargs - Keyword arguments to pass to `func`. - - Returns - ------- - %(klass)s - A %(klass)s that must have the same length as self. - - Raises - ------ - ValueError : If the returned %(klass)s has a different length than self. - - See Also - -------- - %(klass)s.agg : Only perform aggregating type operations. - %(klass)s.apply : Invoke function on a %(klass)s. - - Examples - -------- - >>> df = pd.DataFrame({'A': range(3), 'B': range(1, 4)}) - >>> df - A B - 0 0 1 - 1 1 2 - 2 2 3 - >>> df.transform(lambda x: x + 1) - A B - 0 1 2 - 1 2 3 - 2 3 4 - - Even though the resulting %(klass)s must have the same length as the - input %(klass)s, it is possible to provide several input functions: - - >>> s = pd.Series(range(3)) - >>> s - 0 0 - 1 1 - 2 2 - dtype: int64 - >>> s.transform([np.sqrt, np.exp]) - sqrt exp - 0 0.000000 1.000000 - 1 1.000000 2.718282 - 2 1.414214 7.389056 - """ + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + + Return scalar, Series or DataFrame. + {see_also} + Notes + ----- + `agg` is an alias for `aggregate`. Use the alias. + + A passed user-defined-function will be passed a Series for evaluation. + {examples}""" + ) # ---------------------------------------------------------------------- # Attribute access @@ -6188,7 +6099,7 @@ def ffill( Returns ------- - %(klass)s or None + {klass} or None Object with missing values filled or None if ``inplace=True``. """ return self.fillna( @@ -6209,7 +6120,7 @@ def bfill( Returns ------- - %(klass)s or None + {klass} or None Object with missing values filled or None if ``inplace=True``. 
""" return self.fillna( @@ -6680,9 +6591,18 @@ def replace( else: return result.__finalize__(self, method="replace") - _shared_docs[ - "interpolate" - ] = """ + def interpolate( + self: FrameOrSeries, + method: str = "linear", + axis: Axis = 0, + limit: Optional[int] = None, + inplace: bool_t = False, + limit_direction: Optional[str] = None, + limit_area: Optional[str] = None, + downcast: Optional[str] = None, + **kwargs, + ) -> Optional[FrameOrSeries]: + """ Please note that only ``method='linear'`` is supported for DataFrame/Series with a MultiIndex. @@ -6710,14 +6630,14 @@ def replace( `scipy.interpolate.BPoly.from_derivatives` which replaces 'piecewise_polynomial' interpolation method in scipy 0.18. - axis : {0 or 'index', 1 or 'columns', None}, default None + axis : {{0 or 'index', 1 or 'columns', None}}, default None Axis to interpolate along. limit : int, optional Maximum number of consecutive NaNs to fill. Must be greater than 0. inplace : bool, default False Update the data in place if possible. - limit_direction : {'forward', 'backward', 'both'}, Optional + limit_direction : {{'forward', 'backward', 'both'}}, Optional Consecutive NaNs will be filled in this direction. If limit is specified: @@ -6735,7 +6655,7 @@ def replace( raises ValueError if `limit_direction` is 'backward' or 'both' and method is 'pad' or 'ffill'. - limit_area : {`None`, 'inside', 'outside'}, default None + limit_area : {{`None`, 'inside', 'outside'}}, default None If limit is specified, consecutive NaNs will be filled with this restriction. 
@@ -6877,22 +6797,6 @@ def replace( 3 16.0 Name: d, dtype: float64 """ - - @Appender(_shared_docs["interpolate"] % _shared_doc_kwargs) - def interpolate( - self: FrameOrSeries, - method: str = "linear", - axis: Axis = 0, - limit: Optional[int] = None, - inplace: bool_t = False, - limit_direction: Optional[str] = None, - limit_area: Optional[str] = None, - downcast: Optional[str] = None, - **kwargs, - ) -> Optional[FrameOrSeries]: - """ - Interpolate values according to different methods. - """ inplace = validate_bool_kwarg(inplace, "inplace") axis = self._get_axis_number(axis) @@ -7148,9 +7052,9 @@ def asof(self, where, subset=None): # ---------------------------------------------------------------------- # Action Methods - _shared_docs[ - "isna" - ] = """ + @doc(klass=_shared_doc_kwargs["klass"]) + def isna(self: FrameOrSeries) -> FrameOrSeries: + """ Detect missing values. Return a boolean same-sized object indicating if the values are NA. @@ -7162,26 +7066,26 @@ def asof(self, where, subset=None): Returns ------- - %(klass)s - Mask of bool values for each element in %(klass)s that + {klass} + Mask of bool values for each element in {klass} that indicates whether an element is not an NA value. See Also -------- - %(klass)s.isnull : Alias of isna. - %(klass)s.notna : Boolean inverse of isna. - %(klass)s.dropna : Omit axes labels with missing values. + {klass}.isnull : Alias of isna. + {klass}.notna : Boolean inverse of isna. + {klass}.dropna : Omit axes labels with missing values. isna : Top-level isna. Examples -------- Show which entries in a DataFrame are NA. - >>> df = pd.DataFrame({'age': [5, 6, np.NaN], + >>> df = pd.DataFrame({{'age': [5, 6, np.NaN], ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... 'name': ['Alfred', 'Batman', ''], - ... 'toy': [None, 'Batmobile', 'Joker']}) + ... 
'toy': [None, 'Batmobile', 'Joker']}}) >>> df age born name toy 0 5.0 NaT Alfred None @@ -7209,18 +7113,15 @@ def asof(self, where, subset=None): 2 True dtype: bool """ - - @Appender(_shared_docs["isna"] % _shared_doc_kwargs) - def isna(self: FrameOrSeries) -> FrameOrSeries: return isna(self).__finalize__(self, method="isna") - @Appender(_shared_docs["isna"] % _shared_doc_kwargs) + @doc(isna, klass=_shared_doc_kwargs["klass"]) def isnull(self: FrameOrSeries) -> FrameOrSeries: return isna(self).__finalize__(self, method="isnull") - _shared_docs[ - "notna" - ] = """ + @doc(klass=_shared_doc_kwargs["klass"]) + def notna(self: FrameOrSeries) -> FrameOrSeries: + """ Detect existing (non-missing) values. Return a boolean same-sized object indicating if the values are not NA. @@ -7232,26 +7133,26 @@ def isnull(self: FrameOrSeries) -> FrameOrSeries: Returns ------- - %(klass)s - Mask of bool values for each element in %(klass)s that + {klass} + Mask of bool values for each element in {klass} that indicates whether an element is not an NA value. See Also -------- - %(klass)s.notnull : Alias of notna. - %(klass)s.isna : Boolean inverse of notna. - %(klass)s.dropna : Omit axes labels with missing values. + {klass}.notnull : Alias of notna. + {klass}.isna : Boolean inverse of notna. + {klass}.dropna : Omit axes labels with missing values. notna : Top-level notna. Examples -------- Show which entries in a DataFrame are not NA. - >>> df = pd.DataFrame({'age': [5, 6, np.NaN], + >>> df = pd.DataFrame({{'age': [5, 6, np.NaN], ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], ... 'name': ['Alfred', 'Batman', ''], - ... 'toy': [None, 'Batmobile', 'Joker']}) + ... 
'toy': [None, 'Batmobile', 'Joker']}}) >>> df age born name toy 0 5.0 NaT Alfred None @@ -7279,12 +7180,9 @@ def isnull(self: FrameOrSeries) -> FrameOrSeries: 2 False dtype: bool """ - - @Appender(_shared_docs["notna"] % _shared_doc_kwargs) - def notna(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self, method="notna") - @Appender(_shared_docs["notna"] % _shared_doc_kwargs) + @doc(notna, klass=_shared_doc_kwargs["klass"]) def notnull(self: FrameOrSeries) -> FrameOrSeries: return notna(self).__finalize__(self, method="notnull") @@ -8966,32 +8864,47 @@ def _where( result = self._constructor(new_data) return result.__finalize__(self) - _shared_docs[ - "where" - ] = """ - Replace values where the condition is %(cond_rev)s. + @doc( + klass=_shared_doc_kwargs["klass"], + cond="True", + cond_rev="False", + name="where", + name_other="mask", + ) + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=False, + ): + """ + Replace values where the condition is {cond_rev}. Parameters ---------- - cond : bool %(klass)s, array-like, or callable - Where `cond` is %(cond)s, keep the original value. Where - %(cond_rev)s, replace with corresponding value from `other`. - If `cond` is callable, it is computed on the %(klass)s and - should return boolean %(klass)s or array. The callable must - not change input %(klass)s (though pandas doesn't check it). - other : scalar, %(klass)s, or callable - Entries where `cond` is %(cond_rev)s are replaced with + cond : bool {klass}, array-like, or callable + Where `cond` is {cond}, keep the original value. Where + {cond_rev}, replace with corresponding value from `other`. + If `cond` is callable, it is computed on the {klass} and + should return boolean {klass} or array. The callable must + not change input {klass} (though pandas doesn't check it). 
+ other : scalar, {klass}, or callable + Entries where `cond` is {cond_rev} are replaced with corresponding value from `other`. - If other is callable, it is computed on the %(klass)s and - should return scalar or %(klass)s. The callable must not - change input %(klass)s (though pandas doesn't check it). + If other is callable, it is computed on the {klass} and + should return scalar or {klass}. The callable must not + change input {klass} (though pandas doesn't check it). inplace : bool, default False Whether to perform the operation in place on the data. axis : int, default None Alignment axis if needed. level : int, default None Alignment level if needed. - errors : str, {'raise', 'ignore'}, default 'raise' + errors : str, {{'raise', 'ignore'}}, default 'raise' Note that currently this parameter won't affect the results and will always coerce to a suitable dtype. @@ -9007,13 +8920,13 @@ def _where( See Also -------- - :func:`DataFrame.%(name_other)s` : Return an object of same shape as + :func:`DataFrame.{name_other}` : Return an object of same shape as self. Notes ----- - The %(name)s method is an application of the if-then idiom. For each - element in the calling DataFrame, if ``cond`` is ``%(cond)s`` the + The {name} method is an application of the if-then idiom. For each + element in the calling DataFrame, if ``cond`` is ``{cond}`` the element is used; otherwise the corresponding element from the DataFrame ``other`` is used. @@ -9021,7 +8934,7 @@ def _where( :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to ``np.where(m, df1, df2)``. - For further details and examples see the ``%(name)s`` documentation in + For further details and examples see the ``{name}`` documentation in :ref:`indexing `. 
Examples @@ -9059,7 +8972,7 @@ def _where( 2 4 5 3 6 7 4 8 9 - >>> m = df %% 3 == 0 + >>> m = df % 3 == 0 >>> df.where(m, -df) A B 0 0 -1 @@ -9082,42 +8995,18 @@ def _where( 3 True True 4 True True """ - - @Appender( - _shared_docs["where"] - % dict( - _shared_doc_kwargs, - cond="True", - cond_rev="False", - name="where", - name_other="mask", - ) - ) - def where( - self, - cond, - other=np.nan, - inplace=False, - axis=None, - level=None, - errors="raise", - try_cast=False, - ): - other = com.apply_if_callable(other, self) return self._where( cond, other, inplace, axis, level, errors=errors, try_cast=try_cast ) - @Appender( - _shared_docs["where"] - % dict( - _shared_doc_kwargs, - cond="False", - cond_rev="True", - name="mask", - name_other="where", - ) + @doc( + where, + klass=_shared_doc_kwargs["klass"], + cond="False", + cond_rev="True", + name="mask", + name_other="where", ) def mask( self, @@ -9548,7 +9437,7 @@ def tz_convert( Returns ------- - %(klass)s + {klass} Object with time zone converted axis. Raises @@ -10171,9 +10060,15 @@ def describe_1d(data): d.columns = data.columns.copy() return d - _shared_docs[ - "pct_change" - ] = """ + def pct_change( + self: FrameOrSeries, + periods=1, + fill_method="pad", + limit=None, + freq=None, + **kwargs, + ) -> FrameOrSeries: + """ Percentage change between the current and a prior element. Computes the percentage change from the immediately previous row by @@ -10287,17 +10182,6 @@ def describe_1d(data): GOOG NaN -0.151997 -0.086016 APPL NaN 0.337604 0.012002 """ - - @Appender(_shared_docs["pct_change"] % _shared_doc_kwargs) - def pct_change( - self: FrameOrSeries, - periods=1, - fill_method="pad", - limit=None, - freq=None, - **kwargs, - ) -> FrameOrSeries: - # TODO: Not sure if above is correct - need someone to confirm. 
axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name)) if fill_method is None: data = self @@ -10357,18 +10241,35 @@ def _add_numeric_operations(cls): empty_value=True, ) - @Substitution( + @doc( desc="Return the mean absolute deviation of the values " "for the requested axis.", name1=name1, name2=name2, axis_descr=axis_descr, - min_count="", see_also="", examples="", ) - @Appender(_num_doc_mad) def mad(self, axis=None, skipna=None, level=None): + """ + {desc} + + Parameters + ---------- + axis : {axis_descr} + Axis for the function to be applied on. + skipna : bool, default None + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a {name1}. + + Returns + ------- + {name1} or {name2} (if level specified)\ + {see_also}\ + {examples} + """ if skipna is None: skipna = True if axis is None: @@ -10633,8 +10534,74 @@ def ewm( cls.ewm = ewm - @Appender(_shared_docs["transform"] % dict(axis="", **_shared_doc_kwargs)) + @doc(klass=_shared_doc_kwargs["klass"], axis="") def transform(self, func, *args, **kwargs): + """ + Call ``func`` on self producing a {klass} with transformed values. + + Produced {klass} will have same axis length as self. + + Parameters + ---------- + func : function, str, list or dict + Function to use for transforming the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` + - dict of axis labels -> functions, function names or list of such. + {axis} + *args + Positional arguments to pass to `func`. + **kwargs + Keyword arguments to pass to `func`. + + Returns + ------- + {klass} + A {klass} that must have the same length as self. 
+ + Raises + ------ + ValueError : If the returned {klass} has a different length than self. + + See Also + -------- + {klass}.agg : Only perform aggregating type operations. + {klass}.apply : Invoke function on a {klass}. + + Examples + -------- + >>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) + >>> df + A B + 0 0 1 + 1 1 2 + 2 2 3 + >>> df.transform(lambda x: x + 1) + A B + 0 1 2 + 1 2 3 + 2 3 4 + + Even though the resulting {klass} must have the same length as the + input {klass}, it is possible to provide several input functions: + + >>> s = pd.Series(range(3)) + >>> s + 0 0 + 1 1 + 2 2 + dtype: int64 + >>> s.transform([np.sqrt, np.exp]) + sqrt exp + 0 0.000000 1.000000 + 1 1.000000 2.718282 + 2 1.414214 7.389056 + """ result = self.agg(func, *args, **kwargs) if is_scalar(result) or len(result) != len(self): raise ValueError("transforms cannot produce aggregated results") @@ -10644,21 +10611,6 @@ def transform(self, func, *args, **kwargs): # ---------------------------------------------------------------------- # Misc methods - _shared_docs[ - "valid_index" - ] = """ - Return index for %(position)s non-NA/null value. - - Returns - ------- - scalar : type of index - - Notes - ----- - If all elements are non-NA/null, returns None. - Also returns None for empty %(klass)s. - """ - def _find_valid_index(self, how: str): """ Retrieves the index of the first valid value. @@ -10677,15 +10629,23 @@ def _find_valid_index(self, how: str): return None return self.index[idxpos] - @Appender( - _shared_docs["valid_index"] % {"position": "first", "klass": "Series/DataFrame"} - ) + @doc(position="first", klass=_shared_doc_kwargs["klass"]) def first_valid_index(self): + """ + Return index for {position} non-NA/null value. + + Returns + ------- + scalar : type of index + + Notes + ----- + If all elements are non-NA/null, returns None. + Also returns None for empty {klass}. 
+ """ return self._find_valid_index("first") - @Appender( - _shared_docs["valid_index"] % {"position": "last", "klass": "Series/DataFrame"} - ) + @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"]) def last_valid_index(self): return self._find_valid_index("last") @@ -10726,26 +10686,6 @@ def _doc_parms(cls): %(examples)s """ -_num_doc_mad = """ -%(desc)s - -Parameters ----------- -axis : %(axis_descr)s - Axis for the function to be applied on. -skipna : bool, default None - Exclude NA/null values when computing the result. -level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a %(name1)s. - -Returns -------- -%(name1)s or %(name2)s (if level specified)\ -%(see_also)s\ -%(examples)s -""" - _num_ddof_doc = """ %(desc)s diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cec3d9711a8ca..bc5cf595e49f9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -224,10 +224,9 @@ def _selection_name(self): def apply(self, func, *args, **kwargs): return super().apply(func, *args, **kwargs) - @Substitution( - examples=_agg_examples_doc, klass="Series", + @doc( + _agg_template, examples=_agg_examples_doc, klass="Series", ) - @Appender(_agg_template) def aggregate( self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs ): @@ -915,10 +914,9 @@ class DataFrameGroupBy(GroupBy[DataFrame]): See :ref:`groupby.aggregate.named` for more.""" ) - @Substitution( - examples=_agg_examples_doc, klass="DataFrame", + @doc( + _agg_template, examples=_agg_examples_doc, klass="DataFrame", ) - @Appender(_agg_template) def aggregate( self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs ): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b92e75f16e965..02f7f605a7605 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -372,7 +372,7 @@ class 
providing the base-class of operations. ---------- func : function, str, list or dict Function to use for aggregating the data. If a function, must either - work when passed a %(klass)s or when passed to %(klass)s.apply. + work when passed a {klass} or when passed to {klass}.apply. Accepted combinations are: @@ -403,7 +403,7 @@ class providing the base-class of operations. * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` and ``parallel`` dictionary keys. The values must either be ``True`` or ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be applied to the function .. versionadded:: 1.1.0 @@ -412,20 +412,20 @@ class providing the base-class of operations. Returns ------- -%(klass)s +{klass} See Also -------- -%(klass)s.groupby.apply -%(klass)s.groupby.transform -%(klass)s.aggregate +{klass}.groupby.apply +{klass}.groupby.transform +{klass}.aggregate Notes ----- When using ``engine='numba'``, there will be no "fall back" behavior internally. The group data and group index will be passed as numpy arrays to the JITed user defined function, and no alternative execution attempts will be tried. 
-%(examples)s +{examples} """ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 5e363f2814d39..bfdfc65723433 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -274,14 +274,14 @@ def pipe(self, func, *args, **kwargs): """ ) - @Substitution( + @doc( + _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, versionadded="", klass="DataFrame", axis="", ) - @Appender(_shared_docs["aggregate"]) def aggregate(self, func, *args, **kwargs): self._set_binner() diff --git a/pandas/core/series.py b/pandas/core/series.py index a27e44efe1a97..cab8dd133b579 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1405,8 +1405,46 @@ def to_string( with open(buf, "w") as f: f.write(result) - @Appender( + @doc( + klass=_shared_doc_kwargs["klass"], + examples=dedent( + """ + Examples + -------- + >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") + >>> print(s.to_markdown()) + | | animal | + |---:|:---------| + | 0 | elk | + | 1 | pig | + | 2 | dog | + | 3 | quetzal | + """ + ), + ) + def to_markdown( + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + ) -> Optional[str]: """ + Print {klass} in Markdown-friendly format. + + .. versionadded:: 1.0.0 + + Parameters + ---------- + buf : str, Path or StringIO-like, optional, default None + Buffer to write to. If None, the output is returned as a string. + mode : str, optional + Mode in which file is opened. + **kwargs + These parameters will be passed to `tabulate \ + `_. + + Returns + ------- + str + {klass} in Markdown-friendly format. 
+ Examples -------- >>> s = pd.Series(["elk", "pig", "dog", "quetzal"], name="animal") @@ -1433,12 +1471,6 @@ def to_string( | 3 | quetzal | +----+----------+ """ - ) - @Substitution(klass="Series") - @Appender(generic._shared_docs["to_markdown"]) - def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs - ) -> Optional[str]: return self.to_frame().to_markdown(buf, mode, **kwargs) # ---------------------------------------------------------------------- @@ -3959,13 +3991,14 @@ def _gotitem(self, key, ndim, subset=None) -> "Series": """ ) - @Substitution( + @doc( + generic._shared_docs["aggregate"], + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, versionadded="\n.. versionadded:: 0.20.0\n", - **_shared_doc_kwargs, ) - @Appender(generic._shared_docs["aggregate"]) def aggregate(self, func, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) @@ -3994,7 +4027,11 @@ def aggregate(self, func, axis=0, *args, **kwargs): agg = aggregate - @Appender(generic._shared_docs["transform"] % _shared_doc_kwargs) + @doc( + NDFrame.transform, + klass=_shared_doc_kwargs["klass"], + axis=_shared_doc_kwargs["axis"], + ) def transform(self, func, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) @@ -4185,7 +4222,11 @@ def _needs_reindex_multi(self, axes, method, level): """ return False - @doc(NDFrame.align, **_shared_doc_kwargs) + @doc( + NDFrame.align, + klass=_shared_doc_kwargs["klass"], + axes_single_arg=_shared_doc_kwargs["axes_single_arg"], + ) def align( self, other, @@ -4316,8 +4357,13 @@ def rename( def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): return super().set_axis(labels, axis=axis, inplace=inplace) - @Substitution(**_shared_doc_kwargs) - @Appender(generic.NDFrame.reindex.__doc__) + @doc( + NDFrame.reindex, + klass=_shared_doc_kwargs["klass"], + axes=_shared_doc_kwargs["axes"], 
+ optional_labels=_shared_doc_kwargs["optional_labels"], + optional_axis=_shared_doc_kwargs["optional_axis"], + ) def reindex(self, index=None, **kwargs): return super().reindex(index=index, **kwargs) @@ -4446,7 +4492,7 @@ def fillna( downcast=downcast, ) - @doc(NDFrame.replace, **_shared_doc_kwargs) + @doc(NDFrame.replace, klass=_shared_doc_kwargs["klass"]) def replace( self, to_replace=None, @@ -4465,7 +4511,7 @@ def replace( method=method, ) - @doc(NDFrame.shift, **_shared_doc_kwargs) + @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "Series": return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value @@ -4686,19 +4732,19 @@ def _convert_dtypes( result = input_series.copy() return result - @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) + @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isna(self) -> "Series": return super().isna() - @Appender(generic._shared_docs["isna"] % _shared_doc_kwargs) + @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isnull(self) -> "Series": return super().isnull() - @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) + @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) def notna(self) -> "Series": return super().notna() - @Appender(generic._shared_docs["notna"] % _shared_doc_kwargs) + @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) def notnull(self) -> "Series": return super().notnull() diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 0e39b94574a12..b708020be90d2 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -7,7 +7,7 @@ import pandas._libs.window.aggregations as window_aggregations from pandas._typing import FrameOrSeries from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.generic import 
ABCDataFrame @@ -214,14 +214,14 @@ def _constructor(self): """ ) - @Substitution( + @doc( + _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, versionadded="", klass="Series/Dataframe", axis="", ) - @Appender(_shared_docs["aggregate"]) def aggregate(self, func, *args, **kwargs): return super().aggregate(func, *args, **kwargs) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 438032a0c4419..bbc19fad8b799 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -2,7 +2,7 @@ from typing import Dict, Optional from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution, doc from pandas.core.window.common import WindowGroupByMixin, _doc_template, _shared_docs from pandas.core.window.rolling import _Rolling_and_Expanding @@ -113,14 +113,14 @@ def _get_window(self, other=None, **kwargs): """ ) - @Substitution( + @doc( + _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, versionadded="", klass="Series/Dataframe", axis="", ) - @Appender(_shared_docs["aggregate"]) def aggregate(self, func, *args, **kwargs): return super().aggregate(func, *args, **kwargs) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 9cd750265133e..7d76f8b117b5e 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -15,7 +15,7 @@ from pandas._typing import Axis, FrameOrSeries, Scalar from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution, cache_readonly +from pandas.util._decorators import Appender, Substitution, cache_readonly, doc from pandas.core.dtypes.common import ( ensure_float64, @@ -1154,14 +1154,14 @@ def _get_window( """ ) - @Substitution( + @doc( + _shared_docs["aggregate"], 
see_also=_agg_see_also_doc, examples=_agg_examples_doc, versionadded="", klass="Series/DataFrame", axis="", ) - @Appender(_shared_docs["aggregate"]) def aggregate(self, func, *args, **kwargs): result, how = self._aggregate(func, *args, **kwargs) if result is None: @@ -2026,14 +2026,14 @@ def _validate_freq(self): """ ) - @Substitution( + @doc( + _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, versionadded="", klass="Series/Dataframe", axis="", ) - @Appender(_shared_docs["aggregate"]) def aggregate(self, func, *args, **kwargs): return super().aggregate(func, *args, **kwargs) From 36d0f7d02390e6f188695465de5c9a2c9494e329 Mon Sep 17 00:00:00 2001 From: Lucca Delchiaro Costabile Date: Fri, 19 Jun 2020 19:33:20 -0300 Subject: [PATCH 0154/1025] GH34529 (#34812) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/dtypes/cast.py | 2 +- pandas/tests/frame/test_apply.py | 11 +++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a27e6e8433779..f6ad3a800283d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1058,6 +1058,7 @@ Reshaping - Bug in :func:`Dataframe.aggregate` and :func:`Series.aggregate` was causing recursive loop in some cases (:issue:`34224`) - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) - Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`) +- Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements with ``S`` dtype (:issue:`34529`) Sparse ^^^^^^ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e69e3bab10af8..d0417d51da497 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1608,7 +1608,7 @@ def construct_1d_ndarray_preserving_na( """ subarr = np.array(values, dtype=dtype,
copy=copy) - if dtype is not None and dtype.kind in ("U", "S"): + if dtype is not None and dtype.kind == "U": # GH-21083 # We can't just return np.array(subarr, dtype='str') since # NumPy will convert the non-string objects into strings diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index d12699397d1e4..48a141a657cbb 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -785,6 +785,17 @@ def non_reducing_function(val): df.applymap(func) assert values == df.a.to_list() + def test_apply_with_byte_string(self): + # GH 34529 + df = pd.DataFrame(np.array([b"abcd", b"efgh"]), columns=["col"]) + expected = pd.DataFrame( + np.array([b"abcd", b"efgh"]), columns=["col"], dtype=object + ) + # After we make the apply we expect a dataframe just + # like the original but with the object datatype + result = df.apply(lambda x: x.astype("object")) + tm.assert_frame_equal(result, expected) + class TestInferOutputShape: # the user has supplied an opaque UDF where From 0886e3ad9c5aef290408e2beb1a812a2c45a174e Mon Sep 17 00:00:00 2001 From: Carsten van Weelden Date: Sat, 20 Jun 2020 13:50:49 +0200 Subject: [PATCH 0155/1025] DOC: Fix validation issues with Index.is_ docstring (#34882) --- pandas/core/indexes/base.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 057adceda7efd..b12a556a8291d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -518,7 +518,12 @@ def is_(self, other) -> bool: Returns ------- - True if both have same underlying data, False otherwise : bool + bool + True if both have same underlying data, False otherwise. + + See Also + -------- + Index.identical : Works like ``Index.is_`` but also checks metadata.
""" # use something other than None to be clearer return self._id is getattr(other, "_id", Ellipsis) and self._id is not None From 139a336e5c55323a79954171433f20451529e5c0 Mon Sep 17 00:00:00 2001 From: pvanhauw Date: Sat, 20 Jun 2020 13:58:13 +0200 Subject: [PATCH 0156/1025] DOC: Fixed table formatting in box plot section (#34885) --- doc/source/user_guide/visualization.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 4cd7b9e8cecca..305221b767aff 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -443,9 +443,8 @@ Faceting, created by ``DataFrame.boxplot`` with the ``by`` keyword, will affect the output type as well: ================ ======= ========================== -``return_type=`` Faceted Output type ----------------- ------- -------------------------- - +``return_type`` Faceted Output type +================ ======= ========================== ``None`` No axes ``None`` Yes 2-D ndarray of axes ``'axes'`` No axes From f76b2f73a62b5950b960cb5de01704236294f3e1 Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Sat, 20 Jun 2020 13:30:43 +0100 Subject: [PATCH 0157/1025] TST: Feather RoundTrip Column Ordering (#34883) --- pandas/tests/io/test_feather.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index e59100146249a..a8a5c8f00e6bf 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -115,6 +115,12 @@ def test_read_columns(self): columns = ["col1", "col3"] self.check_round_trip(df, expected=df[columns], columns=columns) + @td.skip_if_no("pyarrow", min_version="0.17.1") + def read_columns_different_order(self): + # GH 33878 + df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"], "C": [True, False]}) + self.check_round_trip(df, columns=["B", "A"]) + def test_unsupported_other(self): # mixed python objects 
From 383a2ff822f9a108b55cea8264a6c0b350fcb21e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 20 Jun 2020 09:07:52 -0400 Subject: [PATCH 0158/1025] move 3.9 travis build to allowed failuresss (#34894) --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index c5dbddacc6a43..fdea9876d5d89 100644 --- a/.travis.yml +++ b/.travis.yml @@ -69,9 +69,9 @@ matrix: env: - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" - dist: bionic - python: 3.9-dev env: - - JOB="3.9-dev" PATTERN="(not slow and not network)" + - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" + before_install: - echo "before_install" From 2d0ca2bc7501e6a258be6b8288962b6495db4178 Mon Sep 17 00:00:00 2001 From: MBrouns Date: Sat, 20 Jun 2020 15:35:17 +0200 Subject: [PATCH 0159/1025] TST: Add test to verify 'dropna' behaviour on SparseArray (#34879) --- pandas/tests/arrays/sparse/test_array.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 2f2907fbaaebc..d0cdec712f39d 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -1295,3 +1295,15 @@ def test_map_missing(): result = arr.map({0: 10, 1: 11}) tm.assert_sp_array_equal(result, expected) + + +@pytest.mark.parametrize("fill_value", [np.nan, 1]) +def test_dropna(fill_value): + # GH-28287 + arr = SparseArray([np.nan, 1], fill_value=fill_value) + exp = SparseArray([1.0], fill_value=fill_value) + tm.assert_sp_array_equal(arr.dropna(), exp) + + df = pd.DataFrame({"a": [0, 1], "b": arr}) + expected_df = pd.DataFrame({"a": [1], "b": exp}, index=pd.Int64Index([1])) + tm.assert_equal(df.dropna(), expected_df) From 8b19d7de5bb07da2fc5e6f0f51326a79d08e4109 Mon Sep 17 00:00:00 2001 From: Aidan Montare <47719225+aidanmontare-edu@users.noreply.github.com> 
Date: Sat, 20 Jun 2020 09:45:10 -0400 Subject: [PATCH 0160/1025] DOC: add mention of optional dependencies in users guide (#34890) --- doc/source/user_guide/enhancingperf.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 2056fe2f754f8..24fcb369804c6 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -13,6 +13,14 @@ when we use Cython and Numba on a test function operating row-wise on the ``DataFrame``. Using :func:`pandas.eval` we will speed up a sum by an order of ~2. +.. note:: + + In addition to following the steps in this tutorial, users interested in enhancing + performance are highly encouraged to install the + :ref:`recommended dependencies` for pandas. + These dependencies are often not installed by default, but will offer speed + improvements if present. + .. _enhancingperf.cython: Cython (writing C extensions for pandas) From e1155ff1c5e6fb80180298ce077af0f4761427df Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 20 Jun 2020 10:43:33 -0400 Subject: [PATCH 0161/1025] TST: skip gbq integration tests, xref #34779 (#34895) --- pandas/tests/io/test_gbq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index df107259d38cd..870d78ef1c533 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -148,6 +148,7 @@ def mock_read_gbq(sql, **kwargs): @pytest.mark.single +@pytest.mark.xfail(reason="skipping gbq integration for now, xref #34779") class TestToGBQIntegrationWithServiceAccountKeyPath: @pytest.fixture() def gbq_dataset(self): From d2bbdf396ea7ea6f035d3a0bab0e3bee2bc7fd04 Mon Sep 17 00:00:00 2001 From: Ram Rachum Date: Sat, 20 Jun 2020 17:47:20 +0300 Subject: [PATCH 0162/1025] Remove redundant lists in array.py (#34847) --- pandas/tests/extension/json/array.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff 
--git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 3132b39a7d6d6..447a6108fc3c7 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -179,13 +179,11 @@ def astype(self, dtype, copy=True): def unique(self): # Parent method doesn't work since np.array will try to infer # a 2-dim object. - return type(self)( - [dict(x) for x in list({tuple(d.items()) for d in self.data})] - ) + return type(self)([dict(x) for x in {tuple(d.items()) for d in self.data}]) @classmethod def _concat_same_type(cls, to_concat): - data = list(itertools.chain.from_iterable([x.data for x in to_concat])) + data = list(itertools.chain.from_iterable(x.data for x in to_concat)) return cls(data) def _values_for_factorize(self): From 96e2ddcd2e2fe2b6b1051acdd107817779500aa2 Mon Sep 17 00:00:00 2001 From: Marvzinc Date: Sat, 20 Jun 2020 18:50:33 +0200 Subject: [PATCH 0163/1025] TST: IntegerNA Support for DataFrame.diff() (#34889) --- pandas/tests/frame/methods/test_diff.py | 45 +++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index e876e40aa2eb1..45f134a93a23a 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -169,3 +169,48 @@ def test_diff_sparse(self): ) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "axis,expected", + [ + ( + 0, + pd.DataFrame( + { + "a": [np.nan, 0, 1, 0, np.nan, np.nan, np.nan, 0], + "b": [np.nan, 1, np.nan, np.nan, -2, 1, np.nan, np.nan], + "c": np.repeat(np.nan, 8), + "d": [np.nan, 3, 5, 7, 9, 11, 13, 15], + }, + dtype="Int64", + ), + ), + ( + 1, + pd.DataFrame( + { + "a": np.repeat(np.nan, 8), + "b": [0, 1, np.nan, 1, np.nan, np.nan, np.nan, 0], + "c": np.repeat(np.nan, 8), + "d": np.repeat(np.nan, 8), + }, + dtype="Int64", + ), + ), + ], + ) + def test_diff_integer_na(self, axis, expected): + # GH#24171 IntegerNA 
Support for DataFrame.diff() + df = pd.DataFrame( + { + "a": np.repeat([0, 1, np.nan, 2], 2), + "b": np.tile([0, 1, np.nan, 2], 2), + "c": np.repeat(np.nan, 8), + "d": np.arange(1, 9) ** 2, + }, + dtype="Int64", + ) + + # Test case for default behaviour of diff + result = df.diff(axis=axis) + tm.assert_frame_equal(result, expected) From fc5850906709e6f4d781b507d42a1deed0a0e0a7 Mon Sep 17 00:00:00 2001 From: marydmit Date: Sat, 20 Jun 2020 20:26:39 +0200 Subject: [PATCH 0164/1025] DOC: fixed labels in "Plotting with error bars" (#34884) --- doc/source/user_guide/visualization.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 305221b767aff..6ba5cab71bf14 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1423,7 +1423,7 @@ Here is an example of one way to easily plot group means with standard deviation # Plot fig, ax = plt.subplots() @savefig errorbar_example.png - means.plot.bar(yerr=errors, ax=ax, capsize=4) + means.plot.bar(yerr=errors, ax=ax, capsize=4, rot=0) .. ipython:: python :suppress: @@ -1444,9 +1444,9 @@ Plotting with matplotlib table is now supported in :meth:`DataFrame.plot` and : .. ipython:: python - fig, ax = plt.subplots(1, 1) + fig, ax = plt.subplots(1, 1, figsize=(7, 6.5)) df = pd.DataFrame(np.random.rand(5, 3), columns=['a', 'b', 'c']) - ax.get_xaxis().set_visible(False) # Hide Ticks + ax.xaxis.tick_top() # Display x-axis ticks on top. @savefig line_plot_table_true.png df.plot(table=True, ax=ax) @@ -1463,8 +1463,9 @@ as seen in the example below. .. ipython:: python - fig, ax = plt.subplots(1, 1) - ax.get_xaxis().set_visible(False) # Hide Ticks + fig, ax = plt.subplots(1, 1, figsize=(7, 6.75)) + ax.xaxis.tick_top() # Display x-axis ticks on top. 
+ @savefig line_plot_table_data.png df.plot(table=np.round(df.T, 2), ax=ax) From 138a48f0d0134caeddbb7d63b4b419ba8337ef16 Mon Sep 17 00:00:00 2001 From: DanBasson Date: Sat, 20 Jun 2020 22:59:21 +0300 Subject: [PATCH 0165/1025] CLN: GH29547 format with f-strings (#34502) --- pandas/tests/series/indexing/test_take.py | 6 +++--- pandas/tests/series/indexing/test_where.py | 16 +++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/pandas/tests/series/indexing/test_take.py b/pandas/tests/series/indexing/test_take.py index 9368d49e5ff2b..dc161b6be5d66 100644 --- a/pandas/tests/series/indexing/test_take.py +++ b/pandas/tests/series/indexing/test_take.py @@ -16,10 +16,10 @@ def test_take(): expected = Series([4, 2, 4], index=[4, 3, 4]) tm.assert_series_equal(actual, expected) - msg = "index {} is out of bounds for( axis 0 with)? size 5" - with pytest.raises(IndexError, match=msg.format(10)): + msg = lambda x: f"index {x} is out of bounds for( axis 0 with)? size 5" + with pytest.raises(IndexError, match=msg(10)): ser.take([1, 10]) - with pytest.raises(IndexError, match=msg.format(5)): + with pytest.raises(IndexError, match=msg(5)): ser.take([2, 5]) diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index 3f85abb4b2817..c4a2cb90f7090 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -222,12 +222,14 @@ def test_where_setitem_invalid(): # GH 2702 # make sure correct exceptions are raised on invalid list assignment - msg = "cannot set using a {} indexer with a different length than the value" - + msg = ( + lambda x: f"cannot set using a {x} indexer with a " + "different length than the value" + ) # slice s = Series(list("abc")) - with pytest.raises(ValueError, match=msg.format("slice")): + with pytest.raises(ValueError, match=msg("slice")): s[0:3] = list(range(27)) s[0:3] = list(range(3)) @@ -237,7 +239,7 @@ def test_where_setitem_invalid(): # 
slice with step s = Series(list("abcdef")) - with pytest.raises(ValueError, match=msg.format("slice")): + with pytest.raises(ValueError, match=msg("slice")): s[0:4:2] = list(range(27)) s = Series(list("abcdef")) @@ -248,7 +250,7 @@ def test_where_setitem_invalid(): # neg slices s = Series(list("abcdef")) - with pytest.raises(ValueError, match=msg.format("slice")): + with pytest.raises(ValueError, match=msg("slice")): s[:-1] = list(range(27)) s[-3:-1] = list(range(2)) @@ -258,12 +260,12 @@ def test_where_setitem_invalid(): # list s = Series(list("abc")) - with pytest.raises(ValueError, match=msg.format("list-like")): + with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(27)) s = Series(list("abc")) - with pytest.raises(ValueError, match=msg.format("list-like")): + with pytest.raises(ValueError, match=msg("list-like")): s[[0, 1, 2]] = list(range(2)) # scalar From d949336cd59ede71ff98dd380752878aba0d67d9 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Sat, 20 Jun 2020 16:04:27 -0400 Subject: [PATCH 0166/1025] DataFrame.truncate drops MultiIndex names (#34589) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/indexes/multi.py | 7 ++++++- pandas/tests/frame/methods/test_truncate.py | 13 +++++++++++++ pandas/tests/indexes/multi/test_analytics.py | 10 ++++++++-- pandas/tests/series/methods/test_truncate.py | 14 ++++++++++++++ 5 files changed, 42 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f6ad3a800283d..567b6853bd633 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -956,6 +956,7 @@ MultiIndex df.loc[(['b', 'a'], [2, 1]), :] - Bug in :meth:`MultiIndex.intersection` was not guaranteed to preserve order when ``sort=False``. (:issue:`31325`) +- Bug in :meth:`DataFrame.truncate` was dropping :class:`MultiIndex` names. (:issue:`34564`) .. 
ipython:: python diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index af70707bd3dfc..15db6c51a1f2f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3193,7 +3193,12 @@ def truncate(self, before=None, after=None): new_codes = [level_codes[left:right] for level_codes in self.codes] new_codes[0] = new_codes[0] - i - return MultiIndex(levels=new_levels, codes=new_codes, verify_integrity=False) + return MultiIndex( + levels=new_levels, + codes=new_codes, + names=self._names, + verify_integrity=False, + ) def equals(self, other) -> bool: """ diff --git a/pandas/tests/frame/methods/test_truncate.py b/pandas/tests/frame/methods/test_truncate.py index 768a5f22fb063..674f482c478a0 100644 --- a/pandas/tests/frame/methods/test_truncate.py +++ b/pandas/tests/frame/methods/test_truncate.py @@ -104,3 +104,16 @@ def test_truncate_decreasing_index(self, before, after, indices, klass): result = values.truncate(before=before, after=after) expected = values.loc[indices] tm.assert_frame_equal(result, expected) + + def test_truncate_multiindex(self): + # GH 34564 + mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"]) + s1 = pd.DataFrame(range(mi.shape[0]), index=mi, columns=["col"]) + result = s1.truncate(before=2, after=3) + + df = pd.DataFrame.from_dict( + {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]} + ) + expected = df.set_index(["L1", "L2"]) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 154ed22214830..9e4e73e793bac 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -30,7 +30,8 @@ def test_groupby(idx): tm.assert_dict_equal(groups, exp) -def test_truncate(): +def test_truncate_multiindex(): + # GH 34564 for MultiIndex level names check major_axis = Index(list(range(4))) minor_axis = Index(list(range(2))) @@ 
-38,19 +39,24 @@ def test_truncate(): minor_codes = np.array([0, 1, 0, 1, 0, 1]) index = MultiIndex( - levels=[major_axis, minor_axis], codes=[major_codes, minor_codes] + levels=[major_axis, minor_axis], + codes=[major_codes, minor_codes], + names=["L1", "L2"], ) result = index.truncate(before=1) assert "foo" not in result.levels[0] assert 1 in result.levels[0] + assert index.names == result.names result = index.truncate(after=1) assert 2 not in result.levels[0] assert 1 in result.levels[0] + assert index.names == result.names result = index.truncate(before=1, after=2) assert len(result.levels[0]) == 2 + assert index.names == result.names msg = "after < before" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index 47947f0287494..8a2c62cee7e24 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -126,3 +126,17 @@ def test_truncate_periodindex(self): expected_idx2 = pd.PeriodIndex([pd.Period("2017-09-02")]) tm.assert_series_equal(result2, pd.Series([2], index=expected_idx2)) + + def test_truncate_multiindex(self): + # GH 34564 + mi = pd.MultiIndex.from_product([[1, 2, 3, 4], ["A", "B"]], names=["L1", "L2"]) + s1 = pd.Series(range(mi.shape[0]), index=mi, name="col") + result = s1.truncate(before=2, after=3) + + df = pd.DataFrame.from_dict( + {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]} + ) + df.set_index(["L1", "L2"], inplace=True) + expected = df.col + + tm.assert_series_equal(result, expected) From 9a4b138bce10dc1bfb289226ae493f05fe4fa277 Mon Sep 17 00:00:00 2001 From: Dennis Bakhuis Date: Sat, 20 Jun 2020 22:08:39 +0200 Subject: [PATCH 0167/1025] DOC: document support for in-memory HDFStore GH33166 (#34888) --- doc/source/user_guide/cookbook.rst | 19 +++++++++++++++ pandas/io/pytables.py | 39 +++++++++++++++++++++--------- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git 
a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 56ef6fc479f2c..50b946999092a 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -1166,6 +1166,25 @@ Storing Attributes to a group node store.close() os.remove('test.h5') +You can create or load a HDFStore in-memory by passing the ``driver`` +parameter to PyTables. Changes are only written to disk when the HDFStore +is closed. + +.. ipython:: python + + store = pd.HDFStore('test.h5', 'w', driver='H5FD_CORE') + + df = pd.DataFrame(np.random.randn(8, 3)) + store['test'] = df + + # only after closing the store, data is written to disk: + store.close() + +.. ipython:: python + :suppress: + + os.remove('test.h5') + .. _cookbook.binary: Binary files diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 8aac8f9531512..800e9474cc0f8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -447,8 +447,8 @@ class HDFStore: Parameters ---------- - path : string - File path to HDF5 file + path : str + File path to HDF5 file. mode : {'a', 'w', 'r', 'r+'}, default 'a' ``'r'`` @@ -462,18 +462,20 @@ class HDFStore: ``'r+'`` It is similar to ``'a'``, but the file must already exist. complevel : int, 0-9, default None - Specifies a compression level for data. - A value of 0 or None disables compression. + Specifies a compression level for data. + A value of 0 or None disables compression. complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib' - Specifies the compression library to be used. - As of v0.20.2 these additional compressors for Blosc are supported - (default if no compressor specified: 'blosc:blosclz'): - {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', - 'blosc:zlib', 'blosc:zstd'}. - Specifying a compression library which is not available issues - a ValueError. + Specifies the compression library to be used.
+ As of v0.20.2 these additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. fletcher32 : bool, default False - If applying compression use the fletcher32 checksum + If applying compression use the fletcher32 checksum. + **kwargs + These parameters will be passed to the PyTables open_file method. Examples -------- @@ -482,6 +484,17 @@ class HDFStore: >>> store['foo'] = bar # write to HDF5 >>> bar = store['foo'] # retrieve >>> store.close() + + **Create or load HDF5 file in-memory** + + When passing the `driver` option to the PyTables open_file method through + **kwargs, the HDF5 file is loaded or created in-memory and will only be + written when closed: + + >>> bar = pd.DataFrame(np.random.randn(10, 4)) + >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE') + >>> store['foo'] = bar + >>> store.close() # only now, data is written to disk """ _handle: Optional["File"] @@ -634,6 +647,8 @@ def open(self, mode: str = "a", **kwargs): ---------- mode : {'a', 'w', 'r', 'r+'}, default 'a' See HDFStore docstring or tables.open_file for info about modes + **kwargs + These parameters will be passed to the PyTables open_file method.
""" tables = _tables() From 3fa4c1f0336b613c5a6fead60c353380abf96d0b Mon Sep 17 00:00:00 2001 From: Aidan Montare <47719225+aidanmontare-edu@users.noreply.github.com> Date: Sat, 20 Jun 2020 16:11:53 -0400 Subject: [PATCH 0168/1025] DOC: add note about the values of unit for pd.to_datetime (#34899) --- doc/source/user_guide/timeseries.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 648d93a45d210..a03ba6c775e68 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -235,6 +235,8 @@ inferred frequency upon creation: pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05'], freq='infer') +.. _timeseries.converting.format: + Providing a format argument ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -319,6 +321,12 @@ which can be specified. These are computed from the starting point specified by pd.to_datetime([1349720105100, 1349720105200, 1349720105300, 1349720105400, 1349720105500], unit='ms') +.. note:: + + The ``unit`` parameter does not use the same strings as the ``format`` parameter + that was discussed :ref:`above <timeseries.converting.format>`. The + available units are listed in the documentation for :func:`pandas.to_datetime`. + Constructing a :class:`Timestamp` or :class:`DatetimeIndex` with an epoch timestamp with the ``tz`` argument specified will currently localize the epoch timestamps to UTC first then convert the result to the specified time zone.
However, this behavior From 842d1fce5e903b4dd28a30400588634a29c41f4a Mon Sep 17 00:00:00 2001 From: avinashpancham <44933366+avinashpancham@users.noreply.github.com> Date: Sun, 21 Jun 2020 00:17:51 +0200 Subject: [PATCH 0169/1025] TST: Verify whether Datetime subclasses are also of dtype datetime (#34911) --- pandas/tests/frame/test_constructors.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index baac87755c6d2..756f3fec82b84 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2539,6 +2539,14 @@ def test_from_M8_structured(self): assert isinstance(s[0], Timestamp) assert s[0] == dates[0][0] + def test_from_datetime_subclass(self): + # GH21142 Verify whether Datetime subclasses are also of dtype datetime + class DatetimeSubclass(datetime): + pass + + data = pd.DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]}) + assert data.datetime.dtype == "datetime64[ns]" + class TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): From a31f0cd286d667cb24e6e04aebb10123e86750c9 Mon Sep 17 00:00:00 2001 From: Krishna Chivukula <63070026+KrishnaSai2020@users.noreply.github.com> Date: Sat, 20 Jun 2020 23:20:58 +0100 Subject: [PATCH 0170/1025] DOC: explain EWM (#34910) * fixed issue #34867 * pep8: line too long * Update pandas/core/generic.py Co-authored-by: Kaiqi Dong * fixed issue #34867 * fixed issue #34867 * fixed pep8 issue * renamed exponentialmoving back to ewm.py * pep 8 issues * pep 8 issues Co-authored-by: KrishnaSai2020 Co-authored-by: Kaiqi Dong --- doc/redirects.csv | 10 +++++----- doc/source/reference/window.rst | 12 ++++++------ doc/source/user_guide/computation.rst | 20 ++++++++++---------- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/frame.py | 4 ++-- pandas/core/generic.py | 11 ++++++++--- pandas/core/window/__init__.py | 2 +- pandas/core/window/ewm.py | 4 ++-- pandas/tests/window/test_ewm.py 
| 4 ++-- 9 files changed, 37 insertions(+), 32 deletions(-) diff --git a/doc/redirects.csv b/doc/redirects.csv index b59ccf649ee21..bceb4b5961324 100644 --- a/doc/redirects.csv +++ b/doc/redirects.csv @@ -269,11 +269,11 @@ generated/pandas.core.resample.Resampler.std,../reference/api/pandas.core.resamp generated/pandas.core.resample.Resampler.sum,../reference/api/pandas.core.resample.Resampler.sum generated/pandas.core.resample.Resampler.transform,../reference/api/pandas.core.resample.Resampler.transform generated/pandas.core.resample.Resampler.var,../reference/api/pandas.core.resample.Resampler.var -generated/pandas.core.window.EWM.corr,../reference/api/pandas.core.window.EWM.corr -generated/pandas.core.window.EWM.cov,../reference/api/pandas.core.window.EWM.cov -generated/pandas.core.window.EWM.mean,../reference/api/pandas.core.window.EWM.mean -generated/pandas.core.window.EWM.std,../reference/api/pandas.core.window.EWM.std -generated/pandas.core.window.EWM.var,../reference/api/pandas.core.window.EWM.var +generated/pandas.core.window.ExponentialMovingWindow.corr,../reference/api/pandas.core.window.ExponentialMovingWindow.corr +generated/pandas.core.window.ExponentialMovingWindow.cov,../reference/api/pandas.core.window.ExponentialMovingWindow.cov +generated/pandas.core.window.ExponentialMovingWindow.mean,../reference/api/pandas.core.window.ExponentialMovingWindow.mean +generated/pandas.core.window.ExponentialMovingWindow.std,../reference/api/pandas.core.window.ExponentialMovingWindow.std +generated/pandas.core.window.ExponentialMovingWindow.var,../reference/api/pandas.core.window.ExponentialMovingWindow.var generated/pandas.core.window.Expanding.aggregate,../reference/api/pandas.core.window.Expanding.aggregate generated/pandas.core.window.Expanding.apply,../reference/api/pandas.core.window.Expanding.apply generated/pandas.core.window.Expanding.corr,../reference/api/pandas.core.window.Expanding.corr diff --git a/doc/source/reference/window.rst 
b/doc/source/reference/window.rst index fb60a0d387ca2..d7e6405a3732b 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -8,7 +8,7 @@ Window Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.rolling`, :func:`pandas.Series.rolling`, etc. Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc. -EWM objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func:`pandas.Series.ewm`, etc. +ExponentialMovingWindow objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func:`pandas.Series.ewm`, etc. Standard moving window functions -------------------------------- @@ -69,11 +69,11 @@ Exponentially-weighted moving window functions .. autosummary:: :toctree: api/ - EWM.mean - EWM.std - EWM.var - EWM.corr - EWM.cov + ExponentialMovingWindow.mean + ExponentialMovingWindow.std + ExponentialMovingWindow.var + ExponentialMovingWindow.corr + ExponentialMovingWindow.cov Window indexer -------------- diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index cf630a9671013..19fdb541a6a45 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -230,7 +230,7 @@ see the :ref:`groupby docs `. The API for window statistics is quite similar to the way one works with ``GroupBy`` objects, see the documentation :ref:`here `. We work with ``rolling``, ``expanding`` and ``exponentially weighted`` data through the corresponding -objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expanding` and :class:`~pandas.core.window.EWM`. +objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expanding` and :class:`~pandas.core.window.ExponentialMovingWindow`. .. 
ipython:: python @@ -777,7 +777,7 @@ columns by reshaping and indexing: Aggregation ----------- -Once the ``Rolling``, ``Expanding`` or ``EWM`` objects have been created, several methods are available to +Once the ``Rolling``, ``Expanding`` or ``ExponentialMovingWindow`` objects have been created, several methods are available to perform multiple computations on the data. These operations are similar to the :ref:`aggregating API `, :ref:`groupby API `, and :ref:`resample API `. @@ -971,7 +971,7 @@ Exponentially weighted windows A related set of functions are exponentially weighted versions of several of the above statistics. A similar interface to ``.rolling`` and ``.expanding`` is accessed -through the ``.ewm`` method to receive an :class:`~EWM` object. +through the ``.ewm`` method to receive an :class:`~ExponentialMovingWindow` object. A number of expanding EW (exponentially weighted) methods are provided: @@ -980,11 +980,11 @@ methods are provided: :header: "Function", "Description" :widths: 20, 80 - :meth:`~EWM.mean`, EW moving average - :meth:`~EWM.var`, EW moving variance - :meth:`~EWM.std`, EW moving standard deviation - :meth:`~EWM.corr`, EW moving correlation - :meth:`~EWM.cov`, EW moving covariance + :meth:`~ExponentialMovingWindow.mean`, EW moving average + :meth:`~ExponentialMovingWindow.var`, EW moving variance + :meth:`~ExponentialMovingWindow.std`, EW moving standard deviation + :meth:`~ExponentialMovingWindow.corr`, EW moving correlation + :meth:`~ExponentialMovingWindow.cov`, EW moving covariance In general, a weighted moving average is calculated as @@ -1090,12 +1090,12 @@ Here is an example for a univariate time series: @savefig ewma_ex.png s.ewm(span=20).mean().plot(style='k') -EWM has a ``min_periods`` argument, which has the same +ExponentialMovingWindow has a ``min_periods`` argument, which has the same meaning it does for all the ``.expanding`` and ``.rolling`` methods: no output values will be set until at least ``min_periods`` non-null 
values are encountered in the (expanding) window. -EWM also has an ``ignore_na`` argument, which determines how +ExponentialMovingWindow also has an ``ignore_na`` argument, which determines how intermediate null values affect the calculation of the weights. When ``ignore_na=False`` (the default), weights are calculated based on absolute positions, so that intermediate null values affect the result. diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 44558fd63ba15..3cd920158f774 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -1206,7 +1206,7 @@ Groupby/resample/rolling - Bug in :meth:`pandas.core.groupby.GroupBy.agg` where incorrect results are returned for uint64 columns. (:issue:`26310`) - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where MemoryError is raised with empty window (:issue:`26005`) - Bug in :meth:`pandas.core.window.Rolling.median` and :meth:`pandas.core.window.Rolling.quantile` where incorrect results are returned with ``closed='left'`` and ``closed='neither'`` (:issue:`26005`) -- Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.EWM` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) +- Improved :class:`pandas.core.window.Rolling`, :class:`pandas.core.window.Window` and :class:`pandas.core.window.ExponentialMovingWindow` functions to exclude nuisance columns from results instead of raising errors and raise a ``DataError`` only if all columns are nuisance (:issue:`12537`) - Bug in :meth:`pandas.core.window.Rolling.max` and :meth:`pandas.core.window.Rolling.min` where incorrect results are returned with an empty variable window (:issue:`26005`) - Raise a helpful exception when an unsupported weighted window function is used as an argument of 
:meth:`pandas.core.window.Window.aggregate` (:issue:`26597`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 39ca7ed47f7fa..d12ebeafe8510 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7288,7 +7288,7 @@ def _gotitem( core.resample.Resampler : Perform operations over resampled bins. core.window.Rolling : Perform operations over rolling window. core.window.Expanding : Perform operations over expanding window. - core.window.EWM : Perform operation over exponential weighted + core.window.ExponentialMovingWindow : Perform operation over exponential weighted window. """ ) @@ -8171,7 +8171,7 @@ def cov(self, min_periods=None) -> "DataFrame": See Also -------- Series.cov : Compute covariance with another Series. - core.window.EWM.cov: Exponential weighted sample covariance. + core.window.ExponentialMovingWindow.cov: Exponential weighted sample covariance. core.window.Expanding.cov : Expanding sample covariance. core.window.Rolling.cov : Rolling sample covariance. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 701909c9df857..1404d225eea97 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10460,7 +10460,12 @@ def _add_series_or_dataframe_operations(cls): Add the series or dataframe only operations to the cls; evaluate the doc strings again. 
""" - from pandas.core.window import EWM, Expanding, Rolling, Window + from pandas.core.window import ( + Expanding, + ExponentialMovingWindow, + Rolling, + Window, + ) @doc(Rolling) def rolling( @@ -10507,7 +10512,7 @@ def expanding(self, min_periods=1, center=False, axis=0): cls.expanding = expanding - @doc(EWM) + @doc(ExponentialMovingWindow) def ewm( self, com=None, @@ -10520,7 +10525,7 @@ def ewm( axis=0, ): axis = self._get_axis_number(axis) - return EWM( + return ExponentialMovingWindow( self, com=com, span=span, diff --git a/pandas/core/window/__init__.py b/pandas/core/window/__init__.py index dcf58a4c0dd5b..304c61ac0e489 100644 --- a/pandas/core/window/__init__.py +++ b/pandas/core/window/__init__.py @@ -1,3 +1,3 @@ -from pandas.core.window.ewm import EWM # noqa:F401 +from pandas.core.window.ewm import ExponentialMovingWindow # noqa:F401 from pandas.core.window.expanding import Expanding, ExpandingGroupby # noqa:F401 from pandas.core.window.rolling import Rolling, RollingGroupby, Window # noqa:F401 diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index b708020be90d2..ee80f80b320e4 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -59,7 +59,7 @@ def get_center_of_mass( return float(comass) -class EWM(_Rolling): +class ExponentialMovingWindow(_Rolling): r""" Provide exponential weighted (EW) functions. 
@@ -185,7 +185,7 @@ def __init__( @property def _constructor(self): - return EWM + return ExponentialMovingWindow _agg_see_also_doc = dedent( """ diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 9ba194dcf0959..0957cac7aff95 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -4,7 +4,7 @@ from pandas.errors import UnsupportedFunctionCall from pandas import DataFrame, Series -from pandas.core.window import EWM +from pandas.core.window import ExponentialMovingWindow def test_doc_string(): @@ -56,7 +56,7 @@ def test_constructor(which): @pytest.mark.parametrize("method", ["std", "mean", "var"]) def test_numpy_compat(method): # see gh-12811 - e = EWM(Series([2, 4, 6]), alpha=0.5) + e = ExponentialMovingWindow(Series([2, 4, 6]), alpha=0.5) msg = "numpy operations are not valid with window objects" From b7f1e6de7b1ed436e63b0cc506c95aed66bda508 Mon Sep 17 00:00:00 2001 From: Rohith295 <57575037+Rohith295@users.noreply.github.com> Date: Sat, 20 Jun 2020 15:24:24 -0700 Subject: [PATCH 0171/1025] TST: groupby apply called multiple times (#34897) * :white_check_mark: * :white_check_mark: * reformatted accordingly, for linting issues * Fixed as per the review comments --- pandas/tests/groupby/test_apply.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index d03b03b3f862c..1945647ced08f 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -190,6 +190,27 @@ def f_constant_df(group): assert names == group_names +def test_group_apply_once_per_group2(capsys): + # GH: 31111 + # groupby-apply need to execute len(set(group_by_columns)) times + + expected = 2 # Number of times `apply` should call a function for the current test + + df = pd.DataFrame( + { + "group_by_column": [0, 0, 0, 0, 1, 1, 1, 1], + "test_column": ["0", "2", "4", "6", "8", "10", "12", "14"], + }, + index=["0", 
"2", "4", "6", "8", "10", "12", "14"], + ) + + df.groupby("group_by_column").apply(lambda df: print("function_called")) + + result = capsys.readouterr().out.count("function_called") + # If `groupby` behaves unexpectedly, this test will break + assert result == expected + + def test_apply_fast_slow_identical(): # GH 31613 From 87d7852cd03ad05e0640d0505a7508208486b978 Mon Sep 17 00:00:00 2001 From: avinashpancham <44933366+avinashpancham@users.noreply.github.com> Date: Sun, 21 Jun 2020 00:25:58 +0200 Subject: [PATCH 0172/1025] TST: Ensure dtypes are set correctly for empty integer columns #24386 (#34886) * TST: Ensure dtypes are set correctly for empty integer columns #24386 * Add comment to refer to GH issue tracker * Refactor check, use == instead of is * Moved file to test_constructors.py and added test for other dtypes * Add support for more dtypes * Refactor testing for data types using containers in _testing.py --- pandas/tests/frame/test_constructors.py | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 756f3fec82b84..39cab3d5ec0b8 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2245,6 +2245,33 @@ def test_from_records_empty_with_nonempty_fields_gh3682(self): tm.assert_index_equal(df.index, Index([], name="id")) assert df.index.name == "id" + @pytest.mark.parametrize( + "dtype", + tm.ALL_INT_DTYPES + + tm.ALL_EA_INT_DTYPES + + tm.FLOAT_DTYPES + + tm.COMPLEX_DTYPES + + tm.DATETIME64_DTYPES + + tm.TIMEDELTA64_DTYPES + + tm.BOOL_DTYPES, + ) + def test_check_dtype_empty_numeric_column(self, dtype): + # GH24386: Ensure dtypes are set correctly for an empty DataFrame. + # Empty DataFrame is generated via dictionary data with non-overlapping columns. 
+ data = pd.DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) + + assert data.b.dtype == dtype + + @pytest.mark.parametrize( + "dtype", tm.STRING_DTYPES + tm.BYTES_DTYPES + tm.OBJECT_DTYPES + ) + def test_check_dtype_empty_string_column(self, dtype): + # GH24386: Ensure dtypes are set correctly for an empty DataFrame. + # Empty DataFrame is generated via dictionary data with non-overlapping columns. + data = pd.DataFrame({"a": [1, 2]}, columns=["b"], dtype=dtype) + + assert data.b.dtype.name == "object" + def test_from_records_with_datetimes(self): # this may fail on certain platforms because of a numpy issue From d0194b87d67dbf78980a3e3b1760cbd3827250fe Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 20 Jun 2020 16:16:12 -0700 Subject: [PATCH 0173/1025] REF: dont use compute_reduction (#34913) --- pandas/core/apply.py | 8 +------- pandas/tests/frame/test_apply.py | 3 --- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 0a274d8becd72..90cb0e2e1be4c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -220,13 +220,7 @@ def apply_empty_result(self): def apply_raw(self): """ apply to the values as a numpy array """ - result, reduction_success = libreduction.compute_reduction( - self.values, self.f, axis=self.axis - ) - - # We expect np.apply_along_axis to give a two-dimensional result, or raise. - if not reduction_success: - result = np.apply_along_axis(self.f, self.axis, self.values) + result = np.apply_along_axis(self.f, self.axis, self.values) # TODO: mixed type case if result.ndim == 2: diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 48a141a657cbb..8f0d3d9fbc734 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -745,9 +745,6 @@ def non_reducing_function(row): df.apply(func, axis=1) assert names == list(df.index) - @pytest.mark.xfail( - reason="The 'run once' enhancement for apply_raw not implemented yet." 
- ) def test_apply_raw_function_runs_once(self): # https://github.com/pandas-dev/pandas/issues/34506 From ae7cc4548a09d50e76f0a540197323153c3e127f Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Mon, 22 Jun 2020 07:32:44 +0100 Subject: [PATCH 0174/1025] DOC: Improve docstring of Series/DataFrame.bool (#34229) --- pandas/core/generic.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1404d225eea97..bb2810ba7857f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1372,16 +1372,36 @@ def __nonzero__(self): def bool(self): """ - Return the bool of a single element PandasObject. + Return the bool of a single element Series or DataFrame. - This must be a boolean scalar value, either True or False. Raise a - ValueError if the PandasObject does not have exactly 1 element, or that - element is not boolean + This must be a boolean scalar value, either True or False. It will raise a + ValueError if the Series or DataFrame does not have exactly 1 element, or that + element is not boolean (integer values 0 and 1 will also raise an exception). Returns ------- bool - Same single boolean value converted to bool type. + The value in the Series or DataFrame. + + See Also + -------- + Series.astype : Change the data type of a Series, including to boolean. + DataFrame.astype : Change the data type of a DataFrame, including to boolean. + numpy.bool_ : NumPy boolean data type, used by pandas for boolean values. 
+ + Examples + -------- + The method will only work for single element objects with a boolean value: + + >>> pd.Series([True]).bool() + True + >>> pd.Series([False]).bool() + False + + >>> pd.DataFrame({'col': [True]}).bool() + True + >>> pd.DataFrame({'col': [False]}).bool() + False """ v = self.squeeze() if isinstance(v, (bool, np.bool_)): From 4f038ee1b2aa1ca23df99e1f6337ae31cb211a96 Mon Sep 17 00:00:00 2001 From: rhshadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 22 Jun 2020 11:30:54 -0400 Subject: [PATCH 0175/1025] BUG: groupby.hist legend should use group keys (#33493) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/plotting/_core.py | 62 +++++++++++++++-------- pandas/plotting/_matplotlib/hist.py | 32 +++++++++++- pandas/tests/plotting/test_groupby.py | 49 +++++++++++++++++- pandas/tests/plotting/test_hist_method.py | 55 +++++++++++++++++++- 5 files changed, 175 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 567b6853bd633..22b83425b58c2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -292,6 +292,7 @@ Other enhancements - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). +- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example combining a nullable integer column with a numpy integer column will no longer result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`). 
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 32cd89383dde9..4f5b7b2d7a888 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -1,7 +1,9 @@ import importlib +from typing import TYPE_CHECKING, Optional, Sequence, Tuple, Union from pandas._config import get_option +from pandas._typing import Label from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.common import is_integer, is_list_like @@ -9,19 +11,23 @@ from pandas.core.base import PandasObject +if TYPE_CHECKING: + from pandas import DataFrame + def hist_series( self, by=None, ax=None, - grid=True, - xlabelsize=None, - xrot=None, - ylabelsize=None, - yrot=None, - figsize=None, - bins=10, - backend=None, + grid: bool = True, + xlabelsize: Optional[int] = None, + xrot: Optional[float] = None, + ylabelsize: Optional[int] = None, + yrot: Optional[float] = None, + figsize: Optional[Tuple[int, int]] = None, + bins: Union[int, Sequence[int]] = 10, + backend: Optional[str] = None, + legend: bool = False, **kwargs, ): """ @@ -58,6 +64,11 @@ def hist_series( .. versionadded:: 1.0.0 + legend : bool, default False + Whether to show the legend. + + .. versionadded:: 1.1.0 + **kwargs To be passed to the actual plotting function.
@@ -82,26 +93,28 @@ def hist_series( yrot=yrot, figsize=figsize, bins=bins, + legend=legend, **kwargs, ) def hist_frame( - data, - column=None, + data: "DataFrame", + column: Union[Label, Sequence[Label]] = None, by=None, - grid=True, - xlabelsize=None, - xrot=None, - ylabelsize=None, - yrot=None, + grid: bool = True, + xlabelsize: Optional[int] = None, + xrot: Optional[float] = None, + ylabelsize: Optional[int] = None, + yrot: Optional[float] = None, ax=None, - sharex=False, - sharey=False, - figsize=None, - layout=None, - bins=10, - backend=None, + sharex: bool = False, + sharey: bool = False, + figsize: Optional[Tuple[int, int]] = None, + layout: Optional[Tuple[int, int]] = None, + bins: Union[int, Sequence[int]] = 10, + backend: Optional[str] = None, + legend: bool = False, **kwargs, ): """ @@ -154,6 +167,7 @@ def hist_frame( bin edges are calculated and returned. If bins is a sequence, gives bin edges, including left edge of first bin and right edge of last bin. In this case, bins is returned unmodified. + backend : str, default None Backend to use instead of the backend specified in the option ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to @@ -162,6 +176,11 @@ def hist_frame( .. versionadded:: 1.0.0 + legend : bool, default False + Whether to show the legend. + + .. versionadded:: 1.1.0 + **kwargs All other plotting keyword arguments to be passed to :meth:`matplotlib.pyplot.hist`.
@@ -203,6 +222,7 @@ def hist_frame( sharey=sharey, figsize=figsize, layout=layout, + legend=legend, bins=bins, **kwargs, ) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index b0ce43dc2eb36..ee41479b3c7c9 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -225,6 +225,7 @@ def _grouped_hist( xrot=None, ylabelsize=None, yrot=None, + legend=False, **kwargs, ): """ @@ -243,15 +244,26 @@ def _grouped_hist( sharey : bool, default False rot : int, default 90 grid : bool, default True + legend : bool, default False kwargs : dict, keyword arguments passed to matplotlib.Axes.hist Returns ------- collection of Matplotlib Axes """ + if legend: + assert "label" not in kwargs + if data.ndim == 1: + kwargs["label"] = data.name + elif column is None: + kwargs["label"] = data.columns + else: + kwargs["label"] = column def plot_group(group, ax): ax.hist(group.dropna().values, bins=bins, **kwargs) + if legend: + ax.legend() if xrot is None: xrot = rot @@ -290,10 +302,14 @@ def hist_series( yrot=None, figsize=None, bins=10, + legend: bool = False, **kwds, ): import matplotlib.pyplot as plt + if legend and "label" in kwds: + raise ValueError("Cannot use both legend and label") + if by is None: if kwds.get("layout", None) is not None: raise ValueError("The 'layout' keyword is not supported when 'by' is None") @@ -308,8 +324,11 @@ def hist_series( elif ax.get_figure() != fig: raise AssertionError("passed axis not bound to passed figure") values = self.dropna().values - + if legend: + kwds["label"] = self.name ax.hist(values, bins=bins, **kwds) + if legend: + ax.legend() ax.grid(grid) axes = np.array([ax]) @@ -334,6 +353,7 @@ def hist_series( xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, + legend=legend, **kwds, ) @@ -358,8 +378,11 @@ def hist_frame( figsize=None, layout=None, bins=10, + legend: bool = False, **kwds, ): + if legend and "label" in kwds: + raise ValueError("Cannot use both legend and label") 
if by is not None: axes = _grouped_hist( data, @@ -376,6 +399,7 @@ def hist_frame( xrot=xrot, ylabelsize=ylabelsize, yrot=yrot, + legend=legend, **kwds, ) return axes @@ -401,11 +425,17 @@ def hist_frame( ) _axes = _flatten(axes) + can_set_label = "label" not in kwds + for i, col in enumerate(data.columns): ax = _axes[i] + if legend and can_set_label: + kwds["label"] = col ax.hist(data[col].dropna().values, bins=bins, **kwds) ax.set_title(col) ax.grid(grid) + if legend: + ax.legend() _set_ticks_props( axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot diff --git a/pandas/tests/plotting/test_groupby.py b/pandas/tests/plotting/test_groupby.py index 238639bd3732d..4ac23c2cffa15 100644 --- a/pandas/tests/plotting/test_groupby.py +++ b/pandas/tests/plotting/test_groupby.py @@ -2,10 +2,11 @@ import numpy as np +import pytest import pandas.util._test_decorators as td -from pandas import DataFrame, Series +from pandas import DataFrame, Index, Series import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase @@ -65,3 +66,49 @@ def test_plot_kwargs(self): res = df.groupby("z").plot.scatter(x="x", y="y") assert len(res["a"].collections) == 1 + + @pytest.mark.parametrize("column, expected_axes_num", [(None, 2), ("b", 1)]) + def test_groupby_hist_frame_with_legend(self, column, expected_axes_num): + # GH 6279 - DataFrameGroupBy histogram can have a legend + expected_layout = (1, expected_axes_num) + expected_labels = column or [["a"], ["b"]] + + index = Index(15 * ["1"] + 15 * ["2"], name="c") + df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"]) + g = df.groupby("c") + + for axes in g.hist(legend=True, column=column): + self._check_axes_shape( + axes, axes_num=expected_axes_num, layout=expected_layout + ) + for ax, expected_label in zip(axes[0], expected_labels): + self._check_legend_labels(ax, expected_label) + + @pytest.mark.parametrize("column", [None, "b"]) + def 
test_groupby_hist_frame_with_legend_raises(self, column): + # GH 6279 - DataFrameGroupBy histogram with legend and label raises + index = Index(15 * ["1"] + 15 * ["2"], name="c") + df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"]) + g = df.groupby("c") + + with pytest.raises(ValueError, match="Cannot use both legend and label"): + g.hist(legend=True, column=column, label="d") + + def test_groupby_hist_series_with_legend(self): + # GH 6279 - SeriesGroupBy histogram can have a legend + index = Index(15 * ["1"] + 15 * ["2"], name="c") + df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"]) + g = df.groupby("c") + + for ax in g["a"].hist(legend=True): + self._check_axes_shape(ax, axes_num=1, layout=(1, 1)) + self._check_legend_labels(ax, ["1", "2"]) + + def test_groupby_hist_series_with_legend_raises(self): + # GH 6279 - SeriesGroupBy histogram with legend and label raises + index = Index(15 * ["1"] + 15 * ["2"], name="c") + df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"]) + g = df.groupby("c") + + with pytest.raises(ValueError, match="Cannot use both legend and label"): + g.hist(legend=True, label="d") diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index 0d3425d001229..b6a6c326c3df3 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -6,7 +6,7 @@ import pandas.util._test_decorators as td -from pandas import DataFrame, Series +from pandas import DataFrame, Index, Series import pandas._testing as tm from pandas.tests.plotting.common import TestPlotBase, _check_plot_works @@ -129,6 +129,29 @@ def test_plot_fails_when_ax_differs_from_figure(self): with pytest.raises(AssertionError): self.ts.hist(ax=ax1, figure=fig2) + @pytest.mark.parametrize( + "by, expected_axes_num, expected_layout", [(None, 1, (1, 1)), ("b", 2, (1, 2))] + ) + def test_hist_with_legend(self, by, expected_axes_num, expected_layout): + # 
GH 6279 - Series histogram can have a legend + index = 15 * ["1"] + 15 * ["2"] + s = Series(np.random.randn(30), index=index, name="a") + s.index.name = "b" + + axes = _check_plot_works(s.hist, legend=True, by=by) + self._check_axes_shape(axes, axes_num=expected_axes_num, layout=expected_layout) + self._check_legend_labels(axes, "a") + + @pytest.mark.parametrize("by", [None, "b"]) + def test_hist_with_legend_raises(self, by): + # GH 6279 - Series histogram with legend and label raises + index = 15 * ["1"] + 15 * ["2"] + s = Series(np.random.randn(30), index=index, name="a") + s.index.name = "b" + + with pytest.raises(ValueError, match="Cannot use both legend and label"): + s.hist(legend=True, by=by, label="c") + @td.skip_if_no_mpl class TestDataFramePlots(TestPlotBase): @@ -293,6 +316,36 @@ def test_hist_column_order_unchanged(self, column, expected): assert result == expected + @pytest.mark.parametrize("by", [None, "c"]) + @pytest.mark.parametrize("column", [None, "b"]) + def test_hist_with_legend(self, by, column): + # GH 6279 - DataFrame histogram can have a legend + expected_axes_num = 1 if by is None and column is not None else 2 + expected_layout = (1, expected_axes_num) + expected_labels = column or ["a", "b"] + if by is not None: + expected_labels = [expected_labels] * 2 + + index = Index(15 * ["1"] + 15 * ["2"], name="c") + df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"]) + + axes = _check_plot_works(df.hist, legend=True, by=by, column=column) + self._check_axes_shape(axes, axes_num=expected_axes_num, layout=expected_layout) + if by is None and column is None: + axes = axes[0] + for expected_label, ax in zip(expected_labels, axes): + self._check_legend_labels(ax, expected_label) + + @pytest.mark.parametrize("by", [None, "c"]) + @pytest.mark.parametrize("column", [None, "b"]) + def test_hist_with_legend_raises(self, by, column): + # GH 6279 - DataFrame histogram with legend and label raises + index = Index(15 * ["1"] + 15 * ["2"], 
name="c") + df = DataFrame(np.random.randn(30, 2), index=index, columns=["a", "b"]) + + with pytest.raises(ValueError, match="Cannot use both legend and label"): + df.hist(legend=True, by=by, column=column, label="d") + @td.skip_if_no_mpl class TestDataFrameGroupByPlots(TestPlotBase): From b88c79a1edb6af65a4b8f869add0871451151b97 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 23 Jun 2020 09:43:24 -0400 Subject: [PATCH 0176/1025] ENH: add fsspec support (#34266) Co-authored-by: Julian de Ruiter --- ci/deps/azure-36-locale.yaml | 2 - ci/deps/azure-37-locale.yaml | 1 - ci/deps/azure-windows-37.yaml | 5 +- ci/deps/travis-36-cov.yaml | 5 +- ci/deps/travis-36-locale.yaml | 2 - ci/deps/travis-36-slow.yaml | 3 +- ci/deps/travis-37.yaml | 3 +- doc/source/getting_started/install.rst | 5 +- doc/source/whatsnew/v1.1.0.rst | 22 +++++- environment.yml | 4 +- pandas/compat/_optional.py | 5 +- pandas/io/common.py | 80 ++++++++----------- pandas/io/gcs.py | 22 ------ pandas/io/parquet.py | 67 ++++++++-------- pandas/io/s3.py | 53 ------------- pandas/tests/io/json/test_pandas.py | 14 +++- pandas/tests/io/test_common.py | 10 +++ pandas/tests/io/test_fsspec.py | 102 +++++++++++++++++++++++++ pandas/tests/io/test_gcs.py | 43 ++++++----- pandas/tests/io/test_parquet.py | 29 +++++-- pandas/tests/io/test_pickle.py | 40 +--------- pandas/tests/io/test_s3.py | 8 -- requirements-dev.txt | 4 +- 23 files changed, 279 insertions(+), 250 deletions(-) delete mode 100644 pandas/io/gcs.py delete mode 100644 pandas/io/s3.py create mode 100644 pandas/tests/io/test_fsspec.py diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 56da56b45b702..a9b9a5a47ccf5 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -15,7 +15,6 @@ dependencies: # pandas dependencies - beautifulsoup4 - - gcsfs - html5lib - ipython - jinja2 @@ -31,7 +30,6 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs - scipy - xarray - xlrd diff --git 
a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 31155ac93931a..81e336cf1ed7f 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -27,7 +27,6 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs - scipy - xarray - xlrd diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 889d5c1bcfcdd..5bbd0e2795d7e 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -15,7 +15,8 @@ dependencies: # pandas dependencies - beautifulsoup4 - bottleneck - - gcsfs + - fsspec>=0.7.4 + - gcsfs>=0.6.0 - html5lib - jinja2 - lxml @@ -28,7 +29,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs + - s3fs>=0.4.0 - scipy - sqlalchemy - xlrd diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 2968c8f188d49..177e0d3f4c0af 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -18,7 +18,8 @@ dependencies: - cython>=0.29.16 - dask - fastparquet>=0.3.2 - - gcsfs + - fsspec>=0.7.4 + - gcsfs>=0.6.0 - geopandas - html5lib - matplotlib @@ -35,7 +36,7 @@ dependencies: - pytables - python-snappy - pytz - - s3fs + - s3fs>=0.4.0 - scikit-learn - scipy - sqlalchemy diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 2c8403acf6971..03a1e751b6a86 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -16,7 +16,6 @@ dependencies: - blosc=1.14.3 - python-blosc - fastparquet=0.3.2 - - gcsfs=0.2.2 - html5lib - ipython - jinja2 @@ -33,7 +32,6 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs=0.3.0 - scipy - sqlalchemy=1.1.4 - xarray=0.10 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index df693f0e22c71..87bad59fa4873 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -13,6 +13,7 @@ dependencies: # pandas dependencies - beautifulsoup4 + - fsspec>=0.7.4 - html5lib - lxml - matplotlib @@ -25,7 +26,7 @@ dependencies: - pytables - 
python-dateutil - pytz - - s3fs + - s3fs>=0.4.0 - scipy - sqlalchemy - xlrd diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 986728d0a4a40..e896233aac63c 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -13,12 +13,13 @@ dependencies: # pandas dependencies - botocore>=1.11 + - fsspec>=0.7.4 - numpy - python-dateutil - nomkl - pyarrow - pytz - - s3fs + - s3fs>=0.4.0 - tabulate - pyreadstat - pip diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index da1161c8f68b4..b79a9cd872c47 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -267,8 +267,9 @@ SQLAlchemy 1.1.4 SQL support for databases other tha SciPy 0.19.0 Miscellaneous statistical functions XLsxWriter 0.9.8 Excel writing blosc Compression for HDF5 +fsspec 0.7.4 Handling files aside from local and HTTP fastparquet 0.3.2 Parquet reading / writing -gcsfs 0.2.2 Google Cloud Storage access +gcsfs 0.6.0 Google Cloud Storage access html5lib HTML parser for read_html (see :ref:`note `) lxml 3.8.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.2 Visualization @@ -282,7 +283,7 @@ pyreadstat SPSS files (.sav) reading pytables 3.4.3 HDF5 reading / writing pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O -s3fs 0.3.0 Amazon S3 access +s3fs 0.4.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) xarray 0.8.2 pandas-like API for N-dimensional data xclip Clipboard I/O on linux diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 22b83425b58c2..9d9d809a295ea 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -245,6 +245,22 @@ If needed you can adjust the bins with the argument ``offset`` (a Timedelta) tha For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. 
+fsspec now used for filesystem handling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For reading and writing to filesystems other than local and reading from HTTP(S), +the optional dependency ``fsspec`` will be used to dispatch operations (:issue:`33452`). +This will give unchanged +functionality for S3 and GCS storage, which were already supported, but also add +support for several other storage implementations such as `Azure Datalake and Blob`_, +SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_. + +The existing capability to interface with S3 and GCS will be unaffected by this +change, as ``fsspec`` will still bring in the same packages as before. + +.. _Azure Datalake and Blob: https://github.com/dask/adlfs + +.. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/ .. _whatsnew_110.enhancements.other: @@ -701,7 +717,9 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | fastparquet | 0.3.2 | | +-----------------+-----------------+---------+ -| gcsfs | 0.2.2 | | +| fsspec | 0.7.4 | | ++-----------------+-----------------+---------+ +| gcsfs | 0.6.0 | X | +-----------------+-----------------+---------+ | lxml | 3.8.0 | | +-----------------+-----------------+---------+ @@ -717,7 +735,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | pytables | 3.4.3 | X | +-----------------+-----------------+---------+ -| s3fs | 0.3.0 | | +| s3fs | 0.4.0 | X | +-----------------+-----------------+---------+ | scipy | 1.2.0 | X | +-----------------+-----------------+---------+ diff --git a/environment.yml b/environment.yml index b81404094fa4c..3783b7d360f1a 100644 --- a/environment.yml +++ b/environment.yml @@ -98,7 +98,9 @@ dependencies: - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.4.3 # pandas.read_hdf, DataFrame.to_hdf - - s3fs # pandas.read_csv... 
when using 's3://...' path + - s3fs>=0.4.0 # file IO when using 's3://...' path + - fsspec>=0.7.4 # for generic remote file operations + - gcsfs>=0.6.0 # file IO when using 'gcs://...' path - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray - cftime # Needed for downstream xarray.CFTimeIndex test diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 0a5e0f5050040..6423064732def 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -8,8 +8,9 @@ VERSIONS = { "bs4": "4.6.0", "bottleneck": "1.2.1", + "fsspec": "0.7.4", "fastparquet": "0.3.2", - "gcsfs": "0.2.2", + "gcsfs": "0.6.0", "lxml.etree": "3.8.0", "matplotlib": "2.2.2", "numexpr": "2.6.2", @@ -20,7 +21,7 @@ "pytables": "3.4.3", "pytest": "5.0.1", "pyxlsb": "1.0.6", - "s3fs": "0.3.0", + "s3fs": "0.4.0", "scipy": "1.2.0", "sqlalchemy": "1.1.4", "tables": "3.4.3", diff --git a/pandas/io/common.py b/pandas/io/common.py index 055f84970e916..51323c5ff3ef5 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -31,6 +31,7 @@ from pandas._typing import FilePathOrBuffer from pandas.compat import _get_lzma_file, _import_lzma +from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_file_like @@ -126,20 +127,6 @@ def stringify_path( return _expand_user(filepath_or_buffer) -def is_s3_url(url) -> bool: - """Check for an s3, s3n, or s3a url""" - if not isinstance(url, str): - return False - return parse_url(url).scheme in ["s3", "s3n", "s3a"] - - -def is_gcs_url(url) -> bool: - """Check for a gcs url""" - if not isinstance(url, str): - return False - return parse_url(url).scheme in ["gcs", "gs"] - - def urlopen(*args, **kwargs): """ Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of @@ -150,31 +137,16 @@ def urlopen(*args, **kwargs): return urllib.request.urlopen(*args, **kwargs) -def get_fs_for_path(filepath: str): +def is_fsspec_url(url: FilePathOrBuffer) -> bool: """ - Get 
appropriate filesystem given a filepath. - Supports s3fs, gcs and local file system. - - Parameters - ---------- - filepath : str - File path. e.g s3://bucket/object, /local/path, gcs://pandas/obj - - Returns - ------- - s3fs.S3FileSystem, gcsfs.GCSFileSystem, None - Appropriate FileSystem to use. None for local filesystem. + Returns true if the given URL looks like + something fsspec can handle """ - if is_s3_url(filepath): - from pandas.io import s3 - - return s3.get_fs() - elif is_gcs_url(filepath): - from pandas.io import gcs - - return gcs.get_fs() - else: - return None + return ( + isinstance(url, str) + and "://" in url + and not url.startswith(("http://", "https://")) + ) def get_filepath_or_buffer( @@ -182,6 +154,7 @@ def get_filepath_or_buffer( encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, + storage_options: Optional[Dict[str, Any]] = None, ): """ If the filepath_or_buffer is a url, translate and return the buffer. @@ -194,6 +167,8 @@ def get_filepath_or_buffer( compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional + storage_options: dict, optional + passed on to fsspec, if using it; this is not yet accessed by the public API Returns ------- @@ -204,6 +179,7 @@ def get_filepath_or_buffer( filepath_or_buffer = stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): + # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": @@ -213,19 +189,23 @@ def get_filepath_or_buffer( req.close() return reader, encoding, compression, True - if is_s3_url(filepath_or_buffer): - from pandas.io import s3 - - return s3.get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding, compression=compression, mode=mode - ) - - if 
is_gcs_url(filepath_or_buffer): - from pandas.io import gcs - - return gcs.get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding, compression=compression, mode=mode - ) + if is_fsspec_url(filepath_or_buffer): + assert isinstance( + filepath_or_buffer, str + ) # just to appease mypy for this branch + # two special-case s3-like protocols; these have special meaning in Hadoop, + # but are equivalent to just "s3" from fsspec's point of view + # cc #11071 + if filepath_or_buffer.startswith("s3a://"): + filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://") + if filepath_or_buffer.startswith("s3n://"): + filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") + fsspec = import_optional_dependency("fsspec") + + file_obj = fsspec.open( + filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + ).open() + return file_obj, encoding, compression, True if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False diff --git a/pandas/io/gcs.py b/pandas/io/gcs.py deleted file mode 100644 index d2d8fc2d2139f..0000000000000 --- a/pandas/io/gcs.py +++ /dev/null @@ -1,22 +0,0 @@ -""" GCS support for remote file interactivity """ -from pandas.compat._optional import import_optional_dependency - -gcsfs = import_optional_dependency( - "gcsfs", extra="The gcsfs library is required to handle GCS files" -) - - -def get_fs(): - return gcsfs.GCSFileSystem() - - -def get_filepath_or_buffer( - filepath_or_buffer, encoding=None, compression=None, mode=None -): - - if mode is None: - mode = "rb" - - fs = get_fs() - filepath_or_buffer = fs.open(filepath_or_buffer, mode) - return filepath_or_buffer, None, compression, True diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index de9a14c82b3cb..a0c9242684f0f 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -8,12 +8,7 @@ from pandas import DataFrame, get_option -from pandas.io.common import ( - get_filepath_or_buffer, - 
get_fs_for_path, - is_gcs_url, - is_s3_url, -) +from pandas.io.common import _expand_user, get_filepath_or_buffer, is_fsspec_url def get_engine(engine: str) -> "BaseImpl": @@ -97,16 +92,24 @@ def write( **kwargs, ): self.validate_dataframe(df) - file_obj_or_path, _, _, should_close = get_filepath_or_buffer(path, mode="wb") from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)} if index is not None: from_pandas_kwargs["preserve_index"] = index table = self.api.Table.from_pandas(df, **from_pandas_kwargs) - # write_to_dataset does not support a file-like object when - # a directory path is used, so just pass the path string. + + if is_fsspec_url(path) and "filesystem" not in kwargs: + # make fsspec instance, which pyarrow will use to open paths + import_optional_dependency("fsspec") + import fsspec.core + + fs, path = fsspec.core.url_to_fs(path) + kwargs["filesystem"] = fs + else: + path = _expand_user(path) if partition_cols is not None: + # writes to multiple files under the given path self.api.parquet.write_to_dataset( table, path, @@ -115,17 +118,21 @@ def write( **kwargs, ) else: - self.api.parquet.write_table( - table, file_obj_or_path, compression=compression, **kwargs - ) - if should_close: - file_obj_or_path.close() + # write to single output file + self.api.parquet.write_table(table, path, compression=compression, **kwargs) def read(self, path, columns=None, **kwargs): - fs = get_fs_for_path(path) - should_close = None - # Avoid calling get_filepath_or_buffer for s3/gcs URLs since - # since it returns an S3File which doesn't support dir reads in arrow + if is_fsspec_url(path) and "filesystem" not in kwargs: + import_optional_dependency("fsspec") + import fsspec.core + + fs, path = fsspec.core.url_to_fs(path) + should_close = False + else: + fs = kwargs.pop("filesystem", None) + should_close = False + path = _expand_user(path) + if not fs: path, _, _, should_close = get_filepath_or_buffer(path) @@ -173,13 +180,11 @@ def write( if 
partition_cols is not None: kwargs["file_scheme"] = "hive" - if is_s3_url(path) or is_gcs_url(path): - # if path is s3:// or gs:// we need to open the file in 'wb' mode. - # TODO: Support 'ab' + if is_fsspec_url(path): + fsspec = import_optional_dependency("fsspec") - path, _, _, _ = get_filepath_or_buffer(path, mode="wb") - # And pass the opened file to the fastparquet internal impl. - kwargs["open_with"] = lambda path, _: path + # if filesystem is provided by fsspec, file must be opened in 'wb' mode. + kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open() else: path, _, _, _ = get_filepath_or_buffer(path) @@ -194,17 +199,11 @@ def write( ) def read(self, path, columns=None, **kwargs): - if is_s3_url(path): - from pandas.io.s3 import get_file_and_filesystem + if is_fsspec_url(path): + fsspec = import_optional_dependency("fsspec") - # When path is s3:// an S3File is returned. - # We need to retain the original path(str) while also - # pass the S3File().open function to fastparquet impl. - s3, filesystem = get_file_and_filesystem(path) - try: - parquet_file = self.api.ParquetFile(path, open_with=filesystem.open) - finally: - s3.close() + open_with = lambda path, _: fsspec.open(path, "rb").open() + parquet_file = self.api.ParquetFile(path, open_with=open_with) else: path, _, _, _ = get_filepath_or_buffer(path) parquet_file = self.api.ParquetFile(path) diff --git a/pandas/io/s3.py b/pandas/io/s3.py deleted file mode 100644 index 329c861d2386a..0000000000000 --- a/pandas/io/s3.py +++ /dev/null @@ -1,53 +0,0 @@ -""" s3 support for remote file interactivity """ -from typing import IO, Any, Optional, Tuple -from urllib.parse import urlparse as parse_url - -from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency - -s3fs = import_optional_dependency( - "s3fs", extra="The s3fs package is required to handle s3 files." 
-) - - -def _strip_schema(url): - """Returns the url without the s3:// part""" - result = parse_url(url, allow_fragments=False) - return result.netloc + result.path - - -def get_fs(): - return s3fs.S3FileSystem(anon=False) - - -def get_file_and_filesystem( - filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None -) -> Tuple[IO, Any]: - from botocore.exceptions import NoCredentialsError - - if mode is None: - mode = "rb" - - fs = get_fs() - try: - file = fs.open(_strip_schema(filepath_or_buffer), mode) - except (FileNotFoundError, NoCredentialsError): - # boto3 has troubles when trying to access a public file - # when credentialed... - # An OSError is raised if you have credentials, but they - # aren't valid for that bucket. - # A NoCredentialsError is raised if you don't have creds - # for that bucket. - fs = get_fs() - file = fs.open(_strip_schema(filepath_or_buffer), mode) - return file, fs - - -def get_filepath_or_buffer( - filepath_or_buffer: FilePathOrBuffer, - encoding: Optional[str] = None, - compression: Optional[str] = None, - mode: Optional[str] = None, -) -> Tuple[IO, Optional[str], Optional[str], bool]: - file, _fs = get_file_and_filesystem(filepath_or_buffer, mode=mode) - return file, None, compression, True diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 56b854bee77d7..8578b31fbb81e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1665,13 +1665,21 @@ def test_json_multiindex(self, dataframe, expected): assert result == expected def test_to_s3(self, s3_resource): + import time + # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) df.to_json(f"s3://{mock_bucket_name}/{target_file}") - assert target_file in ( - obj.key for obj in s3_resource.Bucket("pandas-test").objects.all() - ) + timeout = 5 + while True: + if target_file in ( + obj.key for obj in 
s3_resource.Bucket("pandas-test").objects.all() + ): + break + time.sleep(0.1) + timeout -= 0.1 + assert timeout > 0, "Timed out waiting for file to appear on moto" def test_json_pandas_na(self): # GH 31615 diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 6f1d4daeb39cb..e2f4ae04c1f9f 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -367,3 +367,13 @@ def test_unknown_engine(self): df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") + + +def test_is_fsspec_url(): + assert icom.is_fsspec_url("gcs://pandas/somethingelse.com") + assert icom.is_fsspec_url("gs://pandas/somethingelse.com") + # the following is the only remote URL that is handled without fsspec + assert not icom.is_fsspec_url("http://pandas/somethingelse.com") + assert not icom.is_fsspec_url("random:pandas/somethingelse.com") + assert not icom.is_fsspec_url("/local/path") + assert not icom.is_fsspec_url("relative/local/path") diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py new file mode 100644 index 0000000000000..c397a61616c1c --- /dev/null +++ b/pandas/tests/io/test_fsspec.py @@ -0,0 +1,102 @@ +import numpy as np +import pytest + +from pandas import DataFrame, date_range, read_csv, read_parquet +import pandas._testing as tm +from pandas.util import _test_decorators as td + +df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } +) +# the ignore on the following line accounts for to_csv returning Optional(str) +# in general, but always str in the case we give no filename +text = df1.to_csv(index=False).encode() # type: ignore + + +@pytest.fixture +def cleared_fs(): + fsspec = pytest.importorskip("fsspec") + + memfs = fsspec.filesystem("memory") + yield memfs + memfs.store.clear() + + +def test_read_csv(cleared_fs): + from fsspec.implementations.memory import MemoryFile + + 
cleared_fs.store["test/test.csv"] = MemoryFile(data=text) + df2 = read_csv("memory://test/test.csv", parse_dates=["dt"]) + + tm.assert_frame_equal(df1, df2) + + +def test_reasonable_error(monkeypatch, cleared_fs): + from fsspec.registry import known_implementations + from fsspec import registry + + registry.target.clear() + with pytest.raises(ValueError) as e: + read_csv("nosuchprotocol://test/test.csv") + assert "nosuchprotocol" in str(e.value) + err_mgs = "test error messgae" + monkeypatch.setitem( + known_implementations, + "couldexist", + {"class": "unimportable.CouldExist", "err": err_mgs}, + ) + with pytest.raises(ImportError) as e: + read_csv("couldexist://test/test.csv") + assert err_mgs in str(e.value) + + +def test_to_csv(cleared_fs): + df1.to_csv("memory://test/test.csv", index=True) + df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) + + tm.assert_frame_equal(df1, df2) + + +@td.skip_if_no("fastparquet") +def test_to_parquet_new_file(monkeypatch, cleared_fs): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + df1.to_parquet( + "memory://test/test.csv", index=True, engine="fastparquet", compression=None + ) + + +@td.skip_if_no("s3fs") +def test_from_s3_csv(s3_resource, tips_file): + tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) + # the following are decompressed by pandas, not fsspec + tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file)) + tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file)) + + +@pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"]) +@td.skip_if_no("s3fs") +def test_s3_protocols(s3_resource, tips_file, protocol): + tm.assert_equal( + read_csv("%s://pandas-test/tips.csv" % protocol), read_csv(tips_file) + ) + + +@td.skip_if_no("s3fs") +@td.skip_if_no("fastparquet") +def test_s3_parquet(s3_resource): + fn = "s3://pandas-test/test.parquet" + df1.to_parquet(fn, index=False, engine="fastparquet", 
compression=None) + df2 = read_parquet(fn, engine="fastparquet") + tm.assert_equal(df1, df2) + + +@td.skip_if_installed("fsspec") +def test_not_present_exception(): + with pytest.raises(ImportError) as e: + read_csv("memory://test/test.csv") + assert "fsspec library is required" in str(e.value) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index cf745fcc492a1..4d93119ffa3f5 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -1,4 +1,4 @@ -from io import StringIO +from io import BytesIO import os import numpy as np @@ -8,17 +8,14 @@ import pandas._testing as tm from pandas.util import _test_decorators as td -from pandas.io.common import is_gcs_url - - -def test_is_gcs_url(): - assert is_gcs_url("gcs://pandas/somethingelse.com") - assert is_gcs_url("gs://pandas/somethingelse.com") - assert not is_gcs_url("s3://pandas/somethingelse.com") - @td.skip_if_no("gcsfs") def test_read_csv_gcs(monkeypatch): + from fsspec import AbstractFileSystem + from fsspec import registry + + registry.target.clear() # noqa # remove state + df1 = DataFrame( { "int": [1, 3], @@ -28,9 +25,9 @@ def test_read_csv_gcs(monkeypatch): } ) - class MockGCSFileSystem: - def open(*args): - return StringIO(df1.to_csv(index=False)) + class MockGCSFileSystem(AbstractFileSystem): + def open(*args, **kwargs): + return BytesIO(df1.to_csv(index=False).encode()) monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) @@ -40,6 +37,10 @@ def open(*args): @td.skip_if_no("gcsfs") def test_to_csv_gcs(monkeypatch): + from fsspec import AbstractFileSystem + from fsspec import registry + + registry.target.clear() # noqa # remove state df1 = DataFrame( { "int": [1, 3], @@ -48,20 +49,22 @@ def test_to_csv_gcs(monkeypatch): "dt": date_range("2018-06-18", periods=2), } ) - s = StringIO() + s = BytesIO() + s.close = lambda: True - class MockGCSFileSystem: - def open(*args): + class 
MockGCSFileSystem(AbstractFileSystem): + def open(*args, **kwargs): + s.seek(0) return s monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) df1.to_csv("gs://test/test.csv", index=True) def mock_get_filepath_or_buffer(*args, **kwargs): - return StringIO(df1.to_csv()), None, None, False + return BytesIO(df1.to_csv(index=True).encode()), None, None, False monkeypatch.setattr( - "pandas.io.gcs.get_filepath_or_buffer", mock_get_filepath_or_buffer + "pandas.io.common.get_filepath_or_buffer", mock_get_filepath_or_buffer ) df2 = read_csv("gs://test/test.csv", parse_dates=["dt"], index_col=0) @@ -73,6 +76,10 @@ def mock_get_filepath_or_buffer(*args, **kwargs): @td.skip_if_no("gcsfs") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" + from fsspec import AbstractFileSystem + from fsspec import registry + + registry.target.clear() # noqa # remove state df1 = DataFrame( { "int": [1, 3], @@ -82,7 +89,7 @@ def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): } ) - class MockGCSFileSystem: + class MockGCSFileSystem(AbstractFileSystem): def open(self, path, mode="r", *args): if "w" not in mode: raise FileNotFoundError diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index efd34c58d7d19..82157f3d722a9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -537,6 +537,18 @@ def test_categorical(self, pa): expected = df.astype(object) check_round_trip(df, pa, expected=expected) + def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): + s3fs = pytest.importorskip("s3fs") + s3 = s3fs.S3FileSystem() + kw = dict(filesystem=s3) + check_round_trip( + df_compat, + pa, + path="pandas-test/pyarrow.parquet", + read_kwargs=kw, + write_kwargs=kw, + ) + def test_s3_roundtrip(self, df_compat, s3_resource, pa): # GH #19134 check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") @@ -544,8 +556,6 @@ def 
test_s3_roundtrip(self, df_compat, s3_resource, pa): @td.skip_if_no("s3fs") @pytest.mark.parametrize("partition_col", [["A"], []]) def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): - from pandas.io.s3 import get_fs as get_s3_fs - # GH #26388 # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716 # As per pyarrow partitioned columns become 'categorical' dtypes @@ -559,11 +569,7 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): pa, expected=expected_df, path="s3://pandas-test/parquet_dir", - write_kwargs={ - "partition_cols": partition_col, - "compression": None, - "filesystem": get_s3_fs(), - }, + write_kwargs={"partition_cols": partition_col, "compression": None}, check_like=True, repeat=1, ) @@ -585,6 +591,15 @@ def test_read_file_like_obj_support(self, df_compat): df_from_buf = pd.read_parquet(buffer) tm.assert_frame_equal(df_compat, df_from_buf) + @td.skip_if_no("pyarrow") + def test_expand_user(self, df_compat, monkeypatch): + monkeypatch.setenv("HOME", "TestingUser") + monkeypatch.setenv("USERPROFILE", "TestingUser") + with pytest.raises(OSError, match=r".*TestingUser.*"): + pd.read_parquet("~/file.parquet") + with pytest.raises(OSError, match=r".*TestingUser.*"): + df_compat.to_parquet("~/file.parquet") + def test_partition_cols_supported(self, pa, df_full): # GH #23283 partition_cols = ["bool", "int"] diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 42b4ea5ad9aac..e4d43db7834e3 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -455,42 +455,10 @@ def mock_urlopen_read(*args, **kwargs): tm.assert_frame_equal(df, result) -@td.skip_if_no("gcsfs") -@pytest.mark.parametrize("mockurl", ["gs://gcs.com", "gcs://gcs.com"]) -def test_pickle_gcsurl_roundtrip(monkeypatch, mockurl): - with tm.ensure_clean() as path: - - class MockGCSFileSystem: - def __init__(self, *args, **kwargs): - pass - - def open(self, 
*args): - mode = args[1] or None - f = open(path, mode) - return f - - monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) - df = tm.makeDataFrame() - df.to_pickle(mockurl) - result = pd.read_pickle(mockurl) - tm.assert_frame_equal(df, result) - - -@td.skip_if_no("s3fs") -@pytest.mark.parametrize("mockurl", ["s3://s3.com", "s3n://s3.com", "s3a://s3.com"]) -def test_pickle_s3url_roundtrip(monkeypatch, mockurl): - with tm.ensure_clean() as path: - - class MockS3FileSystem: - def __init__(self, *args, **kwargs): - pass - - def open(self, *args): - mode = args[1] or None - f = open(path, mode) - return f - - monkeypatch.setattr("s3fs.S3FileSystem", MockS3FileSystem) +@td.skip_if_no("fsspec") +def test_pickle_fsspec_roundtrip(): + with tm.ensure_clean(): + mockurl = "memory://afile" df = tm.makeDataFrame() df.to_pickle(mockurl) result = pd.read_pickle(mockurl) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 04c6979596eca..a76be9465f62a 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -4,14 +4,6 @@ from pandas import read_csv -from pandas.io.common import is_s3_url - - -class TestS3URL: - def test_is_s3_url(self): - assert is_s3_url("s3://pandas/somethingelse.com") - assert not is_s3_url("s4://pandas/somethingelse.com") - def test_streaming_s3_objects(): # GH17135 diff --git a/requirements-dev.txt b/requirements-dev.txt index 754ec7ae28748..90f9fec2f4bdf 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -65,7 +65,9 @@ pyarrow>=0.13.1 python-snappy pyqt5>=5.9.2 tables>=3.4.3 -s3fs +s3fs>=0.4.0 +fsspec>=0.7.4 +gcsfs>=0.6.0 sqlalchemy xarray cftime From 4129172f238fad7e5900150f3c9cfac9e4f6455b Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Tue, 23 Jun 2020 16:00:06 +0200 Subject: [PATCH 0177/1025] Removed unnecessary variable call (#34949) --- pandas/tests/series/methods/test_explode.py | 1 - 1 file changed, 1 deletion(-) diff --git 
a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 979199e1efc62..a25cfadf12467 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -88,7 +88,6 @@ def test_typical_usecase(): columns=["var1", "var2"], ) exploded = df.var1.str.split(",").explode() - exploded result = df[["var2"]].join(exploded) expected = pd.DataFrame( {"var2": [1, 1, 1, 2, 2, 2], "var1": list("abcdef")}, From 772e426296ffc133504510a1d43257ba151ecb91 Mon Sep 17 00:00:00 2001 From: Sergey <40598665+glechic@users.noreply.github.com> Date: Tue, 23 Jun 2020 19:02:25 +0300 Subject: [PATCH 0178/1025] Update advanced.rst (#34950) --- doc/source/user_guide/advanced.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index d6f5c0c758b60..a0331dd632583 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -260,7 +260,9 @@ You don't have to specify all levels of the ``MultiIndex`` by passing only the first elements of the tuple. For example, you can use "partial" indexing to get all elements with ``bar`` in the first level as follows: -df.loc['bar'] +.. ipython:: python + + df.loc['bar'] This is a shortcut for the slightly more verbose notation ``df.loc[('bar',),]`` (equivalent to ``df.loc['bar',]`` in this example). 
From 96b446d31a46302e326c676726fc55f315f1c92b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 23 Jun 2020 09:31:29 -0700 Subject: [PATCH 0179/1025] CLN: Update Cython data pointers for rolling apply (#34930) * CLN: Update Cython data pointers for rolling apply --- pandas/_libs/window/aggregations.pyx | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 646444d10e416..ec4a412b5adc7 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1377,17 +1377,11 @@ def roll_generic_fixed(object obj, output[i] = NaN # remaining full-length windows - buf = arr.data - bufarr = np.empty(win, dtype=float) - oldbuf = bufarr.data - for i in range((win - offset), (N - offset)): - buf = buf + 1 - bufarr.data = buf + for j, i in enumerate(range((win - offset), (N - offset)), 1): if counts[i] >= minp: - output[i] = func(bufarr, *args, **kwargs) + output[i] = func(arr[j:j + win], *args, **kwargs) else: output[i] = NaN - bufarr.data = oldbuf # truncated windows at the end for i in range(int_max(N - offset, 0), N): From 2a31ea56bfec627f72f5dd9da7a4bb3d9825dea8 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 23 Jun 2020 18:33:08 +0100 Subject: [PATCH 0180/1025] BUILD: make tests discoverable in .devcontainer.json (#34929) --- .devcontainer.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.devcontainer.json b/.devcontainer.json index 315a1ff647012..8bea96aea29c1 100644 --- a/.devcontainer.json +++ b/.devcontainer.json @@ -17,7 +17,9 @@ "python.linting.pylintEnabled": false, "python.linting.mypyEnabled": true, "python.testing.pytestEnabled": true, - "python.testing.cwd": "pandas/tests" + "python.testing.pytestArgs": [ + "pandas" + ] }, // Add the IDs of extensions you want installed when the container is created in the array below. 
From 4414d2a8fad21ee1ea66513ad8a1f5738d1860e4 Mon Sep 17 00:00:00 2001 From: Gaurav Chauhan <2796gaurav@gmail.com> Date: Tue, 23 Jun 2020 23:05:07 +0530 Subject: [PATCH 0181/1025] #34569 Added proper description for pandas.Series.pop (#34606) --- pandas/core/frame.py | 43 +++++++++++++++++++++++++++++++++++ pandas/core/generic.py | 45 +++---------------------------------- pandas/core/reshape/melt.py | 7 ++++-- pandas/core/series.py | 27 ++++++++++++++++++++++ 4 files changed, 78 insertions(+), 44 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d12ebeafe8510..1872f34dfcd7f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4263,6 +4263,49 @@ def fillna( downcast=downcast, ) + def pop(self, item: Label) -> Series: + """ + Return item and drop from frame. Raise KeyError if not found. + + Parameters + ---------- + item : label + Label of column to be popped. + + Returns + ------- + Series + + Examples + -------- + >>> df = pd.DataFrame([('falcon', 'bird', 389.0), + ... ('parrot', 'bird', 24.0), + ... ('lion', 'mammal', 80.5), + ... ('monkey', 'mammal', np.nan)], + ... 
columns=('name', 'class', 'max_speed')) + >>> df + name class max_speed + 0 falcon bird 389.0 + 1 parrot bird 24.0 + 2 lion mammal 80.5 + 3 monkey mammal NaN + + >>> df.pop('class') + 0 bird + 1 bird + 2 mammal + 3 mammal + Name: class, dtype: object + + >>> df + name max_speed + 0 falcon 389.0 + 1 parrot 24.0 + 2 lion 80.5 + 3 monkey NaN + """ + return super().pop(item=item) + @doc(NDFrame.replace, **_shared_doc_kwargs) def replace( self, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bb2810ba7857f..fa92f702f07f5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -105,6 +105,7 @@ if TYPE_CHECKING: from pandas.core.resample import Resampler + from pandas.core.series import Series # noqa: F401 # goal is to be able to define the docs close to function, while still being # able to share @@ -657,47 +658,7 @@ def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: result = self.set_axis(new_labels, axis=axis, inplace=False) return result - def pop(self: FrameOrSeries, item) -> FrameOrSeries: - """ - Return item and drop from frame. Raise KeyError if not found. - - Parameters - ---------- - item : str - Label of column to be popped. - - Returns - ------- - Series - - Examples - -------- - >>> df = pd.DataFrame([('falcon', 'bird', 389.0), - ... ('parrot', 'bird', 24.0), - ... ('lion', 'mammal', 80.5), - ... ('monkey', 'mammal', np.nan)], - ... 
columns=('name', 'class', 'max_speed')) - >>> df - name class max_speed - 0 falcon bird 389.0 - 1 parrot bird 24.0 - 2 lion mammal 80.5 - 3 monkey mammal NaN - - >>> df.pop('class') - 0 bird - 1 bird - 2 mammal - 3 mammal - Name: class, dtype: object - - >>> df - name max_speed - 0 falcon 389.0 - 1 parrot 24.0 - 2 lion 80.5 - 3 monkey NaN - """ + def pop(self, item: Label) -> Union["Series", Any]: result = self[item] del self[item] if self.ndim == 2: @@ -5396,7 +5357,7 @@ def dtypes(self): string object dtype: object """ - from pandas import Series + from pandas import Series # noqa: F811 return Series(self._mgr.get_dtypes(), index=self._info_axis, dtype=np.object_) diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 7d22b86c5c07c..845f6b67693f4 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -1,5 +1,5 @@ import re -from typing import List +from typing import TYPE_CHECKING, List, cast import numpy as np @@ -16,6 +16,9 @@ from pandas.core.reshape.concat import concat from pandas.core.tools.numeric import to_numeric +if TYPE_CHECKING: + from pandas import Series # noqa: F401 + @Appender( _shared_docs["melt"] @@ -106,7 +109,7 @@ def melt( for col in id_vars: id_data = frame.pop(col) if is_extension_array_dtype(id_data): - id_data = concat([id_data] * K, ignore_index=True) + id_data = cast("Series", concat([id_data] * K, ignore_index=True)) else: id_data = np.tile(id_data._values, K) mdata[col] = id_data diff --git a/pandas/core/series.py b/pandas/core/series.py index cab8dd133b579..e8c72125e9998 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4492,6 +4492,33 @@ def fillna( downcast=downcast, ) + def pop(self, item: Label) -> Any: + """ + Return item and drops from series. Raise KeyError if not found. + + Parameters + ---------- + item : label + Index of the element that needs to be removed. + + Returns + ------- + Value that is popped from series. 
+ + Examples + -------- + >>> ser = pd.Series([1,2,3]) + + >>> ser.pop(0) + 1 + + >>> ser + 1 2 + 2 3 + dtype: int64 + """ + return super().pop(item=item) + @doc(NDFrame.replace, klass=_shared_doc_kwargs["klass"]) def replace( self, From b37e8b7f5a2af9c334dfb5712f5316c6bf03e31a Mon Sep 17 00:00:00 2001 From: avinashpancham <44933366+avinashpancham@users.noreply.github.com> Date: Tue, 23 Jun 2020 19:35:56 +0200 Subject: [PATCH 0182/1025] TST: Verify whether non writable numpy array is shiftable (21049) (#34919) --- pandas/tests/series/methods/test_shift.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/series/methods/test_shift.py b/pandas/tests/series/methods/test_shift.py index 6257eecf4fc08..da6407c73104c 100644 --- a/pandas/tests/series/methods/test_shift.py +++ b/pandas/tests/series/methods/test_shift.py @@ -344,3 +344,16 @@ def test_shift_preserve_freqstr(self, periods): index=pd.date_range("2016-1-1 02:00:00", periods=periods, freq="H"), ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "input_data, output_data", + [(np.empty(shape=(0,)), []), (np.ones(shape=(2,)), [np.nan, 1.0])], + ) + def test_shift_non_writable_array(self, input_data, output_data): + # GH21049 Verify whether non writable numpy array is shiftable + input_data.setflags(write=False) + + result = pd.Series(input_data).shift(1) + expected = pd.Series(output_data, dtype="float64") + + tm.assert_series_equal(result, expected) From 9e837f209030d3a18c1ff1ec71c23c49ebbd099f Mon Sep 17 00:00:00 2001 From: smartswdeveloper Date: Tue, 23 Jun 2020 13:37:57 -0400 Subject: [PATCH 0183/1025] Add ddof to cov methods (#34611) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 14 +++++++++++--- pandas/core/nanops.py | 9 +++++++-- pandas/core/series.py | 16 ++++++++++++++-- pandas/tests/frame/methods/test_cov_corr.py | 11 +++++++++++ pandas/tests/series/methods/test_cov_corr.py | 15 +++++++++++++++ 6 files changed, 59 insertions(+), 7 
deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9d9d809a295ea..8dbf14a83d3b7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -314,6 +314,7 @@ Other enhancements result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`). - :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). +- :meth:`Dataframe.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list of dict to change only some specific columns' width (:issue:`28917`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1872f34dfcd7f..55b30100175ae 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8181,7 +8181,9 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame": return self._constructor(correl, index=idx, columns=cols) - def cov(self, min_periods=None) -> "DataFrame": + def cov( + self, min_periods: Optional[int] = None, ddof: Optional[int] = 1 + ) -> "DataFrame": """ Compute pairwise covariance of columns, excluding NA/null values. @@ -8206,6 +8208,12 @@ def cov(self, min_periods=None) -> "DataFrame": Minimum number of observations required per pair of columns to have a valid result. + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + versionadded:: 1.1.0 + Returns ------- DataFrame @@ -8221,7 +8229,7 @@ def cov(self, min_periods=None) -> "DataFrame": Notes ----- Returns the covariance matrix of the DataFrame's time series. 
- The covariance is normalized by N-1. + The covariance is normalized by N-ddof. For DataFrames that have Series that are missing data (assuming that data is `missing at random @@ -8284,7 +8292,7 @@ def cov(self, min_periods=None) -> "DataFrame": base_cov = np.empty((mat.shape[1], mat.shape[1])) base_cov.fill(np.nan) else: - base_cov = np.cov(mat.T) + base_cov = np.cov(mat.T, ddof=ddof) base_cov = base_cov.reshape((len(cols), len(cols))) else: base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e7e5e37bb7817..e7e28798d84a2 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1383,7 +1383,12 @@ def func(a, b): @disallow("M8", "m8") -def nancov(a: np.ndarray, b: np.ndarray, min_periods: Optional[int] = None): +def nancov( + a: np.ndarray, + b: np.ndarray, + min_periods: Optional[int] = None, + ddof: Optional[int] = 1, +): if len(a) != len(b): raise AssertionError("Operands to nancov must have same size") @@ -1398,7 +1403,7 @@ def nancov(a: np.ndarray, b: np.ndarray, min_periods: Optional[int] = None): if len(a) < min_periods: return np.nan - return np.cov(a, b)[0, 1] + return np.cov(a, b, ddof=ddof)[0, 1] def _ensure_numeric(x): diff --git a/pandas/core/series.py b/pandas/core/series.py index e8c72125e9998..1aeb6271056c6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2323,7 +2323,12 @@ def corr(self, other, method="pearson", min_periods=None) -> float: f"'{method}' was supplied" ) - def cov(self, other, min_periods=None) -> float: + def cov( + self, + other: "Series", + min_periods: Optional[int] = None, + ddof: Optional[int] = 1, + ) -> float: """ Compute covariance with Series, excluding missing values. @@ -2333,6 +2338,11 @@ def cov(self, other, min_periods=None) -> float: Series with which to compute the covariance. min_periods : int, optional Minimum number of observations needed to have a valid result. + ddof : int, default 1 + Delta degrees of freedom. 
The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + versionadded:: 1.1.0 Returns ------- @@ -2354,7 +2364,9 @@ def cov(self, other, min_periods=None) -> float: this, other = self.align(other, join="inner", copy=False) if len(this) == 0: return np.nan - return nanops.nancov(this.values, other.values, min_periods=min_periods) + return nanops.nancov( + this.values, other.values, min_periods=min_periods, ddof=ddof + ) @doc( klass="Series", diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 7d75db55c3073..d3548b639572d 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -58,6 +58,17 @@ def test_cov(self, float_frame, float_string_frame): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3]) + def test_cov_ddof(self, test_ddof): + # GH#34611 + np_array1 = np.random.rand(10) + np_array2 = np.random.rand(10) + df = DataFrame({0: np_array1, 1: np_array2}) + result = df.cov(ddof=test_ddof) + expected_np = np.cov(np_array1, np_array2, ddof=test_ddof) + expected = DataFrame(expected_np) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "other_column", [pd.array([1, 2, 3]), np.array([1.0, 2.0, 3.0])] ) diff --git a/pandas/tests/series/methods/test_cov_corr.py b/pandas/tests/series/methods/test_cov_corr.py index 1f6033d435323..282f499506aae 100644 --- a/pandas/tests/series/methods/test_cov_corr.py +++ b/pandas/tests/series/methods/test_cov_corr.py @@ -1,3 +1,5 @@ +import math + import numpy as np import pytest @@ -36,6 +38,19 @@ def test_cov(self, datetime_series): ts2 = datetime_series[5:].reindex(datetime_series.index) assert isna(ts1.cov(ts2, min_periods=12)) + @pytest.mark.parametrize("test_ddof", [None, 0, 1, 2, 3]) + def test_cov_ddof(self, test_ddof): + # GH#34611 + np_array1 = np.random.rand(10) + np_array2 = np.random.rand(10) + + s1 = 
Series(np_array1) + s2 = Series(np_array2) + + result = s1.cov(s2, ddof=test_ddof) + expected = np.cov(np_array1, np_array2, ddof=test_ddof)[0][1] + assert math.isclose(expected, result) + class TestSeriesCorr: @td.skip_if_no_scipy From c44fa213730a67dbae05984177773e6887dcdb05 Mon Sep 17 00:00:00 2001 From: Prakhar Pandey Date: Tue, 23 Jun 2020 23:15:32 +0530 Subject: [PATCH 0184/1025] =?UTF-8?q?TST=20:=20Added=20test=20for=20creati?= =?UTF-8?q?ng=20empty=20dataframe=20with=20column=20of=20type=20str?= =?UTF-8?q?=E2=80=A6=20(#34920)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * TST : Added test for creating empty dataframe with column of type string (#34915) --- pandas/tests/frame/test_constructors.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 39cab3d5ec0b8..02a871666c78d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1643,6 +1643,12 @@ def test_constructor_empty_with_string_dtype(self): df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") tm.assert_frame_equal(df, expected) + def test_constructor_empty_with_string_extension(self): + # GH 34915 + expected = DataFrame(index=[], columns=["c1"], dtype="string") + df = DataFrame(columns=["c1"], dtype="string") + tm.assert_frame_equal(df, expected) + def test_constructor_single_value(self): # expecting single value upcasting here df = DataFrame(0.0, index=[1, 2, 3], columns=["a", "b", "c"]) From 53124cc3b0334ef1c75cef5aa3850531a7a4837f Mon Sep 17 00:00:00 2001 From: Kevin Bowey Date: Tue, 23 Jun 2020 21:28:52 +0200 Subject: [PATCH 0185/1025] TST: pandas/test/window/ changes for #30999 (#34907) --- .../tests/window/moments/test_moments_ewm.py | 23 +++++++---- .../window/moments/test_moments_rolling.py | 24 ++++++----- pandas/tests/window/test_dtypes.py | 3 +- pandas/tests/window/test_ewm.py | 19 +++++---- 
pandas/tests/window/test_expanding.py | 7 +++- pandas/tests/window/test_timeseries_window.py | 41 ++++++++++++++----- 6 files changed, 77 insertions(+), 40 deletions(-) diff --git a/pandas/tests/window/moments/test_moments_ewm.py b/pandas/tests/window/moments/test_moments_ewm.py index c6a92c0ad47b6..89d46a8bb6cb5 100644 --- a/pandas/tests/window/moments/test_moments_ewm.py +++ b/pandas/tests/window/moments/test_moments_ewm.py @@ -116,10 +116,12 @@ def test_ewma_span_com_args(series): A = series.ewm(com=9.5).mean() B = series.ewm(span=20).mean() tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): + msg = "comass, span, halflife, and alpha are mutually exclusive" + with pytest.raises(ValueError, match=msg): series.ewm(com=9.5, span=20) - with pytest.raises(ValueError): + + msg = "Must pass one of comass, span, halflife, or alpha" + with pytest.raises(ValueError, match=msg): series.ewm().mean() @@ -127,8 +129,8 @@ def test_ewma_halflife_arg(series): A = series.ewm(com=13.932726172912965).mean() B = series.ewm(halflife=10.0).mean() tm.assert_almost_equal(A, B) - - with pytest.raises(ValueError): + msg = "comass, span, halflife, and alpha are mutually exclusive" + with pytest.raises(ValueError, match=msg): series.ewm(span=20, halflife=50) with pytest.raises(ValueError): series.ewm(com=9.5, halflife=50) @@ -153,13 +155,16 @@ def test_ewm_alpha(arr): def test_ewm_alpha_arg(series): # GH 10789 s = series - with pytest.raises(ValueError): + msg = "Must pass one of comass, span, halflife, or alpha" + with pytest.raises(ValueError, match=msg): s.ewm() - with pytest.raises(ValueError): + + msg = "comass, span, halflife, and alpha are mutually exclusive" + with pytest.raises(ValueError, match=msg): s.ewm(com=10.0, alpha=0.5) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): s.ewm(span=10.0, alpha=0.5) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): s.ewm(halflife=10.0, alpha=0.5) diff --git 
a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index f6e2834965da3..81f020fe7de23 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -198,7 +198,8 @@ def test_centered_axis_validation(): Series(np.ones(10)).rolling(window=3, center=True, axis=0).mean() # bad axis - with pytest.raises(ValueError): + msg = "No axis named 1 for object type Series" + with pytest.raises(ValueError, match=msg): Series(np.ones(10)).rolling(window=3, center=True, axis=1).mean() # ok ok @@ -206,7 +207,8 @@ def test_centered_axis_validation(): DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=1).mean() # bad axis - with pytest.raises(ValueError): + msg = "No axis named 2 for object type DataFrame" + with pytest.raises(ValueError, match=msg): (DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=2).mean()) @@ -743,8 +745,8 @@ def test_rolling_min(raw, series, frame): result = a.rolling(window=100, min_periods=1).min() expected = pd.Series(np.ones(len(a))) tm.assert_series_equal(result, expected) - - with pytest.raises(ValueError): + msg = "min_periods 5 must be <= window 3" + with pytest.raises(ValueError, match=msg): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min() @@ -754,8 +756,8 @@ def test_rolling_max(raw, series, frame): a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64) b = a.rolling(window=100, min_periods=1).max() tm.assert_almost_equal(a, b) - - with pytest.raises(ValueError): + msg = "min_periods 5 must be <= window 3" + with pytest.raises(ValueError, match=msg): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max() @@ -841,14 +843,16 @@ def test_invalid_quantile_value(): def test_rolling_quantile_param(): ser = Series([0.0, 0.1, 0.5, 0.9, 1.0]) - - with pytest.raises(ValueError): + msg = "quantile value -0.1 not in \\[0, 1\\]" + with pytest.raises(ValueError, match=msg): ser.rolling(3).quantile(-0.1) - 
with pytest.raises(ValueError): + msg = "quantile value 10.0 not in \\[0, 1\\]" + with pytest.raises(ValueError, match=msg): ser.rolling(3).quantile(10.0) - with pytest.raises(TypeError): + msg = "must be real number, not str" + with pytest.raises(TypeError, match=msg): ser.rolling(3).quantile("foo") diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index b1c9b66ab09d3..0aa5bf019ff5e 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -220,7 +220,8 @@ def check_dtypes(self, f, f_name, d, d_name, exp): tm.assert_almost_equal(result, exp) else: - with pytest.raises(DataError): + msg = "No numeric types to aggregate" + with pytest.raises(DataError, match=msg): f(roll) diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 0957cac7aff95..44015597ddb19 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -28,28 +28,33 @@ def test_constructor(which): c(halflife=0.75, alpha=None) # not valid: mutually exclusive - with pytest.raises(ValueError): + msg = "comass, span, halflife, and alpha are mutually exclusive" + with pytest.raises(ValueError, match=msg): c(com=0.5, alpha=0.5) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): c(span=1.5, halflife=0.75) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): c(alpha=0.5, span=1.5) # not valid: com < 0 - with pytest.raises(ValueError): + msg = "comass must satisfy: comass >= 0" + with pytest.raises(ValueError, match=msg): c(com=-0.5) # not valid: span < 1 - with pytest.raises(ValueError): + msg = "span must satisfy: span >= 1" + with pytest.raises(ValueError, match=msg): c(span=0.5) # not valid: halflife <= 0 - with pytest.raises(ValueError): + msg = "halflife must satisfy: halflife > 0" + with pytest.raises(ValueError, match=msg): c(halflife=0) # not valid: alpha <= 0 or alpha > 1 + msg = "alpha must satisfy: 0 < alpha <= 1" for alpha in 
(-0.5, 1.5): - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): c(alpha=alpha) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index b57467385d371..30d65ebe84a1f 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -28,9 +28,12 @@ def test_constructor(which): # not valid for w in [2.0, "foo", np.array([2])]: - with pytest.raises(ValueError): + msg = "min_periods must be an integer" + with pytest.raises(ValueError, match=msg): c(min_periods=w) - with pytest.raises(ValueError): + + msg = "center must be a boolean" + with pytest.raises(ValueError, match=msg): c(min_periods=1, center=w) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 0c5289cd78fed..8aa4d7103e48a 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -55,28 +55,35 @@ def test_valid(self): df = self.regular # not a valid freq - with pytest.raises(ValueError): + msg = "passed window foobar is not compatible with a datetimelike index" + with pytest.raises(ValueError, match=msg): df.rolling(window="foobar") - # not a datetimelike index - with pytest.raises(ValueError): + msg = "window must be an integer" + with pytest.raises(ValueError, match=msg): df.reset_index().rolling(window="foobar") # non-fixed freqs + msg = "\\<2 \\* MonthBegins\\> is a non-fixed frequency" for freq in ["2MS", offsets.MonthBegin(2)]: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.rolling(window=freq) for freq in ["1D", offsets.Day(2), "2ms"]: df.rolling(window=freq) # non-integer min_periods + msg = ( + r"local variable 'minp' referenced before assignment|" + "min_periods must be an integer" + ) for minp in [1.0, "foo", np.array([1, 2, 3])]: - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): df.rolling(window="1D", min_periods=minp) # 
center is not implemented - with pytest.raises(NotImplementedError): + msg = "center is not implemented for datetimelike and offset based windows" + with pytest.raises(NotImplementedError, match=msg): df.rolling(window="1D", center=True) def test_on(self): @@ -84,7 +91,11 @@ def test_on(self): df = self.regular # not a valid column - with pytest.raises(ValueError): + msg = ( + r"invalid on specified as foobar, must be a column " + "\\(of DataFrame\\), an Index or None" + ) + with pytest.raises(ValueError, match=msg): df.rolling(window="2s", on="foobar") # column is valid @@ -93,7 +104,8 @@ def test_on(self): df.rolling(window="2d", on="C").sum() # invalid columns - with pytest.raises(ValueError): + msg = "window must be an integer" + with pytest.raises(ValueError, match=msg): df.rolling(window="2d", on="B") # ok even though on non-selected @@ -125,11 +137,17 @@ def test_non_monotonic_on(self): assert not df.index.is_monotonic - with pytest.raises(ValueError): + msg = "index must be monotonic" + with pytest.raises(ValueError, match=msg): df.rolling("2s").sum() df = df.reset_index() - with pytest.raises(ValueError): + + msg = ( + r"invalid on specified as A, must be a column " + "\\(of DataFrame\\), an Index or None" + ) + with pytest.raises(ValueError, match=msg): df.rolling("2s", on="A").sum() def test_frame_on(self): @@ -254,7 +272,8 @@ def test_closed(self): ) # closed must be 'right', 'left', 'both', 'neither' - with pytest.raises(ValueError): + msg = "closed must be 'right', 'left', 'both' or 'neither'" + with pytest.raises(ValueError, match=msg): self.regular.rolling(window="2s", closed="blabla") expected = df.copy() From d8f65d04bbf2f5e5f7245bf3d4194be442a43068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mar=C3=ADa=20Marino?= <44098774+marinomaria@users.noreply.github.com> Date: Tue, 23 Jun 2020 16:44:18 -0300 Subject: [PATCH 0186/1025] DOC: Fix #33451 (#34955) --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/pandas/core/frame.py b/pandas/core/frame.py index 55b30100175ae..de3276124a795 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4006,7 +4006,8 @@ def drop( level : int or level name, optional For MultiIndex, level from which the labels will be removed. inplace : bool, default False - If True, do operation inplace and return None. + If False, return a copy. Otherwise, do operation + inplace and return None. errors : {'ignore', 'raise'}, default 'raise' If 'ignore', suppress error and only existing labels are dropped. From bf8f0bc4bbe8bea3f37cf96ba810c0d641321abe Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 23 Jun 2020 21:08:42 +0100 Subject: [PATCH 0187/1025] DOC: misc sphinx directive fixes (#34960) --- pandas/core/common.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/core/series.py | 2 +- pandas/plotting/_core.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/common.py b/pandas/core/common.py index af24f8d707abd..b4f726f4e59a9 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -404,7 +404,7 @@ def random_state(state=None): If receives `None`, returns np.random. If receives anything else, raises an informative ValueError. - ..versionchanged:: 1.1.0 + .. versionchanged:: 1.1.0 array-like and BitGenerator (for NumPy>=1.18) object now passed to np.random.RandomState() as seed diff --git a/pandas/core/frame.py b/pandas/core/frame.py index de3276124a795..4e8a75fe1b597 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8213,7 +8213,7 @@ def cov( Delta degrees of freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - versionadded:: 1.1.0 + .. 
versionadded:: 1.1.0 Returns ------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fa92f702f07f5..61361c3331d5e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4809,7 +4809,7 @@ def sample( random number generator If np.random.RandomState, use as numpy RandomState object. - ..versionchanged:: 1.1.0 + .. versionchanged:: 1.1.0 array-like and BitGenerator (for NumPy>=1.17) object now passed to np.random.RandomState() as seed diff --git a/pandas/core/series.py b/pandas/core/series.py index 1aeb6271056c6..3674537c0137e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2342,7 +2342,7 @@ def cov( Delta degrees of freedom. The divisor used in calculations is ``N - ddof``, where ``N`` represents the number of elements. - versionadded:: 1.1.0 + .. versionadded:: 1.1.0 Returns ------- diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 4f5b7b2d7a888..4eb68367560b6 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -67,7 +67,7 @@ def hist_series( legend : bool, default False Whether to show the legend. - ..versionadded:: 1.1.0 + .. versionadded:: 1.1.0 **kwargs To be passed to the actual plotting function. @@ -179,7 +179,7 @@ def hist_frame( legend : bool, default False Whether to show the legend. - ..versionadded:: 1.1.0 + .. 
versionadded:: 1.1.0 **kwargs All other plotting keyword arguments to be passed to From 0ec4794e6e38530a5744bf8bfdb2eea22673f1f8 Mon Sep 17 00:00:00 2001 From: Phan Duc Nhat Minh Date: Wed, 24 Jun 2020 04:13:51 +0800 Subject: [PATCH 0188/1025] Fix DataFrame/Series stack/unstack docs (#34927) --- pandas/core/frame.py | 4 ---- pandas/core/series.py | 2 -- 2 files changed, 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4e8a75fe1b597..e5de9b428e2d5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6783,8 +6783,6 @@ def stack(self, level=-1, dropna=True): level(s) is (are) taken from the prescribed level(s) and the output is a DataFrame. - The new index levels are sorted. - Parameters ---------- level : int, str, list, default -1 @@ -7020,8 +7018,6 @@ def unstack(self, level=-1, fill_value=None): If the index is not a MultiIndex, the output will be a Series (the analogue of stack when the columns are not a MultiIndex). - The level involved will automatically get sorted. - Parameters ---------- level : int, str, or list of these, default -1 (last level) diff --git a/pandas/core/series.py b/pandas/core/series.py index 3674537c0137e..d8cf8308142a6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3835,8 +3835,6 @@ def unstack(self, level=-1, fill_value=None): """ Unstack, also known as pivot, Series with MultiIndex to produce DataFrame. - The level involved will automatically get sorted. 
- Parameters ---------- level : int, str, or list of these, default last level From 1a1ee16d6bea2fcb7d556605ccda5a3abef8ac0e Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Tue, 23 Jun 2020 21:16:38 +0100 Subject: [PATCH 0189/1025] Fix language style (#34924) --- asv_bench/benchmarks/groupby.py | 4 +- ci/code_checks.sh | 6 +-- doc/source/whatsnew/v0.14.1.rst | 2 +- pandas/core/groupby/base.py | 14 +++--- pandas/core/groupby/generic.py | 20 ++++---- pandas/core/groupby/groupby.py | 8 ++-- .../{test_whitelist.py => test_allowlist.py} | 48 +++++++++---------- .../tests/groupby/transform/test_transform.py | 2 +- 8 files changed, 52 insertions(+), 52 deletions(-) rename pandas/tests/groupby/{test_whitelist.py => test_allowlist.py} (90%) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c9ac275cc4ea7..5ffda03fad80f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -16,7 +16,7 @@ from .pandas_vb_common import tm -method_blacklist = { +method_blocklist = { "object": { "median", "prod", @@ -403,7 +403,7 @@ class GroupByMethods: ] def setup(self, dtype, method, application): - if method in method_blacklist.get(dtype, {}): + if method in method_blocklist.get(dtype, {}): raise NotImplementedError # skip benchmark ngroups = 1000 size = ngroups * 2 diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f7a513ca22d53..7b12de387d648 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -248,19 +248,19 @@ fi ### CODE ### if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then - MSG='Check import. No warnings, and blacklist some optional dependencies' ; echo $MSG + MSG='Check import. 
No warnings, and blocklist some optional dependencies' ; echo $MSG python -W error -c " import sys import pandas -blacklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis', +blocklist = {'bs4', 'gcsfs', 'html5lib', 'http', 'ipython', 'jinja2', 'hypothesis', 'lxml', 'matplotlib', 'numexpr', 'openpyxl', 'py', 'pytest', 's3fs', 'scipy', 'tables', 'urllib.request', 'xlrd', 'xlsxwriter', 'xlwt'} # GH#28227 for some of these check for top-level modules, while others are # more specific (e.g. urllib.request) import_mods = set(m.split('.')[0] for m in sys.modules) | set(sys.modules) -mods = blacklist & import_mods +mods = blocklist & import_mods if mods: sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) sys.exit(len(mods)) diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 3dfc4272681df..5de193007474c 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -131,7 +131,7 @@ Enhancements - Implemented ``sem`` (standard error of the mean) operation for ``Series``, ``DataFrame``, ``Panel``, and ``Groupby`` (:issue:`6897`) -- Add ``nlargest`` and ``nsmallest`` to the ``Series`` ``groupby`` whitelist, +- Add ``nlargest`` and ``nsmallest`` to the ``Series`` ``groupby`` allowlist, which means you can now use these methods on a ``SeriesGroupBy`` object (:issue:`7053`). - All offsets ``apply``, ``rollforward`` and ``rollback`` can now handle ``np.datetime64``, previously results in ``ApplyTypeError`` (:issue:`7452`) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 08352d737dee0..e71b2f94c8014 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -1,6 +1,6 @@ """ Provide basic components for groupby. These definitions -hold the whitelist of methods that are exposed on the +hold the allowlist of methods that are exposed on the SeriesGroupBy and the DataFrameGroupBy objects. 
""" import collections @@ -53,7 +53,7 @@ def _gotitem(self, key, ndim, subset=None): # forwarding methods from NDFrames plotting_methods = frozenset(["plot", "hist"]) -common_apply_whitelist = ( +common_apply_allowlist = ( frozenset( [ "quantile", @@ -72,9 +72,9 @@ def _gotitem(self, key, ndim, subset=None): | plotting_methods ) -series_apply_whitelist = ( +series_apply_allowlist = ( ( - common_apply_whitelist + common_apply_allowlist | { "nlargest", "nsmallest", @@ -84,13 +84,13 @@ def _gotitem(self, key, ndim, subset=None): ) ) | frozenset(["dtype", "unique"]) -dataframe_apply_whitelist = common_apply_whitelist | frozenset(["dtypes", "corrwith"]) +dataframe_apply_allowlist = common_apply_allowlist | frozenset(["dtypes", "corrwith"]) # cythonized transformations or canned "agg+broadcast", which do not # require postprocessing of the result by transform. cythonized_kernels = frozenset(["cumprod", "cumsum", "shift", "cummin", "cummax"]) -cython_cast_blacklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) +cython_cast_blocklist = frozenset(["rank", "count", "size", "idxmin", "idxmax"]) # List of aggregation/reduction functions. # These map each group to a single numeric value @@ -186,4 +186,4 @@ def _gotitem(self, key, ndim, subset=None): # Valid values of `name` for `groupby.transform(name)` # NOTE: do NOT edit this directly. New additions should be inserted # into the appropriate list above. 
-transform_kernel_whitelist = reduction_kernels | transformation_kernels +transform_kernel_allowlist = reduction_kernels | transformation_kernels diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bc5cf595e49f9..dab8475d9580c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -121,15 +121,15 @@ def prop(self): return property(prop) -def pin_whitelisted_properties(klass: Type[FrameOrSeries], whitelist: FrozenSet[str]): +def pin_allowlisted_properties(klass: Type[FrameOrSeries], allowlist: FrozenSet[str]): """ - Create GroupBy member defs for DataFrame/Series names in a whitelist. + Create GroupBy member defs for DataFrame/Series names in a allowlist. Parameters ---------- klass : DataFrame or Series class class where members are defined. - whitelist : frozenset[str] + allowlist : frozenset[str] Set of names of klass methods to be constructed Returns @@ -143,7 +143,7 @@ class decorator """ def pinner(cls): - for name in whitelist: + for name in allowlist: if hasattr(cls, name): # don't override anything that was explicitly defined # in the base class @@ -157,9 +157,9 @@ def pinner(cls): return pinner -@pin_whitelisted_properties(Series, base.series_apply_whitelist) +@pin_allowlisted_properties(Series, base.series_apply_allowlist) class SeriesGroupBy(GroupBy[Series]): - _apply_whitelist = base.series_apply_whitelist + _apply_allowlist = base.series_apply_allowlist def _iterate_slices(self) -> Iterable[Series]: yield self._selected_obj @@ -473,7 +473,7 @@ def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) - elif func not in base.transform_kernel_whitelist: + elif func not in base.transform_kernel_allowlist: msg = f"'{func}' is not a valid function name for transform(name)" raise ValueError(msg) elif func in base.cythonized_kernels: @@ -835,10 +835,10 @@ def pct_change(self, periods=1, fill_method="pad", 
limit=None, freq=None): return (filled / shifted) - 1 -@pin_whitelisted_properties(DataFrame, base.dataframe_apply_whitelist) +@pin_allowlisted_properties(DataFrame, base.dataframe_apply_allowlist) class DataFrameGroupBy(GroupBy[DataFrame]): - _apply_whitelist = base.dataframe_apply_whitelist + _apply_allowlist = base.dataframe_apply_allowlist _agg_examples_doc = dedent( """ @@ -1456,7 +1456,7 @@ def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) - elif func not in base.transform_kernel_whitelist: + elif func not in base.transform_kernel_allowlist: msg = f"'{func}' is not a valid function name for transform(name)" raise ValueError(msg) elif func in base.cythonized_kernels: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 02f7f605a7605..d039b715b3c08 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -475,7 +475,7 @@ def _group_selection_context(groupby): class _GroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]): _group_selection = None - _apply_whitelist: FrozenSet[str] = frozenset() + _apply_allowlist: FrozenSet[str] = frozenset() def __init__( self, @@ -689,7 +689,7 @@ def _set_result_index_ordered(self, result): return result def _dir_additions(self): - return self.obj._dir_additions() | self._apply_whitelist + return self.obj._dir_additions() | self._apply_allowlist def __getattr__(self, attr: str): if attr in self._internal_names_set: @@ -729,7 +729,7 @@ def pipe(self, func, *args, **kwargs): plot = property(GroupByPlot) def _make_wrapper(self, name): - assert name in self._apply_whitelist + assert name in self._apply_allowlist self._set_group_selection() @@ -944,7 +944,7 @@ def _transform_should_cast(self, func_nm: str) -> bool: """ filled_series = self.grouper.size().fillna(0) assert filled_series is not None - return filled_series.gt(0).any() and func_nm not in 
base.cython_cast_blacklist + return filled_series.gt(0).any() and func_nm not in base.cython_cast_blocklist def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): output: Dict[base.OutputKey, np.ndarray] = {} diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_allowlist.py similarity index 90% rename from pandas/tests/groupby/test_whitelist.py rename to pandas/tests/groupby/test_allowlist.py index 9b595328d9230..0fd66cc047017 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -31,7 +31,7 @@ ] AGG_FUNCTIONS_WITH_SKIPNA = ["skew", "mad"] -df_whitelist = [ +df_allowlist = [ "quantile", "fillna", "mad", @@ -50,12 +50,12 @@ ] -@pytest.fixture(params=df_whitelist) -def df_whitelist_fixture(request): +@pytest.fixture(params=df_allowlist) +def df_allowlist_fixture(request): return request.param -s_whitelist = [ +s_allowlist = [ "quantile", "fillna", "mad", @@ -78,8 +78,8 @@ def df_whitelist_fixture(request): ] -@pytest.fixture(params=s_whitelist) -def s_whitelist_fixture(request): +@pytest.fixture(params=s_allowlist) +def s_allowlist_fixture(request): return request.param @@ -119,10 +119,10 @@ def df_letters(): return df -@pytest.mark.parametrize("whitelist", [df_whitelist, s_whitelist]) -def test_groupby_whitelist(df_letters, whitelist): +@pytest.mark.parametrize("allowlist", [df_allowlist, s_allowlist]) +def test_groupby_allowlist(df_letters, allowlist): df = df_letters - if whitelist == df_whitelist: + if allowlist == df_allowlist: # dataframe obj = df_letters else: @@ -130,11 +130,11 @@ def test_groupby_whitelist(df_letters, whitelist): gb = obj.groupby(df.letters) - assert set(whitelist) == set(gb._apply_whitelist) + assert set(allowlist) == set(gb._apply_allowlist) -def check_whitelist(obj, df, m): - # check the obj for a particular whitelist m +def check_allowlist(obj, df, m): + # check the obj for a particular allowlist m gb = obj.groupby(df.letters) @@ -155,16 +155,16 
@@ def check_whitelist(obj, df, m): assert n.endswith(m) -def test_groupby_series_whitelist(df_letters, s_whitelist_fixture): - m = s_whitelist_fixture +def test_groupby_series_allowlist(df_letters, s_allowlist_fixture): + m = s_allowlist_fixture df = df_letters - check_whitelist(df.letters, df, m) + check_allowlist(df.letters, df, m) -def test_groupby_frame_whitelist(df_letters, df_whitelist_fixture): - m = df_whitelist_fixture +def test_groupby_frame_allowlist(df_letters, df_allowlist_fixture): + m = df_allowlist_fixture df = df_letters - check_whitelist(df, df, m) + check_allowlist(df, df, m) @pytest.fixture @@ -187,10 +187,10 @@ def raw_frame(): @pytest.mark.parametrize("axis", [0, 1]) @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("sort", [True, False]) -def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna, sort): +def test_regression_allowlist_methods(raw_frame, op, level, axis, skipna, sort): # GH6944 # GH 17537 - # explicitly test the whitelist methods + # explicitly test the allowlist methods if axis == 0: frame = raw_frame @@ -213,11 +213,11 @@ def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna, sort): tm.assert_frame_equal(result, expected) -def test_groupby_blacklist(df_letters): +def test_groupby_blocklist(df_letters): df = df_letters s = df_letters.floats - blacklist = [ + blocklist = [ "eval", "query", "abs", @@ -234,9 +234,9 @@ def test_groupby_blacklist(df_letters): ] to_methods = [method for method in dir(df) if method.startswith("to_")] - blacklist.extend(to_methods) + blocklist.extend(to_methods) - for bl in blacklist: + for bl in blocklist: for obj in (df, s): gb = obj.groupby(df.letters) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index fd4ee2a81ebd8..cdaf27e214d80 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -728,7 +728,7 @@ def 
test_cython_transform_frame(op, args, targop): # dict(by=['int','string'])]: gb = df.groupby(**gb_target) - # whitelisted methods set the selection before applying + # allowlisted methods set the selection before applying # bit a of hack to make sure the cythonized shift # is equivalent to pre 0.17.1 behavior if op == "shift": From ece7e706eb12bcb1ce77fe6a3132b2bfd9474e48 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 23 Jun 2020 13:20:28 -0700 Subject: [PATCH 0190/1025] REF: simplify _is_single_block/is_mixed_type (#34935) --- pandas/core/internals/blocks.py | 9 ++------- pandas/core/internals/managers.py | 13 +++---------- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 38c495e1dd0f3..0c98a779424bd 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -168,10 +168,6 @@ def _holder(self): def _consolidate_key(self): return (self._can_consolidate, self.dtype.name) - @property - def _is_single_block(self) -> bool: - return self.ndim == 1 - @property def is_view(self) -> bool: """ return a boolean if I am possibly a view """ @@ -259,7 +255,7 @@ def make_block_same_class(self, values, placement=None, ndim=None): def __repr__(self) -> str: # don't want to print out all of the items here name = type(self).__name__ - if self._is_single_block: + if self.ndim == 1: result = f"{name}: {len(self)} dtype: {self.dtype}" else: @@ -476,8 +472,7 @@ def downcast(self, dtypes=None): values = self.values - # single block handling - if self._is_single_block: + if self.ndim == 1: # try to cast all non-floats here if dtypes is None: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index eaf59051205d6..6055a6205d286 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -220,16 +220,8 @@ def set_axis(self, axis: int, new_labels: Index) -> None: @property def _is_single_block(self) -> bool: - if 
self.ndim == 1: - return True - - if len(self.blocks) != 1: - return False - - blk = self.blocks[0] - return blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice == slice( - 0, len(self), 1 - ) + # Assumes we are 2D; overriden by SingleBlockManager + return len(self.blocks) == 1 def _rebuild_blknos_and_blklocs(self) -> None: """ @@ -1486,6 +1478,7 @@ class SingleBlockManager(BlockManager): _is_consolidated = True _known_consolidated = True __slots__ = () + _is_single_block = True def __init__( self, From 2be52514ef770ee1045e996a861deb64538dd0c6 Mon Sep 17 00:00:00 2001 From: rhshadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 23 Jun 2020 18:07:00 -0400 Subject: [PATCH 0191/1025] BUG: reset_index doesn't preserve dtype on empty frame with MultiIndex (#34942) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 2 +- pandas/tests/frame/methods/test_reset_index.py | 8 ++++++++ pandas/tests/series/methods/test_reset_index.py | 10 ++++++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8dbf14a83d3b7..1a9387801a283 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -953,6 +953,7 @@ Indexing - Bug in :meth:`Series.at` when used with a :class:`MultiIndex` would raise an exception on valid inputs (:issue:`26989`) - Bug in :meth:`DataFrame.loc` with dictionary of values changes columns with dtype of ``int`` to ``float`` (:issue:`34573`) - Bug in :meth:`Series.loc` when used with a :class:`MultiIndex` would raise an IndexingError when accessing a None value (:issue:`34318`) +- Bug in :meth:`DataFrame.reset_index` and :meth:`Series.reset_index` would not preserve data types on an empty :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` (:issue:`19602`) Missing ^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e5de9b428e2d5..678e64db2beba 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py 
@@ -4723,7 +4723,7 @@ def _maybe_casted_values(index, labels=None): # we can have situations where the whole mask is -1, # meaning there is nothing found in labels, so make all nan's if mask.all(): - values = np.empty(len(mask)) + values = np.empty(len(mask), dtype=index.dtype) values.fill(np.nan) else: values = values.take(labels) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 6586c19af2539..79442acccb326 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -297,3 +297,11 @@ def test_reset_index_range(self): index=RangeIndex(stop=2), ) tm.assert_frame_equal(result, expected) + + +def test_reset_index_dtypes_on_empty_frame_with_multiindex(): + # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex + idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], ["a", "b"]]) + result = DataFrame(index=idx)[:0].reset_index().dtypes + expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": object}) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index f0c4895ad7c10..a11590d42552d 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -108,3 +108,13 @@ def test_reset_index_drop_errors(self): s = Series(range(4), index=MultiIndex.from_product([[1, 2]] * 2)) with pytest.raises(KeyError, match="not found"): s.reset_index("wrong", drop=True) + + +def test_reset_index_dtypes_on_empty_series_with_multiindex(): + # GH 19602 - Preserve dtype on empty Series with MultiIndex + idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], ["a", "b"]]) + result = Series(dtype=object, index=idx)[:0].reset_index().dtypes + expected = Series( + {"level_0": np.int64, "level_1": np.float64, "level_2": object, 0: object} + ) + tm.assert_series_equal(result, expected) From 
301bbc7ce2a59f8dbfaf41d54582481eac0c6d04 Mon Sep 17 00:00:00 2001 From: Kevin Bowey Date: Wed, 24 Jun 2020 00:08:39 +0200 Subject: [PATCH 0192/1025] TST: disallow bare pytest raises (#34940) --- pandas/tests/frame/methods/test_assign.py | 6 ++- pandas/tests/frame/methods/test_at_time.py | 3 +- .../tests/frame/methods/test_between_time.py | 3 +- .../frame/methods/test_first_and_last.py | 6 ++- .../tests/frame/methods/test_interpolate.py | 15 ++++++- pandas/tests/frame/methods/test_replace.py | 6 ++- pandas/tests/frame/test_query_eval.py | 41 +++++++++++++------ 7 files changed, 58 insertions(+), 22 deletions(-) diff --git a/pandas/tests/frame/methods/test_assign.py b/pandas/tests/frame/methods/test_assign.py index 63b9f031de188..0ae501d43e742 100644 --- a/pandas/tests/frame/methods/test_assign.py +++ b/pandas/tests/frame/methods/test_assign.py @@ -65,9 +65,11 @@ def test_assign_bad(self): df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) # non-keyword argument - with pytest.raises(TypeError): + msg = r"assign\(\) takes 1 positional argument but 2 were given" + with pytest.raises(TypeError, match=msg): df.assign(lambda x: x.A) - with pytest.raises(AttributeError): + msg = "'DataFrame' object has no attribute 'C'" + with pytest.raises(AttributeError, match=msg): df.assign(C=df.A, D=df.A + df.C) def test_assign_dependent(self): diff --git a/pandas/tests/frame/methods/test_at_time.py b/pandas/tests/frame/methods/test_at_time.py index 71368f270147f..ac98d632c5dcd 100644 --- a/pandas/tests/frame/methods/test_at_time.py +++ b/pandas/tests/frame/methods/test_at_time.py @@ -65,7 +65,8 @@ def test_at_time_tz(self): def test_at_time_raises(self): # GH#20725 df = DataFrame([[1, 2, 3], [4, 5, 6]]) - with pytest.raises(TypeError): # index is not a DatetimeIndex + msg = "Index must be DatetimeIndex" + with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex df.at_time("00:00") @pytest.mark.parametrize("axis", ["index", "columns", 0, 1]) diff --git 
a/pandas/tests/frame/methods/test_between_time.py b/pandas/tests/frame/methods/test_between_time.py index b40604b4f4a16..19e802d0fa663 100644 --- a/pandas/tests/frame/methods/test_between_time.py +++ b/pandas/tests/frame/methods/test_between_time.py @@ -68,7 +68,8 @@ def test_between_time(self, close_open_fixture): def test_between_time_raises(self): # GH#20725 df = DataFrame([[1, 2, 3], [4, 5, 6]]) - with pytest.raises(TypeError): # index is not a DatetimeIndex + msg = "Index must be DatetimeIndex" + with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex df.between_time(start_time="00:00", end_time="12:00") def test_between_time_axis(self, axis): diff --git a/pandas/tests/frame/methods/test_first_and_last.py b/pandas/tests/frame/methods/test_first_and_last.py index 73e4128ddebb9..2b3756969acca 100644 --- a/pandas/tests/frame/methods/test_first_and_last.py +++ b/pandas/tests/frame/methods/test_first_and_last.py @@ -31,7 +31,8 @@ def test_first_subset(self): def test_first_raises(self): # GH#20725 df = DataFrame([[1, 2, 3], [4, 5, 6]]) - with pytest.raises(TypeError): # index is not a DatetimeIndex + msg = "'first' only supports a DatetimeIndex index" + with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex df.first("1D") def test_last_subset(self): @@ -57,5 +58,6 @@ def test_last_subset(self): def test_last_raises(self): # GH20725 df = DataFrame([[1, 2, 3], [4, 5, 6]]) - with pytest.raises(TypeError): # index is not a DatetimeIndex + msg = "'last' only supports a DatetimeIndex index" + with pytest.raises(TypeError, match=msg): # index is not a DatetimeIndex df.last("1D") diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index efb3d719016bb..facb116646573 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -43,7 +43,14 @@ def test_interp_bad_method(self): "D": list("abcd"), } ) - with 
pytest.raises(ValueError): + msg = ( + r"method must be one of \['linear', 'time', 'index', 'values', " + r"'nearest', 'zero', 'slinear', 'quadratic', 'cubic', " + r"'barycentric', 'krogh', 'spline', 'polynomial', " + r"'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima', " + r"'cubicspline'\]. Got 'not_a_method' instead." + ) + with pytest.raises(ValueError, match=msg): df.interpolate(method="not_a_method") def test_interp_combo(self): @@ -67,7 +74,11 @@ def test_interp_combo(self): def test_interp_nan_idx(self): df = DataFrame({"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]}) df = df.set_index("A") - with pytest.raises(NotImplementedError): + msg = ( + "Interpolation with NaNs in the index has not been implemented. " + "Try filling those NaNs before interpolating." + ) + with pytest.raises(NotImplementedError, match=msg): df.interpolate(method="values") @td.skip_if_no_scipy diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3bcc26e85e347..3b9a724d74c7d 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1314,7 +1314,11 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): expected = DataFrame({"a": a, "b": b}) result = df.replace(replace_dict, 3) tm.assert_frame_equal(result, expected) - with pytest.raises(AssertionError): + msg = ( + r"Attributes of DataFrame.iloc\[:, 0\] \(column name=\"a\"\) are " + "different" + ) + with pytest.raises(AssertionError, match=msg): # ensure non-inplace call does not affect original tm.assert_frame_equal(df, expected) df.replace(replace_dict, 3, inplace=True) diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 89f268f8b6bc6..98a2a33822e3b 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -71,9 +71,14 @@ def test_query_numexpr(self): result = df.eval("A+1", engine="numexpr") 
tm.assert_series_equal(result, self.expected2, check_names=False) else: - with pytest.raises(ImportError): + msg = ( + r"'numexpr' is not installed or an unsupported version. " + r"Cannot use engine='numexpr' for query/eval if 'numexpr' is " + r"not installed" + ) + with pytest.raises(ImportError, match=msg): df.query("A>0", engine="numexpr") - with pytest.raises(ImportError): + with pytest.raises(ImportError, match=msg): df.eval("A+1", engine="numexpr") @@ -452,14 +457,16 @@ def test_date_query_with_non_date(self): result = df.query("dates != nondate", parser=parser, engine=engine) tm.assert_frame_equal(result, df) + msg = r"Invalid comparison between dtype=datetime64\[ns\] and ndarray" for op in ["<", ">", "<=", ">="]: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): df.query(f"dates {op} nondate", parser=parser, engine=engine) def test_query_syntax_error(self): engine, parser = self.engine, self.parser df = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)}) - with pytest.raises(SyntaxError): + msg = "invalid syntax" + with pytest.raises(SyntaxError, match=msg): df.query("i - +", engine=engine, parser=parser) def test_query_scope(self): @@ -781,7 +788,8 @@ def test_date_index_query_with_NaT_duplicates(self): df["dates3"] = date_range("1/1/2014", periods=n) df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT df.set_index("dates1", inplace=True, drop=True) - with pytest.raises(NotImplementedError): + msg = r"'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): df.query("index < 20130101 < dates3", engine=engine, parser=parser) def test_nested_scope(self): @@ -798,7 +806,8 @@ def test_nested_scope(self): df2 = DataFrame(np.random.randn(5, 3)) # don't have the pandas parser - with pytest.raises(SyntaxError): + msg = r"The '@' prefix is only supported by the pandas parser" + with pytest.raises(SyntaxError, match=msg): df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser) with 
pytest.raises(UndefinedVariableError, match="name 'df' is not defined"): @@ -867,10 +876,10 @@ def test_str_query_method(self, parser, engine): eq, ne = "==", "!=" ops = 2 * ([eq] + [ne]) + msg = r"'(Not)?In' nodes are not implemented" for lhs, op, rhs in zip(lhs, ops, rhs): ex = f"{lhs} {op} {rhs}" - msg = r"'(Not)?In' nodes are not implemented" with pytest.raises(NotImplementedError, match=msg): df.query( ex, @@ -908,10 +917,11 @@ def test_str_list_query_method(self, parser, engine): eq, ne = "==", "!=" ops = 2 * ([eq] + [ne]) + msg = r"'(Not)?In' nodes are not implemented" for lhs, op, rhs in zip(lhs, ops, rhs): ex = f"{lhs} {op} {rhs}" - with pytest.raises(NotImplementedError): + with pytest.raises(NotImplementedError, match=msg): df.query(ex, engine=engine, parser=parser) else: res = df.query('strings == ["a", "b"]', engine=engine, parser=parser) @@ -946,10 +956,12 @@ def test_query_with_string_columns(self, parser, engine): expec = df[df.a.isin(df.b) & (df.c < df.d)] tm.assert_frame_equal(res, expec) else: - with pytest.raises(NotImplementedError): + msg = r"'(Not)?In' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): df.query("a in b", parser=parser, engine=engine) - with pytest.raises(NotImplementedError): + msg = r"'BoolOp' nodes are not implemented" + with pytest.raises(NotImplementedError, match=msg): df.query("a in b and c < d", parser=parser, engine=engine) def test_object_array_eq_ne(self, parser, engine): @@ -1186,15 +1198,18 @@ def test_missing_attribute(self, df): df.eval("@pd.thing") def test_failing_quote(self, df): - with pytest.raises(SyntaxError): + msg = r"(Could not convert ).*( to a valid Python identifier.)" + with pytest.raises(SyntaxError, match=msg): df.query("`it's` > `that's`") def test_failing_character_outside_range(self, df): - with pytest.raises(SyntaxError): + msg = r"(Could not convert ).*( to a valid Python identifier.)" + with pytest.raises(SyntaxError, match=msg): df.query("`☺` > 4") def 
test_failing_hashtag(self, df): - with pytest.raises(SyntaxError): + msg = "Failed to parse backticks" + with pytest.raises(SyntaxError, match=msg): df.query("`foo#bar` > 4") def test_call_non_named_expression(self, df): From 1e670806604a4999d28de000e3ca9d9c1d00b2c5 Mon Sep 17 00:00:00 2001 From: Olga Lyashevska Date: Tue, 23 Jun 2020 23:36:21 +0100 Subject: [PATCH 0193/1025] BUG: plotting layout patch (#34905) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/plotting/_matplotlib/misc.py | 1 + 2 files changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1a9387801a283..60aa1759958f6 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1028,6 +1028,7 @@ Plotting - Bug in :meth:`DataFrame.hist` where the order of ``column`` argument was ignored (:issue:`29235`) - Bug in :meth:`DataFrame.plot.scatter` that when adding multiple plots with different ``cmap``, colorbars alway use the first ``cmap`` (:issue:`33389`) - Bug in :meth:`DataFrame.plot.scatter` was adding a colorbar to the plot even if the argument `c` was assigned to a column containing color names (:issue:`34316`) +- Bug in :meth:`pandas.plotting.bootstrap_plot` was causing cluttered axes and overlapping labels (:issue:`34905`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index 0cafcfed38a54..bb6530b0f6412 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -301,6 +301,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): for axis in axes: plt.setp(axis.get_xticklabels(), fontsize=8) plt.setp(axis.get_yticklabels(), fontsize=8) + plt.tight_layout() return fig From 6eade47f5f390ef5c96a1568094773531fe8444f Mon Sep 17 00:00:00 2001 From: Mohammad Jafar Mashhadi Date: Wed, 24 Jun 2020 07:25:57 -0600 Subject: [PATCH 0194/1025] =?UTF-8?q?ENH:=20GH34946=20Check=20type=20of=20?= 
=?UTF-8?q?names=20argument=20to=20`read=5Fcsv`,=20`read=5Ftable`=E2=80=A6?= =?UTF-8?q?=20(#34956)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/parsers.py | 7 +++++-- pandas/tests/io/parser/test_common.py | 10 ++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 60aa1759958f6..9d151c78b2048 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1121,6 +1121,7 @@ Other - :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) - Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) - Bug in :class:`Tick` multiplication raising ``TypeError`` when multiplying by a float (:issue:`34486`) +- Passing a `set` as `names` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 679cf4c2d8929..62347f7110d76 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -397,7 +397,8 @@ def _validate_integer(name, val, min_val=0): def _validate_names(names): """ - Raise ValueError if the `names` parameter contains duplicates. + Raise ValueError if the `names` parameter contains duplicates or has an + invalid data type. Parameters ---------- @@ -407,11 +408,13 @@ def _validate_names(names): Raises ------ ValueError - If names are not unique. + If names are not unique or are not ordered (e.g. set). 
""" if names is not None: if len(names) != len(set(names)): raise ValueError("Duplicate names are not allowed.") + if not is_list_like(names, allow_sets=False): + raise ValueError("Names should be an ordered collection.") def _read(filepath_or_buffer: FilePathOrBuffer, kwds): diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e38fcf1380220..e6e868689b060 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2135,3 +2135,13 @@ def test_no_header_two_extra_columns(all_parsers): parser = all_parsers df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) + + +def test_read_csv_names_not_accepting_sets(all_parsers): + # GH 34946 + data = """\ + 1,2,3 + 4,5,6\n""" + parser = all_parsers + with pytest.raises(ValueError, match="Names should be an ordered collection."): + parser.read_csv(StringIO(data), names=set("QAZ")) From dd263c119876fa66190d23e5d5928530d332e28f Mon Sep 17 00:00:00 2001 From: Robert de Vries Date: Wed, 24 Jun 2020 17:13:57 +0200 Subject: [PATCH 0195/1025] ENH: Add ods writer (#32911) --- doc/source/whatsnew/v1.1.0.rst | 2 + pandas/core/config_init.py | 10 + pandas/io/excel/__init__.py | 4 + pandas/io/excel/_base.py | 57 +++-- pandas/io/excel/_odfreader.py | 9 +- pandas/io/excel/_odswriter.py | 272 ++++++++++++++++++++++++ pandas/io/excel/_util.py | 7 +- pandas/tests/io/excel/test_odswriter.py | 17 ++ pandas/tests/io/excel/test_writers.py | 71 ++++--- 9 files changed, 405 insertions(+), 44 deletions(-) create mode 100644 pandas/io/excel/_odswriter.py create mode 100644 pandas/tests/io/excel/test_odswriter.py diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9d151c78b2048..7c9fa53568f45 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -316,6 +316,7 @@ Other enhancements - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg 
from ``pandas-gbq`` (:issue:`34639`). - :meth:`Dataframe.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list of dict to change only some specific columns' width (:issue:`28917`). +- :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) .. --------------------------------------------------------------------------- @@ -1018,6 +1019,7 @@ I/O - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`) - :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) +- Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) Plotting ^^^^^^^^ diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 5089445c79897..54d23fe8829e6 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -553,6 +553,7 @@ def use_inf_as_na_cb(key): _xls_options = ["xlwt"] _xlsm_options = ["openpyxl"] _xlsx_options = ["openpyxl", "xlsxwriter"] +_ods_options = ["odf"] with cf.config_prefix("io.excel.xls"): @@ -581,6 +582,15 @@ def use_inf_as_na_cb(key): ) +with cf.config_prefix("io.excel.ods"): + cf.register_option( + "writer", + "auto", + writer_engine_doc.format(ext="ods", others=", ".join(_ods_options)), + validator=str, + ) + + # Set up the io.parquet specific configuration. 
parquet_engine_doc = """ : string diff --git a/pandas/io/excel/__init__.py b/pandas/io/excel/__init__.py index 455abaa7fb589..d035223957a76 100644 --- a/pandas/io/excel/__init__.py +++ b/pandas/io/excel/__init__.py @@ -1,4 +1,5 @@ from pandas.io.excel._base import ExcelFile, ExcelWriter, read_excel +from pandas.io.excel._odswriter import _ODSWriter from pandas.io.excel._openpyxl import _OpenpyxlWriter from pandas.io.excel._util import register_writer from pandas.io.excel._xlsxwriter import _XlsxWriter @@ -14,3 +15,6 @@ register_writer(_XlsxWriter) + + +register_writer(_ODSWriter) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 6c3b49b9afc68..4fa4f158e9c3c 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,8 +1,9 @@ import abc import datetime -from io import BytesIO +from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill +from typing import Union from pandas._config import config @@ -533,13 +534,13 @@ class ExcelWriter(metaclass=abc.ABCMeta): """ Class for writing DataFrame objects into excel sheets. - Default is to use xlwt for xls, openpyxl for xlsx. + Default is to use xlwt for xls, openpyxl for xlsx, odf for ods. See DataFrame.to_excel for typical usage. Parameters ---------- path : str - Path to xls or xlsx file. + Path to xls or xlsx or ods file. engine : str (optional) Engine to use for writing. If None, defaults to ``io.excel..writer``. 
NOTE: can only be passed as a keyword @@ -692,10 +693,7 @@ def __init__( # validate that this engine can handle the extension if isinstance(path, str): ext = os.path.splitext(path)[-1] - else: - ext = "xls" if engine == "xlwt" else "xlsx" - - self.check_extension(ext) + self.check_extension(ext) self.path = path self.sheets = {} @@ -781,6 +779,34 @@ def close(self): return self.save() +def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: + """ + Check if the stream is an OpenDocument Spreadsheet (.ods) file + + It uses magic values inside the stream + + Parameters + ---------- + stream : Union[BufferedIOBase, RawIOBase] + IO stream with data which might be an ODS file + + Returns + ------- + is_ods : bool + Boolean indication that this is indeed an ODS file or not + """ + stream.seek(0) + is_ods = False + if stream.read(4) == b"PK\003\004": + stream.seek(30) + is_ods = ( + stream.read(54) == b"mimetype" + b"application/vnd.oasis.opendocument.spreadsheet" + ) + stream.seek(0) + return is_ods + + class ExcelFile: """ Class for parsing tabular excel sheets into DataFrame objects. @@ -789,8 +815,8 @@ class ExcelFile: Parameters ---------- - io : str, path object (pathlib.Path or py._path.local.LocalPath), - a file-like object, xlrd workbook or openpypl workbook. + path_or_buffer : str, path object (pathlib.Path or py._path.local.LocalPath), + a file-like object, xlrd workbook or openpypl workbook. If a string or path object, expected to be a path to a .xls, .xlsx, .xlsb, .xlsm, .odf, .ods, or .odt file. 
engine : str, default None @@ -816,18 +842,25 @@ class ExcelFile: "pyxlsb": _PyxlsbReader, } - def __init__(self, io, engine=None): + def __init__(self, path_or_buffer, engine=None): if engine is None: engine = "xlrd" + if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): + if _is_ods_stream(path_or_buffer): + engine = "odf" + else: + ext = os.path.splitext(str(path_or_buffer))[-1] + if ext == ".ods": + engine = "odf" if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") self.engine = engine # Could be a str, ExcelFile, Book, etc. - self.io = io + self.io = path_or_buffer # Always a string - self._io = stringify_path(io) + self._io = stringify_path(path_or_buffer) self._reader = self._engines[engine](self._io) diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index be86b57ca2066..85ec9afaaec25 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,5 +1,7 @@ from typing import List, cast +import numpy as np + from pandas._typing import FilePathOrBuffer, Scalar from pandas.compat._optional import import_optional_dependency @@ -148,6 +150,9 @@ def _is_empty_row(self, row) -> bool: def _get_cell_value(self, cell, convert_float: bool) -> Scalar: from odf.namespaces import OFFICENS + if str(cell) == "#N/A": + return np.nan + cell_type = cell.attributes.get((OFFICENS, "value-type")) if cell_type == "boolean": if str(cell) == "TRUE": @@ -158,10 +163,6 @@ def _get_cell_value(self, cell, convert_float: bool) -> Scalar: elif cell_type == "float": # GH5394 cell_value = float(cell.attributes.get((OFFICENS, "value"))) - - if cell_value == 0.0: # NA handling - return str(cell) - if convert_float: val = int(cell_value) if val == cell_value: diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py new file mode 100644 index 0000000000000..0131240f99cf6 --- /dev/null +++ b/pandas/io/excel/_odswriter.py @@ -0,0 +1,272 @@ +from collections import defaultdict +import datetime +from 
typing import Any, DefaultDict, Dict, List, Optional, Tuple, Union + +import pandas._libs.json as json + +from pandas.io.excel._base import ExcelWriter +from pandas.io.excel._util import _validate_freeze_panes +from pandas.io.formats.excel import ExcelCell + + +class _ODSWriter(ExcelWriter): + engine = "odf" + supported_extensions = (".ods",) + + def __init__( + self, path: str, engine: Optional[str] = None, mode: str = "w", **engine_kwargs + ): + from odf.opendocument import OpenDocumentSpreadsheet + + engine_kwargs["engine"] = engine + + if mode == "a": + raise ValueError("Append mode is not supported with odf!") + + super().__init__(path, mode=mode, **engine_kwargs) + + self.book: OpenDocumentSpreadsheet = OpenDocumentSpreadsheet() + self._style_dict: Dict[str, str] = {} + + def save(self) -> None: + """ + Save workbook to disk. + """ + for sheet in self.sheets.values(): + self.book.spreadsheet.addElement(sheet) + self.book.save(self.path) + + def write_cells( + self, + cells: List[ExcelCell], + sheet_name: Optional[str] = None, + startrow: int = 0, + startcol: int = 0, + freeze_panes: Optional[List] = None, + ) -> None: + """ + Write the frame cells using odf + """ + from odf.table import Table, TableCell, TableRow + from odf.text import P + + sheet_name = self._get_sheet_name(sheet_name) + assert sheet_name is not None + + if sheet_name in self.sheets: + wks = self.sheets[sheet_name] + else: + wks = Table(name=sheet_name) + self.sheets[sheet_name] = wks + + if _validate_freeze_panes(freeze_panes): + assert freeze_panes is not None + self._create_freeze_panes(sheet_name, freeze_panes) + + for _ in range(startrow): + wks.addElement(TableRow()) + + rows: DefaultDict = defaultdict(TableRow) + col_count: DefaultDict = defaultdict(int) + + for cell in sorted(cells, key=lambda cell: (cell.row, cell.col)): + # only add empty cells if the row is still empty + if not col_count[cell.row]: + for _ in range(startcol): + rows[cell.row].addElement(TableCell()) + + # fill 
with empty cells if needed + for _ in range(cell.col - col_count[cell.row]): + rows[cell.row].addElement(TableCell()) + col_count[cell.row] += 1 + + pvalue, tc = self._make_table_cell(cell) + rows[cell.row].addElement(tc) + col_count[cell.row] += 1 + p = P(text=pvalue) + tc.addElement(p) + + # add all rows to the sheet + for row_nr in range(max(rows.keys()) + 1): + wks.addElement(rows[row_nr]) + + def _make_table_cell_attributes(self, cell) -> Dict[str, Union[int, str]]: + """Convert cell attributes to OpenDocument attributes + + Parameters + ---------- + cell : ExcelCell + Spreadsheet cell data + + Returns + ------- + attributes : Dict[str, Union[int, str]] + Dictionary with attributes and attribute values + """ + attributes: Dict[str, Union[int, str]] = {} + style_name = self._process_style(cell.style) + if style_name is not None: + attributes["stylename"] = style_name + if cell.mergestart is not None and cell.mergeend is not None: + attributes["numberrowsspanned"] = max(1, cell.mergestart) + attributes["numbercolumnsspanned"] = cell.mergeend + return attributes + + def _make_table_cell(self, cell) -> Tuple[str, Any]: + """Convert cell data to an OpenDocument spreadsheet cell + + Parameters + ---------- + cell : ExcelCell + Spreadsheet cell data + + Returns + ------- + pvalue, cell : Tuple[str, TableCell] + Display value, Cell value + """ + from odf.table import TableCell + + attributes = self._make_table_cell_attributes(cell) + val, fmt = self._value_with_fmt(cell.val) + pvalue = value = val + if isinstance(val, bool): + value = str(val).lower() + pvalue = str(val).upper() + if isinstance(val, datetime.datetime): + value = val.isoformat() + pvalue = val.strftime("%c") + return ( + pvalue, + TableCell(valuetype="date", datevalue=value, attributes=attributes), + ) + elif isinstance(val, datetime.date): + value = val.strftime("%Y-%m-%d") + pvalue = val.strftime("%x") + return ( + pvalue, + TableCell(valuetype="date", datevalue=value, attributes=attributes), + ) + 
else: + class_to_cell_type = { + str: "string", + int: "float", + float: "float", + bool: "boolean", + } + return ( + pvalue, + TableCell( + valuetype=class_to_cell_type[type(val)], + value=value, + attributes=attributes, + ), + ) + + def _process_style(self, style: Dict[str, Any]) -> str: + """Convert a style dictionary to a OpenDocument style sheet + + Parameters + ---------- + style : Dict + Style dictionary + + Returns + ------- + style_key : str + Unique style key for for later reference in sheet + """ + from odf.style import ( + ParagraphProperties, + Style, + TableCellProperties, + TextProperties, + ) + + if style is None: + return None + style_key = json.dumps(style) + if style_key in self._style_dict: + return self._style_dict[style_key] + name = f"pd{len(self._style_dict)+1}" + self._style_dict[style_key] = name + odf_style = Style(name=name, family="table-cell") + if "font" in style: + font = style["font"] + if font.get("bold", False): + odf_style.addElement(TextProperties(fontweight="bold")) + if "borders" in style: + borders = style["borders"] + for side, thickness in borders.items(): + thickness_translation = {"thin": "0.75pt solid #000000"} + odf_style.addElement( + TableCellProperties( + attributes={f"border{side}": thickness_translation[thickness]} + ) + ) + if "alignment" in style: + alignment = style["alignment"] + horizontal = alignment.get("horizontal") + if horizontal: + odf_style.addElement(ParagraphProperties(textalign=horizontal)) + vertical = alignment.get("vertical") + if vertical: + odf_style.addElement(TableCellProperties(verticalalign=vertical)) + self.book.styles.addElement(odf_style) + return name + + def _create_freeze_panes(self, sheet_name: str, freeze_panes: List[int]) -> None: + """Create freeze panes in the sheet + + Parameters + ---------- + sheet_name : str + Name of the spreadsheet + freeze_panes : list + Freeze pane location x and y + """ + from odf.config import ( + ConfigItem, + ConfigItemMapEntry, + ConfigItemMapIndexed, 
+ ConfigItemMapNamed, + ConfigItemSet, + ) + + config_item_set = ConfigItemSet(name="ooo:view-settings") + self.book.settings.addElement(config_item_set) + + config_item_map_indexed = ConfigItemMapIndexed(name="Views") + config_item_set.addElement(config_item_map_indexed) + + config_item_map_entry = ConfigItemMapEntry() + config_item_map_indexed.addElement(config_item_map_entry) + + config_item_map_named = ConfigItemMapNamed(name="Tables") + config_item_map_entry.addElement(config_item_map_named) + + config_item_map_entry = ConfigItemMapEntry(name=sheet_name) + config_item_map_named.addElement(config_item_map_entry) + + config_item_map_entry.addElement( + ConfigItem(name="HorizontalSplitMode", type="short", text="2") + ) + config_item_map_entry.addElement( + ConfigItem(name="VerticalSplitMode", type="short", text="2") + ) + config_item_map_entry.addElement( + ConfigItem( + name="HorizontalSplitPosition", type="int", text=str(freeze_panes[0]) + ) + ) + config_item_map_entry.addElement( + ConfigItem( + name="VerticalSplitPosition", type="int", text=str(freeze_panes[1]) + ) + ) + config_item_map_entry.addElement( + ConfigItem(name="PositionRight", type="int", text=str(freeze_panes[0])) + ) + config_item_map_entry.addElement( + ConfigItem(name="PositionBottom", type="int", text=str(freeze_panes[1])) + ) diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 7c8e1abb497bc..285aeaf7d4c6e 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -35,7 +35,12 @@ def _get_default_writer(ext): str The default engine for the extension. 
""" - _default_writers = {"xlsx": "openpyxl", "xlsm": "openpyxl", "xls": "xlwt"} + _default_writers = { + "xlsx": "openpyxl", + "xlsm": "openpyxl", + "xls": "xlwt", + "ods": "odf", + } xlsxwriter = import_optional_dependency( "xlsxwriter", raise_on_missing=False, on_version="warn" ) diff --git a/pandas/tests/io/excel/test_odswriter.py b/pandas/tests/io/excel/test_odswriter.py new file mode 100644 index 0000000000000..b50c641ebf0c0 --- /dev/null +++ b/pandas/tests/io/excel/test_odswriter.py @@ -0,0 +1,17 @@ +import pytest + +import pandas._testing as tm + +from pandas.io.excel import ExcelWriter + +odf = pytest.importorskip("odf") + +pytestmark = pytest.mark.parametrize("ext", [".ods"]) + + +def test_write_append_mode_raises(ext): + msg = "Append mode is not supported with odf!" + + with tm.ensure_clean(ext) as f: + with pytest.raises(ValueError, match=msg): + ExcelWriter(f, engine="odf", mode="a") diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index ba759c7766fa5..e3ee53b63e102 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -48,11 +48,19 @@ def set_engine(engine, ext): set_option(option_name, prev_engine) # Roll back option change -@td.skip_if_no("xlrd") -@pytest.mark.parametrize("ext", [".xls", ".xlsx", ".xlsm"]) +@pytest.mark.parametrize( + "ext", + [ + pytest.param(".xlsx", marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")]), + pytest.param(".xlsm", marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")]), + pytest.param(".xls", marks=[td.skip_if_no("xlwt"), td.skip_if_no("xlrd")]), + pytest.param( + ".xlsx", marks=[td.skip_if_no("xlsxwriter"), td.skip_if_no("xlrd")] + ), + pytest.param(".ods", marks=td.skip_if_no("odf")), + ], +) class TestRoundTrip: - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") @pytest.mark.parametrize( "header,expected", [(None, DataFrame([np.nan] * 4)), (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))], @@ -70,8 +78,6 @@ def 
test_read_one_empty_col_no_header(self, ext, header, expected): tm.assert_frame_equal(result, expected) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") @pytest.mark.parametrize( "header,expected", [(None, DataFrame([0] + [np.nan] * 4)), (0, DataFrame([np.nan] * 4))], @@ -88,8 +94,6 @@ def test_read_one_empty_col_with_header(self, ext, header, expected): tm.assert_frame_equal(result, expected) - @td.skip_if_no("openpyxl") - @td.skip_if_no("xlwt") def test_set_column_names_in_parameter(self, ext): # GH 12870 : pass down column names associated with # keyword argument names @@ -116,8 +120,6 @@ def test_set_column_names_in_parameter(self, ext): tm.assert_frame_equal(xlsdf_no_head, refdf) tm.assert_frame_equal(xlsdf_with_head, refdf) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") def test_creating_and_reading_multiple_sheets(self, ext): # see gh-9450 # @@ -142,7 +144,6 @@ def tdf(col_sheet_name): for s in sheets: tm.assert_frame_equal(dfs[s], dfs_returned[s]) - @td.skip_if_no("xlsxwriter") def test_read_excel_multiindex_empty_level(self, ext): # see gh-12453 with tm.ensure_clean(ext) as path: @@ -190,7 +191,6 @@ def test_read_excel_multiindex_empty_level(self, ext): actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) - @td.skip_if_no("xlsxwriter") @pytest.mark.parametrize("c_idx_names", [True, False]) @pytest.mark.parametrize("r_idx_names", [True, False]) @pytest.mark.parametrize("c_idx_levels", [1, 3]) @@ -240,8 +240,6 @@ def test_excel_multindex_roundtrip( ) tm.assert_frame_equal(df, act, check_names=check_names) - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") def test_read_excel_parse_dates(self, ext): # see gh-11544, gh-12051 df = DataFrame( @@ -296,14 +294,28 @@ def test_multiindex_interval_datetimes(self, ext): tm.assert_frame_equal(result, expected) -@td.skip_if_no("xlrd") @pytest.mark.parametrize( "engine,ext", [ - pytest.param("openpyxl", ".xlsx", marks=td.skip_if_no("openpyxl")), - 
pytest.param("openpyxl", ".xlsm", marks=td.skip_if_no("openpyxl")), - pytest.param("xlwt", ".xls", marks=td.skip_if_no("xlwt")), - pytest.param("xlsxwriter", ".xlsx", marks=td.skip_if_no("xlsxwriter")), + pytest.param( + "openpyxl", + ".xlsx", + marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")], + ), + pytest.param( + "openpyxl", + ".xlsm", + marks=[td.skip_if_no("openpyxl"), td.skip_if_no("xlrd")], + ), + pytest.param( + "xlwt", ".xls", marks=[td.skip_if_no("xlwt"), td.skip_if_no("xlrd")] + ), + pytest.param( + "xlsxwriter", + ".xlsx", + marks=[td.skip_if_no("xlsxwriter"), td.skip_if_no("xlrd")], + ), + pytest.param("odf", ".ods", marks=td.skip_if_no("odf")), ], ) @pytest.mark.usefixtures("set_engine") @@ -326,9 +338,7 @@ def test_excel_sheet_size(self, path): with pytest.raises(ValueError, match=msg): col_df.to_excel(path) - def test_excel_sheet_by_name_raise(self, path): - import xlrd - + def test_excel_sheet_by_name_raise(self, path, engine): gt = DataFrame(np.random.randn(10, 2)) gt.to_excel(path) @@ -337,9 +347,16 @@ def test_excel_sheet_by_name_raise(self, path): tm.assert_frame_equal(gt, df) - msg = "No sheet named <'0'>" - with pytest.raises(xlrd.XLRDError, match=msg): - pd.read_excel(xl, sheet_name="0") + if engine == "odf": + msg = "sheet 0 not found" + with pytest.raises(ValueError, match=msg): + pd.read_excel(xl, "0") + else: + import xlrd + + msg = "No sheet named <'0'>" + with pytest.raises(xlrd.XLRDError, match=msg): + pd.read_excel(xl, sheet_name="0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: @@ -1246,7 +1263,7 @@ def test_path_path_lib(self, engine, ext): writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, path=f"foo.{ext}") + result = tm.round_trip_pathlib(writer, reader, path=f"foo{ext}") tm.assert_frame_equal(result, df) def test_path_local_path(self, engine, ext): @@ -1254,7 +1271,7 @@ def 
test_path_local_path(self, engine, ext): writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, path=f"foo.{ext}") + result = tm.round_trip_localpath(writer, reader, path=f"foo{ext}") tm.assert_frame_equal(result, df) def test_merged_cell_custom_objects(self, merge_cells, path): From 246cd8b947dcdccc1beff06a9601844aaaf438bc Mon Sep 17 00:00:00 2001 From: Niklas Weber Date: Wed, 24 Jun 2020 17:59:23 +0200 Subject: [PATCH 0196/1025] TST:add test for df replace GH34871 (#34904) --- pandas/tests/frame/methods/test_replace.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 3b9a724d74c7d..498f7f7790514 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1407,3 +1407,16 @@ def test_replace_with_duplicate_columns(self, replacement): result["B"] = result["B"].replace(7, replacement) tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + reason="replace() changes dtype from period to object, see GH34871", strict=True + ) + def test_replace_period_ignore_float(self): + """ + Regression test for GH#34871: if df.replace(1.0, 0.0) is called on a df + with a Period column the old, faulty behavior is to raise TypeError. 
+ """ + df = pd.DataFrame({"Per": [pd.Period("2020-01")] * 3}) + result = df.replace(1.0, 0.0) + expected = pd.DataFrame({"Per": [pd.Period("2020-01")] * 3}) + tm.assert_frame_equal(expected, result) From bab926afdcb444084aa896aeef5cc3a9657bc7bf Mon Sep 17 00:00:00 2001 From: Ketan <6256964+ketanarlulkar@users.noreply.github.com> Date: Wed, 24 Jun 2020 21:30:49 +0530 Subject: [PATCH 0197/1025] BUG: indexing regression with datetime index (#34917) --- pandas/tests/indexing/test_partial.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 513ca039366cb..350f86b4e9fd0 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -650,3 +650,13 @@ def test_loc_with_list_of_strings_representing_datetimes_not_matched_type( s[labels] with pytest.raises(KeyError, match=msg): df.loc[labels] + + def test_indexing_timeseries_regression(self): + # Issue 34860 + arr = date_range("1/1/2008", "1/1/2009") + result = arr.to_series()["2008"] + + rng = date_range(start="2008-01-01", end="2008-12-31") + expected = Series(rng, index=rng) + + tm.assert_series_equal(result, expected) From 86976164b4d0f961d3c7b666ab64c0078b466114 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 24 Jun 2020 15:18:58 -0700 Subject: [PATCH 0198/1025] add Tick, BaseOffset to tslibs namespace (#34963) --- pandas/_libs/tslibs/__init__.py | 4 +++- pandas/compat/pickle_compat.py | 6 +++--- pandas/core/arrays/_ranges.py | 6 ++---- pandas/core/arrays/datetimelike.py | 11 ++++++----- pandas/core/arrays/period.py | 9 ++++----- pandas/core/arrays/timedeltas.py | 4 +--- pandas/core/generic.py | 3 +-- pandas/core/indexes/datetimelike.py | 6 ++---- pandas/core/indexes/interval.py | 8 +++----- pandas/core/indexes/period.py | 8 +++----- pandas/core/window/rolling.py | 6 ++---- pandas/plotting/_matplotlib/timeseries.py | 5 ++--- pandas/tests/plotting/test_datetimelike.py | 9 ++++----- 
pandas/tests/tslibs/test_api.py | 2 ++ 14 files changed, 38 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 6f173a4542bb0..76e356370de70 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -16,13 +16,15 @@ "Timestamp", "tz_convert_single", "to_offset", + "Tick", + "BaseOffset", ] from . import dtypes from .conversion import localize_pydatetime from .nattype import NaT, NaTType, iNaT, is_null_datetimelike, nat_strings from .np_datetime import OutOfBoundsDatetime -from .offsets import to_offset +from .offsets import BaseOffset, Tick, to_offset from .period import IncompatibleFrequency, Period from .resolution import Resolution from .timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 8a2626f9a7e68..0484de3fa165d 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -9,9 +9,9 @@ from typing import TYPE_CHECKING, Optional import warnings -from pandas import Index +from pandas._libs.tslibs import BaseOffset -from pandas.tseries.offsets import DateOffset +from pandas import Index if TYPE_CHECKING: from pandas import Series, DataFrame @@ -42,7 +42,7 @@ def load_reduce(self): return except TypeError: pass - elif args and issubclass(args[0], DateOffset): + elif args and issubclass(args[0], BaseOffset): # TypeError: object.__new__(Day) is not safe, use Day.__new__() cls = args[0] stack[-1] = cls.__new__(*args) diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index 3b090ca458d88..14b442bf71080 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -7,16 +7,14 @@ import numpy as np -from pandas._libs.tslibs import OutOfBoundsDatetime, Timedelta, Timestamp - -from pandas.tseries.offsets import DateOffset +from pandas._libs.tslibs import BaseOffset, OutOfBoundsDatetime, Timedelta, Timestamp def 
generate_regular_range( start: Union[Timestamp, Timedelta], end: Union[Timestamp, Timedelta], periods: int, - freq: DateOffset, + freq: BaseOffset, ): """ Generate a range of dates or timestamps with the spans between dates diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1fea6ca1b8a3d..a306268cd8ede 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -7,10 +7,12 @@ from pandas._libs import algos, lib from pandas._libs.tslibs import ( + BaseOffset, NaT, NaTType, Period, Resolution, + Tick, Timestamp, delta_to_nanoseconds, iNaT, @@ -62,7 +64,6 @@ from pandas.core.ops.invalid import invalid_comparison, make_invalid_op from pandas.tseries import frequencies -from pandas.tseries.offsets import DateOffset, Tick DTScalarOrNaT = Union[DatetimeLikeScalar, NaTType] @@ -421,7 +422,7 @@ def _with_freq(self, freq): if freq is None: # Always valid pass - elif len(self) == 0 and isinstance(freq, DateOffset): + elif len(self) == 0 and isinstance(freq, BaseOffset): # Always valid. In the TimedeltaArray case, we assume this # is a Tick offset. pass @@ -1398,7 +1399,7 @@ def __add__(self, other): result = self._add_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_timedeltalike_scalar(other) - elif isinstance(other, DateOffset): + elif isinstance(other, BaseOffset): # specifically _not_ a Tick result = self._add_offset(other) elif isinstance(other, (datetime, np.datetime64)): @@ -1454,7 +1455,7 @@ def __sub__(self, other): result = self._sub_nat() elif isinstance(other, (Tick, timedelta, np.timedelta64)): result = self._add_timedeltalike_scalar(-other) - elif isinstance(other, DateOffset): + elif isinstance(other, BaseOffset): # specifically _not_ a Tick result = self._add_offset(-other) elif isinstance(other, (datetime, np.datetime64)): @@ -1778,7 +1779,7 @@ def maybe_infer_freq(freq): Whether we should inherit the freq of passed data. 
""" freq_infer = False - if not isinstance(freq, DateOffset): + if not isinstance(freq, BaseOffset): # if a passed freq is None, don't infer automatically if freq != "infer": freq = to_offset(freq) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 7902dd0410910..4b4df3445be4e 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -5,6 +5,7 @@ import numpy as np from pandas._libs.tslibs import ( + BaseOffset, NaT, NaTType, Timedelta, @@ -48,8 +49,6 @@ from pandas.core.arrays import datetimelike as dtl import pandas.core.common as com -from pandas.tseries.offsets import DateOffset - def _field_accessor(name: str, docstring=None): def f(self): @@ -280,7 +279,7 @@ def dtype(self) -> PeriodDtype: # error: Read-only property cannot override read-write property [misc] @property # type: ignore - def freq(self) -> DateOffset: + def freq(self) -> BaseOffset: """ Return the frequency object for this PeriodArray. """ @@ -656,7 +655,7 @@ def _addsub_int_array( res_values[self._isnan] = iNaT return type(self)(res_values, freq=self.freq) - def _add_offset(self, other: DateOffset): + def _add_offset(self, other: BaseOffset): assert not isinstance(other, Tick) if other.base != self.freq.base: @@ -784,7 +783,7 @@ def raise_on_incompatible(left, right): # GH#24283 error message format depends on whether right is scalar if isinstance(right, (np.ndarray, ABCTimedeltaArray)) or right is None: other_freq = None - elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, DateOffset)): + elif isinstance(right, (ABCPeriodIndex, PeriodArray, Period, BaseOffset)): other_freq = right.freqstr else: other_freq = delta_to_tick(Timedelta(right)).freqstr diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f33b569b3d1f7..a378423df788b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -4,7 +4,7 @@ import numpy as np from pandas._libs import lib, tslibs -from 
pandas._libs.tslibs import NaT, Period, Timedelta, Timestamp, iNaT, to_offset +from pandas._libs.tslibs import NaT, Period, Tick, Timedelta, Timestamp, iNaT, to_offset from pandas._libs.tslibs.conversion import precision_from_unit from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import array_to_timedelta64, parse_timedelta_unit @@ -35,8 +35,6 @@ from pandas.core.construction import extract_array from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.tseries.offsets import Tick - def _field_accessor(name, alias, docstring=None): def f(self): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 61361c3331d5e..eda1ba844b5ac 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -31,7 +31,7 @@ from pandas._config import config from pandas._libs import lib -from pandas._libs.tslibs import Timestamp, to_offset +from pandas._libs.tslibs import Tick, Timestamp, to_offset from pandas._typing import ( Axis, FilePathOrBuffer, @@ -101,7 +101,6 @@ from pandas.io.formats import format as fmt from pandas.io.formats.format import DataFrameFormatter, format_percentiles from pandas.io.formats.printing import pprint_thing -from pandas.tseries.offsets import Tick if TYPE_CHECKING: from pandas.core.resample import Resampler diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index ca6eb45e22c69..49b8ec3276e37 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -7,7 +7,7 @@ import numpy as np from pandas._libs import NaT, Timedelta, iNaT, join as libjoin, lib -from pandas._libs.tslibs import Resolution, timezones +from pandas._libs.tslibs import BaseOffset, Resolution, Tick, timezones from pandas._libs.tslibs.parsing import DateParseError from pandas._typing import Label from pandas.compat.numpy import function as nv @@ -44,8 +44,6 @@ from pandas.core.sorting import ensure_key_mapped from pandas.core.tools.timedeltas 
import to_timedelta -from pandas.tseries.offsets import DateOffset, Tick - _index_doc_kwargs = dict(ibase._index_doc_kwargs) _T = TypeVar("_T", bound="DatetimeIndexOpsMixin") @@ -91,7 +89,7 @@ class DatetimeIndexOpsMixin(ExtensionIndex): """ _data: Union[DatetimeArray, TimedeltaArray, PeriodArray] - freq: Optional[DateOffset] + freq: Optional[BaseOffset] freqstr: Optional[str] _resolution_obj: Resolution _bool_ops: List[str] = [] diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3be2bcd4888cb..f7a7b382b853f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -9,7 +9,7 @@ from pandas._libs import lib from pandas._libs.interval import Interval, IntervalMixin, IntervalTree -from pandas._libs.tslibs import Timedelta, Timestamp, to_offset +from pandas._libs.tslibs import BaseOffset, Timedelta, Timestamp, to_offset from pandas._typing import AnyArrayLike, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, Substitution, cache_readonly @@ -56,8 +56,6 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.core.ops import get_op_result_name -from pandas.tseries.offsets import DateOffset - _VALID_CLOSED = {"left", "right", "both", "neither"} _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -1161,8 +1159,8 @@ def _is_type_compatible(a, b) -> bool: """ Helper for interval_range to check type compat of start/end/freq. 
""" - is_ts_compat = lambda x: isinstance(x, (Timestamp, DateOffset)) - is_td_compat = lambda x: isinstance(x, (Timedelta, DateOffset)) + is_ts_compat = lambda x: isinstance(x, (Timestamp, BaseOffset)) + is_td_compat = lambda x: isinstance(x, (Timedelta, BaseOffset)) return ( (is_number(a) and is_number(b)) or (is_ts_compat(a) and is_ts_compat(b)) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 68c2b44b23964..03e11b652477f 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -5,7 +5,7 @@ from pandas._libs import index as libindex from pandas._libs.lib import no_default -from pandas._libs.tslibs import Period, Resolution +from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick from pandas._libs.tslibs.parsing import DateParseError, parse_time_string from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError @@ -43,8 +43,6 @@ from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name -from pandas.tseries.offsets import DateOffset, Tick - _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update(dict(target_klass="PeriodIndex or list of Periods")) @@ -145,7 +143,7 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): _is_numeric_dtype = False _data: PeriodArray - freq: DateOffset + freq: BaseOffset _engine_type = libindex.PeriodEngine _supports_partial_string_indexing = True @@ -287,7 +285,7 @@ def _maybe_convert_timedelta(self, other): # _check_timedeltalike_freq_compat will raise if incompatible delta = self._data._check_timedeltalike_freq_compat(other) return delta - elif isinstance(other, DateOffset): + elif isinstance(other, BaseOffset): if other.base == self.freq.base: return other.n diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 7d76f8b117b5e..8cb53ebd92214 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -10,7 +10,7 @@ import numpy as 
np -from pandas._libs.tslibs import to_offset +from pandas._libs.tslibs import BaseOffset, to_offset import pandas._libs.window.aggregations as window_aggregations from pandas._typing import Axis, FrameOrSeries, Scalar from pandas.compat._optional import import_optional_dependency @@ -55,8 +55,6 @@ ) from pandas.core.window.numba_ import generate_numba_apply_func -from pandas.tseries.offsets import DateOffset - def calculate_center_offset(window) -> int: """ @@ -1935,7 +1933,7 @@ def validate(self): # we allow rolling on a datetimelike index if (self.obj.empty or self.is_datetimelike) and isinstance( - self.window, (str, DateOffset, timedelta) + self.window, (str, BaseOffset, timedelta) ): self._validate_monotonic() diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 8ffd30567b9ac..8f3571cf13cbc 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -5,7 +5,7 @@ import numpy as np -from pandas._libs.tslibs import Period, to_offset +from pandas._libs.tslibs import BaseOffset, Period, to_offset from pandas._libs.tslibs.dtypes import FreqGroup from pandas._typing import FrameOrSeriesUnion @@ -22,7 +22,6 @@ TimeSeries_TimedeltaFormatter, ) from pandas.tseries.frequencies import get_period_alias, is_subperiod, is_superperiod -from pandas.tseries.offsets import DateOffset if TYPE_CHECKING: from pandas import Series, Index # noqa:F401 @@ -218,7 +217,7 @@ def _use_dynamic_x(ax, data: "FrameOrSeriesUnion") -> bool: return True -def _get_index_freq(index: "Index") -> Optional[DateOffset]: +def _get_index_freq(index: "Index") -> Optional[BaseOffset]: freq = getattr(index, "freq", None) if freq is None: freq = getattr(index, "inferred_freq", None) diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index fa129167a744f..201856669103a 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ 
b/pandas/tests/plotting/test_datetimelike.py @@ -6,18 +6,17 @@ import numpy as np import pytest -from pandas._libs.tslibs import to_offset +from pandas._libs.tslibs import BaseOffset, to_offset import pandas.util._test_decorators as td from pandas import DataFrame, Index, NaT, Series, isna import pandas._testing as tm -from pandas.core.indexes.datetimes import bdate_range, date_range +from pandas.core.indexes.datetimes import DatetimeIndex, bdate_range, date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range from pandas.core.indexes.timedeltas import timedelta_range -from pandas.core.resample import DatetimeIndex from pandas.tests.plotting.common import TestPlotBase -from pandas.tseries.offsets import DateOffset, WeekOfMonth +from pandas.tseries.offsets import WeekOfMonth @td.skip_if_no_mpl @@ -1509,7 +1508,7 @@ def _check_plot_works(f, freq=None, series=None, *args, **kwargs): ax = kwargs.pop("ax", plt.gca()) if series is not None: dfreq = series.index.freq - if isinstance(dfreq, DateOffset): + if isinstance(dfreq, BaseOffset): dfreq = dfreq.rule_code if orig_axfreq is None: assert ax.freq == dfreq diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index a119db6c68635..840a8c2fb68b1 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -25,6 +25,7 @@ def test_namespace(): ] api = [ + "BaseOffset", "NaT", "NaTType", "iNaT", @@ -34,6 +35,7 @@ def test_namespace(): "Period", "IncompatibleFrequency", "Resolution", + "Tick", "Timedelta", "Timestamp", "delta_to_nanoseconds", From 939ace4af49965a7aae0d2cf4ba4c949fcbf2a73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Veiga?= Date: Wed, 24 Jun 2020 23:25:20 +0100 Subject: [PATCH 0199/1025] ENH: Allow relative and/or absolute precision in assert_almost_equal (#30562) --- doc/source/whatsnew/v1.1.0.rst | 3 + pandas/_libs/testing.pyx | 58 ++--- pandas/_testing.py | 223 ++++++++++++++++-- pandas/tests/frame/test_analytics.py | 23 +- 
pandas/tests/groupby/test_function.py | 2 +- pandas/tests/io/json/test_ujson.py | 4 +- pandas/tests/io/test_sql.py | 4 +- pandas/tests/plotting/test_converter.py | 12 +- pandas/tests/test_algos.py | 4 +- pandas/tests/test_nanops.py | 20 +- pandas/tests/util/conftest.py | 4 +- pandas/tests/util/test_assert_almost_equal.py | 75 +++++- .../util/test_assert_extension_array_equal.py | 15 +- pandas/tests/util/test_assert_index_equal.py | 14 +- pandas/tests/util/test_assert_series_equal.py | 32 ++- 15 files changed, 354 insertions(+), 139 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7c9fa53568f45..40d2f0b07e10c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -802,6 +802,9 @@ Deprecations - The ``squeeze`` keyword in the ``groupby`` function is deprecated and will be removed in a future version (:issue:`32380`) - The ``tz`` keyword in :meth:`Period.to_timestamp` is deprecated and will be removed in a future version; use `per.to_timestamp(...).tz_localize(tz)`` instead (:issue:`34522`) - :meth:`DatetimeIndex.to_perioddelta` is deprecated and will be removed in a future version. Use ``index - index.to_period(freq).to_timestamp()`` instead (:issue:`34853`) +- :meth:`util.testing.assert_almost_equal` now accepts both relative and absolute + precision through the ``rtol``, and ``atol`` parameters, thus deprecating the + ``check_less_precise`` parameter. (:issue:`13357`). .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index ca18afebf410b..785a4d1f8b923 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -1,3 +1,5 @@ +import math + import numpy as np from numpy cimport import_array import_array() @@ -42,12 +44,6 @@ cdef bint is_dictlike(obj): return hasattr(obj, 'keys') and hasattr(obj, '__getitem__') -cdef bint decimal_almost_equal(double desired, double actual, int decimal): - # Code from - # https://numpy.org/doc/stable/reference/generated/numpy.testing.assert_almost_equal.html - return abs(desired - actual) < (0.5 * 10.0 ** -decimal) - - cpdef assert_dict_equal(a, b, bint compare_keys=True): assert is_dictlike(a) and is_dictlike(b), ( "Cannot compare dict objects, one or both is not dict-like" @@ -66,7 +62,7 @@ cpdef assert_dict_equal(a, b, bint compare_keys=True): cpdef assert_almost_equal(a, b, - check_less_precise=False, + rtol=1.e-5, atol=1.e-8, bint check_dtype=True, obj=None, lobj=None, robj=None, index_values=None): """ @@ -76,31 +72,33 @@ cpdef assert_almost_equal(a, b, ---------- a : object b : object - check_less_precise : bool or int, default False - Specify comparison precision. - 5 digits (False) or 3 digits (True) after decimal points are - compared. If an integer, then this will be the number of decimal - points to compare + rtol : float, default 1e-5 + Relative tolerance. + + .. versionadded:: 1.1.0 + atol : float, default 1e-8 + Absolute tolerance. + + .. versionadded:: 1.1.0 check_dtype: bool, default True - check dtype if both a and b are np.ndarray + check dtype if both a and b are np.ndarray. obj : str, default None Specify object name being compared, internally used to show - appropriate assertion message + appropriate assertion message. lobj : str, default None Specify left object name being compared, internally used to show - appropriate assertion message + appropriate assertion message. 
robj : str, default None Specify right object name being compared, internally used to show - appropriate assertion message + appropriate assertion message. index_values : ndarray, default None Specify shared index values of objects being compared, internally used - to show appropriate assertion message + to show appropriate assertion message. .. versionadded:: 1.1.0 """ cdef: - int decimal double diff = 0.0 Py_ssize_t i, na, nb double fa, fb @@ -111,8 +109,6 @@ cpdef assert_almost_equal(a, b, if robj is None: robj = b - assert isinstance(check_less_precise, (int, bool)) - if isinstance(a, dict) or isinstance(b, dict): return assert_dict_equal(a, b) @@ -170,8 +166,7 @@ cpdef assert_almost_equal(a, b, for i in range(len(a)): try: - assert_almost_equal(a[i], b[i], - check_less_precise=check_less_precise) + assert_almost_equal(a[i], b[i], rtol=rtol, atol=atol) except AssertionError: is_unequal = True diff += 1 @@ -203,24 +198,11 @@ cpdef assert_almost_equal(a, b, # inf comparison return True - if check_less_precise is True: - decimal = 3 - elif check_less_precise is False: - decimal = 5 - else: - decimal = check_less_precise - fa, fb = a, b - # case for zero - if abs(fa) < 1e-5: - if not decimal_almost_equal(fa, fb, decimal): - assert False, (f'(very low values) expected {fb:.5f} ' - f'but got {fa:.5f}, with decimal {decimal}') - else: - if not decimal_almost_equal(1, fb / fa, decimal): - assert False, (f'expected {fb:.5f} but got {fa:.5f}, ' - f'with decimal {decimal}') + if not math.isclose(fa, fb, rel_tol=rtol, abs_tol=atol): + assert False, (f"expected {fb:.5f} but got {fa:.5f}, " + f"with rtol={rtol}, atol={atol}") return True raise AssertionError(f"{a} != {b}") diff --git a/pandas/_testing.py b/pandas/_testing.py index ebb53dd81682c..fc6df7a95e348 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -22,6 +22,7 @@ set_locale, ) +from pandas._libs.lib import no_default import pandas._libs.testing as _testing from pandas._typing import Dtype, 
FilePathOrBuffer, FrameOrSeries from pandas.compat import _get_lzma_file, _import_lzma @@ -64,6 +65,7 @@ TimedeltaArray, period_array, ) +from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin from pandas.io.common import urlopen from pandas.io.formats.printing import pprint_thing @@ -303,11 +305,54 @@ def write_to_compressed(compression, path, data, dest="test"): getattr(f, method)(*args) +def _get_tol_from_less_precise(check_less_precise: Union[bool, int]) -> float: + """ + Return the tolerance equivalent to the deprecated `check_less_precise` + parameter. + + Parameters + ---------- + check_less_precise : bool or int + + Returns + ------- + float + Tolerance to be used as relative/absolute tolerance. + + Examples + -------- + >>> # Using check_less_precise as a bool: + >>> _get_tol_from_less_precise(False) + 0.5e-5 + >>> _get_tol_from_less_precise(True) + 0.5e-3 + >>> # Using check_less_precise as an int representing the decimal + >>> # tolerance intended: + >>> _get_tol_from_less_precise(2) + 0.5e-2 + >>> _get_tol_from_less_precise(8) + 0.5e-8 + + """ + if isinstance(check_less_precise, bool): + if check_less_precise: + # 3-digit tolerance + return 0.5e-3 + else: + # 5-digit tolerance + return 0.5e-5 + else: + # Equivalent to setting checking_less_precise= + return 0.5 * 10 ** -check_less_precise + + def assert_almost_equal( left, right, check_dtype: Union[bool, str] = "equiv", - check_less_precise: Union[bool, int] = False, + check_less_precise: Union[bool, int] = no_default, + rtol: float = 1.0e-5, + atol: float = 1.0e-8, **kwargs, ): """ @@ -334,14 +379,37 @@ def assert_almost_equal( they are equivalent within the specified precision. Otherwise, we compare the **ratio** of the second number to the first number and check whether it is equivalent to 1 within the specified precision. + + .. deprecated:: 1.1.0 + Use `rtol` and `atol` instead to define relative/absolute + tolerance, respectively. Similar to :func:`math.isclose`. 
+ rtol : float, default 1e-5 + Relative tolerance. + + .. versionadded:: 1.1.0 + atol : float, default 1e-8 + Absolute tolerance. + + .. versionadded:: 1.1.0 """ + if check_less_precise is not no_default: + warnings.warn( + "The 'check_less_precise' keyword in testing.assert_*_equal " + "is deprecated and will be removed in a future version. " + "You can stop passing 'check_less_precise' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + rtol = atol = _get_tol_from_less_precise(check_less_precise) + if isinstance(left, pd.Index): assert_index_equal( left, right, check_exact=False, exact=check_dtype, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, **kwargs, ) @@ -351,7 +419,8 @@ def assert_almost_equal( right, check_exact=False, check_dtype=check_dtype, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, **kwargs, ) @@ -361,7 +430,8 @@ def assert_almost_equal( right, check_exact=False, check_dtype=check_dtype, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, **kwargs, ) @@ -381,11 +451,7 @@ def assert_almost_equal( obj = "Input" assert_class_equal(left, right, obj=obj) _testing.assert_almost_equal( - left, - right, - check_dtype=check_dtype, - check_less_precise=check_less_precise, - **kwargs, + left, right, check_dtype=check_dtype, rtol=rtol, atol=atol, **kwargs ) @@ -596,9 +662,11 @@ def assert_index_equal( right: Index, exact: Union[bool, str] = "equiv", check_names: bool = True, - check_less_precise: Union[bool, int] = False, + check_less_precise: Union[bool, int] = no_default, check_exact: bool = True, check_categorical: bool = True, + rtol: float = 1.0e-5, + atol: float = 1.0e-8, obj: str = "Index", ) -> None: """ @@ -618,10 +686,22 @@ def assert_index_equal( Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the digits to compare. + + .. 
deprecated:: 1.1.0 + Use `rtol` and `atol` instead to define relative/absolute + tolerance, respectively. Similar to :func:`math.isclose`. check_exact : bool, default True Whether to compare number exactly. check_categorical : bool, default True Whether to compare internal Categorical exactly. + rtol : float, default 1e-5 + Relative tolerance. Only used when check_exact is False. + + .. versionadded:: 1.1.0 + atol : float, default 1e-8 + Absolute tolerance. Only used when check_exact is False. + + .. versionadded:: 1.1.0 obj : str, default 'Index' Specify object name being compared, internally used to show appropriate assertion message. @@ -650,6 +730,16 @@ def _get_ilevel_values(index, level): values = unique._shallow_copy(filled, name=index.names[level]) return values + if check_less_precise is not no_default: + warnings.warn( + "The 'check_less_precise' keyword in testing.assert_*_equal " + "is deprecated and will be removed in a future version. " + "You can stop passing 'check_less_precise' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + rtol = atol = _get_tol_from_less_precise(check_less_precise) + # instance validation _check_isinstance(left, right, Index) @@ -686,8 +776,9 @@ def _get_ilevel_values(index, level): rlevel, exact=exact, check_names=check_names, - check_less_precise=check_less_precise, check_exact=check_exact, + rtol=rtol, + atol=atol, obj=lobj, ) # get_level_values may change dtype @@ -703,7 +794,8 @@ def _get_ilevel_values(index, level): _testing.assert_almost_equal( left.values, right.values, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, check_dtype=exact, obj=obj, lobj=left, @@ -1028,9 +1120,11 @@ def assert_extension_array_equal( left, right, check_dtype=True, - check_less_precise=False, - check_exact=False, index_values=None, + check_less_precise=no_default, + check_exact=False, + rtol: float = 1.0e-5, + atol: float = 1.0e-8, ): """ Check that left and right ExtensionArrays are equal. 
@@ -1041,14 +1135,26 @@ def assert_extension_array_equal( The two arrays to compare. check_dtype : bool, default True Whether to check if the ExtensionArray dtypes are identical. + index_values : numpy.ndarray, default None + Optional index (shared by both left and right), used in output. check_less_precise : bool or int, default False Specify comparison precision. Only used when check_exact is False. 5 digits (False) or 3 digits (True) after decimal points are compared. If int, then specify the digits to compare. + + .. deprecated:: 1.1.0 + Use `rtol` and `atol` instead to define relative/absolute + tolerance, respectively. Similar to :func:`math.isclose`. check_exact : bool, default False Whether to compare number exactly. - index_values : numpy.ndarray, default None - Optional index (shared by both left and right), used in output. + rtol : float, default 1e-5 + Relative tolerance. Only used when check_exact is False. + + .. versionadded:: 1.1.0 + atol : float, default 1e-8 + Absolute tolerance. Only used when check_exact is False. + + .. versionadded:: 1.1.0 Notes ----- @@ -1056,12 +1162,26 @@ def assert_extension_array_equal( A mask of missing values is computed for each and checked to match. The remaining all-valid values are cast to object dtype and checked. """ + if check_less_precise is not no_default: + warnings.warn( + "The 'check_less_precise' keyword in testing.assert_*_equal " + "is deprecated and will be removed in a future version. 
" + "You can stop passing 'check_less_precise' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + rtol = atol = _get_tol_from_less_precise(check_less_precise) + assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" assert isinstance(right, ExtensionArray), "right is not an ExtensionArray" if check_dtype: assert_attr_equal("dtype", left, right, obj="ExtensionArray") - if hasattr(left, "asi8") and type(right) == type(left): + if ( + isinstance(left, DatetimeLikeArrayMixin) + and isinstance(right, DatetimeLikeArrayMixin) + and type(right) == type(left) + ): # Avoid slow object-dtype comparisons # np.asarray for case where we have a np.MaskedArray assert_numpy_array_equal( @@ -1086,7 +1206,8 @@ def assert_extension_array_equal( left_valid, right_valid, check_dtype=check_dtype, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, obj="ExtensionArray", index_values=index_values, ) @@ -1099,13 +1220,15 @@ def assert_series_equal( check_dtype=True, check_index_type="equiv", check_series_type=True, - check_less_precise=False, + check_less_precise=no_default, check_names=True, check_exact=False, check_datetimelike_compat=False, check_categorical=True, check_category_order=True, check_freq=True, + rtol=1.0e-5, + atol=1.0e-8, obj="Series", ): """ @@ -1132,6 +1255,10 @@ def assert_series_equal( they are equivalent within the specified precision. Otherwise, we compare the **ratio** of the second number to the first number and check whether it is equivalent to 1 within the specified precision. + + .. deprecated:: 1.1.0 + Use `rtol` and `atol` instead to define relative/absolute + tolerance, respectively. Similar to :func:`math.isclose`. check_names : bool, default True Whether to check the Series and Index names attribute. check_exact : bool, default False @@ -1146,6 +1273,12 @@ def assert_series_equal( .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. 
+ rtol : float, default 1e-5 + Relative tolerance. Only used when check_exact is False. + + .. versionadded:: 1.1.0 + atol : float, default 1e-8 + Absolute tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 obj : str, default 'Series' @@ -1154,6 +1287,16 @@ def assert_series_equal( """ __tracebackhide__ = True + if check_less_precise is not no_default: + warnings.warn( + "The 'check_less_precise' keyword in testing.assert_*_equal " + "is deprecated and will be removed in a future version. " + "You can stop passing 'check_less_precise' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + rtol = atol = _get_tol_from_less_precise(check_less_precise) + # instance validation _check_isinstance(left, right, Series) @@ -1172,9 +1315,10 @@ def assert_series_equal( right.index, exact=check_index_type, check_names=check_names, - check_less_precise=check_less_precise, check_exact=check_exact, check_categorical=check_categorical, + rtol=rtol, + atol=atol, obj=f"{obj}.index", ) if check_freq and isinstance(left.index, (pd.DatetimeIndex, pd.TimedeltaIndex)): @@ -1227,7 +1371,8 @@ def assert_series_equal( _testing.assert_almost_equal( left._values, right._values, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), @@ -1245,7 +1390,8 @@ def assert_series_equal( _testing.assert_almost_equal( left._values, right._values, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, check_dtype=check_dtype, obj=str(obj), index_values=np.asarray(left.index), @@ -1273,7 +1419,7 @@ def assert_frame_equal( check_index_type="equiv", check_column_type="equiv", check_frame_type=True, - check_less_precise=False, + check_less_precise=no_default, check_names=True, by_blocks=False, check_exact=False, @@ -1281,6 +1427,8 @@ def assert_frame_equal( check_categorical=True, check_like=False, check_freq=True, + rtol=1.0e-5, + atol=1.0e-8, obj="DataFrame", ): """ @@ -1318,6 
+1466,10 @@ def assert_frame_equal( they are equivalent within the specified precision. Otherwise, we compare the **ratio** of the second number to the first number and check whether it is equivalent to 1 within the specified precision. + + .. deprecated:: 1.1.0 + Use `rtol` and `atol` instead to define relative/absolute + tolerance, respectively. Similar to :func:`math.isclose`. check_names : bool, default True Whether to check that the `names` attribute for both the `index` and `column` attributes of the DataFrame is identical. @@ -1336,6 +1488,12 @@ def assert_frame_equal( (same as in columns) - same labels must be with the same data. check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + rtol : float, default 1e-5 + Relative tolerance. Only used when check_exact is False. + + .. versionadded:: 1.1.0 + atol : float, default 1e-8 + Absolute tolerance. Only used when check_exact is False. .. versionadded:: 1.1.0 obj : str, default 'DataFrame' @@ -1377,6 +1535,16 @@ def assert_frame_equal( """ __tracebackhide__ = True + if check_less_precise is not no_default: + warnings.warn( + "The 'check_less_precise' keyword in testing.assert_*_equal " + "is deprecated and will be removed in a future version. 
" + "You can stop passing 'check_less_precise' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + rtol = atol = _get_tol_from_less_precise(check_less_precise) + # instance validation _check_isinstance(left, right, DataFrame) @@ -1399,9 +1567,10 @@ def assert_frame_equal( right.index, exact=check_index_type, check_names=check_names, - check_less_precise=check_less_precise, check_exact=check_exact, check_categorical=check_categorical, + rtol=rtol, + atol=atol, obj=f"{obj}.index", ) @@ -1411,9 +1580,10 @@ def assert_frame_equal( right.columns, exact=check_column_type, check_names=check_names, - check_less_precise=check_less_precise, check_exact=check_exact, check_categorical=check_categorical, + rtol=rtol, + atol=atol, obj=f"{obj}.columns", ) @@ -1439,13 +1609,14 @@ def assert_frame_equal( rcol, check_dtype=check_dtype, check_index_type=check_index_type, - check_less_precise=check_less_precise, check_exact=check_exact, check_names=check_names, check_datetimelike_compat=check_datetimelike_compat, check_categorical=check_categorical, check_freq=check_freq, obj=f'{obj}.iloc[:, {i}] (column name="{col}")', + rtol=rtol, + atol=atol, ) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index db21161f84cf7..db8bb5ca3c437 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -32,7 +32,8 @@ def assert_stat_op_calc( has_skipna=True, check_dtype=True, check_dates=False, - check_less_precise=False, + rtol=1e-5, + atol=1e-8, skipna_alternative=None, ): """ @@ -54,9 +55,10 @@ def assert_stat_op_calc( "alternative(frame)" should be checked. check_dates : bool, default false Whether opname should be tested on a Datetime Series - check_less_precise : bool, default False - Whether results should only be compared approximately; - passed on to tm.assert_series_equal + rtol : float, default 1e-5 + Relative tolerance. + atol : float, default 1e-8 + Absolute tolerance. 
skipna_alternative : function, default None NaN-safe version of alternative """ @@ -87,14 +89,16 @@ def wrapper(x): result0, frame.apply(wrapper), check_dtype=check_dtype, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, ) # HACK: win32 tm.assert_series_equal( result1, frame.apply(wrapper, axis=1), check_dtype=False, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, ) else: skipna_wrapper = alternative @@ -105,13 +109,14 @@ def wrapper(x): result0, frame.apply(skipna_wrapper), check_dtype=check_dtype, - check_less_precise=check_less_precise, + rtol=rtol, + atol=atol, ) if opname in ["sum", "prod"]: expected = frame.apply(skipna_wrapper, axis=1) tm.assert_series_equal( - result1, expected, check_dtype=False, check_less_precise=check_less_precise + result1, expected, check_dtype=False, rtol=rtol, atol=atol, ) # check dtypes @@ -339,7 +344,7 @@ def kurt(x): np.sum, mixed_float_frame.astype("float32"), check_dtype=False, - check_less_precise=True, + rtol=1e-3, ) assert_stat_op_calc( diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 9303a084f1e71..6f19ec40c2520 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -649,7 +649,7 @@ def test_nlargest_mi_grouper(): ] expected = Series(exp_values, index=exp_idx) - tm.assert_series_equal(result, expected, check_exact=False) + tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3) def test_nsmallest(): diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 7dc73d5be1538..7b6acf7eed685 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -1081,9 +1081,7 @@ def test_decode_array_with_big_int(self): @pytest.mark.parametrize("sign", [-1, 1]) def test_decode_floating_point(self, sign, float_number): float_number *= sign - tm.assert_almost_equal( - float_number, ujson.loads(str(float_number)), check_less_precise=15 - ) 
+ tm.assert_almost_equal(float_number, ujson.loads(str(float_number)), rtol=1e-15) def test_encode_big_set(self): s = set() diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 70713768c8d1e..a07e7a74b7573 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2389,7 +2389,7 @@ def test_write_row_by_row(self): result = sql.read_sql("select * from test", con=self.conn) result.index = frame.index - tm.assert_frame_equal(result, frame, check_less_precise=True) + tm.assert_frame_equal(result, frame, rtol=1e-3) def test_execute(self): frame = tm.makeTimeDataFrame() @@ -2649,7 +2649,7 @@ def test_write_row_by_row(self): result = sql.read_sql("select * from test", con=self.conn) result.index = frame.index - tm.assert_frame_equal(result, frame, check_less_precise=True) + tm.assert_frame_equal(result, frame, rtol=1e-3) # GH#32571 result comes back rounded to 6 digits in some builds; # no obvious pattern diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index e54f4784e9c4f..df2c9ecbd7a0a 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -201,19 +201,19 @@ def test_conversion(self): assert rs[1] == xp def test_conversion_float(self): - decimals = 9 + rtol = 0.5 * 10 ** -9 rs = self.dtc.convert(Timestamp("2012-1-1 01:02:03", tz="UTC"), None, None) xp = converter.dates.date2num(Timestamp("2012-1-1 01:02:03", tz="UTC")) - tm.assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, rtol=rtol) rs = self.dtc.convert( Timestamp("2012-1-1 09:02:03", tz="Asia/Hong_Kong"), None, None ) - tm.assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, rtol=rtol) rs = self.dtc.convert(datetime(2012, 1, 1, 1, 2, 3), None, None) - tm.assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, rtol=rtol) def test_conversion_outofbounds_datetime(self): # 2579 @@ -249,13 +249,13 @@ def test_time_formatter(self, 
time, format_expected): assert result == format_expected def test_dateindex_conversion(self): - decimals = 9 + rtol = 10 ** -9 for freq in ("B", "L", "S"): dateindex = tm.makeDateIndex(k=10, freq=freq) rs = self.dtc.convert(dateindex, None, None) xp = converter.dates.date2num(dateindex._mpl_repr()) - tm.assert_almost_equal(rs, xp, decimals) + tm.assert_almost_equal(rs, xp, rtol=rtol) def test_resolution(self): def _assert_less(ts1, ts2): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 44a8452964f5a..a080bf0feaebc 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1472,7 +1472,7 @@ def test_group_var_generic_2d_some_nan(self): expected_counts = counts + 2 self.algo(out, counts, values, labels) - tm.assert_almost_equal(out, expected_out, check_less_precise=6) + tm.assert_almost_equal(out, expected_out, rtol=0.5e-06) tm.assert_numpy_array_equal(counts, expected_counts) def test_group_var_constant(self): @@ -1510,7 +1510,7 @@ def test_group_var_large_inputs(self): self.algo(out, counts, values, labels) assert counts[0] == 10 ** 6 - tm.assert_almost_equal(out[0, 0], 1.0 / 12, check_less_precise=True) + tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3) class TestGroupVarFloat32(GroupVarTestMixin): diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index cac6a59527a6e..0d60e6e8a978f 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -782,27 +782,27 @@ def setup_method(self, method): def test_nanvar_all_finite(self): samples = self.samples actual_variance = nanops.nanvar(samples) - tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2) + tm.assert_almost_equal(actual_variance, self.variance, rtol=1e-2) def test_nanvar_nans(self): samples = np.nan * np.ones(2 * self.samples.shape[0]) samples[::2] = self.samples actual_variance = nanops.nanvar(samples, skipna=True) - tm.assert_almost_equal(actual_variance, self.variance, check_less_precise=2) + 
tm.assert_almost_equal(actual_variance, self.variance, rtol=1e-2) actual_variance = nanops.nanvar(samples, skipna=False) - tm.assert_almost_equal(actual_variance, np.nan, check_less_precise=2) + tm.assert_almost_equal(actual_variance, np.nan, rtol=1e-2) def test_nanstd_nans(self): samples = np.nan * np.ones(2 * self.samples.shape[0]) samples[::2] = self.samples actual_std = nanops.nanstd(samples, skipna=True) - tm.assert_almost_equal(actual_std, self.variance ** 0.5, check_less_precise=2) + tm.assert_almost_equal(actual_std, self.variance ** 0.5, rtol=1e-2) actual_std = nanops.nanvar(samples, skipna=False) - tm.assert_almost_equal(actual_std, np.nan, check_less_precise=2) + tm.assert_almost_equal(actual_std, np.nan, rtol=1e-2) def test_nanvar_axis(self): # Generate some sample data. @@ -812,7 +812,7 @@ def test_nanvar_axis(self): actual_variance = nanops.nanvar(samples, axis=1) tm.assert_almost_equal( - actual_variance, np.array([self.variance, 1.0 / 12]), check_less_precise=2 + actual_variance, np.array([self.variance, 1.0 / 12]), rtol=1e-2 ) def test_nanvar_ddof(self): @@ -826,15 +826,13 @@ def test_nanvar_ddof(self): # The unbiased estimate. var = 1.0 / 12 - tm.assert_almost_equal(variance_1, var, check_less_precise=2) + tm.assert_almost_equal(variance_1, var, rtol=1e-2) # The underestimated variance. - tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, check_less_precise=2) + tm.assert_almost_equal(variance_0, (n - 1.0) / n * var, rtol=1e-2) # The overestimated variance. - tm.assert_almost_equal( - variance_2, (n - 1.0) / (n - 2.0) * var, check_less_precise=2 - ) + tm.assert_almost_equal(variance_2, (n - 1.0) / (n - 2.0) * var, rtol=1e-2) def test_ground_truth(self): # Test against values that were precomputed with Numpy. 
diff --git a/pandas/tests/util/conftest.py b/pandas/tests/util/conftest.py index 5eff49ab774b5..b68bcc93431d0 100644 --- a/pandas/tests/util/conftest.py +++ b/pandas/tests/util/conftest.py @@ -16,8 +16,8 @@ def check_index_type(request): return request.param -@pytest.fixture(params=[True, False]) -def check_less_precise(request): +@pytest.fixture(params=[0.5e-3, 0.5e-5]) +def rtol(request): return request.param diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index b8048891e4876..c25668c33bfc4 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -17,7 +17,7 @@ def _assert_almost_equal_both(a, b, **kwargs): The first object to compare. b : object The second object to compare. - kwargs : dict + **kwargs The arguments passed to `tm.assert_almost_equal`. """ tm.assert_almost_equal(a, b, **kwargs) @@ -34,7 +34,7 @@ def _assert_not_almost_equal(a, b, **kwargs): The first object to compare. b : object The second object to compare. - kwargs : dict + **kwargs The arguments passed to `tm.assert_almost_equal`. """ try: @@ -57,13 +57,23 @@ def _assert_not_almost_equal_both(a, b, **kwargs): The first object to compare. b : object The second object to compare. - kwargs : dict + **kwargs The arguments passed to `tm.assert_almost_equal`. 
""" _assert_not_almost_equal(a, b, **kwargs) _assert_not_almost_equal(b, a, **kwargs) +@pytest.mark.parametrize( + "a,b,check_less_precise", + [(1.1, 1.1, False), (1.1, 1.100001, True), (1.1, 1.1001, 2)], +) +def test_assert_almost_equal_deprecated(a, b, check_less_precise): + # GH#30562 + with tm.assert_produces_warning(FutureWarning): + _assert_almost_equal_both(a, b, check_less_precise=check_less_precise) + + @pytest.mark.parametrize( "a,b", [ @@ -78,12 +88,65 @@ def test_assert_almost_equal_numbers(a, b): _assert_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [(1.1, 1), (1.1, True), (1, 2), (1.0001, np.int16(1))]) +@pytest.mark.parametrize( + "a,b", + [ + (1.1, 1), + (1.1, True), + (1, 2), + (1.0001, np.int16(1)), + # The following two examples are not "almost equal" due to tol. + (0.1, 0.1001), + (0.0011, 0.0012), + ], +) def test_assert_not_almost_equal_numbers(a, b): _assert_not_almost_equal_both(a, b) -@pytest.mark.parametrize("a,b", [(0, 0), (0, 0.0), (0, np.float64(0)), (0.000001, 0)]) +@pytest.mark.parametrize( + "a,b", + [ + (1.1, 1.1), + (1.1, 1.100001), + (1.1, 1.1001), + (0.000001, 0.000005), + (1000.0, 1000.0005), + # Testing this example, as per #13357 + (0.000011, 0.000012), + ], +) +def test_assert_almost_equal_numbers_atol(a, b): + # Equivalent to the deprecated check_less_precise=True + _assert_almost_equal_both(a, b, rtol=0.5e-3, atol=0.5e-3) + + +@pytest.mark.parametrize("a,b", [(1.1, 1.11), (0.1, 0.101), (0.000011, 0.001012)]) +def test_assert_not_almost_equal_numbers_atol(a, b): + _assert_not_almost_equal_both(a, b, atol=1e-3) + + +@pytest.mark.parametrize( + "a,b", + [ + (1.1, 1.1), + (1.1, 1.100001), + (1.1, 1.1001), + (1000.0, 1000.0005), + (1.1, 1.11), + (0.1, 0.101), + ], +) +def test_assert_almost_equal_numbers_rtol(a, b): + _assert_almost_equal_both(a, b, rtol=0.05) + + +@pytest.mark.parametrize("a,b", [(0.000011, 0.000012), (0.000001, 0.000005)]) +def test_assert_not_almost_equal_numbers_rtol(a, b): + 
_assert_not_almost_equal_both(a, b, rtol=0.05) + + +@pytest.mark.parametrize("a,b", [(0, 0), (0, 0.0), (0, np.float64(0)), (0.00000001, 0)]) def test_assert_almost_equal_numbers_with_zeros(a, b): _assert_almost_equal_both(a, b) @@ -235,7 +298,7 @@ def test_assert_almost_equal_object(): def test_assert_almost_equal_value_mismatch(): - msg = "expected 2\\.00000 but got 1\\.00000, with decimal 5" + msg = "expected 2\\.00000 but got 1\\.00000, with rtol=1e-05, atol=1e-08" with pytest.raises(AssertionError, match=msg): tm.assert_almost_equal(1, 2) diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index 0547323b882f6..d9fdf1491c328 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -32,16 +32,13 @@ def test_assert_extension_array_equal_not_exact(kwargs): tm.assert_extension_array_equal(arr1, arr2, **kwargs) -@pytest.mark.parametrize( - "check_less_precise", [True, False, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9] -) -def test_assert_extension_array_equal_less_precise(check_less_precise): +@pytest.mark.parametrize("decimals", range(10)) +def test_assert_extension_array_equal_less_precise(decimals): + rtol = 0.5 * 10 ** -decimals arr1 = SparseArray([0.5, 0.123456]) arr2 = SparseArray([0.5, 0.123457]) - kwargs = dict(check_less_precise=check_less_precise) - - if check_less_precise is False or check_less_precise >= 5: + if decimals >= 5: msg = """\ ExtensionArray are different @@ -50,9 +47,9 @@ def test_assert_extension_array_equal_less_precise(check_less_precise): \\[right\\]: \\[0\\.5, 0\\.123457\\]""" with pytest.raises(AssertionError, match=msg): - tm.assert_extension_array_equal(arr1, arr2, **kwargs) + tm.assert_extension_array_equal(arr1, arr2, rtol=rtol) else: - tm.assert_extension_array_equal(arr1, arr2, **kwargs) + tm.assert_extension_array_equal(arr1, arr2, rtol=rtol) def 
test_assert_extension_array_equal_dtype_mismatch(check_dtype): diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index bbbeebcec2569..125af6ef78593 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -82,12 +82,12 @@ def test_index_equal_values_close(check_exact): tm.assert_index_equal(idx1, idx2, check_exact=check_exact) -def test_index_equal_values_less_close(check_exact, check_less_precise): +def test_index_equal_values_less_close(check_exact, rtol): idx1 = Index([1, 2, 3.0]) idx2 = Index([1, 2, 3.0001]) - kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) + kwargs = dict(check_exact=check_exact, rtol=rtol) - if check_exact or not check_less_precise: + if check_exact or rtol < 0.5e-3: msg = """Index are different Index values are different \\(33\\.33333 %\\) @@ -100,10 +100,10 @@ def test_index_equal_values_less_close(check_exact, check_less_precise): tm.assert_index_equal(idx1, idx2, **kwargs) -def test_index_equal_values_too_far(check_exact, check_less_precise): +def test_index_equal_values_too_far(check_exact, rtol): idx1 = Index([1, 2, 3]) idx2 = Index([1, 2, 4]) - kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) + kwargs = dict(check_exact=check_exact, rtol=rtol) msg = """Index are different @@ -115,10 +115,10 @@ def test_index_equal_values_too_far(check_exact, check_less_precise): tm.assert_index_equal(idx1, idx2, **kwargs) -def test_index_equal_level_values_mismatch(check_exact, check_less_precise): +def test_index_equal_level_values_mismatch(check_exact, rtol): idx1 = MultiIndex.from_tuples([("A", 2), ("A", 2), ("B", 3), ("B", 4)]) idx2 = MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3), ("B", 4)]) - kwargs = dict(check_exact=check_exact, check_less_precise=check_less_precise) + kwargs = dict(check_exact=check_exact, rtol=rtol) msg = """MultiIndex level \\[1\\] are different diff 
--git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 337a06b91e443..859c8474562a3 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -102,22 +102,20 @@ def test_series_not_equal_metadata_mismatch(kwargs): @pytest.mark.parametrize("data1,data2", [(0.12345, 0.12346), (0.1235, 0.1236)]) @pytest.mark.parametrize("dtype", ["float32", "float64"]) -@pytest.mark.parametrize("check_less_precise", [False, True, 0, 1, 2, 3, 10]) -def test_less_precise(data1, data2, dtype, check_less_precise): +@pytest.mark.parametrize("decimals", [0, 1, 2, 3, 5, 10]) +def test_less_precise(data1, data2, dtype, decimals): + rtol = 10 ** -decimals s1 = Series([data1], dtype=dtype) s2 = Series([data2], dtype=dtype) - kwargs = dict(check_less_precise=check_less_precise) - - if (check_less_precise is False or check_less_precise == 10) or ( - (check_less_precise is True or check_less_precise >= 3) - and abs(data1 - data2) >= 0.0001 + if (decimals == 5 or decimals == 10) or ( + decimals >= 3 and abs(data1 - data2) >= 0.0005 ): msg = "Series values are different" with pytest.raises(AssertionError, match=msg): - tm.assert_series_equal(s1, s2, **kwargs) + tm.assert_series_equal(s1, s2, rtol=rtol) else: - _assert_series_equal_both(s1, s2, **kwargs) + _assert_series_equal_both(s1, s2, rtol=rtol) @pytest.mark.parametrize( @@ -151,7 +149,7 @@ def test_series_equal_index_dtype(s1, s2, msg, check_index_type): tm.assert_series_equal(s1, s2, **kwargs) -def test_series_equal_length_mismatch(check_less_precise): +def test_series_equal_length_mismatch(rtol): msg = """Series are different Series length are different @@ -162,10 +160,10 @@ def test_series_equal_length_mismatch(check_less_precise): s2 = Series([1, 2, 3, 4]) with pytest.raises(AssertionError, match=msg): - tm.assert_series_equal(s1, s2, check_less_precise=check_less_precise) + tm.assert_series_equal(s1, s2, rtol=rtol) -def 
test_series_equal_numeric_values_mismatch(check_less_precise): +def test_series_equal_numeric_values_mismatch(rtol): msg = """Series are different Series values are different \\(33\\.33333 %\\) @@ -177,10 +175,10 @@ def test_series_equal_numeric_values_mismatch(check_less_precise): s2 = Series([1, 2, 4]) with pytest.raises(AssertionError, match=msg): - tm.assert_series_equal(s1, s2, check_less_precise=check_less_precise) + tm.assert_series_equal(s1, s2, rtol=rtol) -def test_series_equal_categorical_values_mismatch(check_less_precise): +def test_series_equal_categorical_values_mismatch(rtol): msg = """Series are different Series values are different \\(66\\.66667 %\\) @@ -194,10 +192,10 @@ def test_series_equal_categorical_values_mismatch(check_less_precise): s2 = Series(Categorical(["a", "c", "b"])) with pytest.raises(AssertionError, match=msg): - tm.assert_series_equal(s1, s2, check_less_precise=check_less_precise) + tm.assert_series_equal(s1, s2, rtol=rtol) -def test_series_equal_datetime_values_mismatch(check_less_precise): +def test_series_equal_datetime_values_mismatch(rtol): msg = """numpy array are different numpy array values are different \\(100.0 %\\) @@ -209,7 +207,7 @@ def test_series_equal_datetime_values_mismatch(check_less_precise): s2 = Series(pd.date_range("2019-02-02", periods=3, freq="D")) with pytest.raises(AssertionError, match=msg): - tm.assert_series_equal(s1, s2, check_less_precise=check_less_precise) + tm.assert_series_equal(s1, s2, rtol=rtol) def test_series_equal_categorical_mismatch(check_categorical): From d22aca90412407e9937079590e639aa878afbac4 Mon Sep 17 00:00:00 2001 From: David Rouquet Date: Thu, 25 Jun 2020 00:35:58 +0200 Subject: [PATCH 0200/1025] Add test for #32108 (error with groupby on series with period index) (#33105) --- pandas/tests/groupby/test_timegrouper.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 
06a83f4c000cf..84fd7a1bdfb05 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -769,3 +769,17 @@ def test_scalar_call_versus_list_call(self): expected = grouped.count() tm.assert_frame_equal(result, expected) + + def test_grouper_period_index(self): + # GH 32108 + periods = 2 + index = pd.period_range( + start="2018-01", periods=periods, freq="M", name="Month" + ) + period_series = pd.Series(range(periods), index=index) + result = period_series.groupby(period_series.index.month).sum() + + expected = pd.Series( + range(0, periods), index=Index(range(1, periods + 1), name=index.name), + ) + tm.assert_series_equal(result, expected) From 5d8bc1e3c0a639ab25d7e0a40224423403a7d2e0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 Jun 2020 17:36:59 -0500 Subject: [PATCH 0201/1025] PERF: Fixed cut regression, improve Categorical (#34952) --- asv_bench/benchmarks/categoricals.py | 4 ++ doc/source/whatsnew/v1.1.0.rst | 2 + pandas/core/arrays/categorical.py | 5 +++ .../arrays/categorical/test_constructors.py | 42 +++++++++++++++++++ 4 files changed, 53 insertions(+) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 107b9b9edcd5d..a0b24342091ec 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -34,6 +34,7 @@ def setup(self): self.values_all_int8 = np.ones(N, "int8") self.categorical = pd.Categorical(self.values, self.categories) self.series = pd.Series(self.categorical) + self.intervals = pd.interval_range(0, 1, periods=N // 10) def time_regular(self): pd.Categorical(self.values, self.categories) @@ -44,6 +45,9 @@ def time_fastpath(self): def time_datetimes(self): pd.Categorical(self.datetimes) + def time_interval(self): + pd.Categorical(self.datetimes, categories=self.datetimes) + def time_datetimes_with_nat(self): pd.Categorical(self.datetimes_with_nat) diff --git a/doc/source/whatsnew/v1.1.0.rst 
b/doc/source/whatsnew/v1.1.0.rst index 40d2f0b07e10c..b25a310a15d19 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -828,6 +828,8 @@ Performance improvements - Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.Groupby.first` and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`) - Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). +- Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`) +- Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`) - Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). - Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) - Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 80fe1ac7ce619..3d469ec28b9c4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2611,6 +2611,11 @@ def _get_codes_for_values(values, categories): values = ensure_object(values) categories = ensure_object(categories) + if isinstance(categories, ABCIndexClass): + return coerce_indexer_dtype(categories.get_indexer_for(values), categories) + + # Only hit here when we've already coerced to object dtypee. 
+ hash_klass, vals = _get_data_algo(values) _, cats = _get_data_algo(categories) t = hash_klass(len(cats)) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 9be741274c15a..ca942c9288898 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -643,3 +643,45 @@ def test_constructor_string_and_tuples(self): c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) expected_index = pd.Index([("a", "b"), ("b", "a"), "c"]) assert c.categories.equals(expected_index) + + def test_interval(self): + idx = pd.interval_range(0, 10, periods=10) + cat = pd.Categorical(idx, categories=idx) + expected_codes = np.arange(10, dtype="int8") + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # infer categories + cat = pd.Categorical(idx) + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # list values + cat = pd.Categorical(list(idx)) + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # list values, categories + cat = pd.Categorical(list(idx), categories=list(idx)) + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # shuffled + values = idx.take([1, 2, 0]) + cat = pd.Categorical(values, categories=idx) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8")) + tm.assert_index_equal(cat.categories, idx) + + # extra + values = pd.interval_range(8, 11, periods=3) + cat = pd.Categorical(values, categories=idx) + expected_codes = np.array([8, 9, -1], dtype="int8") + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) + + # overlapping + idx = pd.IntervalIndex([pd.Interval(0, 2), pd.Interval(0, 1)]) + cat = pd.Categorical(idx, categories=idx) 
+ expected_codes = np.array([0, 1], dtype="int8") + tm.assert_numpy_array_equal(cat.codes, expected_codes) + tm.assert_index_equal(cat.categories, idx) From 3993306abe63e2f249bf4ebd09a0f53d1289785f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 24 Jun 2020 23:40:20 +0100 Subject: [PATCH 0202/1025] BUG: repr of Categorical does not distinguish int and str. (#34222) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/algorithms.py | 4 +- pandas/core/arrays/base.py | 12 +- pandas/core/arrays/categorical.py | 105 +++++++++--------- pandas/core/base.py | 8 +- pandas/core/construction.py | 12 +- pandas/core/dtypes/concat.py | 20 ++-- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/reshape/tile.py | 8 +- pandas/core/series.py | 12 +- pandas/io/formats/format.py | 19 +++- pandas/tests/arrays/categorical/test_repr.py | 31 ++++-- pandas/tests/series/test_repr.py | 4 +- pandas/tests/util/test_assert_series_equal.py | 8 +- web/pandas/community/blog/extension-arrays.md | 6 +- 15 files changed, 138 insertions(+), 114 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b25a310a15d19..19f029d6aed68 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -851,6 +851,7 @@ Categorical - Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`) - Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`) - :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`) +- Repr of :class:`Categorical` was not distinguishing between int and str (:issue:`33676`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index dcf2015245518..9e3ca4cc53363 100644 --- 
a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -604,8 +604,8 @@ def factorize( >>> codes array([0, 0, 1]...) >>> uniques - [a, c] - Categories (3, object): [a, b, c] + ['a', 'c'] + Categories (3, object): ['a', 'b', 'c'] Notice that ``'b'`` is in ``uniques.categories``, despite not being present in ``cat.values``. diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7f2c61ff7d955..5565b85f8d59a 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -846,14 +846,14 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray" -------- >>> cat = pd.Categorical(['a', 'b', 'c']) >>> cat - [a, b, c] - Categories (3, object): [a, b, c] + ['a', 'b', 'c'] + Categories (3, object): ['a', 'b', 'c'] >>> cat.repeat(2) - [a, a, b, b, c, c] - Categories (3, object): [a, b, c] + ['a', 'a', 'b', 'b', 'c', 'c'] + Categories (3, object): ['a', 'b', 'c'] >>> cat.repeat([1, 2, 3]) - [a, b, b, c, c, c] - Categories (3, object): [a, b, c] + ['a', 'b', 'b', 'c', 'c', 'c'] + Categories (3, object): ['a', 'b', 'c'] """ @Substitution(klass="ExtensionArray") diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3d469ec28b9c4..1fedfa70cc469 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1,3 +1,5 @@ +from csv import QUOTE_NONNUMERIC +from functools import partial import operator from shutil import get_terminal_size from typing import Dict, Hashable, List, Type, Union, cast @@ -275,8 +277,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject): Categories (3, int64): [1, 2, 3] >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) - [a, b, c, a, b, c] - Categories (3, object): [a, b, c] + ['a', 'b', 'c', 'a', 'b', 'c'] + Categories (3, object): ['a', 'b', 'c'] Ordered `Categoricals` can be sorted according to the custom order of the categories and can have a min and max value. 
@@ -284,8 +286,8 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject): >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, ... categories=['c', 'b', 'a']) >>> c - [a, b, c, a, b, c] - Categories (3, object): [c < b < a] + ['a', 'b', 'c', 'a', 'b', 'c'] + Categories (3, object): ['c' < 'b' < 'a'] >>> c.min() 'c' """ @@ -598,8 +600,8 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): -------- >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) - [a, b, a, b] - Categories (2, object): [a < b] + ['a', 'b', 'a', 'b'] + Categories (2, object): ['a' < 'b'] """ dtype = CategoricalDtype._from_values_or_dtype( categories=categories, ordered=ordered, dtype=dtype @@ -659,13 +661,13 @@ def _set_categories(self, categories, fastpath=False): -------- >>> c = pd.Categorical(['a', 'b']) >>> c - [a, b] - Categories (2, object): [a, b] + ['a', 'b'] + Categories (2, object): ['a', 'b'] >>> c._set_categories(pd.Index(['a', 'c'])) >>> c - [a, c] - Categories (2, object): [a, c] + ['a', 'c'] + Categories (2, object): ['a', 'c'] """ if fastpath: new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) @@ -885,14 +887,14 @@ def rename_categories(self, new_categories, inplace=False): categories not in the dictionary are passed through >>> c.rename_categories({'a': 'A', 'c': 'C'}) - [A, A, b] - Categories (2, object): [A, b] + ['A', 'A', 'b'] + Categories (2, object): ['A', 'b'] You may also provide a callable to create the new categories >>> c.rename_categories(lambda x: x.upper()) - [A, A, B] - Categories (2, object): [A, B] + ['A', 'A', 'B'] + Categories (2, object): ['A', 'B'] """ inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() @@ -1128,22 +1130,22 @@ def map(self, mapper): -------- >>> cat = pd.Categorical(['a', 'b', 'c']) >>> cat - [a, b, c] - Categories (3, object): [a, b, c] + ['a', 'b', 'c'] + Categories (3, 
object): ['a', 'b', 'c'] >>> cat.map(lambda x: x.upper()) - [A, B, C] - Categories (3, object): [A, B, C] + ['A', 'B', 'C'] + Categories (3, object): ['A', 'B', 'C'] >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}) - [first, second, third] - Categories (3, object): [first, second, third] + ['first', 'second', 'third'] + Categories (3, object): ['first', 'second', 'third'] If the mapping is one-to-one the ordering of the categories is preserved: >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True) >>> cat - [a, b, c] - Categories (3, object): [a < b < c] + ['a', 'b', 'c'] + Categories (3, object): ['a' < 'b' < 'c'] >>> cat.map({'a': 3, 'b': 2, 'c': 1}) [3, 2, 1] Categories (3, int64): [3 < 2 < 1] @@ -1778,29 +1780,29 @@ def take(self: _T, indexer, allow_fill: bool = False, fill_value=None) -> _T: -------- >>> cat = pd.Categorical(['a', 'a', 'b']) >>> cat - [a, a, b] - Categories (2, object): [a, b] + ['a', 'a', 'b'] + Categories (2, object): ['a', 'b'] Specify ``allow_fill==False`` to have negative indices mean indexing from the right. >>> cat.take([0, -1, -2], allow_fill=False) - [a, b, a] - Categories (2, object): [a, b] + ['a', 'b', 'a'] + Categories (2, object): ['a', 'b'] With ``allow_fill=True``, indices equal to ``-1`` mean "missing" values that should be filled with the `fill_value`, which is ``np.nan`` by default. >>> cat.take([0, -1, -1], allow_fill=True) - [a, NaN, NaN] - Categories (2, object): [a, b] + ['a', NaN, NaN] + Categories (2, object): ['a', 'b'] The fill value can be specified. >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') - [a, a, a] - Categories (2, object): [a, b] + ['a', 'a', 'a'] + Categories (2, object): ['a', 'b'] Specifying a fill value that's not in ``self.categories`` will raise a ``ValueError``. 
@@ -1872,13 +1874,16 @@ def _repr_categories(self): ) from pandas.io.formats import format as fmt + format_array = partial( + fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC + ) if len(self.categories) > max_categories: num = max_categories // 2 - head = fmt.format_array(self.categories[:num], None) - tail = fmt.format_array(self.categories[-num:], None) + head = format_array(self.categories[:num]) + tail = format_array(self.categories[-num:]) category_strs = head + ["..."] + tail else: - category_strs = fmt.format_array(self.categories, None) + category_strs = format_array(self.categories) # Strip all leading spaces, which format_array adds for columns... category_strs = [x.strip() for x in category_strs] @@ -2051,8 +2056,8 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: -------- >>> c = pd.Categorical(list('aabca')) >>> c - [a, a, b, c, a] - Categories (3, object): [a, b, c] + ['a', 'a', 'b', 'c', 'a'] + Categories (3, object): ['a', 'b', 'c'] >>> c.categories Index(['a', 'b', 'c'], dtype='object') >>> c.codes @@ -2199,20 +2204,20 @@ def unique(self): order of appearance. >>> pd.Categorical(list("baabc")).unique() - [b, a, c] - Categories (3, object): [b, a, c] + ['b', 'a', 'c'] + Categories (3, object): ['b', 'a', 'c'] >>> pd.Categorical(list("baabc"), categories=list("abc")).unique() - [b, a, c] - Categories (3, object): [b, a, c] + ['b', 'a', 'c'] + Categories (3, object): ['b', 'a', 'c'] An ordered Categorical preserves the category ordering. >>> pd.Categorical( ... list("baabc"), categories=list("abc"), ordered=True ... 
).unique() - [b, a, c] - Categories (3, object): [a < b < c] + ['b', 'a', 'c'] + Categories (3, object): ['a' < 'b' < 'c'] """ # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) @@ -2465,7 +2470,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 4 c 5 c dtype: category - Categories (3, object): [a, b, c] + Categories (3, object): ['a', 'b', 'c'] >>> s.cat.categories Index(['a', 'b', 'c'], dtype='object') @@ -2478,7 +2483,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 4 a 5 a dtype: category - Categories (3, object): [c, b, a] + Categories (3, object): ['c', 'b', 'a'] >>> s.cat.reorder_categories(list("cba")) 0 a @@ -2488,7 +2493,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 4 c 5 c dtype: category - Categories (3, object): [c, b, a] + Categories (3, object): ['c', 'b', 'a'] >>> s.cat.add_categories(["d", "e"]) 0 a @@ -2498,7 +2503,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 4 c 5 c dtype: category - Categories (5, object): [a, b, c, d, e] + Categories (5, object): ['a', 'b', 'c', 'd', 'e'] >>> s.cat.remove_categories(["a", "c"]) 0 NaN @@ -2508,7 +2513,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 4 NaN 5 NaN dtype: category - Categories (1, object): [b] + Categories (1, object): ['b'] >>> s1 = s.cat.add_categories(["d", "e"]) >>> s1.cat.remove_unused_categories() @@ -2519,7 +2524,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 4 c 5 c dtype: category - Categories (3, object): [a, b, c] + Categories (3, object): ['a', 'b', 'c'] >>> s.cat.set_categories(list("abcde")) 0 a @@ -2529,7 +2534,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 4 c 5 c dtype: category - Categories (5, object): [a, b, c, d, e] + Categories (5, object): ['a', 'b', 'c', 'd', 'e'] >>> s.cat.as_ordered() 0 a @@ 
-2539,7 +2544,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 4 c 5 c dtype: category - Categories (3, object): [a < b < c] + Categories (3, object): ['a' < 'b' < 'c'] >>> s.cat.as_unordered() 0 a @@ -2549,7 +2554,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 4 c 5 c dtype: category - Categories (3, object): [a, b, c] + Categories (3, object): ['a', 'b', 'c'] """ def __init__(self, data): diff --git a/pandas/core/base.py b/pandas/core/base.py index e790b1d7f106e..813de491ffdb3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -743,8 +743,8 @@ def array(self) -> ExtensionArray: >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) >>> ser.array - [a, b, a] - Categories (2, object): [a, b] + ['a', 'b', 'a'] + Categories (2, object): ['a', 'b'] """ raise AbstractMethodError(self) @@ -1481,8 +1481,8 @@ def factorize(self, sort=False, na_sentinel=-1): ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True ... ) >>> ser - [apple, bread, bread, cheese, milk] - Categories (4, object): [apple < bread < cheese < milk] + ['apple', 'bread', 'bread', 'cheese', 'milk'] + Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk'] >>> ser.searchsorted('bread') 1 diff --git a/pandas/core/construction.py b/pandas/core/construction.py index b110a316a76d9..9ac661f97a56e 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -217,15 +217,15 @@ def array( You can use the string alias for `dtype` >>> pd.array(['a', 'b', 'a'], dtype='category') - [a, b, a] - Categories (2, object): [a, b] + ['a', 'b', 'a'] + Categories (2, object): ['a', 'b'] Or specify the actual dtype >>> pd.array(['a', 'b', 'a'], ... dtype=pd.CategoricalDtype(['a', 'b', 'c'], ordered=True)) - [a, b, a] - Categories (3, object): [a < b < c] + ['a', 'b', 'a'] + Categories (3, object): ['a' < 'b' < 'c'] If pandas does not infer a dedicated extension type a :class:`arrays.PandasArray` is returned. 
@@ -357,8 +357,8 @@ def extract_array(obj, extract_numpy: bool = False): Examples -------- >>> extract_array(pd.Series(['a', 'b', 'c'], dtype='category')) - [a, b, c] - Categories (3, object): [a, b, c] + ['a', 'b', 'c'] + Categories (3, object): ['a', 'b', 'c'] Other objects like lists, arrays, and DataFrames are just passed through. diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 71686bfc313fb..4b7c818f487ac 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -228,16 +228,16 @@ def union_categoricals( >>> a = pd.Categorical(["b", "c"]) >>> b = pd.Categorical(["a", "b"]) >>> union_categoricals([a, b]) - [b, c, a, b] - Categories (3, object): [b, c, a] + ['b', 'c', 'a', 'b'] + Categories (3, object): ['b', 'c', 'a'] By default, the resulting categories will be ordered as they appear in the `categories` of the data. If you want the categories to be lexsorted, use `sort_categories=True` argument. >>> union_categoricals([a, b], sort_categories=True) - [b, c, a, b] - Categories (3, object): [a, b, c] + ['b', 'c', 'a', 'b'] + Categories (3, object): ['a', 'b', 'c'] `union_categoricals` also works with the case of combining two categoricals of the same categories and order information (e.g. what @@ -246,8 +246,8 @@ def union_categoricals( >>> a = pd.Categorical(["a", "b"], ordered=True) >>> b = pd.Categorical(["a", "b", "a"], ordered=True) >>> union_categoricals([a, b]) - [a, b, a, b, a] - Categories (2, object): [a < b] + ['a', 'b', 'a', 'b', 'a'] + Categories (2, object): ['a' < 'b'] Raises `TypeError` because the categories are ordered and not identical. 
@@ -266,8 +266,8 @@ def union_categoricals( >>> a = pd.Categorical(["a", "b", "c"], ordered=True) >>> b = pd.Categorical(["c", "b", "a"], ordered=True) >>> union_categoricals([a, b], ignore_order=True) - [a, b, c, c, b, a] - Categories (3, object): [a, b, c] + ['a', 'b', 'c', 'c', 'b', 'a'] + Categories (3, object): ['a', 'b', 'c'] `union_categoricals` also works with a `CategoricalIndex`, or `Series` containing categorical data, but note that the resulting array will @@ -276,8 +276,8 @@ def union_categoricals( >>> a = pd.Series(["b", "c"], dtype='category') >>> b = pd.Series(["a", "b"], dtype='category') >>> union_categoricals([a, b]) - [b, c, a, b] - Categories (3, object): [b, c, a] + ['b', 'c', 'a', 'b'] + Categories (3, object): ['b', 'c', 'a'] """ from pandas import Categorical from pandas.core.arrays.categorical import recode_for_categories diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index b9d16ac5959e3..a9d2430717e4f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -230,7 +230,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): 2 a 3 NaN dtype: category - Categories (2, object): [b < a] + Categories (2, object): ['b' < 'a'] An empty CategoricalDtype with a specific dtype can be created by providing an empty index. As follows, diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index b6735282acaff..f7723bee532ff 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -150,16 +150,16 @@ def cut( >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), ... 3, labels=["bad", "medium", "good"]) - [bad, good, medium, medium, good, bad] - Categories (3, object): [bad < medium < good] + ['bad', 'good', 'medium', 'medium', 'good', 'bad'] + Categories (3, object): ['bad' < 'medium' < 'good'] ``ordered=False`` will result in unordered categories when labels are passed. This parameter can be used to allow non-unique labels: >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, ... 
labels=["B", "A", "B"], ordered=False) - [B, B, A, A, B, B] - Categories (2, object): [A, B] + ['B', 'B', 'A', 'A', 'B', 'B'] + Categories (2, object): ['A', 'B'] ``labels=False`` implies you just want the bins back. diff --git a/pandas/core/series.py b/pandas/core/series.py index d8cf8308142a6..a652af5efc590 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -524,8 +524,8 @@ def values(self): array(['a', 'a', 'b', 'c'], dtype=object) >>> pd.Series(list('aabc')).astype('category').values - [a, a, b, c] - Categories (3, object): [a, b, c] + ['a', 'a', 'b', 'c'] + Categories (3, object): ['a', 'b', 'c'] Timezone aware datetime data is converted to UTC: @@ -1850,15 +1850,15 @@ def unique(self): appearance. >>> pd.Series(pd.Categorical(list('baabc'))).unique() - [b, a, c] - Categories (3, object): [b, a, c] + ['b', 'a', 'c'] + Categories (3, object): ['b', 'a', 'c'] An ordered Categorical preserves the category ordering. >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'), ... 
ordered=True)).unique() - [b, a, c] - Categories (3, object): [a < b < c] + ['b', 'a', 'c'] + Categories (3, object): ['a' < 'b' < 'c'] """ result = super().unique() return result diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9ea2f6510b253..0c07b97a10fa3 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -4,6 +4,7 @@ """ from contextlib import contextmanager +from csv import QUOTE_NONE, QUOTE_NONNUMERIC from datetime import tzinfo import decimal from functools import partial @@ -176,6 +177,7 @@ def __init__( self.na_rep = na_rep self.length = length self.footer = footer + self.quoting = QUOTE_NONNUMERIC def _get_footer(self) -> str: footer = "" @@ -200,6 +202,7 @@ def _get_formatted_values(self) -> List[str]: None, float_format=None, na_rep=self.na_rep, + quoting=self.quoting, ) def to_string(self) -> str: @@ -1109,6 +1112,7 @@ def format_array( justify: str = "right", decimal: str = ".", leading_space: Optional[bool] = None, + quoting: Optional[int] = None, ) -> List[str]: """ Format an array for printing. 
@@ -1171,6 +1175,7 @@ def format_array( justify=justify, decimal=decimal, leading_space=leading_space, + quoting=quoting, ) return fmt_obj.get_result() @@ -1216,11 +1221,15 @@ def _format_strings(self) -> List[str]: else: float_format = self.float_format - formatter = ( - self.formatter - if self.formatter is not None - else (lambda x: pprint_thing(x, escape_chars=("\t", "\r", "\n"))) - ) + if self.formatter is not None: + formatter = self.formatter + else: + quote_strings = self.quoting is not None and self.quoting != QUOTE_NONE + formatter = partial( + pprint_thing, + escape_chars=("\t", "\r", "\n"), + quote_strings=quote_strings, + ) def _format(x): if self.na_rep is not None and is_scalar(x) and isna(x): diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index d08c4b47dd3cb..735b062eae80e 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -14,7 +14,10 @@ class TestCategoricalReprWithFactor(TestCategorical): def test_print(self): - expected = ["[a, b, b, a, a, c, c, c]", "Categories (3, object): [a < b < c]"] + expected = [ + "['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']", + "Categories (3, object): ['a' < 'b' < 'c']", + ] expected = "\n".join(expected) actual = repr(self.factor) assert actual == expected @@ -24,9 +27,9 @@ class TestCategoricalRepr: def test_big_print(self): factor = Categorical([0, 1, 2, 0, 1, 2] * 100, ["a", "b", "c"], fastpath=True) expected = [ - "[a, b, c, a, b, ..., b, c, a, b, c]", + "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']", "Length: 600", - "Categories (3, object): [a, b, c]", + "Categories (3, object): ['a', 'b', 'c']", ] expected = "\n".join(expected) @@ -36,13 +39,13 @@ def test_big_print(self): def test_empty_print(self): factor = Categorical([], ["a", "b", "c"]) - expected = "[], Categories (3, object): [a, b, c]" + expected = "[], Categories (3, object): ['a', 'b', 'c']" actual = repr(factor) assert actual 
== expected assert expected == actual factor = Categorical([], ["a", "b", "c"], ordered=True) - expected = "[], Categories (3, object): [a < b < c]" + expected = "[], Categories (3, object): ['a' < 'b' < 'c']" actual = repr(factor) assert expected == actual @@ -64,17 +67,17 @@ def test_print_none_width(self): def test_unicode_print(self): c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ -[aaaaa, bb, cccc, aaaaa, bb, ..., bb, cccc, aaaaa, bb, cccc] +['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc'] Length: 60 -Categories (3, object): [aaaaa, bb, cccc]""" +Categories (3, object): ['aaaaa', 'bb', 'cccc']""" assert repr(c) == expected c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) expected = """\ -[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] +['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう'] Length: 60 -Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa +Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa assert repr(c) == expected @@ -83,9 +86,9 @@ def test_unicode_print(self): with option_context("display.unicode.east_asian_width", True): c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) - expected = """[ああああ, いいいいい, ううううううう, ああああ, いいいいい, ..., いいいいい, ううううううう, ああああ, いいいいい, ううううううう] + expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう'] Length: 60 -Categories (3, object): [ああああ, いいいいい, ううううううう]""" # noqa +Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa assert repr(c) == expected @@ -523,3 +526,9 @@ def test_categorical_index_repr_timedelta_ordered(self): categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, 5 days 01:00:00, 6 days 01:00:00, 7 days 01:00:00, ...], ordered=True, dtype='category')""" # noqa assert repr(i) == exp + + def test_categorical_str_repr(self): + # GH 33676 + result = 
repr(Categorical([1, "2", 3, 4])) + expected = "[1, '2', 3, 4]\nCategories (4, object): [1, 3, 4, '2']" + assert result == expected diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 77f942a9e32ec..b861b37b49f89 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -270,7 +270,7 @@ def test_categorical_repr(self): "0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" - + "Length: 50, dtype: category\nCategories (2, object): [a, b]" + + "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" ) with option_context("display.max_rows", 5): assert exp == repr(a) @@ -279,7 +279,7 @@ def test_categorical_repr(self): a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) exp = ( "0 a\n1 b\n" + "dtype: category\n" - "Categories (26, object): [a < b < c < d ... w < x < y < z]" + "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 'w' < 'x' < 'y' < 'z']" ) assert exp == a.__str__() diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 859c8474562a3..1284cc9d4f49b 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -183,10 +183,10 @@ def test_series_equal_categorical_values_mismatch(rtol): Series values are different \\(66\\.66667 %\\) \\[index\\]: \\[0, 1, 2\\] -\\[left\\]: \\[a, b, c\\] -Categories \\(3, object\\): \\[a, b, c\\] -\\[right\\]: \\[a, c, b\\] -Categories \\(3, object\\): \\[a, b, c\\]""" +\\[left\\]: \\['a', 'b', 'c'\\] +Categories \\(3, object\\): \\['a', 'b', 'c'\\] +\\[right\\]: \\['a', 'c', 'b'\\] +Categories \\(3, object\\): \\['a', 'b', 'c'\\]""" s1 = Series(Categorical(["a", "b", "c"])) s2 = Series(Categorical(["a", "c", "b"])) diff --git a/web/pandas/community/blog/extension-arrays.md b/web/pandas/community/blog/extension-arrays.md index ea8a9a28ba242..61a77738a259c 100644 --- a/web/pandas/community/blog/extension-arrays.md +++ 
b/web/pandas/community/blog/extension-arrays.md @@ -117,11 +117,11 @@ library). For example, consider `Categorical`, 1 b 2 a dtype: category -Categories (3, object): [a, b, c] +Categories (3, object): ['a', 'b', 'c'] >>> ser.values [a, b, a] -Categories (3, object): [a, b, c] +Categories (3, object): ['a', 'b', 'c'] ``` In this case `.values` is a Categorical, not a NumPy array. For period-dtype @@ -143,7 +143,7 @@ So with our Categorical example, ```python >>> ser.array [a, b, a] -Categories (3, object): [a, b, c] +Categories (3, object): ['a', 'b', 'c'] >>> ser.to_numpy() array(['a', 'b', 'a'], dtype=object) From 291d4e0d36f98c7eaa4fcf106e990bd5edec98d2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 24 Jun 2020 16:35:17 -0700 Subject: [PATCH 0203/1025] BUG: clear cache on DataFrame._is_homogeneous_dtype (#34937) --- pandas/core/frame.py | 3 ++- pandas/tests/frame/test_dtypes.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 678e64db2beba..45d3b065d0b44 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -611,7 +611,8 @@ def _is_homogeneous_type(self) -> bool: if self._mgr.any_extension_types: return len({block.dtype for block in self._mgr.blocks}) == 1 else: - return not self._mgr.is_mixed_type + # Note: consolidates inplace + return not self._is_mixed_type @property def _can_fast_transpose(self) -> bool: diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 9c415564fd99a..f3e3ef9bae5c6 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -233,6 +233,18 @@ def test_constructor_list_str_na(self, string_dtype): def test_is_homogeneous_type(self, data, expected): assert data._is_homogeneous_type is expected + def test_is_homogeneous_type_clears_cache(self): + ser = pd.Series([1, 2, 3]) + df = ser.to_frame("A") + df["B"] = ser + + assert len(df._mgr.blocks) == 2 + + a = df["B"] # caches lookup + 
df._is_homogeneous_type # _should_ clear cache + assert len(df._mgr.blocks) == 1 + assert df["B"] is not a + def test_asarray_homogenous(self): df = pd.DataFrame({"A": pd.Categorical([1, 2]), "B": pd.Categorical([1, 2])}) result = np.asarray(df) From 022034d80fe7dbd8b0bb5abb8e30b63d894bb032 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 24 Jun 2020 16:36:10 -0700 Subject: [PATCH 0204/1025] REF: implement _shared_docs to de-circularize dependencies (#34837) --- pandas/core/frame.py | 100 +-------------------------------- pandas/core/generic.py | 2 +- pandas/core/reshape/concat.py | 25 +++++---- pandas/core/reshape/melt.py | 14 ++--- pandas/core/shared_docs.py | 102 ++++++++++++++++++++++++++++++++++ 5 files changed, 125 insertions(+), 118 deletions(-) create mode 100644 pandas/core/shared_docs.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 45d3b065d0b44..521d16ac0b905 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -135,6 +135,7 @@ sanitize_index, to_arrays, ) +from pandas.core.reshape.melt import melt from pandas.core.series import Series from pandas.core.sorting import ensure_key_mapped @@ -7070,104 +7071,6 @@ def unstack(self, level=-1, fill_value=None): return unstack(self, level, fill_value) - _shared_docs[ - "melt" - ] = """ - Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. - - This function is useful to massage a DataFrame into a format where one - or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to - the row axis, leaving just two non-identifier columns, 'variable' and - 'value'. - %(versionadded)s - Parameters - ---------- - id_vars : tuple, list, or ndarray, optional - Column(s) to use as identifier variables. - value_vars : tuple, list, or ndarray, optional - Column(s) to unpivot. If not specified, uses all columns that - are not set as `id_vars`. 
- var_name : scalar - Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. - value_name : scalar, default 'value' - Name to use for the 'value' column. - col_level : int or str, optional - If columns are a MultiIndex then use this level to melt. - - Returns - ------- - DataFrame - Unpivoted DataFrame. - - See Also - -------- - %(other)s : Identical method. - pivot_table : Create a spreadsheet-style pivot table as a DataFrame. - DataFrame.pivot : Return reshaped DataFrame organized - by given index / column values. - DataFrame.explode : Explode a DataFrame from list-like - columns to long format. - - Examples - -------- - >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, - ... 'B': {0: 1, 1: 3, 2: 5}, - ... 'C': {0: 2, 1: 4, 2: 6}}) - >>> df - A B C - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> %(caller)sid_vars=['A'], value_vars=['B']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 3 a C 2 - 4 b C 4 - 5 c C 6 - - The names of 'variable' and 'value' columns can be customized: - - >>> %(caller)sid_vars=['A'], value_vars=['B'], - ... 
var_name='myVarname', value_name='myValname') - A myVarname myValname - 0 a B 1 - 1 b B 3 - 2 c B 5 - - If you have multi-index columns: - - >>> df.columns = [list('ABC'), list('DEF')] - >>> df - A B C - D E F - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) - (A, D) variable_0 variable_1 value - 0 a B E 1 - 1 b B E 3 - 2 c B E 5 - """ - @Appender( _shared_docs["melt"] % dict( @@ -7184,7 +7087,6 @@ def melt( value_name="value", col_level=None, ) -> "DataFrame": - from pandas.core.reshape.melt import melt return melt( self, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eda1ba844b5ac..488dd00686a17 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -97,6 +97,7 @@ from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME +from pandas.core.shared_docs import _shared_docs from pandas.io.formats import format as fmt from pandas.io.formats.format import DataFrameFormatter, format_percentiles @@ -108,7 +109,6 @@ # goal is to be able to define the docs close to function, while still being # able to share -_shared_docs: Dict[str, str] = dict() _shared_doc_kwargs = dict( axes="keywords for axes", klass="Series/DataFrame", diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index db7e9265ac21d..299b68c6e71e0 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -3,7 +3,7 @@ """ from collections import abc -from typing import Iterable, List, Mapping, Union, overload +from typing import TYPE_CHECKING, Iterable, List, Mapping, Union, overload import numpy as np @@ -12,14 +12,14 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas import DataFrame, Index, MultiIndex, Series 
from pandas.core.arrays.categorical import ( factorize_from_iterable, factorize_from_iterables, ) import pandas.core.common as com -from pandas.core.generic import NDFrame from pandas.core.indexes.api import ( + Index, + MultiIndex, all_indexes_same, ensure_index, get_consensus_names, @@ -28,6 +28,9 @@ import pandas.core.indexes.base as ibase from pandas.core.internals import concatenate_block_managers +if TYPE_CHECKING: + from pandas import DataFrame + # --------------------------------------------------------------------- # Concatenate DataFrame objects @@ -291,7 +294,7 @@ class _Concatenator: def __init__( self, - objs, + objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], axis=0, join: str = "outer", keys=None, @@ -302,7 +305,7 @@ def __init__( copy: bool = True, sort=False, ): - if isinstance(objs, (NDFrame, str)): + if isinstance(objs, (ABCSeries, ABCDataFrame, str)): raise TypeError( "first argument must be an iterable of pandas " f'objects, you passed an object of type "{type(objs).__name__}"' @@ -348,7 +351,7 @@ def __init__( # consolidate data & figure out what our result ndim is going to be ndims = set() for obj in objs: - if not isinstance(obj, (Series, DataFrame)): + if not isinstance(obj, (ABCSeries, ABCDataFrame)): msg = ( f"cannot concatenate object of type '{type(obj)}'; " "only Series and DataFrame objs are valid" @@ -374,7 +377,7 @@ def __init__( # filter out the empties if we have not multi-index possibilities # note to keep empty Series as it affect to result columns / name non_empties = [ - obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, Series) + obj for obj in objs if sum(obj.shape) > 0 or isinstance(obj, ABCSeries) ] if len(non_empties) and ( @@ -388,15 +391,15 @@ def __init__( self.objs = objs # Standardize axis parameter to int - if isinstance(sample, Series): - axis = DataFrame._get_axis_number(axis) + if isinstance(sample, ABCSeries): + axis = sample._constructor_expanddim._get_axis_number(axis) else: axis 
= sample._get_axis_number(axis) # Need to flip BlockManager axis in the DataFrame special case self._is_frame = isinstance(sample, ABCDataFrame) if self._is_frame: - axis = DataFrame._get_block_manager_axis(axis) + axis = sample._get_block_manager_axis(axis) self._is_series = isinstance(sample, ABCSeries) if not 0 <= axis <= sample.ndim: @@ -543,7 +546,7 @@ def _get_concat_axis(self) -> Index: num = 0 has_names = False for i, x in enumerate(self.objs): - if not isinstance(x, Series): + if not isinstance(x, ABCSeries): raise TypeError( f"Cannot concatenate type 'Series' with " f"object of type '{type(x).__name__}'" diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 845f6b67693f4..cd0619738677d 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -11,13 +11,13 @@ from pandas.core.arrays import Categorical import pandas.core.common as com -from pandas.core.frame import DataFrame, _shared_docs from pandas.core.indexes.api import Index, MultiIndex from pandas.core.reshape.concat import concat +from pandas.core.shared_docs import _shared_docs from pandas.core.tools.numeric import to_numeric if TYPE_CHECKING: - from pandas import Series # noqa: F401 + from pandas import DataFrame, Series # noqa: F401 @Appender( @@ -25,13 +25,13 @@ % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") ) def melt( - frame: DataFrame, + frame: "DataFrame", id_vars=None, value_vars=None, var_name=None, value_name="value", col_level=None, -) -> DataFrame: +) -> "DataFrame": # TODO: what about the existing index? # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` @@ -125,7 +125,7 @@ def melt( @deprecate_kwarg(old_arg_name="label", new_arg_name=None) -def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFrame: +def lreshape(data: "DataFrame", groups, dropna: bool = True, label=None) -> "DataFrame": """ Reshape long-format data to wide. 
Generalized inverse of DataFrame.pivot @@ -195,8 +195,8 @@ def lreshape(data: DataFrame, groups, dropna: bool = True, label=None) -> DataFr def wide_to_long( - df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+" -) -> DataFrame: + df: "DataFrame", stubnames, i, j, sep: str = "", suffix: str = r"\d+" +) -> "DataFrame": r""" Wide panel to long format. Less flexible but more user-friendly than melt. diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py new file mode 100644 index 0000000000000..1894f551afea5 --- /dev/null +++ b/pandas/core/shared_docs.py @@ -0,0 +1,102 @@ +from typing import Dict + +_shared_docs: Dict[str, str] = dict() + + +_shared_docs[ + "melt" +] = """ + Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + + This function is useful to massage a DataFrame into a format where one + or more columns are identifier variables (`id_vars`), while all other + columns, considered measured variables (`value_vars`), are "unpivoted" to + the row axis, leaving just two non-identifier columns, 'variable' and + 'value'. + %(versionadded)s + Parameters + ---------- + id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. + value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. + var_name : scalar + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. + value_name : scalar, default 'value' + Name to use for the 'value' column. + col_level : int or str, optional + If columns are a MultiIndex then use this level to melt. + + Returns + ------- + DataFrame + Unpivoted DataFrame. + + See Also + -------- + %(other)s : Identical method. + pivot_table : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. 
+ DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + + Examples + -------- + >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, + ... 'B': {0: 1, 1: 3, 2: 5}, + ... 'C': {0: 2, 1: 4, 2: 6}}) + >>> df + A B C + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> %(caller)sid_vars=['A'], value_vars=['B']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 3 a C 2 + 4 b C 4 + 5 c C 6 + + The names of 'variable' and 'value' columns can be customized: + + >>> %(caller)sid_vars=['A'], value_vars=['B'], + ... var_name='myVarname', value_name='myValname') + A myVarname myValname + 0 a B 1 + 1 b B 3 + 2 c B 5 + + If you have multi-index columns: + + >>> df.columns = [list('ABC'), list('DEF')] + >>> df + A B C + D E F + 0 a 1 2 + 1 b 3 4 + 2 c 5 6 + + >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + + >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) + (A, D) variable_0 variable_1 value + 0 a B E 1 + 1 b B E 3 + 2 c B E 5 + """ From 2307478e647185529312fbf4b9e67c71c81037e6 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 24 Jun 2020 19:44:40 -0400 Subject: [PATCH 0205/1025] fix to_json for numbers larger than sys.maxsize (#34473) * BUG: overflow on to_json with numbers larger than sys.maxsize * TST: overflow on to_json with numbers larger than sys.maxsize (#34395) * DOC: update with issue #34395 * TST: removed unused import * ENH: added case JT_BIGNUM to encode * ENH: added JT_BIGNUM to JSTYPES * BUG: changed error for ints>sys.maxsize into JT_BIGNUM * ENH: removed debug statements * BUG: removed dumps wrapper * removed bigNum from TypeContext * TST: fixed bug in the test * added pointer to string rep converter for BigNum * TST: removed ujson.loads from the test * added getBigNumStringValue * added code to JT_BIGNUM handler by 
analogy with JT_UTF8 * TST: update pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd * added Object_getBigNumStringValue to pyEncoder * added skeletal code for Object_GetBigNumStringValue * completed Object_getBigNumStringValue using PyObject_Repr * BUG: changed Object_getBigNumStringValue * improved Object_getBigNumStringValue some more * update getBigNumStringValue argument * corrected Object_getBigNumStringValue * more fixes to Object_getBigNumStringValue * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd * Update pandas/_libs/src/ujson/python/objToJSON.c * Update pandas/_libs/src/ujson/python/objToJSON.c * updated pyEncoder for JT_BIGNUM * updated pyEncoder * moved getBigNumStringValue to pyEncoder * fixed declaration of Object_getBigNumStringValue * fixed Object_getBigNumStringValue * catch overflow error with PyLong_AsLongLongAndOverflow * remove unnecessary error check * added shortcircuit for error check * simplify int overflow error catching Co-authored-by: William Ayd * Update long int test in pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd * removed tests expecting numeric overflow * remove underscore from overflow Co-authored-by: William Ayd * removed underscores from _overflow everywhere * fixed small typo * fix type of exc * deleted numeric overflow tests * remove extraneous condition in if statement Co-authored-by: William Ayd * remove extraneous condition in if statement Co-authored-by: William Ayd * change _Bool into int Co-authored-by: William Ayd * Update pandas/_libs/src/ujson/python/objToJSON.c 
Co-authored-by: William Ayd * Update pandas/_libs/src/ujson/lib/ultrajsonenc.c Co-authored-by: William Ayd * allocate an extra byte in Object_getBigNumStringValue Co-authored-by: William Ayd * allocate an extra byte in Object_getBigNumStringValue Co-authored-by: William Ayd * reinstate RESERVE_STRING(szlen) in JT_BIGNUM case * replaced (private) with (public) in whatnew * release bytes in Object_endTypeContext * in JT_BIGNUM change if+if into if+else if * added reallocation of bigNum_bytes * removed bigNum_bytes * added to_json test for ints>sys.maxsize * Use python malloc to match PyObject_Free in endTypeContext Co-authored-by: William Ayd * TST: added manually constructed strs to compare encodings * fixed styling to minimize diff with master * fixed styling * fixed conflicts with master * fix styling to minimize diff * fix styling to minimize diff * fixed styling * added negative nigNum to test_to_json_large_numers * added negative nigNum to test_to_json_large_numers * Update pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd * fixe test_to_json_for_large_nums for -ve * TST: added xfail for ujson.encode with long int input * TST: fixed variable names in test_to_json_large_numbers * TST: added xfail test for json.decode Series with long int * TST: added xfail test for json.decode DataFrame with long int * BENCH: added benchmarks for long ints Co-authored-by: William Ayd --- asv_bench/benchmarks/io/json.py | 30 ++++++++++++++++++++++ doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/src/ujson/lib/ultrajson.h | 3 +++ pandas/_libs/src/ujson/lib/ultrajsonenc.c | 29 +++++++++++++++++++++ pandas/_libs/src/ujson/python/objToJSON.c | 31 ++++++++++++++++++----- pandas/tests/io/json/test_pandas.py | 24 ++++++++++++++++++ pandas/tests/io/json/test_ujson.py | 24 +++++++++--------- 7 files changed, 123 insertions(+), 19 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index a490e250943f5..ed0fb5b8fe342 100644 --- 
a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -1,3 +1,5 @@ +import sys + import numpy as np from pandas import DataFrame, concat, date_range, read_json, timedelta_range @@ -82,6 +84,7 @@ def setup(self, orient, frame): timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) + longints = sys.maxsize * np.random.randint(100000000, size=N) floats = np.random.randn(N) strings = tm.makeStringIndex(N) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) @@ -120,6 +123,18 @@ def setup(self, orient, frame): index=index, ) + self.df_longint_float_str = DataFrame( + { + "longint_1": longints, + "longint_2": longints, + "float_1": floats, + "float_2": floats, + "str_1": strings, + "str_2": strings, + }, + index=index, + ) + def time_to_json(self, orient, frame): getattr(self, frame).to_json(self.fname, orient=orient) @@ -172,6 +187,7 @@ def setup(self): timedeltas = timedelta_range(start=1, periods=N, freq="s") datetimes = date_range(start=1, periods=N, freq="s") ints = np.random.randint(100000000, size=N) + longints = sys.maxsize * np.random.randint(100000000, size=N) floats = np.random.randn(N) strings = tm.makeStringIndex(N) self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N)) @@ -209,6 +225,17 @@ def setup(self): }, index=index, ) + self.df_longint_float_str = DataFrame( + { + "longint_1": longints, + "longint_2": longints, + "float_1": floats, + "float_2": floats, + "str_1": strings, + "str_2": strings, + }, + index=index, + ) def time_floats_with_int_idex_lines(self): self.df.to_json(self.fname, orient="records", lines=True) @@ -225,6 +252,9 @@ def time_float_int_lines(self): def time_float_int_str_lines(self): self.df_int_float_str.to_json(self.fname, orient="records", lines=True) + def time_float_longint_str_lines(self): + self.df_longint_float_str.to_json(self.fname, orient="records", lines=True) + class 
ToJSONMem: def setup_cache(self): diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 19f029d6aed68..0d2254e401103 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1026,6 +1026,7 @@ I/O - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`) - :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) +- Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) Plotting ^^^^^^^^ diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index acb66b668e8dc..69284e1c3f2ab 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -150,6 +150,7 @@ enum JSTYPES { JT_INT, // (JSINT32 (signed 32-bit)) JT_LONG, // (JSINT64 (signed 64-bit)) JT_DOUBLE, // (double) + JT_BIGNUM, // integer larger than sys.maxsize JT_UTF8, // (char 8-bit) JT_ARRAY, // Array structure JT_OBJECT, // Key/Value structure @@ -187,6 +188,8 @@ typedef struct __JSONObjectEncoder { JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc); JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc); double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc); + const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen); /* Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT) diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 065e3b2c60cf9..51aa39a16920e 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -1107,6 +1107,35 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, 
Buffer_AppendCharUnchecked(enc, '\"'); break; } + + case JT_BIGNUM: { + value = enc->getBigNumStringValue(obj, &tc, &szlen); + + Buffer_Reserve(enc, RESERVE_STRING(szlen)); + if (enc->errorMsg) { + enc->endTypeContext(obj, &tc); + return; + } + + if (enc->forceASCII) { + if (!Buffer_EscapeStringValidated(obj, enc, value, + value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + } else { + if (!Buffer_EscapeStringUnvalidated(enc, value, + value + szlen)) { + enc->endTypeContext(obj, &tc); + enc->level--; + return; + } + } + + break; + + } } enc->endTypeContext(obj, &tc); diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index c71e941f7d6e8..1de9642761961 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -1629,15 +1629,20 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (PyLong_Check(obj)) { PRINTMARK(); tc->type = JT_LONG; - GET_TC(tc)->longValue = PyLong_AsLongLong(obj); + int overflow = 0; + GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); + int err; + err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred(); - exc = PyErr_Occurred(); - - if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { + if (overflow){ + PRINTMARK(); + tc->type = JT_BIGNUM; + } + else if (err) { PRINTMARK(); goto INVALID; } - + return; } else if (PyFloat_Check(obj)) { PRINTMARK(); @@ -2105,7 +2110,6 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen); GET_TC(tc)->columnLabels = NULL; - PyObject_Free(GET_TC(tc)->cStr); GET_TC(tc)->cStr = NULL; PyObject_Free(tc->prv); @@ -2126,6 +2130,19 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->doubleValue; } +const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen) { + PyObject* repr = PyObject_Str(obj); 
+ const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *) _outLen); + char* bytes = PyObject_Malloc(*_outLen + 1); + memcpy(bytes, str, *_outLen + 1); + GET_TC(tc)->cStr = bytes; + + Py_DECREF(repr); + + return GET_TC(tc)->cStr; +} + static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); } void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) { @@ -2181,6 +2198,7 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, Object_getLongValue, NULL, // getIntValue is unused Object_getDoubleValue, + Object_getBigNumStringValue, Object_iterBegin, Object_iterNext, Object_iterEnd, @@ -2294,7 +2312,6 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, if (ret != buffer) { encoder->free(ret); } - PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg); return NULL; } diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 8578b31fbb81e..10f49b9b81528 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -4,6 +4,7 @@ from io import StringIO import json import os +import sys import numpy as np import pytest @@ -1242,6 +1243,29 @@ def test_read_jsonl_unicode_chars(self): expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) + def test_to_json_large_numbers(self, bigNum): + # GH34473 + series = Series(bigNum, dtype=object, index=["articleId"]) + json = series.to_json() + expected = '{"articleId":' + str(bigNum) + "}" + assert json == expected + # GH 20599 + with pytest.raises(ValueError): + json = StringIO(json) + result = read_json(json) + tm.assert_series_equal(series, result) + + df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) + json = df.to_json() + expected = '{"0":{"articleId":' + str(bigNum) + "}}" + assert json == expected + # GH 20599 + with pytest.raises(ValueError): + json = 
StringIO(json) + result = read_json(json) + tm.assert_frame_equal(df, result) + def test_read_json_large_numbers(self): # GH18842 json = '{"articleId": "1404366058080022500245"}' diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 7b6acf7eed685..952c583040360 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -5,6 +5,7 @@ import locale import math import re +import sys import time import dateutil @@ -559,6 +560,17 @@ def test_encode_long_conversion(self): assert output == json.dumps(long_input) assert long_input == ujson.decode(output) + @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) + def test_dumps_ints_larger_than_maxsize(self, bigNum): + # GH34395 + bigNum = sys.maxsize + 1 + encoding = ujson.encode(bigNum) + assert str(bigNum) == encoding + + # GH20599 + with pytest.raises(ValueError): + assert ujson.loads(encoding) == bigNum + @pytest.mark.parametrize( "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] ) @@ -570,18 +582,6 @@ def test_loads_non_str_bytes_raises(self): with pytest.raises(TypeError, match=msg): ujson.loads(None) - def test_encode_numeric_overflow(self): - with pytest.raises(OverflowError): - ujson.encode(12839128391289382193812939) - - def test_encode_numeric_overflow_nested(self): - class Nested: - x = 12839128391289382193812939 - - for _ in range(0, 100): - with pytest.raises(OverflowError): - ujson.encode(Nested()) - @pytest.mark.parametrize("val", [3590016419, 2 ** 31, 2 ** 32, (2 ** 32) - 1]) def test_decode_number_with_32bit_sign_bit(self, val): # Test that numbers that fit within 32 bits but would have the From 25b0e6d0d96da6782f6280a9e960c054826eea9e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 25 Jun 2020 01:10:26 -0400 Subject: [PATCH 0206/1025] DOC: improve explanation of con argument DataFrame.to_sql (#34944) * DOC: added example of Connection as con 
argument to to_sql * DOC: fixed styling in to_sql docstring * DOC: fixed styling in to_sql docstring * DOC: fixed styling in to_sql docstring * revert accidental changes * DOC: fixed context manager example in to_sql * DOC: fixed typo in to_sql docstring --- pandas/core/generic.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 488dd00686a17..307bf84068424 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2468,7 +2468,7 @@ def to_sql( ---------- name : str Name of SQL table. - con : sqlalchemy.engine.Engine or sqlite3.Connection + con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection Using SQLAlchemy makes it possible to use any DB supported by that library. Legacy support is provided for sqlite3.Connection objects. The user is responsible for engine disposal and connection closure for the SQLAlchemy @@ -2556,18 +2556,27 @@ def to_sql( >>> engine.execute("SELECT * FROM users").fetchall() [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')] - >>> df1 = pd.DataFrame({'name' : ['User 4', 'User 5']}) - >>> df1.to_sql('users', con=engine, if_exists='append') + An `sqlalchemy.engine.Connection` can also be passed to to `con`: + >>> with engine.begin() as connection: + ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']}) + ... df1.to_sql('users', con=connection, if_exists='append') + + This is allowed to support operations that require that the same + DBAPI connection is used for the entire operation. + + >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']}) + >>> df2.to_sql('users', con=engine, if_exists='append') >>> engine.execute("SELECT * FROM users").fetchall() [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'), - (0, 'User 4'), (1, 'User 5')] + (0, 'User 4'), (1, 'User 5'), (0, 'User 6'), + (1, 'User 7')] - Overwrite the table with just ``df1``. + Overwrite the table with just ``df2``. 
- >>> df1.to_sql('users', con=engine, if_exists='replace', + >>> df2.to_sql('users', con=engine, if_exists='replace', ... index_label='id') >>> engine.execute("SELECT * FROM users").fetchall() - [(0, 'User 4'), (1, 'User 5')] + [(0, 'User 6'), (1, 'User 7')] Specify the dtype (especially useful for integers with missing values). Notice that while pandas is forced to store the data as floating point, From 7c21357a881912c6d640915462e0b0a439022f65 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 25 Jun 2020 15:39:06 +0100 Subject: [PATCH 0207/1025] DOC: typo in release notes (#34989) --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0d2254e401103..10dac7e2863f9 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -314,8 +314,8 @@ Other enhancements result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`). - :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). -- :meth:`Dataframe.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). -- :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list of dict to change only some specific columns' width (:issue:`28917`). +- :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). +- :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`). 
- :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) .. --------------------------------------------------------------------------- From 821d0ff0ca2a6c4e090a58714cd67c13cc53db85 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 25 Jun 2020 07:40:54 -0700 Subject: [PATCH 0208/1025] REF: simplify advance/move/set_length in libreduction (#34982) --- pandas/_libs/reduction.pyx | 40 ++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 99c6f8bde5dd8..58de682c56d55 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -205,8 +205,7 @@ cdef class _BaseGrouper: cdef inline object _apply_to_group(self, object cached_typ, object cached_ityp, - Slider islider, Slider vslider, - Py_ssize_t group_size, bint initialized): + bint initialized): """ Call self.f on our new group, then update to the next group. """ @@ -222,9 +221,6 @@ cdef class _BaseGrouper: initialized = True _check_result_array(res, len(self.dummy_arr)) - islider.advance(group_size) - vslider.advance(group_size) - return res, initialized @@ -269,7 +265,7 @@ cdef class SeriesBinGrouper(_BaseGrouper): cdef: ndarray arr, result ndarray[int64_t] counts - Py_ssize_t i, n, group_size + Py_ssize_t i, n, group_size, start, end object res bint initialized = 0 Slider vslider, islider @@ -293,19 +289,21 @@ cdef class SeriesBinGrouper(_BaseGrouper): result = np.empty(self.ngroups, dtype='O') + start = 0 try: for i in range(self.ngroups): group_size = counts[i] + end = start + group_size - islider.set_length(group_size) - vslider.set_length(group_size) + islider.move(start, end) + vslider.move(start, end) cached_typ, cached_ityp = self._update_cached_objs( cached_typ, cached_ityp, islider, vslider) res, initialized = self._apply_to_group(cached_typ, cached_ityp, - islider, vslider, - group_size, initialized) + initialized) + start += group_size 
result[i] = res @@ -361,7 +359,7 @@ cdef class SeriesGrouper(_BaseGrouper): # Define result to avoid UnboundLocalError ndarray arr, result = None ndarray[int64_t] labels, counts - Py_ssize_t i, n, group_size, lab + Py_ssize_t i, n, group_size, lab, start, end object res bint initialized = 0 Slider vslider, islider @@ -377,6 +375,7 @@ cdef class SeriesGrouper(_BaseGrouper): result = np.empty(self.ngroups, dtype='O') + start = 0 try: for i in range(n): group_size += 1 @@ -385,20 +384,21 @@ cdef class SeriesGrouper(_BaseGrouper): if i == n - 1 or lab != labels[i + 1]: if lab == -1: - islider.advance(group_size) - vslider.advance(group_size) + start += group_size group_size = 0 continue - islider.set_length(group_size) - vslider.set_length(group_size) + end = start + group_size + islider.move(start, end) + vslider.move(start, end) cached_typ, cached_ityp = self._update_cached_objs( cached_typ, cached_ityp, islider, vslider) res, initialized = self._apply_to_group(cached_typ, cached_ityp, - islider, vslider, - group_size, initialized) + initialized) + + start += group_size result[lab] = res counts[lab] = group_size @@ -458,9 +458,6 @@ cdef class Slider: self.buf.data = self.values.data self.buf.strides[0] = self.stride - cdef advance(self, Py_ssize_t k): - self.buf.data = self.buf.data + self.stride * k - cdef move(self, int start, int end): """ For slicing @@ -468,9 +465,6 @@ cdef class Slider: self.buf.data = self.values.data + self.stride * start self.buf.shape[0] = end - start - cdef set_length(self, Py_ssize_t length): - self.buf.shape[0] = length - cdef reset(self): self.buf.shape[0] = self.orig_len From f593530a288c4113f84473d34b95ee96e2d4468d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 25 Jun 2020 07:42:14 -0700 Subject: [PATCH 0209/1025] PERF: optimize Block.getitem_block (#34978) --- pandas/_libs/internals.pyx | 1 + pandas/core/internals/blocks.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git 
a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index db452cb0f1fa4..8b4b490f49b12 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -16,6 +16,7 @@ cnp.import_array() from pandas._libs.algos import ensure_int64 +@cython.final cdef class BlockPlacement: # __slots__ = '_as_slice', '_as_array', '_len' cdef: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 0c98a779424bd..6207785fb2975 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -8,6 +8,7 @@ from pandas._libs import NaT, algos as libalgos, lib, writers import pandas._libs.internals as libinternals +from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import ArrayLike @@ -112,6 +113,19 @@ class Block(PandasObject): _verify_integrity = True _validate_ndim = True + @classmethod + def _simple_new( + cls, values: ArrayLike, placement: BlockPlacement, ndim: int + ) -> "Block": + """ + Fastpath constructor, does *no* validation + """ + obj = object.__new__(cls) + obj.ndim = ndim + obj.values = values + obj._mgr_locs = placement + return obj + def __init__(self, values, placement, ndim=None): self.ndim = self._check_ndim(values, ndim) self.mgr_locs = placement @@ -289,13 +303,15 @@ def getitem_block(self, slicer, new_mgr_locs=None): if new_mgr_locs is None: axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer new_mgr_locs = self.mgr_locs[axis0_slicer] + elif not isinstance(new_mgr_locs, BlockPlacement): + new_mgr_locs = BlockPlacement(new_mgr_locs) new_values = self._slice(slicer) if self._validate_ndim and new_values.ndim != self.ndim: raise ValueError("Only same dim slicing is allowed") - return self.make_block_same_class(new_values, new_mgr_locs) + return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) @property def shape(self): From 6e10efc3b5de8dbf387a8e3560a03b5ce7129d13 Mon 
Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 25 Jun 2020 08:50:48 -0700 Subject: [PATCH 0210/1025] DOC: Demonstrate custom rolling indexer with Businessday (#34947) --- doc/source/user_guide/computation.rst | 4 +- pandas/core/window/indexers.py | 85 ++++++++++++++++++++++++ pandas/tests/window/test_base_indexer.py | 23 ++++++- 3 files changed, 109 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 19fdb541a6a45..897e5d5fb0e24 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -561,7 +561,7 @@ For example, if we have the following ``DataFrame``: df and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size -1, we can create the following ``BaseIndexer``: +1, we can create the following ``BaseIndexer`` subclass: .. code-block:: ipython @@ -593,6 +593,8 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other 3 3.0 4 10.0 +You can view other examples of ``BaseIndexer`` subclasses `here `__ + .. versionadded:: 1.1 For some problems knowledge of the future is available for analysis. For example, this occurs when diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index f0a76dc17b411..b710a35410458 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -1,4 +1,5 @@ """Indexer objects for computing start/end window bounds for rolling operations""" +from datetime import timedelta from typing import Dict, Optional, Tuple, Type, Union import numpy as np @@ -6,6 +7,8 @@ from pandas._libs.window.indexers import calculate_variable_window_bounds from pandas.util._decorators import Appender +from pandas.tseries.offsets import Nano + get_window_bounds_doc = """ Computes the bounds of a window. 
@@ -104,6 +107,88 @@ def get_window_bounds( ) +class NonFixedVariableWindowIndexer(BaseIndexer): + """Calculate window boundaries based on a non-fixed offset such as a BusinessDay""" + + def __init__( + self, + index_array: Optional[np.ndarray] = None, + window_size: int = 0, + index=None, + offset=None, + **kwargs, + ): + super().__init__(index_array, window_size, **kwargs) + self.index = index + self.offset = offset + + @Appender(get_window_bounds_doc) + def get_window_bounds( + self, + num_values: int = 0, + min_periods: Optional[int] = None, + center: Optional[bool] = None, + closed: Optional[str] = None, + ) -> Tuple[np.ndarray, np.ndarray]: + + # if windows is variable, default is 'right', otherwise default is 'both' + if closed is None: + closed = "right" if self.index is not None else "both" + + right_closed = closed in ["right", "both"] + left_closed = closed in ["left", "both"] + + if self.index[num_values - 1] < self.index[0]: + index_growth_sign = -1 + else: + index_growth_sign = 1 + + start = np.empty(num_values, dtype="int64") + start.fill(-1) + end = np.empty(num_values, dtype="int64") + end.fill(-1) + + start[0] = 0 + + # right endpoint is closed + if right_closed: + end[0] = 1 + # right endpoint is open + else: + end[0] = 0 + + # start is start of slice interval (including) + # end is end of slice interval (not including) + for i in range(1, num_values): + end_bound = self.index[i] + start_bound = self.index[i] - index_growth_sign * self.offset + + # left endpoint is closed + if left_closed: + start_bound -= Nano(1) + + # advance the start bound until we are + # within the constraint + start[i] = i + for j in range(start[i - 1], i): + if (self.index[j] - start_bound) * index_growth_sign > timedelta(0): + start[i] = j + break + + # end bound is previous end + # or current index + if (self.index[end[i - 1]] - end_bound) * index_growth_sign <= timedelta(0): + end[i] = i + 1 + else: + end[i] = end[i - 1] + + # right endpoint is open + if not 
right_closed: + end[i] -= 1 + + return start, end + + class ExpandingIndexer(BaseIndexer): """Calculate expanding window bounds, mimicking df.expanding()""" diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index df58028dee862..6f64a376b6fad 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -1,10 +1,12 @@ import numpy as np import pytest -from pandas import DataFrame, Series +from pandas import DataFrame, Series, date_range import pandas._testing as tm from pandas.api.indexers import BaseIndexer, FixedForwardWindowIndexer -from pandas.core.window.indexers import ExpandingIndexer +from pandas.core.window.indexers import ExpandingIndexer, NonFixedVariableWindowIndexer + +from pandas.tseries.offsets import BusinessDay def test_bad_get_window_bounds_signature(): @@ -234,3 +236,20 @@ def test_rolling_forward_cov_corr(func, expected): expected = Series(expected) expected.name = result.name tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "closed,expected_data", + [ + ["right", [0.0, 1.0, 2.0, 3.0, 7.0, 12.0, 6.0, 7.0, 8.0, 9.0]], + ["left", [0.0, 0.0, 1.0, 2.0, 5.0, 9.0, 5.0, 6.0, 7.0, 8.0]], + ], +) +def test_non_fixed_variable_window_indexer(closed, expected_data): + index = date_range("2020", periods=10) + df = DataFrame(range(10), index=index) + offset = BusinessDay(1) + indexer = NonFixedVariableWindowIndexer(index=index, offset=offset) + result = df.rolling(indexer, closed=closed).sum() + expected = DataFrame(expected_data, index=index) + tm.assert_frame_equal(result, expected) From b7b8a364837f9e9054b85e80d49e08e6e4464b84 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 25 Jun 2020 18:38:08 +0100 Subject: [PATCH 0211/1025] TYP: make the type annotations of read_csv & read_table discoverable (#34976) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/parsers.py | 369 +++++++++++++++----------- pandas/tests/io/parser/test_common.py | 34 +++ 
3 files changed, 246 insertions(+), 158 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 10dac7e2863f9..6808737d4fa5e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1025,6 +1025,7 @@ I/O - Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`) - :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) +- `TypeError` exceptions raised by :meth:`read_csv` and :meth:`read_table` were showing as ``parser_f`` when an unexpected keyword argument was passed (:issue:`25648`) - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) - Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 62347f7110d76..c427d3a198b10 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -530,176 +530,229 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _deprecated_args: Set[str] = set() -def _make_parser_function(name, default_sep=","): - def parser_f( - filepath_or_buffer: FilePathOrBuffer, - sep=default_sep, - delimiter=None, - # Column and Index Locations and Names - header="infer", - names=None, - index_col=None, - usecols=None, - squeeze=False, - prefix=None, - mangle_dupe_cols=True, - # General Parsing Configuration - dtype=None, - engine=None, - converters=None, - true_values=None, - false_values=None, - skipinitialspace=False, - skiprows=None, - skipfooter=0, - nrows=None, - # NA and Missing Data Handling - na_values=None, - keep_default_na=True, - na_filter=True, - 
verbose=False, - skip_blank_lines=True, - # Datetime Handling - parse_dates=False, - infer_datetime_format=False, - keep_date_col=False, - date_parser=None, - dayfirst=False, - cache_dates=True, - # Iteration - iterator=False, - chunksize=None, - # Quoting, Compression, and File Format - compression="infer", - thousands=None, - decimal: str = ".", - lineterminator=None, - quotechar='"', - quoting=csv.QUOTE_MINIMAL, - doublequote=True, - escapechar=None, - comment=None, - encoding=None, - dialect=None, - # Error Handling - error_bad_lines=True, - warn_bad_lines=True, - # Internal - delim_whitespace=False, - low_memory=_c_parser_defaults["low_memory"], - memory_map=False, - float_precision=None, - ): - - # gh-23761 - # - # When a dialect is passed, it overrides any of the overlapping - # parameters passed in directly. We don't want to warn if the - # default parameters were passed in (since it probably means - # that the user didn't pass them in explicitly in the first place). - # - # "delimiter" is the annoying corner case because we alias it to - # "sep" before doing comparison to the dialect values later on. - # Thus, we need a flag to indicate that we need to "override" - # the comparison to dialect values by checking if default values - # for BOTH "delimiter" and "sep" were provided. - if dialect is not None: - sep_override = delimiter is None and sep == default_sep - kwds = dict(sep_override=sep_override) - else: - kwds = dict() - - # Alias sep -> delimiter. - if delimiter is None: - delimiter = sep - - if delim_whitespace and delimiter != default_sep: - raise ValueError( - "Specified a delimiter with both sep and " - "delim_whitespace=True; you can only specify one." 
- ) +@Appender( + _doc_read_csv_and_table.format( + func_name="read_csv", + summary="Read a comma-separated values (csv) file into DataFrame.", + _default_sep="','", + ) +) +def read_csv( + filepath_or_buffer: FilePathOrBuffer, + sep=",", + delimiter=None, + # Column and Index Locations and Names + header="infer", + names=None, + index_col=None, + usecols=None, + squeeze=False, + prefix=None, + mangle_dupe_cols=True, + # General Parsing Configuration + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skipinitialspace=False, + skiprows=None, + skipfooter=0, + nrows=None, + # NA and Missing Data Handling + na_values=None, + keep_default_na=True, + na_filter=True, + verbose=False, + skip_blank_lines=True, + # Datetime Handling + parse_dates=False, + infer_datetime_format=False, + keep_date_col=False, + date_parser=None, + dayfirst=False, + cache_dates=True, + # Iteration + iterator=False, + chunksize=None, + # Quoting, Compression, and File Format + compression="infer", + thousands=None, + decimal: str = ".", + lineterminator=None, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + doublequote=True, + escapechar=None, + comment=None, + encoding=None, + dialect=None, + # Error Handling + error_bad_lines=True, + warn_bad_lines=True, + # Internal + delim_whitespace=False, + low_memory=_c_parser_defaults["low_memory"], + memory_map=False, + float_precision=None, +): + # gh-23761 + # + # When a dialect is passed, it overrides any of the overlapping + # parameters passed in directly. We don't want to warn if the + # default parameters were passed in (since it probably means + # that the user didn't pass them in explicitly in the first place). + # + # "delimiter" is the annoying corner case because we alias it to + # "sep" before doing comparison to the dialect values later on. 
+ # Thus, we need a flag to indicate that we need to "override" + # the comparison to dialect values by checking if default values + # for BOTH "delimiter" and "sep" were provided. + default_sep = "," + + if dialect is not None: + sep_override = delimiter is None and sep == default_sep + kwds = dict(sep_override=sep_override) + else: + kwds = dict() - if engine is not None: - engine_specified = True - else: - engine = "c" - engine_specified = False + # Alias sep -> delimiter. + if delimiter is None: + delimiter = sep - kwds.update( - delimiter=delimiter, - engine=engine, - dialect=dialect, - compression=compression, - engine_specified=engine_specified, - doublequote=doublequote, - escapechar=escapechar, - quotechar=quotechar, - quoting=quoting, - skipinitialspace=skipinitialspace, - lineterminator=lineterminator, - header=header, - index_col=index_col, - names=names, - prefix=prefix, - skiprows=skiprows, - skipfooter=skipfooter, - na_values=na_values, - true_values=true_values, - false_values=false_values, - keep_default_na=keep_default_na, - thousands=thousands, - comment=comment, - decimal=decimal, - parse_dates=parse_dates, - keep_date_col=keep_date_col, - dayfirst=dayfirst, - date_parser=date_parser, - cache_dates=cache_dates, - nrows=nrows, - iterator=iterator, - chunksize=chunksize, - converters=converters, - dtype=dtype, - usecols=usecols, - verbose=verbose, - encoding=encoding, - squeeze=squeeze, - memory_map=memory_map, - float_precision=float_precision, - na_filter=na_filter, - delim_whitespace=delim_whitespace, - warn_bad_lines=warn_bad_lines, - error_bad_lines=error_bad_lines, - low_memory=low_memory, - mangle_dupe_cols=mangle_dupe_cols, - infer_datetime_format=infer_datetime_format, - skip_blank_lines=skip_blank_lines, + if delim_whitespace and delimiter != default_sep: + raise ValueError( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." 
) - return _read(filepath_or_buffer, kwds) - - parser_f.__name__ = name - - return parser_f + if engine is not None: + engine_specified = True + else: + engine = "c" + engine_specified = False + + kwds.update( + delimiter=delimiter, + engine=engine, + dialect=dialect, + compression=compression, + engine_specified=engine_specified, + doublequote=doublequote, + escapechar=escapechar, + quotechar=quotechar, + quoting=quoting, + skipinitialspace=skipinitialspace, + lineterminator=lineterminator, + header=header, + index_col=index_col, + names=names, + prefix=prefix, + skiprows=skiprows, + skipfooter=skipfooter, + na_values=na_values, + true_values=true_values, + false_values=false_values, + keep_default_na=keep_default_na, + thousands=thousands, + comment=comment, + decimal=decimal, + parse_dates=parse_dates, + keep_date_col=keep_date_col, + dayfirst=dayfirst, + date_parser=date_parser, + cache_dates=cache_dates, + nrows=nrows, + iterator=iterator, + chunksize=chunksize, + converters=converters, + dtype=dtype, + usecols=usecols, + verbose=verbose, + encoding=encoding, + squeeze=squeeze, + memory_map=memory_map, + float_precision=float_precision, + na_filter=na_filter, + delim_whitespace=delim_whitespace, + warn_bad_lines=warn_bad_lines, + error_bad_lines=error_bad_lines, + low_memory=low_memory, + mangle_dupe_cols=mangle_dupe_cols, + infer_datetime_format=infer_datetime_format, + skip_blank_lines=skip_blank_lines, + ) + return _read(filepath_or_buffer, kwds) -read_csv = _make_parser_function("read_csv", default_sep=",") -read_csv = Appender( - _doc_read_csv_and_table.format( - func_name="read_csv", - summary="Read a comma-separated values (csv) file into DataFrame.", - _default_sep="','", - ) -)(read_csv) -read_table = _make_parser_function("read_table", default_sep="\t") -read_table = Appender( +@Appender( _doc_read_csv_and_table.format( func_name="read_table", summary="Read general delimited file into DataFrame.", _default_sep=r"'\\t' (tab-stop)", ) -)(read_table) +) 
+def read_table( + filepath_or_buffer: FilePathOrBuffer, + sep="\t", + delimiter=None, + # Column and Index Locations and Names + header="infer", + names=None, + index_col=None, + usecols=None, + squeeze=False, + prefix=None, + mangle_dupe_cols=True, + # General Parsing Configuration + dtype=None, + engine=None, + converters=None, + true_values=None, + false_values=None, + skipinitialspace=False, + skiprows=None, + skipfooter=0, + nrows=None, + # NA and Missing Data Handling + na_values=None, + keep_default_na=True, + na_filter=True, + verbose=False, + skip_blank_lines=True, + # Datetime Handling + parse_dates=False, + infer_datetime_format=False, + keep_date_col=False, + date_parser=None, + dayfirst=False, + cache_dates=True, + # Iteration + iterator=False, + chunksize=None, + # Quoting, Compression, and File Format + compression="infer", + thousands=None, + decimal: str = ".", + lineterminator=None, + quotechar='"', + quoting=csv.QUOTE_MINIMAL, + doublequote=True, + escapechar=None, + comment=None, + encoding=None, + dialect=None, + # Error Handling + error_bad_lines=True, + warn_bad_lines=True, + # Internal + delim_whitespace=False, + low_memory=_c_parser_defaults["low_memory"], + memory_map=False, + float_precision=None, +): + return read_csv(**locals()) def read_fwf( diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index e6e868689b060..12e73bae40eac 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -5,6 +5,7 @@ import codecs import csv from datetime import datetime +from inspect import signature from io import StringIO import os import platform @@ -2071,6 +2072,39 @@ def test_read_csv_raises_on_header_prefix(all_parsers): parser.read_csv(s, header=0, prefix="_X") +def test_unexpected_keyword_parameter_exception(all_parsers): + # GH-34976 + parser = all_parsers + + msg = "{}\\(\\) got an unexpected keyword argument 'foo'" + with pytest.raises(TypeError, 
match=msg.format("read_csv")): + parser.read_csv("foo.csv", foo=1) + with pytest.raises(TypeError, match=msg.format("read_table")): + parser.read_table("foo.tsv", foo=1) + + +def test_read_table_same_signature_as_read_csv(all_parsers): + # GH-34976 + parser = all_parsers + + table_sign = signature(parser.read_table) + csv_sign = signature(parser.read_csv) + + assert table_sign.parameters.keys() == csv_sign.parameters.keys() + assert table_sign.return_annotation == csv_sign.return_annotation + + for key, csv_param in csv_sign.parameters.items(): + table_param = table_sign.parameters[key] + if key == "sep": + assert csv_param.default == "," + assert table_param.default == "\t" + assert table_param.annotation == csv_param.annotation + assert table_param.kind == csv_param.kind + continue + else: + assert table_param == csv_param + + def test_read_table_equivalency_to_read_csv(all_parsers): # see gh-21948 # As of 0.25.0, read_table is undeprecated From 813e079e05074475a7a7fcbd4a1bb6a030434bf8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 25 Jun 2020 18:38:43 +0100 Subject: [PATCH 0212/1025] TYP: remove inappropraite use of cast (#34990) --- pandas/io/formats/format.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 0c07b97a10fa3..66be1cedbc3bf 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -588,7 +588,7 @@ def __init__( elif isinstance(col_space, (int, str)): self.col_space = {"": col_space} self.col_space.update({column: col_space for column in self.frame.columns}) - elif isinstance(col_space, dict): + elif isinstance(col_space, Mapping): for column in col_space.keys(): if column not in self.frame.columns and column != "": raise ValueError( @@ -596,7 +596,6 @@ def __init__( ) self.col_space = col_space else: - col_space = cast(Sequence, col_space) if len(frame.columns) != len(col_space): raise ValueError( f"Col_space length({len(col_space)}) 
should match " From 1dc8990dcc542cb0883d81020788ab30200dcab1 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 25 Jun 2020 17:56:45 -0500 Subject: [PATCH 0213/1025] HDFStore append_to_multiple with min_itemsize (#34939) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/pytables.py | 9 ++++++++- pandas/tests/io/pytables/test_store.py | 27 ++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6808737d4fa5e..d836db480beb0 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1028,6 +1028,7 @@ I/O - `TypeError` exceptions raised by :meth:`read_csv` and :meth:`read_table` were showing as ``parser_f`` when an unexpected keyword argument was passed (:issue:`25648`) - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) - Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) +- Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`) Plotting ^^^^^^^^ diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 800e9474cc0f8..0e5d7b007bd89 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1303,6 +1303,8 @@ def append_to_multiple( valid_index = valid_index.intersection(index) value = value.loc[valid_index] + min_itemsize = kwargs.pop("min_itemsize", None) + # append for k, v in d.items(): dc = data_columns if k == selector else None @@ -1310,7 +1312,12 @@ def append_to_multiple( # compute the val val = value.reindex(v, axis=axis) - self.append(k, val, data_columns=dc, **kwargs) + filtered = ( + {key: value for (key, value) in min_itemsize.items() if key in v} + if min_itemsize is not None + else None + ) + self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs) def create_table_index( self, diff --git a/pandas/tests/io/pytables/test_store.py 
b/pandas/tests/io/pytables/test_store.py index 524e9f41a7731..c69992471fc9b 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -3697,6 +3697,33 @@ def test_append_to_multiple_dropna_false(self, setup_path): assert not store.select("df1a").index.equals(store.select("df2a").index) + def test_append_to_multiple_min_itemsize(self, setup_path): + # GH 11238 + df = pd.DataFrame( + { + "IX": np.arange(1, 21), + "Num": np.arange(1, 21), + "BigNum": np.arange(1, 21) * 88, + "Str": ["a" for _ in range(20)], + "LongStr": ["abcde" for _ in range(20)], + } + ) + expected = df.iloc[[0]] + + with ensure_clean_store(setup_path) as store: + store.append_to_multiple( + { + "index": ["IX"], + "nums": ["Num", "BigNum"], + "strs": ["Str", "LongStr"], + }, + df.iloc[[0]], + "index", + min_itemsize={"Str": 10, "LongStr": 100, "Num": 2}, + ) + result = store.select_as_multiple(["index", "nums", "strs"]) + tm.assert_frame_equal(result, expected) + def test_select_as_multiple(self, setup_path): df1 = tm.makeTimeDataFrame() From 6336dba0f27a2c8bc19aea65bf554d226f48d42b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 25 Jun 2020 16:06:10 -0700 Subject: [PATCH 0214/1025] PERF: avoid creating many Series in apply_standard (#34909) --- pandas/core/apply.py | 113 ++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 65 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 90cb0e2e1be4c..102c457f94a95 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -4,16 +4,13 @@ import numpy as np +from pandas._config import option_context + from pandas._libs import reduction as libreduction from pandas._typing import Axis from pandas.util._decorators import cache_readonly -from pandas.core.dtypes.common import ( - is_dict_like, - is_extension_array_dtype, - is_list_like, - is_sequence, -) +from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence from 
pandas.core.dtypes.generic import ABCSeries from pandas.core.construction import create_series_with_explicit_dtype @@ -260,53 +257,6 @@ def apply_standard(self): # partial result that may be returned from reduction partial_result = None - # try to reduce first (by default) - # this only matters if the reduction in values is of different dtype - # e.g. if we want to apply to a SparseFrame, then can't directly reduce - - # we cannot reduce using non-numpy dtypes, - # as demonstrated in gh-12244 - if ( - self.result_type in ["reduce", None] - and not self.dtypes.apply(is_extension_array_dtype).any() - # Disallow dtypes where setting _index_data will break - # ExtensionArray values, see GH#31182 - and not self.dtypes.apply(lambda x: x.kind in ["m", "M"]).any() - # Disallow complex_internals since libreduction shortcut raises a TypeError - and not self.agg_axis._has_complex_internals - ): - - values = self.values - index = self.obj._get_axis(self.axis) - labels = self.agg_axis - empty_arr = np.empty(len(index), dtype=values.dtype) - - # Preserve subclass for e.g. test_subclassed_apply - dummy = self.obj._constructor_sliced( - empty_arr, index=index, dtype=values.dtype - ) - - try: - result, reduction_success = libreduction.compute_reduction( - values, self.f, axis=self.axis, dummy=dummy, labels=labels - ) - except TypeError: - # e.g. test_apply_ignore_failures we just ignore - if not self.ignore_failures: - raise - except ZeroDivisionError: - # reached via numexpr; fall back to python implementation - pass - else: - if reduction_success: - return self.obj._constructor_sliced(result, index=labels) - - # no exceptions - however reduction was unsuccessful, - # use the computed function result for first element - partial_result = result[0] - if isinstance(partial_result, ABCSeries): - partial_result = partial_result.infer_objects() - # compute the result using the series generator, # use the result computed while trying to reduce if available. 
results, res_index = self.apply_series_generator(partial_result) @@ -344,7 +294,14 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"] else: for i, v in series_gen_enumeration: - results[i] = self.f(v) + with option_context("mode.chained_assignment", None): + # ignore SettingWithCopy here in case the user mutates + results[i] = self.f(v) + + if isinstance(results[i], ABCSeries): + # If we have a view on v, we need to make a copy because + # series_generator will swap out the underlying data + results[i] = results[i].copy(deep=False) return results, res_index @@ -355,7 +312,6 @@ def wrap_results( # see if we can infer the results if len(results) > 0 and 0 in results and is_sequence(results[0]): - return self.wrap_results_for_axis(results, res_index) # dict of scalars @@ -395,9 +351,30 @@ def result_columns(self) -> "Index": def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> "DataFrame": + ) -> Union["Series", "DataFrame"]: """ return the results for the rows """ - result = self.obj._constructor(data=results) + + if self.result_type == "reduce": + # e.g. test_apply_dict GH#8735 + return self.obj._constructor_sliced(results) + elif self.result_type is None and all( + isinstance(x, dict) for x in results.values() + ): + # Our operation was a to_dict op e.g. + # test_apply_dict GH#8735, test_apply_reduce_rows_to_dict GH#25196 + return self.obj._constructor_sliced(results) + + try: + result = self.obj._constructor(data=results) + except ValueError as err: + if "arrays must all be same length" in str(err): + # e.g. 
result = [[2, 3], [1.5], ['foo', 'bar']] + # see test_agg_listlike_result GH#29587 + res = self.obj._constructor_sliced(results) + res.index = res_index + return res + else: + raise if not isinstance(results[0], ABCSeries): if len(result.index) == len(self.res_columns): @@ -418,11 +395,19 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame": @property def series_generator(self): - constructor = self.obj._constructor_sliced - return ( - constructor(arr, index=self.columns, name=name) - for i, (arr, name) in enumerate(zip(self.values, self.index)) - ) + values = self.values + assert len(values) > 0 + + # We create one Series object, and will swap out the data inside + # of it. Kids: don't do this at home. + ser = self.obj._ixs(0, axis=0) + mgr = ser._mgr + blk = mgr.blocks[0] + + for (arr, name) in zip(values, self.index): + blk.values = arr + ser.name = name + yield ser @property def result_index(self) -> "Index": @@ -444,9 +429,7 @@ def wrap_results_for_axis( # we have a non-series and don't want inference elif not isinstance(results[0], ABCSeries): - from pandas import Series - - result = Series(results) + result = self.obj._constructor_sliced(results) result.index = res_index # we may want to infer results From 3f34bf1e20532cf40ba60989ff8295ae677c14ed Mon Sep 17 00:00:00 2001 From: Valentin Iovene Date: Fri, 26 Jun 2020 01:12:59 +0200 Subject: [PATCH 0215/1025] BUG: conversion of empty DataFrame to SparseDtype (#33113) (#33118) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/generic.py | 4 ++++ pandas/tests/extension/base/casting.py | 6 ++++++ pandas/tests/frame/methods/test_astype.py | 8 ++++++++ 4 files changed, 19 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d836db480beb0..3cd2beaa9e4bf 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1101,6 +1101,7 @@ Sparse - Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`) - Bug where 
:class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`) +- Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 307bf84068424..4e0247bfcddca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5537,6 +5537,10 @@ def astype( new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) return self._constructor(new_data).__finalize__(self, method="astype") + # GH 33113: handle empty frame or series + if not results: + return self.copy() + # GH 19920: retain column metadata after concat result = pd.concat(results, axis=1, copy=False) result.columns = self.columns diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 567a62a8b33a5..3aaf040a4279b 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -50,3 +50,9 @@ def test_to_numpy(self, data): result = pd.Series(data).to_numpy() self.assert_equal(result, expected) + + def test_astype_empty_dataframe(self, dtype): + # https://github.com/pandas-dev/pandas/issues/33113 + df = pd.DataFrame() + result = df.astype(dtype) + self.assert_frame_equal(result, df) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index b06c3d72a2c77..b0fd0496ea81e 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -557,3 +557,11 @@ def test_astype_dt64tz_to_str(self, timezone_frame): assert ( "2 2013-01-03 2013-01-03 00:00:00-05:00 2013-01-03 00:00:00+01:00" ) in result + + def test_astype_empty_dtype_dict(self): + # issue mentioned further down in the following 
issue's thread + # https://github.com/pandas-dev/pandas/issues/33113 + df = DataFrame() + result = df.astype(dict()) + tm.assert_frame_equal(result, df) + assert result is not df From 2e8fa28e5e8fb8b8a9deed25d9af29ee1d8cef5a Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 25 Jun 2020 19:24:25 -0400 Subject: [PATCH 0216/1025] BUG: exponential moving window covariance fails for multiIndexed DataFrame (#34943) * added test for df.ewm.cov with multiindex * BUG: fixed _flex_binary_moment for multiindex * added reference to GH issue * DOC: updated whatnew * DOC: moved note to rolling section of whatsnew * changed df to fixed seed, linearly spaced ints * removed extraneous comment * TST: hardcoded expected df for test_multiindex_cov * TST: cleaned up comment + blank line * TST: clean up index definition --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/window/common.py | 5 ++++- pandas/tests/window/test_pairwise.py | 27 ++++++++++++++++++++++++++- 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3cd2beaa9e4bf..ce0ac1a84b6fa 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1061,6 +1061,7 @@ Groupby/resample/rolling - Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. 
(:issue:`34422`) - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) +- Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 413fe648903ac..58e7841d4dde5 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -179,7 +179,10 @@ def dataframe_from_int_dict(data, frame_template): result.index = MultiIndex.from_product( arg2.columns.levels + [result_index] ) - result = result.reorder_levels([2, 0, 1]).sort_index() + # GH 34440 + num_levels = len(result.index.levels) + new_order = [num_levels - 1] + list(range(num_levels - 1)) + result = result.reorder_levels(new_order).sort_index() else: result.index = MultiIndex.from_product( [range(len(arg2.columns)), range(len(result_index))] diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index bb305e93a3cf1..e82d4b8cbf770 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import DataFrame, Series, date_range +from pandas import DataFrame, MultiIndex, Series, date_range import pandas._testing as tm from pandas.core.algorithms import safe_sort @@ -189,3 +189,28 @@ def test_corr_freq_memory_error(self): result = s.rolling("12H").corr(s) expected = Series([np.nan] * 5, index=date_range("2020", periods=5)) tm.assert_series_equal(result, expected) + + def test_cov_mulittindex(self): + # GH 34440 + + columns = MultiIndex.from_product([list("ab"), list("xy"), list("AB")]) + index = range(3) + df = DataFrame(np.arange(24).reshape(3, 8), index=index, columns=columns,) + + result = df.ewm(alpha=0.1).cov() + + index = 
MultiIndex.from_product([range(3), list("ab"), list("xy"), list("AB")]) + columns = MultiIndex.from_product([list("ab"), list("xy"), list("AB")]) + expected = DataFrame( + np.vstack( + ( + np.full((8, 8), np.NaN), + np.full((8, 8), 32.000000), + np.full((8, 8), 63.881919), + ) + ), + index=index, + columns=columns, + ) + + tm.assert_frame_equal(result, expected) From 50f05db40cfc3dcd49777c405b92923f1037c0c0 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Fri, 26 Jun 2020 02:37:08 +0200 Subject: [PATCH 0217/1025] ENH: Implement xlabel and ylabel options in Series.plot and DataFrame.plot (#34223) --- doc/source/user_guide/visualization.rst | 28 +++++++++++++ doc/source/whatsnew/v1.1.0.rst | 1 + pandas/plotting/_core.py | 14 +++++++ pandas/plotting/_matplotlib/core.py | 21 ++++++++-- pandas/tests/plotting/test_frame.py | 56 +++++++++++++++++++++++++ pandas/tests/plotting/test_misc.py | 2 +- pandas/tests/plotting/test_series.py | 20 +++++++++ 7 files changed, 138 insertions(+), 4 deletions(-) diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 6ba5cab71bf14..27826e7cde9e1 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1108,6 +1108,34 @@ shown by default. plt.close('all') + +Controlling the labels +~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.1.0 + +You may set the ``xlabel`` and ``ylabel`` arguments to give the plot custom labels +for x and y axis. By default, pandas will pick up index name as xlabel, while leaving +it empty for ylabel. + +.. ipython:: python + :suppress: + + plt.figure() + +.. ipython:: python + + df.plot() + + @savefig plot_xlabel_ylabel.png + df.plot(xlabel="new x", ylabel="new y") + +.. 
ipython:: python + :suppress: + + plt.close('all') + + Scales ~~~~~~ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index ce0ac1a84b6fa..75f406d908c73 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -303,6 +303,7 @@ Other enhancements :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). +- :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts `xlabel` and `ylabel` parameters to present labels on x and y axis (:issue:`9093`). - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 4eb68367560b6..3a8cc5c299640 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -673,6 +673,16 @@ class PlotAccessor(PandasObject): Set the x limits of the current axes. ylim : 2-tuple/list Set the y limits of the current axes. + xlabel : label, optional + Name to use for the xlabel on x-axis. Default uses index name as xlabel. + + .. versionadded:: 1.1.0 + + ylabel : label, optional + Name to use for the ylabel on y-axis. Default will show no ylabel. + + .. versionadded:: 1.1.0 + rot : int, default None Rotation for ticks (xticks for vertical, yticks for horizontal plots). 
@@ -779,6 +789,8 @@ def _get_call_args(backend_name, data, args, kwargs): ("xerr", None), ("label", None), ("secondary_y", False), + ("xlabel", None), + ("ylabel", None), ] elif isinstance(data, ABCDataFrame): arg_def = [ @@ -811,6 +823,8 @@ def _get_call_args(backend_name, data, args, kwargs): ("xerr", None), ("secondary_y", False), ("sort_columns", False), + ("xlabel", None), + ("ylabel", None), ] else: raise TypeError( diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index f3682e0a008a6..e510f7140519a 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,9 +1,11 @@ import re -from typing import Optional +from typing import List, Optional import warnings +from matplotlib.artist import Artist import numpy as np +from pandas._typing import Label from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly @@ -97,6 +99,8 @@ def __init__( ylim=None, xticks=None, yticks=None, + xlabel: Optional[Label] = None, + ylabel: Optional[Label] = None, sort_columns=False, fontsize=None, secondary_y=False, @@ -138,6 +142,8 @@ def __init__( self.ylim = ylim self.title = title self.use_index = use_index + self.xlabel = xlabel + self.ylabel = ylabel self.fontsize = fontsize @@ -155,8 +161,8 @@ def __init__( self.grid = grid self.legend = legend - self.legend_handles = [] - self.legend_labels = [] + self.legend_handles: List[Artist] = [] + self.legend_labels: List[Label] = [] for attr in self._pop_attributes: value = kwds.pop(attr, self._attr_defaults.get(attr, None)) @@ -482,6 +488,11 @@ def _adorn_subplots(self): if self.xlim is not None: ax.set_xlim(self.xlim) + # GH9093, currently Pandas does not show ylabel, so if users provide + # ylabel will set it as ylabel in the plot. 
+ if self.ylabel is not None: + ax.set_ylabel(pprint_thing(self.ylabel)) + ax.grid(self.grid) if self.title: @@ -668,6 +679,10 @@ def _get_index_name(self): if name is not None: name = pprint_thing(name) + # GH 9093, override the default xlabel if xlabel is provided. + if self.xlabel is not None: + name = pprint_thing(self.xlabel) + return name @classmethod diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index e4299490e7601..3d85e79b15c4c 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3363,6 +3363,62 @@ def test_colors_of_columns_with_same_name(self): for legend, line in zip(result.get_legend().legendHandles, result.lines): assert legend.get_color() == line.get_color() + @pytest.mark.parametrize( + "index_name, old_label, new_label", + [ + (None, "", "new"), + ("old", "old", "new"), + (None, "", ""), + (None, "", 1), + (None, "", [1, 2]), + ], + ) + @pytest.mark.parametrize("kind", ["line", "area", "bar"]) + def test_xlabel_ylabel_dataframe_single_plot( + self, kind, index_name, old_label, new_label + ): + # GH 9093 + df = pd.DataFrame([[1, 2], [2, 5]], columns=["Type A", "Type B"]) + df.index.name = index_name + + # default is the ylabel is not shown and xlabel is index name + ax = df.plot(kind=kind) + assert ax.get_xlabel() == old_label + assert ax.get_ylabel() == "" + + # old xlabel will be overriden and assigned ylabel will be used as ylabel + ax = df.plot(kind=kind, ylabel=new_label, xlabel=new_label) + assert ax.get_ylabel() == str(new_label) + assert ax.get_xlabel() == str(new_label) + + @pytest.mark.parametrize( + "index_name, old_label, new_label", + [ + (None, "", "new"), + ("old", "old", "new"), + (None, "", ""), + (None, "", 1), + (None, "", [1, 2]), + ], + ) + @pytest.mark.parametrize("kind", ["line", "area", "bar"]) + def test_xlabel_ylabel_dataframe_subplots( + self, kind, index_name, old_label, new_label + ): + # GH 9093 + df = pd.DataFrame([[1, 2], [2, 5]], 
columns=["Type A", "Type B"]) + df.index.name = index_name + + # default is the ylabel is not shown and xlabel is index name + axes = df.plot(kind=kind, subplots=True) + assert all(ax.get_ylabel() == "" for ax in axes) + assert all(ax.get_xlabel() == old_label for ax in axes) + + # old xlabel will be overriden and assigned ylabel will be used as ylabel + axes = df.plot(kind=kind, ylabel=new_label, xlabel=new_label, subplots=True) + assert all(ax.get_ylabel() == str(new_label) for ax in axes) + assert all(ax.get_xlabel() == str(new_label) for ax in axes) + def _generate_4_axes_via_gridspec(): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 0b0d23632e827..75eeede472fe9 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -54,7 +54,7 @@ def test_get_accessor_args(): assert x is None assert y is None assert kind == "line" - assert len(kwargs) == 22 + assert len(kwargs) == 24 @td.skip_if_no_mpl diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 6da892c15f489..64da98f57676f 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -934,3 +934,23 @@ def test_style_single_ok(self): s = pd.Series([1, 2]) ax = s.plot(style="s", color="C3") assert ax.lines[0].get_color() == ["C3"] + + @pytest.mark.parametrize( + "index_name, old_label, new_label", + [(None, "", "new"), ("old", "old", "new"), (None, "", "")], + ) + @pytest.mark.parametrize("kind", ["line", "area", "bar"]) + def test_xlabel_ylabel_series(self, kind, index_name, old_label, new_label): + # GH 9093 + ser = pd.Series([1, 2, 3, 4]) + ser.index.name = index_name + + # default is the ylabel is not shown and xlabel is index name + ax = ser.plot(kind=kind) + assert ax.get_ylabel() == "" + assert ax.get_xlabel() == old_label + + # old xlabel will be overriden and assigned ylabel will be used as ylabel + ax = ser.plot(kind=kind, 
ylabel=new_label, xlabel=new_label) + assert ax.get_ylabel() == new_label + assert ax.get_xlabel() == new_label From abdd5cc265ebdc1029800a479bb74366a502867f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 26 Jun 2020 13:26:42 +0100 Subject: [PATCH 0218/1025] CI: lint failure on master (#35007) --- pandas/core/apply.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 102c457f94a95..9c223d66b727b 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -6,7 +6,6 @@ from pandas._config import option_context -from pandas._libs import reduction as libreduction from pandas._typing import Axis from pandas.util._decorators import cache_readonly From 6caf6a4803f2e6a659756c3aa0454d215a23cf91 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 26 Jun 2020 13:34:09 +0100 Subject: [PATCH 0219/1025] CLN: remove redundant code in IndexOpsMixin.item (#35008) --- pandas/core/base.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 813de491ffdb3..b62ef668df5e1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -22,7 +22,6 @@ is_list_like, is_object_dtype, is_scalar, - needs_i8_conversion, ) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -656,13 +655,6 @@ def item(self): ValueError If the data is not length-1. """ - if not ( - is_extension_array_dtype(self.dtype) or needs_i8_conversion(self.dtype) - ): - # numpy returns ints instead of datetime64/timedelta64 objects, - # which we need to wrap in Timestamp/Timedelta/Period regardless. 
- return self._values.item() - if len(self) == 1: return next(iter(self)) raise ValueError("can only convert an array of size 1 to a Python scalar") From 166c04acacdede50f4a267aabcef94b711ff8099 Mon Sep 17 00:00:00 2001 From: SanthoshBala18 Date: Fri, 26 Jun 2020 18:42:42 +0530 Subject: [PATCH 0220/1025] Fix issue #35010: Double requirement given for fsspec (#35012) --- environment.yml | 1 - requirements-dev.txt | 1 - 2 files changed, 2 deletions(-) diff --git a/environment.yml b/environment.yml index 3783b7d360f1a..2429f4ab3d699 100644 --- a/environment.yml +++ b/environment.yml @@ -37,7 +37,6 @@ dependencies: # Dask and its dependencies (that dont install with dask) - dask-core - toolz>=0.7.3 - - fsspec>=0.5.1 - partd>=0.3.10 - cloudpickle>=0.2.1 diff --git a/requirements-dev.txt b/requirements-dev.txt index 90f9fec2f4bdf..44c975a3b3cfb 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -22,7 +22,6 @@ nbsphinx pandoc dask toolz>=0.7.3 -fsspec>=0.5.1 partd>=0.3.10 cloudpickle>=0.2.1 markdown From ec721935572be6c65e7ddcb158eb76d5571579cf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 26 Jun 2020 13:18:30 -0700 Subject: [PATCH 0221/1025] CLN: remove libreduction.Reducer (#35001) --- pandas/_libs/reduction.pyx | 174 +---------------------- pandas/tests/groupby/test_bin_groupby.py | 36 +---- 2 files changed, 2 insertions(+), 208 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 58de682c56d55..97c491776f831 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -1,17 +1,12 @@ from copy import copy from cython import Py_ssize_t -from cpython.ref cimport Py_INCREF from libc.stdlib cimport malloc, free import numpy as np cimport numpy as cnp -from numpy cimport (ndarray, - int64_t, - PyArray_SETITEM, - PyArray_ITER_NEXT, PyArray_ITER_DATA, PyArray_IterNew, - flatiter) +from numpy cimport ndarray, int64_t cnp.import_array() from pandas._libs cimport util @@ -26,146 +21,6 @@ cdef 
_check_result_array(object obj, Py_ssize_t cnt): raise ValueError('Function does not reduce') -cdef class Reducer: - """ - Performs generic reduction operation on a C or Fortran-contiguous ndarray - while avoiding ndarray construction overhead - """ - cdef: - Py_ssize_t increment, chunksize, nresults - object dummy, f, labels, typ, ityp, index - ndarray arr - - def __init__( - self, ndarray arr, object f, int axis=1, object dummy=None, object labels=None - ): - cdef: - Py_ssize_t n, k - - n, k = (arr).shape - - if axis == 0: - if not arr.flags.f_contiguous: - arr = arr.copy('F') - - self.nresults = k - self.chunksize = n - self.increment = n * arr.dtype.itemsize - else: - if not arr.flags.c_contiguous: - arr = arr.copy('C') - - self.nresults = n - self.chunksize = k - self.increment = k * arr.dtype.itemsize - - self.f = f - self.arr = arr - self.labels = labels - self.dummy, self.typ, self.index, self.ityp = self._check_dummy( - dummy=dummy) - - cdef _check_dummy(self, object dummy=None): - cdef: - object index = None, typ = None, ityp = None - - if dummy is None: - dummy = np.empty(self.chunksize, dtype=self.arr.dtype) - - # our ref is stolen later since we are creating this array - # in cython, so increment first - Py_INCREF(dummy) - - else: - - # we passed a Series - typ = type(dummy) - index = dummy.index - dummy = dummy.values - - if dummy.dtype != self.arr.dtype: - raise ValueError('Dummy array must be same dtype') - if len(dummy) != self.chunksize: - raise ValueError(f'Dummy array must be length {self.chunksize}') - - return dummy, typ, index, ityp - - def get_result(self): - cdef: - char* dummy_buf - ndarray arr, result, chunk - Py_ssize_t i - flatiter it - object res, name, labels - object cached_typ = None - - arr = self.arr - chunk = self.dummy - dummy_buf = chunk.data - chunk.data = arr.data - labels = self.labels - - result = np.empty(self.nresults, dtype='O') - it = PyArray_IterNew(result) - reduction_success = True - - try: - for i in 
range(self.nresults): - - # create the cached type - # each time just reassign the data - if i == 0: - - if self.typ is not None: - # In this case, we also have self.index - name = labels[i] - cached_typ = self.typ( - chunk, index=self.index, name=name, dtype=arr.dtype) - - # use the cached_typ if possible - if cached_typ is not None: - # In this case, we also have non-None labels - name = labels[i] - - object.__setattr__( - cached_typ._mgr._block, 'values', chunk) - object.__setattr__(cached_typ, 'name', name) - res = self.f(cached_typ) - else: - res = self.f(chunk) - - # TODO: reason for not squeezing here? - extracted_res = _extract_result(res, squeeze=False) - if i == 0: - # On the first pass, we check the output shape to see - # if this looks like a reduction. - # If it does not, return the computed value to be used by the - # pure python implementation, - # so the function won't be called twice on the same object, - # and side effects would occur twice - try: - _check_result_array(extracted_res, len(self.dummy)) - except ValueError as err: - if "Function does not reduce" not in str(err): - # catch only the specific exception - raise - - reduction_success = False - PyArray_SETITEM(result, PyArray_ITER_DATA(it), copy(res)) - break - - PyArray_SETITEM(result, PyArray_ITER_DATA(it), extracted_res) - chunk.data = chunk.data + self.increment - PyArray_ITER_NEXT(it) - - finally: - # so we don't free the wrong memory - chunk.data = dummy_buf - - result = maybe_convert_objects(result) - return result, reduction_success - - cdef class _BaseGrouper: cdef _check_dummy(self, object dummy): # both values and index must be an ndarray! 
@@ -610,30 +465,3 @@ cdef class BlockSlider: # axis=1 is the frame's axis=0 arr.data = self.base_ptrs[i] arr.shape[1] = 0 - - -def compute_reduction(arr: ndarray, f, axis: int = 0, dummy=None, labels=None): - """ - - Parameters - ----------- - arr : np.ndarray - f : function - axis : integer axis - dummy : type of reduced output (series) - labels : Index or None - """ - - # We either have both dummy and labels, or neither of them - if (labels is None) ^ (dummy is None): - raise ValueError("Must pass either dummy and labels, or neither") - - if labels is not None: - # Caller is responsible for ensuring we don't have MultiIndex - assert labels.nlevels == 1 - - # pass as an ndarray/ExtensionArray - labels = labels._values - - reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels) - return reducer.get_result() diff --git a/pandas/tests/groupby/test_bin_groupby.py b/pandas/tests/groupby/test_bin_groupby.py index 9df45f7a23f55..f20eed4575e91 100644 --- a/pandas/tests/groupby/test_bin_groupby.py +++ b/pandas/tests/groupby/test_bin_groupby.py @@ -6,7 +6,7 @@ from pandas.core.dtypes.common import ensure_int64 import pandas as pd -from pandas import Index, Series, isna +from pandas import Series, isna import pandas._testing as tm @@ -136,37 +136,3 @@ def _ohlc(group): class TestMoments: pass - - -class TestReducer: - def test_int_index(self): - arr = np.random.randn(100, 4) - - msg = "Must pass either dummy and labels, or neither" - # we must pass either both labels and dummy, or neither - with pytest.raises(ValueError, match=msg): - libreduction.compute_reduction(arr, np.sum, labels=Index(np.arange(4))) - - with pytest.raises(ValueError, match=msg): - libreduction.compute_reduction( - arr, np.sum, axis=1, labels=Index(np.arange(100)) - ) - - dummy = Series(0.0, index=np.arange(100)) - result, _ = libreduction.compute_reduction( - arr, np.sum, dummy=dummy, labels=Index(np.arange(4)) - ) - expected = arr.sum(0) - tm.assert_almost_equal(result, expected) - - dummy = 
Series(0.0, index=np.arange(4)) - result, _ = libreduction.compute_reduction( - arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) - ) - expected = arr.sum(1) - tm.assert_almost_equal(result, expected) - - result, _ = libreduction.compute_reduction( - arr, np.sum, axis=1, dummy=dummy, labels=Index(np.arange(100)) - ) - tm.assert_almost_equal(result, expected) From ce1d047455db661d2f7c185a08e363b95541d780 Mon Sep 17 00:00:00 2001 From: timhunderwood <43515959+timhunderwood@users.noreply.github.com> Date: Fri, 26 Jun 2020 21:21:16 +0100 Subject: [PATCH 0222/1025] ENH: specificy missing labels in loc calls GH34272 (#34912) --- doc/source/whatsnew/v1.1.0.rst | 9 ++++++++ pandas/core/indexing.py | 19 +++++++++++------ pandas/tests/indexing/test_indexing.py | 29 ++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 75f406d908c73..260949da3c55f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -13,6 +13,15 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_110.specify_missing_labels: + +KeyErrors raised by loc specify missing labels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Previously, if labels were missing for a loc call, a KeyError was raised stating that this was no longer supported. + +Now the error message also includes a list of the missing labels (max 10 items, display width 80 characters). See :issue:`34272`. + + .. 
_whatsnew_110.astype_string: All dtypes can now be converted to ``StringDtype`` diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9c8b01003bece..3cf20b68c84f4 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2,6 +2,8 @@ import numpy as np +from pandas._config.config import option_context + from pandas._libs.indexing import _NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim from pandas.errors import AbstractMethodError, InvalidIndexError @@ -1283,7 +1285,8 @@ def _validate_read_indexer( return # Count missing values: - missing = (indexer < 0).sum() + missing_mask = indexer < 0 + missing = (missing_mask).sum() if missing: if missing == len(indexer): @@ -1302,11 +1305,15 @@ def _validate_read_indexer( # code, so we want to avoid warning & then # just raising if not ax.is_categorical(): - raise KeyError( - "Passing list-likes to .loc or [] with any missing labels " - "is no longer supported, see " - "https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 - ) + not_found = key[missing_mask] + + with option_context("display.max_seq_items", 10, "display.width", 80): + raise KeyError( + "Passing list-likes to .loc or [] with any missing labels " + "is no longer supported. " + f"The following labels were missing: {not_found}. 
" + "See https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike" # noqa:E501 + ) @doc(IndexingMixin.iloc) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 5c0230e75021c..b77c47f927517 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1075,3 +1075,32 @@ def test_setitem_with_bool_mask_and_values_matching_n_trues_in_length(): result = ser expected = pd.Series([None] * 3 + list(range(5)) + [None] * 2).astype("object") tm.assert_series_equal(result, expected) + + +def test_missing_labels_inside_loc_matched_in_error_message(): + # GH34272 + s = pd.Series({"a": 1, "b": 2, "c": 3}) + error_message_regex = "missing_0.*missing_1.*missing_2" + with pytest.raises(KeyError, match=error_message_regex): + s.loc[["a", "b", "missing_0", "c", "missing_1", "missing_2"]] + + +def test_many_missing_labels_inside_loc_error_message_limited(): + # GH34272 + n = 10000 + missing_labels = [f"missing_{label}" for label in range(n)] + s = pd.Series({"a": 1, "b": 2, "c": 3}) + # regex checks labels between 4 and 9995 are replaced with ellipses + error_message_regex = "missing_4.*\\.\\.\\..*missing_9995" + with pytest.raises(KeyError, match=error_message_regex): + s.loc[["a", "c"] + missing_labels] + + +def test_long_text_missing_labels_inside_loc_error_message_limited(): + # GH34272 + s = pd.Series({"a": 1, "b": 2, "c": 3}) + missing_labels = [f"long_missing_label_text_{i}" * 5 for i in range(3)] + # regex checks for very long labels there are new lines between each + error_message_regex = "long_missing_label_text_0.*\\\\n.*long_missing_label_text_1" + with pytest.raises(KeyError, match=error_message_regex): + s.loc[["a", "c"] + missing_labels] From da9371a7f84dd62f1f6046d20d8c224f38129b9f Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Fri, 26 Jun 2020 22:22:23 +0200 Subject: [PATCH 0223/1025] 
ENH: add ignore_index option in DataFrame.explode (#34933) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 13 +++++++++++-- pandas/core/series.py | 19 +++++++++++++++---- pandas/tests/frame/methods/test_explode.py | 10 ++++++++++ pandas/tests/series/methods/test_explode.py | 8 ++++++++ 5 files changed, 45 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 260949da3c55f..c5eb2febe8ae9 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -327,6 +327,7 @@ Other enhancements - :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`). - :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) +- :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similarly to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 521d16ac0b905..39f93af1670bf 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6939,7 +6939,9 @@ def stack(self, level=-1, dropna=True): else: return stack(self, level, dropna=dropna) - def explode(self, column: Union[str, Tuple]) -> "DataFrame": + def explode( + self, column: Union[str, Tuple], ignore_index: bool = False + ) -> "DataFrame": """ Transform each element of a list-like to a row, replicating index values. @@ -6949,6 +6951,10 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame": ---------- column : str or tuple Column to explode. 
+ ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.1.0 Returns ------- @@ -7005,7 +7011,10 @@ def explode(self, column: Union[str, Tuple]) -> "DataFrame": assert df is not None # needed for mypy result = df[column].explode() result = df.drop([column], axis=1).join(result) - result.index = self.index.take(result.index) + if ignore_index: + result.index = ibase.default_index(len(result)) + else: + result.index = self.index.take(result.index) result = result.reindex(columns=self.columns, copy=False) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index a652af5efc590..54b85afea4964 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3774,12 +3774,19 @@ def reorder_levels(self, order) -> "Series": result.index = result.index.reorder_levels(order) return result - def explode(self) -> "Series": + def explode(self, ignore_index: bool = False) -> "Series": """ Transform each element of a list-like to a row. .. versionadded:: 0.25.0 + Parameters + ---------- + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + .. 
versionadded:: 1.1.0 + Returns ------- Series @@ -3826,9 +3833,13 @@ def explode(self) -> "Series": values, counts = reshape.explode(np.asarray(self.array)) - result = self._constructor( - values, index=self.index.repeat(counts), name=self.name - ) + if ignore_index: + index = ibase.default_index(len(values)) + else: + index = self.index.repeat(counts) + + result = self._constructor(values, index=index, name=self.name) + return result def unstack(self, level=-1, fill_value=None): diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index bad8349ec977b..2bbe8ac2d5b81 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -162,3 +162,13 @@ def test_duplicate_index(input_dict, input_index, expected_dict, expected_index) result = df.explode("col1") expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object) tm.assert_frame_equal(result, expected) + + +def test_ignore_index(): + # GH 34932 + df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]}) + result = df.explode("values", ignore_index=True) + expected = pd.DataFrame( + {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index a25cfadf12467..4b65e042f7b02 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -118,3 +118,11 @@ def test_duplicate_index(): result = s.explode() expected = pd.Series([1, 2, 3, 4], index=[0, 0, 0, 0], dtype=object) tm.assert_series_equal(result, expected) + + +def test_ignore_index(): + # GH 34932 + s = pd.Series([[1, 2], [3, 4]]) + result = s.explode(ignore_index=True) + expected = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=object) + tm.assert_series_equal(result, expected) From ef74de12577fdf5c38d8036eb49a6a7ed71843c1 Mon Sep 
17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Fri, 26 Jun 2020 15:23:04 -0500 Subject: [PATCH 0224/1025] ERR: Fix to_timedelta error message (#34981) --- pandas/_libs/tslibs/timedeltas.pyx | 2 +- pandas/core/indexes/timedeltas.py | 2 +- pandas/core/tools/timedeltas.py | 2 +- pandas/tests/indexes/timedeltas/test_constructors.py | 2 +- pandas/tests/scalar/timedelta/test_timedelta.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 1c3e69e21aa18..2862e62e3d522 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1151,7 +1151,7 @@ class Timedelta(_Timedelta): if unit in {'Y', 'y', 'M'}: raise ValueError( - "Units 'M' and 'Y' are no longer supported, as they do not " + "Units 'M', 'Y', and 'y' are no longer supported, as they do not " "represent unambiguous timedelta values durations." ) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index f6661c6b50dfb..dccc8369c5366 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -136,7 +136,7 @@ def __new__( if unit in {"Y", "y", "M"}: raise ValueError( - "Units 'M' and 'Y' are no longer supported, as they do not " + "Units 'M', 'Y', and 'y' are no longer supported, as they do not " "represent unambiguous timedelta values durations." ) diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index a643c312ec358..e457a8819f27a 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -94,7 +94,7 @@ def to_timedelta(arg, unit=None, errors="raise"): if unit in {"Y", "y", "M"}: raise ValueError( - "Units 'M' and 'Y' are no longer supported, as they do not " + "Units 'M', 'Y', and 'y' are no longer supported, as they do not " "represent unambiguous timedelta values durations." 
) diff --git a/pandas/tests/indexes/timedeltas/test_constructors.py b/pandas/tests/indexes/timedeltas/test_constructors.py index acc68dfe7301f..41e4e220c999c 100644 --- a/pandas/tests/indexes/timedeltas/test_constructors.py +++ b/pandas/tests/indexes/timedeltas/test_constructors.py @@ -12,7 +12,7 @@ class TestTimedeltaIndex: @pytest.mark.parametrize("unit", ["Y", "y", "M"]) def test_unit_m_y_raises(self, unit): - msg = "Units 'M' and 'Y' are no longer supported" + msg = "Units 'M', 'Y', and 'y' are no longer supported" with pytest.raises(ValueError, match=msg): TimedeltaIndex([1, 3, 7], unit) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 38e77321418d1..a01921bd6c4c2 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -265,7 +265,7 @@ def test_unit_parser(self, units, np_unit, wrapper): @pytest.mark.parametrize("unit", ["Y", "y", "M"]) def test_unit_m_y_raises(self, unit): - msg = "Units 'M' and 'Y' are no longer supported" + msg = "Units 'M', 'Y', and 'y' are no longer supported" with pytest.raises(ValueError, match=msg): Timedelta(10, unit) From 1499814a1263e464a2a88a3ba700378f101b0ea2 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 27 Jun 2020 00:53:00 +0100 Subject: [PATCH 0225/1025] TST: rename fixtures named 'indices' to 'index' (#35024) --- pandas/conftest.py | 8 +- pandas/tests/base/test_misc.py | 3 +- pandas/tests/generic/test_generic.py | 10 +- pandas/tests/generic/test_to_xarray.py | 16 +- .../indexes/categorical/test_category.py | 14 +- pandas/tests/indexes/common.py | 286 ++++++++------- .../indexes/datetimes/test_datetimelike.py | 2 +- pandas/tests/indexes/interval/test_base.py | 2 +- pandas/tests/indexes/period/test_period.py | 2 +- pandas/tests/indexes/ranges/test_range.py | 8 +- pandas/tests/indexes/test_any_index.py | 77 ++-- pandas/tests/indexes/test_base.py | 333 +++++++++--------- 
pandas/tests/indexes/test_common.py | 252 ++++++------- pandas/tests/indexes/test_numeric.py | 10 +- pandas/tests/indexes/test_numpy_compat.py | 42 ++- pandas/tests/indexes/test_setops.py | 10 +- .../indexes/timedeltas/test_timedelta.py | 2 +- pandas/tests/indexing/test_indexing.py | 16 +- pandas/tests/series/methods/test_to_period.py | 3 +- .../tests/series/methods/test_to_timestamp.py | 3 +- pandas/tests/series/test_apply.py | 6 +- 21 files changed, 543 insertions(+), 562 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index e4cb3270b9acf..d74c43069574f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -408,7 +408,7 @@ def _create_mi_with_dt64tz_level(): @pytest.fixture(params=indices_dict.keys()) -def indices(request): +def index(request): """ Fixture for many "simple" kinds of indices. @@ -423,7 +423,7 @@ def indices(request): # Needed to generate cartesian product of indices -index_fixture2 = indices +index_fixture2 = index # ---------------------------------------------------------------- @@ -478,11 +478,11 @@ def _create_series(index): @pytest.fixture -def series_with_simple_index(indices): +def series_with_simple_index(index): """ Fixture for tests on series with changing types of indices. 
""" - return _create_series(indices) + return _create_series(index) _narrow_dtypes = [ diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 527f806483d94..78a830c7f43d8 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -173,8 +173,7 @@ def test_searchsorted(index_or_series_obj): assert 0 <= index <= len(obj) -def test_access_by_position(indices): - index = indices +def test_access_by_position(index): if len(index) == 0: pytest.skip("Test doesn't make sense on empty data") diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 05588ead54be4..94747a52136c4 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -251,13 +251,13 @@ def test_metadata_propagation(self): self.check_metadata(v1 & v2) self.check_metadata(v1 | v2) - def test_head_tail(self, indices): + def test_head_tail(self, index): # GH5370 - o = self._construct(shape=len(indices)) + o = self._construct(shape=len(index)) axis = o._get_axis_name(0) - setattr(o, axis, indices) + setattr(o, axis, index) o.head() @@ -273,8 +273,8 @@ def test_head_tail(self, indices): self._compare(o.tail(len(o) + 1), o) # neg index - self._compare(o.head(-3), o.head(len(indices) - 3)) - self._compare(o.tail(-3), o.tail(len(indices) - 3)) + self._compare(o.head(-3), o.head(len(index) - 3)) + self._compare(o.tail(-3), o.tail(len(index) - 3)) def test_sample(self): # Fixes issue: 2419 diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index 2fde96a1c8f89..ab56a752f7e90 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -10,10 +10,10 @@ class TestDataFrameToXArray: @td.skip_if_no("xarray", "0.10.0") - def test_to_xarray_index_types(self, indices): - if isinstance(indices, pd.MultiIndex): + def test_to_xarray_index_types(self, index): + if isinstance(index, pd.MultiIndex): pytest.skip("MultiIndex is 
tested separately") - if len(indices) == 0: + if len(index) == 0: pytest.skip("Test doesn't make sense for empty index") from xarray import Dataset @@ -31,7 +31,7 @@ def test_to_xarray_index_types(self, indices): } ) - df.index = indices[:3] + df.index = index[:3] df.index.name = "foo" df.columns.name = "bar" result = df.to_xarray() @@ -93,17 +93,17 @@ def test_to_xarray(self): class TestSeriesToXArray: @td.skip_if_no("xarray", "0.10.0") - def test_to_xarray_index_types(self, indices): - if isinstance(indices, pd.MultiIndex): + def test_to_xarray_index_types(self, index): + if isinstance(index, pd.MultiIndex): pytest.skip("MultiIndex is tested separately") from xarray import DataArray - s = Series(range(len(indices)), index=indices, dtype="int64") + s = Series(range(len(index)), index=index, dtype="int64") s.index.name = "foo" result = s.to_xarray() repr(result) - assert len(result) == len(indices) + assert len(result) == len(index) assert len(result.coords) == 1 tm.assert_almost_equal(list(result.coords.keys()), ["foo"]) assert isinstance(result, DataArray) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 7a1ccba08853b..7f30a77872bc1 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -15,7 +15,7 @@ class TestCategoricalIndex(Base): _holder = CategoricalIndex @pytest.fixture - def indices(self, request): + def index(self, request): return tm.makeCategoricalIndex(100) def create_index(self, categories=None, ordered=False): @@ -354,7 +354,7 @@ def test_identical(self): assert ci1.identical(ci1.copy()) assert not ci1.identical(ci2) - def test_ensure_copied_data(self, indices): + def test_ensure_copied_data(self, index): # gh-12309: Check the "copy" argument of each # Index.__new__ is honored. # @@ -364,12 +364,12 @@ def test_ensure_copied_data(self, indices): # FIXME: is this test still meaningful? 
_base = lambda ar: ar if getattr(ar, "base", None) is None else ar.base - result = CategoricalIndex(indices.values, copy=True) - tm.assert_index_equal(indices, result) - assert _base(indices.values) is not _base(result.values) + result = CategoricalIndex(index.values, copy=True) + tm.assert_index_equal(index, result) + assert _base(index.values) is not _base(result.values) - result = CategoricalIndex(indices.values, copy=False) - assert _base(indices.values) is _base(result.values) + result = CategoricalIndex(index.values, copy=False) + assert _base(index.values) is _base(result.values) def test_equals_categorical(self): ci1 = CategoricalIndex(["a", "b"], categories=["a", "b"], ordered=True) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index ae297bf1069b0..30c58506f619d 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -180,21 +180,21 @@ def test_reindex_base(self): with pytest.raises(ValueError, match="Invalid fill method"): idx.get_indexer(idx, method="invalid") - def test_get_indexer_consistency(self, indices): + def test_get_indexer_consistency(self, index): # See GH 16819 - if isinstance(indices, IntervalIndex): + if isinstance(index, IntervalIndex): return - if indices.is_unique or isinstance(indices, CategoricalIndex): - indexer = indices.get_indexer(indices[0:2]) + if index.is_unique or isinstance(index, CategoricalIndex): + indexer = index.get_indexer(index[0:2]) assert isinstance(indexer, np.ndarray) assert indexer.dtype == np.intp else: e = "Reindexing only valid with uniquely valued Index objects" with pytest.raises(InvalidIndexError, match=e): - indices.get_indexer(indices[0:2]) + index.get_indexer(index[0:2]) - indexer, _ = indices.get_indexer_non_unique(indices[0:2]) + indexer, _ = index.get_indexer_non_unique(index[0:2]) assert isinstance(indexer, np.ndarray) assert indexer.dtype == np.intp @@ -224,20 +224,20 @@ def test_repr_max_seq_item_setting(self): repr(idx) assert "..." 
not in str(idx) - def test_copy_name(self, indices): + def test_copy_name(self, index): # gh-12309: Check that the "name" argument # passed at initialization is honored. - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): return - first = type(indices)(indices, copy=True, name="mario") + first = type(index)(index, copy=True, name="mario") second = type(first)(first, copy=False) # Even though "copy=False", we want a new object. assert first is not second # Not using tm.assert_index_equal() since names differ. - assert indices.equals(first) + assert index.equals(first) assert first.name == "mario" assert second.name == "mario" @@ -245,78 +245,76 @@ def test_copy_name(self, indices): s1 = Series(2, index=first) s2 = Series(3, index=second[:-1]) - if not isinstance(indices, CategoricalIndex): + if not isinstance(index, CategoricalIndex): # See gh-13365 s3 = s1 * s2 assert s3.index.name == "mario" - def test_ensure_copied_data(self, indices): + def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured # GH12309 init_kwargs = {} - if isinstance(indices, PeriodIndex): + if isinstance(index, PeriodIndex): # Needs "freq" specification: - init_kwargs["freq"] = indices.freq - elif isinstance(indices, (RangeIndex, MultiIndex, CategoricalIndex)): + init_kwargs["freq"] = index.freq + elif isinstance(index, (RangeIndex, MultiIndex, CategoricalIndex)): # RangeIndex cannot be initialized from data # MultiIndex and CategoricalIndex are tested separately return - index_type = type(indices) - result = index_type(indices.values, copy=True, **init_kwargs) - if is_datetime64tz_dtype(indices.dtype): - result = result.tz_localize("UTC").tz_convert(indices.tz) - if isinstance(indices, (DatetimeIndex, TimedeltaIndex)): - indices = indices._with_freq(None) + index_type = type(index) + result = index_type(index.values, copy=True, **init_kwargs) + if is_datetime64tz_dtype(index.dtype): + result = 
result.tz_localize("UTC").tz_convert(index.tz) + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): + index = index._with_freq(None) - tm.assert_index_equal(indices, result) + tm.assert_index_equal(index, result) - if isinstance(indices, PeriodIndex): + if isinstance(index, PeriodIndex): # .values an object array of Period, thus copied - result = index_type(ordinal=indices.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(indices.asi8, result.asi8, check_same="same") - elif isinstance(indices, IntervalIndex): + result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) + tm.assert_numpy_array_equal(index.asi8, result.asi8, check_same="same") + elif isinstance(index, IntervalIndex): # checked in test_interval.py pass else: - result = index_type(indices.values, copy=False, **init_kwargs) - tm.assert_numpy_array_equal( - indices.values, result.values, check_same="same" - ) + result = index_type(index.values, copy=False, **init_kwargs) + tm.assert_numpy_array_equal(index.values, result.values, check_same="same") - def test_memory_usage(self, indices): - indices._engine.clear_mapping() - result = indices.memory_usage() - if indices.empty: + def test_memory_usage(self, index): + index._engine.clear_mapping() + result = index.memory_usage() + if index.empty: # we report 0 for no-length assert result == 0 return # non-zero length - indices.get_loc(indices[0]) - result2 = indices.memory_usage() - result3 = indices.memory_usage(deep=True) + index.get_loc(index[0]) + result2 = index.memory_usage() + result3 = index.memory_usage(deep=True) # RangeIndex, IntervalIndex # don't have engines - if not isinstance(indices, (RangeIndex, IntervalIndex)): + if not isinstance(index, (RangeIndex, IntervalIndex)): assert result2 > result - if indices.inferred_type == "object": + if index.inferred_type == "object": assert result3 > result2 - def test_argsort(self, request, indices): + def test_argsort(self, request, index): # separately tested - if 
isinstance(indices, CategoricalIndex): + if isinstance(index, CategoricalIndex): return - result = indices.argsort() - expected = np.array(indices).argsort() + result = index.argsort() + expected = np.array(index).argsort() tm.assert_numpy_array_equal(result, expected, check_dtype=False) - def test_numpy_argsort(self, indices): - result = np.argsort(indices) - expected = indices.argsort() + def test_numpy_argsort(self, index): + result = np.argsort(index) + expected = index.argsort() tm.assert_numpy_array_equal(result, expected) # these are the only two types that perform @@ -326,34 +324,34 @@ def test_numpy_argsort(self, indices): # defined in pandas.core.indexes/base.py - they # cannot be changed at the moment due to # backwards compatibility concerns - if isinstance(type(indices), (CategoricalIndex, RangeIndex)): + if isinstance(type(index), (CategoricalIndex, RangeIndex)): msg = "the 'axis' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(indices, axis=1) + np.argsort(index, axis=1) msg = "the 'kind' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(indices, kind="mergesort") + np.argsort(index, kind="mergesort") msg = "the 'order' parameter is not supported" with pytest.raises(ValueError, match=msg): - np.argsort(indices, order=("a", "b")) + np.argsort(index, order=("a", "b")) - def test_take(self, indices): + def test_take(self, index): indexer = [4, 3, 0, 2] - if len(indices) < 5: + if len(index) < 5: # not enough elements; ignore return - result = indices.take(indexer) - expected = indices[indexer] + result = index.take(indexer) + expected = index[indexer] assert result.equals(expected) - if not isinstance(indices, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + if not isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): # GH 10791 msg = r"'(.*Index)' object has no attribute 'freq'" with pytest.raises(AttributeError, match=msg): - indices.freq + index.freq def 
test_take_invalid_kwargs(self): idx = self.create_index() @@ -413,22 +411,22 @@ def test_where(self, klass): @pytest.mark.parametrize( "method", ["intersection", "union", "difference", "symmetric_difference"] ) - def test_set_ops_error_cases(self, case, method, indices): + def test_set_ops_error_cases(self, case, method, index): # non-iterable input msg = "Input must be Index or array-like" with pytest.raises(TypeError, match=msg): - getattr(indices, method)(case) + getattr(index, method)(case) - def test_intersection_base(self, indices): - if isinstance(indices, CategoricalIndex): + def test_intersection_base(self, index): + if isinstance(index, CategoricalIndex): return - first = indices[:5] - second = indices[:3] + first = index[:5] + second = index[:3] intersect = first.intersection(second) assert tm.equalContents(intersect, second) - if is_datetime64tz_dtype(indices.dtype): + if is_datetime64tz_dtype(index.dtype): # The second.values below will drop tz, so the rest of this test # is not applicable. return @@ -439,19 +437,19 @@ def test_intersection_base(self, indices): result = first.intersection(case) assert tm.equalContents(result, second) - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" with pytest.raises(TypeError, match=msg): first.intersection([1, 2, 3]) - def test_union_base(self, indices): - first = indices[3:] - second = indices[:5] - everything = indices + def test_union_base(self, index): + first = index[3:] + second = index[:5] + everything = index union = first.union(second) assert tm.equalContents(union, everything) - if is_datetime64tz_dtype(indices.dtype): + if is_datetime64tz_dtype(index.dtype): # The second.values below will drop tz, so the rest of this test # is not applicable. 
return @@ -459,29 +457,29 @@ def test_union_base(self, indices): # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if not isinstance(indices, CategoricalIndex): + if not isinstance(index, CategoricalIndex): result = first.union(case) assert tm.equalContents(result, everything) - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" with pytest.raises(TypeError, match=msg): first.union([1, 2, 3]) - def test_difference_base(self, sort, indices): - first = indices[2:] - second = indices[:4] - if isinstance(indices, CategoricalIndex) or indices.is_boolean(): + def test_difference_base(self, sort, index): + first = index[2:] + second = index[:4] + if isinstance(index, CategoricalIndex) or index.is_boolean(): answer = [] else: - answer = indices[4:] + answer = index[4:] result = first.difference(second, sort) assert tm.equalContents(result, answer) # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - if isinstance(indices, (DatetimeIndex, TimedeltaIndex)): + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): assert type(result) == type(answer) tm.assert_numpy_array_equal( result.sort_values().asi8, answer.sort_values().asi8 @@ -490,18 +488,18 @@ def test_difference_base(self, sort, indices): result = first.difference(case, sort) assert tm.equalContents(result, answer) - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" with pytest.raises(TypeError, match=msg): first.difference([1, 2, 3], sort) - def test_symmetric_difference(self, indices): - if isinstance(indices, CategoricalIndex): + def test_symmetric_difference(self, index): + if isinstance(index, CategoricalIndex): return - first = indices[1:] - second = indices[:-1] - answer = indices[[0, -1]] + first = index[1:] + second = index[:-1] + answer = index[[0, -1]] result 
= first.symmetric_difference(second) assert tm.equalContents(result, answer) @@ -511,64 +509,64 @@ def test_symmetric_difference(self, indices): result = first.symmetric_difference(case) assert tm.equalContents(result, answer) - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): msg = "other must be a MultiIndex or a list of tuples" with pytest.raises(TypeError, match=msg): first.symmetric_difference([1, 2, 3]) - def test_insert_base(self, indices): - result = indices[1:4] + def test_insert_base(self, index): + result = index[1:4] - if not len(indices): + if not len(index): return # test 0th element - assert indices[0:4].equals(result.insert(0, indices[0])) + assert index[0:4].equals(result.insert(0, index[0])) - def test_delete_base(self, indices): - if not len(indices): + def test_delete_base(self, index): + if not len(index): return - if isinstance(indices, RangeIndex): + if isinstance(index, RangeIndex): # tested in class return - expected = indices[1:] - result = indices.delete(0) + expected = index[1:] + result = index.delete(0) assert result.equals(expected) assert result.name == expected.name - expected = indices[:-1] - result = indices.delete(-1) + expected = index[:-1] + result = index.delete(-1) assert result.equals(expected) assert result.name == expected.name - length = len(indices) + length = len(index) msg = f"index {length} is out of bounds for axis 0 with size {length}" with pytest.raises(IndexError, match=msg): - indices.delete(length) + index.delete(length) - def test_equals(self, indices): - if isinstance(indices, IntervalIndex): + def test_equals(self, index): + if isinstance(index, IntervalIndex): # IntervalIndex tested separately return - assert indices.equals(indices) - assert indices.equals(indices.copy()) - assert indices.equals(indices.astype(object)) + assert index.equals(index) + assert index.equals(index.copy()) + assert index.equals(index.astype(object)) - assert not indices.equals(list(indices)) - assert not 
indices.equals(np.array(indices)) + assert not index.equals(list(index)) + assert not index.equals(np.array(index)) # Cannot pass in non-int64 dtype to RangeIndex - if not isinstance(indices, RangeIndex): - same_values = Index(indices, dtype=object) - assert indices.equals(same_values) - assert same_values.equals(indices) + if not isinstance(index, RangeIndex): + same_values = Index(index, dtype=object) + assert index.equals(same_values) + assert same_values.equals(index) - if indices.nlevels == 1: + if index.nlevels == 1: # do not test MultiIndex - assert not indices.equals(Series(indices)) + assert not index.equals(Series(index)) def test_equals_op(self): # GH9947, GH10637 @@ -634,50 +632,50 @@ def test_equals_op(self): tm.assert_numpy_array_equal(index_a == item, expected3) tm.assert_series_equal(series_a == item, Series(expected3)) - def test_hasnans_isnans(self, indices): + def test_hasnans_isnans(self, index): # GH 11343, added tests for hasnans / isnans - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): return # cases in indices doesn't include NaN - idx = indices.copy(deep=True) + idx = index.copy(deep=True) expected = np.array([False] * len(idx), dtype=bool) tm.assert_numpy_array_equal(idx._isnan, expected) assert idx.hasnans is False - idx = indices.copy(deep=True) + idx = index.copy(deep=True) values = np.asarray(idx.values) - if len(indices) == 0: + if len(index) == 0: return - elif isinstance(indices, DatetimeIndexOpsMixin): + elif isinstance(index, DatetimeIndexOpsMixin): values[1] = iNaT - elif isinstance(indices, (Int64Index, UInt64Index)): + elif isinstance(index, (Int64Index, UInt64Index)): return else: values[1] = np.nan - if isinstance(indices, PeriodIndex): - idx = type(indices)(values, freq=indices.freq) + if isinstance(index, PeriodIndex): + idx = type(index)(values, freq=index.freq) else: - idx = type(indices)(values) + idx = type(index)(values) expected = np.array([False] * len(idx), dtype=bool) expected[1] = True 
tm.assert_numpy_array_equal(idx._isnan, expected) assert idx.hasnans is True - def test_fillna(self, indices): + def test_fillna(self, index): # GH 11343 - if len(indices) == 0: + if len(index) == 0: pass - elif isinstance(indices, MultiIndex): - idx = indices.copy(deep=True) + elif isinstance(index, MultiIndex): + idx = index.copy(deep=True) msg = "isna is not defined for MultiIndex" with pytest.raises(NotImplementedError, match=msg): idx.fillna(idx[0]) else: - idx = indices.copy(deep=True) + idx = index.copy(deep=True) result = idx.fillna(idx[0]) tm.assert_index_equal(result, idx) assert result is not idx @@ -686,47 +684,43 @@ def test_fillna(self, indices): with pytest.raises(TypeError, match=msg): idx.fillna([idx[0]]) - idx = indices.copy(deep=True) + idx = index.copy(deep=True) values = np.asarray(idx.values) - if isinstance(indices, DatetimeIndexOpsMixin): + if isinstance(index, DatetimeIndexOpsMixin): values[1] = iNaT - elif isinstance(indices, (Int64Index, UInt64Index)): + elif isinstance(index, (Int64Index, UInt64Index)): return else: values[1] = np.nan - if isinstance(indices, PeriodIndex): - idx = type(indices)(values, freq=indices.freq) + if isinstance(index, PeriodIndex): + idx = type(index)(values, freq=index.freq) else: - idx = type(indices)(values) + idx = type(index)(values) expected = np.array([False] * len(idx), dtype=bool) expected[1] = True tm.assert_numpy_array_equal(idx._isnan, expected) assert idx.hasnans is True - def test_nulls(self, indices): + def test_nulls(self, index): # this is really a smoke test for the methods # as these are adequately tested for function elsewhere - if len(indices) == 0: - tm.assert_numpy_array_equal(indices.isna(), np.array([], dtype=bool)) - elif isinstance(indices, MultiIndex): - idx = indices.copy() + if len(index) == 0: + tm.assert_numpy_array_equal(index.isna(), np.array([], dtype=bool)) + elif isinstance(index, MultiIndex): + idx = index.copy() msg = "isna is not defined for MultiIndex" with 
pytest.raises(NotImplementedError, match=msg): idx.isna() - elif not indices.hasnans: - tm.assert_numpy_array_equal( - indices.isna(), np.zeros(len(indices), dtype=bool) - ) - tm.assert_numpy_array_equal( - indices.notna(), np.ones(len(indices), dtype=bool) - ) + elif not index.hasnans: + tm.assert_numpy_array_equal(index.isna(), np.zeros(len(index), dtype=bool)) + tm.assert_numpy_array_equal(index.notna(), np.ones(len(index), dtype=bool)) else: - result = isna(indices) - tm.assert_numpy_array_equal(indices.isna(), result) - tm.assert_numpy_array_equal(indices.notna(), ~result) + result = isna(index) + tm.assert_numpy_array_equal(index.isna(), result) + tm.assert_numpy_array_equal(index.notna(), ~result) def test_empty(self): # GH 15270 diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index e4785e5f80256..7345ae3032463 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -14,7 +14,7 @@ class TestDatetimeIndex(DatetimeLike): params=[tm.makeDateIndex(10), date_range("20130110", periods=10, freq="-1D")], ids=["index_inc", "index_dec"], ) - def indices(self, request): + def index(self, request): return request.param def create_index(self) -> DatetimeIndex: diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index d8c2ba8413cfb..891640234d26e 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -15,7 +15,7 @@ class TestBase(Base): _holder = IntervalIndex @pytest.fixture - def indices(self): + def index(self): return tm.makeIntervalIndex(10) def create_index(self, closed="right"): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 8d767663fc208..15a88ab3819ce 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -32,7 
+32,7 @@ class TestPeriodIndex(DatetimeLike): ], ids=["index_inc", "index_dec"], ) - def indices(self, request): + def index(self, request): return request.param def create_index(self) -> PeriodIndex: diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 2438cd352f86f..5b6f9cb358b7d 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -27,7 +27,7 @@ class TestRangeIndex(Numeric): ], ids=["index_inc", "index_dec"], ) - def indices(self, request): + def index(self, request): return request.param def create_index(self) -> RangeIndex: @@ -324,9 +324,9 @@ def test_explicit_conversions(self): result = a - fidx tm.assert_index_equal(result, expected) - def test_has_duplicates(self, indices): - assert indices.is_unique - assert not indices.has_duplicates + def test_has_duplicates(self, index): + assert index.is_unique + assert not index.has_duplicates def test_extended_gcd(self): index = self.create_index() diff --git a/pandas/tests/indexes/test_any_index.py b/pandas/tests/indexes/test_any_index.py index 8cbea846bc870..5e7065f785309 100644 --- a/pandas/tests/indexes/test_any_index.py +++ b/pandas/tests/indexes/test_any_index.py @@ -8,85 +8,84 @@ import pandas._testing as tm -def test_boolean_context_compat(indices): +def test_boolean_context_compat(index): with pytest.raises(ValueError, match="The truth value of a"): - if indices: + if index: pass -def test_sort(indices): +def test_sort(index): msg = "cannot sort an Index object in-place, use sort_values instead" with pytest.raises(TypeError, match=msg): - indices.sort() + index.sort() -def test_hash_error(indices): - index = indices +def test_hash_error(index): with pytest.raises(TypeError, match=f"unhashable type: '{type(index).__name__}'"): - hash(indices) + hash(index) -def test_mutability(indices): - if not len(indices): +def test_mutability(index): + if not len(index): return msg = "Index does not support mutable 
operations" with pytest.raises(TypeError, match=msg): - indices[0] = indices[0] + index[0] = index[0] -def test_wrong_number_names(indices): - names = indices.nlevels * ["apple", "banana", "carrot"] +def test_wrong_number_names(index): + names = index.nlevels * ["apple", "banana", "carrot"] with pytest.raises(ValueError, match="^Length"): - indices.names = names + index.names = names class TestConversion: - def test_to_series(self, indices): + def test_to_series(self, index): # assert that we are creating a copy of the index - ser = indices.to_series() - assert ser.values is not indices.values - assert ser.index is not indices - assert ser.name == indices.name + ser = index.to_series() + assert ser.values is not index.values + assert ser.index is not index + assert ser.name == index.name - def test_to_series_with_arguments(self, indices): + def test_to_series_with_arguments(self, index): # GH#18699 # index kwarg - ser = indices.to_series(index=indices) + ser = index.to_series(index=index) - assert ser.values is not indices.values - assert ser.index is indices - assert ser.name == indices.name + assert ser.values is not index.values + assert ser.index is index + assert ser.name == index.name # name kwarg - ser = indices.to_series(name="__test") + ser = index.to_series(name="__test") - assert ser.values is not indices.values - assert ser.index is not indices - assert ser.name != indices.name + assert ser.values is not index.values + assert ser.index is not index + assert ser.name != index.name - def test_tolist_matches_list(self, indices): - assert indices.tolist() == list(indices) + def test_tolist_matches_list(self, index): + assert index.tolist() == list(index) class TestRoundTrips: - def test_pickle_roundtrip(self, indices): - result = tm.round_trip_pickle(indices) - tm.assert_index_equal(result, indices) + def test_pickle_roundtrip(self, index): + result = tm.round_trip_pickle(index) + tm.assert_index_equal(result, index) if result.nlevels > 1: # GH#8367 
round-trip with timezone - assert indices.equal_levels(result) + assert index.equal_levels(result) class TestIndexing: - def test_slice_keeps_name(self, indices): - assert indices.name == indices[1:].name + def test_slice_keeps_name(self, index): + assert index.name == index[1:].name class TestRendering: - def test_str(self, indices): + def test_str(self, index): # test the string repr - indices.name = "foo" - assert "'foo'" in str(indices) - assert type(indices).__name__ in str(indices) + index.name = "foo" + assert "'foo'" in str(index) + assert type(index).__name__ in str(index) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f31b49ab82f3b..099c7ced5e2ce 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -54,35 +54,35 @@ def test_can_hold_identifiers(self): key = index[0] assert index._can_hold_identifiers_and_holds_name(key) is True - @pytest.mark.parametrize("indices", ["datetime"], indirect=True) - def test_new_axis(self, indices): + @pytest.mark.parametrize("index", ["datetime"], indirect=True) + def test_new_axis(self, index): with tm.assert_produces_warning(DeprecationWarning): # GH#30588 multi-dimensional indexing deprecated - new_index = indices[None, :] + new_index = index[None, :] assert new_index.ndim == 2 assert isinstance(new_index, np.ndarray) - @pytest.mark.parametrize("indices", ["int", "uint", "float"], indirect=True) - def test_copy_and_deepcopy(self, indices): - new_copy2 = indices.copy(dtype=int) + @pytest.mark.parametrize("index", ["int", "uint", "float"], indirect=True) + def test_copy_and_deepcopy(self, index): + new_copy2 = index.copy(dtype=int) assert new_copy2.dtype.kind == "i" - def test_constructor_regular(self, indices): - tm.assert_contains_all(indices, indices) + def test_constructor_regular(self, index): + tm.assert_contains_all(index, index) - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def test_constructor_casting(self, indices): 
+ @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_constructor_casting(self, index): # casting - arr = np.array(indices) + arr = np.array(index) new_index = Index(arr) tm.assert_contains_all(arr, new_index) - tm.assert_index_equal(indices, new_index) + tm.assert_index_equal(index, new_index) - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def test_constructor_copy(self, indices): + @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_constructor_copy(self, index): # copy # index = self.create_index() - arr = np.array(indices) + arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) assert new_index.name == "name" @@ -436,7 +436,7 @@ def test_constructor_overflow_int64(self): Index([np.iinfo(np.uint64).max - 1], dtype="int64") @pytest.mark.parametrize( - "indices", + "index", [ "datetime", "float", @@ -450,11 +450,11 @@ def test_constructor_overflow_int64(self): ], indirect=True, ) - def test_view_with_args(self, indices): - indices.view("i8") + def test_view_with_args(self, index): + index.view("i8") @pytest.mark.parametrize( - "indices", + "index", [ "unicode", "string", @@ -464,21 +464,21 @@ def test_view_with_args(self, indices): ], indirect=True, ) - def test_view_with_args_object_array_raises(self, indices): + def test_view_with_args_object_array_raises(self, index): msg = "Cannot change data-type for object array" with pytest.raises(TypeError, match=msg): - indices.view("i8") + index.view("i8") - @pytest.mark.parametrize("indices", ["int", "range"], indirect=True) - def test_astype(self, indices): - casted = indices.astype("i8") + @pytest.mark.parametrize("index", ["int", "range"], indirect=True) + def test_astype(self, index): + casted = index.astype("i8") # it works! 
casted.get_loc(5) # pass on name - indices.name = "foobar" - casted = indices.astype("i8") + index.name = "foobar" + casted = index.astype("i8") assert casted.name == "foobar" def test_equals_object(self): @@ -546,17 +546,17 @@ def test_is_(self): ind2 = Index(arr, copy=False) assert not ind1.is_(ind2) - @pytest.mark.parametrize("indices", ["datetime"], indirect=True) - def test_asof(self, indices): - d = indices[0] - assert indices.asof(d) == d - assert isna(indices.asof(d - timedelta(1))) + @pytest.mark.parametrize("index", ["datetime"], indirect=True) + def test_asof(self, index): + d = index[0] + assert index.asof(d) == d + assert isna(index.asof(d - timedelta(1))) - d = indices[-1] - assert indices.asof(d + timedelta(1)) == d + d = index[-1] + assert index.asof(d + timedelta(1)) == d - d = indices[0].to_pydatetime() - assert isinstance(indices.asof(d), Timestamp) + d = index[0].to_pydatetime() + assert isinstance(index.asof(d), Timestamp) def test_asof_datetime_partial(self): index = pd.date_range("2010-01-01", periods=2, freq="m") @@ -578,17 +578,17 @@ def test_nanosecond_index_access(self): expected_ts = np_datetime64_compat("2013-01-01 00:00:00.000000050+0000", "ns") assert first_value == x[Timestamp(expected_ts)] - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def test_booleanindex(self, indices): - bool_index = np.ones(len(indices), dtype=bool) + @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_booleanindex(self, index): + bool_index = np.ones(len(index), dtype=bool) bool_index[5:30:2] = False - sub_index = indices[bool_index] + sub_index = index[bool_index] for i, val in enumerate(sub_index): assert sub_index.get_loc(val) == i - sub_index = indices[list(bool_index)] + sub_index = index[list(bool_index)] for i, val in enumerate(sub_index): assert sub_index.get_loc(val) == i @@ -598,32 +598,32 @@ def test_fancy(self): for i in sl: assert i == sl[sl.get_loc(i)] - @pytest.mark.parametrize("indices", ["string", "int", 
"float"], indirect=True) + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) @pytest.mark.parametrize("dtype", [np.int_, np.bool_]) - def test_empty_fancy(self, indices, dtype): + def test_empty_fancy(self, index, dtype): empty_arr = np.array([], dtype=dtype) - empty_index = type(indices)([]) + empty_index = type(index)([]) - assert indices[[]].identical(empty_index) - assert indices[empty_arr].identical(empty_index) + assert index[[]].identical(empty_index) + assert index[empty_arr].identical(empty_index) - @pytest.mark.parametrize("indices", ["string", "int", "float"], indirect=True) - def test_empty_fancy_raises(self, indices): + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + def test_empty_fancy_raises(self, index): # pd.DatetimeIndex is excluded, because it overrides getitem and should # be tested separately. empty_farr = np.array([], dtype=np.float_) - empty_index = type(indices)([]) + empty_index = type(index)([]) - assert indices[[]].identical(empty_index) + assert index[[]].identical(empty_index) # np.ndarray only accepts ndarray of int & bool dtypes, so should Index msg = r"arrays used as indices must be of integer \(or boolean\) type" with pytest.raises(IndexError, match=msg): - indices[empty_farr] + index[empty_farr] - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def test_intersection(self, indices, sort): - first = indices[:20] - second = indices[:10] + @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_intersection(self, index, sort): + first = index[:20] + second = index[:10] intersect = first.intersection(second, sort=sort) if sort is None: tm.assert_index_equal(intersect, second.sort_values()) @@ -652,16 +652,16 @@ def test_intersection_name_preservation(self, index2, keeps_name, sort): assert result.name == expected.name tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("indices", ["string"], indirect=True) + 
@pytest.mark.parametrize("index", ["string"], indirect=True) @pytest.mark.parametrize( "first_name,second_name,expected_name", [("A", "A", "A"), ("A", "B", None), (None, "B", None)], ) def test_intersection_name_preservation2( - self, indices, first_name, second_name, expected_name, sort + self, index, first_name, second_name, expected_name, sort ): - first = indices[5:20] - second = indices[:10] + first = index[5:20] + second = index[:10] first.name = first_name second.name = second_name intersect = first.intersection(second, sort=sort) @@ -731,11 +731,11 @@ def test_chained_union(self, sort): expected = j1.union(j2, sort=sort).union(j3, sort=sort) tm.assert_index_equal(union, expected) - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def test_union(self, indices, sort): - first = indices[5:20] - second = indices[:10] - everything = indices[:20] + @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_union(self, index, sort): + first = index[5:20] + second = index[:10] + everything = index[:20] union = first.union(second, sort=sort) if sort is None: @@ -769,12 +769,12 @@ def test_union_sort_special_true(self, slice_): tm.assert_index_equal(result, expected) @pytest.mark.parametrize("klass", [np.array, Series, list]) - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def test_union_from_iterables(self, indices, klass, sort): + @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_union_from_iterables(self, index, klass, sort): # GH 10149 - first = indices[5:20] - second = indices[:10] - everything = indices[:20] + first = index[5:20] + second = index[:10] + everything = index[:20] case = klass(second.values) result = first.union(case, sort=sort) @@ -782,9 +782,9 @@ def test_union_from_iterables(self, indices, klass, sort): tm.assert_index_equal(result, everything.sort_values()) assert tm.equalContents(result, everything) - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def 
test_union_identity(self, indices, sort): - first = indices[5:20] + @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_union_identity(self, index, sort): + first = index[5:20] union = first.union(first, sort=sort) # i.e. identity is not preserved when sort is True @@ -838,9 +838,9 @@ def test_union_dt_as_obj(self, sort): tm.assert_contains_all(index, second_cat) tm.assert_contains_all(date_index, first_cat) - def test_map_identity_mapping(self, indices): + def test_map_identity_mapping(self, index): # GH 12766 - tm.assert_index_equal(indices, indices.map(lambda x: x)) + tm.assert_index_equal(index, index.map(lambda x: x)) def test_map_with_tuples(self): # GH 12766 @@ -901,22 +901,22 @@ def test_map_dictlike_simple(self, mapper): lambda values, index: pd.Series(values, index), ], ) - def test_map_dictlike(self, indices, mapper): + def test_map_dictlike(self, index, mapper): # GH 12756 - if isinstance(indices, CategoricalIndex): + if isinstance(index, CategoricalIndex): # Tested in test_categorical return - elif not indices.is_unique: + elif not index.is_unique: # Cannot map duplicated index return - if indices.empty: + if index.empty: # to match proper result coercion for uints expected = Index([]) else: - expected = Index(np.arange(len(indices), 0, -1)) + expected = Index(np.arange(len(index), 0, -1)) - result = indices.map(mapper(expected, indices)) + result = index.map(mapper(expected, index)) tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -953,12 +953,12 @@ def test_append_empty_preserve_name(self, name, expected): result = left.append(right) assert result.name == expected - @pytest.mark.parametrize("indices", ["string"], indirect=True) + @pytest.mark.parametrize("index", ["string"], indirect=True) @pytest.mark.parametrize("second_name,expected", [(None, None), ("name", "name")]) - def test_difference_name_preservation(self, indices, second_name, expected, sort): - first = indices[5:20] - second = indices[:10] - answer 
= indices[10:20] + def test_difference_name_preservation(self, index, second_name, expected, sort): + first = index[5:20] + second = index[:10] + answer = index[10:20] first.name = "name" second.name = second_name @@ -971,31 +971,31 @@ def test_difference_name_preservation(self, indices, second_name, expected, sort else: assert result.name == expected - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def test_difference_empty_arg(self, indices, sort): - first = indices[5:20] + @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_difference_empty_arg(self, index, sort): + first = index[5:20] first.name = "name" result = first.difference([], sort) assert tm.equalContents(result, first) assert result.name == first.name - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def test_difference_identity(self, indices, sort): - first = indices[5:20] + @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_difference_identity(self, index, sort): + first = index[5:20] first.name = "name" result = first.difference(first, sort) assert len(result) == 0 assert result.name == first.name - @pytest.mark.parametrize("indices", ["string"], indirect=True) - def test_difference_sort(self, indices, sort): - first = indices[5:20] - second = indices[:10] + @pytest.mark.parametrize("index", ["string"], indirect=True) + def test_difference_sort(self, index, sort): + first = index[5:20] + second = index[:10] result = first.difference(second, sort) - expected = indices[10:20] + expected = index[10:20] if sort is None: expected = expected.sort_values() @@ -1088,25 +1088,25 @@ def test_symmetric_difference_non_index(self, sort): assert tm.equalContents(result, expected) assert result.name == "new_name" - def test_difference_type(self, indices, sort): + def test_difference_type(self, index, sort): # GH 20040 # If taking difference of a set and itself, it # needs to preserve the type of the index - if not indices.is_unique: + if 
not index.is_unique: return - result = indices.difference(indices, sort=sort) - expected = indices.drop(indices) + result = index.difference(index, sort=sort) + expected = index.drop(index) tm.assert_index_equal(result, expected) - def test_intersection_difference(self, indices, sort): + def test_intersection_difference(self, index, sort): # GH 20040 # Test that the intersection of an index with an # empty index produces the same index as the difference # of an index with itself. Test for all types - if not indices.is_unique: + if not index.is_unique: return - inter = indices.intersection(indices.drop(indices)) - diff = indices.difference(indices, sort=sort) + inter = index.intersection(index.drop(index)) + diff = index.difference(index, sort=sort) tm.assert_index_equal(inter, diff) def test_is_mixed_deprecated(self): @@ -1116,7 +1116,7 @@ def test_is_mixed_deprecated(self): index.is_mixed() @pytest.mark.parametrize( - "indices, expected", + "index, expected", [ ("string", False), ("bool", False), @@ -1125,13 +1125,13 @@ def test_is_mixed_deprecated(self): ("datetime", False), ("float", True), ], - indirect=["indices"], + indirect=["index"], ) - def test_is_numeric(self, indices, expected): - assert indices.is_numeric() is expected + def test_is_numeric(self, index, expected): + assert index.is_numeric() is expected @pytest.mark.parametrize( - "indices, expected", + "index, expected", [ ("string", True), ("bool", True), @@ -1140,13 +1140,13 @@ def test_is_numeric(self, indices, expected): ("datetime", False), ("float", False), ], - indirect=["indices"], + indirect=["index"], ) - def test_is_object(self, indices, expected): - assert indices.is_object() is expected + def test_is_object(self, index, expected): + assert index.is_object() is expected @pytest.mark.parametrize( - "indices, expected", + "index, expected", [ ("string", False), ("bool", False), @@ -1155,13 +1155,13 @@ def test_is_object(self, indices, expected): ("datetime", True), ("float", False), ], - 
indirect=["indices"], + indirect=["index"], ) - def test_is_all_dates(self, indices, expected): - assert indices.is_all_dates is expected + def test_is_all_dates(self, index, expected): + assert index.is_all_dates is expected - def test_summary(self, indices): - self._check_method_works(Index._summary, indices) + def test_summary(self, index): + self._check_method_works(Index._summary, index) def test_summary_bug(self): # GH3869` @@ -1171,8 +1171,8 @@ def test_summary_bug(self): assert "~:{range}:0" in result assert "{other}%s" in result - def test_format(self, indices): - self._check_method_works(Index.format, indices) + def test_format(self, index): + self._check_method_works(Index.format, index) def test_format_bug(self): # GH 14626 @@ -1538,37 +1538,37 @@ def test_slice_locs_negative_step(self, in_slice, expected): expected = pd.Index(list(expected)) tm.assert_index_equal(result, expected) - @pytest.mark.parametrize("indices", ["string", "int", "float"], indirect=True) - def test_drop_by_str_label(self, indices): - n = len(indices) - drop = indices[list(range(5, 10))] - dropped = indices.drop(drop) + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + def test_drop_by_str_label(self, index): + n = len(index) + drop = index[list(range(5, 10))] + dropped = index.drop(drop) - expected = indices[list(range(5)) + list(range(10, n))] + expected = index[list(range(5)) + list(range(10, n))] tm.assert_index_equal(dropped, expected) - dropped = indices.drop(indices[0]) - expected = indices[1:] + dropped = index.drop(index[0]) + expected = index[1:] tm.assert_index_equal(dropped, expected) - @pytest.mark.parametrize("indices", ["string", "int", "float"], indirect=True) + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) @pytest.mark.parametrize("keys", [["foo", "bar"], ["1", "bar"]]) - def test_drop_by_str_label_raises_missing_keys(self, indices, keys): + def test_drop_by_str_label_raises_missing_keys(self, index, 
keys): with pytest.raises(KeyError, match=""): - indices.drop(keys) + index.drop(keys) - @pytest.mark.parametrize("indices", ["string", "int", "float"], indirect=True) - def test_drop_by_str_label_errors_ignore(self, indices): - n = len(indices) - drop = indices[list(range(5, 10))] + @pytest.mark.parametrize("index", ["string", "int", "float"], indirect=True) + def test_drop_by_str_label_errors_ignore(self, index): + n = len(index) + drop = index[list(range(5, 10))] mixed = drop.tolist() + ["foo"] - dropped = indices.drop(mixed, errors="ignore") + dropped = index.drop(mixed, errors="ignore") - expected = indices[list(range(5)) + list(range(10, n))] + expected = index[list(range(5)) + list(range(10, n))] tm.assert_index_equal(dropped, expected) - dropped = indices.drop(["foo", "bar"], errors="ignore") - expected = indices[list(range(n))] + dropped = index.drop(["foo", "bar"], errors="ignore") + expected = index[list(range(n))] tm.assert_index_equal(dropped, expected) def test_drop_by_numeric_label_loc(self): @@ -1688,20 +1688,20 @@ def test_set_value_deprecated(self): assert arr[1] == 80 @pytest.mark.parametrize( - "indices", ["string", "int", "datetime", "timedelta"], indirect=True + "index", ["string", "int", "datetime", "timedelta"], indirect=True ) - def test_get_value(self, indices): + def test_get_value(self, index): # TODO: Remove function? 
GH 19728 values = np.random.randn(100) - value = indices[67] + value = index[67] with pytest.raises(AttributeError, match="has no attribute '_values'"): # Index.get_value requires a Series, not an ndarray with tm.assert_produces_warning(FutureWarning): - indices.get_value(values, value) + index.get_value(values, value) with tm.assert_produces_warning(FutureWarning): - result = indices.get_value(Series(values, index=values), value) + result = index.get_value(Series(values, index=values), value) tm.assert_almost_equal(result, values[67]) @pytest.mark.parametrize("values", [["foo", "bar", "quux"], {"foo", "bar", "quux"}]) @@ -1779,15 +1779,13 @@ def test_isin_level_kwarg(self, level, index): index.name = "foobar" tm.assert_numpy_array_equal(expected, index.isin(values, level="foobar")) - def test_isin_level_kwarg_bad_level_raises(self, indices): - index = indices + def test_isin_level_kwarg_bad_level_raises(self, index): for level in [10, index.nlevels, -(index.nlevels + 1)]: with pytest.raises(IndexError, match="Too many levels"): index.isin([], level=level) @pytest.mark.parametrize("label", [1.0, "foobar", "xyzzy", np.nan]) - def test_isin_level_kwarg_bad_label_raises(self, label, indices): - index = indices + def test_isin_level_kwarg_bad_label_raises(self, label, index): if isinstance(index, MultiIndex): index = index.rename(["foo", "bar"] + index.names[2:]) msg = f"'Level {label} not found'" @@ -1823,10 +1821,10 @@ def test_boolean_cmp(self, values): tm.assert_numpy_array_equal(result, expected) - @pytest.mark.parametrize("indices", ["string"], indirect=True) + @pytest.mark.parametrize("index", ["string"], indirect=True) @pytest.mark.parametrize("name,level", [(None, 0), ("a", "a")]) - def test_get_level_values(self, indices, name, level): - expected = indices.copy() + def test_get_level_values(self, index, name, level): + expected = index.copy() if name: expected.name = name @@ -1838,13 +1836,13 @@ def test_slice_keep_name(self): assert index.name == 
index[1:].name @pytest.mark.parametrize( - "indices", + "index", ["unicode", "string", "datetime", "int", "uint", "float"], indirect=True, ) - def test_join_self(self, indices, join_type): - joined = indices.join(indices, how=join_type) - assert indices is joined + def test_join_self(self, index, join_type): + joined = index.join(index, how=join_type) + assert index is joined @pytest.mark.parametrize("method", ["strip", "rstrip", "lstrip"]) def test_str_attribute(self, method): @@ -2215,14 +2213,14 @@ async def test_tab_complete_warning(self, ip): with provisionalcompleter("ignore"): list(ip.Completer.completions("idx.", 4)) - def test_contains_method_removed(self, indices): + def test_contains_method_removed(self, index): # GH#30103 method removed for all types except IntervalIndex - if isinstance(indices, pd.IntervalIndex): - indices.contains(1) + if isinstance(index, pd.IntervalIndex): + index.contains(1) else: - msg = f"'{type(indices).__name__}' object has no attribute 'contains'" + msg = f"'{type(index).__name__}' object has no attribute 'contains'" with pytest.raises(AttributeError, match=msg): - indices.contains(1) + index.contains(1) class TestMixedIntIndex(Base): @@ -2232,7 +2230,7 @@ class TestMixedIntIndex(Base): _holder = Index @pytest.fixture(params=[[0, "a", 1, "b", 2, "c"]], ids=["mixedIndex"]) - def indices(self, request): + def index(self, request): return Index(request.param) def create_index(self) -> Index: @@ -2494,13 +2492,13 @@ def test_ensure_index_mixed_closed_intervals(self): "divmod", ], ) -def test_generated_op_names(opname, indices): - if isinstance(indices, ABCIndex) and opname == "rsub": +def test_generated_op_names(opname, index): + if isinstance(index, ABCIndex) and opname == "rsub": # pd.Index.__rsub__ does not exist; though the method does exist # for subclasses. 
see GH#19723 return opname = f"__{opname}__" - method = getattr(indices, opname) + method = getattr(index, opname) assert method.__name__ == opname @@ -2566,20 +2564,19 @@ def test_validate_1d_input(): ser.index = np.array([[2, 3]] * 4) -def test_convert_almost_null_slice(indices): +def test_convert_almost_null_slice(index): # slice with None at both ends, but not step - idx = indices key = slice(None, None, "foo") - if isinstance(idx, pd.IntervalIndex): + if isinstance(index, pd.IntervalIndex): msg = "label-based slicing with step!=1 is not supported for IntervalIndex" with pytest.raises(ValueError, match=msg): - idx._convert_slice_indexer(key, "loc") + index._convert_slice_indexer(key, "loc") else: msg = "'>=' not supported between instances of 'str' and 'int'" with pytest.raises(TypeError, match=msg): - idx._convert_slice_indexer(key, "loc") + index._convert_slice_indexer(key, "loc") dtlike_dtypes = [ diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index a08001e042f36..02a173eb4958d 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -18,32 +18,32 @@ class TestCommon: - def test_droplevel(self, indices): + def test_droplevel(self, index): # GH 21115 - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): # Tested separately in test_multi.py return - assert indices.droplevel([]).equals(indices) + assert index.droplevel([]).equals(index) - for level in indices.name, [indices.name]: - if isinstance(indices.name, tuple) and level is indices.name: + for level in index.name, [index.name]: + if isinstance(index.name, tuple) and level is index.name: # GH 21121 : droplevel with tuple name continue with pytest.raises(ValueError): - indices.droplevel(level) + index.droplevel(level) for level in "wrong", ["wrong"]: with pytest.raises( KeyError, match=r"'Requested level \(wrong\) does not match index name \(None\)'", ): - indices.droplevel(level) + index.droplevel(level) - def 
test_constructor_non_hashable_name(self, indices): + def test_constructor_non_hashable_name(self, index): # GH 20527 - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): pytest.skip("multiindex handled in test_multi.py") message = "Index.name must be a hashable type" @@ -51,25 +51,25 @@ def test_constructor_non_hashable_name(self, indices): # With .rename() with pytest.raises(TypeError, match=message): - indices.rename(name=renamed) + index.rename(name=renamed) # With .set_names() with pytest.raises(TypeError, match=message): - indices.set_names(names=renamed) + index.set_names(names=renamed) - def test_constructor_unwraps_index(self, indices): - if isinstance(indices, pd.MultiIndex): + def test_constructor_unwraps_index(self, index): + if isinstance(index, pd.MultiIndex): raise pytest.skip("MultiIndex has no ._data") - a = indices + a = index b = type(a)(a) tm.assert_equal(a._data, b._data) @pytest.mark.parametrize("itm", [101, "no_int"]) # FutureWarning from non-tuple sequence of nd indexing @pytest.mark.filterwarnings("ignore::FutureWarning") - def test_getitem_error(self, indices, itm): + def test_getitem_error(self, index, itm): with pytest.raises(IndexError): - indices[itm] + index[itm] @pytest.mark.parametrize( "fname, sname, expected_name", @@ -81,123 +81,123 @@ def test_getitem_error(self, indices, itm): (None, None, None), ], ) - def test_corner_union(self, indices, fname, sname, expected_name): + def test_corner_union(self, index, fname, sname, expected_name): # GH 9943 9862 # Test unions with various name combinations # Do not test MultiIndex or repeats - if isinstance(indices, MultiIndex) or not indices.is_unique: + if isinstance(index, MultiIndex) or not index.is_unique: pytest.skip("Not for MultiIndex or repeated indices") # Test copy.union(copy) - first = indices.copy().set_names(fname) - second = indices.copy().set_names(sname) + first = index.copy().set_names(fname) + second = index.copy().set_names(sname) union = 
first.union(second) - expected = indices.copy().set_names(expected_name) + expected = index.copy().set_names(expected_name) tm.assert_index_equal(union, expected) # Test copy.union(empty) - first = indices.copy().set_names(fname) - second = indices.drop(indices).set_names(sname) + first = index.copy().set_names(fname) + second = index.drop(index).set_names(sname) union = first.union(second) - expected = indices.copy().set_names(expected_name) + expected = index.copy().set_names(expected_name) tm.assert_index_equal(union, expected) # Test empty.union(copy) - first = indices.drop(indices).set_names(fname) - second = indices.copy().set_names(sname) + first = index.drop(index).set_names(fname) + second = index.copy().set_names(sname) union = first.union(second) - expected = indices.copy().set_names(expected_name) + expected = index.copy().set_names(expected_name) tm.assert_index_equal(union, expected) # Test empty.union(empty) - first = indices.drop(indices).set_names(fname) - second = indices.drop(indices).set_names(sname) + first = index.drop(index).set_names(fname) + second = index.drop(index).set_names(sname) union = first.union(second) - expected = indices.drop(indices).set_names(expected_name) + expected = index.drop(index).set_names(expected_name) tm.assert_index_equal(union, expected) - def test_to_flat_index(self, indices): + def test_to_flat_index(self, index): # 22866 - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): pytest.skip("Separate expectation for MultiIndex") - result = indices.to_flat_index() - tm.assert_index_equal(result, indices) + result = index.to_flat_index() + tm.assert_index_equal(result, index) - def test_set_name_methods(self, indices): + def test_set_name_methods(self, index): new_name = "This is the new name for this index" # don't tests a MultiIndex here (as its tested separated) - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): pytest.skip("Skip check for MultiIndex") - original_name = 
indices.name - new_ind = indices.set_names([new_name]) + original_name = index.name + new_ind = index.set_names([new_name]) assert new_ind.name == new_name - assert indices.name == original_name - res = indices.rename(new_name, inplace=True) + assert index.name == original_name + res = index.rename(new_name, inplace=True) # should return None assert res is None - assert indices.name == new_name - assert indices.names == [new_name] + assert index.name == new_name + assert index.names == [new_name] # FIXME: dont leave commented-out # with pytest.raises(TypeError, match="list-like"): # # should still fail even if it would be the right length # ind.set_names("a") with pytest.raises(ValueError, match="Level must be None"): - indices.set_names("a", level=0) + index.set_names("a", level=0) # rename in place just leaves tuples and other containers alone name = ("A", "B") - indices.rename(name, inplace=True) - assert indices.name == name - assert indices.names == [name] + index.rename(name, inplace=True) + assert index.name == name + assert index.names == [name] - def test_copy_and_deepcopy(self, indices): + def test_copy_and_deepcopy(self, index): from copy import copy, deepcopy - if isinstance(indices, MultiIndex): + if isinstance(index, MultiIndex): pytest.skip("Skip check for MultiIndex") for func in (copy, deepcopy): - idx_copy = func(indices) - assert idx_copy is not indices - assert idx_copy.equals(indices) + idx_copy = func(index) + assert idx_copy is not index + assert idx_copy.equals(index) - new_copy = indices.copy(deep=True, name="banana") + new_copy = index.copy(deep=True, name="banana") assert new_copy.name == "banana" - def test_unique(self, indices): + def test_unique(self, index): # don't test a MultiIndex here (as its tested separated) # don't test a CategoricalIndex because categories change (GH 18291) - if isinstance(indices, (MultiIndex, CategoricalIndex)): + if isinstance(index, (MultiIndex, CategoricalIndex)): pytest.skip("Skip check for 
MultiIndex/CategoricalIndex") # GH 17896 - expected = indices.drop_duplicates() - for level in 0, indices.name, None: - result = indices.unique(level=level) + expected = index.drop_duplicates() + for level in 0, index.name, None: + result = index.unique(level=level) tm.assert_index_equal(result, expected) msg = "Too many levels: Index has only 1 level, not 4" with pytest.raises(IndexError, match=msg): - indices.unique(level=3) + index.unique(level=3) msg = ( fr"Requested level \(wrong\) does not match index name " - fr"\({re.escape(indices.name.__repr__())}\)" + fr"\({re.escape(index.name.__repr__())}\)" ) with pytest.raises(KeyError, match=msg): - indices.unique(level="wrong") + index.unique(level="wrong") - def test_get_unique_index(self, indices): + def test_get_unique_index(self, index): # MultiIndex tested separately - if not len(indices) or isinstance(indices, MultiIndex): + if not len(index) or isinstance(index, MultiIndex): pytest.skip("Skip check for empty Index and MultiIndex") - idx = indices[[0] * 5] - idx_unique = indices[[0]] + idx = index[[0] * 5] + idx_unique = index[[0]] # We test against `idx_unique`, so first we make sure it's unique # and doesn't contain nans. 
@@ -212,109 +212,109 @@ def test_get_unique_index(self, indices): tm.assert_index_equal(result, idx_unique) # nans: - if not indices._can_hold_na: + if not index._can_hold_na: pytest.skip("Skip na-check if index cannot hold na") - if is_period_dtype(indices.dtype): - vals = indices[[0] * 5]._data + if is_period_dtype(index.dtype): + vals = index[[0] * 5]._data vals[0] = pd.NaT - elif needs_i8_conversion(indices.dtype): - vals = indices.asi8[[0] * 5] + elif needs_i8_conversion(index.dtype): + vals = index.asi8[[0] * 5] vals[0] = iNaT else: - vals = indices.values[[0] * 5] + vals = index.values[[0] * 5] vals[0] = np.nan vals_unique = vals[:2] - idx_nan = indices._shallow_copy(vals) - idx_unique_nan = indices._shallow_copy(vals_unique) + idx_nan = index._shallow_copy(vals) + idx_unique_nan = index._shallow_copy(vals_unique) assert idx_unique_nan.is_unique is True - assert idx_nan.dtype == indices.dtype - assert idx_unique_nan.dtype == indices.dtype + assert idx_nan.dtype == index.dtype + assert idx_unique_nan.dtype == index.dtype for dropna, expected in zip([False, True], [idx_unique_nan, idx_unique]): for i in [idx_nan, idx_unique_nan]: result = i._get_unique_index(dropna=dropna) tm.assert_index_equal(result, expected) - def test_mutability(self, indices): - if not len(indices): + def test_mutability(self, index): + if not len(index): pytest.skip("Skip check for empty Index") msg = "Index does not support mutable operations" with pytest.raises(TypeError, match=msg): - indices[0] = indices[0] + index[0] = index[0] - def test_view(self, indices): - assert indices.view().name == indices.name + def test_view(self, index): + assert index.view().name == index.name - def test_searchsorted_monotonic(self, indices): + def test_searchsorted_monotonic(self, index): # GH17271 # not implemented for tuple searches in MultiIndex # or Intervals searches in IntervalIndex - if isinstance(indices, (MultiIndex, pd.IntervalIndex)): + if isinstance(index, (MultiIndex, pd.IntervalIndex)): 
pytest.skip("Skip check for MultiIndex/IntervalIndex") # nothing to test if the index is empty - if indices.empty: + if index.empty: pytest.skip("Skip check for empty Index") - value = indices[0] + value = index[0] # determine the expected results (handle dupes for 'right') - expected_left, expected_right = 0, (indices == value).argmin() + expected_left, expected_right = 0, (index == value).argmin() if expected_right == 0: # all values are the same, expected_right should be length - expected_right = len(indices) + expected_right = len(index) # test _searchsorted_monotonic in all cases # test searchsorted only for increasing - if indices.is_monotonic_increasing: - ssm_left = indices._searchsorted_monotonic(value, side="left") + if index.is_monotonic_increasing: + ssm_left = index._searchsorted_monotonic(value, side="left") assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side="right") + ssm_right = index._searchsorted_monotonic(value, side="right") assert expected_right == ssm_right - ss_left = indices.searchsorted(value, side="left") + ss_left = index.searchsorted(value, side="left") assert expected_left == ss_left - ss_right = indices.searchsorted(value, side="right") + ss_right = index.searchsorted(value, side="right") assert expected_right == ss_right - elif indices.is_monotonic_decreasing: - ssm_left = indices._searchsorted_monotonic(value, side="left") + elif index.is_monotonic_decreasing: + ssm_left = index._searchsorted_monotonic(value, side="left") assert expected_left == ssm_left - ssm_right = indices._searchsorted_monotonic(value, side="right") + ssm_right = index._searchsorted_monotonic(value, side="right") assert expected_right == ssm_right else: # non-monotonic should raise. 
with pytest.raises(ValueError): - indices._searchsorted_monotonic(value, side="left") + index._searchsorted_monotonic(value, side="left") - def test_pickle(self, indices): - original_name, indices.name = indices.name, "foo" - unpickled = tm.round_trip_pickle(indices) - assert indices.equals(unpickled) - indices.name = original_name + def test_pickle(self, index): + original_name, index.name = index.name, "foo" + unpickled = tm.round_trip_pickle(index) + assert index.equals(unpickled) + index.name = original_name - def test_drop_duplicates(self, indices, keep): - if isinstance(indices, MultiIndex): + def test_drop_duplicates(self, index, keep): + if isinstance(index, MultiIndex): pytest.skip("MultiIndex is tested separately") - if isinstance(indices, RangeIndex): + if isinstance(index, RangeIndex): pytest.skip( "RangeIndex is tested in test_drop_duplicates_no_duplicates " "as it cannot hold duplicates" ) - if len(indices) == 0: + if len(index) == 0: pytest.skip( "empty index is tested in test_drop_duplicates_no_duplicates " "as it cannot hold duplicates" ) # make unique index - holder = type(indices) - unique_values = list(set(indices)) + holder = type(index) + unique_values = list(set(index)) unique_idx = holder(unique_values) # make duplicated index @@ -332,17 +332,17 @@ def test_drop_duplicates(self, indices, keep): expected_dropped = holder(pd.Series(idx).drop_duplicates(keep=keep)) tm.assert_index_equal(idx.drop_duplicates(keep=keep), expected_dropped) - def test_drop_duplicates_no_duplicates(self, indices): - if isinstance(indices, MultiIndex): + def test_drop_duplicates_no_duplicates(self, index): + if isinstance(index, MultiIndex): pytest.skip("MultiIndex is tested separately") # make unique index - if isinstance(indices, RangeIndex): + if isinstance(index, RangeIndex): # RangeIndex cannot have duplicates - unique_idx = indices + unique_idx = index else: - holder = type(indices) - unique_values = list(set(indices)) + holder = type(index) + unique_values = 
list(set(index)) unique_idx = holder(unique_values) # check on unique index @@ -353,20 +353,20 @@ def test_drop_duplicates_no_duplicates(self, indices): # validate shallow copy assert result_dropped is not unique_idx - def test_drop_duplicates_inplace(self, indices): + def test_drop_duplicates_inplace(self, index): msg = r"drop_duplicates\(\) got an unexpected keyword argument" with pytest.raises(TypeError, match=msg): - indices.drop_duplicates(inplace=True) + index.drop_duplicates(inplace=True) - def test_has_duplicates(self, indices): - holder = type(indices) - if not len(indices) or isinstance(indices, (MultiIndex, RangeIndex)): + def test_has_duplicates(self, index): + holder = type(index) + if not len(index) or isinstance(index, (MultiIndex, RangeIndex)): # MultiIndex tested separately in: # tests/indexes/multi/test_unique_and_duplicates. # RangeIndex is unique by definition. pytest.skip("Skip check for empty Index, MultiIndex, and RangeIndex") - idx = holder([indices[0]] * 5) + idx = holder([index[0]] * 5) assert idx.is_unique is False assert idx.has_duplicates is True @@ -375,23 +375,23 @@ def test_has_duplicates(self, indices): ["int64", "uint64", "float64", "category", "datetime64[ns]", "timedelta64[ns]"], ) @pytest.mark.parametrize("copy", [True, False]) - def test_astype_preserves_name(self, indices, dtype, copy): + def test_astype_preserves_name(self, index, dtype, copy): # https://github.com/pandas-dev/pandas/issues/32013 - if isinstance(indices, MultiIndex): - indices.names = ["idx" + str(i) for i in range(indices.nlevels)] + if isinstance(index, MultiIndex): + index.names = ["idx" + str(i) for i in range(index.nlevels)] else: - indices.name = "idx" + index.name = "idx" try: # Some of these conversions cannot succeed so we use a try / except if copy: - result = indices.copy(dtype=dtype) + result = index.copy(dtype=dtype) else: - result = indices.astype(dtype) + result = index.astype(dtype) except (ValueError, TypeError, NotImplementedError, 
SystemError): return - if isinstance(indices, MultiIndex): - assert result.names == indices.names + if isinstance(index, MultiIndex): + assert result.names == index.names else: - assert result.name == indices.name + assert result.name == index.name diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 081090731a9b4..33de0800658f2 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -97,7 +97,7 @@ class TestFloat64Index(Numeric): ], ids=["mixed", "float", "mixed_dec", "float_dec"], ) - def indices(self, request): + def index(self, request): return Float64Index(request.param) @pytest.fixture @@ -111,8 +111,8 @@ def float_index(self): def create_index(self) -> Float64Index: return Float64Index(np.arange(5, dtype="float64")) - def test_repr_roundtrip(self, indices): - tm.assert_index_equal(eval(repr(indices)), indices) + def test_repr_roundtrip(self, index): + tm.assert_index_equal(eval(repr(index)), index) def check_is_index(self, i): assert isinstance(i, Index) @@ -428,7 +428,7 @@ class TestInt64Index(NumericInt): @pytest.fixture( params=[range(0, 20, 2), range(19, -1, -1)], ids=["index_inc", "index_dec"] ) - def indices(self, request): + def index(self, request): return Int64Index(request.param) def create_index(self) -> Int64Index: @@ -537,7 +537,7 @@ class TestUInt64Index(NumericInt): ], ids=["index_inc", "index_dec"], ) - def indices(self, request): + def index(self, request): return UInt64Index(request.param) @pytest.fixture diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 3340945ca1690..043539c173427 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -44,78 +44,76 @@ ], ids=lambda x: x.__name__, ) -def test_numpy_ufuncs_basic(indices, func): +def test_numpy_ufuncs_basic(index, func): # test ufuncs of numpy, see: # https://numpy.org/doc/stable/reference/ufuncs.html - idx 
= indices - if isinstance(idx, DatetimeIndexOpsMixin): + if isinstance(index, DatetimeIndexOpsMixin): # raise TypeError or ValueError (PeriodIndex) with pytest.raises(Exception): with np.errstate(all="ignore"): - func(idx) - elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)): + func(index) + elif isinstance(index, (Float64Index, Int64Index, UInt64Index)): # coerces to float (e.g. np.sin) with np.errstate(all="ignore"): - result = func(idx) - exp = Index(func(idx.values), name=idx.name) + result = func(index) + exp = Index(func(index.values), name=index.name) tm.assert_index_equal(result, exp) assert isinstance(result, Float64Index) else: # raise AttributeError or TypeError - if len(idx) == 0: + if len(index) == 0: pass else: with pytest.raises(Exception): with np.errstate(all="ignore"): - func(idx) + func(index) @pytest.mark.parametrize( "func", [np.isfinite, np.isinf, np.isnan, np.signbit], ids=lambda x: x.__name__ ) -def test_numpy_ufuncs_other(indices, func): +def test_numpy_ufuncs_other(index, func): # test ufuncs of numpy, see: # https://numpy.org/doc/stable/reference/ufuncs.html - idx = indices - if isinstance(idx, (DatetimeIndex, TimedeltaIndex)): - if isinstance(idx, DatetimeIndex) and idx.tz is not None: + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): + if isinstance(index, DatetimeIndex) and index.tz is not None: if func in [np.isfinite, np.isnan, np.isinf]: pytest.xfail(reason="__array_ufunc__ is not defined") if not _np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]: # numpy 1.18(dev) changed isinf and isnan to not raise on dt64/tfd64 - result = func(idx) + result = func(index) assert isinstance(result, np.ndarray) elif not _np_version_under1p17 and func in [np.isfinite]: # ok under numpy >= 1.17 # Results in bool array - result = func(idx) + result = func(index) assert isinstance(result, np.ndarray) else: # raise TypeError or ValueError (PeriodIndex) with pytest.raises(Exception): - func(idx) + func(index) - elif 
isinstance(idx, PeriodIndex): + elif isinstance(index, PeriodIndex): # raise TypeError or ValueError (PeriodIndex) with pytest.raises(Exception): - func(idx) + func(index) - elif isinstance(idx, (Float64Index, Int64Index, UInt64Index)): + elif isinstance(index, (Float64Index, Int64Index, UInt64Index)): # Results in bool array - result = func(idx) + result = func(index) assert isinstance(result, np.ndarray) assert not isinstance(result, Index) else: - if len(idx) == 0: + if len(index) == 0: pass else: with pytest.raises(Exception): - func(idx) + func(index) def test_elementwise_comparison_warning(): diff --git a/pandas/tests/indexes/test_setops.py b/pandas/tests/indexes/test_setops.py index 818d5474eddf5..1a40fe550be61 100644 --- a/pandas/tests/indexes/test_setops.py +++ b/pandas/tests/indexes/test_setops.py @@ -20,18 +20,18 @@ } -def test_union_same_types(indices): +def test_union_same_types(index): # Union with a non-unique, non-monotonic index raises error # Only needed for bool index factory - idx1 = indices.sort_values() - idx2 = indices.sort_values() + idx1 = index.sort_values() + idx2 = index.sort_values() assert idx1.union(idx2).dtype == idx1.dtype -def test_union_different_types(indices, index_fixture2): +def test_union_different_types(index, index_fixture2): # This test only considers combinations of indices # GH 23525 - idx1, idx2 = indices, index_fixture2 + idx1, idx2 = index, index_fixture2 type_pair = tuple(sorted([type(idx1), type(idx2)], key=lambda x: str(x))) if type_pair in COMPATIBLE_INCONSISTENT_PAIRS: pytest.xfail("This test only considers non compatible indexes.") diff --git a/pandas/tests/indexes/timedeltas/test_timedelta.py b/pandas/tests/indexes/timedeltas/test_timedelta.py index a0521658ffc1e..4a1749ff734c1 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta.py @@ -25,7 +25,7 @@ class TestTimedeltaIndex(DatetimeLike): _holder = TimedeltaIndex @pytest.fixture - def 
indices(self): + def index(self): return tm.makeTimedeltaIndex(10) def create_index(self) -> TimedeltaIndex: diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index b77c47f927517..ced70069dd955 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -67,9 +67,9 @@ def test_setitem_ndarray_1d(self): (lambda x: x.iloc, "iloc"), ], ) - def test_getitem_ndarray_3d(self, indices, obj, idxr, idxr_id): + def test_getitem_ndarray_3d(self, index, obj, idxr, idxr_id): # GH 25567 - obj = obj(indices) + obj = obj(index) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) @@ -105,17 +105,13 @@ def test_getitem_ndarray_3d(self, indices, obj, idxr, idxr_id): (lambda x: x.iloc, "iloc"), ], ) - def test_setitem_ndarray_3d(self, indices, obj, idxr, idxr_id): + def test_setitem_ndarray_3d(self, index, obj, idxr, idxr_id): # GH 25567 - obj = obj(indices) + obj = obj(index) idxr = idxr(obj) nd3 = np.random.randint(5, size=(2, 2, 2)) - if ( - (len(indices) == 0) - and (idxr_id == "iloc") - and isinstance(obj, pd.DataFrame) - ): + if (len(index) == 0) and (idxr_id == "iloc") and isinstance(obj, pd.DataFrame): # gh-32896 pytest.skip("This is currently failing. 
There's an xfailed test below.") @@ -123,7 +119,7 @@ def test_setitem_ndarray_3d(self, indices, obj, idxr, idxr_id): err = ValueError msg = f"Cannot set values with ndim > {obj.ndim}" elif ( - isinstance(indices, pd.IntervalIndex) + isinstance(index, pd.IntervalIndex) and idxr_id == "setitem" and obj.ndim == 1 ): diff --git a/pandas/tests/series/methods/test_to_period.py b/pandas/tests/series/methods/test_to_period.py index 5bc4a36498c58..b40fc81931e20 100644 --- a/pandas/tests/series/methods/test_to_period.py +++ b/pandas/tests/series/methods/test_to_period.py @@ -47,9 +47,8 @@ def test_to_period(self): expected.columns = exp_idx tm.assert_frame_equal(df.to_period(axis=1), expected) - def test_to_period_raises(self, indices): + def test_to_period_raises(self, index): # https://github.com/pandas-dev/pandas/issues/33327 - index = indices ser = Series(index=index, dtype=object) if not isinstance(index, DatetimeIndex): msg = f"unsupported Type {type(index).__name__}" diff --git a/pandas/tests/series/methods/test_to_timestamp.py b/pandas/tests/series/methods/test_to_timestamp.py index 296a1c15619f2..13a2042a2f639 100644 --- a/pandas/tests/series/methods/test_to_timestamp.py +++ b/pandas/tests/series/methods/test_to_timestamp.py @@ -55,9 +55,8 @@ def _get_with_delta(delta, freq="A-DEC"): tm.assert_index_equal(result.index, exp_index) assert result.name == "foo" - def test_to_timestamp_raises(self, indices): + def test_to_timestamp_raises(self, index): # https://github.com/pandas-dev/pandas/issues/33327 - index = indices ser = Series(index=index, dtype=object) if not isinstance(index, PeriodIndex): msg = f"unsupported Type {type(index).__name__}" diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index d51dceae53a1c..308398642895c 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -528,11 +528,11 @@ def test_map(self, datetime_series): exp = Series([np.nan, "B", "C", "D"]) 
tm.assert_series_equal(a.map(c), exp) - def test_map_empty(self, indices): - if isinstance(indices, MultiIndex): + def test_map_empty(self, index): + if isinstance(index, MultiIndex): pytest.skip("Initializing a Series from a MultiIndex is not supported") - s = Series(indices) + s = Series(index) result = s.map({}) expected = pd.Series(np.nan, index=s.index) From bb587dfea561c65104ee9983fb4737a5d4644116 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 26 Jun 2020 16:53:23 -0700 Subject: [PATCH 0226/1025] BUG: item_cache not cleared on DataFrame.values (#34999) --- pandas/core/frame.py | 1 + pandas/core/generic.py | 3 ++- pandas/core/internals/managers.py | 2 +- pandas/tests/frame/test_block_internals.py | 5 +++++ 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 39f93af1670bf..4fc96993f5b2e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1341,6 +1341,7 @@ def to_numpy( array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ + self._consolidate_inplace() result = self._mgr.as_array( transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 4e0247bfcddca..a66cade3b81b0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5330,6 +5330,7 @@ def values(self) -> np.ndarray: ['lion', 80.5, 1], ['monkey', nan, None]], dtype=object) """ + self._consolidate_inplace() return self._mgr.as_array(transpose=self._AXIS_REVERSED) @property @@ -6530,7 +6531,7 @@ def replace( f"Replacement lists must match in length. 
" f"Expecting {len(to_replace)} got {len(value)} " ) - + self._consolidate_inplace() new_data = self._mgr.replace_list( src_list=to_replace, dest_list=value, diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 6055a6205d286..843e7ce40fef8 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -812,7 +812,7 @@ def as_array( .values.to_numpy(dtype=dtype, na_value=na_value) .reshape(self.blocks[0].shape) ) - elif self._is_single_block or not self.is_mixed_type: + elif self._is_single_block: arr = np.asarray(self.blocks[0].get_values()) if dtype: arr = arr.astype(dtype, copy=False) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index e2910a2eb6100..d5554860c034d 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -86,9 +86,14 @@ def test_modify_values(self, float_frame): # unconsolidated float_frame["E"] = 7.0 + col = float_frame["E"] float_frame.values[6] = 6 assert (float_frame.values[6] == 6).all() + # check that item_cache was cleared + assert float_frame["E"] is not col + assert (col == 7).all() + def test_boolean_set_uncons(self, float_frame): float_frame["E"] = 7.0 From 40d7dbef58b38fffb864442016ea65e0974a6ccb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 26 Jun 2020 16:59:11 -0700 Subject: [PATCH 0227/1025] CLN: dont consolidate in indexing (#34679) --- pandas/core/indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 3cf20b68c84f4..708b687434327 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1683,7 +1683,6 @@ def isetter(loc, v): ser = v else: # set the item, possibly having a dtype change - ser._consolidate_inplace() ser = ser.copy() ser._mgr = ser._mgr.setitem(indexer=pi, value=v) ser._maybe_update_cacher(clear=True) From ab83fabba850587f19dc96968cd244e628571695 Mon Sep 17 00:00:00 
2001 From: Simon Hawkins Date: Mon, 29 Jun 2020 01:36:31 +0100 Subject: [PATCH 0228/1025] CI: troubleshoot (#35044) * CI: troubleshoot * test_ops_ndarray + lint fixup --- pandas/tests/arithmetic/test_datetime64.py | 1 + pandas/tests/arithmetic/test_numeric.py | 6 ++++-- pandas/tests/arithmetic/test_period.py | 6 +++++- pandas/tests/scalar/timedelta/test_arithmetic.py | 5 ++++- 4 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index b3f4d5f5d9ee5..5dfaea7c77420 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1046,6 +1046,7 @@ def test_dt64arr_add_sub_invalid(self, dti_freq, other, box_with_array): "cannot (add|subtract)", "cannot use operands with types", "ufunc '?(add|subtract)'? cannot use operands with types", + "Concatenation operation is not implemented for NumPy arrays", ] ) assert_invalid_addsub_type(dtarr, other, msg) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index a37339c66bf6e..2155846b271fc 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -234,7 +234,8 @@ def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box): "unsupported operand type|" "Addition/subtraction of integers and integer-arrays|" "Instead of adding/subtracting|" - "cannot use operands with types dtype" + "cannot use operands with types dtype|" + "Concatenation operation is not implemented for NumPy arrays" ) with pytest.raises(TypeError, match=msg): left + other @@ -263,7 +264,8 @@ def test_add_sub_datetimelike_invalid(self, numeric_idx, other, box): msg = ( "unsupported operand type|" "Cannot (add|subtract) NaT (to|from) ndarray|" - "Addition/subtraction of integers and integer-arrays" + "Addition/subtraction of integers and integer-arrays|" + "Concatenation operation is not implemented for NumPy arrays" ) with 
pytest.raises(TypeError, match=msg): left + other diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 6c7b989bb9f2e..930435074efc1 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -682,7 +682,11 @@ def test_parr_add_sub_float_raises(self, op, other, box_with_array): dti = pd.DatetimeIndex(["2011-01-01", "2011-01-02"], freq="D") pi = dti.to_period("D") pi = tm.box_expected(pi, box_with_array) - msg = r"unsupported operand type\(s\) for [+-]: .* and .*" + msg = ( + r"unsupported operand type\(s\) for [+-]: .* and .*|" + "Concatenation operation is not implemented for NumPy arrays" + ) + with pytest.raises(TypeError, match=msg): op(pi, other) diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 2114962cfc0bd..cb33f99d9bd91 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -263,7 +263,10 @@ def test_ops_ndarray(self): msg = r"unsupported operand type\(s\) for \+: 'Timedelta' and 'int'" with pytest.raises(TypeError, match=msg): td + np.array([1]) - msg = r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and 'Timedelta'" + msg = ( + r"unsupported operand type\(s\) for \+: 'numpy.ndarray' and 'Timedelta'|" + "Concatenation operation is not implemented for NumPy arrays" + ) with pytest.raises(TypeError, match=msg): np.array([1]) + td From 2b3e8db3cf65680d60379204e12a85dc141bc70b Mon Sep 17 00:00:00 2001 From: biddwan09 Date: Mon, 29 Jun 2020 13:26:51 +0600 Subject: [PATCH 0229/1025] Fix issue #29837: added test case for aggregation with isnan (#35039) --- .../tests/groupby/aggregate/test_aggregate.py | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 96db519578106..7bc2ce10bba6d 100644 --- 
a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -994,3 +994,30 @@ def test_groupby_get_by_index(): res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A") pd.testing.assert_frame_equal(res, expected) + + +def test_aggregate_categorical_with_isnan(): + # GH 29837 + df = pd.DataFrame( + { + "A": [1, 1, 1, 1], + "B": [1, 2, 1, 2], + "numerical_col": [0.1, 0.2, np.nan, 0.3], + "object_col": ["foo", "bar", "foo", "fee"], + "categorical_col": ["foo", "bar", "foo", "fee"], + } + ) + + df = df.astype({"categorical_col": "category"}) + + result = df.groupby(["A", "B"]).agg(lambda df: df.isna().sum()) + index = pd.MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B")) + expected = pd.DataFrame( + data={ + "numerical_col": [1.0, 0.0], + "object_col": [0, 0], + "categorical_col": [0, 0], + }, + index=index, + ) + tm.assert_frame_equal(result, expected) From 946ab9a99764f5f169113a3f7dfacfeb6d8d3c6d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 29 Jun 2020 10:52:44 -0700 Subject: [PATCH 0230/1025] PERF: avoid duplicate is_single_block check (#35034) --- pandas/core/internals/managers.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 843e7ce40fef8..b2f2277d9a7dc 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -805,17 +805,17 @@ def as_array( # mutating the original object copy = copy or na_value is not lib.no_default - if self._is_single_block and self.blocks[0].is_extension: - # Avoid implicit conversion of extension blocks to object - arr = ( - self.blocks[0] - .values.to_numpy(dtype=dtype, na_value=na_value) - .reshape(self.blocks[0].shape) - ) - elif self._is_single_block: - arr = np.asarray(self.blocks[0].get_values()) - if dtype: - arr = arr.astype(dtype, 
copy=False) + if self._is_single_block: + blk = self.blocks[0] + if blk.is_extension: + # Avoid implicit conversion of extension blocks to object + arr = blk.values.to_numpy(dtype=dtype, na_value=na_value).reshape( + blk.shape + ) + else: + arr = np.asarray(blk.get_values()) + if dtype: + arr = arr.astype(dtype, copy=False) else: arr = self._interleave(dtype=dtype, na_value=na_value) # The underlying data was copied within _interleave From 345c43304d1a85b3dd8e330a46a959bcfd9b32bd Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 29 Jun 2020 22:17:55 +0100 Subject: [PATCH 0231/1025] CLN: move categorical tests from test_aggregate to test_categorical (#35052) --- .../tests/groupby/aggregate/test_aggregate.py | 53 ------------------- pandas/tests/groupby/test_categorical.py | 53 +++++++++++++++++++ 2 files changed, 53 insertions(+), 53 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 7bc2ce10bba6d..dbd713a0af4cf 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -458,22 +458,6 @@ def test_agg_split_object_part_datetime(): tm.assert_frame_equal(result, expected) -def test_agg_cython_category_not_implemented_fallback(): - # https://github.com/pandas-dev/pandas/issues/31450 - df = pd.DataFrame({"col_num": [1, 1, 2, 3]}) - df["col_cat"] = df["col_num"].astype("category") - - result = df.groupby("col_num").col_cat.first() - expected = pd.Series( - [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat" - ) - tm.assert_series_equal(result, expected) - - result = df.groupby("col_num").agg({"col_cat": "first"}) - expected = expected.to_frame() - tm.assert_frame_equal(result, expected) - - class TestNamedAggregationSeries: def test_series_named_agg(self): df = pd.Series([1, 2, 3, 4]) @@ -809,16 +793,6 @@ def test_aggregate_mixed_types(): tm.assert_frame_equal(result, expected) 
-@pytest.mark.parametrize("func", ["min", "max"]) -def test_aggregate_categorical_lost_index(func: str): - # GH: 28641 groupby drops index, when grouping over categorical column with min/max - ds = pd.Series(["b"], dtype="category").cat.as_ordered() - df = pd.DataFrame({"A": [1997], "B": ds}) - result = df.groupby("A").agg({"B": func}) - expected = pd.DataFrame({"B": ["b"]}, index=pd.Index([1997], name="A")) - tm.assert_frame_equal(result, expected) - - @pytest.mark.xfail(reason="Not implemented;see GH 31256") def test_aggregate_udf_na_extension_type(): # https://github.com/pandas-dev/pandas/pull/31359 @@ -994,30 +968,3 @@ def test_groupby_get_by_index(): res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A") pd.testing.assert_frame_equal(res, expected) - - -def test_aggregate_categorical_with_isnan(): - # GH 29837 - df = pd.DataFrame( - { - "A": [1, 1, 1, 1], - "B": [1, 2, 1, 2], - "numerical_col": [0.1, 0.2, np.nan, 0.3], - "object_col": ["foo", "bar", "foo", "fee"], - "categorical_col": ["foo", "bar", "foo", "fee"], - } - ) - - df = df.astype({"categorical_col": "category"}) - - result = df.groupby(["A", "B"]).agg(lambda df: df.isna().sum()) - index = pd.MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B")) - expected = pd.DataFrame( - data={ - "numerical_col": [1.0, 0.0], - "object_col": [0, 0], - "categorical_col": [0, 0], - }, - index=index, - ) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index ff35ec04952b1..60c82bf1fb71c 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1456,3 +1456,56 @@ def test_sorted_missing_category_values(): result = df.groupby(["bar", "foo"]).size().unstack() tm.assert_frame_equal(result, expected) + + +def test_agg_cython_category_not_implemented_fallback(): + # 
https://github.com/pandas-dev/pandas/issues/31450 + df = pd.DataFrame({"col_num": [1, 1, 2, 3]}) + df["col_cat"] = df["col_num"].astype("category") + + result = df.groupby("col_num").col_cat.first() + expected = pd.Series( + [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat" + ) + tm.assert_series_equal(result, expected) + + result = df.groupby("col_num").agg({"col_cat": "first"}) + expected = expected.to_frame() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["min", "max"]) +def test_aggregate_categorical_lost_index(func: str): + # GH: 28641 groupby drops index, when grouping over categorical column with min/max + ds = pd.Series(["b"], dtype="category").cat.as_ordered() + df = pd.DataFrame({"A": [1997], "B": ds}) + result = df.groupby("A").agg({"B": func}) + expected = pd.DataFrame({"B": ["b"]}, index=pd.Index([1997], name="A")) + tm.assert_frame_equal(result, expected) + + +def test_aggregate_categorical_with_isnan(): + # GH 29837 + df = pd.DataFrame( + { + "A": [1, 1, 1, 1], + "B": [1, 2, 1, 2], + "numerical_col": [0.1, 0.2, np.nan, 0.3], + "object_col": ["foo", "bar", "foo", "fee"], + "categorical_col": ["foo", "bar", "foo", "fee"], + } + ) + + df = df.astype({"categorical_col": "category"}) + + result = df.groupby(["A", "B"]).agg(lambda df: df.isna().sum()) + index = pd.MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B")) + expected = pd.DataFrame( + data={ + "numerical_col": [1.0, 0.0], + "object_col": [0, 0], + "categorical_col": [0, 0], + }, + index=index, + ) + tm.assert_frame_equal(result, expected) From bc75d8a5f9c1f0cce3d59e12971ba097aebe4279 Mon Sep 17 00:00:00 2001 From: Eric Groszman Date: Mon, 29 Jun 2020 14:38:20 -0700 Subject: [PATCH 0232/1025] DOC: correction to "size" in the plotting.bootstrap_plot docstring (#35026) --- pandas/plotting/_misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 
22a2d7617fded..9410dbfe8e90a 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -300,7 +300,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): creating a new one with default parameters. size : int, default 50 Number of data points to consider during each sampling. It must be - greater or equal than the length of the `series`. + less than or equal to the length of the `series`. samples : int, default 500 Number of times the bootstrap procedure is performed. **kwds From 1c33e461749d166461322e4ed4e44e12ea4281ed Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 29 Jun 2020 16:42:18 -0500 Subject: [PATCH 0233/1025] BUG: reading line-format JSON from file url #27135 (#34811) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/json/_json.py | 5 ++++- pandas/tests/io/json/data/line_delimited.json | 3 +++ pandas/tests/io/json/test_readlines.py | 16 ++++++++++++++++ 4 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/io/json/data/line_delimited.json diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c5eb2febe8ae9..70c45acec9f35 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1040,6 +1040,7 @@ I/O - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) - Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) - Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`) +- :meth:`read_json` now could read line-delimited json file from a file url while `lines` and `chunksize` are set. 
Plotting ^^^^^^^^ diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index b973553a767ba..ff37c36962aec 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -1,6 +1,6 @@ from collections import abc import functools -from io import StringIO +from io import BytesIO, StringIO from itertools import islice import os from typing import Any, Callable, Optional, Type @@ -724,6 +724,9 @@ def _get_data_from_filepath(self, filepath_or_buffer): self.should_close = True self.open_stream = data + if isinstance(data, BytesIO): + data = data.getvalue().decode() + return data def _combine_lines(self, lines) -> str: diff --git a/pandas/tests/io/json/data/line_delimited.json b/pandas/tests/io/json/data/line_delimited.json new file mode 100644 index 0000000000000..be84245329583 --- /dev/null +++ b/pandas/tests/io/json/data/line_delimited.json @@ -0,0 +1,3 @@ + {"a": 1, "b": 2} + {"a": 3, "b": 4} + {"a": 5, "b": 6} diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py index 53462eaaada8d..b475fa2c514ff 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -1,4 +1,5 @@ from io import StringIO +from pathlib import Path import pytest @@ -219,3 +220,18 @@ def test_readjson_nrows_requires_lines(): msg = "nrows can only be passed if lines=True" with pytest.raises(ValueError, match=msg): pd.read_json(jsonl, lines=False, nrows=2) + + +def test_readjson_lines_chunks_fileurl(datapath): + # GH 27135 + # Test reading line-format JSON from file url + df_list_expected = [ + pd.DataFrame([[1, 2]], columns=["a", "b"], index=[0]), + pd.DataFrame([[3, 4]], columns=["a", "b"], index=[1]), + pd.DataFrame([[5, 6]], columns=["a", "b"], index=[2]), + ] + os_path = datapath("io", "json", "data", "line_delimited.json") + file_url = Path(os_path).as_uri() + url_reader = pd.read_json(file_url, lines=True, chunksize=1) + for index, chuck in enumerate(url_reader): + tm.assert_frame_equal(chuck, 
df_list_expected[index]) From 16263397ba1c5c98f479959dcf4208e741960da4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 29 Jun 2020 16:13:45 -0700 Subject: [PATCH 0234/1025] CLN: assorted tslibs cleanups, annotations (#35045) --- pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/fields.pyx | 8 +++---- pandas/_libs/tslibs/nattype.pyx | 20 +++++++++--------- pandas/_libs/tslibs/period.pyx | 3 ++- pandas/_libs/tslibs/tzconversion.pyx | 31 ++++++++++++++-------------- 5 files changed, 33 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 0811ba22977fd..884715f482cad 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -77,7 +77,7 @@ cdef inline int64_t cast_from_unit(object ts, str unit) except? -1: return (base * m) + (frac * m) -cpdef inline object precision_from_unit(str unit): +cpdef inline (int64_t, int) precision_from_unit(str unit): """ Return a casting of the unit represented to nanoseconds + the precision to round the fractional part. diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 8d83eeb011866..0e5a6d3c4db46 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -91,7 +91,7 @@ def build_field_sarray(const int64_t[:] dtindex): @cython.wraparound(False) @cython.boundscheck(False) -def get_date_name_field(const int64_t[:] dtindex, object field, object locale=None): +def get_date_name_field(const int64_t[:] dtindex, str field, object locale=None): """ Given a int64-based datetime index, return array of strings of date name based on requested field (e.g. 
day_name) @@ -141,7 +141,7 @@ def get_date_name_field(const int64_t[:] dtindex, object field, object locale=No @cython.wraparound(False) @cython.boundscheck(False) -def get_start_end_field(const int64_t[:] dtindex, object field, +def get_start_end_field(const int64_t[:] dtindex, str field, object freqstr=None, int month_kw=12): """ Given an int64-based datetime index return array of indicators @@ -386,7 +386,7 @@ def get_start_end_field(const int64_t[:] dtindex, object field, @cython.wraparound(False) @cython.boundscheck(False) -def get_date_field(const int64_t[:] dtindex, object field): +def get_date_field(const int64_t[:] dtindex, str field): """ Given a int64-based datetime index, extract the year, month, etc., field and return an array of these values. @@ -548,7 +548,7 @@ def get_date_field(const int64_t[:] dtindex, object field): @cython.wraparound(False) @cython.boundscheck(False) -def get_timedelta_field(const int64_t[:] tdindex, object field): +def get_timedelta_field(const int64_t[:] tdindex, str field): """ Given a int64-based timedelta index, extract the days, hrs, sec., field and return an array of these values. 
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 71f151e6eb876..264013f928d22 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -50,7 +50,7 @@ _nat_scalar_rules[Py_GE] = False # ---------------------------------------------------------------------- -def _make_nan_func(func_name, doc): +def _make_nan_func(func_name: str, doc: str): def f(*args, **kwargs): return np.nan f.__name__ = func_name @@ -58,7 +58,7 @@ def _make_nan_func(func_name, doc): return f -def _make_nat_func(func_name, doc): +def _make_nat_func(func_name: str, doc: str): def f(*args, **kwargs): return c_NaT f.__name__ = func_name @@ -66,7 +66,7 @@ def _make_nat_func(func_name, doc): return f -def _make_error_func(func_name, cls): +def _make_error_func(func_name: str, cls): def f(*args, **kwargs): raise ValueError(f"NaTType does not support {func_name}") @@ -282,31 +282,31 @@ cdef class _NaT(datetime): return NPY_NAT @property - def is_leap_year(self): + def is_leap_year(self) -> bool: return False @property - def is_month_start(self): + def is_month_start(self) -> bool: return False @property - def is_quarter_start(self): + def is_quarter_start(self) -> bool: return False @property - def is_year_start(self): + def is_year_start(self) -> bool: return False @property - def is_month_end(self): + def is_month_end(self) -> bool: return False @property - def is_quarter_end(self): + def is_quarter_end(self) -> bool: return False @property - def is_year_end(self): + def is_year_end(self) -> bool: return False diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index a2250234dbd14..c0641297c4b8a 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -14,6 +14,7 @@ import cython from cpython.datetime cimport ( datetime, + tzinfo, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, @@ -1417,7 +1418,7 @@ def extract_freq(ndarray[object] values): @cython.wraparound(False) 
@cython.boundscheck(False) -def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, object tz): +def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Py_ssize_t n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 02fe203637d62..6e6b106b8f21a 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -5,7 +5,7 @@ import cython from cython import Py_ssize_t from cpython.datetime cimport ( - PyDateTime_IMPORT, PyDelta_Check, datetime, tzinfo) + PyDateTime_IMPORT, PyDelta_Check, datetime, timedelta, tzinfo) PyDateTime_IMPORT import pytz @@ -421,23 +421,22 @@ cdef int64_t[:] _tz_convert_one_way(int64_t[:] vals, tzinfo tz, bint to_utc): converted : ndarray[int64_t] """ cdef: - int64_t[:] converted, result + int64_t[:] converted Py_ssize_t i, n = len(vals) int64_t val - if not is_utc(tz): + if is_utc(tz): + converted = vals + elif is_tzlocal(tz): converted = np.empty(n, dtype=np.int64) - if is_tzlocal(tz): - for i in range(n): - val = vals[i] - if val == NPY_NAT: - converted[i] = NPY_NAT - else: - converted[i] = _tz_convert_tzlocal_utc(val, tz, to_utc) - else: - converted = _tz_convert_dst(vals, tz, to_utc) + for i in range(n): + val = vals[i] + if val == NPY_NAT: + converted[i] = NPY_NAT + else: + converted[i] = _tz_convert_tzlocal_utc(val, tz, to_utc) else: - converted = vals + converted = _tz_convert_dst(vals, tz, to_utc) return converted @@ -471,11 +470,12 @@ cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz, npy_datetimestruct dts datetime dt int64_t delta + timedelta td dt64_to_dtstruct(val, &dts) dt = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us) - # get_utcoffset (tz.utcoffset under the hood) only makes sense if datetime + # tz.utcoffset only makes sense if datetime # is _wall time_, so if val is a UTC timestamp convert to wall time if not 
to_utc: dt = dt.replace(tzinfo=tzutc()) @@ -484,7 +484,8 @@ cdef inline int64_t _tzlocal_get_offset_components(int64_t val, tzinfo tz, if fold is not NULL: fold[0] = dt.fold - return int(get_utcoffset(tz, dt).total_seconds()) * 1000000000 + td = tz.utcoffset(dt) + return int(td.total_seconds() * 1_000_000_000) cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True, From ecd26edb1b9daeca0619b50f9340d4a49cd660e2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 29 Jun 2020 16:15:45 -0700 Subject: [PATCH 0235/1025] DOC: Add example of NonFixedVariableWindowIndexer usage (#34994) --- doc/source/user_guide/computation.rst | 12 ++++++++++++ doc/source/whatsnew/v1.1.0.rst | 1 + pandas/api/indexers/__init__.py | 13 +++++++++++-- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 897e5d5fb0e24..3a524996ea6d9 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -597,6 +597,18 @@ You can view other examples of ``BaseIndexer`` subclasses `here Date: Tue, 30 Jun 2020 00:19:44 +0100 Subject: [PATCH 0236/1025] CLN: make Info and DataFrameInfo subclasses (#34743) --- pandas/core/frame.py | 32 ++-- pandas/io/formats/info.py | 377 ++++++++++++++++++++++++-------------- 2 files changed, 252 insertions(+), 157 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4fc96993f5b2e..f6ea6f51d88a8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -141,7 +141,7 @@ from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import info +from pandas.io.formats.info import DataFrameInfo import pandas.plotting if TYPE_CHECKING: @@ -2462,11 +2462,11 @@ def to_html( RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): - # Column Non-Null Count Dtype + # Column Non-Null Count Dtype --- ------ -------------- 
----- - 0 int_col 5 non-null int64 - 1 text_col 5 non-null object - 2 float_col 5 non-null float64 + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 248.0+ bytes @@ -2505,11 +2505,11 @@ def to_html( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - # Column Non-Null Count Dtype + # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB @@ -2517,11 +2517,11 @@ def to_html( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - # Column Non-Null Count Dtype + # Column Non-Null Count Dtype --- ------ -------------- ----- - 0 column_1 1000000 non-null object - 1 column_2 1000000 non-null object - 2 column_3 1000000 non-null object + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 188.8 MB""" ), @@ -2532,7 +2532,7 @@ def to_html( DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), ) - @doc(info) + @doc(DataFrameInfo.info) def info( self, verbose: Optional[bool] = None, @@ -2541,7 +2541,9 @@ def info( memory_usage: Optional[Union[bool, str]] = None, null_counts: Optional[bool] = None, ) -> None: - return info(self, verbose, buf, max_cols, memory_usage, null_counts) + return DataFrameInfo( + self, verbose, buf, max_cols, memory_usage, null_counts + ).info() def memory_usage(self, index=True, deep=False) -> Series: """ diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index b1dcafa7a7a8f..7a53b46a4ac0f 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,15 +1,17 @@ +from abc import ABCMeta, abstractmethod import sys -from 
typing import IO, TYPE_CHECKING, Optional, Tuple, Union +from typing import IO, TYPE_CHECKING, List, Optional, Tuple, Union from pandas._config import get_option from pandas._typing import Dtype, FrameOrSeries +from pandas.core.indexes.api import Index + from pandas.io.formats import format as fmt from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: - from pandas.core.indexes.api import Index # noqa: F401 from pandas.core.series import Series # noqa: F401 @@ -39,115 +41,247 @@ def _put_str(s: Union[str, Dtype], space: int) -> str: return str(s)[:space].ljust(space) -def _get_ids_and_dtypes(data: FrameOrSeries) -> Tuple["Index", "Series"]: +def _sizeof_fmt(num: Union[int, float], size_qualifier: str) -> str: """ - Get DataFrame's columns and dtypes. + Return size in human readable format. Parameters ---------- - data : DataFrame - Object that `info` was called on. + num : int + Size in bytes. + size_qualifier : str + Either empty, or '+' (if lower bound). Returns ------- - ids : Index - DataFrame's columns. - dtypes : Series - Dtype of each of the DataFrame's columns. - """ - ids = data.columns - dtypes = data.dtypes - return ids, dtypes - - -def info( - data: FrameOrSeries, - verbose: Optional[bool] = None, - buf: Optional[IO[str]] = None, - max_cols: Optional[int] = None, - memory_usage: Optional[Union[bool, str]] = None, - null_counts: Optional[bool] = None, -) -> None: - """ - Print a concise summary of a %(klass)s. - - This method prints information about a %(klass)s including - the index dtype%(type_sub)s, non-null values and memory usage. - - Parameters - ---------- - data : %(klass)s - %(klass)s to print information about. - verbose : bool, optional - Whether to print the full summary. By default, the setting in - ``pandas.options.display.max_info_columns`` is followed. - buf : writable buffer, defaults to sys.stdout - Where to send the output. By default, the output is printed to - sys.stdout. 
Pass a writable buffer if you need to further process - the output. - %(max_cols_sub)s - memory_usage : bool, str, optional - Specifies whether total memory usage of the %(klass)s - elements (including the index) should be displayed. By default, - this follows the ``pandas.options.display.memory_usage`` setting. - - True always show memory usage. False never shows memory usage. - A value of 'deep' is equivalent to "True with deep introspection". - Memory usage is shown in human-readable units (base-2 - representation). Without deep introspection a memory estimation is - made based in column dtype and number of rows assuming values - consume the same memory amount for corresponding dtypes. With deep - memory introspection, a real memory usage calculation is performed - at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. - - Returns - ------- - None - This method prints a summary of a %(klass)s and returns None. - - See Also - -------- - %(see_also_sub)s + str + Size in human readable format. 
Examples -------- - %(examples_sub)s - """ - if buf is None: # pragma: no cover - buf = sys.stdout - - lines = [] - - lines.append(str(type(data))) - lines.append(data.index._summary()) - - ids, dtypes = _get_ids_and_dtypes(data) - col_count = len(ids) - - if col_count == 0: - lines.append(f"Empty {type(data).__name__}") - fmt.buffer_put_lines(buf, lines) - return - - # hack - if max_cols is None: - max_cols = get_option("display.max_info_columns", col_count + 1) - - max_rows = get_option("display.max_info_rows", len(data) + 1) + >>> _sizeof_fmt(23028, '') + '22.5 KB' - if null_counts is None: - show_counts = (col_count <= max_cols) and (len(data) < max_rows) - else: - show_counts = null_counts - exceeds_info_cols = col_count > max_cols + >>> _sizeof_fmt(23028, '+') + '22.5+ KB' + """ + for x in ["bytes", "KB", "MB", "GB", "TB"]: + if num < 1024.0: + return f"{num:3.1f}{size_qualifier} {x}" + num /= 1024.0 + return f"{num:3.1f}{size_qualifier} PB" + + +class BaseInfo(metaclass=ABCMeta): + def __init__( + self, + data: FrameOrSeries, + verbose: Optional[bool] = None, + buf: Optional[IO[str]] = None, + max_cols: Optional[int] = None, + memory_usage: Optional[Union[bool, str]] = None, + null_counts: Optional[bool] = None, + ): + if buf is None: # pragma: no cover + buf = sys.stdout + if memory_usage is None: + memory_usage = get_option("display.memory_usage") + + self.data = data + self.verbose = verbose + self.buf = buf + self.max_cols = max_cols + self.memory_usage = memory_usage + self.null_counts = null_counts + + @abstractmethod + def _get_mem_usage(self, deep: bool) -> int: + """ + Get memory usage in bytes. + + Parameters + ---------- + deep : bool + If True, introspect the data deeply by interrogating object dtypes + for system-level memory consumption, and include it in the returned + values. + + Returns + ------- + mem_usage : int + Object's total memory usage in bytes. 
+ """ + pass + + @abstractmethod + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + """ + Get column names and dtypes. + + Returns + ------- + ids : Index + DataFrame's column names. + dtypes : Series + Dtype of each of the DataFrame's columns. + """ + pass + + @abstractmethod + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool + ) -> None: + """ + Append name, non-null count (optional), and dtype for each column to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + dtypes : Series + The DataFrame's columns' dtypes. + show_counts : bool + If True, count of non-NA cells for each column will be appended to `lines`. + """ + pass + + @abstractmethod + def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: + """ + Append short summary of columns' names to `lines`. + + Parameters + ---------- + lines : List[str] + Lines that will contain `info` representation. + ids : Index + The DataFrame's column names. + """ + pass + + def info(self) -> None: + """ + Print a concise summary of a %(klass)s. + + This method prints information about a %(klass)s including + the index dtype%(type_sub)s, non-null values and memory usage. + + Parameters + ---------- + data : %(klass)s + %(klass)s to print information about. + verbose : bool, optional + Whether to print the full summary. By default, the setting in + ``pandas.options.display.max_info_columns`` is followed. + buf : writable buffer, defaults to sys.stdout + Where to send the output. By default, the output is printed to + sys.stdout. Pass a writable buffer if you need to further process + the output. + %(max_cols_sub)s + memory_usage : bool, str, optional + Specifies whether total memory usage of the %(klass)s + elements (including the index) should be displayed. By default, + this follows the ``pandas.options.display.memory_usage`` setting. 
+ + True always show memory usage. False never shows memory usage. + A value of 'deep' is equivalent to "True with deep introspection". + Memory usage is shown in human-readable units (base-2 + representation). Without deep introspection a memory estimation is + made based in column dtype and number of rows assuming values + consume the same memory amount for corresponding dtypes. With deep + memory introspection, a real memory usage calculation is performed + at the cost of computational resources. + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the %(klass)s is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts. + + Returns + ------- + None + This method prints a summary of a %(klass)s and returns None. + + See Also + -------- + %(see_also_sub)s + + Examples + -------- + %(examples_sub)s + """ + lines = [] + + lines.append(str(type(self.data))) + lines.append(self.data.index._summary()) + + ids, dtypes = self._get_ids_and_dtypes() + col_count = len(ids) + + if col_count == 0: + lines.append(f"Empty {type(self.data).__name__}") + fmt.buffer_put_lines(self.buf, lines) + return + + # hack + max_cols = self.max_cols + if max_cols is None: + max_cols = get_option("display.max_info_columns", col_count + 1) + + max_rows = get_option("display.max_info_rows", len(self.data) + 1) + + if self.null_counts is None: + show_counts = (col_count <= max_cols) and (len(self.data) < max_rows) + else: + show_counts = self.null_counts + exceeds_info_cols = col_count > max_cols - def _verbose_repr(): + if self.verbose: + self._verbose_repr(lines, ids, dtypes, show_counts) + elif self.verbose is False: # specifically set to False, not necessarily None + self._non_verbose_repr(lines, ids) + else: + if exceeds_info_cols: + self._non_verbose_repr(lines, ids) + else: + self._verbose_repr(lines, ids, 
dtypes, show_counts) + + # groupby dtype.name to collect e.g. Categorical columns + counts = dtypes.value_counts().groupby(lambda x: x.name).sum() + collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] + lines.append(f"dtypes: {', '.join(collected_dtypes)}") + + if self.memory_usage: + # append memory usage of df to display + size_qualifier = "" + if self.memory_usage == "deep": + deep = True + else: + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object + # categories) + deep = False + if "object" in counts or self.data.index._is_memory_usage_qualified(): + size_qualifier = "+" + mem_usage = self._get_mem_usage(deep=deep) + lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") + fmt.buffer_put_lines(self.buf, lines) + + +class DataFrameInfo(BaseInfo): + def _get_mem_usage(self, deep: bool) -> int: + return self.data.memory_usage(index=True, deep=deep).sum() + + def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: + return self.data.columns, self.data.dtypes + + def _verbose_repr( + self, lines: List[str], ids: "Index", dtypes: "Series", show_counts: bool + ) -> None: + col_count = len(ids) lines.append(f"Data columns (total {col_count} columns):") id_head = " # " @@ -164,7 +298,7 @@ def _verbose_repr(): header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: - counts = data.count() + counts = self.data.count() if col_count != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({col_count} != {len(counts)})" @@ -213,46 +347,5 @@ def _verbose_repr(): + _put_str(dtype, space_dtype) ) - def _non_verbose_repr(): + def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: lines.append(ids._summary(name="Columns")) - - def _sizeof_fmt(num, size_qualifier): - # returns size in human readable format - for x in ["bytes", "KB", "MB", "GB", "TB"]: - if num < 1024.0: - return 
f"{num:3.1f}{size_qualifier} {x}" - num /= 1024.0 - return f"{num:3.1f}{size_qualifier} PB" - - if verbose: - _verbose_repr() - elif verbose is False: # specifically set to False, not necessarily None - _non_verbose_repr() - else: - if exceeds_info_cols: - _non_verbose_repr() - else: - _verbose_repr() - - # groupby dtype.name to collect e.g. Categorical columns - counts = dtypes.value_counts().groupby(lambda x: x.name).sum() - collected_dtypes = [f"{k[0]}({k[1]:d})" for k in sorted(counts.items())] - lines.append(f"dtypes: {', '.join(collected_dtypes)}") - - if memory_usage is None: - memory_usage = get_option("display.memory_usage") - if memory_usage: - # append memory usage of df to display - size_qualifier = "" - if memory_usage == "deep": - deep = True - else: - # size_qualifier is just a best effort; not guaranteed to catch - # all cases (e.g., it misses categorical data even with object - # categories) - deep = False - if "object" in counts or data.index._is_memory_usage_qualified(): - size_qualifier = "+" - mem_usage = data.memory_usage(index=True, deep=deep).sum() - lines.append(f"memory usage: {_sizeof_fmt(mem_usage, size_qualifier)}\n") - fmt.buffer_put_lines(buf, lines) From 8347ad86ad6bb144f79da4757266baf3f83662cb Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 06:54:56 -0500 Subject: [PATCH 0237/1025] DOC/TST: DataFrame constructor with a list of DataFrames (#34991) * DOC/TST: DataFrame constructor with a list of DataFrames Closes #32289 --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/internals/construction.py | 2 +- pandas/tests/frame/test_constructors.py | 38 +++++++++---------------- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cd2b8a8055e68..040253ebe7279 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -696,6 +696,7 @@ Other API changes - :func: `merge` now checks ``suffixes`` parameter type to be 
``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) - :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) - :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) +- The :class:`DataFrame` constructor no longer accepts a list of ``DataFrame`` objects. Because of changes to NumPy, ``DataFrame`` objects are now consistently treated as 2D objects, so a list of ``DataFrames`` is considered 3D, and no longer acceptible for the ``DataFrame`` constructor (:issue:`32289`). Increased minimum versions for dependencies diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d49f1f154a2c1..4b9db810dead0 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -321,7 +321,7 @@ def convert(v): if values.ndim == 1: values = values.reshape((values.shape[0], 1)) elif values.ndim != 2: - raise ValueError("Must pass 2-d input") + raise ValueError(f"Must pass 2-d input. 
shape={values.shape}") return values diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 02a871666c78d..dba243f1a339a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -11,7 +11,7 @@ import pytz from pandas.compat import PY37, is_platform_little_endian -from pandas.compat.numpy import _is_numpy_dev +from pandas.compat.numpy import _np_version_under1p19 from pandas.core.dtypes.common import is_integer_dtype @@ -147,14 +147,20 @@ def test_constructor_dtype_list_data(self): assert df.loc[1, 0] is None assert df.loc[0, 1] == "2" - @pytest.mark.xfail(_is_numpy_dev, reason="Interprets list of frame as 3D") - def test_constructor_list_frames(self): - # see gh-3243 - result = DataFrame([DataFrame()]) - assert result.shape == (1, 0) + @pytest.mark.skipif(_np_version_under1p19, reason="NumPy change.") + def test_constructor_list_of_2d_raises(self): + # https://github.com/pandas-dev/pandas/issues/32289 + a = pd.DataFrame() + b = np.empty((0, 0)) + with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"): + pd.DataFrame([a]) - result = DataFrame([DataFrame(dict(A=np.arange(5)))]) - assert isinstance(result.iloc[0, 0], DataFrame) + with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"): + pd.DataFrame([b]) + + a = pd.DataFrame({"A": [1, 2]}) + with pytest.raises(ValueError, match=r"shape=\(2, 2, 1\)"): + pd.DataFrame([a, a]) def test_constructor_mixed_dtypes(self): def _make_mixed_dtypes_df(typ, ad=None): @@ -507,22 +513,6 @@ def test_constructor_error_msgs(self): with pytest.raises(ValueError, match=msg): DataFrame({"a": False, "b": True}) - @pytest.mark.xfail(_is_numpy_dev, reason="Interprets embedded frame as 3D") - def test_constructor_with_embedded_frames(self): - - # embedded data frames - df1 = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}) - df2 = DataFrame([df1, df1 + 10]) - - df2.dtypes - str(df2) - - result = df2.loc[0, 0] - tm.assert_frame_equal(result, df1) - - 
result = df2.loc[1, 0] - tm.assert_frame_equal(result, df1 + 10) - def test_constructor_subclass_dict(self, float_frame, dict_subclass): # Test for passing dict subclass to constructor data = { From 9d18987b9db303bb059d4ac831d1f47b9cbc8522 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 30 Jun 2020 05:51:45 -0700 Subject: [PATCH 0238/1025] REF: Rename NonFixedVariableWindowIndexer to VariableOffsetWindowIndexer (#35059) Co-authored-by: Matt Roeschke --- doc/source/user_guide/computation.rst | 6 +++--- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/api/indexers/__init__.py | 4 ++-- pandas/core/window/indexers.py | 2 +- pandas/tests/window/test_base_indexer.py | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 3a524996ea6d9..f36c6e06044f2 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -597,15 +597,15 @@ You can view other examples of ``BaseIndexer`` subclasses `here Date: Tue, 30 Jun 2020 05:52:22 -0700 Subject: [PATCH 0239/1025] PERF: put some Timetamp methods in _Timestamp (#35036) --- pandas/_libs/tslibs/timestamps.pxd | 4 +- pandas/_libs/tslibs/timestamps.pyx | 291 +++++++++++++++-------------- 2 files changed, 151 insertions(+), 144 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index 88d21b19e1e37..27b659980e526 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -16,8 +16,8 @@ cdef class _Timestamp(ABCTimestamp): int64_t value, nanosecond object freq - cpdef bint _get_start_end_field(self, str field) - cpdef _get_date_name_field(self, object field, object locale) + cdef bint _get_start_end_field(self, str field) + cdef _get_date_name_field(self, str field, object locale) cdef int64_t _maybe_convert_value_to_local(self) cpdef to_datetime64(self) cdef _assert_tzawareness_compat(_Timestamp self, datetime other) 
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 471ed557f4327..15fcfc742ecf3 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -441,6 +441,8 @@ cdef class _Timestamp(ABCTimestamp): return NotImplemented + # ----------------------------------------------------------------- + cdef int64_t _maybe_convert_value_to_local(self): """Convert UTC i8 value to local i8 value if tz exists""" cdef: @@ -450,7 +452,7 @@ cdef class _Timestamp(ABCTimestamp): val = tz_convert_single(self.value, UTC, self.tz) return val - cpdef bint _get_start_end_field(self, str field): + cdef bint _get_start_end_field(self, str field): cdef: int64_t val dict kwds @@ -471,7 +473,67 @@ cdef class _Timestamp(ABCTimestamp): field, freqstr, month_kw) return out[0] - cpdef _get_date_name_field(self, object field, object locale): + @property + def is_month_start(self) -> bool: + """ + Return True if date is first day of month. + """ + if self.freq is None: + # fast-path for non-business frequencies + return self.day == 1 + return self._get_start_end_field("is_month_start") + + @property + def is_month_end(self) -> bool: + """ + Return True if date is last day of month. + """ + if self.freq is None: + # fast-path for non-business frequencies + return self.day == self.days_in_month + return self._get_start_end_field("is_month_end") + + @property + def is_quarter_start(self) -> bool: + """ + Return True if date is first day of the quarter. + """ + if self.freq is None: + # fast-path for non-business frequencies + return self.day == 1 and self.month % 3 == 1 + return self._get_start_end_field("is_quarter_start") + + @property + def is_quarter_end(self) -> bool: + """ + Return True if date is last day of the quarter. 
+ """ + if self.freq is None: + # fast-path for non-business frequencies + return (self.month % 3) == 0 and self.day == self.days_in_month + return self._get_start_end_field("is_quarter_end") + + @property + def is_year_start(self) -> bool: + """ + Return True if date is first day of the year. + """ + if self.freq is None: + # fast-path for non-business frequencies + return self.day == self.month == 1 + return self._get_start_end_field("is_year_start") + + @property + def is_year_end(self) -> bool: + """ + Return True if date is last day of the year. + """ + if self.freq is None: + # fast-path for non-business frequencies + return self.month == 12 and self.day == 31 + return self._get_start_end_field("is_year_end") + + cdef _get_date_name_field(self, str field, object locale): cdef: int64_t val object[:] out @@ -481,6 +543,85 @@ cdef class _Timestamp(ABCTimestamp): field, locale=locale) return out[0] + def day_name(self, locale=None) -> str: + """ + Return the day name of the Timestamp with specified locale. + + Parameters + ---------- + locale : str, default None (English locale) + Locale determining the language in which to return the day name. + + Returns + ------- + day_name : string + + .. versionadded:: 0.23.0 + """ + return self._get_date_name_field("day_name", locale) + + def month_name(self, locale=None) -> str: + """ + Return the month name of the Timestamp with specified locale. + + Parameters + ---------- + locale : str, default None (English locale) + Locale determining the language in which to return the month name. + + Returns + ------- + month_name : string + + .. versionadded:: 0.23.0 + """ + return self._get_date_name_field("month_name", locale) + + @property + def is_leap_year(self) -> bool: + """ + Return True if year is a leap year. + """ + return bool(ccalendar.is_leapyear(self.year)) + + @property + def dayofweek(self) -> int: + """ + Return day of the week. 
+ """ + return self.weekday() + + @property + def dayofyear(self) -> int: + """ + Return the day of the year. + """ + return ccalendar.get_day_of_year(self.year, self.month, self.day) + + @property + def quarter(self) -> int: + """ + Return the quarter of the year. + """ + return ((self.month - 1) // 3) + 1 + + @property + def week(self) -> int: + """ + Return the week number of the year. + """ + return ccalendar.get_week_of_year(self.year, self.month, self.day) + + @property + def days_in_month(self) -> int: + """ + Return the number of days in the month. + """ + return ccalendar.get_days_in_month(self.year, self.month) + + # ----------------------------------------------------------------- + # Rendering Methods + @property def _repr_base(self) -> str: return f"{self._date_repr} {self._time_repr}" @@ -514,6 +655,8 @@ cdef class _Timestamp(ABCTimestamp): return self._date_repr return self._repr_base + # ----------------------------------------------------------------- + @property def asm8(self) -> np.datetime64: """ @@ -1040,79 +1183,6 @@ timedelta}, default 'raise' return Period(self, freq=freq) - @property - def dayofweek(self) -> int: - """ - Return day of the week. - """ - return self.weekday() - - def day_name(self, locale=None) -> str: - """ - Return the day name of the Timestamp with specified locale. - - Parameters - ---------- - locale : str, default None (English locale) - Locale determining the language in which to return the day name. - - Returns - ------- - day_name : string - - .. versionadded:: 0.23.0 - """ - return self._get_date_name_field('day_name', locale) - - def month_name(self, locale=None) -> str: - """ - Return the month name of the Timestamp with specified locale. - - Parameters - ---------- - locale : str, default None (English locale) - Locale determining the language in which to return the month name. - - Returns - ------- - month_name : string - - .. 
versionadded:: 0.23.0 - """ - return self._get_date_name_field('month_name', locale) - - @property - def dayofyear(self) -> int: - """ - Return the day of the year. - """ - return ccalendar.get_day_of_year(self.year, self.month, self.day) - - @property - def week(self) -> int: - """ - Return the week number of the year. - """ - return ccalendar.get_week_of_year(self.year, self.month, self.day) - - weekofyear = week - - @property - def quarter(self) -> int: - """ - Return the quarter of the year. - """ - return ((self.month - 1) // 3) + 1 - - @property - def days_in_month(self) -> int: - """ - Return the number of days in the month. - """ - return ccalendar.get_days_in_month(self.year, self.month) - - daysinmonth = days_in_month - @property def freqstr(self): """ @@ -1120,73 +1190,6 @@ timedelta}, default 'raise' """ return getattr(self.freq, 'freqstr', self.freq) - @property - def is_month_start(self) -> bool: - """ - Return True if date is first day of month. - """ - if self.freq is None: - # fast-path for non-business frequencies - return self.day == 1 - return self._get_start_end_field('is_month_start') - - @property - def is_month_end(self) -> bool: - """ - Return True if date is last day of month. - """ - if self.freq is None: - # fast-path for non-business frequencies - return self.day == self.days_in_month - return self._get_start_end_field('is_month_end') - - @property - def is_quarter_start(self) -> bool: - """ - Return True if date is first day of the quarter. - """ - if self.freq is None: - # fast-path for non-business frequencies - return self.day == 1 and self.month % 3 == 1 - return self._get_start_end_field('is_quarter_start') - - @property - def is_quarter_end(self) -> bool: - """ - Return True if date is last day of the quarter. 
- """ - if self.freq is None: - # fast-path for non-business frequencies - return (self.month % 3) == 0 and self.day == self.days_in_month - return self._get_start_end_field('is_quarter_end') - - @property - def is_year_start(self) -> bool: - """ - Return True if date is first day of the year. - """ - if self.freq is None: - # fast-path for non-business frequencies - return self.day == self.month == 1 - return self._get_start_end_field('is_year_start') - - @property - def is_year_end(self) -> bool: - """ - Return True if date is last day of the year. - """ - if self.freq is None: - # fast-path for non-business frequencies - return self.month == 12 and self.day == 31 - return self._get_start_end_field('is_year_end') - - @property - def is_leap_year(self) -> bool: - """ - Return True if year is a leap year. - """ - return bool(ccalendar.is_leapyear(self.year)) - def tz_localize(self, tz, ambiguous='raise', nonexistent='raise'): """ Convert naive Timestamp to local time zone, or remove @@ -1456,6 +1459,10 @@ default 'raise' return Timestamp(normalized[0]).tz_localize(own_tz) +# Aliases +Timestamp.weekofyear = Timestamp.week +Timestamp.daysinmonth = Timestamp.days_in_month + # Add the min and max fields at the class level cdef int64_t _NS_UPPER_BOUND = np.iinfo(np.int64).max # the smallest value we could actually represent is From a502a372df1f2513e04ebf7cc4a293f28bde640e Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Tue, 30 Jun 2020 09:01:23 -0400 Subject: [PATCH 0240/1025] BUG: HDFStore unable to create colindex w/o error thrown (#34983) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/pytables.py | 8 ++++++- pandas/tests/io/pytables/test_store.py | 31 ++++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f172fee7bcdbe..0ca19ffd1f496 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst 
@@ -1042,6 +1042,7 @@ I/O - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) - Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) - Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`) +- Bug in :meth:`~HDFStore.create_table` now raises an error when `column` argument was not specified in `data_columns` on input (:issue:`28156`) - :meth:`read_json` now could read line-delimited json file from a file url while `lines` and `chunksize` are set. Plotting diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0e5d7b007bd89..981b380f8b5e9 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3569,7 +3569,6 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): for c in columns: v = getattr(table.cols, c, None) if v is not None: - # remove the index if the kind/optlevel have changed if v.is_indexed: index = v.index @@ -3597,6 +3596,13 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None): "data_columns when initializing the table." ) v.create_index(**kw) + elif c in self.non_index_axes[0][1]: + # GH 28156 + raise AttributeError( + f"column {c} is not a data_column.\n" + f"In order to read column {c} you must reload the dataframe \n" + f"into HDFStore and include {c} with the data_columns argument." 
+ ) def _read_axes( self, where, start: Optional[int] = None, stop: Optional[int] = None diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index c69992471fc9b..df014171be817 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1727,6 +1727,37 @@ def col(t, column): with pytest.raises(TypeError): store.create_table_index("f2") + def test_create_table_index_data_columns_argument(self, setup_path): + # GH 28156 + + with ensure_clean_store(setup_path) as store: + + with catch_warnings(record=True): + + def col(t, column): + return getattr(store.get_storer(t).table.cols, column) + + # data columns + df = tm.makeTimeDataFrame() + df["string"] = "foo" + df["string2"] = "bar" + store.append("f", df, data_columns=["string"]) + assert col("f", "index").is_indexed is True + assert col("f", "string").is_indexed is True + + msg = "'Cols' object has no attribute 'string2'" + with pytest.raises(AttributeError, match=msg): + col("f", "string2").is_indexed + + # try to index a col which isn't a data_column + msg = ( + f"column string2 is not a data_column.\n" + f"In order to read column string2 you must reload the dataframe \n" + f"into HDFStore and include string2 with the data_columns argument." 
+ ) + with pytest.raises(AttributeError, match=msg): + store.create_table_index("f", columns=["string2"]) + def test_append_hierarchical(self, setup_path): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], From 6c4ecf54064b4716ea76d3444ad4aaf6bac91572 Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Tue, 30 Jun 2020 18:59:23 +0200 Subject: [PATCH 0241/1025] Alligned docstring for ignore_index in append (#35056) --- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f6ea6f51d88a8..a21a45f415a47 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7551,7 +7551,7 @@ def append( other : DataFrame or Series/dict-like object, or list of these The data to append. ignore_index : bool, default False - If True, do not use the index labels. + If True, the resulting axis will be labeled 0, 1, …, n - 1. verify_integrity : bool, default False If True, raise ValueError on creating index with duplicates. sort : bool, default False diff --git a/pandas/core/series.py b/pandas/core/series.py index 54b85afea4964..be4099d56d43a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2609,7 +2609,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): to_append : Series or list/tuple of Series Series to append with self. ignore_index : bool, default False - If True, do not use the index labels. + If True, the resulting axis will be labeled 0, 1, …, n - 1. verify_integrity : bool, default False If True, raise Exception on creating index with duplicates. 
From a5652a7ad40508d4ab9a7f6c26764597bceafaa4 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 15:58:43 -0500 Subject: [PATCH 0242/1025] API: Allow non-tuples in pandas.merge (#34810) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/frame.py | 11 +++++++---- pandas/core/reshape/merge.py | 12 ++++++++---- pandas/tests/reshape/merge/test_merge.py | 18 +++++++----------- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 0ca19ffd1f496..d32eeb493b2c2 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -693,7 +693,6 @@ Other API changes - :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. - :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) -- :func: `merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`) - :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) - :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) - The :class:`DataFrame` constructor no longer accepts a list of ``DataFrame`` objects. Because of changes to NumPy, ``DataFrame`` objects are now consistently treated as 2D objects, so a list of ``DataFrames`` is considered 3D, and no longer acceptible for the ``DataFrame`` constructor (:issue:`32289`). 
@@ -787,6 +786,7 @@ Deprecations - :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`) - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) +- Providing ``suffixes`` as a ``set`` in :func:`pandas.merge` is deprecated. Provide a tuple instead (:issue:`33740`, :issue:`34741`). - :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) - Passing any arguments but the first one to :func:`read_html` as diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a21a45f415a47..b6993e9ed851a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -227,10 +227,13 @@ sort : bool, default False Sort the join keys lexicographically in the result DataFrame. If False, the order of the join keys depends on the join type (how keyword). -suffixes : tuple of (str, str), default ('_x', '_y') - Suffix to apply to overlapping column names in the left and right - side, respectively. To raise an exception on overlapping columns use - (False, False). +suffixes : list-like, default is ("_x", "_y") + A length-2 sequence where each element is optionally a string + indicating the suffix to add to overlapping column names in + `left` and `right` respectively. Pass a value of `None` instead + of a string to indicate that the column name from `left` or + `right` should be left as-is, with no suffix. At least one of the + values must not be None. copy : bool, default True If False, avoid copy if possible. 
indicator : bool or str, default False diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5e4eb89f0b45f..27b331babe692 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -194,7 +194,7 @@ def merge_ordered( left DataFrame. fill_method : {'ffill', None}, default None Interpolation method for data. - suffixes : Sequence, default is ("_x", "_y") + suffixes : list-like, default is ("_x", "_y") A length-2 sequence where each element is optionally a string indicating the suffix to add to overlapping column names in `left` and `right` respectively. Pass a value of `None` instead @@ -2072,9 +2072,13 @@ def _items_overlap_with_suffix(left: Index, right: Index, suffixes: Tuple[str, s If corresponding suffix is empty, the entry is simply converted to string. """ - if not isinstance(suffixes, tuple): - raise TypeError( - f"suffixes should be tuple of (str, str). But got {type(suffixes).__name__}" + if not is_list_like(suffixes, allow_sets=False): + warnings.warn( + f"Passing 'suffixes' as a {type(suffixes)}, is not supported and may give " + "unexpected results. Provide 'suffixes' as a tuple instead. 
In the " + "future a 'TypeError' will be raised.", + FutureWarning, + stacklevel=4, ) to_rename = left.intersection(right) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 0a4d5f17a48cc..4fd3c688b8771 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1999,6 +1999,7 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm): (0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]), (0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]), (0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]), + (0, 0, dict(suffixes=["_x", "_y"]), ["0_x", "0_y"]), ("a", 0, dict(suffixes=(None, "_y")), ["a", 0]), (0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]), ("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]), @@ -2069,18 +2070,13 @@ def test_merge_suffix_error(col1, col2, suffixes): pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) -@pytest.mark.parametrize( - "col1, col2, suffixes", [("a", "a", {"a", "b"}), ("a", "a", None), (0, 0, None)], -) -def test_merge_suffix_type_error(col1, col2, suffixes): - a = pd.DataFrame({col1: [1, 2, 3]}) - b = pd.DataFrame({col2: [3, 4, 5]}) +@pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}]) +def test_merge_suffix_warns(suffixes): + a = pd.DataFrame({"a": [1, 2, 3]}) + b = pd.DataFrame({"b": [3, 4, 5]}) - msg = ( - f"suffixes should be tuple of \\(str, str\\). 
But got {type(suffixes).__name__}" - ) - with pytest.raises(TypeError, match=msg): - pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes) + with tm.assert_produces_warning(FutureWarning): + pd.merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"}) @pytest.mark.parametrize( From 641d8128db00b09eeb0fff3192f50aa26697bbfb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 30 Jun 2020 14:05:51 -0700 Subject: [PATCH 0243/1025] CLN: type get_resolution tz as tzinfo (#35065) --- pandas/_libs/tslibs/resolution.pyx | 8 +++----- pandas/_libs/tslibs/tzconversion.pyx | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 4dbecc76ad986..d5f10374d2860 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -1,3 +1,4 @@ +from cpython.datetime cimport tzinfo import numpy as np from numpy cimport ndarray, int64_t, int32_t @@ -8,7 +9,7 @@ from pandas._libs.tslibs.dtypes import Resolution from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dt64_to_dtstruct) from pandas._libs.tslibs.timezones cimport ( - is_utc, is_tzlocal, maybe_get_tz, get_dst_info) + is_utc, is_tzlocal, get_dst_info) from pandas._libs.tslibs.ccalendar cimport get_days_in_month from pandas._libs.tslibs.tzconversion cimport tz_convert_utc_to_tzlocal @@ -33,7 +34,7 @@ cdef: # ---------------------------------------------------------------------- -def get_resolution(const int64_t[:] stamps, tz=None): +def get_resolution(const int64_t[:] stamps, tzinfo tz=None): cdef: Py_ssize_t i, n = len(stamps) npy_datetimestruct dts @@ -43,9 +44,6 @@ def get_resolution(const int64_t[:] stamps, tz=None): Py_ssize_t[:] pos int64_t local_val, delta - if tz is not None: - tz = maybe_get_tz(tz) - if is_utc(tz) or tz is None: for i in range(n): if stamps[i] == NPY_NAT: diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx 
index 6e6b106b8f21a..925b3c5615435 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -21,7 +21,7 @@ from pandas._libs.tslibs.nattype cimport NPY_NAT from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dt64_to_dtstruct) from pandas._libs.tslibs.timezones cimport ( - get_dst_info, is_tzlocal, is_utc, get_timezone, get_utcoffset) + get_dst_info, is_tzlocal, is_utc, get_timezone) # TODO: cdef scalar version to call from convert_str_to_tsobject From 2f2ba0b6885d6c2b0e6b00b42dd13c4d747685f6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 30 Jun 2020 14:58:43 -0700 Subject: [PATCH 0244/1025] PERF: type tz kwarg in create_timestamp_from_ts (#35067) --- pandas/_libs/tslib.pyx | 17 ++++++++++------- pandas/_libs/tslibs/timestamps.pxd | 4 ++-- pandas/_libs/tslibs/timestamps.pyx | 2 +- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 44693d60486a9..f494e74bde55f 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -8,6 +8,7 @@ from cpython.datetime cimport ( datetime, time, timedelta, + tzinfo, ) # import datetime C API PyDateTime_IMPORT @@ -77,9 +78,9 @@ from pandas._libs.missing cimport checknull_with_nat_and_na cdef inline object create_datetime_from_ts( int64_t value, npy_datetimestruct dts, - object tz, + tzinfo tz, object freq, - bint fold + bint fold, ): """ Convenience routine to construct a datetime.datetime from its parts. 
@@ -92,7 +93,7 @@ cdef inline object create_datetime_from_ts( cdef inline object create_date_from_ts( int64_t value, npy_datetimestruct dts, - object tz, + tzinfo tz, object freq, bint fold ): @@ -106,7 +107,7 @@ cdef inline object create_date_from_ts( cdef inline object create_time_from_ts( int64_t value, npy_datetimestruct dts, - object tz, + tzinfo tz, object freq, bint fold ): @@ -120,7 +121,7 @@ cdef inline object create_time_from_ts( @cython.boundscheck(False) def ints_to_pydatetime( const int64_t[:] arr, - object tz=None, + tzinfo tz=None, object freq=None, bint fold=False, str box="datetime" @@ -162,7 +163,7 @@ def ints_to_pydatetime( str typ int64_t value, delta, local_value ndarray[object] result = np.empty(n, dtype=object) - object (*func_create)(int64_t, npy_datetimestruct, object, object, bint) + object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) if box == "date": assert (tz is None), "tz should be None when converting to date" @@ -178,7 +179,9 @@ def ints_to_pydatetime( elif box == "datetime": func_create = create_datetime_from_ts else: - raise ValueError("box must be one of 'datetime', 'date', 'time' or 'timestamp'") + raise ValueError( + "box must be one of 'datetime', 'date', 'time' or 'timestamp'" + ) if is_utc(tz) or tz is None: for i in range(n): diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index 27b659980e526..307b6dfc90715 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -1,4 +1,4 @@ -from cpython.datetime cimport datetime +from cpython.datetime cimport datetime, tzinfo from numpy cimport int64_t @@ -8,7 +8,7 @@ from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct cdef object create_timestamp_from_ts(int64_t value, npy_datetimestruct dts, - object tz, object freq, bint fold) + tzinfo tz, object freq, bint fold) cdef class _Timestamp(ABCTimestamp): diff --git a/pandas/_libs/tslibs/timestamps.pyx 
b/pandas/_libs/tslibs/timestamps.pyx index 15fcfc742ecf3..355dc0dbc5820 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -69,7 +69,7 @@ _no_input = object() cdef inline object create_timestamp_from_ts(int64_t value, npy_datetimestruct dts, - object tz, object freq, bint fold): + tzinfo tz, object freq, bint fold): """ convenience routine to construct a Timestamp from its parts """ cdef _Timestamp ts_base ts_base = _Timestamp.__new__(Timestamp, dts.year, dts.month, From ed2463750d11f3ade7bfad30311695901cd844cb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 30 Jun 2020 15:49:52 -0700 Subject: [PATCH 0245/1025] CLN: collect Timestamp methods (#35062) --- pandas/_libs/tslibs/timestamps.pyx | 234 +++++++++++++++-------------- 1 file changed, 119 insertions(+), 115 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 355dc0dbc5820..abaf0b4bf6d87 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -265,37 +265,6 @@ cdef class _Timestamp(ABCTimestamp): self._assert_tzawareness_compat(ots) return cmp_scalar(self.value, ots.value, op) - def __reduce_ex__(self, protocol): - # python 3.6 compat - # https://bugs.python.org/issue28730 - # now __reduce_ex__ is defined and higher priority than __reduce__ - return self.__reduce__() - - def __repr__(self) -> str: - stamp = self._repr_base - zone = None - - try: - stamp += self.strftime('%z') - if self.tzinfo: - zone = get_timezone(self.tzinfo) - except ValueError: - year2000 = self.replace(year=2000) - stamp += year2000.strftime('%z') - if self.tzinfo: - zone = get_timezone(self.tzinfo) - - try: - stamp += zone.strftime(' %%Z') - except AttributeError: - # e.g. 
tzlocal has no `strftime` - pass - - tz = f", tz='{zone}'" if zone is not None else "" - freq = "" if self.freq is None else f", freq='{self.freqstr}'" - - return f"Timestamp('{stamp}'{tz}{freq})" - cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: cdef: @@ -312,46 +281,6 @@ cdef class _Timestamp(ABCTimestamp): elif other.tzinfo is None: raise TypeError('Cannot compare tz-naive and tz-aware timestamps') - cpdef datetime to_pydatetime(_Timestamp self, bint warn=True): - """ - Convert a Timestamp object to a native Python datetime object. - - If warn=True, issue a warning if nanoseconds is nonzero. - """ - if self.nanosecond != 0 and warn: - warnings.warn("Discarding nonzero nanoseconds in conversion", - UserWarning, stacklevel=2) - - return datetime(self.year, self.month, self.day, - self.hour, self.minute, self.second, - self.microsecond, self.tzinfo) - - cpdef to_datetime64(self): - """ - Return a numpy.datetime64 object with 'ns' precision. - """ - return np.datetime64(self.value, 'ns') - - def to_numpy(self, dtype=None, copy=False) -> np.datetime64: - """ - Convert the Timestamp to a NumPy datetime64. - - .. versionadded:: 0.25.0 - - This is an alias method for `Timestamp.to_datetime64()`. The dtype and - copy parameters are available here only for compatibility. Their values - will not affect the return value. - - Returns - ------- - numpy.datetime64 - - See Also - -------- - DatetimeIndex.to_numpy : Similar method for DatetimeIndex. 
- """ - return self.to_datetime64() - def __add__(self, other): cdef: int64_t nanos = 0 @@ -619,9 +548,69 @@ cdef class _Timestamp(ABCTimestamp): """ return ccalendar.get_days_in_month(self.year, self.month) + # ----------------------------------------------------------------- + # Pickle Methods + + def __reduce_ex__(self, protocol): + # python 3.6 compat + # https://bugs.python.org/issue28730 + # now __reduce_ex__ is defined and higher priority than __reduce__ + return self.__reduce__() + + def __setstate__(self, state): + self.value = state[0] + self.freq = state[1] + self.tzinfo = state[2] + + def __reduce__(self): + object_state = self.value, self.freq, self.tzinfo + return (Timestamp, object_state) + # ----------------------------------------------------------------- # Rendering Methods + def isoformat(self, sep: str = "T") -> str: + base = super(_Timestamp, self).isoformat(sep=sep) + if self.nanosecond == 0: + return base + + if self.tzinfo is not None: + base1, base2 = base[:-6], base[-6:] + else: + base1, base2 = base, "" + + if self.microsecond != 0: + base1 += f"{self.nanosecond:03d}" + else: + base1 += f".{self.nanosecond:09d}" + + return base1 + base2 + + def __repr__(self) -> str: + stamp = self._repr_base + zone = None + + try: + stamp += self.strftime('%z') + if self.tzinfo: + zone = get_timezone(self.tzinfo) + except ValueError: + year2000 = self.replace(year=2000) + stamp += year2000.strftime('%z') + if self.tzinfo: + zone = get_timezone(self.tzinfo) + + try: + stamp += zone.strftime(' %%Z') + except AttributeError: + # e.g. 
tzlocal has no `strftime` + pass + + tz = f", tz='{zone}'" if zone is not None else "" + freq = "" if self.freq is None else f", freq='{self.freqstr}'" + + return f"Timestamp('{stamp}'{tz}{freq})" + @property def _repr_base(self) -> str: return f"{self._date_repr} {self._time_repr}" @@ -656,6 +645,7 @@ cdef class _Timestamp(ABCTimestamp): return self._repr_base # ----------------------------------------------------------------- + # Conversion Methods @property def asm8(self) -> np.datetime64: @@ -670,6 +660,64 @@ cdef class _Timestamp(ABCTimestamp): # Note: Naive timestamps will not match datetime.stdlib return round(self.value / 1e9, 6) + cpdef datetime to_pydatetime(_Timestamp self, bint warn=True): + """ + Convert a Timestamp object to a native Python datetime object. + + If warn=True, issue a warning if nanoseconds is nonzero. + """ + if self.nanosecond != 0 and warn: + warnings.warn("Discarding nonzero nanoseconds in conversion", + UserWarning, stacklevel=2) + + return datetime(self.year, self.month, self.day, + self.hour, self.minute, self.second, + self.microsecond, self.tzinfo) + + cpdef to_datetime64(self): + """ + Return a numpy.datetime64 object with 'ns' precision. + """ + return np.datetime64(self.value, "ns") + + def to_numpy(self, dtype=None, copy=False) -> np.datetime64: + """ + Convert the Timestamp to a NumPy datetime64. + + .. versionadded:: 0.25.0 + + This is an alias method for `Timestamp.to_datetime64()`. The dtype and + copy parameters are available here only for compatibility. Their values + will not affect the return value. + + Returns + ------- + numpy.datetime64 + + See Also + -------- + DatetimeIndex.to_numpy : Similar method for DatetimeIndex. + """ + return self.to_datetime64() + + def to_period(self, freq=None): + """ + Return an period of which this timestamp is an observation. 
+ """ + from pandas import Period + + if self.tz is not None: + # GH#21333 + warnings.warn( + "Converting to Period representation will drop timezone information.", + UserWarning, + ) + + if freq is None: + freq = self.freq + + return Period(self, freq=freq) + # ---------------------------------------------------------------------- @@ -1156,33 +1204,6 @@ timedelta}, default 'raise' "Use tz_localize() or tz_convert() as appropriate" ) - def __setstate__(self, state): - self.value = state[0] - self.freq = state[1] - self.tzinfo = state[2] - - def __reduce__(self): - object_state = self.value, self.freq, self.tzinfo - return (Timestamp, object_state) - - def to_period(self, freq=None): - """ - Return an period of which this timestamp is an observation. - """ - from pandas import Period - - if self.tz is not None: - # GH#21333 - warnings.warn( - "Converting to Period representation will drop timezone information.", - UserWarning, - ) - - if freq is None: - freq = self.freq - - return Period(self, freq=freq) - @property def freqstr(self): """ @@ -1404,23 +1425,6 @@ default 'raise' return create_timestamp_from_ts(value, dts, _tzinfo, self.freq, fold) - def isoformat(self, sep='T'): - base = super(_Timestamp, self).isoformat(sep=sep) - if self.nanosecond == 0: - return base - - if self.tzinfo is not None: - base1, base2 = base[:-6], base[-6:] - else: - base1, base2 = base, "" - - if self.microsecond != 0: - base1 += f"{self.nanosecond:03d}" - else: - base1 += f".{self.nanosecond:09d}" - - return base1 + base2 - def to_julian_date(self) -> np.float64: """ Convert TimeStamp to a Julian Date. 
From 5727b5ef32d7f3f972467eb6dce2ed52a877a7e3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 30 Jun 2020 16:26:28 -0700 Subject: [PATCH 0246/1025] PERF: _maybe_convert_value_to_local (#35070) --- asv_bench/benchmarks/tslibs/timestamp.py | 6 +++--- pandas/_libs/tslibs/timestamps.pyx | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py index 3ef9b814dd79e..b7e11089535d7 100644 --- a/asv_bench/benchmarks/tslibs/timestamp.py +++ b/asv_bench/benchmarks/tslibs/timestamp.py @@ -63,9 +63,6 @@ def time_tz(self, tz, freq): def time_dayofweek(self, tz, freq): self.ts.dayofweek - def time_weekday_name(self, tz, freq): - self.ts.day_name - def time_dayofyear(self, tz, freq): self.ts.dayofyear @@ -108,6 +105,9 @@ def time_microsecond(self, tz, freq): def time_month_name(self, tz, freq): self.ts.month_name() + def time_weekday_name(self, tz, freq): + self.ts.day_name() + class TimestampOps: params = [None, "US/Eastern", pytz.UTC, dateutil.tz.tzutc()] diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index abaf0b4bf6d87..159e4366d1f3f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -47,6 +47,7 @@ from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( check_dts_bounds, npy_datetimestruct, dt64_to_dtstruct, cmp_scalar, + pydatetime_to_dt64, ) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.offsets cimport to_offset, is_tick_object, is_offset_object @@ -376,9 +377,13 @@ cdef class _Timestamp(ABCTimestamp): """Convert UTC i8 value to local i8 value if tz exists""" cdef: int64_t val - val = self.value - if self.tz is not None and not is_utc(self.tz): - val = tz_convert_single(self.value, UTC, self.tz) + tzinfo own_tz = self.tzinfo + npy_datetimestruct dts + + if own_tz is not None and not 
is_utc(own_tz): + val = pydatetime_to_dt64(self, &dts) + self.nanosecond + else: + val = self.value return val cdef bint _get_start_end_field(self, str field): From 86ad459c0a385464d6acc56aab8153746c0b9777 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 30 Jun 2020 17:12:54 -0700 Subject: [PATCH 0247/1025] CLN: remove unnecessary get_timezone calls (#35071) --- pandas/_libs/tslibs/conversion.pyx | 4 ++-- pandas/_libs/tslibs/tzconversion.pyx | 7 +++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 884715f482cad..c1162ed482048 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -27,7 +27,7 @@ from pandas._libs.tslibs.util cimport ( from pandas._libs.tslibs.timezones cimport ( is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info, - get_timezone, maybe_get_tz, tz_compare, + maybe_get_tz, tz_compare, utc_pytz as UTC, ) from pandas._libs.tslibs.parsing import parse_datetime_string @@ -267,7 +267,7 @@ def datetime_to_datetime64(ndarray[object] values): if not tz_compare(val.tzinfo, inferred_tz): raise ValueError('Array must be all same time zone') else: - inferred_tz = get_timezone(val.tzinfo) + inferred_tz = val.tzinfo _ts = convert_datetime_to_tsobject(val, None) iresult[i] = _ts.value diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 925b3c5615435..a096b2807c640 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -20,8 +20,7 @@ from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, HOUR_NANOS from pandas._libs.tslibs.nattype cimport NPY_NAT from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dt64_to_dtstruct) -from pandas._libs.tslibs.timezones cimport ( - get_dst_info, is_tzlocal, is_utc, get_timezone) +from pandas._libs.tslibs.timezones cimport get_dst_info, is_tzlocal, is_utc # TODO: cdef scalar version to call 
from convert_str_to_tsobject @@ -358,13 +357,13 @@ cpdef int64_t tz_convert_single(int64_t val, tzinfo tz1, tzinfo tz2): # Convert to UTC if is_tzlocal(tz1): utc_date = _tz_convert_tzlocal_utc(val, tz1, to_utc=True) - elif not is_utc(get_timezone(tz1)): + elif not is_utc(tz1): arr[0] = val utc_date = _tz_convert_dst(arr, tz1, to_utc=True)[0] else: utc_date = val - if is_utc(get_timezone(tz2)): + if is_utc(tz2): return utc_date elif is_tzlocal(tz2): return _tz_convert_tzlocal_utc(utc_date, tz2, to_utc=False) From 44891f973419224973b82bcec36a6c92562e557e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 30 Jun 2020 17:13:58 -0700 Subject: [PATCH 0248/1025] REF: re-use month_offset (#35073) --- pandas/_libs/tslibs/ccalendar.pxd | 2 ++ pandas/_libs/tslibs/ccalendar.pyx | 4 ++-- pandas/_libs/tslibs/fields.pyx | 37 +++++++++++++------------------ 3 files changed, 19 insertions(+), 24 deletions(-) diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index b55780fe7d5b9..41cc477413607 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -14,3 +14,5 @@ cpdef int32_t get_day_of_year(int year, int month, int day) nogil cdef int64_t DAY_NANOS cdef int64_t HOUR_NANOS cdef dict c_MONTH_NUMBERS + +cdef int32_t* month_offset diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 2006214169a74..9f8cf6c28adab 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -27,7 +27,7 @@ cdef int* sakamoto_arr = [0, 3, 2, 5, 0, 3, 5, 1, 4, 6, 2, 4] # The first 13 entries give the month days elapsed as of the first of month N # (or the total number of days in the year for N=13) in non-leap years. # The remaining 13 entries give the days elapsed in leap years. 
-cdef int32_t* _month_offset = [ +cdef int32_t* month_offset = [ 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365, 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366] @@ -242,7 +242,7 @@ cpdef int32_t get_day_of_year(int year, int month, int day) nogil: isleap = is_leapyear(year) - mo_off = _month_offset[isleap * 13 + month - 1] + mo_off = month_offset[isleap * 13 + month - 1] day_of_year = mo_off + day return day_of_year diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 0e5a6d3c4db46..126deb67e4189 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -17,7 +17,9 @@ from pandas._libs.tslibs.ccalendar import ( from pandas._libs.tslibs.ccalendar cimport ( DAY_NANOS, get_days_in_month, is_leapyear, dayofweek, get_week_of_year, - get_day_of_year, get_iso_calendar, iso_calendar_t) + get_day_of_year, get_iso_calendar, iso_calendar_t, + month_offset, +) from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, td64_to_tdstruct) @@ -155,19 +157,10 @@ def get_start_end_field(const int64_t[:] dtindex, str field, int end_month = 12 int start_month = 1 ndarray[int8_t] out - ndarray[int32_t, ndim=2] _month_offset bint isleap npy_datetimestruct dts int mo_off, dom, doy, dow, ldom - _month_offset = np.array( - [ - [0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365], - [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366], - ], - dtype=np.int32, - ) - out = np.zeros(count, dtype='int8') if freqstr: @@ -226,10 +219,10 @@ def get_start_end_field(const int64_t[:] dtindex, str field, dt64_to_dtstruct(dtindex[i], &dts) isleap = is_leapyear(dts.year) - mo_off = _month_offset[isleap, dts.month - 1] + mo_off = month_offset[isleap * 13 + dts.month - 1] dom = dts.day doy = mo_off + dom - ldom = _month_offset[isleap, dts.month] + ldom = month_offset[isleap * 13 + dts.month] dow = dayofweek(dts.year, dts.month, dts.day) if (ldom == doy and dow < 
5) or ( @@ -244,10 +237,10 @@ def get_start_end_field(const int64_t[:] dtindex, str field, dt64_to_dtstruct(dtindex[i], &dts) isleap = is_leapyear(dts.year) - mo_off = _month_offset[isleap, dts.month - 1] + mo_off = month_offset[isleap * 13 + dts.month - 1] dom = dts.day doy = mo_off + dom - ldom = _month_offset[isleap, dts.month] + ldom = month_offset[isleap * 13 + dts.month] if ldom == doy: out[i] = 1 @@ -288,10 +281,10 @@ def get_start_end_field(const int64_t[:] dtindex, str field, dt64_to_dtstruct(dtindex[i], &dts) isleap = is_leapyear(dts.year) - mo_off = _month_offset[isleap, dts.month - 1] + mo_off = month_offset[isleap * 13 + dts.month - 1] dom = dts.day doy = mo_off + dom - ldom = _month_offset[isleap, dts.month] + ldom = month_offset[isleap * 13 + dts.month] dow = dayofweek(dts.year, dts.month, dts.day) if ((dts.month - end_month) % 3 == 0) and ( @@ -307,10 +300,10 @@ def get_start_end_field(const int64_t[:] dtindex, str field, dt64_to_dtstruct(dtindex[i], &dts) isleap = is_leapyear(dts.year) - mo_off = _month_offset[isleap, dts.month - 1] + mo_off = month_offset[isleap * 13 + dts.month - 1] dom = dts.day doy = mo_off + dom - ldom = _month_offset[isleap, dts.month] + ldom = month_offset[isleap * 13 + dts.month] if ((dts.month - end_month) % 3 == 0) and (ldom == doy): out[i] = 1 @@ -352,10 +345,10 @@ def get_start_end_field(const int64_t[:] dtindex, str field, dt64_to_dtstruct(dtindex[i], &dts) isleap = is_leapyear(dts.year) dom = dts.day - mo_off = _month_offset[isleap, dts.month - 1] + mo_off = month_offset[isleap * 13 + dts.month - 1] doy = mo_off + dom dow = dayofweek(dts.year, dts.month, dts.day) - ldom = _month_offset[isleap, dts.month] + ldom = month_offset[isleap * 13 + dts.month] if (dts.month == end_month) and ( (ldom == doy and dow < 5) or ( @@ -370,10 +363,10 @@ def get_start_end_field(const int64_t[:] dtindex, str field, dt64_to_dtstruct(dtindex[i], &dts) isleap = is_leapyear(dts.year) - mo_off = _month_offset[isleap, dts.month - 1] + mo_off = 
month_offset[isleap * 13 + dts.month - 1] dom = dts.day doy = mo_off + dom - ldom = _month_offset[isleap, dts.month] + ldom = month_offset[isleap * 13 + dts.month] if (dts.month == end_month) and (ldom == doy): out[i] = 1 From 25b017e5be5cac7fecee21577590d9408184b160 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 30 Jun 2020 18:40:41 -0700 Subject: [PATCH 0249/1025] PERF: Timestamp.normalize (#35068) * PERF: Timestamp.normalize * lint fixup --- asv_bench/benchmarks/tslibs/timestamp.py | 25 +++++++++++++++------ pandas/_libs/tslibs/conversion.pxd | 1 + pandas/_libs/tslibs/conversion.pyx | 10 ++++----- pandas/_libs/tslibs/timestamps.pyx | 28 +++++++++++++----------- 4 files changed, 39 insertions(+), 25 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/timestamp.py b/asv_bench/benchmarks/tslibs/timestamp.py index b7e11089535d7..40f8e561f5238 100644 --- a/asv_bench/benchmarks/tslibs/timestamp.py +++ b/asv_bench/benchmarks/tslibs/timestamp.py @@ -1,17 +1,29 @@ -import datetime +from datetime import datetime, timedelta, timezone -import dateutil +from dateutil.tz import gettz, tzlocal, tzutc import numpy as np import pytz from pandas import Timestamp +# One case for each type of tzinfo object that has its own code path +# in tzconversion code. 
+_tzs = [ + None, + pytz.timezone("Europe/Amsterdam"), + gettz("US/Central"), + pytz.UTC, + tzutc(), + timezone(timedelta(minutes=60)), + tzlocal(), +] + class TimestampConstruction: def setup(self): self.npdatetime64 = np.datetime64("2020-01-01 00:00:00") - self.dttime_unaware = datetime.datetime(2020, 1, 1, 0, 0, 0) - self.dttime_aware = datetime.datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC) + self.dttime_unaware = datetime(2020, 1, 1, 0, 0, 0) + self.dttime_aware = datetime(2020, 1, 1, 0, 0, 0, 0, pytz.UTC) self.ts = Timestamp("2020-01-01 00:00:00") def time_parse_iso8601_no_tz(self): @@ -49,7 +61,6 @@ def time_from_pd_timestamp(self): class TimestampProperties: - _tzs = [None, pytz.timezone("Europe/Amsterdam"), pytz.UTC, dateutil.tz.tzutc()] _freqs = [None, "B"] params = [_tzs, _freqs] param_names = ["tz", "freq"] @@ -110,7 +121,7 @@ def time_weekday_name(self, tz, freq): class TimestampOps: - params = [None, "US/Eastern", pytz.UTC, dateutil.tz.tzutc()] + params = _tzs param_names = ["tz"] def setup(self, tz): @@ -148,7 +159,7 @@ def time_ceil(self, tz): class TimestampAcrossDst: def setup(self): - dt = datetime.datetime(2016, 3, 27, 1) + dt = datetime(2016, 3, 27, 1) self.tzinfo = pytz.timezone("CET").localize(dt, is_dst=False).tzinfo self.ts2 = Timestamp(dt) diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 94f6d1d9020d2..623d9f14d646b 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -26,3 +26,4 @@ cpdef datetime localize_pydatetime(datetime dt, object tz) cdef int64_t cast_from_unit(object ts, str unit) except? 
-1 cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz) +cdef int64_t normalize_i8_stamp(int64_t local_val) nogil diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index c1162ed482048..ac24dd546d9d3 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -795,14 +795,14 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result[i] = NPY_NAT continue local_val = stamps[i] - result[i] = _normalize_i8_stamp(local_val) + result[i] = normalize_i8_stamp(local_val) elif is_tzlocal(tz): for i in range(n): if stamps[i] == NPY_NAT: result[i] = NPY_NAT continue local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - result[i] = _normalize_i8_stamp(local_val) + result[i] = normalize_i8_stamp(local_val) else: # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) @@ -815,7 +815,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result[i] = NPY_NAT continue local_val = stamps[i] + delta - result[i] = _normalize_i8_stamp(local_val) + result[i] = normalize_i8_stamp(local_val) else: pos = trans.searchsorted(stamps, side='right') - 1 for i in range(n): @@ -823,13 +823,13 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result[i] = NPY_NAT continue local_val = stamps[i] + deltas[pos[i]] - result[i] = _normalize_i8_stamp(local_val) + result[i] = normalize_i8_stamp(local_val) return result.base # `.base` to access underlying ndarray @cython.cdivision -cdef inline int64_t _normalize_i8_stamp(int64_t local_val) nogil: +cdef inline int64_t normalize_i8_stamp(int64_t local_val) nogil: """ Round the localized nanosecond timestamp down to the previous midnight. 
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 159e4366d1f3f..ba6cee3d7ad8e 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -40,7 +40,7 @@ from pandas._libs.tslibs.conversion cimport ( _TSObject, convert_to_tsobject, convert_datetime_to_tsobject, - normalize_i8_timestamps, + normalize_i8_stamp, ) from pandas._libs.tslibs.fields import get_start_end_field, get_date_name_field from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT @@ -553,6 +553,20 @@ cdef class _Timestamp(ABCTimestamp): """ return ccalendar.get_days_in_month(self.year, self.month) + # ----------------------------------------------------------------- + # Transformation Methods + + def normalize(self) -> "Timestamp": + """ + Normalize Timestamp to midnight, preserving tz information. + """ + cdef: + local_val = self._maybe_convert_value_to_local() + int64_t normalized + + normalized = normalize_i8_stamp(local_val) + return Timestamp(normalized).tz_localize(self.tzinfo) + # ----------------------------------------------------------------- # Pickle Methods @@ -1455,18 +1469,6 @@ default 'raise' self.nanosecond / 3600.0 / 1e+9 ) / 24.0) - def normalize(self): - """ - Normalize Timestamp to midnight, preserving tz information. 
- """ - cdef: - ndarray[int64_t] normalized - tzinfo own_tz = self.tzinfo # could be None - - normalized = normalize_i8_timestamps( - np.array([self.value], dtype="i8"), tz=own_tz) - return Timestamp(normalized[0]).tz_localize(own_tz) - # Aliases Timestamp.weekofyear = Timestamp.week From 4643934fd06742af8a2a797949996706163b7e5d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 30 Jun 2020 20:43:25 -0500 Subject: [PATCH 0250/1025] BUG: Fixed concat with reindex and extension types (#33522) * BUG: Fixed concat with reindex and extension types Closes https://github.com/pandas-dev/pandas/issues/27692 Closes https://github.com/pandas-dev/pandas/issues/33027 * rebase * fixup * cleanup * fixups --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/internals/concat.py | 31 +++++++++++++++++----- pandas/core/reshape/concat.py | 2 +- pandas/tests/extension/base/reshaping.py | 13 +++++++++ pandas/tests/extension/test_categorical.py | 3 ++- 5 files changed, 41 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d32eeb493b2c2..2fac8d1a8f63f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1123,6 +1123,7 @@ ExtensionArray ^^^^^^^^^^^^^^ - Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`) +- Fixed bug in :func:`concat` when concatenating DataFrames with non-overlaping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`) - Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`) - Fixed bug in :class:`Series` construction with EA dtype and index but no data or scalar data fails (:issue:`26469`) - Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). 
diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index fd8c5f5e27c02..2cc7461986c8f 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -29,7 +29,7 @@ def concatenate_block_managers( - mgrs_indexers, axes, concat_axis: int, copy: bool + mgrs_indexers, axes, concat_axis: int, copy: bool, ) -> BlockManager: """ Concatenate block managers into one. @@ -76,7 +76,7 @@ def concatenate_block_managers( b = make_block(values, placement=placement, ndim=blk.ndim) else: b = make_block( - _concatenate_join_units(join_units, concat_axis, copy=copy), + _concatenate_join_units(join_units, concat_axis, copy=copy,), placement=placement, ) blocks.append(b) @@ -260,6 +260,16 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): pass elif getattr(self.block, "is_extension", False): pass + elif is_extension_array_dtype(empty_dtype): + missing_arr = empty_dtype.construct_array_type()._from_sequence( + [], dtype=empty_dtype + ) + ncols, nrows = self.shape + assert ncols == 1, ncols + empty_arr = -1 * np.ones((nrows,), dtype=np.intp) + return missing_arr.take( + empty_arr, allow_fill=True, fill_value=fill_value + ) else: missing_arr = np.empty(self.shape, dtype=empty_dtype) missing_arr.fill(fill_value) @@ -329,7 +339,7 @@ def _concatenate_join_units(join_units, concat_axis, copy): # 2D to put it a non-EA Block concat_values = np.atleast_2d(concat_values) else: - concat_values = concat_compat(to_concat, axis=concat_axis) + concat_values = concat_compat(to_concat, axis=concat_axis,) return concat_values @@ -374,6 +384,10 @@ def _get_empty_dtype_and_na(join_units): upcast_cls = "category" elif is_datetime64tz_dtype(dtype): upcast_cls = "datetimetz" + + elif is_extension_array_dtype(dtype): + upcast_cls = "extension" + elif issubclass(dtype.type, np.bool_): upcast_cls = "bool" elif issubclass(dtype.type, np.object_): @@ -384,8 +398,6 @@ def _get_empty_dtype_and_na(join_units): upcast_cls = "timedelta" elif is_sparse(dtype): 
upcast_cls = dtype.subtype.name - elif is_extension_array_dtype(dtype): - upcast_cls = "object" elif is_float_dtype(dtype) or is_numeric_dtype(dtype): upcast_cls = dtype.name else: @@ -401,10 +413,15 @@ def _get_empty_dtype_and_na(join_units): if not upcast_classes: upcast_classes = null_upcast_classes - # TODO: de-duplicate with maybe_promote? # create the result - if "object" in upcast_classes: + if "extension" in upcast_classes: + if len(upcast_classes) == 1: + cls = upcast_classes["extension"][0] + return cls, cls.na_value + else: + return np.dtype("object"), np.nan + elif "object" in upcast_classes: return np.dtype(np.object_), np.nan elif "bool" in upcast_classes: if has_none_blocks: diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 299b68c6e71e0..9e8fb643791f2 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -500,7 +500,7 @@ def get_result(self): mgrs_indexers.append((obj._mgr, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy + mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy, ) if not self.copy: new_data._consolidate_inplace() diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index cd932e842e00c..3774e018a8e51 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -107,6 +107,19 @@ def test_concat_extension_arrays_copy_false(self, data, na_value): result = pd.concat([df1, df2], axis=1, copy=False) self.assert_frame_equal(result, expected) + def test_concat_with_reindex(self, data): + # GH-33027 + a = pd.DataFrame({"a": data[:5]}) + b = pd.DataFrame({"b": data[:5]}) + result = pd.concat([a, b], ignore_index=True) + expected = pd.DataFrame( + { + "a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True), + "b": data.take(([-1] * 5) + list(range(5)), allow_fill=True), + } + ) + 
self.assert_frame_equal(result, expected) + def test_align(self, data, na_value): a = data[:3] b = data[2:5] diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index d1211e477fe3e..f7b572a70073a 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -93,7 +93,8 @@ class TestConstructors(base.BaseConstructorsTests): class TestReshaping(base.BaseReshapingTests): - pass + def test_concat_with_reindex(self, data): + pytest.xfail(reason="Deliberately upcast to object?") class TestGetitem(base.BaseGetitemTests): From 1dd650123ad1702554ef2f9561b4ab79a7c98b42 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Wed, 1 Jul 2020 08:22:48 -0700 Subject: [PATCH 0251/1025] Assorted ujson Cleanups (#35064) --- pandas/_libs/src/ujson/python/objToJSON.c | 47 ++++++----------------- 1 file changed, 11 insertions(+), 36 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 1de9642761961..e841f00489887 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -324,19 +324,6 @@ static npy_float64 total_seconds(PyObject *td) { return double_val; } -static PyObject *get_item(PyObject *obj, Py_ssize_t i) { - PyObject *tmp = PyLong_FromSsize_t(i); - PyObject *ret; - - if (tmp == 0) { - return 0; - } - ret = PyObject_GetItem(obj, tmp); - Py_DECREF(tmp); - - return ret; -} - static char *PyBytesToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc), size_t *_outLen) { PyObject *obj = (PyObject *)_obj; @@ -704,7 +691,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { PdBlockContext *blkCtxt; NpyArrContext *npyarr; Py_ssize_t i; - PyArray_Descr *dtype; NpyIter *iter; NpyIter_IterNextFunc *iternext; npy_int64 **dataptr; @@ -712,10 +698,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npy_intp idx; PRINTMARK(); - - i = 0; - blocks = NULL; - dtype = 
PyArray_DescrFromType(NPY_INT64); obj = (PyObject *)_obj; GET_TC(tc)->iterGetName = GET_TC(tc)->transpose @@ -726,7 +708,7 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { if (!blkCtxt) { PyErr_NoMemory(); GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + return; } GET_TC(tc)->pdblock = blkCtxt; @@ -739,7 +721,7 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { blkCtxt->cindices = NULL; GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + return; } blkCtxt->npyCtxts = @@ -747,30 +729,30 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { if (!blkCtxt->npyCtxts) { PyErr_NoMemory(); GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; - } - for (i = 0; i < blkCtxt->ncols; i++) { - blkCtxt->npyCtxts[i] = NULL; + return; } blkCtxt->cindices = PyObject_Malloc(sizeof(int) * blkCtxt->ncols); if (!blkCtxt->cindices) { PyErr_NoMemory(); GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + return; } blocks = get_sub_attr(obj, "_mgr", "blocks"); if (!blocks) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; - goto BLKRET; + return; + } else if (!PyTuple_Check(blocks)) { + PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!"); + goto BLKRET; } // force transpose so each NpyArrContext strides down its column GET_TC(tc)->transpose = 1; for (i = 0; i < PyObject_Length(blocks); i++) { - block = get_item(blocks, i); + block = PyTuple_GET_ITEM(blocks, i); if (!block) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; goto BLKRET; @@ -779,7 +761,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { tmp = PyObject_CallMethod(block, "get_block_values_for_json", NULL); if (!tmp) { ((JSONObjectEncoder *)tc->encoder)->errorMsg = ""; - Py_DECREF(block); GET_TC(tc)->iterNext = NpyArr_iterNextNone; goto BLKRET; } @@ -787,23 +768,20 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { values = PyArray_Transpose((PyArrayObject *)tmp, NULL); Py_DECREF(tmp); if (!values) { - Py_DECREF(block); GET_TC(tc)->iterNext = 
NpyArr_iterNextNone; goto BLKRET; } locs = (PyArrayObject *)get_sub_attr(block, "mgr_locs", "as_array"); if (!locs) { - Py_DECREF(block); Py_DECREF(values); GET_TC(tc)->iterNext = NpyArr_iterNextNone; goto BLKRET; } iter = NpyIter_New(locs, NPY_ITER_READONLY, NPY_KEEPORDER, - NPY_NO_CASTING, dtype); + NPY_NO_CASTING, NULL); if (!iter) { - Py_DECREF(block); Py_DECREF(values); Py_DECREF(locs); GET_TC(tc)->iterNext = NpyArr_iterNextNone; @@ -812,7 +790,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { iternext = NpyIter_GetIterNext(iter, NULL); if (!iternext) { NpyIter_Deallocate(iter); - Py_DECREF(block); Py_DECREF(values); Py_DECREF(locs); GET_TC(tc)->iterNext = NpyArr_iterNextNone; @@ -846,15 +823,13 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { } while (iternext(iter)); NpyIter_Deallocate(iter); - Py_DECREF(block); Py_DECREF(values); Py_DECREF(locs); } GET_TC(tc)->npyarr = blkCtxt->npyCtxts[0]; BLKRET: - Py_XDECREF(dtype); - Py_XDECREF(blocks); + Py_DECREF(blocks); } void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { From 21246061eeccac867e700bea80a549ec51f0ece2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 1 Jul 2020 08:28:45 -0700 Subject: [PATCH 0252/1025] CLN: typing in libtimezones (#35079) --- pandas/_libs/tslibs/timezones.pxd | 2 +- pandas/_libs/tslibs/timezones.pyx | 28 +++++++++++++++------------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 14c0523787422..2428993c45f56 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -14,4 +14,4 @@ cpdef object maybe_get_tz(object tz) cdef get_utcoffset(tzinfo tz, obj) cdef bint is_fixed_offset(tzinfo tz) -cdef object get_dst_info(object tz) +cdef object get_dst_info(tzinfo tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 7fbb50fcbfd41..4b8244c269828 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ 
b/pandas/_libs/tslibs/timezones.pyx @@ -116,7 +116,7 @@ def _p_tz_cache_key(tz): dst_cache = {} -cdef inline object tz_cache_key(object tz): +cdef inline object tz_cache_key(tzinfo tz): """ Return the key in the cache for the timezone info object or None if unknown. @@ -210,13 +210,16 @@ cdef int64_t[:] unbox_utcoffsets(object transinfo): # Daylight Savings -cdef object get_dst_info(object tz): +cdef object get_dst_info(tzinfo tz): """ - return a tuple of : - (UTC times of DST transitions, - UTC offsets in microseconds corresponding to DST transitions, - string of type of transitions) - + Returns + ------- + ndarray[int64_t] + Nanosecond UTC times of DST transitions. + ndarray[int64_t] + Nanosecond UTC offsets corresponding to DST transitions. + str + Desscribing the type of tzinfo object. """ cache_key = tz_cache_key(tz) if cache_key is None: @@ -225,7 +228,7 @@ cdef object get_dst_info(object tz): num = int(get_utcoffset(tz, None).total_seconds()) * 1_000_000_000 return (np.array([NPY_NAT + 1], dtype=np.int64), np.array([num], dtype=np.int64), - None) + "unknown") if cache_key not in dst_cache: if treat_tz_as_pytz(tz): @@ -267,14 +270,13 @@ cdef object get_dst_info(object tz): # (under the just-deleted code that returned empty arrays) raise AssertionError("dateutil tzinfo is not a FixedOffset " "and has an empty `_trans_list`.", tz) - else: - # static tzinfo - # TODO: This case is not hit in tests (2018-07-17); is it possible? 
+ # static tzinfo, we can get here with pytz.StaticTZInfo + # which are not caught by treat_tz_as_pytz trans = np.array([NPY_NAT + 1], dtype=np.int64) - num = int(get_utcoffset(tz, None).total_seconds()) * 1000000000 + num = int(get_utcoffset(tz, None).total_seconds()) * 1_000_000_000 deltas = np.array([num], dtype=np.int64) - typ = 'static' + typ = "static" dst_cache[cache_key] = (trans, deltas, typ) From 955c64066bf43ad2e8c8f2278cfc2203894a0676 Mon Sep 17 00:00:00 2001 From: Graham Wetzler Date: Wed, 1 Jul 2020 11:53:34 -0500 Subject: [PATCH 0253/1025] TST: Add test for df.apply(lambda x: x.dtype) (#35072) --- pandas/tests/frame/test_apply.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 8f0d3d9fbc734..114b3c0d0a3fc 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -1501,3 +1501,12 @@ def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, meth tm.assert_series_equal( none_in_first_column_result, none_in_second_column_result ) + + @pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan]) + def test_apply_dtype(self, col): + # GH 31466 + df = pd.DataFrame([[1.0, col]], columns=["a", "b"]) + result = df.apply(lambda x: x.dtype) + expected = df.dtypes + + tm.assert_series_equal(result, expected) From 549957013553921563cd9f666c847c14928dbfc1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 1 Jul 2020 10:07:39 -0700 Subject: [PATCH 0254/1025] CLN: stronger typing in libtimezones (#35082) --- pandas/_libs/tslibs/timestamps.pyx | 2 +- pandas/_libs/tslibs/timezones.pxd | 6 +++--- pandas/_libs/tslibs/timezones.pyx | 17 ++++++++++------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index ba6cee3d7ad8e..1328bc8a5175f 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -991,7 +991,7 @@ class 
Timestamp(_Timestamp): "Timestamp from components." ) - if tz is not None and treat_tz_as_pytz(tz): + if tz is not None and PyTZInfo_Check(tz) and treat_tz_as_pytz(tz): raise ValueError( "pytz timezones do not support fold. Please use dateutil " "timezones." diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 2428993c45f56..78483bd6fe09e 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -2,10 +2,10 @@ from cpython.datetime cimport tzinfo cdef tzinfo utc_pytz -cpdef bint is_utc(object tz) -cdef bint is_tzlocal(object tz) +cpdef bint is_utc(tzinfo tz) +cdef bint is_tzlocal(tzinfo tz) -cdef bint treat_tz_as_pytz(object tz) +cdef bint treat_tz_as_pytz(tzinfo tz) cpdef bint tz_compare(object start, object end) cpdef object get_timezone(object tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 4b8244c269828..7d8aabcc47835 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,5 +1,6 @@ -from cpython.datetime cimport tzinfo from datetime import timezone +from cpython.datetime cimport tzinfo, PyTZInfo_Check, PyDateTime_IMPORT +PyDateTime_IMPORT # dateutil compat from dateutil.tz import ( @@ -29,20 +30,20 @@ cdef tzinfo utc_pytz = UTC # ---------------------------------------------------------------------- -cpdef inline bint is_utc(object tz): +cpdef inline bint is_utc(tzinfo tz): return tz is utc_pytz or tz is utc_stdlib or isinstance(tz, _dateutil_tzutc) -cdef inline bint is_tzlocal(object tz): +cdef inline bint is_tzlocal(tzinfo tz): return isinstance(tz, _dateutil_tzlocal) -cdef inline bint treat_tz_as_pytz(object tz): +cdef inline bint treat_tz_as_pytz(tzinfo tz): return (hasattr(tz, '_utc_transition_times') and hasattr(tz, '_transition_info')) -cdef inline bint treat_tz_as_dateutil(object tz): +cdef inline bint treat_tz_as_dateutil(tzinfo tz): return hasattr(tz, '_trans_list') and hasattr(tz, '_trans_idx') @@ 
-59,7 +60,9 @@ cpdef inline object get_timezone(object tz): the tz name. It needs to be a string so that we can serialize it with UJSON/pytables. maybe_get_tz (below) is the inverse of this process. """ - if is_utc(tz): + if not PyTZInfo_Check(tz): + return tz + elif is_utc(tz): return tz else: if treat_tz_as_dateutil(tz): @@ -327,7 +330,7 @@ cpdef bint tz_compare(object start, object end): return get_timezone(start) == get_timezone(end) -def tz_standardize(tz: object): +def tz_standardize(tz: tzinfo): """ If the passed tz is a pytz timezone object, "normalize" it to the a consistent version From 19a641dcdab9b43c7265228766f4ee4a3f3981d1 Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Wed, 1 Jul 2020 20:25:39 +0200 Subject: [PATCH 0255/1025] TST: Test for groupby transform on categorical column (#35074) * GH29037 Added test for groupby transform on categorical column * GH29037 - changes for black * GH29037 - moved test to test_categorical --- pandas/tests/groupby/test_categorical.py | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 60c82bf1fb71c..4de61f719dfbb 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1509,3 +1509,53 @@ def test_aggregate_categorical_with_isnan(): index=index, ) tm.assert_frame_equal(result, expected) + + +def test_categorical_transform(): + # GH 29037 + df = pd.DataFrame( + { + "package_id": [1, 1, 1, 2, 2, 3], + "status": [ + "Waiting", + "OnTheWay", + "Delivered", + "Waiting", + "OnTheWay", + "Waiting", + ], + } + ) + + delivery_status_type = pd.CategoricalDtype( + categories=["Waiting", "OnTheWay", "Delivered"], ordered=True + ) + df["status"] = df["status"].astype(delivery_status_type) + df["last_status"] = df.groupby("package_id")["status"].transform(max) + result = df.copy() + + expected = pd.DataFrame( + { + 
"package_id": [1, 1, 1, 2, 2, 3], + "status": [ + "Waiting", + "OnTheWay", + "Delivered", + "Waiting", + "OnTheWay", + "Waiting", + ], + "last_status": [ + "Delivered", + "Delivered", + "Delivered", + "OnTheWay", + "OnTheWay", + "Waiting", + ], + } + ) + + expected["status"] = expected["status"].astype(delivery_status_type) + + tm.assert_frame_equal(result, expected) From ad2935e863463bd4a8851e6f31841b59780719fd Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 1 Jul 2020 14:28:31 -0400 Subject: [PATCH 0256/1025] Test for pd.to_sql column error if data contains -np.inf (#34493) * TST: pd.to_sql for dataframes with -np.inf (#34431) * DOC: updated what's new (#34431) * DOC: improved entry (#34431) * TST: moved to _TestSQLAlchemy + added round trips * TST: rename + add comment with GH issue # * TST: rewrote using pytest.mark.parametrize for arg to DataFrame * TST: removed underscore from _input * DOC: added double backtick to np.inf & removed extraneous space * TST: pd.to_sql for dataframes with -np.inf (#34431) * DOC: updated what's new (#34431) * DOC: improved entry (#34431) * TST: moved to _TestSQLAlchemy + added round trips * TST: rename + add comment with GH issue # * TST: rewrote using pytest.mark.parametrize for arg to DataFrame * TST: removed underscore from _input * DOC: added double backtick to np.inf & removed extraneous space * BUG: add catch for MySQL error with np.inf * use regex for string match + add runtime import * clean up regex * TST: update to catch error for -np.inf with MySQL * DOC: resolved conflict in whatsnew * TST: update test_to_sql_with_neg_npinf * fixed error handler syntax in SQLDatabase.to_sql * fixed error handler syntax in SQLDatabase.to_sql * TST: added an xfail test for npinf entries with mysql * fixed imports * added reference to GH issue * fixed test_to_sql_with_npinf error catch * fixed spelling error in message (can not -> cannot) * DOC: added info re MySQL ValueError to 
whatsnew * fixed variable name in to_sql * replaced sqlalchemy's dialect.name with flavor * fixed typo in test_to_sql-with-npinf --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/sql.py | 15 ++++++++++++++- pandas/tests/io/test_sql.py | 18 ++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2fac8d1a8f63f..8c6add92e658b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1044,6 +1044,7 @@ I/O - Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`) - Bug in :meth:`~HDFStore.create_table` now raises an error when `column` argument was not specified in `data_columns` on input (:issue:`28156`) - :meth:`read_json` now could read line-delimited json file from a file url while `lines` and `chunksize` are set. +- Bug in :meth:`DataFrame.to_sql` when reading DataFrames with ``-np.inf`` entries with MySQL now has a more explicit ``ValueError`` (:issue:`34431`) Plotting ^^^^^^^^ diff --git a/pandas/io/sql.py b/pandas/io/sql.py index b137608475b3d..9177696ca13d6 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -1391,7 +1391,20 @@ def to_sql( dtype=dtype, ) table.create() - table.insert(chunksize, method=method) + + from sqlalchemy import exc + + try: + table.insert(chunksize, method=method) + except exc.SQLAlchemyError as err: + # GH34431 + msg = "(1054, \"Unknown column 'inf' in 'field list'\")" + err_text = str(err.orig) + if re.search(msg, err_text): + raise ValueError("inf cannot be used with MySQL") from err + else: + raise err + if not name.isdigit() and not name.islower(): # check for potentially case sensitivity issues (GH7815) # Only check when name is not a number and name is not lower case diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a07e7a74b7573..0991fae39138e 100644 --- a/pandas/tests/io/test_sql.py +++ 
b/pandas/tests/io/test_sql.py @@ -1813,6 +1813,24 @@ def main(connectable): DataFrame({"test_foo_data": [0, 1, 2]}).to_sql("test_foo_data", self.conn) main(self.conn) + @pytest.mark.parametrize( + "input", + [{"foo": [np.inf]}, {"foo": [-np.inf]}, {"foo": [-np.inf], "infe0": ["bar"]}], + ) + def test_to_sql_with_negative_npinf(self, input): + # GH 34431 + + df = pd.DataFrame(input) + + if self.flavor == "mysql": + msg = "inf cannot be used with MySQL" + with pytest.raises(ValueError, match=msg): + df.to_sql("foobar", self.conn, index=False) + else: + df.to_sql("foobar", self.conn, index=False) + res = sql.read_sql_table("foobar", self.conn) + tm.assert_equal(df, res) + def test_temporary_table(self): test_data = "Hello, World!" expected = DataFrame({"spam": [test_data]}) From c828aeb78fed468993c6f5a4518a6bc242da70e3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 1 Jul 2020 17:16:05 -0700 Subject: [PATCH 0257/1025] TYP: annotations, typing for infer_tzinfo (#35084) --- pandas/_libs/tslibs/conversion.pyx | 3 ++- pandas/_libs/tslibs/timezones.pyx | 4 ++-- pandas/core/arrays/datetimes.py | 13 ++++++++----- pandas/io/pytables.py | 2 +- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index ac24dd546d9d3..9ee76a8c291a8 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -246,11 +246,12 @@ def datetime_to_datetime64(ndarray[object] values): """ cdef: Py_ssize_t i, n = len(values) - object val, inferred_tz = None + object val int64_t[:] iresult npy_datetimestruct dts _TSObject _ts bint found_naive = False + tzinfo inferred_tz = None result = np.empty(n, dtype='M8[ns]') iresult = result.view('i8') diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 7d8aabcc47835..112da2eb5b624 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,5 +1,5 @@ from datetime import 
timezone -from cpython.datetime cimport tzinfo, PyTZInfo_Check, PyDateTime_IMPORT +from cpython.datetime cimport datetime, tzinfo, PyTZInfo_Check, PyDateTime_IMPORT PyDateTime_IMPORT # dateutil compat @@ -286,7 +286,7 @@ cdef object get_dst_info(tzinfo tz): return dst_cache[cache_key] -def infer_tzinfo(start, end): +def infer_tzinfo(datetime start, datetime end): if start is not None and end is not None: tz = start.tzinfo if not tz_compare(tz, end.tzinfo): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 461f71ff821fa..296f11583c372 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1,4 +1,4 @@ -from datetime import datetime, time, timedelta +from datetime import datetime, time, timedelta, tzinfo from typing import Optional, Union import warnings @@ -1908,6 +1908,7 @@ def sequence_to_dt64ns( inferred_freq = None dtype = _validate_dt64_dtype(dtype) + tz = timezones.maybe_get_tz(tz) if not hasattr(data, "dtype"): # e.g. list, tuple @@ -1950,14 +1951,14 @@ def sequence_to_dt64ns( data, inferred_tz = objects_to_datetime64ns( data, dayfirst=dayfirst, yearfirst=yearfirst ) - tz = maybe_infer_tz(tz, inferred_tz) + tz = _maybe_infer_tz(tz, inferred_tz) data_dtype = data.dtype # `data` may have originally been a Categorical[datetime64[ns, tz]], # so we need to handle these types. if is_datetime64tz_dtype(data_dtype): # DatetimeArray -> ndarray - tz = maybe_infer_tz(tz, data.tz) + tz = _maybe_infer_tz(tz, data.tz) result = data._data elif is_datetime64_dtype(data_dtype): @@ -2144,7 +2145,9 @@ def maybe_convert_dtype(data, copy): # Validation and Inference -def maybe_infer_tz(tz, inferred_tz): +def _maybe_infer_tz( + tz: Optional[tzinfo], inferred_tz: Optional[tzinfo] +) -> Optional[tzinfo]: """ If a timezone is inferred from data, check that it is compatible with the user-provided timezone, if any. 
@@ -2216,7 +2219,7 @@ def _validate_dt64_dtype(dtype): return dtype -def validate_tz_from_dtype(dtype, tz): +def validate_tz_from_dtype(dtype, tz: Optional[tzinfo]) -> Optional[tzinfo]: """ If the given dtype is a DatetimeTZDtype, extract the implied tzinfo object from it and check that it does not conflict with the given diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 981b380f8b5e9..b67a1c5781d91 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4714,7 +4714,7 @@ def _set_tz( if tz is not None: name = getattr(values, "name", None) values = values.ravel() - tz = timezones.get_timezone(_ensure_decoded(tz)) + tz = _ensure_decoded(tz) values = DatetimeIndex(values, name=name) values = values.tz_localize("UTC").tz_convert(tz) elif coerce: From 896f9ad689154566d3c8c54e6e4ea12fd3e38aee Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Jul 2020 07:42:05 -0700 Subject: [PATCH 0258/1025] TYP: types for tz_compare (#35093) --- pandas/_libs/lib.pyx | 4 ++-- pandas/_libs/tslibs/timezones.pxd | 2 +- pandas/_libs/tslibs/timezones.pyx | 3 +-- pandas/core/arrays/datetimes.py | 4 +++- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ea97bab2198eb..37d83a73c6597 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -73,7 +73,7 @@ from pandas._libs.tslibs.nattype cimport ( ) from pandas._libs.tslibs.conversion cimport convert_to_tsobject from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 -from pandas._libs.tslibs.timezones cimport get_timezone, tz_compare +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs.period cimport is_period_object from pandas._libs.tslibs.offsets cimport is_offset_object @@ -1789,7 +1789,7 @@ def is_datetime_with_singletz_array(values: ndarray) -> bool: for i in range(n): base_val = values[i] if base_val is not NaT: - base_tz = get_timezone(getattr(base_val, 'tzinfo', None)) + base_tz = getattr(base_val, 
'tzinfo', None) break for j in range(i, n): diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 78483bd6fe09e..0179be3cdd8e6 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -7,7 +7,7 @@ cdef bint is_tzlocal(tzinfo tz) cdef bint treat_tz_as_pytz(tzinfo tz) -cpdef bint tz_compare(object start, object end) +cpdef bint tz_compare(tzinfo start, tzinfo end) cpdef object get_timezone(object tz) cpdef object maybe_get_tz(object tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 112da2eb5b624..5bf47efeccfb0 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -301,7 +301,7 @@ def infer_tzinfo(datetime start, datetime end): return tz -cpdef bint tz_compare(object start, object end): +cpdef bint tz_compare(tzinfo start, tzinfo end): """ Compare string representations of timezones @@ -324,7 +324,6 @@ cpdef bint tz_compare(object start, object end): Returns: ------- bool - """ # GH 18523 return get_timezone(start) == get_timezone(end) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 296f11583c372..12eefec0c149b 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -2266,7 +2266,9 @@ def validate_tz_from_dtype(dtype, tz: Optional[tzinfo]) -> Optional[tzinfo]: return tz -def _infer_tz_from_endpoints(start, end, tz): +def _infer_tz_from_endpoints( + start: Timestamp, end: Timestamp, tz: Optional[tzinfo] +) -> Optional[tzinfo]: """ If a timezone is not explicitly given via `tz`, see if one can be inferred from the `start` and `end` endpoints. 
If more than one From dede81afc309d2b79e1901fd48959c513a6ccdd5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Jul 2020 07:42:54 -0700 Subject: [PATCH 0259/1025] BENCH: implement asvs for ints_to_pydatetime (#35091) --- asv_bench/benchmarks/tslibs/tslib.py | 55 ++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 asv_bench/benchmarks/tslibs/tslib.py diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py new file mode 100644 index 0000000000000..eacf5a5731dc2 --- /dev/null +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -0,0 +1,55 @@ +""" +ipython analogue: + +tr = TimeIntsToPydatetime() +mi = pd.MultiIndex.from_product( + tr.params[:-1] + ([str(x) for x in tr.params[-1]],) +) +df = pd.DataFrame(np.nan, index=mi, columns=["mean", "stdev"]) +for box in tr.params[0]: + for size in tr.params[1]: + for tz in tr.params[2]: + tr.setup(box, size, tz) + key = (box, size, str(tz)) + print(key) + val = %timeit -o tr.time_ints_to_pydatetime(box, size, tz) + df.loc[key] = (val.average, val.stdev) +""" +from datetime import timedelta, timezone + +from dateutil.tz import gettz, tzlocal +import numpy as np +import pytz + +from pandas._libs.tslib import ints_to_pydatetime + +_tzs = [ + None, + timezone.utc, + timezone(timedelta(minutes=60)), + pytz.timezone("US/Pacific"), + gettz("Asia/Tokyo"), + tzlocal(), +] +_sizes = [0, 1, 100, 10 ** 4, 10 ** 6] + + +class TimeIntsToPydatetime: + params = ( + ["time", "date", "datetime", "timestamp"], + _sizes, + _tzs, + ) + param_names = ["box", "size", "tz"] + # TODO: fold? freq? 
+ + def setup(self, box, size, tz): + arr = np.random.randint(0, 10, size=size, dtype="i8") + self.i8data = arr + + def time_ints_to_pydatetime(self, box, size, tz): + if box == "date": + # ints_to_pydatetime does not allow non-None tz with date; + # this will mean doing some duplicate benchmarks + tz = None + ints_to_pydatetime(self.i8data, tz, box=box) From 7f4049fe33d0f66e00180047d4221c878a396fc0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Jul 2020 07:44:09 -0700 Subject: [PATCH 0260/1025] CLN: tighter typing in libconversion (#35088) --- pandas/_libs/tslibs/conversion.pxd | 4 +- pandas/_libs/tslibs/conversion.pyx | 36 ++++++------- pandas/_libs/tslibs/timestamps.pyx | 50 ++++++++++--------- .../scalar/timestamp/test_constructors.py | 5 +- 4 files changed, 46 insertions(+), 49 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 623d9f14d646b..2cf75944a8196 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -13,11 +13,11 @@ cdef class _TSObject: bint fold -cdef convert_to_tsobject(object ts, object tz, object unit, +cdef convert_to_tsobject(object ts, tzinfo tz, object unit, bint dayfirst, bint yearfirst, int32_t nanos=*) -cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, +cdef _TSObject convert_datetime_to_tsobject(datetime ts, tzinfo tz, int32_t nanos=*) cdef int64_t get_datetime64_nanos(object val) except? 
-1 diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 9ee76a8c291a8..95500f66db156 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -307,7 +307,7 @@ cdef class _TSObject: return self.value -cdef convert_to_tsobject(object ts, object tz, object unit, +cdef convert_to_tsobject(object ts, tzinfo tz, object unit, bint dayfirst, bint yearfirst, int32_t nanos=0): """ Extract datetime and int64 from any of: @@ -326,13 +326,10 @@ cdef convert_to_tsobject(object ts, object tz, object unit, cdef: _TSObject obj - if tz is not None: - tz = maybe_get_tz(tz) - obj = _TSObject() if isinstance(ts, str): - return convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst) + return _convert_str_to_tsobject(ts, tz, unit, dayfirst, yearfirst) if ts is None or ts is NaT: obj.value = NPY_NAT @@ -374,16 +371,16 @@ cdef convert_to_tsobject(object ts, object tz, object unit, f'Timestamp') if tz is not None: - localize_tso(obj, tz) + _localize_tso(obj, tz) if obj.value != NPY_NAT: - # check_overflows needs to run after localize_tso + # check_overflows needs to run after _localize_tso check_dts_bounds(&obj.dts) check_overflows(obj) return obj -cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, +cdef _TSObject convert_datetime_to_tsobject(datetime ts, tzinfo tz, int32_t nanos=0): """ Convert a datetime (or Timestamp) input `ts`, along with optional timezone @@ -446,8 +443,8 @@ cdef _TSObject convert_datetime_to_tsobject(datetime ts, object tz, return obj -cdef _TSObject create_tsobject_tz_using_offset(npy_datetimestruct dts, - int tzoffset, tzinfo tz=None): +cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, + int tzoffset, tzinfo tz=None): """ Convert a datetimestruct `dts`, along with initial timezone offset `tzoffset` to a _TSObject (with timezone object `tz` - optional). 
@@ -500,9 +497,9 @@ cdef _TSObject create_tsobject_tz_using_offset(npy_datetimestruct dts, return obj -cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, - bint dayfirst=False, - bint yearfirst=False): +cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, object unit, + bint dayfirst=False, + bint yearfirst=False): """ Convert a string input `ts`, along with optional timezone object`tz` to a _TSObject. @@ -532,11 +529,6 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, int out_local = 0, out_tzoffset = 0 bint do_parse_datetime_string = False - if tz is not None: - tz = maybe_get_tz(tz) - - assert isinstance(ts, str) - if len(ts) == 0 or ts in nat_strings: ts = NaT elif ts == 'now': @@ -557,12 +549,12 @@ cdef _TSObject convert_str_to_tsobject(object ts, object tz, object unit, if not string_to_dts_failed: check_dts_bounds(&dts) if out_local == 1: - return create_tsobject_tz_using_offset(dts, - out_tzoffset, tz) + return _create_tsobject_tz_using_offset(dts, + out_tzoffset, tz) else: ts = dtstruct_to_dt64(&dts) if tz is not None: - # shift for localize_tso + # shift for _localize_tso ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, ambiguous='raise')[0] @@ -613,7 +605,7 @@ cdef inline check_overflows(_TSObject obj): # ---------------------------------------------------------------------- # Localization -cdef inline void localize_tso(_TSObject obj, tzinfo tz): +cdef inline void _localize_tso(_TSObject obj, tzinfo tz): """ Given the UTC nanosecond timestamp in obj.value, find the wall-clock representation of that timestamp in the given timezone. 
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 1328bc8a5175f..e104b722ea119 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -20,6 +20,7 @@ from cpython.datetime cimport ( datetime, time, tzinfo, + tzinfo as tzinfo_type, # alias bc `tzinfo` is a kwarg below PyDateTime_Check, PyDelta_Check, PyTZInfo_Check, @@ -932,7 +933,7 @@ class Timestamp(_Timestamp): second=None, microsecond=None, nanosecond=None, - tzinfo=None, + tzinfo_type tzinfo=None, *, fold=None ): @@ -957,18 +958,17 @@ class Timestamp(_Timestamp): # # Mixing pydatetime positional and keyword arguments is forbidden! - cdef _TSObject ts + cdef: + _TSObject ts + tzinfo_type tzobj _date_attributes = [year, month, day, hour, minute, second, microsecond, nanosecond] if tzinfo is not None: - if not PyTZInfo_Check(tzinfo): - # tzinfo must be a datetime.tzinfo object, GH#17690 - raise TypeError( - f"tzinfo must be a datetime.tzinfo object, not {type(tzinfo)}" - ) - elif tz is not None: + # GH#17690 tzinfo must be a datetime.tzinfo object, ensured + # by the cython annotation. + if tz is not None: raise ValueError('Can provide at most one of tz, tzinfo') # User passed tzinfo instead of tz; avoid silently ignoring @@ -1055,7 +1055,8 @@ class Timestamp(_Timestamp): raise ValueError("Cannot pass a datetime or Timestamp with tzinfo with " "the tz parameter. 
Use tz_convert instead.") - ts = convert_to_tsobject(ts_input, tz, unit, 0, 0, nanosecond or 0) + tzobj = maybe_get_tz(tz) + ts = convert_to_tsobject(ts_input, tzobj, unit, 0, 0, nanosecond or 0) if ts.value == NPY_NAT: return NaT @@ -1378,15 +1379,16 @@ default 'raise' cdef: npy_datetimestruct dts - int64_t value, value_tz, offset - object _tzinfo, result, k, v + int64_t value, value_tz + object k, v datetime ts_input + tzinfo_type tzobj # set to naive if needed - _tzinfo = self.tzinfo + tzobj = self.tzinfo value = self.value - if _tzinfo is not None: - value_tz = tz_convert_single(value, _tzinfo, UTC) + if tzobj is not None: + value_tz = tz_convert_single(value, tzobj, UTC) value += value - value_tz # setup components @@ -1419,30 +1421,30 @@ default 'raise' if nanosecond is not None: dts.ps = validate('nanosecond', nanosecond) * 1000 if tzinfo is not object: - _tzinfo = tzinfo + tzobj = tzinfo # reconstruct & check bounds - if _tzinfo is not None and treat_tz_as_pytz(_tzinfo): + if tzobj is not None and treat_tz_as_pytz(tzobj): # replacing across a DST boundary may induce a new tzinfo object # see GH#18319 - ts_input = _tzinfo.localize(datetime(dts.year, dts.month, dts.day, - dts.hour, dts.min, dts.sec, - dts.us), - is_dst=not bool(fold)) - _tzinfo = ts_input.tzinfo + ts_input = tzobj.localize(datetime(dts.year, dts.month, dts.day, + dts.hour, dts.min, dts.sec, + dts.us), + is_dst=not bool(fold)) + tzobj = ts_input.tzinfo else: kwargs = {'year': dts.year, 'month': dts.month, 'day': dts.day, 'hour': dts.hour, 'minute': dts.min, 'second': dts.sec, - 'microsecond': dts.us, 'tzinfo': _tzinfo, + 'microsecond': dts.us, 'tzinfo': tzobj, 'fold': fold} ts_input = datetime(**kwargs) - ts = convert_datetime_to_tsobject(ts_input, _tzinfo) + ts = convert_datetime_to_tsobject(ts_input, tzobj) value = ts.value + (dts.ps // 1000) if value != NPY_NAT: check_dts_bounds(&dts) - return create_timestamp_from_ts(value, dts, _tzinfo, self.freq, fold) + return 
create_timestamp_from_ts(value, dts, tzobj, self.freq, fold) def to_julian_date(self) -> np.float64: """ diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 770753f42a4c8..316a299ba1cbb 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -174,7 +174,10 @@ def test_constructor_invalid(self): def test_constructor_invalid_tz(self): # GH#17690 - msg = "must be a datetime.tzinfo" + msg = ( + "Argument 'tzinfo' has incorrect type " + r"\(expected datetime.tzinfo, got str\)" + ) with pytest.raises(TypeError, match=msg): Timestamp("2017-10-22", tzinfo="US/Eastern") From d3b1db342e48e6bd1f324f29013f24495b9c64cf Mon Sep 17 00:00:00 2001 From: Tom Date: Thu, 2 Jul 2020 15:51:27 +0100 Subject: [PATCH 0261/1025] CLN: Removed unreached else block GH33478 (#35086) --- pandas/core/groupby/generic.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dab8475d9580c..6f956a3dcc9b6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1218,7 +1218,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) - elif self.grouper.groupings is not None: + else: if len(self.grouper.groupings) > 1: key_index = self.grouper.result_index @@ -1373,10 +1373,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # of columns return self.obj._constructor_sliced(values, index=key_index) - else: - # Handle cases like BinGrouper - return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) - def _transform_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs ): From 8df0ff36f73ce24ca0ea8fe7ad461a513c1d4b65 Mon Sep 17 
00:00:00 2001 From: jbrockmendel Date: Thu, 2 Jul 2020 07:57:18 -0700 Subject: [PATCH 0262/1025] BENCH: implement asvs for get_resolution (#35075) --- asv_bench/benchmarks/tslibs/resolution.py | 50 +++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 asv_bench/benchmarks/tslibs/resolution.py diff --git a/asv_bench/benchmarks/tslibs/resolution.py b/asv_bench/benchmarks/tslibs/resolution.py new file mode 100644 index 0000000000000..274aa1ad6d4a9 --- /dev/null +++ b/asv_bench/benchmarks/tslibs/resolution.py @@ -0,0 +1,50 @@ +""" +ipython analogue: + +tr = TimeResolution() +mi = pd.MultiIndex.from_product(tr.params[:-1] + ([str(x) for x in tr.params[-1]],)) +df = pd.DataFrame(np.nan, index=mi, columns=["mean", "stdev"]) + +for unit in tr.params[0]: + for size in tr.params[1]: + for tz in tr.params[2]: + tr.setup(unit, size, tz) + key = (unit, size, str(tz)) + print(key) + + val = %timeit -o tr.time_get_resolution(unit, size, tz) + + df.loc[key] = (val.average, val.stdev) + +""" +from datetime import timedelta, timezone + +from dateutil.tz import gettz, tzlocal +import numpy as np +import pytz + +from pandas._libs.tslibs.resolution import get_resolution + + +class TimeResolution: + params = ( + ["D", "h", "m", "s", "us", "ns"], + [1, 100, 10 ** 4, 10 ** 6], + [ + None, + timezone.utc, + timezone(timedelta(minutes=60)), + pytz.timezone("US/Pacific"), + gettz("Asia/Tokyo"), + tzlocal(), + ], + ) + param_names = ["unit", "size", "tz"] + + def setup(self, unit, size, tz): + arr = np.random.randint(0, 10, size=size, dtype="i8") + arr = arr.view(f"M8[{unit}]").astype("M8[ns]").view("i8") + self.i8data = arr + + def time_get_resolution(self, unit, size, tz): + get_resolution(self.i8data, tz) From 20e2d94b2e1709302bd0ad79c25602db62d2dd43 Mon Sep 17 00:00:00 2001 From: SanthoshBala18 Date: Thu, 2 Jul 2020 20:31:39 +0530 Subject: [PATCH 0263/1025] Fixed #34859: Added support for '0' and '1' in BooleanArray._from_sequence_of_strings method (#35061) --- 
doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/arrays/boolean.py | 4 ++-- .../tests/arrays/boolean/test_construction.py | 5 +++-- pandas/tests/io/parser/test_dtypes.py | 22 ++++++++++++++++++- 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8c6add92e658b..40839f6c70d6e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -328,6 +328,7 @@ Other enhancements - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`). - :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) - :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similarly to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). +- :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable boolean dtype (:issue:`34859`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 9f1c2c6e668ad..dbce71b77a425 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -286,9 +286,9 @@ def _from_sequence_of_strings( def map_string(s): if isna(s): return s - elif s in ["True", "TRUE", "true"]: + elif s in ["True", "TRUE", "true", "1", "1.0"]: return True - elif s in ["False", "FALSE", "false"]: + elif s in ["False", "FALSE", "false", "0", "0.0"]: return False else: raise ValueError(f"{s} cannot be cast to bool") diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index f7354a089df3b..2f5c61304d415 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -247,10 +247,11 @@ def test_coerce_to_numpy_array(): def test_to_boolean_array_from_strings(): result = BooleanArray._from_sequence_of_strings( - np.array(["True", "False", np.nan], dtype=object) + np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object) ) expected = BooleanArray( - np.array([True, False, False]), np.array([False, False, True]) + np.array([True, False, True, True, False, False, False]), + np.array([False, False, False, False, False, False, True]), ) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py index 6298d1e5498f3..6ac310e3b2227 100644 --- a/pandas/tests/io/parser/test_dtypes.py +++ b/pandas/tests/io/parser/test_dtypes.py @@ -561,9 +561,13 @@ def test_boolean_dtype(all_parsers): "True", "TRUE", "true", + "1", + "1.0", "False", "FALSE", "false", + "0", + "0.0", "NaN", "nan", "NA", @@ -576,7 +580,23 @@ def test_boolean_dtype(all_parsers): expected = pd.DataFrame( { "a": pd.array( - [True, True, True, False, False, False, None, None, None, None, None], + [ + True, + True, + 
True, + True, + True, + False, + False, + False, + False, + False, + None, + None, + None, + None, + None, + ], dtype="boolean", ) } From 2d3bd59aac290e99a3a3f3813b17847bead6469e Mon Sep 17 00:00:00 2001 From: pizzathief Date: Fri, 3 Jul 2020 01:26:43 +1000 Subject: [PATCH 0264/1025] BUG: DataFrame.melt gives unexpected result when column "value" already exists (#35003) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/reshape/melt.py | 11 +++++++++++ pandas/tests/reshape/test_melt.py | 14 ++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 40839f6c70d6e..9bd4ddbb624d9 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -819,6 +819,7 @@ Deprecations - :meth:`util.testing.assert_almost_equal` now accepts both relative and absolute precision through the ``rtol``, and ``atol`` parameters, thus deprecating the ``check_less_precise`` parameter. (:issue:`13357`). +- :func:`DataFrame.melt` accepting a value_name that already exists is deprecated, and will be removed in a future version (:issue:`34731`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index cd0619738677d..923b9e7462d8b 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -1,5 +1,6 @@ import re from typing import TYPE_CHECKING, List, cast +import warnings import numpy as np @@ -40,6 +41,16 @@ def melt( else: cols = list(frame.columns) + if value_name in frame.columns: + warnings.warn( + "This dataframe has a column name that matches the 'value_name' column " + "name of the resultiing Dataframe. 
" + "In the future this will raise an error, please set the 'value_name' " + "parameter of DataFrame.melt to a unique name.", + FutureWarning, + stacklevel=3, + ) + if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 000a6354277ab..a0fa10802f860 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -1014,3 +1014,17 @@ def test_col_substring_of_stubname(self): ) result = pd.wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time") tm.assert_frame_equal(result, expected) + + def test_warn_of_column_name_value(self): + # GH34731 + # raise a warning if the resultant value column name matches + # a name in the dataframe already (default name is "value") + df = pd.DataFrame({"col": list("ABC"), "value": range(10, 16, 2)}) + expected = pd.DataFrame( + [["A", "col", "A"], ["B", "col", "B"], ["C", "col", "C"]], + columns=["value", "variable", "value"], + ) + + with tm.assert_produces_warning(FutureWarning): + result = df.melt(id_vars="value") + tm.assert_frame_equal(result, expected) From 50e32e4d5b9ba11ef726e8e82d276b0347f78622 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Jul 2020 09:59:10 -0700 Subject: [PATCH 0265/1025] PERF: tz_convert/tz_convert_single (#35087) --- pandas/_libs/tslibs/tzconversion.pyx | 48 +++++++++++++------------- pandas/tests/tslibs/test_conversion.py | 7 ++-- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index a096b2807c640..d1d6bc40ef288 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -345,36 +345,28 @@ cpdef int64_t tz_convert_single(int64_t val, tzinfo tz1, tzinfo tz2): converted: int64 """ cdef: - int64_t utc_date int64_t arr[1] + bint to_utc = is_utc(tz2) + tzinfo tz # See GH#17734 We should always be converting either from UTC or to UTC 
- assert is_utc(tz1) or is_utc(tz2) + assert is_utc(tz1) or to_utc if val == NPY_NAT: return val - # Convert to UTC - if is_tzlocal(tz1): - utc_date = _tz_convert_tzlocal_utc(val, tz1, to_utc=True) - elif not is_utc(tz1): - arr[0] = val - utc_date = _tz_convert_dst(arr, tz1, to_utc=True)[0] + if to_utc: + tz = tz1 else: - utc_date = val + tz = tz2 - if is_utc(tz2): - return utc_date - elif is_tzlocal(tz2): - return _tz_convert_tzlocal_utc(utc_date, tz2, to_utc=False) + if is_utc(tz): + return val + elif is_tzlocal(tz): + return _tz_convert_tzlocal_utc(val, tz, to_utc=to_utc) else: - # Convert UTC to other timezone - arr[0] = utc_date - # Note: at least with cython 0.28.3, doing a lookup `[0]` in the next - # line is sensitive to the declared return type of _tz_convert_dst; - # if it is declared as returning ndarray[int64_t], a compile-time error - # is raised. - return _tz_convert_dst(arr, tz2, to_utc=False)[0] + arr[0] = val + return _tz_convert_dst(arr, tz, to_utc=to_utc)[0] def tz_convert(int64_t[:] vals, tzinfo tz1, tzinfo tz2): @@ -392,14 +384,22 @@ def tz_convert(int64_t[:] vals, tzinfo tz1, tzinfo tz2): int64 ndarray of converted """ cdef: - int64_t[:] utc_dates, converted + int64_t[:] converted + bint to_utc = is_utc(tz2) + tzinfo tz + + # See GH#17734 We should always be converting either from UTC or to UTC + assert is_utc(tz1) or to_utc if len(vals) == 0: return np.array([], dtype=np.int64) - # Convert to UTC - utc_dates = _tz_convert_one_way(vals, tz1, to_utc=True) - converted = _tz_convert_one_way(utc_dates, tz2, to_utc=False) + if to_utc: + tz = tz1 + else: + tz = tz2 + + converted = _tz_convert_one_way(vals, tz, to_utc=to_utc) return np.array(converted, dtype=np.int64) diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index fd8c9df026674..3a7e06fb14a5f 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -57,9 +57,10 @@ def 
test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): ], ) def test_tz_convert_corner(arr): - result = tzconversion.tz_convert( - arr, timezones.maybe_get_tz("US/Eastern"), timezones.maybe_get_tz("Asia/Tokyo") - ) + result = tzconversion.tz_convert(arr, timezones.maybe_get_tz("US/Eastern"), UTC) + tm.assert_numpy_array_equal(result, arr) + + result = tzconversion.tz_convert(arr, UTC, timezones.maybe_get_tz("Asia/Tokyo")) tm.assert_numpy_array_equal(result, arr) From 24b64cc41a1b429f40a3e34ab2bf3cbd2ce2910e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 2 Jul 2020 10:34:51 -0700 Subject: [PATCH 0266/1025] TYP: type for get_timezone (#35096) --- pandas/_libs/tslibs/timezones.pxd | 2 +- pandas/_libs/tslibs/timezones.pyx | 9 +++------ pandas/core/arrays/datetimes.py | 17 ++++++----------- pandas/core/indexes/datetimes.py | 4 +--- 4 files changed, 11 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 0179be3cdd8e6..0784b090b3edb 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -8,7 +8,7 @@ cdef bint is_tzlocal(tzinfo tz) cdef bint treat_tz_as_pytz(tzinfo tz) cpdef bint tz_compare(tzinfo start, tzinfo end) -cpdef object get_timezone(object tz) +cpdef object get_timezone(tzinfo tz) cpdef object maybe_get_tz(object tz) cdef get_utcoffset(tzinfo tz, obj) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 5bf47efeccfb0..0460591048801 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,6 +1,5 @@ from datetime import timezone -from cpython.datetime cimport datetime, tzinfo, PyTZInfo_Check, PyDateTime_IMPORT -PyDateTime_IMPORT +from cpython.datetime cimport datetime, tzinfo # dateutil compat from dateutil.tz import ( @@ -47,7 +46,7 @@ cdef inline bint treat_tz_as_dateutil(tzinfo tz): return hasattr(tz, '_trans_list') and hasattr(tz, '_trans_idx') -cpdef inline object 
get_timezone(object tz): +cpdef inline object get_timezone(tzinfo tz): """ We need to do several things here: 1) Distinguish between pytz and dateutil timezones @@ -60,9 +59,7 @@ cpdef inline object get_timezone(object tz): the tz name. It needs to be a string so that we can serialize it with UJSON/pytables. maybe_get_tz (below) is the inverse of this process. """ - if not PyTZInfo_Check(tz): - return tz - elif is_utc(tz): + if is_utc(tz): return tz else: if treat_tz_as_dateutil(tz): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 12eefec0c149b..64e8582baaa98 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -522,13 +522,6 @@ def tzinfo(self): """ return self.tz - @property # NB: override with cache_readonly in immutable subclasses - def _timezone(self): - """ - Comparable timezone both for pytz / dateutil - """ - return timezones.get_timezone(self.tzinfo) - @property # NB: override with cache_readonly in immutable subclasses def is_normalized(self): """ @@ -617,15 +610,17 @@ def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): # ----------------------------------------------------------------- # Comparison Methods - def _has_same_tz(self, other): - zzone = self._timezone + def _has_same_tz(self, other) -> bool: # vzone shouldn't be None if value is non-datetime like if isinstance(other, np.datetime64): # convert to Timestamp as np.datetime64 doesn't have tz attr other = Timestamp(other) - vzone = timezones.get_timezone(getattr(other, "tzinfo", "__no_tz__")) - return zzone == vzone + + if not hasattr(other, "tzinfo"): + return False + other_tz = other.tzinfo + return timezones.tz_compare(self.tzinfo, other_tz) def _assert_tzawareness_compat(self, other): # adapted from _Timestamp._assert_tzawareness_compat diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index f3c96db0a8d6e..86c6cdf5b15c7 100644 --- a/pandas/core/indexes/datetimes.py +++ 
b/pandas/core/indexes/datetimes.py @@ -74,9 +74,7 @@ def _new_DatetimeIndex(cls, d): DatetimeArray, wrap=True, ) -@inherit_names( - ["_timezone", "is_normalized", "_resolution_obj"], DatetimeArray, cache=True -) +@inherit_names(["is_normalized", "_resolution_obj"], DatetimeArray, cache=True) @inherit_names( [ "_bool_ops", From 6759e57bee378cf1e4b0f51d5e4dcaae9335a7b2 Mon Sep 17 00:00:00 2001 From: Steffen Rehberg Date: Fri, 3 Jul 2020 16:14:18 +0200 Subject: [PATCH 0267/1025] DOC: Fix code formatting and typos in Series.tz_localize (#35110) --- pandas/core/generic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a66cade3b81b0..d892e2487b31c 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9580,8 +9580,9 @@ def tz_localize( dtype: int64 If the DST transition causes nonexistent times, you can shift these - dates forward or backwards with a timedelta object or `'shift_forward'` - or `'shift_backwards'`. + dates forward or backward with a timedelta object or `'shift_forward'` + or `'shift_backward'`. + >>> s = pd.Series(range(2), ... index=pd.DatetimeIndex(['2015-03-29 02:30:00', ... '2015-03-29 03:30:00'])) From c7c22e10520d9c195e42afaacf87412eb7780859 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 3 Jul 2020 10:07:31 -0500 Subject: [PATCH 0268/1025] PERF: Fix quantile perf regression (#35101) --- pandas/util/_validators.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index bb6c6de441558..fa7201a5188a5 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -371,14 +371,13 @@ def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: ValueError if percentiles are not in given interval([0, 1]). """ q_arr = np.asarray(q) - msg = ( - "percentiles should all be in the interval [0, 1]." - f"Try {q_arr / 100.0} instead." 
- ) + # Don't change this to an f-string. The string formatting + # is too expensive for cases where we don't need it. + msg = "percentiles should all be in the interval [0, 1]. Try {} instead." if q_arr.ndim == 0: if not 0 <= q_arr <= 1: - raise ValueError(msg) + raise ValueError(msg.format(q_arr / 100.0)) else: if not all(0 <= qs <= 1 for qs in q_arr): - raise ValueError(msg) + raise ValueError(msg.format(q_arr / 100.0)) return q_arr From 2479b6f6d1e509c9b4cc85368dc6014fccefdc87 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 4 Jul 2020 16:30:15 +0100 Subject: [PATCH 0269/1025] CLN: convert lambda to function (#35117) --- pandas/io/formats/printing.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 36e774305b577..1cf79dc105901 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -276,9 +276,13 @@ class TableSchemaFormatter(BaseFormatter): formatters[mimetype].enabled = False -default_pprint = lambda x, max_seq_items=None: pprint_thing( - x, escape_chars=("\t", "\r", "\n"), quote_strings=True, max_seq_items=max_seq_items -) +def default_pprint(thing: Any, max_seq_items: Optional[int] = None) -> str: + return pprint_thing( + thing, + escape_chars=("\t", "\r", "\n"), + quote_strings=True, + max_seq_items=max_seq_items, + ) def format_object_summary( From 282f1ee4f271ec204d584dcb2ba3a47bf14971db Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 5 Jul 2020 17:12:46 -0700 Subject: [PATCH 0270/1025] REF: dont consolidate in BlockManager.equals (#34962) * REF: dont consolidate in BlockManager.equals * doctest fixup * Remove Block.equals * simplify, comments --- pandas/core/internals/blocks.py | 27 +------------ pandas/core/internals/managers.py | 48 +++++++++++++++--------- pandas/tests/internals/test_internals.py | 4 +- 3 files changed, 33 insertions(+), 46 deletions(-) diff --git a/pandas/core/internals/blocks.py 
b/pandas/core/internals/blocks.py index 6207785fb2975..d8779dae7c384 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -56,12 +56,7 @@ ABCPandasArray, ABCSeries, ) -from pandas.core.dtypes.missing import ( - _isna_compat, - array_equivalent, - is_valid_nat_for_dtype, - isna, -) +from pandas.core.dtypes.missing import _isna_compat, is_valid_nat_for_dtype, isna import pandas.core.algorithms as algos from pandas.core.array_algos.transforms import shift @@ -1383,11 +1378,6 @@ def where_func(cond, values, other): return result_blocks - def equals(self, other) -> bool: - if self.dtype != other.dtype or self.shape != other.shape: - return False - return array_equivalent(self.values, other.values) - def _unstack(self, unstacker, fill_value, new_placement): """ Return a list of unstacked blocks of self @@ -1881,9 +1871,6 @@ def where( return [self.make_block_same_class(result, placement=self.mgr_locs)] - def equals(self, other) -> bool: - return self.values.equals(other.values) - def _unstack(self, unstacker, fill_value, new_placement): # ExtensionArray-safe unstack. 
# We override ObjectBlock._unstack, which unstacks directly on the @@ -1929,12 +1916,6 @@ class NumericBlock(Block): class FloatOrComplexBlock(NumericBlock): __slots__ = () - def equals(self, other) -> bool: - if self.dtype != other.dtype or self.shape != other.shape: - return False - left, right = self.values, other.values - return ((left == right) | (np.isnan(left) & np.isnan(right))).all() - class FloatBlock(FloatOrComplexBlock): __slots__ = () @@ -2298,12 +2279,6 @@ def setitem(self, indexer, value): ) return newb.setitem(indexer, value) - def equals(self, other) -> bool: - # override for significant performance improvement - if self.dtype != other.dtype or self.shape != other.shape: - return False - return (self.values.view("i8") == other.values.view("i8")).all() - def quantile(self, qs, interpolation="linear", axis=0): naive = self.values.view("M8[ns]") diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b2f2277d9a7dc..c82670106d3b6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import ( DT64NS_DTYPE, is_datetimelike_v_numeric, + is_dtype_equal, is_extension_array_dtype, is_list_like, is_numeric_v_string_like, @@ -27,9 +28,10 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import array_equivalent, isna import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com @@ -1409,29 +1411,39 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True ) - def equals(self, other) -> bool: + def equals(self, other: 
"BlockManager") -> bool: self_axes, other_axes = self.axes, other.axes if len(self_axes) != len(other_axes): return False if not all(ax1.equals(ax2) for ax1, ax2 in zip(self_axes, other_axes)): return False - self._consolidate_inplace() - other._consolidate_inplace() - if len(self.blocks) != len(other.blocks): - return False - # canonicalize block order, using a tuple combining the mgr_locs - # then type name because there might be unconsolidated - # blocks (say, Categorical) which can only be distinguished by - # the iteration order - def canonicalize(block): - return (block.mgr_locs.as_array.tolist(), block.dtype.name) - - self_blocks = sorted(self.blocks, key=canonicalize) - other_blocks = sorted(other.blocks, key=canonicalize) - return all( - block.equals(oblock) for block, oblock in zip(self_blocks, other_blocks) - ) + if self.ndim == 1: + # For SingleBlockManager (i.e.Series) + if other.ndim != 1: + return False + left = self.blocks[0].values + right = other.blocks[0].values + if not is_dtype_equal(left.dtype, right.dtype): + return False + elif isinstance(left, ExtensionArray): + return left.equals(right) + else: + return array_equivalent(left, right) + + for i in range(len(self.items)): + # Check column-wise, return False if any column doesnt match + left = self.iget_values(i) + right = other.iget_values(i) + if not is_dtype_equal(left.dtype, right.dtype): + return False + elif isinstance(left, ExtensionArray): + if not left.equals(right): + return False + else: + if not array_equivalent(left, right): + return False + return True def unstack(self, unstacker, fill_value) -> "BlockManager": """ diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 5fd44d7cd74a9..06ccdd2484a2a 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -377,7 +377,7 @@ def test_copy(self, mgr): for blk, cp_blk in zip(mgr.blocks, cp.blocks): # view assertion - assert cp_blk.equals(blk) 
+ tm.assert_equal(cp_blk.values, blk.values) if isinstance(blk.values, np.ndarray): assert cp_blk.values.base is blk.values.base else: @@ -389,7 +389,7 @@ def test_copy(self, mgr): # copy assertion we either have a None for a base or in case of # some blocks it is an array (e.g. datetimetz), but was copied - assert cp_blk.equals(blk) + tm.assert_equal(cp_blk.values, blk.values) if not isinstance(cp_blk.values, np.ndarray): assert cp_blk.values._data.base is not blk.values._data.base else: From 0dde19788161cf03b3a2787c669b7c8eeb4d1f5b Mon Sep 17 00:00:00 2001 From: Alex Kirko Date: Mon, 6 Jul 2020 17:19:43 +0300 Subject: [PATCH 0271/1025] CI: pin isort version (#35136) --- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 2429f4ab3d699..24c0832b6fb4c 100644 --- a/environment.yml +++ b/environment.yml @@ -20,7 +20,7 @@ dependencies: - flake8<3.8.0 # temporary pin, GH#34150 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - - isort # check that imports are in the right order + - isort=4.3.21 # check that imports are in the right order - mypy=0.730 - pycodestyle # used by flake8 diff --git a/requirements-dev.txt b/requirements-dev.txt index 44c975a3b3cfb..eda0fa8f32b19 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,7 +11,7 @@ cpplint flake8<3.8.0 flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 -isort +isort==4.3.21 mypy==0.730 pycodestyle gitpython From dd0dcd88ed456cc27d6cfda7975768ee8bf02767 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 6 Jul 2020 16:30:35 +0200 Subject: [PATCH 0272/1025] CI: pin sphinx <= 3.1.1 for autodoc failure (#35139) --- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 
24c0832b6fb4c..80dbffebf6b9d 100644 --- a/environment.yml +++ b/environment.yml @@ -27,7 +27,7 @@ dependencies: # documentation - gitpython # obtain contributors from git for whatsnew - gitdb2=2.0.6 # GH-32060 - - sphinx + - sphinx<=3.1.1 # documentation (jupyter notebooks) - nbconvert>=5.4.1 diff --git a/requirements-dev.txt b/requirements-dev.txt index eda0fa8f32b19..886f400caf44f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -16,7 +16,7 @@ mypy==0.730 pycodestyle gitpython gitdb2==2.0.6 -sphinx +sphinx<=3.1.1 nbconvert>=5.4.1 nbsphinx pandoc From 72607845f26cbeba5263e6aa91c449e3ce661905 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jul 2020 10:19:36 -0500 Subject: [PATCH 0273/1025] Fix numpy warning (#35085) --- pandas/conftest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index d74c43069574f..5fe4cc45b0006 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -256,9 +256,7 @@ def nselect_method(request): # ---------------------------------------------------------------- # Missing values & co. # ---------------------------------------------------------------- -@pytest.fixture( - params=[None, np.nan, pd.NaT, float("nan"), np.float("NaN"), pd.NA], ids=str -) +@pytest.fixture(params=[None, np.nan, pd.NaT, float("nan"), pd.NA], ids=str) def nulls_fixture(request): """ Fixture for each null type in pandas. 
From 7ee499e088f32138b37140dc120dc44ccb865427 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Jul 2020 11:01:30 -0700 Subject: [PATCH 0274/1025] TYP: type unit as str (#35099) --- pandas/_libs/tslib.pyx | 6 +++--- pandas/_libs/tslibs/conversion.pxd | 2 +- pandas/_libs/tslibs/conversion.pyx | 20 ++++++++++++++++---- pandas/_libs/tslibs/timedeltas.pxd | 2 +- pandas/_libs/tslibs/timedeltas.pyx | 6 +++--- pandas/_libs/tslibs/timestamps.pyx | 1 + 6 files changed, 25 insertions(+), 12 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index f494e74bde55f..8d8a62a58f25f 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -363,8 +363,8 @@ def format_array_from_datetime( def array_with_unit_to_datetime( ndarray values, - object unit, - str errors='coerce' + str unit, + str errors="coerce" ): """ Convert the ndarray to datetime according to the time unit. @@ -384,7 +384,7 @@ def array_with_unit_to_datetime( ---------- values : ndarray of object Date-like objects to convert. - unit : object + unit : str Time unit to use during conversion. errors : str, default 'raise' Error behavior when parsing. diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 2cf75944a8196..0eb94fecf7d6b 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -13,7 +13,7 @@ cdef class _TSObject: bint fold -cdef convert_to_tsobject(object ts, tzinfo tz, object unit, +cdef convert_to_tsobject(object ts, tzinfo tz, str unit, bint dayfirst, bint yearfirst, int32_t nanos=*) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 95500f66db156..a1f074b1b29a8 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -56,8 +56,19 @@ TD64NS_DTYPE = np.dtype('m8[ns]') # Unit Conversion Helpers cdef inline int64_t cast_from_unit(object ts, str unit) except? 
-1: - """ return a casting of the unit represented to nanoseconds - round the fractional part of a float to our precision, p """ + """ + Return a casting of the unit represented to nanoseconds + round the fractional part of a float to our precision, p. + + Parameters + ---------- + ts : int, float, or None + unit : str + + Returns + ------- + int64_t + """ cdef: int64_t m int p @@ -307,7 +318,7 @@ cdef class _TSObject: return self.value -cdef convert_to_tsobject(object ts, tzinfo tz, object unit, +cdef convert_to_tsobject(object ts, tzinfo tz, str unit, bint dayfirst, bint yearfirst, int32_t nanos=0): """ Extract datetime and int64 from any of: @@ -497,7 +508,7 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, return obj -cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, object unit, +cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, bint dayfirst=False, bint yearfirst=False): """ @@ -513,6 +524,7 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, object unit, Value to be converted to _TSObject tz : tzinfo or None timezone for the timezone-aware output + unit : str or None dayfirst : bool, default False When parsing an ambiguous date string, interpret e.g. "3/4/1975" as April 3, as opposed to the standard US interpretation March 4. diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd index 70a418d7803d1..4142861e9ad38 100644 --- a/pandas/_libs/tslibs/timedeltas.pxd +++ b/pandas/_libs/tslibs/timedeltas.pxd @@ -3,7 +3,7 @@ from numpy cimport int64_t # Exposed for tslib, not intended for outside use. cpdef int64_t delta_to_nanoseconds(delta) except? 
-1 -cdef convert_to_timedelta64(object ts, object unit) +cdef convert_to_timedelta64(object ts, str unit) cdef bint is_any_td_scalar(object obj) diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 2862e62e3d522..8f3a599bf107c 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -160,7 +160,7 @@ cpdef int64_t delta_to_nanoseconds(delta) except? -1: raise TypeError(type(delta)) -cdef convert_to_timedelta64(object ts, object unit): +cdef convert_to_timedelta64(object ts, str unit): """ Convert an incoming object to a timedelta64 if possible. Before calling, unit must be standardized to avoid repeated unit conversion @@ -218,7 +218,7 @@ cdef convert_to_timedelta64(object ts, object unit): @cython.boundscheck(False) @cython.wraparound(False) -def array_to_timedelta64(object[:] values, unit=None, errors='raise'): +def array_to_timedelta64(object[:] values, str unit=None, str errors="raise"): """ Convert an ndarray to an array of timedeltas. If errors == 'coerce', coerce non-convertible objects to NaT. Otherwise, raise. 
@@ -470,7 +470,7 @@ cdef inline timedelta_from_spec(object number, object frac, object unit): return cast_from_unit(float(n), unit) -cpdef inline str parse_timedelta_unit(object unit): +cpdef inline str parse_timedelta_unit(str unit): """ Parameters ---------- diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index e104b722ea119..b79a5c49bdd10 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1050,6 +1050,7 @@ class Timestamp(_Timestamp): nanosecond = hour tz = minute freq = None + unit = None if getattr(ts_input, 'tzinfo', None) is not None and tz is not None: raise ValueError("Cannot pass a datetime or Timestamp with tzinfo with " From 5aa2063f2efbb51cb278bde8438998fa4eec313e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 6 Jul 2020 21:55:02 +0100 Subject: [PATCH 0275/1025] BUG: reset_index is passing a bad dtype to NumPy (#35111) --- pandas/core/frame.py | 10 +++++++--- pandas/tests/frame/methods/test_reset_index.py | 17 ++++++++++++++--- pandas/tests/series/methods/test_reset_index.py | 17 ++++++++++++++--- 3 files changed, 35 insertions(+), 9 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b6993e9ed851a..87041341ac3a6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -75,6 +75,7 @@ from pandas.core.dtypes.cast import ( cast_scalar_to_array, coerce_to_dtypes, + construct_1d_arraylike_from_scalar, find_common_type, infer_dtype_from_scalar, invalidate_string_dtypes, @@ -109,7 +110,7 @@ needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor @@ -4731,8 +4732,11 @@ def _maybe_casted_values(index, labels=None): # we can have situations where the whole mask is -1, # meaning there is nothing found in labels, so make all nan's if 
mask.all(): - values = np.empty(len(mask), dtype=index.dtype) - values.fill(np.nan) + dtype = index.dtype + fill_value = na_value_for_dtype(dtype) + values = construct_1d_arraylike_from_scalar( + fill_value, len(mask), dtype + ) else: values = values.take(labels) diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 79442acccb326..cf0bbe144caa5 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -3,6 +3,7 @@ import numpy as np import pytest +import pandas as pd from pandas import ( DataFrame, Index, @@ -299,9 +300,19 @@ def test_reset_index_range(self): tm.assert_frame_equal(result, expected) -def test_reset_index_dtypes_on_empty_frame_with_multiindex(): +@pytest.mark.parametrize( + "array, dtype", + [ + (["a", "b"], object), + ( + pd.period_range("12-1-2000", periods=2, freq="Q-DEC"), + pd.PeriodDtype(freq="Q-DEC"), + ), + ], +) +def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): # GH 19602 - Preserve dtype on empty DataFrame with MultiIndex - idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], ["a", "b"]]) + idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = DataFrame(index=idx)[:0].reset_index().dtypes - expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": object}) + expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index a11590d42552d..597b43a370ef5 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ b/pandas/tests/series/methods/test_reset_index.py @@ -1,6 +1,7 @@ import numpy as np import pytest +import pandas as pd from pandas import DataFrame, Index, MultiIndex, RangeIndex, Series import pandas._testing as tm @@ -110,11 +111,21 @@ def test_reset_index_drop_errors(self): 
s.reset_index("wrong", drop=True) -def test_reset_index_dtypes_on_empty_series_with_multiindex(): +@pytest.mark.parametrize( + "array, dtype", + [ + (["a", "b"], object), + ( + pd.period_range("12-1-2000", periods=2, freq="Q-DEC"), + pd.PeriodDtype(freq="Q-DEC"), + ), + ], +) +def test_reset_index_dtypes_on_empty_series_with_multiindex(array, dtype): # GH 19602 - Preserve dtype on empty Series with MultiIndex - idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], ["a", "b"]]) + idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array]) result = Series(dtype=object, index=idx)[:0].reset_index().dtypes expected = Series( - {"level_0": np.int64, "level_1": np.float64, "level_2": object, 0: object} + {"level_0": np.int64, "level_1": np.float64, "level_2": dtype, 0: object} ) tm.assert_series_equal(result, expected) From 62916dd8026b371997c8355a0442ecdbe3088bd3 Mon Sep 17 00:00:00 2001 From: Gabriel Tutui Date: Mon, 6 Jul 2020 18:28:07 -0300 Subject: [PATCH 0276/1025] TST: Add test for category equalness on applies (#21239) (#35125) --- pandas/tests/frame/test_apply.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/test_apply.py index 114b3c0d0a3fc..3a32278e2a4b1 100644 --- a/pandas/tests/frame/test_apply.py +++ b/pandas/tests/frame/test_apply.py @@ -793,6 +793,18 @@ def test_apply_with_byte_string(self): result = df.apply(lambda x: x.astype("object")) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("val", ["asd", 12, None, np.NaN]) + def test_apply_category_equalness(self, val): + # Check if categorical comparisons on apply, GH 21239 + df_values = ["asd", None, 12, "asd", "cde", np.NaN] + df = pd.DataFrame({"a": df_values}, dtype="category") + + result = df.a.apply(lambda x: x == val) + expected = pd.Series( + [np.NaN if pd.isnull(x) else x == val for x in df_values], name="a" + ) + tm.assert_series_equal(result, expected) + class TestInferOutputShape: # the user has supplied an 
opaque UDF where From bcb17e4f23a8e0d788c6a9c4353692062de1c09a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 6 Jul 2020 15:37:56 -0700 Subject: [PATCH 0277/1025] ENH: Add support for calculating EWMA with a time component (#34839) --- asv_bench/benchmarks/rolling.py | 7 ++ doc/source/user_guide/computation.rst | 19 +++++ doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/window/aggregations.pyx | 61 ++++++++++++++--- pandas/core/generic.py | 2 + pandas/core/window/ewm.py | 99 +++++++++++++++++++++++---- pandas/tests/window/conftest.py | 8 ++- pandas/tests/window/test_ewm.py | 60 +++++++++++++++- 8 files changed, 233 insertions(+), 24 deletions(-) diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index b1f6d052919bd..f0dd908f81043 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -91,11 +91,18 @@ class EWMMethods: def setup(self, constructor, window, dtype, method): N = 10 ** 5 arr = (100 * np.random.random(N)).astype(dtype) + times = pd.date_range("1900", periods=N, freq="23s") self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window) + self.ewm_times = getattr(pd, constructor)(arr).ewm( + halflife="1 Day", times=times + ) def time_ewm(self, constructor, window, dtype, method): getattr(self.ewm, method)() + def time_ewm_times(self, constructor, window, dtype, method): + self.ewm.mean() + class VariableWindowMethods(Methods): params = ( diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index f36c6e06044f2..d7875e5b8d861 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -1095,6 +1095,25 @@ and **alpha** to the EW functions: one half. * **Alpha** specifies the smoothing factor directly. +.. 
versionadded:: 1.1.0 + +You can also specify ``halflife`` in terms of a timedelta convertible unit to specify the amount of +time it takes for an observation to decay to half its value when also specifying a sequence +of ``times``. + +.. ipython:: python + + df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df + times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17'] + df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean() + +The following formula is used to compute exponentially weighted mean with an input vector of times: + +.. math:: + + y_t = \frac{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda} x_{i}}{\sum_{i=0}^t 0.5^\frac{t_{t} - t_{i}}{\lambda}}, + Here is an example for a univariate time series: .. ipython:: python diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9bd4ddbb624d9..563ae6d1d2ed1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -329,6 +329,7 @@ Other enhancements - :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) - :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similarly to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). - :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable boolean dtype (:issue:`34859`) +- :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index ec4a412b5adc7..362d0e6263697 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -8,7 +8,7 @@ from libc.stdlib cimport malloc, free import numpy as np cimport numpy as cnp -from numpy cimport ndarray, int64_t, float64_t, float32_t +from numpy cimport ndarray, int64_t, float64_t, float32_t, uint8_t cnp.import_array() @@ -1752,6 +1752,51 @@ def roll_weighted_var(float64_t[:] values, float64_t[:] weights, # ---------------------------------------------------------------------- # Exponentially weighted moving average +def ewma_time(ndarray[float64_t] vals, int minp, ndarray[int64_t] times, + int64_t halflife): + """ + Compute exponentially-weighted moving average using halflife and time + distances. + + Parameters + ---------- + vals : ndarray[float_64] + minp : int + times : ndarray[int64] + halflife : int64 + + Returns + ------- + ndarray + """ + cdef: + Py_ssize_t i, num_not_nan = 0, N = len(vals) + bint is_not_nan + float64_t last_result + ndarray[uint8_t] mask = np.zeros(N, dtype=np.uint8) + ndarray[float64_t] weights, observations, output = np.empty(N, dtype=np.float64) + + if N == 0: + return output + + last_result = vals[0] + + for i in range(N): + is_not_nan = vals[i] == vals[i] + num_not_nan += is_not_nan + if is_not_nan: + mask[i] = 1 + weights = 0.5 ** ((times[i] - times[mask.view(np.bool_)]) / halflife) + observations = vals[mask.view(np.bool_)] + last_result = np.sum(weights * observations) / np.sum(weights) + + if num_not_nan >= minp: + output[i] = last_result + else: + output[i] = NaN + + return output + def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp): """ @@ -1761,9 +1806,9 @@ def ewma(float64_t[:] vals, float64_t com, bint adjust, bint ignore_na, int minp ---------- vals : ndarray (float64 type) com : 
float64 - adjust: int - ignore_na: bool - minp: int + adjust : int + ignore_na : bool + minp : int Returns ------- @@ -1831,10 +1876,10 @@ def ewmcov(float64_t[:] input_x, float64_t[:] input_y, input_x : ndarray (float64 type) input_y : ndarray (float64 type) com : float64 - adjust: int - ignore_na: bool - minp: int - bias: int + adjust : int + ignore_na : bool + minp : int + bias : int Returns ------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d892e2487b31c..0b76960558721 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10518,6 +10518,7 @@ def ewm( adjust=True, ignore_na=False, axis=0, + times=None, ): axis = self._get_axis_number(axis) return ExponentialMovingWindow( @@ -10530,6 +10531,7 @@ def ewm( adjust=adjust, ignore_na=ignore_na, axis=axis, + times=times, ) cls.ewm = ewm diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index ee80f80b320e4..7a2d8e84bec76 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -1,18 +1,21 @@ +import datetime from functools import partial from textwrap import dedent from typing import Optional, Union import numpy as np +from pandas._libs.tslibs import Timedelta import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import FrameOrSeries +from pandas._typing import FrameOrSeries, TimedeltaConvertibleTypes from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, doc +from pandas.core.dtypes.common import is_datetime64_ns_dtype from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.base import DataError -import pandas.core.common as com +import pandas.core.common as common from pandas.core.window.common import _doc_template, _shared_docs, zsqrt from pandas.core.window.rolling import _flex_binary_moment, _Rolling @@ -32,7 +35,7 @@ def get_center_of_mass( halflife: Optional[float], alpha: Optional[float], ) -> float: - valid_count = com.count_not_none(comass, 
span, halflife, alpha) + valid_count = common.count_not_none(comass, span, halflife, alpha) if valid_count > 1: raise ValueError("comass, span, halflife, and alpha are mutually exclusive") @@ -76,10 +79,17 @@ class ExponentialMovingWindow(_Rolling): span : float, optional Specify decay in terms of span, :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`. - halflife : float, optional + halflife : float, str, timedelta, optional Specify decay in terms of half-life, :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for :math:`halflife > 0`. + + If ``times`` is specified, the time unit (str or timedelta) over which an + observation decays to half its value. Only applicable to ``mean()`` + and halflife value will not apply to the other functions. + + .. versionadded:: 1.1.0 + alpha : float, optional Specify smoothing factor :math:`\alpha` directly, :math:`0 < \alpha \leq 1`. @@ -124,6 +134,18 @@ class ExponentialMovingWindow(_Rolling): axis : {0, 1}, default 0 The axis to use. The value 0 identifies the rows, and 1 identifies the columns. + times : str, np.ndarray, Series, default None + + .. versionadded:: 1.1.0 + + Times corresponding to the observations. Must be monotonically increasing and + ``datetime64[ns]`` dtype. + + If str, the name of the column in the DataFrame representing the times. + + If 1-D array like, a sequence with the same shape as the observations. + + Only applicable to ``mean()``. Returns ------- @@ -159,6 +181,17 @@ class ExponentialMovingWindow(_Rolling): 2 1.615385 3 1.615385 4 3.670213 + + Specifying ``times`` with a timedelta ``halflife`` when computing mean. 
+ + >>> times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17'] + >>> df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean() + B + 0 0.000000 + 1 0.585786 + 2 1.523889 + 3 1.523889 + 4 3.233686 """ _attributes = ["com", "min_periods", "adjust", "ignore_na", "axis"] @@ -168,20 +201,49 @@ def __init__( obj, com: Optional[float] = None, span: Optional[float] = None, - halflife: Optional[float] = None, + halflife: Optional[Union[float, TimedeltaConvertibleTypes]] = None, alpha: Optional[float] = None, min_periods: int = 0, adjust: bool = True, ignore_na: bool = False, axis: int = 0, + times: Optional[Union[str, np.ndarray, FrameOrSeries]] = None, ): + self.com: Optional[float] self.obj = obj - self.com = get_center_of_mass(com, span, halflife, alpha) self.min_periods = max(int(min_periods), 1) self.adjust = adjust self.ignore_na = ignore_na self.axis = axis self.on = None + if times is not None: + if isinstance(times, str): + times = self._selected_obj[times] + if not is_datetime64_ns_dtype(times): + raise ValueError("times must be datetime64[ns] dtype.") + if len(times) != len(obj): + raise ValueError("times must be the same length as the object.") + if not isinstance(halflife, (str, datetime.timedelta)): + raise ValueError( + "halflife must be a string or datetime.timedelta object" + ) + self.times = np.asarray(times.astype(np.int64)) + self.halflife = Timedelta(halflife).value + # Halflife is no longer applicable when calculating COM + # But allow COM to still be calculated if the user passes other decay args + if common.count_not_none(com, span, alpha) > 0: + self.com = get_center_of_mass(com, span, None, alpha) + else: + self.com = None + else: + if halflife is not None and isinstance(halflife, (str, datetime.timedelta)): + raise ValueError( + "halflife can only be a timedelta convertible argument if " + "times is not None." 
+ ) + self.times = None + self.halflife = None + self.com = get_center_of_mass(com, span, halflife, alpha) @property def _constructor(self): @@ -277,14 +339,23 @@ def mean(self, *args, **kwargs): Arguments and keyword arguments to be passed into func. """ nv.validate_window_func("mean", args, kwargs) - window_func = self._get_roll_func("ewma") - window_func = partial( - window_func, - com=self.com, - adjust=self.adjust, - ignore_na=self.ignore_na, - minp=self.min_periods, - ) + if self.times is not None: + window_func = self._get_roll_func("ewma_time") + window_func = partial( + window_func, + minp=self.min_periods, + times=self.times, + halflife=self.halflife, + ) + else: + window_func = self._get_roll_func("ewma") + window_func = partial( + window_func, + com=self.com, + adjust=self.adjust, + ignore_na=self.ignore_na, + minp=self.min_periods, + ) return self._apply(window_func) @Substitution(name="ewm", func_name="std") diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 74f3406d30225..eb8252d5731be 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timedelta import numpy as np from numpy.random import randn @@ -302,3 +302,9 @@ def series(): def which(request): """Turn parametrized which as fixture for series and frame""" return request.param + + +@pytest.fixture(params=["1 day", timedelta(days=1)]) +def halflife_with_times(request): + """Halflife argument for EWM when times is specified.""" + return request.param diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 44015597ddb19..12c314d5e9ec9 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -3,7 +3,8 @@ from pandas.errors import UnsupportedFunctionCall -from pandas import DataFrame, Series +from pandas import DataFrame, DatetimeIndex, Series, date_range +import pandas._testing as tm from 
pandas.core.window import ExponentialMovingWindow @@ -69,3 +70,60 @@ def test_numpy_compat(method): getattr(e, method)(1, 2, 3) with pytest.raises(UnsupportedFunctionCall, match=msg): getattr(e, method)(dtype=np.float64) + + +def test_ewma_times_not_datetime_type(): + msg = r"times must be datetime64\[ns\] dtype." + with pytest.raises(ValueError, match=msg): + Series(range(5)).ewm(times=np.arange(5)) + + +def test_ewma_times_not_same_length(): + msg = "times must be the same length as the object." + with pytest.raises(ValueError, match=msg): + Series(range(5)).ewm(times=np.arange(4).astype("datetime64[ns]")) + + +def test_ewma_halflife_not_correct_type(): + msg = "halflife must be a string or datetime.timedelta object" + with pytest.raises(ValueError, match=msg): + Series(range(5)).ewm(halflife=1, times=np.arange(5).astype("datetime64[ns]")) + + +def test_ewma_halflife_without_times(halflife_with_times): + msg = "halflife can only be a timedelta convertible argument if times is not None." 
+ with pytest.raises(ValueError, match=msg): + Series(range(5)).ewm(halflife=halflife_with_times) + + +@pytest.mark.parametrize( + "times", + [ + np.arange(10).astype("datetime64[D]").astype("datetime64[ns]"), + date_range("2000", freq="D", periods=10), + date_range("2000", freq="D", periods=10).tz_localize("UTC"), + "time_col", + ], +) +@pytest.mark.parametrize("min_periods", [0, 2]) +def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): + halflife = halflife_with_times + data = np.arange(10) + data[::2] = np.nan + df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) + result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() + expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() + tm.assert_frame_equal(result, expected) + + +def test_ewma_with_times_variable_spacing(tz_aware_fixture): + tz = tz_aware_fixture + halflife = "23 days" + times = DatetimeIndex( + ["2020-01-01", "2020-01-10T00:04:05", "2020-02-23T05:00:23"] + ).tz_localize(tz) + data = np.arange(3) + df = DataFrame(data) + result = df.ewm(halflife=halflife, times=times).mean() + expected = DataFrame([0.0, 0.5674161888241773, 1.545239952073459]) + tm.assert_frame_equal(result, expected) From 46155452cd182cc9f00de0e54b8f503111e1e482 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Jul 2020 15:51:43 -0700 Subject: [PATCH 0278/1025] TYP: get_utcoffset (#35107) --- pandas/_libs/tslibs/timezones.pxd | 4 ++-- pandas/_libs/tslibs/timezones.pyx | 4 ++-- pandas/core/arrays/datetimes.py | 3 +-- pandas/tseries/frequencies.py | 5 ++--- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index 0784b090b3edb..f51ee41cb99a6 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ b/pandas/_libs/tslibs/timezones.pxd @@ -1,4 +1,4 @@ -from cpython.datetime cimport tzinfo +from cpython.datetime cimport datetime, timedelta, tzinfo cdef tzinfo utc_pytz @@ 
-11,7 +11,7 @@ cpdef bint tz_compare(tzinfo start, tzinfo end) cpdef object get_timezone(tzinfo tz) cpdef object maybe_get_tz(object tz) -cdef get_utcoffset(tzinfo tz, obj) +cdef timedelta get_utcoffset(tzinfo tz, datetime obj) cdef bint is_fixed_offset(tzinfo tz) cdef object get_dst_info(tzinfo tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 0460591048801..3b2104f75956a 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,5 +1,5 @@ from datetime import timezone -from cpython.datetime cimport datetime, tzinfo +from cpython.datetime cimport datetime, timedelta, tzinfo # dateutil compat from dateutil.tz import ( @@ -153,7 +153,7 @@ cdef inline object tz_cache_key(tzinfo tz): # UTC Offsets -cdef get_utcoffset(tzinfo tz, obj): +cdef timedelta get_utcoffset(tzinfo tz, datetime obj): try: return tz._utcoffset except AttributeError: diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 64e8582baaa98..fcfbaa4ac2a1c 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -3,7 +3,6 @@ import warnings import numpy as np -from pytz import utc from pandas._libs import lib, tslib from pandas._libs.tslibs import ( @@ -725,7 +724,7 @@ def _local_timestamps(self): This is used to calculate time-of-day information as if the timestamps were timezone-naive. 
""" - return tzconversion.tz_convert(self.asi8, utc, self.tz) + return tzconversion.tz_convert(self.asi8, timezones.UTC, self.tz) def tz_convert(self, tz): """ diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 7516d9748c18f..f94c8ef6550a5 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -4,7 +4,7 @@ import numpy as np from pandas._libs.algos import unique_deltas -from pandas._libs.tslibs import Timestamp +from pandas._libs.tslibs import Timestamp, tzconversion from pandas._libs.tslibs.ccalendar import ( DAYS, MONTH_ALIASES, @@ -22,7 +22,6 @@ from pandas._libs.tslibs.parsing import get_rule_month from pandas._libs.tslibs.resolution import month_position_check from pandas._libs.tslibs.timezones import UTC -from pandas._libs.tslibs.tzconversion import tz_convert from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -199,7 +198,7 @@ def __init__(self, index, warn: bool = True): # the timezone so they are in local time if hasattr(index, "tz"): if index.tz is not None: - self.i8values = tz_convert(self.i8values, UTC, index.tz) + self.i8values = tzconversion.tz_convert(self.i8values, UTC, index.tz) self.warn = warn From 49db0f60f68654a5e30bb4b926e6d3dd5097d720 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Jul 2020 15:59:07 -0700 Subject: [PATCH 0279/1025] PERF: ints_to_pydatetime (#35113) --- pandas/_libs/tslib.pyx | 91 +++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 55 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 8d8a62a58f25f..4fda44e766109 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -15,7 +15,7 @@ PyDateTime_IMPORT cimport numpy as cnp -from numpy cimport float64_t, int64_t, ndarray, uint8_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t, intp_t import numpy as np cnp.import_array() @@ -157,13 +157,15 @@ def ints_to_pydatetime( Py_ssize_t i, n = len(arr) 
ndarray[int64_t] trans int64_t[:] deltas - Py_ssize_t pos + intp_t[:] pos npy_datetimestruct dts object dt, new_tz str typ - int64_t value, delta, local_value + int64_t value, local_value, delta = NPY_NAT # dummy for delta ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) + bint use_utc = False, use_tzlocal = False, use_fixed = False + bint use_pytz = False if box == "date": assert (tz is None), "tz should be None when converting to date" @@ -184,66 +186,45 @@ def ints_to_pydatetime( ) if is_utc(tz) or tz is None: - for i in range(n): - value = arr[i] - if value == NPY_NAT: - result[i] = NaT - else: - dt64_to_dtstruct(value, &dts) - result[i] = func_create(value, dts, tz, freq, fold) + use_utc = True elif is_tzlocal(tz): - for i in range(n): - value = arr[i] - if value == NPY_NAT: - result[i] = NaT - else: - # Python datetime objects do not support nanosecond - # resolution (yet, PEP 564). Need to compute new value - # using the i8 representation. - local_value = tz_convert_utc_to_tzlocal(value, tz) - dt64_to_dtstruct(local_value, &dts) - result[i] = func_create(value, dts, tz, freq, fold) + use_tzlocal = True else: trans, deltas, typ = get_dst_info(tz) - - if typ not in ['pytz', 'dateutil']: + if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True delta = deltas[0] - for i in range(n): - value = arr[i] - if value == NPY_NAT: - result[i] = NaT - else: - # Adjust datetime64 timestamp, recompute datetimestruct - dt64_to_dtstruct(value + delta, &dts) - result[i] = func_create(value, dts, tz, freq, fold) + else: + pos = trans.searchsorted(arr, side="right") - 1 + use_pytz = typ == "pytz" - elif typ == 'dateutil': - # no zone-name change for dateutil tzs - dst etc - # represented in single object. 
- for i in range(n): - value = arr[i] - if value == NPY_NAT: - result[i] = NaT - else: - # Adjust datetime64 timestamp, recompute datetimestruct - pos = trans.searchsorted(value, side='right') - 1 - dt64_to_dtstruct(value + deltas[pos], &dts) - result[i] = func_create(value, dts, tz, freq, fold) + for i in range(n): + new_tz = tz + value = arr[i] + + if value == NPY_NAT: + result[i] = NaT else: - # pytz - for i in range(n): - value = arr[i] - if value == NPY_NAT: - result[i] = NaT - else: - # Adjust datetime64 timestamp, recompute datetimestruct - pos = trans.searchsorted(value, side='right') - 1 - # find right representation of dst etc in pytz timezone - new_tz = tz._tzinfos[tz._transition_info[pos]] + if use_utc: + local_value = value + elif use_tzlocal: + local_value = tz_convert_utc_to_tzlocal(value, tz) + elif use_fixed: + local_value = value + delta + elif not use_pytz: + # i.e. dateutil + # no zone-name change for dateutil tzs - dst etc + # represented in single object. + local_value = value + deltas[pos[i]] + else: + # pytz + # find right representation of dst etc in pytz timezone + new_tz = tz._tzinfos[tz._transition_info[pos[i]]] + local_value = value + deltas[pos[i]] - dt64_to_dtstruct(value + deltas[pos], &dts) - result[i] = func_create(value, dts, new_tz, freq, fold) + dt64_to_dtstruct(local_value, &dts) + result[i] = func_create(value, dts, new_tz, freq, fold) return result From 5e1b8756209d54fc6963abff0bc28b4cb50210f9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Jul 2020 16:00:00 -0700 Subject: [PATCH 0280/1025] CLN: unused imports in tslibs (#35133) --- pandas/_libs/tslibs/offsets.pyx | 16 +--------------- pandas/_libs/tslibs/timestamps.pyx | 4 ++-- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index df43ebcfd9df2..e4d05e0d70e2f 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -51,7 +51,6 @@ from 
pandas._libs.tslibs.timezones cimport utc_pytz as UTC from pandas._libs.tslibs.tzconversion cimport tz_convert_single from .dtypes cimport PeriodDtypeCode -from .fields import get_start_end_field from .timedeltas cimport delta_to_nanoseconds from .timedeltas import Timedelta from .timestamps cimport _Timestamp @@ -99,12 +98,6 @@ def apply_index_wraps(func): # do @functools.wraps(func) manually since it doesn't work on cdef funcs wrapper.__name__ = func.__name__ wrapper.__doc__ = func.__doc__ - try: - wrapper.__module__ = func.__module__ - except AttributeError: - # AttributeError: 'method_descriptor' object has no - # attribute '__module__' - pass return wrapper @@ -159,12 +152,6 @@ def apply_wraps(func): # do @functools.wraps(func) manually since it doesn't work on cdef funcs wrapper.__name__ = func.__name__ wrapper.__doc__ = func.__doc__ - try: - wrapper.__module__ = func.__module__ - except AttributeError: - # AttributeError: 'method_descriptor' object has no - # attribute '__module__' - pass return wrapper @@ -355,8 +342,7 @@ class ApplyTypeError(TypeError): cdef class BaseOffset: """ - Base class for DateOffset methods that are not overridden by subclasses - and will (after pickle errors are resolved) go into a cdef class. + Base class for DateOffset methods that are not overridden by subclasses. 
""" _day_opt = None _attributes = tuple(["n", "normalize"]) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index b79a5c49bdd10..9035d1bb2f643 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -14,7 +14,7 @@ from numpy cimport int64_t, int8_t, uint8_t, ndarray cnp.import_array() from cpython.object cimport (PyObject_RichCompareBool, PyObject_RichCompare, - Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE) + Py_EQ, Py_NE) from cpython.datetime cimport ( datetime, @@ -51,7 +51,7 @@ from pandas._libs.tslibs.np_datetime cimport ( pydatetime_to_dt64, ) from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime -from pandas._libs.tslibs.offsets cimport to_offset, is_tick_object, is_offset_object +from pandas._libs.tslibs.offsets cimport to_offset, is_offset_object from pandas._libs.tslibs.timedeltas cimport is_any_td_scalar, delta_to_nanoseconds from pandas._libs.tslibs.timedeltas import Timedelta from pandas._libs.tslibs.timezones cimport ( From 56e7a21317e0ac3db8e9e67d487bdba91b6fbd2b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jul 2020 18:25:27 -0500 Subject: [PATCH 0281/1025] DEPR: Deprecate n-dim indexing for Series (#35141) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/indexers.py | 10 +++++----- pandas/core/series.py | 12 ++++-------- pandas/tests/indexes/common.py | 2 +- pandas/tests/indexes/datetimes/test_indexing.py | 4 ++-- pandas/tests/indexes/interval/test_base.py | 2 +- pandas/tests/indexes/test_base.py | 4 ++-- pandas/tests/series/indexing/test_getitem.py | 12 +++++++----- 8 files changed, 23 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 563ae6d1d2ed1..2bb933f6bdb60 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -789,6 +789,7 @@ Deprecations - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead 
(:issue:`32639`) - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) - Providing ``suffixes`` as a ``set`` in :func:`pandas.merge` is deprecated. Provide a tuple instead (:issue:`33740`, :issue:`34741`). +- Indexing a series with a multi-dimensional indexer like ``[:, None]`` to return an ndarray now raises a ``FutureWarning``. Convert to a NumPy array before indexing instead (:issue:`27837`) - :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) - Passing any arguments but the first one to :func:`read_html` as diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 6dbcfef46fa98..d9aa02db3e42a 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -295,7 +295,7 @@ def length_of_indexer(indexer, target=None) -> int: raise AssertionError("cannot find the length of the indexer") -def deprecate_ndim_indexing(result): +def deprecate_ndim_indexing(result, stacklevel=3): """ Helper function to raise the deprecation warning for multi-dimensional indexing on 1D Series/Index. @@ -306,11 +306,11 @@ def deprecate_ndim_indexing(result): """ if np.ndim(result) > 1: warnings.warn( - "Support for multi-dimensional indexing (e.g. `index[:, None]`) " - "on an Index is deprecated and will be removed in a future " + "Support for multi-dimensional indexing (e.g. `obj[:, None]`) " + "is deprecated and will be removed in a future " "version. 
Convert to a numpy array before indexing instead.", - DeprecationWarning, - stacklevel=3, + FutureWarning, + stacklevel=stacklevel, ) diff --git a/pandas/core/series.py b/pandas/core/series.py index be4099d56d43a..6c1d21e4526cf 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -78,7 +78,7 @@ sanitize_array, ) from pandas.core.generic import NDFrame -from pandas.core.indexers import unpack_1tuple +from pandas.core.indexers import deprecate_ndim_indexing, unpack_1tuple from pandas.core.indexes.accessors import CombinedDatetimelikeProperties from pandas.core.indexes.api import Float64Index, Index, MultiIndex, ensure_index import pandas.core.indexes.base as ibase @@ -950,13 +950,9 @@ def _get_with(self, key): def _get_values_tuple(self, key): # mpl hackaround if com.any_none(*key): - # suppress warning from slicing the index with a 2d indexer. - # eventually we'll want Series itself to warn. - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", "Support for multi-dim", DeprecationWarning - ) - return self._get_values(key) + result = self._get_values(key) + deprecate_ndim_indexing(result, stacklevel=5) + return result if not isinstance(self.index, MultiIndex): raise ValueError("Can only tuple-index with a MultiIndex") diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 30c58506f619d..c8b780455f862 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -855,7 +855,7 @@ def test_engine_reference_cycle(self): def test_getitem_2d_deprecated(self): # GH#30588 idx = self.create_index() - with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): res = idx[:, None] assert isinstance(res, np.ndarray), type(res) diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index b1faaa2115f55..6d6193ceaf27d 100644 --- 
a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -95,7 +95,7 @@ def test_dti_business_getitem(self): def test_dti_business_getitem_matplotlib_hackaround(self): rng = pd.bdate_range(START, END) - with tm.assert_produces_warning(DeprecationWarning): + with tm.assert_produces_warning(FutureWarning): # GH#30588 multi-dimensional indexing deprecated values = rng[:, None] expected = rng.values[:, None] @@ -122,7 +122,7 @@ def test_dti_custom_getitem(self): def test_dti_custom_getitem_matplotlib_hackaround(self): rng = pd.bdate_range(START, END, freq="C") - with tm.assert_produces_warning(DeprecationWarning): + with tm.assert_produces_warning(FutureWarning): # GH#30588 multi-dimensional indexing deprecated values = rng[:, None] expected = rng.values[:, None] diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py index 891640234d26e..c316655fbda8a 100644 --- a/pandas/tests/indexes/interval/test_base.py +++ b/pandas/tests/indexes/interval/test_base.py @@ -84,5 +84,5 @@ def test_getitem_2d_deprecated(self): # GH#30588 multi-dim indexing is deprecated, but raising is also acceptable idx = self.create_index() with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"): - with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): idx[:, None] diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 099c7ced5e2ce..eaf48421dc071 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -56,7 +56,7 @@ def test_can_hold_identifiers(self): @pytest.mark.parametrize("index", ["datetime"], indirect=True) def test_new_axis(self, index): - with tm.assert_produces_warning(DeprecationWarning): + with tm.assert_produces_warning(FutureWarning): # GH#30588 multi-dimensional indexing deprecated new_index = index[None, 
:] assert new_index.ndim == 2 @@ -2531,7 +2531,7 @@ def test_shape_of_invalid_index(): # that the returned shape is consistent with this underlying array for # compat with matplotlib (see https://github.com/pandas-dev/pandas/issues/27775) idx = pd.Index([0, 1, 2, 3]) - with tm.assert_produces_warning(DeprecationWarning): + with tm.assert_produces_warning(FutureWarning): # GH#30588 multi-dimensional indexing deprecated assert idx[:, None].shape == (4, 1) diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 164c63483f71f..6b7cda89a4714 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -51,11 +51,7 @@ class TestSeriesGetitemSlices: def test_getitem_slice_2d(self, datetime_series): # GH#30588 multi-dimensional indexing deprecated - # This is currently failing because the test was relying on - # the DeprecationWarning coming through Index.__getitem__. - # We want to implement a warning specifically for Series.__getitem__ - # at which point this will become a Deprecation/FutureWarning - with tm.assert_produces_warning(None): + with tm.assert_produces_warning(FutureWarning): # GH#30867 Don't want to support this long-term, but # for now ensure that the warning from Index # doesn't comes through via Series.__getitem__. 
@@ -135,3 +131,9 @@ def test_getitem_generator(string_series): expected = string_series[string_series > 0] tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) + + +def test_getitem_ndim_deprecated(): + s = pd.Series([0, 1]) + with tm.assert_produces_warning(FutureWarning): + s[:, None] From de8d41cce05cb0789dbb9edd1202ecc484a0c765 Mon Sep 17 00:00:00 2001 From: MBrouns Date: Tue, 7 Jul 2020 01:27:26 +0200 Subject: [PATCH 0282/1025] TST: Add test to verify align behaviour on CategoricalIndex (#34880) --- pandas/tests/frame/methods/test_align.py | 33 ++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py index 5dae719283d17..d19b59debfdea 100644 --- a/pandas/tests/frame/methods/test_align.py +++ b/pandas/tests/frame/methods/test_align.py @@ -129,6 +129,39 @@ def test_align_mixed_int(self, mixed_int_frame): ) tm.assert_index_equal(bf.index, Index([])) + @pytest.mark.parametrize( + "l_ordered,r_ordered,expected", + [ + [True, True, pd.CategoricalIndex], + [True, False, pd.Index], + [False, True, pd.Index], + [False, False, pd.CategoricalIndex], + ], + ) + def test_align_categorical(self, l_ordered, r_ordered, expected): + # GH-28397 + df_1 = DataFrame( + { + "A": np.arange(6, dtype="int64"), + "B": Series(list("aabbca")).astype( + pd.CategoricalDtype(list("cab"), ordered=l_ordered) + ), + } + ).set_index("B") + df_2 = DataFrame( + { + "A": np.arange(5, dtype="int64"), + "B": Series(list("babca")).astype( + pd.CategoricalDtype(list("cab"), ordered=r_ordered) + ), + } + ).set_index("B") + + aligned_1, aligned_2 = df_1.align(df_2) + assert isinstance(aligned_1.index, expected) + assert isinstance(aligned_2.index, expected) + tm.assert_index_equal(aligned_1.index, aligned_2.index) + def test_align_multiindex(self): # GH#10665 # same test cases as test_align_multiindex in test_series.py From 13d2de04e783d41540a8438d4aa071602776bad0 Mon Sep 17 
00:00:00 2001 From: Tom Augspurger Date: Mon, 6 Jul 2020 18:28:05 -0500 Subject: [PATCH 0283/1025] Move mark registration (#35146) --- pandas/conftest.py | 13 +++++++++++++ setup.cfg | 7 ------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 5fe4cc45b0006..e0adb37e7d2f5 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -44,6 +44,19 @@ # Configuration / Settings # ---------------------------------------------------------------- # pytest +def pytest_configure(config): + # Register marks to avoid warnings in pandas.test() + # sync with setup.cfg + config.addinivalue_line("markers", "single: mark a test as single cpu only") + config.addinivalue_line("markers", "slow: mark a test as slow") + config.addinivalue_line("markers", "network: mark a test as network") + config.addinivalue_line( + "markers", "db: tests requiring a database (mysql or postgres)" + ) + config.addinivalue_line("markers", "high_memory: mark a test as a high-memory only") + config.addinivalue_line("markers", "clipboard: mark a pd.read_clipboard test") + + def pytest_addoption(parser): parser.addoption("--skip-slow", action="store_true", help="skip slow tests") parser.addoption("--skip-network", action="store_true", help="skip network tests") diff --git a/setup.cfg b/setup.cfg index 49a57b7a525f0..074b0b6bdff71 100644 --- a/setup.cfg +++ b/setup.cfg @@ -54,13 +54,6 @@ exclude = # sync minversion with setup.cfg & install.rst minversion = 4.0.2 testpaths = pandas -markers = - single: mark a test as single cpu only - slow: mark a test as slow - network: mark a test as network - db: tests requiring a database (mysql or postgres) - high_memory: mark a test as a high-memory only - clipboard: mark a pd.read_clipboard test doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS addopts = --strict-data-files xfail_strict = True From 8a139a2b07f30df01868342388d0df19a38612d4 Mon Sep 17 00:00:00 2001 From: jbrockmendel 
Date: Mon, 6 Jul 2020 17:07:34 -0700 Subject: [PATCH 0284/1025] REF: standardize tz_convert_single usage (#35102) --- pandas/_libs/tslibs/timestamps.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 9035d1bb2f643..b670491063616 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1380,7 +1380,7 @@ default 'raise' cdef: npy_datetimestruct dts - int64_t value, value_tz + int64_t value object k, v datetime ts_input tzinfo_type tzobj @@ -1389,8 +1389,7 @@ default 'raise' tzobj = self.tzinfo value = self.value if tzobj is not None: - value_tz = tz_convert_single(value, tzobj, UTC) - value += value - value_tz + value = tz_convert_single(value, UTC, tzobj) # setup components dt64_to_dtstruct(value, &dts) From cc92719ed19ccec993e670dbaeb1f4a48976b377 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Jul 2020 17:11:59 -0700 Subject: [PATCH 0285/1025] CLN: tz_convert is always from UTC (#35104) --- pandas/_libs/tslibs/tzconversion.pyx | 5 ++-- pandas/tests/tslibs/test_conversion.py | 33 +++++++++++++++++--------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index d1d6bc40ef288..dc01210f2789f 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -388,8 +388,9 @@ def tz_convert(int64_t[:] vals, tzinfo tz1, tzinfo tz2): bint to_utc = is_utc(tz2) tzinfo tz - # See GH#17734 We should always be converting either from UTC or to UTC - assert is_utc(tz1) or to_utc + # See GH#17734 We should always be converting from UTC; otherwise + # should use tz_localize_to_utc. 
+ assert is_utc(tz1) if len(vals) == 0: return np.array([], dtype=np.int64) diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 3a7e06fb14a5f..5a16fea47e90d 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -20,33 +20,47 @@ def f(x): tm.assert_numpy_array_equal(result, expected) -def _compare_local_to_utc(tz_didx, utc_didx): +def _compare_local_to_utc(tz_didx, naive_didx): + # Check that tz_localize behaves the same vectorized and pointwise. def f(x): return tzconversion.tz_convert_single(x, tz_didx.tz, UTC) - result = tzconversion.tz_convert(utc_didx.asi8, tz_didx.tz, UTC) - expected = np.vectorize(f)(utc_didx.asi8) + err1 = err2 = None + try: + result = tzconversion.tz_localize_to_utc(naive_didx.asi8, tz_didx.tz) + err1 = None + except Exception as err: + err1 = err - tm.assert_numpy_array_equal(result, expected) + try: + expected = naive_didx.map(lambda x: x.tz_localize(tz_didx.tz)).asi8 + except Exception as err: + err2 = err + + if err1 is not None: + assert type(err1) == type(err2) + else: + assert err2 is None + tm.assert_numpy_array_equal(result, expected) def test_tz_convert_single_matches_tz_convert_hourly(tz_aware_fixture): tz = tz_aware_fixture tz_didx = date_range("2014-03-01", "2015-01-10", freq="H", tz=tz) - utc_didx = date_range("2014-03-01", "2015-01-10", freq="H") + naive_didx = date_range("2014-03-01", "2015-01-10", freq="H") _compare_utc_to_local(tz_didx) - _compare_local_to_utc(tz_didx, utc_didx) + _compare_local_to_utc(tz_didx, naive_didx) @pytest.mark.parametrize("freq", ["D", "A"]) def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): tz = tz_aware_fixture tz_didx = date_range("2000-01-01", "2020-01-01", freq=freq, tz=tz) - utc_didx = date_range("2000-01-01", "2020-01-01", freq=freq) + naive_didx = date_range("2000-01-01", "2020-01-01", freq=freq) _compare_utc_to_local(tz_didx) - _compare_local_to_utc(tz_didx, utc_didx) + 
_compare_local_to_utc(tz_didx, naive_didx) @pytest.mark.parametrize( @@ -57,9 +71,6 @@ def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): ], ) def test_tz_convert_corner(arr): - result = tzconversion.tz_convert(arr, timezones.maybe_get_tz("US/Eastern"), UTC) - tm.assert_numpy_array_equal(result, arr) - result = tzconversion.tz_convert(arr, UTC, timezones.maybe_get_tz("Asia/Tokyo")) tm.assert_numpy_array_equal(result, arr) From 6a256adc1513f0f237242972a06867c5e14cbeba Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Jul 2020 17:13:26 -0700 Subject: [PATCH 0286/1025] PERF: tz_localize(None) from dst, asvs (#35106) * PERF: avoid copy in tz_convert dst cases * ASVs * asv fixup --- asv_bench/benchmarks/tslibs/tz_convert.py | 30 +++++++++++++ pandas/_libs/tslibs/tzconversion.pyx | 55 +++++++++++++++-------- 2 files changed, 67 insertions(+), 18 deletions(-) create mode 100644 asv_bench/benchmarks/tslibs/tz_convert.py diff --git a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py new file mode 100644 index 0000000000000..2a1f559bdf6d4 --- /dev/null +++ b/asv_bench/benchmarks/tslibs/tz_convert.py @@ -0,0 +1,30 @@ +import numpy as np +from pytz import UTC + +from pandas._libs.tslibs.tzconversion import tz_convert, tz_localize_to_utc + +from .tslib import _sizes, _tzs + + +class TimeTZConvert: + params = ( + _sizes, + [x for x in _tzs if x is not None], + ) + param_names = ["size", "tz"] + + def setup(self, size, tz): + arr = np.random.randint(0, 10, size=size, dtype="i8") + self.i8data = arr + + def time_tz_convert_from_utc(self, size, tz): + # effectively: + # dti = DatetimeIndex(self.i8data, tz=tz) + # dti.tz_localize(None) + tz_convert(self.i8data, UTC, tz) + + def time_tz_localize_to_utc(self, size, tz): + # effectively: + # dti = DatetimeIndex(self.i8data) + # dti.tz_localize(tz, ambiguous="NaT", nonexistent="NaT") + tz_localize_to_utc(self.i8data, tz, ambiguous="NaT", nonexistent="NaT") diff --git 
a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index dc01210f2789f..273781ee34f0a 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -552,29 +552,48 @@ cdef int64_t[:] _tz_convert_dst( int64_t[:] result = np.empty(n, dtype=np.int64) ndarray[int64_t] trans int64_t[:] deltas - int64_t v + int64_t v, delta + str typ # tz is assumed _not_ to be tzlocal; that should go # through _tz_convert_tzlocal_utc - trans, deltas, _ = get_dst_info(tz) - if not to_utc: - # We add `offset` below instead of subtracting it - deltas = -1 * np.array(deltas, dtype='i8') + trans, deltas, typ = get_dst_info(tz) - # Previously, this search was done pointwise to try and benefit - # from getting to skip searches for iNaTs. However, it seems call - # overhead dominates the search time so doing it once in bulk - # is substantially faster (GH#24603) - pos = trans.searchsorted(values, side='right') - 1 + if typ not in ["pytz", "dateutil"]: + # FixedOffset, we know len(deltas) == 1 + delta = deltas[0] - for i in range(n): - v = values[i] - if v == NPY_NAT: - result[i] = v - else: - if pos[i] < 0: - raise ValueError('First time before start of DST info') - result[i] = v - deltas[pos[i]] + for i in range(n): + v = values[i] + if v == NPY_NAT: + result[i] = v + else: + if to_utc: + result[i] = v - delta + else: + result[i] = v + delta + + else: + # Previously, this search was done pointwise to try and benefit + # from getting to skip searches for iNaTs. However, it seems call + # overhead dominates the search time so doing it once in bulk + # is substantially faster (GH#24603) + pos = trans.searchsorted(values, side="right") - 1 + + for i in range(n): + v = values[i] + if v == NPY_NAT: + result[i] = v + else: + if pos[i] < 0: + # TODO: How is this reached? Should we be checking for + # it elsewhere? 
+ raise ValueError("First time before start of DST info") + + if to_utc: + result[i] = v - deltas[pos[i]] + else: + result[i] = v + deltas[pos[i]] return result From b4e8fb833c7390f3040b004c9011e778977a26bf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Jul 2020 17:14:26 -0700 Subject: [PATCH 0287/1025] REF: implement tz_localize_to_utc_single (#35108) * REF: implement tz_localize_to_utc_single * docstring --- pandas/_libs/tslib.pyx | 7 ++--- pandas/_libs/tslibs/conversion.pyx | 9 +++--- pandas/_libs/tslibs/timestamps.pyx | 12 ++++---- pandas/_libs/tslibs/tzconversion.pxd | 3 ++ pandas/_libs/tslibs/tzconversion.pyx | 42 ++++++++++++++++++++++++++-- 5 files changed, 57 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 4fda44e766109..3472dbf161b8e 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -46,7 +46,6 @@ from pandas._libs.tslibs.timezones cimport ( get_dst_info, is_utc, is_tzlocal, - utc_pytz as UTC, ) from pandas._libs.tslibs.conversion cimport ( _TSObject, @@ -67,8 +66,8 @@ from pandas._libs.tslibs.timestamps cimport create_timestamp_from_ts, _Timestamp from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.tzconversion cimport ( - tz_convert_single, tz_convert_utc_to_tzlocal, + tz_localize_to_utc_single, ) # Note: this is the only non-tslibs intra-pandas dependency here @@ -250,7 +249,7 @@ def _test_parse_iso8601(ts: str): check_dts_bounds(&obj.dts) if out_local == 1: obj.tzinfo = pytz.FixedOffset(out_tzoffset) - obj.value = tz_convert_single(obj.value, obj.tzinfo, UTC) + obj.value = tz_localize_to_utc_single(obj.value, obj.tzinfo) return Timestamp(obj.value, tz=obj.tzinfo) else: return Timestamp(obj.value) @@ -708,7 +707,7 @@ cpdef array_to_datetime( # dateutil.tz.tzoffset objects out_tzoffset_vals.add(out_tzoffset * 60.) 
tz = pytz.FixedOffset(out_tzoffset) - value = tz_convert_single(value, tz, UTC) + value = tz_localize_to_utc_single(value, tz) out_local = 0 out_tzoffset = 0 else: diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index a1f074b1b29a8..36a4a1f60d8b9 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -39,10 +39,9 @@ from pandas._libs.tslibs.nattype cimport ( c_nat_strings as nat_strings, ) -from pandas._libs.tslibs.tzconversion import tz_localize_to_utc from pandas._libs.tslibs.tzconversion cimport ( tz_convert_utc_to_tzlocal, - tz_convert_single, + tz_localize_to_utc_single, ) # ---------------------------------------------------------------------- @@ -481,7 +480,7 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, value = dtstruct_to_dt64(&dts) obj.dts = dts obj.tzinfo = pytz.FixedOffset(tzoffset) - obj.value = tz_convert_single(value, obj.tzinfo, UTC) + obj.value = tz_localize_to_utc_single(value, obj.tzinfo) if tz is None: check_overflows(obj) return obj @@ -567,8 +566,8 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit, ts = dtstruct_to_dt64(&dts) if tz is not None: # shift for _localize_tso - ts = tz_localize_to_utc(np.array([ts], dtype='i8'), tz, - ambiguous='raise')[0] + ts = tz_localize_to_utc_single(ts, tz, + ambiguous="raise") except OutOfBoundsDatetime: # GH#19382 for just-barely-OutOfBounds falling back to dateutil diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index b670491063616..a2dacd9d36b14 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -58,8 +58,10 @@ from pandas._libs.tslibs.timezones cimport ( is_utc, maybe_get_tz, treat_tz_as_pytz, utc_pytz as UTC, get_timezone, tz_compare, ) -from pandas._libs.tslibs.tzconversion cimport tz_convert_single -from pandas._libs.tslibs.tzconversion import tz_localize_to_utc +from pandas._libs.tslibs.tzconversion 
cimport ( + tz_convert_single, + tz_localize_to_utc_single, +) # ---------------------------------------------------------------------- # Constants @@ -1300,9 +1302,9 @@ default 'raise' tz = maybe_get_tz(tz) if not isinstance(ambiguous, str): ambiguous = [ambiguous] - value = tz_localize_to_utc(np.array([self.value], dtype='i8'), tz, - ambiguous=ambiguous, - nonexistent=nonexistent)[0] + value = tz_localize_to_utc_single(self.value, tz, + ambiguous=ambiguous, + nonexistent=nonexistent) return Timestamp(value, tz=tz, freq=self.freq) else: if tz is None: diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 7f445d7549f45..7d102868256de 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -4,3 +4,6 @@ from numpy cimport int64_t cdef int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz, bint* fold=*) cpdef int64_t tz_convert_single(int64_t val, tzinfo tz1, tzinfo tz2) +cdef int64_t tz_localize_to_utc_single( + int64_t val, tzinfo tz, object ambiguous=*, object nonexistent=* +) except? -1 diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 273781ee34f0a..98c40e109dbab 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -20,10 +20,48 @@ from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, HOUR_NANOS from pandas._libs.tslibs.nattype cimport NPY_NAT from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, dt64_to_dtstruct) -from pandas._libs.tslibs.timezones cimport get_dst_info, is_tzlocal, is_utc +from pandas._libs.tslibs.timezones cimport ( + get_dst_info, + get_utcoffset, + is_fixed_offset, + is_tzlocal, + is_utc, +) + + +cdef int64_t tz_localize_to_utc_single( + int64_t val, tzinfo tz, object ambiguous=None, object nonexistent=None, +) except? 
-1: + """See tz_localize_to_utc.__doc__""" + cdef: + int64_t delta + int64_t[:] deltas + + if val == NPY_NAT: + return val + + elif is_utc(tz) or tz is None: + return val + + elif is_tzlocal(tz): + return _tz_convert_tzlocal_utc(val, tz, to_utc=True) + + elif is_fixed_offset(tz): + # TODO: in this case we should be able to use get_utcoffset, + # that returns None for e.g. 'dateutil//usr/share/zoneinfo/Etc/GMT-9' + _, deltas, _ = get_dst_info(tz) + delta = deltas[0] + return val - delta + + else: + return tz_localize_to_utc( + np.array([val], dtype="i8"), + tz, + ambiguous=ambiguous, + nonexistent=nonexistent, + )[0] -# TODO: cdef scalar version to call from convert_str_to_tsobject @cython.boundscheck(False) @cython.wraparound(False) def tz_localize_to_utc(ndarray[int64_t] vals, tzinfo tz, object ambiguous=None, From f7a2eb8d38d86a98e0dd9cc54f9c1abd9ca14c1f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 7 Jul 2020 01:39:25 +0100 Subject: [PATCH 0288/1025] no need to do a circular import (#35128) --- pandas/core/generic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0b76960558721..571fcc67f3bb5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5366,9 +5366,8 @@ def dtypes(self): string object dtype: object """ - from pandas import Series # noqa: F811 - - return Series(self._mgr.get_dtypes(), index=self._info_axis, dtype=np.object_) + data = self._mgr.get_dtypes() + return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_) def _to_dict_of_blocks(self, copy: bool_t = True): """ From 087cf683d8a2d60875ba8e1ee17a3580af7cdd92 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 6 Jul 2020 17:44:20 -0700 Subject: [PATCH 0289/1025] REF: make ccalendar self-contained (#35119) --- pandas/_libs/tslibs/ccalendar.pyx | 23 ----------------------- pandas/_libs/tslibs/fields.pyx | 26 
+++++++++++++++++++++++--- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 9f8cf6c28adab..de8fd3911e946 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -7,11 +7,6 @@ import cython from numpy cimport int64_t, int32_t -from locale import LC_TIME - -from pandas._config.localization import set_locale -from pandas._libs.tslibs.strptime import LocaleTime - # ---------------------------------------------------------------------- # Constants @@ -246,21 +241,3 @@ cpdef int32_t get_day_of_year(int year, int month, int day) nogil: day_of_year = mo_off + day return day_of_year - - -def get_locale_names(name_type: str, locale: object = None): - """ - Returns an array of localized day or month names. - - Parameters - ---------- - name_type : string, attribute of LocaleTime() in which to return localized - names - locale : string - - Returns - ------- - list of locale names - """ - with set_locale(locale, LC_TIME): - return getattr(LocaleTime(), name_type) diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 126deb67e4189..2351aca749dcc 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -2,6 +2,7 @@ Functions for accessing attributes of Timestamp/datetime64/datetime-like objects and arrays """ +from locale import LC_TIME import cython from cython import Py_ssize_t @@ -11,9 +12,9 @@ cimport numpy as cnp from numpy cimport ndarray, int64_t, int32_t, int8_t, uint32_t cnp.import_array() -from pandas._libs.tslibs.ccalendar import ( - get_locale_names, MONTHS_FULL, DAYS_FULL, -) +from pandas._config.localization import set_locale + +from pandas._libs.tslibs.ccalendar import MONTHS_FULL, DAYS_FULL from pandas._libs.tslibs.ccalendar cimport ( DAY_NANOS, get_days_in_month, is_leapyear, dayofweek, get_week_of_year, @@ -24,6 +25,7 @@ from pandas._libs.tslibs.np_datetime cimport ( 
npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, td64_to_tdstruct) from pandas._libs.tslibs.nattype cimport NPY_NAT +from pandas._libs.tslibs.strptime import LocaleTime def get_time_micros(const int64_t[:] dtindex): @@ -704,3 +706,21 @@ def build_isocalendar_sarray(const int64_t[:] dtindex): iso_weeks[i] = ret_val[1] days[i] = ret_val[2] return out + + +def get_locale_names(name_type: str, locale: object = None): + """ + Returns an array of localized day or month names. + + Parameters + ---------- + name_type : string, attribute of LocaleTime() in which to return localized + names + locale : string + + Returns + ------- + list of locale names + """ + with set_locale(locale, LC_TIME): + return getattr(LocaleTime(), name_type) From da4152e8596ad72f515a5b34cf7d8252b9475163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillem=20S=C3=A1nchez?= Date: Tue, 7 Jul 2020 13:09:09 +0200 Subject: [PATCH 0290/1025] Added missing import in boxplot_frame_groupby example (#34343) --- pandas/plotting/_core.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 3a8cc5c299640..45a3818492b44 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -567,17 +567,25 @@ def boxplot_frame_groupby( Examples -------- - >>> import itertools - >>> tuples = [t for t in itertools.product(range(1000), range(4))] - >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) - >>> data = np.random.randn(len(index),4) - >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index) - >>> - >>> grouped = df.groupby(level='lvl1') - >>> boxplot_frame_groupby(grouped) - >>> - >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1) - >>> boxplot_frame_groupby(grouped, subplots=False) + You can create boxplots for grouped data and show them as separate subplots: + + .. 
plot:: + :context: close-figs + + >>> import itertools + >>> tuples = [t for t in itertools.product(range(1000), range(4))] + >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1']) + >>> data = np.random.randn(len(index),4) + >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index) + >>> grouped = df.groupby(level='lvl1') + >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8,10)) + + The ``subplots=False`` option shows the boxplots in a single figure. + + .. plot:: + :context: close-figs + + >>> grouped.boxplot(subplots=False, rot=45, fontsize=12) """ plot_backend = _get_plot_backend(backend) return plot_backend.boxplot_frame_groupby( From e5b2fad32c8332a85fd032c6a72761a67f5274cf Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 7 Jul 2020 08:10:06 -0500 Subject: [PATCH 0291/1025] CI: move py36 slow to azure (#34776) (#35151) --- .travis.yml | 6 ------ ci/azure/posix.yml | 5 +++++ ci/deps/{travis-36-slow.yaml => azure-36-slow.yaml} | 0 ci/setup_env.sh | 1 - 4 files changed, 5 insertions(+), 7 deletions(-) rename ci/deps/{travis-36-slow.yaml => azure-36-slow.yaml} (100%) diff --git a/.travis.yml b/.travis.yml index fdea9876d5d89..b016cf386098e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -58,12 +58,6 @@ matrix: services: - mysql - postgresql - - - env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" SQL="1" - services: - - mysql - - postgresql allow_failures: - arch: arm64 env: diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index 880fdc46f43f5..f716974f6add1 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -30,6 +30,11 @@ jobs: LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans" + py36_slow: + ENV_FILE: ci/deps/azure-36-slow.yaml + CONDA_PY: "36" + PATTERN: "slow" + py36_locale: ENV_FILE: ci/deps/azure-36-locale.yaml CONDA_PY: "36" diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/azure-36-slow.yaml similarity index 100% rename from ci/deps/travis-36-slow.yaml rename to 
ci/deps/azure-36-slow.yaml diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 4d551294dbb21..aa43d8b7dd00a 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -166,5 +166,4 @@ if [[ -n ${SQL:0} ]]; then else echo "not using dbs on non-linux Travis builds or Azure Pipelines" fi - echo "done" From bb96386027507d7d5be1e90f467ae9630d0283d0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 7 Jul 2020 06:14:28 -0700 Subject: [PATCH 0292/1025] BUG: get_loc with time object matching NaT micros (#35114) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/tslibs/fields.pyx | 22 ------------------ pandas/core/indexes/datetimes.py | 23 ++++++++++++++----- .../tests/indexes/datetimes/test_indexing.py | 10 ++++++++ 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2bb933f6bdb60..cee41f248fc60 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -978,6 +978,7 @@ Indexing - Bug in :meth:`DataFrame.loc` with dictionary of values changes columns with dtype of ``int`` to ``float`` (:issue:`34573`) - Bug in :meth:`Series.loc` when used with a :class:`MultiIndex` would raise an IndexingError when accessing a None value (:issue:`34318`) - Bug in :meth:`DataFrame.reset_index` and :meth:`Series.reset_index` would not preserve data types on an empty :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` (:issue:`19602`) +- Bug in :class:`Series` and :class:`DataFrame` indexing with a ``time`` key on a :class:`DatetimeIndex` with ``NaT`` entries (:issue:`35114`) Missing ^^^^^^^ diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 2351aca749dcc..5ea7c0b6c5d02 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -16,7 +16,6 @@ from pandas._config.localization import set_locale from pandas._libs.tslibs.ccalendar import MONTHS_FULL, DAYS_FULL from pandas._libs.tslibs.ccalendar cimport ( - DAY_NANOS, 
get_days_in_month, is_leapyear, dayofweek, get_week_of_year, get_day_of_year, get_iso_calendar, iso_calendar_t, month_offset, @@ -28,27 +27,6 @@ from pandas._libs.tslibs.nattype cimport NPY_NAT from pandas._libs.tslibs.strptime import LocaleTime -def get_time_micros(const int64_t[:] dtindex): - """ - Return the number of microseconds in the time component of a - nanosecond timestamp. - - Parameters - ---------- - dtindex : ndarray[int64_t] - - Returns - ------- - micros : ndarray[int64_t] - """ - cdef: - ndarray[int64_t] micros - - micros = np.mod(dtindex, DAY_NANOS, dtype=np.int64) - micros //= 1000 - return micros - - @cython.wraparound(False) @cython.boundscheck(False) def build_field_sarray(const int64_t[:] dtindex): diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 86c6cdf5b15c7..0317d0b93859b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -6,7 +6,7 @@ import numpy as np from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib -from pandas._libs.tslibs import Resolution, fields, parsing, timezones, to_offset +from pandas._libs.tslibs import Resolution, parsing, timezones, to_offset from pandas._libs.tslibs.offsets import prefix_mapping from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError @@ -86,7 +86,6 @@ def _new_DatetimeIndex(cls, d): "tzinfo", "dtype", "to_pydatetime", - "_local_timestamps", "_has_same_tz", "_format_native_types", "date", @@ -380,10 +379,22 @@ def union_many(self, others): # -------------------------------------------------------------------- def _get_time_micros(self): + """ + Return the number of microseconds since midnight. 
+ + Returns + ------- + ndarray[int64_t] + """ values = self.asi8 if self.tz is not None and not timezones.is_utc(self.tz): values = self._data._local_timestamps() - return fields.get_time_micros(values) + + nanos = values % (24 * 3600 * 1_000_000_000) + micros = nanos // 1000 + + micros[self._isnan] = -1 + return micros def to_series(self, keep_tz=lib.no_default, index=None, name=None): """ @@ -1094,6 +1105,6 @@ def bdate_range( ) -def _time_to_micros(time): - seconds = time.hour * 60 * 60 + 60 * time.minute + time.second - return 1000000 * seconds + time.microsecond +def _time_to_micros(time_obj: time) -> int: + seconds = time_obj.hour * 60 * 60 + 60 * time_obj.minute + time_obj.second + return 1_000_000 * seconds + time_obj.microsecond diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 6d6193ceaf27d..5d2c6daba3f57 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -471,6 +471,16 @@ def test_get_loc(self): with pytest.raises(NotImplementedError, match=msg): idx.get_loc(time(12, 30), method="pad") + def test_get_loc_time_nat(self): + # GH#35114 + # Case where key's total microseconds happens to match iNaT % 1e6 // 1000 + tic = time(minute=12, second=43, microsecond=145224) + dti = pd.DatetimeIndex([pd.NaT]) + + loc = dti.get_loc(tic) + expected = np.array([], dtype=np.intp) + tm.assert_numpy_array_equal(loc, expected) + def test_get_loc_tz_aware(self): # https://github.com/pandas-dev/pandas/issues/32140 dti = pd.date_range( From d118f892a39d1b5cac45415b0cc94ba7681d112a Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 7 Jul 2020 14:19:59 +0100 Subject: [PATCH 0293/1025] CLN: Index._format_with_header (remove kwargs etc.) 
(#35118) --- pandas/core/indexes/base.py | 27 ++++++++++++++++----------- pandas/core/indexes/category.py | 9 +++++++++ pandas/core/indexes/datetimelike.py | 6 ++++-- pandas/core/indexes/interval.py | 6 +++--- pandas/core/indexes/range.py | 4 ++-- pandas/io/formats/style.py | 5 ++++- 6 files changed, 38 insertions(+), 19 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b12a556a8291d..2f12a2e4c27ea 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2,7 +2,16 @@ from datetime import datetime import operator from textwrap import dedent -from typing import TYPE_CHECKING, Any, Callable, FrozenSet, Hashable, Optional, Union +from typing import ( + TYPE_CHECKING, + Any, + Callable, + FrozenSet, + Hashable, + List, + Optional, + Union, +) import warnings import numpy as np @@ -910,15 +919,12 @@ def format(self, name: bool = False, formatter=None, **kwargs): return self._format_with_header(header, **kwargs) - def _format_with_header(self, header, na_rep="NaN", **kwargs): - values = self._values - + def _format_with_header(self, header, na_rep="NaN") -> List[str_t]: from pandas.io.formats.format import format_array - if is_categorical_dtype(values.dtype): - values = np.array(values) + values = self._values - elif is_object_dtype(values.dtype): + if is_object_dtype(values.dtype): values = lib.maybe_convert_objects(values, safe=1) if is_object_dtype(values.dtype): @@ -929,10 +935,9 @@ def _format_with_header(self, header, na_rep="NaN", **kwargs): if mask.any(): result = np.array(result) result[mask] = na_rep - result = result.tolist() - + result = result.tolist() # type: ignore else: - result = _trim_front(format_array(values, None, justify="left")) + result = trim_front(format_array(values, None, justify="left")) return header + result def to_native_types(self, slicer=None, **kwargs): @@ -5611,7 +5616,7 @@ def ensure_has_len(seq): return seq -def _trim_front(strings): +def trim_front(strings: List[str]) -> 
List[str]: """ Trims zeros and decimal points. """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2a79c83de7ef2..b0b008de69a94 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -347,6 +347,15 @@ def _format_attrs(self): attrs.append(("length", len(self))) return attrs + def _format_with_header(self, header, na_rep="NaN") -> List[str]: + from pandas.io.formats.format import format_array + + formatted_values = format_array( + self._values, formatter=None, na_rep=na_rep, justify="left" + ) + result = ibase.trim_front(formatted_values) + return header + result + # -------------------------------------------------------------------- @property diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 49b8ec3276e37..7be6aa50fa16b 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -338,8 +338,10 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): # -------------------------------------------------------------------- # Rendering Methods - def _format_with_header(self, header, na_rep="NaT", **kwargs): - return header + list(self._format_native_types(na_rep, **kwargs)) + def _format_with_header(self, header, na_rep="NaT", date_format=None) -> List[str]: + return header + list( + self._format_native_types(na_rep=na_rep, date_format=date_format) + ) @property def _formatter_func(self): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index f7a7b382b853f..9548ebbd9c3b2 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,7 +1,7 @@ """ define the IntervalIndex """ from operator import le, lt import textwrap -from typing import Any, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import numpy as np @@ -948,8 +948,8 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): # Rendering Methods # __repr__ 
associated methods are based on MultiIndex - def _format_with_header(self, header, **kwargs): - return header + list(self._format_native_types(**kwargs)) + def _format_with_header(self, header, na_rep="NaN") -> List[str]: + return header + list(self._format_native_types(na_rep=na_rep)) def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): # GH 28210: use base method but with different default na_rep diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 49a0f0fb7ae92..6d9fd6efe54a3 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, Optional +from typing import Any, List, Optional import warnings import numpy as np @@ -197,7 +197,7 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None - def _format_with_header(self, header, na_rep="NaN", **kwargs): + def _format_with_header(self, header, na_rep="NaN") -> List[str]: return header + list(map(pprint_thing, self._range)) # -------------------------------------------------------------------- diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index f7ba4750bc2ad..6250e99252928 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1524,7 +1524,10 @@ def _get_level_lengths(index, hidden_elements=None): Result is a dictionary of (level, initial_position): span """ - levels = index.format(sparsify=lib.no_default, adjoin=False, names=False) + if isinstance(index, pd.MultiIndex): + levels = index.format(sparsify=lib.no_default, adjoin=False) + else: + levels = index.format() if hidden_elements is None: hidden_elements = [] From e1325811a03909df7aedac3184a78095512df3ee Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 7 Jul 2020 20:19:26 +0100 Subject: [PATCH 0294/1025] TYP, DOC, CLN:SeriesGroupBy._wrap_applied_output (#35120) --- pandas/core/groupby/generic.py | 32 
++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6f956a3dcc9b6..ebb9d82766c1b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -19,6 +19,7 @@ Iterable, List, Mapping, + Optional, Sequence, Tuple, Type, @@ -30,7 +31,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries +from pandas._typing import FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -413,12 +414,31 @@ def _wrap_transformed_output( assert isinstance(result, Series) return result - def _wrap_applied_output(self, keys, values, not_indexed_same=False): + def _wrap_applied_output( + self, keys: Index, values: Optional[List[Any]], not_indexed_same: bool = False + ) -> FrameOrSeriesUnion: + """ + Wrap the output of SeriesGroupBy.apply into the expected result. + + Parameters + ---------- + keys : Index + Keys of groups that Series was grouped by. + values : Optional[List[Any]] + Applied output for each group. + not_indexed_same : bool, default False + Whether the applied outputs are not indexed the same as the group axes. 
+ + Returns + ------- + DataFrame or Series + """ if len(keys) == 0: # GH #6265 return self.obj._constructor( [], name=self._selection_name, index=keys, dtype=np.float64 ) + assert values is not None def _get_index() -> Index: if self.grouper.nkeys > 1: @@ -430,7 +450,7 @@ def _get_index() -> Index: if isinstance(values[0], dict): # GH #823 #24880 index = _get_index() - result = self._reindex_output( + result: FrameOrSeriesUnion = self._reindex_output( self.obj._constructor_expanddim(values, index=index) ) # if self.observed is False, @@ -438,11 +458,7 @@ def _get_index() -> Index: result = result.stack(dropna=self.observed) result.name = self._selection_name return result - - if isinstance(values[0], Series): - return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) - elif isinstance(values[0], DataFrame): - # possible that Series -> DataFrame by applied function + elif isinstance(values[0], (Series, DataFrame)): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: # GH #6265 #24880 From 56093615405a2e3d0ca55cc971864bd1a7491a65 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 7 Jul 2020 20:24:24 +0100 Subject: [PATCH 0295/1025] CI: add validate_unwanted_patterns to known_third_parties (#35021) --- scripts/tests/test_validate_unwanted_patterns.py | 1 - setup.cfg | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/tests/test_validate_unwanted_patterns.py b/scripts/tests/test_validate_unwanted_patterns.py index b6cfa20cd7ca0..947666a730ee9 100644 --- a/scripts/tests/test_validate_unwanted_patterns.py +++ b/scripts/tests/test_validate_unwanted_patterns.py @@ -1,7 +1,6 @@ import io import pytest - import validate_unwanted_patterns diff --git a/setup.cfg b/setup.cfg index 074b0b6bdff71..00af7f6f1b79a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -105,7 +105,7 @@ known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = 
FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER known_first_party = pandas -known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml,odf +known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,validate_unwanted_patterns,yaml,odf multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 From 2603c9e31917bc524f444470ee8ef11b3f75585a Mon Sep 17 00:00:00 2001 From: Suvayu Ali Date: Wed, 8 Jul 2020 11:47:08 +0000 Subject: [PATCH 0296/1025] BUG: incorrect type when indexing sparse dataframe with iterable (#34908) * TST: regression tests for indexing sparse dataframe with iterable closes #34526 --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/arrays/sparse/array.py | 7 +-- pandas/tests/frame/indexing/test_indexing.py | 15 ------ pandas/tests/frame/indexing/test_sparse.py | 51 ++++++++++++++++++++ 4 files changed, 54 insertions(+), 20 deletions(-) create mode 100644 pandas/tests/frame/indexing/test_sparse.py diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cee41f248fc60..386fe3ce2160f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1124,6 +1124,7 @@ Sparse - Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. 
Previously it used ``fill_value``'s string representation (:issue:`34352`) - Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`) +- Bug in :meth:`arrays.SparseArray` was returning the incorrect type when indexing a sparse dataframe with an iterable (:issue:`34526`, :issue:`34540`) ExtensionArray ^^^^^^^^^^^^^^ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4996a10002c63..b18a58da3950f 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -866,11 +866,8 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: if self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values - taken = np.full( - sp_indexer.shape, - fill_value=fill_value, - dtype=np.result_type(type(fill_value)), - ) + _dtype = np.result_type(self.dtype.subtype, type(fill_value)) + taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) else: taken = self.sp_values.take(sp_indexer) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 3865ea64ee479..3fa3c9303806f 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -21,7 +21,6 @@ notna, ) import pandas._testing as tm -from pandas.arrays import SparseArray import pandas.core.common as com from pandas.core.indexing import IndexingError @@ -1907,20 +1906,6 @@ def test_getitem_ix_float_duplicates(self): expect = df.iloc[[1, -1], 0] tm.assert_series_equal(df.loc[0.2, "a"], expect) - def test_getitem_sparse_column(self): - # https://github.com/pandas-dev/pandas/issues/23559 - data = SparseArray([0, 1]) - df = pd.DataFrame({"A": data}) - expected = pd.Series(data, name="A") - result = df["A"] - tm.assert_series_equal(result, expected) - - result = df.iloc[:, 0] - tm.assert_series_equal(result, expected) - - result = df.loc[:, "A"] - tm.assert_series_equal(result, expected) - def 
test_setitem_with_unaligned_tz_aware_datetime_column(self): # GH 12981 # Assignment of unaligned offset-aware datetime series. diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py new file mode 100644 index 0000000000000..876fbe212c466 --- /dev/null +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -0,0 +1,51 @@ +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +import pandas as pd +import pandas._testing as tm +from pandas.arrays import SparseArray +from pandas.core.arrays.sparse import SparseDtype + + +class TestSparseDataFrameIndexing: + def test_getitem_sparse_column(self): + # https://github.com/pandas-dev/pandas/issues/23559 + data = SparseArray([0, 1]) + df = pd.DataFrame({"A": data}) + expected = pd.Series(data, name="A") + result = df["A"] + tm.assert_series_equal(result, expected) + + result = df.iloc[:, 0] + tm.assert_series_equal(result, expected) + + result = df.loc[:, "A"] + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"]) + @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex]) + @td.skip_if_no_scipy + def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): + import scipy.sparse + + spmatrix_t = getattr(scipy.sparse, spmatrix_t) + + # The bug is triggered by a sparse matrix with purely sparse columns. So the + # recipe below generates a rectangular matrix of dimension (5, 7) where all the + # diagonal cells are ones, meaning the last two columns are purely sparse. 
+ rows, cols = 5, 7 + spmatrix = spmatrix_t(np.eye(rows, cols, dtype=dtype), dtype=dtype) + df = pd.DataFrame.sparse.from_spmatrix(spmatrix) + + # regression test for #34526 + itr_idx = range(2, rows) + result = df.loc[itr_idx].values + expected = spmatrix.toarray()[itr_idx] + tm.assert_numpy_array_equal(result, expected) + + # regression test for #34540 + result = df.loc[itr_idx].dtypes.values + expected = np.full(cols, SparseDtype(dtype, fill_value=0)) + tm.assert_numpy_array_equal(result, expected) From 87c04f0d38c7bebcb2ff4a6b9ac5df43fb2e2e2f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 8 Jul 2020 07:40:45 -0500 Subject: [PATCH 0297/1025] Fixed Series.apply performance regression (#35166) --- pandas/core/apply.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 9c223d66b727b..d4be660939773 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -291,16 +291,14 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"] res_index = res_index.take(successes) else: - for i, v in series_gen_enumeration: - - with option_context("mode.chained_assignment", None): + with option_context("mode.chained_assignment", None): + for i, v in series_gen_enumeration: # ignore SettingWithCopy here in case the user mutates results[i] = self.f(v) - - if isinstance(results[i], ABCSeries): - # If we have a view on v, we need to make a copy because - # series_generator will swap out the underlying data - results[i] = results[i].copy(deep=False) + if isinstance(results[i], ABCSeries): + # If we have a view on v, we need to make a copy because + # series_generator will swap out the underlying data + results[i] = results[i].copy(deep=False) return results, res_index From e93e53c2331a0db6b7792b27748aa11c0c6310e7 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Wed, 8 Jul 2020 08:43:18 -0400 Subject: [PATCH 0298/1025] Fix regression on datetime in MultiIndex (#35140) 
--- pandas/core/indexing.py | 4 +++ .../indexing/multiindex/test_datetime.py | 30 ++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 708b687434327..04d1dbceb3342 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1165,6 +1165,10 @@ def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): if len(key) == labels.nlevels: return {"key": key} raise + except InvalidIndexError: + # GH35015, using datetime as column indices raises exception + if not isinstance(labels, ABCMultiIndex): + raise except TypeError: pass except ValueError: diff --git a/pandas/tests/indexing/multiindex/test_datetime.py b/pandas/tests/indexing/multiindex/test_datetime.py index 907d20cd5bd53..a49cb0bc2c43e 100644 --- a/pandas/tests/indexing/multiindex/test_datetime.py +++ b/pandas/tests/indexing/multiindex/test_datetime.py @@ -2,7 +2,16 @@ import numpy as np -from pandas import Index, Period, Series, period_range +from pandas import ( + DataFrame, + Index, + MultiIndex, + Period, + Series, + period_range, + to_datetime, +) +import pandas._testing as tm def test_multiindex_period_datetime(): @@ -20,3 +29,22 @@ def test_multiindex_period_datetime(): # try datetime as index result = s.loc["a", datetime(2012, 1, 1)] assert result == expected + + +def test_multiindex_datetime_columns(): + # GH35015, using datetime as column indices raises exception + + mi = MultiIndex.from_tuples( + [(to_datetime("02/29/2020"), to_datetime("03/01/2020"))], names=["a", "b"] + ) + + df = DataFrame([], columns=mi) + + expected_df = DataFrame( + [], + columns=MultiIndex.from_arrays( + [[to_datetime("02/29/2020")], [to_datetime("03/01/2020")]], names=["a", "b"] + ), + ) + + tm.assert_frame_equal(df, expected_df) From 4b39fde67eaf34a3e57bdc667845ccf9ae78f1e4 Mon Sep 17 00:00:00 2001 From: Robin to Roxel <35864265+r-toroxel@users.noreply.github.com> Date: Wed, 8 Jul 2020 14:52:25 +0200 Subject: [PATCH 
0299/1025] TST add test case for drop_duplicates (#35121) --- .../frame/methods/test_drop_duplicates.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py index 7c6391140e2bb..cebec215a0d9d 100644 --- a/pandas/tests/frame/methods/test_drop_duplicates.py +++ b/pandas/tests/frame/methods/test_drop_duplicates.py @@ -333,64 +333,73 @@ def test_drop_duplicates_inplace(): ) # single column df = orig.copy() - df.drop_duplicates("A", inplace=True) + return_value = df.drop_duplicates("A", inplace=True) expected = orig[:2] result = df tm.assert_frame_equal(result, expected) + assert return_value is None df = orig.copy() - df.drop_duplicates("A", keep="last", inplace=True) + return_value = df.drop_duplicates("A", keep="last", inplace=True) expected = orig.loc[[6, 7]] result = df tm.assert_frame_equal(result, expected) + assert return_value is None df = orig.copy() - df.drop_duplicates("A", keep=False, inplace=True) + return_value = df.drop_duplicates("A", keep=False, inplace=True) expected = orig.loc[[]] result = df tm.assert_frame_equal(result, expected) assert len(df) == 0 + assert return_value is None # multi column df = orig.copy() - df.drop_duplicates(["A", "B"], inplace=True) + return_value = df.drop_duplicates(["A", "B"], inplace=True) expected = orig.loc[[0, 1, 2, 3]] result = df tm.assert_frame_equal(result, expected) + assert return_value is None df = orig.copy() - df.drop_duplicates(["A", "B"], keep="last", inplace=True) + return_value = df.drop_duplicates(["A", "B"], keep="last", inplace=True) expected = orig.loc[[0, 5, 6, 7]] result = df tm.assert_frame_equal(result, expected) + assert return_value is None df = orig.copy() - df.drop_duplicates(["A", "B"], keep=False, inplace=True) + return_value = df.drop_duplicates(["A", "B"], keep=False, inplace=True) expected = orig.loc[[0]] result = df tm.assert_frame_equal(result, 
expected) + assert return_value is None # consider everything orig2 = orig.loc[:, ["A", "B", "C"]].copy() df2 = orig2.copy() - df2.drop_duplicates(inplace=True) + return_value = df2.drop_duplicates(inplace=True) # in this case only expected = orig2.drop_duplicates(["A", "B"]) result = df2 tm.assert_frame_equal(result, expected) + assert return_value is None df2 = orig2.copy() - df2.drop_duplicates(keep="last", inplace=True) + return_value = df2.drop_duplicates(keep="last", inplace=True) expected = orig2.drop_duplicates(["A", "B"], keep="last") result = df2 tm.assert_frame_equal(result, expected) + assert return_value is None df2 = orig2.copy() - df2.drop_duplicates(keep=False, inplace=True) + return_value = df2.drop_duplicates(keep=False, inplace=True) expected = orig2.drop_duplicates(["A", "B"], keep=False) result = df2 tm.assert_frame_equal(result, expected) + assert return_value is None @pytest.mark.parametrize("inplace", [True, False]) From 3890e6b6e98639faf08cf803fd1f860e81800fbb Mon Sep 17 00:00:00 2001 From: Mak Sze Chun Date: Wed, 8 Jul 2020 22:38:15 +0800 Subject: [PATCH 0300/1025] ENH: Add argmax and argmin to ExtensionArray (#27801) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/arrays/base.py | 36 +++++++++++++++++++++++++- pandas/core/sorting.py | 27 +++++++++++++++++++ pandas/tests/extension/base/methods.py | 36 ++++++++++++++++++++++++++ pandas/tests/extension/test_boolean.py | 17 ++++++++++++ pandas/tests/extension/test_sparse.py | 8 ++++++ 6 files changed, 124 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 386fe3ce2160f..46e0d2a1164e1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -280,6 +280,7 @@ Other enhancements - :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) - `OptionError` is now exposed in 
`pandas.errors` (:issue:`27553`) +- Add :meth:`ExtensionArray.argmax` and :meth:`ExtensionArray.argmin` (:issue:`24382`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 5565b85f8d59a..32a2a30fcfd43 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -28,7 +28,7 @@ from pandas.core import ops from pandas.core.algorithms import _factorize_array, unique from pandas.core.missing import backfill_1d, pad_1d -from pandas.core.sorting import nargsort +from pandas.core.sorting import nargminmax, nargsort _extension_array_shared_docs: Dict[str, str] = dict() @@ -533,6 +533,40 @@ def argsort( result = nargsort(self, kind=kind, ascending=ascending, na_position="last") return result + def argmin(self): + """ + Return the index of minimum value. + + In case of multiple occurrences of the minimum value, the index + corresponding to the first occurrence is returned. + + Returns + ------- + int + + See Also + -------- + ExtensionArray.argmax + """ + return nargminmax(self, "argmin") + + def argmax(self): + """ + Return the index of maximum value. + + In case of multiple occurrences of the maximum value, the index + corresponding to the first occurrence is returned. + + Returns + ------- + int + + See Also + -------- + ExtensionArray.argmin + """ + return nargminmax(self, "argmax") + def fillna(self, value=None, method=None, limit=None): """ Fill NA/NaN values using the specified method. 
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index da9cbe1023599..ee73aa42701b0 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -319,6 +319,33 @@ def nargsort( return indexer +def nargminmax(values, method: str): + """ + Implementation of np.argmin/argmax but for ExtensionArray and which + handles missing values. + + Parameters + ---------- + values : ExtensionArray + method : {"argmax", "argmin"} + + Returns + ------- + int + """ + assert method in {"argmax", "argmin"} + func = np.argmax if method == "argmax" else np.argmin + + mask = np.asarray(isna(values)) + values = values._values_for_argsort() + + idx = np.arange(len(values)) + non_nans = values[~mask] + non_nan_idx = idx[~mask] + + return non_nan_idx[func(non_nans)] + + def ensure_key_mapped_multiindex(index, key: Callable, level=None): """ Returns a new MultiIndex in which key has been applied diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 874a8dfd4253f..5e1cf30efd534 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -75,6 +75,42 @@ def test_argsort_missing(self, data_missing_for_sorting): expected = pd.Series(np.array([1, -1, 0], dtype=np.int64)) self.assert_series_equal(result, expected) + def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value): + # GH 24382 + + # data_for_sorting -> [B, C, A] with A < B < C + assert data_for_sorting.argmax() == 1 + assert data_for_sorting.argmin() == 2 + + # with repeated values -> first occurence + data = data_for_sorting.take([2, 0, 0, 1, 1, 2]) + assert data.argmax() == 3 + assert data.argmin() == 0 + + # with missing values + # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing. 
+ assert data_missing_for_sorting.argmax() == 0 + assert data_missing_for_sorting.argmin() == 2 + + @pytest.mark.parametrize( + "method", ["argmax", "argmin"], + ) + def test_argmin_argmax_empty_array(self, method, data): + # GH 24382 + err_msg = "attempt to get" + with pytest.raises(ValueError, match=err_msg): + getattr(data[:0], method)() + + @pytest.mark.parametrize( + "method", ["argmax", "argmin"], + ) + def test_argmin_argmax_all_na(self, method, data, na_value): + # all missing with skipna=True is the same as emtpy + err_msg = "attempt to get" + data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype) + with pytest.raises(ValueError, match=err_msg): + getattr(data_na, method)() + @pytest.mark.parametrize( "na_position, expected", [ diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index 725067951eeef..8acbeaf0b8170 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -235,6 +235,23 @@ def test_searchsorted(self, data_for_sorting, as_series): def test_value_counts(self, all_data, dropna): return super().test_value_counts(all_data, dropna) + def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting): + # override because there are only 2 unique values + + # data_for_sorting -> [B, C, A] with A < B < C -> here True, True, False + assert data_for_sorting.argmax() == 0 + assert data_for_sorting.argmin() == 2 + + # with repeated values -> first occurence + data = data_for_sorting.take([2, 0, 0, 1, 1, 2]) + assert data.argmax() == 1 + assert data.argmin() == 0 + + # with missing values + # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing. 
+ assert data_missing_for_sorting.argmax() == 0 + assert data_missing_for_sorting.argmin() == 2 + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index f318934ef5e52..68e521b005c02 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -321,6 +321,14 @@ def test_shift_0_periods(self, data): data._sparse_values[0] = data._sparse_values[1] assert result._sparse_values[0] != result._sparse_values[1] + @pytest.mark.parametrize( + "method", ["argmax", "argmin"], + ) + def test_argmin_argmax_all_na(self, method, data, na_value): + # overriding because Sparse[int64, 0] cannot handle na_value + self._check_unsupported(data) + super().test_argmin_argmax_all_na(method, data, na_value) + @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame]) def test_equals(self, data, na_value, as_series, box): self._check_unsupported(data) From 5ab5bf5516f643b7956cc496469327e6ad6590f3 Mon Sep 17 00:00:00 2001 From: Thomas Smith Date: Wed, 8 Jul 2020 16:35:25 +0100 Subject: [PATCH 0301/1025] TST: add test to ensure that df.groupby() returns the missing categories when grouping on 2 pd.Categoricals (#35022) --- pandas/tests/groupby/test_categorical.py | 174 ++++++++++++++++++----- 1 file changed, 142 insertions(+), 32 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 4de61f719dfbb..118d928ac02f4 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -36,6 +36,41 @@ def f(a): return result.reindex(index).sort_index() +_results_for_groupbys_with_missing_categories = dict( + # This maps the builtin groupby functions to their expected outputs for + # missing categories when they are called on a categorical grouper with + # observed=False. Some functions are expected to return NaN, some zero. 
+ # These expected values can be used across several tests (i.e. they are + # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be + # hardcoded in one place. + [ + ("all", np.NaN), + ("any", np.NaN), + ("count", 0), + ("corrwith", np.NaN), + ("first", np.NaN), + ("idxmax", np.NaN), + ("idxmin", np.NaN), + ("last", np.NaN), + ("mad", np.NaN), + ("max", np.NaN), + ("mean", np.NaN), + ("median", np.NaN), + ("min", np.NaN), + ("nth", np.NaN), + ("nunique", 0), + ("prod", np.NaN), + ("quantile", np.NaN), + ("sem", np.NaN), + ("size", 0), + ("skew", np.NaN), + ("std", np.NaN), + ("sum", 0), + ("var", np.NaN), + ] +) + + def test_apply_use_categorical_name(df): cats = qcut(df.C, 4) @@ -1263,12 +1298,13 @@ def test_series_groupby_on_2_categoricals_unobserved( reduction_func: str, observed: bool, request ): # GH 17605 - if reduction_func == "ngroup": pytest.skip("ngroup is not truly a reduction") if reduction_func == "corrwith": # GH 32293 - mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith") + mark = pytest.mark.xfail( + reason="TODO: implemented SeriesGroupBy.corrwith. 
See GH 32293" + ) request.node.add_marker(mark) df = pd.DataFrame( @@ -1289,36 +1325,30 @@ def test_series_groupby_on_2_categoricals_unobserved( assert len(result) == expected_length -@pytest.mark.parametrize( - "func, zero_or_nan", - [ - ("all", np.NaN), - ("any", np.NaN), - ("count", 0), - ("first", np.NaN), - ("idxmax", np.NaN), - ("idxmin", np.NaN), - ("last", np.NaN), - ("mad", np.NaN), - ("max", np.NaN), - ("mean", np.NaN), - ("median", np.NaN), - ("min", np.NaN), - ("nth", np.NaN), - ("nunique", 0), - ("prod", np.NaN), - ("quantile", np.NaN), - ("sem", np.NaN), - ("size", 0), - ("skew", np.NaN), - ("std", np.NaN), - ("sum", np.NaN), - ("var", np.NaN), - ], -) -def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan): +def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( + reduction_func: str, request +): # GH 17605 # Tests whether the unobserved categories in the result contain 0 or NaN + + if reduction_func == "ngroup": + pytest.skip("ngroup is not truly a reduction") + + if reduction_func == "corrwith": # GH 32293 + mark = pytest.mark.xfail( + reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293" + ) + request.node.add_marker(mark) + + if reduction_func == "sum": # GH 31422 + mark = pytest.mark.xfail( + reason=( + "sum should return 0 but currently returns NaN. " + "This is a known bug. See GH 31422." 
+ ) + ) + request.node.add_marker(mark) + df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1327,12 +1357,14 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o } ) unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")] - args = {"nth": [0]}.get(func, []) + args = {"nth": [0]}.get(reduction_func, []) series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"] - agg = getattr(series_groupby, func) + agg = getattr(series_groupby, reduction_func) result = agg(*args) + zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func] + for idx in unobserved: val = result.loc[idx] assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) @@ -1342,6 +1374,84 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o assert np.issubdtype(result.dtype, np.integer) +def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str): + # GH 23865 + # GH 27075 + # Ensure that df.groupby, when 'by' is two pd.Categorical variables, + # does not return the categories that are not in df when observed=True + if reduction_func == "ngroup": + pytest.skip("ngroup does not return the Categories on the index") + + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("1111"), categories=list("12")), + "value": [0.1, 0.1, 0.1, 0.1], + } + ) + unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")] + + df_grp = df.groupby(["cat_1", "cat_2"], observed=True) + + args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) + res = getattr(df_grp, reduction_func)(*args) + + for cat in unobserved_cats: + assert cat not in res.index + + +@pytest.mark.parametrize("observed", [False, None]) +def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( + reduction_func: str, observed: bool, request +): + # GH 23865 + # GH 27075 + # Ensure 
that df.groupby, when 'by' is two pd.Categorical variables, + # returns the categories that are not in df when observed=False/None + + if reduction_func == "ngroup": + pytest.skip("ngroup does not return the Categories on the index") + + if reduction_func == "count": # GH 35028 + mark = pytest.mark.xfail( + reason=( + "DataFrameGroupBy.count returns np.NaN for missing " + "categories, when it should return 0. See GH 35028" + ) + ) + request.node.add_marker(mark) + + if reduction_func == "sum": # GH 31422 + mark = pytest.mark.xfail( + reason=( + "sum should return 0 but currently returns NaN. " + "This is a known bug. See GH 31422." + ) + ) + request.node.add_marker(mark) + + df = pd.DataFrame( + { + "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), + "cat_2": pd.Categorical(list("1111"), categories=list("12")), + "value": [0.1, 0.1, 0.1, 0.1], + } + ) + unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")] + + df_grp = df.groupby(["cat_1", "cat_2"], observed=observed) + + args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) + res = getattr(df_grp, reduction_func)(*args) + + expected = _results_for_groupbys_with_missing_categories[reduction_func] + + if expected is np.nan: + assert res.loc[unobserved_cats].isnull().all().all() + else: + assert (res.loc[unobserved_cats] == expected).all().all() + + def test_series_groupby_categorical_aggregation_getitem(): # GH 8870 d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]} From 40e0e9f15093ad5851a5470a3ba971efeee5c48a Mon Sep 17 00:00:00 2001 From: Vijay Sai Mutyala Date: Wed, 8 Jul 2020 15:36:28 +0000 Subject: [PATCH 0302/1025] =?UTF-8?q?DOC=20-=20Moving=20Tips=20and=20Trick?= =?UTF-8?q?=20from=20wiki=20to=20Style=20Guide=20-=20added=20Reading=20?= =?UTF-8?q?=E2=80=A6=20(#34366)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/development/code_style.rst | 15 +++++++++++++++ 1 file changed, 15 
insertions(+) diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst index 6d33537a40175..11d0c35f92ff5 100644 --- a/doc/source/development/code_style.rst +++ b/doc/source/development/code_style.rst @@ -159,3 +159,18 @@ For example: # wrong from common import test_base + + +Miscellaneous +============= + +Reading from a url +------------------ + +**Good:** + +.. code-block:: python + + from pandas.io.common import urlopen + with urlopen('http://www.google.com') as url: + raw_text = url.read() From 13203b7650b3a64d482ca7170c7377c05b7e72f7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 8 Jul 2020 18:15:55 +0200 Subject: [PATCH 0303/1025] ENH: concat of nullable int + bool preserves int dtype (#34985) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/arrays/integer.py | 9 +++-- pandas/tests/arrays/integer/test_concat.py | 45 +++++++++++++++++++++- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 46e0d2a1164e1..24283d2c2e48d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -322,7 +322,7 @@ Other enhancements - :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example combining a nullable integer column with a numpy integer column will no longer - result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`). + result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`, :issue:`34095`). - :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). - :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). 
- :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index df43b5d6115ba..7be7ef3637ee5 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -92,10 +92,13 @@ def construct_array_type(cls) -> Type["IntegerArray"]: return IntegerArray def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: - # for now only handle other integer types + # we only handle nullable EA dtypes and numeric numpy dtypes if not all( - isinstance(t, _IntegerDtype) - or (isinstance(t, np.dtype) and np.issubdtype(t, np.integer)) + isinstance(t, BaseMaskedDtype) + or ( + isinstance(t, np.dtype) + and (np.issubdtype(t, np.number) or np.issubdtype(t, np.bool_)) + ) for t in dtypes ): return None diff --git a/pandas/tests/arrays/integer/test_concat.py b/pandas/tests/arrays/integer/test_concat.py index 3ace35700bd3e..fc24709deb82c 100644 --- a/pandas/tests/arrays/integer/test_concat.py +++ b/pandas/tests/arrays/integer/test_concat.py @@ -1,3 +1,4 @@ +import numpy as np import pytest import pandas as pd @@ -15,12 +16,52 @@ (["Int32", "UInt32"], "Int64"), # this still gives object (awaiting float extension dtype) (["Int64", "UInt64"], "object"), + (["Int64", "boolean"], "Int64"), + (["UInt8", "boolean"], "UInt8"), ], ) def test_concat_series(to_concat_dtypes, result_dtype): - result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes]) - expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype( + result = pd.concat([pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes]) + expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype( result_dtype ) tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat( + [pd.Series([0, 1, pd.NA], dtype=t) for t in 
to_concat_dtypes[::-1]] + ) + expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype( + result_dtype + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + (["Int64", "int64"], "Int64"), + (["UInt64", "uint64"], "UInt64"), + (["Int8", "int8"], "Int8"), + (["Int8", "int16"], "Int16"), + (["UInt8", "int8"], "Int16"), + (["Int32", "uint32"], "Int64"), + # this still gives object (awaiting float extension dtype) + (["Int64", "uint64"], "object"), + (["Int64", "bool"], "Int64"), + (["UInt8", "bool"], "UInt8"), + ], +) +def test_concat_series_with_numpy(to_concat_dtypes, result_dtype): + + s1 = pd.Series([0, 1, pd.NA], dtype=to_concat_dtypes[0]) + s2 = pd.Series(np.array([0, 1], dtype=to_concat_dtypes[1])) + result = pd.concat([s1, s2], ignore_index=True) + expected = pd.Series([0, 1, pd.NA, 0, 1], dtype=object).astype(result_dtype) + tm.assert_series_equal(result, expected) + + # order doesn't matter for result + result = pd.concat([s2, s1], ignore_index=True) + expected = pd.Series([0, 1, 0, 1, pd.NA], dtype=object).astype(result_dtype) + tm.assert_series_equal(result, expected) From 89c64843a4cc0ab1e2b2414fbed5769553ba599e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 8 Jul 2020 14:50:16 -0700 Subject: [PATCH 0304/1025] REF: collect get_dst_info-using functions in tslibs.vectorized (#35168) --- asv_bench/benchmarks/tslibs/resolution.py | 5 +- asv_bench/benchmarks/tslibs/tslib.py | 5 +- pandas/_libs/tslib.pyx | 170 +-------- pandas/_libs/tslibs/__init__.py | 12 + pandas/_libs/tslibs/conversion.pxd | 1 - pandas/_libs/tslibs/conversion.pyx | 127 ------- pandas/_libs/tslibs/offsets.pyx | 3 +- pandas/_libs/tslibs/period.pxd | 5 + pandas/_libs/tslibs/period.pyx | 57 --- pandas/_libs/tslibs/resolution.pyx | 98 +---- pandas/_libs/tslibs/vectorized.pyx | 440 ++++++++++++++++++++++ pandas/core/arrays/datetimes.py | 20 +- pandas/core/arrays/period.py | 3 +- pandas/core/dtypes/cast.py | 5 
+- pandas/core/indexes/datetimes.py | 12 +- pandas/tests/tslibs/test_api.py | 6 + setup.py | 2 + 17 files changed, 504 insertions(+), 467 deletions(-) create mode 100644 pandas/_libs/tslibs/vectorized.pyx diff --git a/asv_bench/benchmarks/tslibs/resolution.py b/asv_bench/benchmarks/tslibs/resolution.py index 274aa1ad6d4a9..280be7932d4db 100644 --- a/asv_bench/benchmarks/tslibs/resolution.py +++ b/asv_bench/benchmarks/tslibs/resolution.py @@ -23,7 +23,10 @@ import numpy as np import pytz -from pandas._libs.tslibs.resolution import get_resolution +try: + from pandas._libs.tslibs import get_resolution +except ImportError: + from pandas._libs.tslibs.resolution import get_resolution class TimeResolution: diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index eacf5a5731dc2..5952a402bf89a 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -21,7 +21,10 @@ import numpy as np import pytz -from pandas._libs.tslib import ints_to_pydatetime +try: + from pandas._libs.tslibs import ints_to_pydatetime +except ImportError: + from pandas._libs.tslib import ints_to_pydatetime _tzs = [ None, diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 3472dbf161b8e..d70d0378a2621 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -4,18 +4,14 @@ from cpython.datetime cimport ( PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, - date, datetime, - time, - timedelta, - tzinfo, ) # import datetime C API PyDateTime_IMPORT cimport numpy as cnp -from numpy cimport float64_t, int64_t, ndarray, uint8_t, intp_t +from numpy cimport float64_t, int64_t, ndarray import numpy as np cnp.import_array() @@ -42,11 +38,6 @@ from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import parse_datetime_string -from pandas._libs.tslibs.timezones cimport ( - get_dst_info, - is_utc, - is_tzlocal, -) from pandas._libs.tslibs.conversion cimport ( _TSObject, 
cast_from_unit, @@ -60,13 +51,10 @@ from pandas._libs.tslibs.nattype cimport ( c_nat_strings as nat_strings, ) -from pandas._libs.tslibs.offsets cimport to_offset - -from pandas._libs.tslibs.timestamps cimport create_timestamp_from_ts, _Timestamp +from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.tzconversion cimport ( - tz_convert_utc_to_tzlocal, tz_localize_to_utc_single, ) @@ -74,160 +62,6 @@ from pandas._libs.tslibs.tzconversion cimport ( from pandas._libs.missing cimport checknull_with_nat_and_na -cdef inline object create_datetime_from_ts( - int64_t value, - npy_datetimestruct dts, - tzinfo tz, - object freq, - bint fold, -): - """ - Convenience routine to construct a datetime.datetime from its parts. - """ - return datetime( - dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold - ) - - -cdef inline object create_date_from_ts( - int64_t value, - npy_datetimestruct dts, - tzinfo tz, - object freq, - bint fold -): - """ - Convenience routine to construct a datetime.date from its parts. - """ - # GH 25057 add fold argument to match other func_create signatures - return date(dts.year, dts.month, dts.day) - - -cdef inline object create_time_from_ts( - int64_t value, - npy_datetimestruct dts, - tzinfo tz, - object freq, - bint fold -): - """ - Convenience routine to construct a datetime.time from its parts. - """ - return time(dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def ints_to_pydatetime( - const int64_t[:] arr, - tzinfo tz=None, - object freq=None, - bint fold=False, - str box="datetime" -): - """ - Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp. 
- - Parameters - ---------- - arr : array of i8 - tz : str, optional - convert to this timezone - freq : str/Offset, optional - freq to convert - fold : bint, default is 0 - Due to daylight saving time, one wall clock time can occur twice - when shifting from summer to winter time; fold describes whether the - datetime-like corresponds to the first (0) or the second time (1) - the wall clock hits the ambiguous time - - .. versionadded:: 1.1.0 - box : {'datetime', 'timestamp', 'date', 'time'}, default 'datetime' - * If datetime, convert to datetime.datetime - * If date, convert to datetime.date - * If time, convert to datetime.time - * If Timestamp, convert to pandas.Timestamp - - Returns - ------- - ndarray of dtype specified by box - """ - cdef: - Py_ssize_t i, n = len(arr) - ndarray[int64_t] trans - int64_t[:] deltas - intp_t[:] pos - npy_datetimestruct dts - object dt, new_tz - str typ - int64_t value, local_value, delta = NPY_NAT # dummy for delta - ndarray[object] result = np.empty(n, dtype=object) - object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) - bint use_utc = False, use_tzlocal = False, use_fixed = False - bint use_pytz = False - - if box == "date": - assert (tz is None), "tz should be None when converting to date" - - func_create = create_date_from_ts - elif box == "timestamp": - func_create = create_timestamp_from_ts - - if isinstance(freq, str): - freq = to_offset(freq) - elif box == "time": - func_create = create_time_from_ts - elif box == "datetime": - func_create = create_datetime_from_ts - else: - raise ValueError( - "box must be one of 'datetime', 'date', 'time' or 'timestamp'" - ) - - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(arr, side="right") - 1 - use_pytz 
= typ == "pytz" - - for i in range(n): - new_tz = tz - value = arr[i] - - if value == NPY_NAT: - result[i] = NaT - else: - if use_utc: - local_value = value - elif use_tzlocal: - local_value = tz_convert_utc_to_tzlocal(value, tz) - elif use_fixed: - local_value = value + delta - elif not use_pytz: - # i.e. dateutil - # no zone-name change for dateutil tzs - dst etc - # represented in single object. - local_value = value + deltas[pos[i]] - else: - # pytz - # find right representation of dst etc in pytz timezone - new_tz = tz._tzinfos[tz._transition_info[pos[i]]] - local_value = value + deltas[pos[i]] - - dt64_to_dtstruct(local_value, &dts) - result[i] = func_create(value, dts, new_tz, freq, fold) - - return result - - def _test_parse_iso8601(ts: str): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 76e356370de70..c2f3478a50ab4 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -11,8 +11,13 @@ "Period", "Resolution", "Timedelta", + "normalize_i8_timestamps", + "is_date_array_normalized", + "dt64arr_to_periodarr", "delta_to_nanoseconds", + "ints_to_pydatetime", "ints_to_pytimedelta", + "get_resolution", "Timestamp", "tz_convert_single", "to_offset", @@ -30,3 +35,10 @@ from .timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta from .timestamps import Timestamp from .tzconversion import tz_convert_single +from .vectorized import ( + dt64arr_to_periodarr, + get_resolution, + ints_to_pydatetime, + is_date_array_normalized, + normalize_i8_timestamps, +) diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 0eb94fecf7d6b..73772e5ab4577 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -25,5 +25,4 @@ cdef int64_t get_datetime64_nanos(object val) except? 
-1 cpdef datetime localize_pydatetime(datetime dt, object tz) cdef int64_t cast_from_unit(object ts, str unit) except? -1 -cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz) cdef int64_t normalize_i8_stamp(int64_t local_val) nogil diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 36a4a1f60d8b9..31d2d0e9572f5 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -765,73 +765,6 @@ cpdef inline datetime localize_pydatetime(datetime dt, object tz): # ---------------------------------------------------------------------- # Normalization - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz): - """ - Normalize each of the (nanosecond) timezone aware timestamps in the given - array by rounding down to the beginning of the day (i.e. midnight). - This is midnight for timezone, `tz`. - - Parameters - ---------- - stamps : int64 ndarray - tz : tzinfo or None - - Returns - ------- - result : int64 ndarray of converted of normalized nanosecond timestamps - """ - cdef: - Py_ssize_t i, n = len(stamps) - int64_t[:] result = np.empty(n, dtype=np.int64) - ndarray[int64_t] trans - int64_t[:] deltas - str typ - Py_ssize_t[:] pos - int64_t delta, local_val - - if tz is None or is_utc(tz): - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] - result[i] = normalize_i8_stamp(local_val) - elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - result[i] = normalize_i8_stamp(local_val) - else: - # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = get_dst_info(tz) - - if typ not in ['pytz', 'dateutil']: - # static/fixed; in this case we know that len(delta) == 1 - delta = deltas[0] - for i in range(n): - 
if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + delta - result[i] = normalize_i8_stamp(local_val) - else: - pos = trans.searchsorted(stamps, side='right') - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + deltas[pos[i]] - result[i] = normalize_i8_stamp(local_val) - - return result.base # `.base` to access underlying ndarray - - @cython.cdivision cdef inline int64_t normalize_i8_stamp(int64_t local_val) nogil: """ @@ -848,63 +781,3 @@ cdef inline int64_t normalize_i8_stamp(int64_t local_val) nogil: cdef: int64_t day_nanos = 24 * 3600 * 1_000_000_000 return local_val - (local_val % day_nanos) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None): - """ - Check if all of the given (nanosecond) timestamps are normalized to - midnight, i.e. hour == minute == second == 0. If the optional timezone - `tz` is not None, then this is midnight for this timezone. 
- - Parameters - ---------- - stamps : int64 ndarray - tz : tzinfo or None - - Returns - ------- - is_normalized : bool True if all stamps are normalized - """ - cdef: - Py_ssize_t i, n = len(stamps) - ndarray[int64_t] trans - int64_t[:] deltas - intp_t[:] pos - int64_t local_val, delta - str typ - int64_t day_nanos = 24 * 3600 * 1_000_000_000 - - if tz is None or is_utc(tz): - for i in range(n): - local_val = stamps[i] - if local_val % day_nanos != 0: - return False - - elif is_tzlocal(tz): - for i in range(n): - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - if local_val % day_nanos != 0: - return False - else: - trans, deltas, typ = get_dst_info(tz) - - if typ not in ['pytz', 'dateutil']: - # static/fixed; in this case we know that len(delta) == 1 - delta = deltas[0] - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + delta - if local_val % day_nanos != 0: - return False - - else: - pos = trans.searchsorted(stamps) - 1 - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + deltas[pos[i]] - if local_val % day_nanos != 0: - return False - - return True diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index e4d05e0d70e2f..fb07e3fe7547e 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -38,7 +38,6 @@ from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, get_days_in_month, dayofwe from pandas._libs.tslibs.conversion cimport ( convert_datetime_to_tsobject, localize_pydatetime, - normalize_i8_timestamps, ) from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( @@ -92,6 +91,8 @@ def apply_index_wraps(func): result = np.asarray(result) if self.normalize: + # TODO: Avoid circular/runtime import + from .vectorized import normalize_i8_timestamps result = normalize_i8_timestamps(result.view("i8"), None) return result diff --git 
a/pandas/_libs/tslibs/period.pxd b/pandas/_libs/tslibs/period.pxd index eb11a4a572e85..9c0342e239a89 100644 --- a/pandas/_libs/tslibs/period.pxd +++ b/pandas/_libs/tslibs/period.pxd @@ -1 +1,6 @@ +from numpy cimport int64_t + +from .np_datetime cimport npy_datetimestruct + cdef bint is_period_object(object obj) +cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c0641297c4b8a..e6ba1968797ed 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -14,7 +14,6 @@ import cython from cpython.datetime cimport ( datetime, - tzinfo, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, @@ -41,7 +40,6 @@ cdef extern from "src/datetime/np_datetime.h": cimport pandas._libs.tslibs.util as util from pandas._libs.tslibs.timestamps import Timestamp -from pandas._libs.tslibs.timezones cimport is_utc, is_tzlocal, get_dst_info from pandas._libs.tslibs.timedeltas import Timedelta from pandas._libs.tslibs.timedeltas cimport ( delta_to_nanoseconds, @@ -91,7 +89,6 @@ from pandas._libs.tslibs.offsets cimport ( is_offset_object, ) from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG -from pandas._libs.tslibs.tzconversion cimport tz_convert_utc_to_tzlocal cdef: @@ -1416,60 +1413,6 @@ def extract_freq(ndarray[object] values): # period helpers -@cython.wraparound(False) -@cython.boundscheck(False) -def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): - cdef: - Py_ssize_t n = len(stamps) - int64_t[:] result = np.empty(n, dtype=np.int64) - ndarray[int64_t] trans - int64_t[:] deltas - Py_ssize_t[:] pos - npy_datetimestruct dts - int64_t local_val - - if is_utc(tz) or tz is None: - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i], &dts) - result[i] = get_period_ordinal(&dts, freq) - - elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = 
NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - result[i] = get_period_ordinal(&dts, freq) - else: - # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = get_dst_info(tz) - - if typ not in ['pytz', 'dateutil']: - # static/fixed; in this case we know that len(delta) == 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[0], &dts) - result[i] = get_period_ordinal(&dts, freq) - else: - pos = trans.searchsorted(stamps, side='right') - 1 - - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - result[i] = get_period_ordinal(&dts, freq) - - return result.base # .base to get underlying ndarray - - DIFFERENT_FREQ = ("Input has different freq={other_freq} " "from {cls}(freq={own_freq})") diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index d5f10374d2860..d2861d8e9fe8d 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -1,105 +1,9 @@ -from cpython.datetime cimport tzinfo import numpy as np -from numpy cimport ndarray, int64_t, int32_t - -from pandas._libs.tslibs.util cimport get_nat +from numpy cimport int32_t from pandas._libs.tslibs.dtypes import Resolution -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, dt64_to_dtstruct) -from pandas._libs.tslibs.timezones cimport ( - is_utc, is_tzlocal, get_dst_info) from pandas._libs.tslibs.ccalendar cimport get_days_in_month -from pandas._libs.tslibs.tzconversion cimport tz_convert_utc_to_tzlocal - -# ---------------------------------------------------------------------- -# Constants - -cdef: - int64_t NPY_NAT = get_nat() - - int RESO_NS = 0 - int RESO_US = 1 - int RESO_MS = 2 - int RESO_SEC = 3 - int RESO_MIN = 4 - int RESO_HR = 5 - int RESO_DAY = 6 - int RESO_MTH = 7 - int RESO_QTR = 8 - int RESO_YR 
= 9 - - -# ---------------------------------------------------------------------- - - -def get_resolution(const int64_t[:] stamps, tzinfo tz=None): - cdef: - Py_ssize_t i, n = len(stamps) - npy_datetimestruct dts - int reso = RESO_DAY, curr_reso - ndarray[int64_t] trans - int64_t[:] deltas - Py_ssize_t[:] pos - int64_t local_val, delta - - if is_utc(tz) or tz is None: - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso - elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso - else: - # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = get_dst_info(tz) - - if typ not in ['pytz', 'dateutil']: - # static/fixed; in this case we know that len(delta) == 1 - delta = deltas[0] - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + delta, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso - else: - pos = trans.searchsorted(stamps, side='right') - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso - - return Resolution(reso) - - -cdef inline int _reso_stamp(npy_datetimestruct *dts): - if dts.us != 0: - if dts.us % 1000 == 0: - return RESO_MS - return RESO_US - elif dts.sec != 0: - return RESO_SEC - elif dts.min != 0: - return RESO_MIN - elif dts.hour != 0: - return RESO_HR - return RESO_DAY # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx new file mode 100644 index 0000000000000..c8f8daf6724c2 --- /dev/null +++ 
b/pandas/_libs/tslibs/vectorized.pyx @@ -0,0 +1,440 @@ +import cython + +from cpython.datetime cimport datetime, date, time, tzinfo + +import numpy as np +from numpy cimport int64_t, intp_t, ndarray + +from .conversion cimport normalize_i8_stamp +from .dtypes import Resolution +from .nattype cimport NPY_NAT, c_NaT as NaT +from .np_datetime cimport npy_datetimestruct, dt64_to_dtstruct +from .offsets cimport to_offset +from .period cimport get_period_ordinal +from .timestamps cimport create_timestamp_from_ts +from .timezones cimport is_utc, is_tzlocal, get_dst_info +from .tzconversion cimport tz_convert_utc_to_tzlocal + +# ------------------------------------------------------------------------- + +cdef inline object create_datetime_from_ts( + int64_t value, + npy_datetimestruct dts, + tzinfo tz, + object freq, + bint fold, +): + """ + Convenience routine to construct a datetime.datetime from its parts. + """ + return datetime( + dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, + tz, fold=fold, + ) + + +cdef inline object create_date_from_ts( + int64_t value, + npy_datetimestruct dts, + tzinfo tz, + object freq, + bint fold +): + """ + Convenience routine to construct a datetime.date from its parts. + """ + # GH#25057 add fold argument to match other func_create signatures + return date(dts.year, dts.month, dts.day) + + +cdef inline object create_time_from_ts( + int64_t value, + npy_datetimestruct dts, + tzinfo tz, + object freq, + bint fold +): + """ + Convenience routine to construct a datetime.time from its parts. + """ + return time(dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def ints_to_pydatetime( + const int64_t[:] arr, + tzinfo tz=None, + object freq=None, + bint fold=False, + str box="datetime" +): + """ + Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp. 
+ + Parameters + ---------- + arr : array of i8 + tz : str, optional + convert to this timezone + freq : str/Offset, optional + freq to convert + fold : bint, default is 0 + Due to daylight saving time, one wall clock time can occur twice + when shifting from summer to winter time; fold describes whether the + datetime-like corresponds to the first (0) or the second time (1) + the wall clock hits the ambiguous time + + .. versionadded:: 1.1.0 + box : {'datetime', 'timestamp', 'date', 'time'}, default 'datetime' + * If datetime, convert to datetime.datetime + * If date, convert to datetime.date + * If time, convert to datetime.time + * If Timestamp, convert to pandas.Timestamp + + Returns + ------- + ndarray of dtype specified by box + """ + cdef: + Py_ssize_t i, n = len(arr) + ndarray[int64_t] trans + int64_t[:] deltas + intp_t[:] pos + npy_datetimestruct dts + object dt, new_tz + str typ + int64_t value, local_value, delta = NPY_NAT # dummy for delta + ndarray[object] result = np.empty(n, dtype=object) + object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) + bint use_utc = False, use_tzlocal = False, use_fixed = False + bint use_pytz = False + + if box == "date": + assert (tz is None), "tz should be None when converting to date" + + func_create = create_date_from_ts + elif box == "timestamp": + func_create = create_timestamp_from_ts + + if isinstance(freq, str): + freq = to_offset(freq) + elif box == "time": + func_create = create_time_from_ts + elif box == "datetime": + func_create = create_datetime_from_ts + else: + raise ValueError( + "box must be one of 'datetime', 'date', 'time' or 'timestamp'" + ) + + if is_utc(tz) or tz is None: + use_utc = True + elif is_tzlocal(tz): + use_tzlocal = True + else: + trans, deltas, typ = get_dst_info(tz) + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True + delta = deltas[0] + else: + pos = trans.searchsorted(arr, side="right") - 1 + use_pytz 
= typ == "pytz" + + for i in range(n): + new_tz = tz + value = arr[i] + + if value == NPY_NAT: + result[i] = NaT + else: + if use_utc: + local_value = value + elif use_tzlocal: + local_value = tz_convert_utc_to_tzlocal(value, tz) + elif use_fixed: + local_value = value + delta + elif not use_pytz: + # i.e. dateutil + # no zone-name change for dateutil tzs - dst etc + # represented in single object. + local_value = value + deltas[pos[i]] + else: + # pytz + # find right representation of dst etc in pytz timezone + new_tz = tz._tzinfos[tz._transition_info[pos[i]]] + local_value = value + deltas[pos[i]] + + dt64_to_dtstruct(local_value, &dts) + result[i] = func_create(value, dts, new_tz, freq, fold) + + return result + + +# ------------------------------------------------------------------------- + +cdef: + int RESO_NS = 0 + int RESO_US = 1 + int RESO_MS = 2 + int RESO_SEC = 3 + int RESO_MIN = 4 + int RESO_HR = 5 + int RESO_DAY = 6 + int RESO_MTH = 7 + int RESO_QTR = 8 + int RESO_YR = 9 + + +cdef inline int _reso_stamp(npy_datetimestruct *dts): + if dts.us != 0: + if dts.us % 1000 == 0: + return RESO_MS + return RESO_US + elif dts.sec != 0: + return RESO_SEC + elif dts.min != 0: + return RESO_MIN + elif dts.hour != 0: + return RESO_HR + return RESO_DAY + + +def get_resolution(const int64_t[:] stamps, tzinfo tz=None): + cdef: + Py_ssize_t i, n = len(stamps) + npy_datetimestruct dts + int reso = RESO_DAY, curr_reso + ndarray[int64_t] trans + int64_t[:] deltas + Py_ssize_t[:] pos + int64_t local_val, delta + + if is_utc(tz) or tz is None: + for i in range(n): + if stamps[i] == NPY_NAT: + continue + dt64_to_dtstruct(stamps[i], &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso + elif is_tzlocal(tz): + for i in range(n): + if stamps[i] == NPY_NAT: + continue + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso + else: + # Adjust 
datetime64 timestamp, recompute datetimestruct + trans, deltas, typ = get_dst_info(tz) + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + delta = deltas[0] + for i in range(n): + if stamps[i] == NPY_NAT: + continue + dt64_to_dtstruct(stamps[i] + delta, &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso + else: + pos = trans.searchsorted(stamps, side="right") - 1 + for i in range(n): + if stamps[i] == NPY_NAT: + continue + dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso + + return Resolution(reso) + + +# ------------------------------------------------------------------------- + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz): + """ + Normalize each of the (nanosecond) timezone aware timestamps in the given + array by rounding down to the beginning of the day (i.e. midnight). + This is midnight for timezone, `tz`. 
+ + Parameters + ---------- + stamps : int64 ndarray + tz : tzinfo or None + + Returns + ------- + result : int64 ndarray of converted of normalized nanosecond timestamps + """ + cdef: + Py_ssize_t i, n = len(stamps) + int64_t[:] result = np.empty(n, dtype=np.int64) + ndarray[int64_t] trans + int64_t[:] deltas + str typ + Py_ssize_t[:] pos + int64_t delta, local_val + + if tz is None or is_utc(tz): + with nogil: + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = stamps[i] + result[i] = normalize_i8_stamp(local_val) + elif is_tzlocal(tz): + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + result[i] = normalize_i8_stamp(local_val) + else: + # Adjust datetime64 timestamp, recompute datetimestruct + trans, deltas, typ = get_dst_info(tz) + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + delta = deltas[0] + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = stamps[i] + delta + result[i] = normalize_i8_stamp(local_val) + else: + pos = trans.searchsorted(stamps, side="right") - 1 + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = stamps[i] + deltas[pos[i]] + result[i] = normalize_i8_stamp(local_val) + + return result.base # `.base` to access underlying ndarray + + +@cython.wraparound(False) +@cython.boundscheck(False) +def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None): + """ + Check if all of the given (nanosecond) timestamps are normalized to + midnight, i.e. hour == minute == second == 0. If the optional timezone + `tz` is not None, then this is midnight for this timezone. 
+ + Parameters + ---------- + stamps : int64 ndarray + tz : tzinfo or None + + Returns + ------- + is_normalized : bool True if all stamps are normalized + """ + cdef: + Py_ssize_t i, n = len(stamps) + ndarray[int64_t] trans + int64_t[:] deltas + intp_t[:] pos + int64_t local_val, delta + str typ + int64_t day_nanos = 24 * 3600 * 1_000_000_000 + + if tz is None or is_utc(tz): + for i in range(n): + local_val = stamps[i] + if local_val % day_nanos != 0: + return False + + elif is_tzlocal(tz): + for i in range(n): + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + if local_val % day_nanos != 0: + return False + else: + trans, deltas, typ = get_dst_info(tz) + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + delta = deltas[0] + for i in range(n): + # Adjust datetime64 timestamp, recompute datetimestruct + local_val = stamps[i] + delta + if local_val % day_nanos != 0: + return False + + else: + pos = trans.searchsorted(stamps) - 1 + for i in range(n): + # Adjust datetime64 timestamp, recompute datetimestruct + local_val = stamps[i] + deltas[pos[i]] + if local_val % day_nanos != 0: + return False + + return True + + +# ------------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): + cdef: + Py_ssize_t n = len(stamps) + int64_t[:] result = np.empty(n, dtype=np.int64) + ndarray[int64_t] trans + int64_t[:] deltas + Py_ssize_t[:] pos + npy_datetimestruct dts + int64_t local_val + + if is_utc(tz) or tz is None: + with nogil: + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + dt64_to_dtstruct(stamps[i], &dts) + result[i] = get_period_ordinal(&dts, freq) + + elif is_tzlocal(tz): + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) + 
result[i] = get_period_ordinal(&dts, freq) + else: + # Adjust datetime64 timestamp, recompute datetimestruct + trans, deltas, typ = get_dst_info(tz) + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + dt64_to_dtstruct(stamps[i] + deltas[0], &dts) + result[i] = get_period_ordinal(&dts, freq) + else: + pos = trans.searchsorted(stamps, side="right") - 1 + + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) + result[i] = get_period_ordinal(&dts, freq) + + return result.base # .base to get underlying ndarray diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index fcfbaa4ac2a1c..8eac45cdedaec 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -10,7 +10,11 @@ Timestamp, conversion, fields, + get_resolution, iNaT, + ints_to_pydatetime, + is_date_array_normalized, + normalize_i8_timestamps, resolution as libresolution, timezones, to_offset, @@ -526,11 +530,11 @@ def is_normalized(self): """ Returns True if all of the dates are at midnight ("no time") """ - return conversion.is_date_array_normalized(self.asi8, self.tz) + return is_date_array_normalized(self.asi8, self.tz) @property # NB: override with cache_readonly in immutable subclasses def _resolution_obj(self) -> libresolution.Resolution: - return libresolution.get_resolution(self.asi8, self.tz) + return get_resolution(self.asi8, self.tz) # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods @@ -559,7 +563,7 @@ def __iter__(self): for i in range(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, length) - converted = tslib.ints_to_pydatetime( + converted = ints_to_pydatetime( data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" ) for v in converted: @@ -991,7 +995,7 @@ def 
to_pydatetime(self) -> np.ndarray: ------- datetimes : ndarray """ - return tslib.ints_to_pydatetime(self.asi8, tz=self.tz) + return ints_to_pydatetime(self.asi8, tz=self.tz) def normalize(self): """ @@ -1031,7 +1035,7 @@ def normalize(self): '2014-08-01 00:00:00+05:30'], dtype='datetime64[ns, Asia/Calcutta]', freq=None) """ - new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) + new_values = normalize_i8_timestamps(self.asi8, self.tz) return type(self)(new_values)._with_freq("infer").tz_localize(self.tz) def to_period(self, freq=None): @@ -1219,7 +1223,7 @@ def time(self): else: timestamps = self.asi8 - return tslib.ints_to_pydatetime(timestamps, box="time") + return ints_to_pydatetime(timestamps, box="time") @property def timetz(self): @@ -1227,7 +1231,7 @@ def timetz(self): Returns numpy array of datetime.time also containing timezone information. The time part of the Timestamps. """ - return tslib.ints_to_pydatetime(self.asi8, self.tz, box="time") + return ints_to_pydatetime(self.asi8, self.tz, box="time") @property def date(self): @@ -1243,7 +1247,7 @@ def date(self): else: timestamps = self.asi8 - return tslib.ints_to_pydatetime(timestamps, box="date") + return ints_to_pydatetime(timestamps, box="date") def isocalendar(self): """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 4b4df3445be4e..b336371655466 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -10,6 +10,7 @@ NaTType, Timedelta, delta_to_nanoseconds, + dt64arr_to_periodarr as c_dt64arr_to_periodarr, iNaT, period as libperiod, to_offset, @@ -951,7 +952,7 @@ def dt64arr_to_periodarr(data, freq, tz=None): data = data._values base = freq._period_dtype_code - return libperiod.dt64arr_to_periodarr(data.view("i8"), base, tz), freq + return c_dt64arr_to_periodarr(data.view("i8"), base, tz), freq def _get_ordinal_range(start, end, periods, freq, mult=1): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 
d0417d51da497..6b84f0e81f48b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -15,6 +15,7 @@ Timedelta, Timestamp, iNaT, + ints_to_pydatetime, ) from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import ArrayLike, Dtype, DtypeObj @@ -919,7 +920,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): elif is_datetime64_dtype(arr): if is_object_dtype(dtype): - return tslib.ints_to_pydatetime(arr.view(np.int64)) + return ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") @@ -1399,7 +1400,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): if value.dtype != DT64NS_DTYPE: value = value.astype(DT64NS_DTYPE) ints = np.asarray(value).view("i8") - return tslib.ints_to_pydatetime(ints) + return ints_to_pydatetime(ints) # we have a non-castable dtype that was passed raise TypeError(f"Cannot cast datetime64 to {dtype}") diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 0317d0b93859b..6d2e592f024ed 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -5,8 +5,14 @@ import numpy as np -from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib -from pandas._libs.tslibs import Resolution, parsing, timezones, to_offset +from pandas._libs import NaT, Period, Timestamp, index as libindex, lib +from pandas._libs.tslibs import ( + Resolution, + ints_to_pydatetime, + parsing, + timezones, + to_offset, +) from pandas._libs.tslibs.offsets import prefix_mapping from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError @@ -339,7 +345,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: def _mpl_repr(self): # how to represent ourselves to matplotlib - return tslib.ints_to_pydatetime(self.asi8, self.tz) + return ints_to_pydatetime(self.asi8, self.tz) @property def _formatter_func(self): 
diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 840a8c2fb68b1..957706fcb460e 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -18,6 +18,7 @@ def test_namespace(): "period", "resolution", "strptime", + "vectorized", "timedeltas", "timestamps", "timezones", @@ -37,7 +38,12 @@ def test_namespace(): "Resolution", "Tick", "Timedelta", + "dt64arr_to_periodarr", "Timestamp", + "is_date_array_normalized", + "ints_to_pydatetime", + "normalize_i8_timestamps", + "get_resolution", "delta_to_nanoseconds", "ints_to_pytimedelta", "localize_pydatetime", diff --git a/setup.py b/setup.py index e9d305d831653..1885546e001fe 100755 --- a/setup.py +++ b/setup.py @@ -322,6 +322,7 @@ class CheckSDist(sdist_class): "pandas/_libs/tslibs/resolution.pyx", "pandas/_libs/tslibs/parsing.pyx", "pandas/_libs/tslibs/tzconversion.pyx", + "pandas/_libs/tslibs/vectorized.pyx", "pandas/_libs/window/indexers.pyx", "pandas/_libs/writers.pyx", "pandas/io/sas/sas.pyx", @@ -659,6 +660,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "pyxfile": "_libs/tslibs/tzconversion", "depends": tseries_depends, }, + "_libs.tslibs.vectorized": {"pyxfile": "_libs/tslibs/vectorized"}, "_libs.testing": {"pyxfile": "_libs/testing"}, "_libs.window.aggregations": { "pyxfile": "_libs/window/aggregations", From 755e96f82882f444919959938444075b7112c05b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 8 Jul 2020 14:51:12 -0700 Subject: [PATCH 0305/1025] ASV: tslibs.fields (#35149) --- asv_bench/benchmarks/tslibs/fields.py | 74 +++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 asv_bench/benchmarks/tslibs/fields.py diff --git a/asv_bench/benchmarks/tslibs/fields.py b/asv_bench/benchmarks/tslibs/fields.py new file mode 100644 index 0000000000000..0607a799ec707 --- /dev/null +++ b/asv_bench/benchmarks/tslibs/fields.py @@ -0,0 +1,74 @@ +import numpy as np + +from pandas._libs.tslibs.fields import ( + get_date_field, + 
get_start_end_field, + get_timedelta_field, +) + +from .tslib import _sizes + + +class TimeGetTimedeltaField: + params = [ + _sizes, + ["days", "h", "s", "seconds", "ms", "microseconds", "us", "ns", "nanoseconds"], + ] + param_names = ["size", "field"] + + def setup(self, size, field): + arr = np.random.randint(0, 10, size=size, dtype="i8") + self.i8data = arr + + def time_get_timedelta_field(self, size, field): + get_timedelta_field(self.i8data, field) + + +class TimeGetDateField: + params = [ + _sizes, + [ + "Y", + "M", + "D", + "h", + "m", + "s", + "us", + "ns", + "doy", + "dow", + "woy", + "q", + "dim", + "is_leap_year", + ], + ] + param_names = ["size", "field"] + + def setup(self, size, field): + arr = np.random.randint(0, 10, size=size, dtype="i8") + self.i8data = arr + + def time_get_date_field(self, size, field): + get_date_field(self.i8data, field) + + +class TimeGetStartEndField: + params = [ + _sizes, + ["start", "end"], + ["month", "quarter", "year"], + ["B", None, "QS"], + [12, 3, 5], + ] + param_names = ["size", "side", "period", "freqstr", "month_kw"] + + def setup(self, size, side, period, freqstr, month_kw): + arr = np.random.randint(0, 10, size=size, dtype="i8") + self.i8data = arr + + self.attrname = f"is_{period}_{side}" + + def time_get_start_end_field(self, size, side, period, freqstr, month_kw): + get_start_end_field(self.i8data, self.attrname, freqstr, month_kw=month_kw) From 04ca3149347a54b7dfaa0ec3a382375dd6edef96 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 8 Jul 2020 14:52:28 -0700 Subject: [PATCH 0306/1025] TYP: maybe_get_tz (#35103) --- pandas/_libs/tslibs/timezones.pxd | 2 +- pandas/_libs/tslibs/timezones.pyx | 8 +++++++- pandas/tests/tslibs/test_timezones.py | 12 ++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/timezones.pxd b/pandas/_libs/tslibs/timezones.pxd index f51ee41cb99a6..136710003d32a 100644 --- a/pandas/_libs/tslibs/timezones.pxd +++ 
b/pandas/_libs/tslibs/timezones.pxd @@ -9,7 +9,7 @@ cdef bint treat_tz_as_pytz(tzinfo tz) cpdef bint tz_compare(tzinfo start, tzinfo end) cpdef object get_timezone(tzinfo tz) -cpdef object maybe_get_tz(object tz) +cpdef tzinfo maybe_get_tz(object tz) cdef timedelta get_utcoffset(tzinfo tz, datetime obj) cdef bint is_fixed_offset(tzinfo tz) diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 3b2104f75956a..a8c785704d8e8 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -84,7 +84,7 @@ cpdef inline object get_timezone(tzinfo tz): return tz -cpdef inline object maybe_get_tz(object tz): +cpdef inline tzinfo maybe_get_tz(object tz): """ (Maybe) Construct a timezone object from a string. If tz is a string, use it to construct a timezone object. Otherwise, just return tz. @@ -102,6 +102,12 @@ cpdef inline object maybe_get_tz(object tz): tz = pytz.timezone(tz) elif is_integer_object(tz): tz = pytz.FixedOffset(tz / 60) + elif isinstance(tz, tzinfo): + pass + elif tz is None: + pass + else: + raise TypeError(type(tz)) return tz diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py index 03cc8fcb6e904..81b41f567976d 100644 --- a/pandas/tests/tslibs/test_timezones.py +++ b/pandas/tests/tslibs/test_timezones.py @@ -106,3 +106,15 @@ def test_infer_tz_mismatch(infer_setup, ordered): with pytest.raises(AssertionError, match=msg): timezones.infer_tzinfo(*args) + + +def test_maybe_get_tz_invalid_types(): + with pytest.raises(TypeError, match=""): + timezones.maybe_get_tz(44.0) + + with pytest.raises(TypeError, match=""): + timezones.maybe_get_tz(pytz) + + msg = "" + with pytest.raises(TypeError, match=msg): + timezones.maybe_get_tz(Timestamp.now("UTC")) From 599b603cf988a30152da6f80c2776c37259d885a Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Wed, 8 Jul 2020 18:03:47 -0400 Subject: [PATCH 0307/1025] Fix Issue 34748 - read in datetime as MultiIndex for column headers 
(#34954) * Fix Issue 34748 - read in datetime as MultiIndex for column headers * add more xls file formats * use testing pattern for other Excel files * added ods file * remove xfail for ods test * skip tests on xlsb with datetimes Co-authored-by: Will Ayd --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/parsers.py | 2 +- .../tests/io/data/excel/test_datetime_mi.ods | Bin 0 -> 3585 bytes .../tests/io/data/excel/test_datetime_mi.xls | Bin 0 -> 24576 bytes .../tests/io/data/excel/test_datetime_mi.xlsb | Bin 0 -> 7947 bytes .../tests/io/data/excel/test_datetime_mi.xlsm | Bin 0 -> 8700 bytes .../tests/io/data/excel/test_datetime_mi.xlsx | Bin 0 -> 8687 bytes pandas/tests/io/excel/test_readers.py | 19 ++++++++++++++++++ 8 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 pandas/tests/io/data/excel/test_datetime_mi.ods create mode 100644 pandas/tests/io/data/excel/test_datetime_mi.xls create mode 100644 pandas/tests/io/data/excel/test_datetime_mi.xlsb create mode 100644 pandas/tests/io/data/excel/test_datetime_mi.xlsm create mode 100644 pandas/tests/io/data/excel/test_datetime_mi.xlsx diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 24283d2c2e48d..ce0668917f800 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1051,6 +1051,7 @@ I/O - Bug in :meth:`~HDFStore.create_table` now raises an error when `column` argument was not specified in `data_columns` on input (:issue:`28156`) - :meth:`read_json` now could read line-delimited json file from a file url while `lines` and `chunksize` are set. 
- Bug in :meth:`DataFrame.to_sql` when reading DataFrames with ``-np.inf`` entries with MySQL now has a more explicit ``ValueError`` (:issue:`34431`) +- Bug in "meth"`read_excel` where datetime values are used in the header in a `MultiIndex` (:issue:`34748`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c427d3a198b10..d4f346f8c1087 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1614,7 +1614,7 @@ def extract(r): # Clean the column names (if we have an index_col). if len(ic): col_names = [ - r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None + r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols) else None for r in header ] else: diff --git a/pandas/tests/io/data/excel/test_datetime_mi.ods b/pandas/tests/io/data/excel/test_datetime_mi.ods new file mode 100644 index 0000000000000000000000000000000000000000..c37c35060c6508a0958b0ae21bd0181bd63017e5 GIT binary patch literal 3585 zcmZ{n2Q-|`8pl@`YY{|?&Jtqvo<#H(HHg(MHtK4-dJBSRQIhCHi`9Dz)+!M`Eu^fl zdW&9yh={m+_ui9pbMKw^%z59LIsfN9bLN@f%-`T15iteepM?t$4Q@*)fux%~ukow`|$j`_F4NUO*!ayuuS7wAedN=;qYljPC z$>G9qhpw!2o;gdm$Z1w#_8Oj8|R5anu^7~I+2^Vi)11)s(Z!@F6Iq`cI)c7379{!Y)TXV%E zBF^0`-ZqWmT6e89JFvLP&&uhiJFi-CC;3pzI9W!YNU5lY$GxmfEBee+E;Gei^>e%Y zr`fT|CgMT<{+g8xCrZ}>Tqm}88y=|T(Wnm_n22W&r`itr9YqhgN(j}yt@%-uM_@8h zK&T_~oPW^O{t4}rxJF8hCiN<^k%C;9B){d2Tr4nEqRG6i(n73@BN=fur(hS{M=8Dd zS{tLJJ1I8stR*+BE5V(Y&5N3aJo&o={y@dt+PxuMXh!n>@j|ml@;X}JxvXg1&z7vO z8Q!{r+)EJ;jBi4N9iFmhv}YhFGx(Wrt*+Yp>4hgP&}2W1ek3VKH>DwsZ`PiQ+?vR* zfKlPYc0+lg)_YVwKH$f4lR{*O)0F4_{f%6AB#vm?Pt29$kv+qERd46z-Ohr?#*LWwwu`Fc36fKM8KZ;e#AJTS`{zq zsCRq&>^Am@Wqdxz1-dke`jMM10O_OqxWB=Q67AC z@4P`)K7@Xw_<6Y@UoJ$;E!B|Ep7W{>T(mGM@Ij@b zl&%ML$@C8Z41FC&EL<=VC!m6(-NChb(e&rIY+tfc<620Gt%geUPVhK|bc!My>~QBz zEmkqrQ`*vubylIfFYg6c-HDEORbg;EIGE{ic-e+8>Yk{>;uL$=2K*Hb%}ED(Yz%S* zk`dOCJr{%Rt8G8ekL2MAht5nTg4Y|ayDf)hv=U8nZo;BNIRc$P+s$Cg`LoY@3VX4y zBhBAri#Xa}0fwZQatPc;6zMx~oKdQ}f40qm%{94}?Y~%I-W;8?n=yo65A?0}MLfCL 
zL$I;O^t?Hmij#;w;gHVSv~$+e=+~USZ*a@Yg%`(Jd3QV}_WY@~##5UaS&FF*tYjdS zK^F6K;1G2`*A8=*9CYIO;|Y?a$~SuN+q+AM&(dg)q0p*{P57eDEm*Q5Q){%c6ZXY| zDjBUlPjpu>0f`C7QiWMi%&6F2zh!X(PLXu-;iU)``_uP__m99In=45l(v|dt7xo25 zs0q+A86J?xJvQ|0tNN^{srLAXO#2;D1*+J)J9_>NQl02KOsL~u%(jGGJs#VhXXhXQ z;QX8j@Sm{D1j(MGE@1cODgXeyfStXEJL-~-m$ZZ8FK9;-A~F^x+W0O(Sg*d$HpsAX z&HiOGhhhqk1yq|}bFlvOh%L2HXs+3%?s}tQWLK=L%2&_P%!PqX{eE*Hs(chUTlcvF zc$|=hFW}Rb^!V<}FtEqC^7ScCK?kDjHDdW6jc5BYO+rI)N~7RS7bi+sz1VlZ@J%Us znMRtmnKHg~V^et#Cy?y0_oOl%-wNHA>FsT@pHoOQwI<&Ih;?Lo66e>}B8@&7$UQ(w zn>M~|Z#e1`)kmj$?<^NF6hCJ=YI{}EW2>Jo7gv5#AN%cXV*u-M$oWT+m&H?k>n=@k zxl$}o7TpeAZ-H|d9mU9ND>r0Aw3p`bC?5hBVP!oKCN@bSzjC#V(29dqKMgsN42}lJ zGeoNL$-gT^BzwjVg28Y_SZ>(DZC!($O@Fy|r+@541!ysm2a!5Dm({Ot zvOgJluX_l5-blyi59Jow!h*;B$Aa4Jy0HD#L#E5aBEbrz&etB$!v)9U@2^G~v(k&z z&!~bJDxJiknvp4Z)rU4xK!{o&BM0sA1?~*&w!gO}mHVEO>9KQAfUXoqa4y`cta!rz zMpX%j5=zf5EmrQ8%hM#A5t!Pkp?WxE3{B*M<&PqB5VSE|-yh)e#`3Eb@72->jnFUL zX~ci!o|oP4V!8#}*q}Dn&T1~wdAm@%5wg@;EE97SxF-fFp64=lSMwluRS-|AkSEJl zQy|}wD1Cc}Qfx0AR?8}LTEZ-DQ*qq1I({y~gMT*t>9gi{tg`x7!-5xu71^8hE2K4F z+YJ=4$%NWvqBU-b)IAQfn3~o}Jk>TNC}6}IiZ}?%8~nuMul>#Rv!?^a@X#35PRvgo zn^ERDXU=*llf)?Z4SQo5`dPfn8EA~UIwg!S@I)^kgb{h%1y5wVYON2M={CsPl*Sa= z-XEBeN&hqK>X>$~lyh0vb!f=dtmr06tcx0Kp9Rq!01omjym_ZB-xyu(}9 zYTV|0V{s)Dxe$b%O+oLd`<8Fn+M^7k_$^!N7y-FjcaDK>a~Z=lhbvm*sZ|R6T05ac zozARGpQ9%+H=FQo*Z9rc9IKaLoRTuc6Y3b3#;bK6 zQMi&>$eZ|VrQP6Ap|w*=O(mM!k#RlqHWm}iN{7krD%86V7KE zPIoIJVvDzRD%lQnPl@&zIcGJq#CGddf{IPHX%q8wn6V3Ra615Yy6UE%I}KQ3_6{z zViLKS=znwC?DZUX2LS-!Mf9(WE*u5Bgmi+DZp{Z#nhu_Q4(>U+{&ljC+Fz;2d3k9x z5549eZXxoz>v-@1i1Ry7b5#-g(~C2iGn13LXQ*|PZxlGKqU)-0wDA1bSMU3Q0u@3B z=Zb~{N=S^}xC%_>@2H;*#;~sS7T$D9=dZ zk@by323`A2`b_OJk)WPgZxWBO_+z3zf2JzIgzegSCVCw~Ha?PWCtQHx{d22@F9)w> zJ-EYJsmNFxZe)Mys4G?87C-xlsVa)5l7y}%BCy)-i4y$h5c9`>b>3)@`o7jge*GevMlb((vhpg; zTKokPXt?iqniayFn?5srepR##+O^4RmL(G8Yg^Pxl0q3(A1Dspp7Yi-Zx{fqtQkdrHg|YbO z3KI}Q{e&M(-EZpC@ZE%l-^m2sc;^Dl={UOTB);|Eh14Ur~ literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test_datetime_mi.xls 
b/pandas/tests/io/data/excel/test_datetime_mi.xls new file mode 100644 index 0000000000000000000000000000000000000000..aeade05855919877556f4adc171aa1c4b5c22c8b GIT binary patch literal 24576 zcmeHP3tUZE+h6B&s?$wZ>4Hud>0Y{vbVI0=gfT8joi0joR5CO*$&hPuS2M;XBG(Y5 z+%gd%DJBMmj3x{QjhW$m&)U0F`|NY6_nY_q{l4$_o_&66@4cSA*8h3dvz~Q%)?WM6 z-PCN}dCZ^%(T)*F4t?e;Aq5JZ0rz%-xCVmfd_Il}uC{OwB-!!*NCQf2s7M8Q^kwjM z#S4gu6xay0!1X8##}Il9VLF1E1GgRLCgAvc12?-<@hx;4bvggEzsGXtF>^4udR zLy!Tw0_ph~x!Ba1j1%$ zoyicxg(f8umeLBvg7mIkQmWCUmdaLtWjFcVXZ4k z#2CoxkK};|GBGnTwJ|ky2)7(I-9iwVZaI);4E=w9q36>=xluEO6PX}IG@v_?or5qA z#4*%h5Y>d9iiXyi)D2l7M})$$C{`SSWS!CJ76f6s<*-hIiLD999d6+r4qWVzCHx>O zHJ=rSn#qbIl*0?zAP-??WC?`%5u#C8dUM<(jSw6^PiTdjNq#)QeS5!&`(OBpOQd7 zE`d&#SClU)kE)0~hX@>)XpwfcQB!k^$`%#GBI`w~iyj?)&gh&GVv+JFK(|8J;UAEI zI7s_cfF4df-9`f4N-Q1sGz8AOPWaCDr6Qnr**|HAFf%06H=q!>H4?N(C1EYRKomQRn30je7kF^E+A#o?rbMB^BjQjNLrZ7Vg&GQJsX}>)7wYZXw<1D$d3lKl zMWqRZ!n97AVCqB)Po;?p1@l3QP=BUNP`5ats9O1FnxeNxR}hM9jad64LJ64hGZ|f) z5-pikSIKbu$A?&xE-WnU0uk5%RkZ?UR9f%VO>dZrgl!<5okp`{?4-*h zLHpzLYr8TCge@s4;iJw{2!thQoe~R#rP2hp(M1C;qP?iSkg&MAg*2GXo!QU~fPiouq>d5(0sU zc%9%0on)Qh>02*!A{|7(Hz2iv(m`NG*|Q{$PSQclg+L%8UMF}?DOo2vLLd_R6X_r@ zVeh#(wSm$>IK2UJbdnA-SO^3n;&p-@8p%4r)Bj%BiF6Rd-hk8wN(V9S4Tz%?6=VZb z3u(acg&Z9Qacnt0k&)cQ*lg`Fbl#PiLHl(XTu#x9R|NZoics5#eHcp~! z%%!j)oe_eK0yp~Q%a`KWXw%s^i?Xqj!Unr5ok~;WM)Tj+if5xkXX7Hu#!d=ksf zVRNG&w0DSS)0fW1Rg{gR6gK4Sl~9@zH~R5!_r$Z&#cZ6VTbdEDaqfks)drrZ{zW{S zesnfsEX`C38>*$%2L5sFka#wFbT(ovZLkzJR7U7jHTI2VMDdF+Q7=Yuf(%4q_YuYX--nuP%W)Cu)6t#csBhp8&~O;#sN02 zy|6S#r?vYf)Xe}o8!?tOyGiVkW_1EpFKLNC*YhzB+6 z1_k?}gASsBQf&qi&NeZs()CH)Vv!MY=aJJO9Q3a147gC z8^wbT?gj-*po0#ffl_N9p?Qb=#euSs1&pr9HzF#JJyf{n8HRj0AXAsGg2?>^kthZBA4I^^D-I>W z6bvtC;jtHk;#DgF1H6hvFi=3S1`H4JdIoGD5Q?#dVoE}>4NP@-^$P3uezDxhc#a=8 zAt8^QL6fT=Um4_zg~?zfiifG8AKWFtk17>js}V6DfmC?WMxje8GA59Uj0vP7V;f*? 
z);d#;pU371iy|)vO#DD4bKw^O0w9Fb+GHv~S_@iQ1=!5gotA^9!ALNpE01j^3-XxL z^04@9$ddw_J9xz0Hxe!C%3+tO2s!L%ITZP-A~}S4^+onD14pCM<|RZVMkFWk7IDJ3 zX~}tPe_<%53tt0`LTRw~2G@NeQU6IG$Rf~r7>JVw_JCK~1htPVyVJasplq-_63tZc!nXhmmBT(mDh=Z86}uKO}2Rk*byrOl9N(6^X_s&PxGP zND;0wrVf;6ENFHf*dng zL6iusp9@t^?E&Pvtg)*v67)WJC0hX$r;6M_el(t&lFW;M?uFJdGx|fvJRX2?NqJyX zxMLwb8lpdicOruF7Ql;>PJ;)N%0$o{O@w8IZBz;XhFcBpATSi}B9xFjZXV;4lDT>8 z*}_?v%FsM;mSm6tuS*L$TD&eT=)v%sa(BN<#sobW858tiWNZUd1r0^8h83KWOjPhf z8q>Z29ZVEiw2G-cJnj&Msu&ZxB_U%1NywN$5;BJSM~_T-P?edm2=elv_Yf#EuglLT zm8Fb^Vap7QOHKf74H3>Irw+9d2F8#K%EkH;HEsqr?%@PPQZlY^+ERm@J^_IhPSy~M zupy!41gv*Kw?URx5W^Owi9BI2L{lPoiMWR%ds8oAL76JhF;4*wJg{3qCnc^I#bY+1 z;kE29&|1oxNG_C46MkU;GMgyvL=brqaTf$-$9-Q4n3*GJF%G2?7Sx@By5-Il&0^qJ zBg>#v*?8UomSO7T?B?vgovnwqQ+P7)LpbntH*D10O?igttHF#J@VmfZLqN#oS0?%! z_E`xIKGX*rMY44v2xtiDcHT14X7&`-&6Fo!9HwAFT8L5VkeP+QjJwU>7-Ie%rF znQqNbCHp4*)e%@9=3B(kO|WTl>iB-x6JDi};>oJ51^dPwdFyL!)^6xOqwtYiaMPj5 zIo}xQ4UZ^P-}K}bRlk0NJrRX8&t;%Yxs-vno;WuMA`nJ!`4f{Ul#r-=|6Dl68 zEm^z7b!P4O3w)hf@4cRD->%J=mH`^!!<am~Kwb+(a&vO}BHOuzw8^oTLHt*Zri|?NduL!xdd-(MshJ|BF z9X4sHuV@Oc2tDi@b*o0Be%z@0&a=%j_N|{Ee%EO0_xjDQfw$l5hTe6Xb$DG-4gYwR zL+$s$7c!36w2opWx@`=Mdpdh(k^Pg2mAOUS0}TV~PM-dF=Mxk~w4m{eHtQ~h0V?y^HCA8_uRv~D#1&E~<(GP60JtNf=tzujT^cvQQ!xiZsTyzt}e=ia&r z-SJT)vNce|CW|lxi~VA1K$4-G#@T+)Gro9Hc`T}3Q-8f&d*8vw5==uwpN<*ZU}yV^ z=M$QmY@IjJUS4_6D0}&@+ILzQ{J=1HS8o)_@yY+ZqEA`9o#li#a--MlmLk7^bf>@F zorXCLTl~wb<14-zyjA7mbj$Mjn&;Kg zJ42dQXyl~ZkJc>m?7yx?V{v+6{iCTCNfS@6E(}reySjLsOR1T2sF zQj9+6AGw|B>-)!N)*tEf;Lx1-;@B3oPt&Xm z7GC=`vc<)sG5pI53q694?pk*D2c7Vl7q%Fl9DVK15g*I5C0R#wQ>PUkC=6ffJMM9f za(T$%1ebpMu5%(scNDz-vaDf$;*CweSgiQgro2HnQukWQG2NY!)x%JT_e!%I&!GXY z_0)`4=C8>t*r6BjJ^wUKM*~eAQNW_S&uU*&SoEmo3Z; zuJpIblJofE3FBPBNUxEX^1O|orxqn%t78tV)ktYE_{p_JyU)~W`3g7B($gP&M{K%u z?Dr~%lFS*^u}kOgV+=e{sjAYkZ+`Q`g5aQ^O|!>OU+gv^Vd!bkAWo&$maEY#Y`6RC z{HP`GUR^)&+%cuMN!KQNGrFhO`=dn9f4|-o+_F)X`>uqK1tYX{y z$ve{2M$g!n*rIKybV`0#pSS6OwoVWICgzxIxjJHHQQYfvGrxW{4_B)PO`kUAb#kWV 
z@Aph?vICywRxMK7=asGcBz5tY8zzP|gSMSceq~x67_};ByYWh`ex?48>n=U)r_G-5 zbik0dtcBasmlwsqmRotZ{4b>oo6p<~X001J=aNTJ~W?wa$0DG^-yz`KynD+U74@Z;th18yCNbm@>=LJN4j!c0HBY z4PoKOx0`h29Q7?-E?4?(@WRrKY0(yIHNy9OL=}nmD%2+3*fvhRZQg9AI{QS1hW+^6 z{gvw?SRG#;@;5G-v!z`3RoZv+uVwTryWD(scxmkP=CdsQC2{&=UhJ(^oO|29;T}hQ z>xe9cs!fSLmv|X=yM8^OoA1$D{UIp-48y>*_)*T7C5gWuZ}_m0|6S?j`EJvLtDa0w zC_ETY>d_v0^ervVI(W0mCPHV@Q zo;S-ra-jBtg05Zjm}_YTH8*oq7aNb8U;Qht@9yZ>n z7$1DZX>VUf-E4akPDgUk{27afr<6N4*fqzk%GOMDRhp~+pgBilVD>M!_D{C=eNuAv zaI4+P=av(nf8w0r88qHEnmvC~_|l(wi9a0MW*Ph9yE~t*PS*F0UXfSs^=;0OC2O=+ z4A}X7xrSQv$!j5R-+g;-T=dw}xwd8(8=5`Z_B3%!v=*$(D|%J@BnAL(7;m38Ut8;< zf52gdxYkHh#qp=UTYWil_{P2W*YxGKz0-1BvfaAi+?+SXOE;P2XPsHx^1Q*Y&N3sL zbLnd1F308-?6mnIfro~gyf(K9h+6cwZQs3%?zx1IYSSBcS2M`h`G`fdV{nVdrkLb+ zlcMFG%=tRnp(rA6k#T?4f%Dr(pF9vZ*(J13eDSM~zE^pp>uclpZkiOe@Ce)%Z< zFG^Y)S5{n@6EXWrT=cTwi5^(u_tIq4#VdbqT6S;H8LrF4vH>&HC(JuyESD_K>L(u&!)|Fb^rG1 ztt`8R;|l8zXE)jT*J->@4$do7vn~G}Z+jFSztwK#?kI&PhA_J0*P_%Rkesv-6v`;x zIf)5UzUZKn-I2ABos@LDTw}fYgB5j$8sxH{x~RE&nN4`{Qthw}+irQp*==!WwtQEk z9qIURNbCUvwny8h`P(ndJ^f;>cc||8oTgz>M`vZNO>%qLH)ef+MUc5!sD<5-t-L;i z$89L6TN(B-^RWBwb(5c(?B45hVsy7c!d*mD}ruKoU;f@CdD-x1dcGY3-k>< zxd0B2Ijq4Wb3b#Fu(-M01)TBRg`Ci&xez`pNX@ctGc4zX&n5U!!%`{Ct=L`aA|KztVkXY71NK^}8h9dks+EMJy4%Yo&| za%YW(D+lNyXel9XHgPrwah-^x%?p1$Be=JK`!%*dW!e7p+RKu1zZ0;PfCKgj87K?c zFl-otJc28HLIawA^_bju*g)V-iTgM(NnM1{X9>1$QKHF!4TZ?CA^6Y$o1z zhAB34g;zVc&j5|GeId~ zV%|W3uVo-DLaV@-Rtc5E1HM91$LsU>ASFR)8;Fpn62tC?Vkn8hhVc6rxsjv+l3D&M z&!72N1!QGX6@c~EylltYkHN7T#p~I~dhqck2cU7_0p56zpU;DUbEZMSYR`s%TNPd^ z!S%HV0xb2RqY$w2Dg6be(Q}`JIZvXEfkVZygs{%M)Y|w)%#(xT+c%K-?#s~c# z=k5MDRrEPEpQN;SO&Q~AKnRM9)Jof)XAmDeuUqHYg$i+gyFTLkMzCEE;@v&=r?EedeSQ3n9y`+5XUG0K-T;C(gJ2&Y|GN%;myi8?ykn;?1nleM z^#DBx_zzL=AAsOLyufP(_`kxjV{Qb21Hl-A2?SFJW)SdmbGYI?X%-MHAy`4MhF}8$ zKmP;)UuDNX*8twEgulJvz_*6@&pq(>h5yQ-3;#YXkOJEghmFq&ohQy{GtdGGR*h7q^md sEr366L%99)mOuU4*RTAR+?*cmQ-nU2z9{ z7pT39p@xSe)LEb1-OiRW8yS%?6MzW+{=fAv)QGorYB z9y=S7rp2TO6=9^^cjDyBi?#rqI}##~N%>CaTLsb>YMLJO6Czq$a6vX`5?*MsPrAle 
z?XP>CzrPzXzA?@VB(?!e+j>6dc?==vq^j+ShzFk&QK`hM$gT~;@?LcY%1O-x#UH4Z ztz&erS{~EJ)!^DHm=1R2zv5wB>+Lo(nAo4EeLGXV_ROs!e&F@-f+Zad7IQshlOSJ( zg_BhN#k)I{;$197{wccOZC;einBWq9N4nQS<+wvVFR&D_7u2ri7ITmuV4SyV$XpLC>mdrp%oOG2<-J+25~w znL{gbc522@$X!o%wSP2{(K0_YQP{>S!BX!3Ox53{phss?^I)u{JYA$mVtAA>l7-hP zK;P$igU@D+8hKdji7J$Uh-FLt>qv;|*&|if?wY}PtxgH1;4E=v&&IqTzuB}B{ zM>V|EMW<5GREEYjhbHUt?8-KX#xn4lUCdFwrcMzFe0`ZQX^#+ zh=}hWJQ~|@&AzGV3B@Cp3wVNPxWV^K5An$&w#$4rw68+oC}i<&5bT`_X{kUGES#e7 zIsEAVj1v~uY9}zfpiROb9QYURwj6E_PBtbE4mRv2R`z<)BXVM@px)=E>(sG&avccW%<7>o=&s%lP#1GaOIYrGbjamsQ%B(oeKs(YFoMUNJiX#WacZ)w1?f zb0iF&D3v%^RXwbVWu_CPFDPxV=66;@%*u`w@kqk5wzbYKAt8qepn?ZqekZ#-Lp7Y6 zmVVb48foECvt0Xl3MLa&x$2p*ZEy@^pmFDqMO0)GjI)rRF+Zt8DNrL;^rZfNS^nMD$l3|Cy8! z;H&{|g@5-djp?xN;sifs z;YPKV3wxHWd`dZ<1>NlKSemN-)DGgi-WAJv9&KL8Jtp2Zy-TuP3YaP9r{=8%V=}(o zu<+9Aa0@J$jJKGXz^GXxdW?6v3zec&wSu6ODNa6vUi>;z{vx?bhDWh}3DxHrFO?xu zuJ6zHN1qLkL#7%+Tl7RI2On*qXqc)gXD`bWRSY8;^j4IICh7sxw!*vDOT*?S42g?v z%)g;;wvdZnDCKrFZ?2$0$K?Qru!~ifI9!UYGp%GG!6hm-H)-+n2zh8m=^MT6sjsPt4Ph&i zMf0q6i!5Qu+TQhJlu?5f3C`gd^V*7(MBhv(7~5$lsadNCD?`clOzxw-GmSqCc}nJ$ zT9V~Q4RPFR<)0-v^^o@4H0K<2joH2F;9f3fcZD`AZOlI~O@A|a>2dP$PTP|kMi8~E zl(<2ve)TupQswIK2(#{Pv2oz)l9D93^K|%t*(s5dV}mVNX2?zhLS$< z5{#Ncdm+77vZLh|uTgD*T4R<-+%c+u-m3l9bN{24vL=s%aCnhh&>KqD?Qb9N6D)T1 z#|_C;Z+(km$Wa?S3y|o?R83fleB0ak)@S$C+c^oe2c6(k5g}Z(!c}QCcANnyIMA5Z zzLHL9shMvT9D6W8v`#F&naPvC{v>alu`h(SEiv=n2cg#^{rFyZdE;;ERE2U6l4Y_` z^W&o8t!-s3_x5UwK_+bHab6Z7uHLULdWRXlB^?Wf1q5*L`%2z?U)SXs+@i$Gc8V&c z;u(LQq({#wP`1tYeT6oI8@)4gZ0N)W5Z~Nr>^yZ3;^af}B41s{YM|%Sbm<8U)8@sv zbnq8ZXO)U4?&8mVf}MFZD1a5BMb}1X!XzghVT1-SHSb0kj@Rdf_D>a}Ol?dySp zrq%XjW=QOs3!46&`zP1tb$n*STUmGu*f7eWS*vNi;di}RC|MB!GC??)-x6$>qw+K>htK+WOiks8fGeiDbH$HywGTvR&<@vtN z1scLGM6%1nJ8x6FX~nu@MR2!V3&{uGAubrw>~x+~+I2FJR!0EcElxhH@JD`=yW98v zEPD!(oJ-0pLIBHAc?hcGUM)xXrT_atx1iY8`-g56bP2HF61rUN6}9Q8O35QC$xsPk zs?m1Q{uEPhWnaGu1WE5%&w|-Uq@aNQK*;YDXQjs~4h_-eUPr6;ZWsFFI{8+h 
zTTA(y^|#@B;HgAqHWH$#L`oP38&c?eoJWNneaI6$d;~7hQKQCjwh)~!91otIyM3(^A;_*3YCLyq!4Zb?xf2a-Rz;i+k3uWnn1CH)rj5x_pf`n~d;; z4hO6eJD>V&{-d7T!|NL#?HUIT!?On${E@o`y}{(GO^N+VT0%-3>a`rB(}fd}$8pjb z-@Cn{yf*^ItMt^pX^)GdUKn!M;9@fK`OJkL(X>O6ql({mAQRem#3UT@zRJKQeo0ds zkdswh5yvW#?@84{hv#UL8=TfekY2uG3XxalFVTEfoTR1YX9hO7bM%xqFP5okj8$ph z=_L8HSVpxyI^*<2R1>PNc%jWy6 zGS-a6@r*Y)&5OaMhk^>ds`9ZWYAP-Q5!%NO$y|s$#varxD-I=`P^-z7uyJr)Bu!I< zik9kov-Hkq%p==d4Rffyy7lIqg~qX^pmbIZzlSfaH78oEb4gFN=5c^bgT%5mg>coY zYEGo}loCUrMuQ33EMHMeUWQT?BoPQH&d)M6wDv#K^$|0mYhQYEMXGs*h|-a%rC_eA z9^p1oIWRq4s4oBkUIOaE@$b;=@<6N`_7W!1H(fa0^9FF7W>taWM~a3;Y-JVihji^+ zzv9RC0h;Cr;>DWp4!iRC2U(VS#d6!V6ME3slp!cDguRNqX}RkP^JUvhWG-{m!PpVa z`+(7WqQ--{`_WjZ4AZlYNJYmCU%TtL$roAso=ckk%Z`k+d63u?|lUr%)XU^ zJE%-8XJ$XqUrKV!p-z4%Q)?G>$lm6ji=mkvX6YkEiX^u7mgLE@n_pJ!fxszEA2?f5 zg}41!zgKK$7Y|#g^AC!Yt0w=0BKdI5_jUwsvj!|dNILFG(=?_VB{2KBBbZ8{VTy(L zH@hQgd-{>Kk9QN?GEQp|72EKs`Rfr;PxSf+luCQJN;;S{$RDg zcR@e7?QcDh?Z^-AkBPc;k-2Z=Vwif&*&CuhWgKm@O)gRs-_4!y5-Rug^u+G-)M4T` z;jHW0K+So^@F?3%n(a>ci5Bf%#0ZPCLGsY_9QL?PyeI-=?Ib}Es6;nB%B+c(p6ZQwF^2t{U0r!-g z?jmjoDp^7Q?2SfiSw=c|pGIJ&IM5^CWNFiF>fyr|En0a*jP|Y$!tr$Ix4oodFx0uaS*$%B8sn z2Q`O|j2d{`*maE=+DT#&WwanB)Ah$eUg$Zk)uQ#9)5emih2dK9>!Q8D$G5f=$;sKK z%@cluUjOttA@Vw9jcVh=wt}>I@p^n1ulaMnhxGQ4lzw^~NXt!}&>|bTcuCdxRVi|G zb+mt0C&_^SHR)CVvkD4INAxHn4s;-?uI|Fjv)V1vVrGpmU4kn9cv{5A%_KWfh`uxr z-P|`5&+Eq{mH;%pu!=+S7H*fML+Nn1k?0QaREfiAn8){BWY zu)TKG#d+T}0ZVk8P234Kvn#-C^6L>pRj7~%Es_o`FqmLBq%+z11KBYKQK)OEmCqhQ z-or1A4!SwZpLP?;Ne_>zTh=NW9I?19mgl1tBX|9-Cu|-5WVM4; z9WS+dh~PU-|3#uhHe{RHY>m)MsLSz|E_2)dL!FMzTp8tQf8Wp`>}q`Pwy-iSLVGog zY36XsZ@3`0OsB5_zYh z`O6K)ZNb|^+@GRj@Vaqpq$1}G9Zu7&R0RHd+5BxYk+2U={+mz*}C^NzB6a13E+y=bO0Dc0p65a;9 z%?EB{{k_uvL;?V!;aLBu`?tmao@@Uqu1NeB@jo)Jiag41gaZh_JDdG6yC(f1@qcM{ BN0|Ts literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/test_datetime_mi.xlsm b/pandas/tests/io/data/excel/test_datetime_mi.xlsm new file mode 100644 index 0000000000000000000000000000000000000000..55fb88912afb9cd6685123f34c65b612786d329c 
GIT binary patch literal 8700 zcmeHsg;!kJ@^#|`3GNWwH8=#fU;%;!8YeXFPO#t}LesdrOYla6Lx6-vLU8Q>0fGe& z@b%1l@0*#-{QiRX&Ry%C)$87>b^4y#wd?HCQbR!{2A~5l0RR9!pz=jep&1eYaDoB= z5CJfejpSXNJweW%X1ac^AP*BBUnfVpLR4h7d;l`y{r`>s;xAC0Jf_jlhp%{{{8N65 zJ9fE222}6+Ooh zjA}qxr}J{5rOWB|NiIox@${502FW6zVf6CaUT_uleP{!Xezgj_1ZGGj5@*s4;0;4_ zqCN|7C<0TP8;(41VD8Ea@p)j(p;Om8RK!{i|4?kX06VNn#zZ$fZd?A+$iX7dR|Ag8 zj%}V^WrM$Y&n|G-TZ-!|FPtu_y2B`q;_X*AlljG-&6>ZI%x*MtC;sxt?dtn2IFEGK z1jtpOV=gKTh%u@t%oD$Ier?&{a2H~IndNuf&WbV#tZx;` z|CanID|jxobQf1TT1?M)rs{gCOB0|EH(J%=u@*z$U@tu^hhvkq7-%BS)j5akS+t2#WeUNTy>CydNNl zYYrU%z(af)Uq@aa7k39s7Z-=$h^tu7#3hT5I54m2{&lAf7K;p_E`LcejKfQ3*84!M z_yZEFfl&H93vK1~st+!`Q>LURx2og5!<$cC7sfX~Zc5ia$Epr0P%s8;FvO)+j2J-U zVTX$qk}|1N#p9S{%n7hmjvs+P&c=3uBg0VqL{b2RLkap>Z%dS8!cKN3rrh>HYQ$SK z7#~FKsU8@t4u^aTeUiHusUd3+0}NO0#*pVC&tyd|}Id$=# zL_z9vN=|Gca)vu=;k0?6dT0pS0_X5fQVHAwgmr=W{^PZ&pN9%3CiR92%B-Xk{&8pL!y=CFL0?VDBn$m3sRt zsxWM51e`Q4Z)(*xYJFj->Wh_PF5WmPq|^>eQx%ciNknQ4+KCvY`{C#k?=L*G9TNEY z6vh9QRL;XAz6_@R3Y8Qn9Rm+wt$bfBOKl-Hae@X()rC~4Uer#7jeDKHy)-V%sp1*S z#LqT3c9o0|jX+sPvQqFrq9!3~!F3IR7H$N!Py#TK5UBYVyZjkD|6v*=MCFI@^4~qG zlg8|SuY4lz!qNjJy|GFlBl8fsmcd&9eIo5ZYPdD{bFkTZ=(3Em@H}QM`mzU;laKUf zz6LS$LPI_Yo9moRyfS3*n%O_ za-^jQ=K2){Drgr#ols0}gfsL=U9D~Olq~-KNxsCM?hR0WY3@^gB|Fo}f%B?hSBZ~H zBBL1ewM?p=GMQFN40lXqPd4+@_hXk6lE>iO2>oknH`6$Whl7$d-GLIWPnoKQ8wN!+ zNhu_5wob_YDXSV788(d~VgrQ8s^o}`|94jPumyoUJ$Qeg9{-kJvr}YU7WjyV50lSn zxM;nBZiM;7sm1njTK)!lDb7!_2I%!SMH_G-8p)+J=aT2Rua-coAwflwPnbQ6l${t| z-v{@s`UeI1J-~BO2>GgR$1{d`!;QzN&%zIf3-;yJD*0oO+sThXNE@sOO!*V=Hd|f# zB6R1(ZN};7AmV+{$Fr~TbGFpAmcb;PvZKX>Gw;oWnA^4EOFt?TGaYpXGO+QwJtHsQ z=CevY4#x%Y6IBZeDZC4%j5gcuoN~xi&J82dHThK8r4WmSPh{%a=&2R2z%QDWstr>? 
zy6HZCA}A|@C505Q=pf=8G?u&;0|r``qFHGVcXr)M?Gj}Q79ZS7V{iIITY5O-xrSkx zl7Uy940_Q&2QfSLgv@(Z^v0JF+Fg)>RuiI-C z>h|la@p(Dv>ja#fc*9=8{`XOduo{SE_GDPna-Wb#N!_+$ z&*cgrAW!{tBEx|7iqVjb)^^Fd0}PW+QeOB5z?8hfT_* zt}o5aF|Lzs=l336Ms+508MOjhF!meeSx6cs373cAH<};fw@=A_JkqoI>>yb{VF*o~ z#eFq_#zJHlPpru<)WCATc=hMIdc-_j~Ac)f<@NuRNj*z6BMnYrzg zn?Aa_v>Hp%b!$kloow#O65^%+X>X^@O5!KWcuNOtOjI6fuU>aYH(h((Z{FQ}b3U^< zySdx`_E2mOJIWM8OcBai%T#_c| zA_^^X+ zS+~}SWOn?Qbf)RlUZ-Dq@Qq zq=T4|Np0_~MMG9u;)j*{81qPnscFJi(n&c>v}?#2!I!3?mRkOS{Pc|bxGAc^{uP~? z>yMMC(ABn+#i)=U1qa4mrrbF z)IXG@mSs(o!8bpr+wI57S@ zzZBp#vNn^-#^@l9<|JlATO6C%+Q1_7G$x@-Bq57=n=U)9s@`RtRLET^V{8i_e{qV5 zd5i&-_PMX+z^Ho5PtN>V5jx&nN&mZ}-P5QZ$>1M5SN&!U&}I%bXAEW?o$s@WX7{(Z zWGiMpcSqljMRsRckNXCCeqN2G_1qtepVewYn7Q}M@O!?>dVQe(3p;x$s~vGJgU`zi*14Y7aQ913H6|_ zxUqyWrJ}u4Iz8{?<7uioNNXk=dH9THwebCZ(Yt7Jxwq>6QVU4~9eWZEU8K2QbOXe9`9|u~HX!+7S5hL(K z{Inuvp@Uq=W%r0j}9tyFNeiu>lR>I(?mtp7YpUm76>$`PB|gqzN17{ELYU|QeO3V=#(a4bG25)xnd`I=%EJW6&!Hqx3J zvi*{ztSO^2z833ahSIBKfr<uJOT$r2G+I^78ae!fhW#a&GwkB9E^evS42DH{d2zP zP{tV}vO=9UZwi%M=Sv@pR0^_&Cob9ID7}2#dE@-g3_CUYV;dWvFE$BqVd7}c2wysRDTzuR*Ro2F7KSoUD2MdZXKTKEUmefG+2z`9r>;lec|@b+aTJYL`aQND&KWW_1#or``ygpm3K8IxU_S$syj!$VhcaS>It9h_29n zVNApSb6ba?n$MnFkAwvH4kynrZgs2Y4%~9i^<^{ZaU4LdnL-M<=!j`I#+cmH1;f>; z*KuI1K!Z^$FOn-&eWp;6-QY7ORDt$AvS1u@K}wnlkPwos0LgQ&dh{MZhx?RMvsy7q zaF?<37ZkC9g*?Daaxd%7;<+HN#H|uoP3-~h8llNtr}qCFNy6rMa+;m6Ar5>C);#eP6 z8uvbRxn7&^h{YynAno#wPj?BEG|i`H8nw+=o#@stgmy&IzWF})4ZZtf0mwEeQ6nm8 zs~9mhVf(l>{5)J^yk1g{Ro1w95*?O2%Om}9q!(|8H}^oySjQJOByek|V77&EOx=Ga z-;D!vp&wF@eoA(Rccxe8byoe!9NZ4};NDKvlI)*i{e;aU?ID;b8Rq46a$8?s#e4bU zv;3()5EeX%d~2I%I~{#`le{FL^CNjlFJ-Qso=;`5yPNe&<0uU5NG(6vV5;!c>5`St zYqKys5WR@v5#5+tS%}q{;^M5?o94DMR_Y< zV2=kQJFq>z{$sa;#hdrM;uN{-J+v)7H9mNX)YW|l90p4bOa`orCU3dEw|1Dkd$@UY zmp9*dgOccc&x?e`0CWT*X*N=1(~L-KwBvk2)$FNb&6nKnev`ItLKq?IMy1qisq&)N z-Y*MPk#MjBZh7;Tcr@Psc+jO?2!7~U0#Db zihnAFtIwXhMj}vpgg6ZnAd~_tmzP@ZF0LNDFJ0V0|85QaPar@fsd(_J8bTl#UPZf) z0$=dl_Lhd%hJKLI!e7=3h#NI5a-sz9eZBLa_N=11UNdrE?a=J@N!!?{swfgNC<98e 
z?lRF6*uqpxs>BSr$&SThsNK2!mhID%xkcTd=jIak@xg#?tu;t-SjJl_3y&?nZqdN_ zRC9czB0FL}O_~NcIQ7HKoD^}%@;Z?-nMdP=UmLq`HDL@{Gl6NeV+riUjWAvl6WiO$ z6cz0rr&)bB$%W{_GTZqF%t%JpcSe%8kMp^KLJMSSXu^@kvEnS~}nD?X-B5S874=E@U2P7=R`Z?aQUH zE)!;!)^p5OvXK3tl5)5DHzYuVs-QEF{$zhuFt@scD;odFZKDOyAI{#>Zpx#a!E8v# z+9*1aYWMMkU9BN)iWOfMFj1?M>crEd65r4{1EN>A+rA) zc2*@ya^f%syCnHI%6f1%g*}L%KsmI7U|%XwrJ@;BT()v3X7!y%-8(9lyxk`!yg5Zj`mp$iWk;@rS`(;>Qe-E zG}#62lavU5Z*ws!T3^{~eMEgc*3kL2p4AmskZj#{gk|_l=H6k!H|Of8LMj}UOA$A) zK2`=Rh{Hki!{<^AsOsiR#_n+Q1bqlYl63r=M}caCb6x11mjUckIxYFDg#}?;XnC!c zRVfi23iL)G0#s?-0hIFWBc+UJZJee@GRcW(nP%*pYWkgQ;p3+7WEG33OGK>ZXX2Z2 zm7h!9Tq2fVu%^O?JM3PYGiS}5Rzyd#-RH;`QamVT=`@0T=P1H4B^{mSv5{o4k>e_5 zQX^I8ymFkN`4L0x;?8-AQz~*{b=&`T^m_nXOHS-$aa-d2Ba99DK~g7S=pJNI_VmSR zP!jL0;B6qZ(-iUoB);Lua=Du}wu;!7e?>DAGB@D=>eT;d_x>~gL*HIY?XM30+EVY&kVN{sgFlMZUrqnYrhga$05p^Uz~8v_ lSM$H-vp<_JQ2)vN-`Pz|4Gpn30KfypC552xa@ya1{Xh9wA;I?87=5_I9J zE*0Av>cLPLuF6iZmv@k?|McYv;}{Eo*0yx2i|S|Y z@HJ9FeHjd}uHH?AmUJkss%QEFd2`#vK{~Yf)z9Xa1)@w7@yFV_^RzgKLna!OH#IAA zZ)F&+5L7Y+kc7yNW7jz)}yb9It0 zu1ey@7hDQYDuqy5^F;pJal0#1gV42T&sY!rDQ9PG(nFx$b!>w4JyKFvx&f5a>UBmV z((yca?Cl5W*J_`0I2%{c&-_55QPDPZY6K1&evECk>@{*8mqtpz1t+0sM5ciJy*-{rBDV8Ql=o7i1j=-Ov7nX$M_MnuP>)aJF zICw&!Mz6|<^jjAXWJX%oG+CF5wSH{R>5J*hOgTj#de4po#`5NpLZzV zTg_GZL+vRqZb-B=tOaap%=26YseDWMNC3cZH1f7*^Kx>vGk0>b`z=ou zX&E_Xv*QKi*FU~)w?d5&$I%olrnXLMEwydCq*32`jaxx7Vn0<>cDv#Qq`ZedZzZ7N zz2q6(UiPaB_!(f}x}T>wAdL_5-=vDs&V=eU#7`WRe|=7=WTUhEGmtE+I2>mZhEC1_h71v5pp+t~`?f`M2{pE3ijC6cr(C;W|` zDx{GvF0XgbD_(~m53+I_cj0DWG&1VZhT|{=AdIWX-F6545G3kC?YU$}I+7ychb(g? 
z_lYnO2PSGY52EJx)ZE!{qfgp4;7A;cMiTsJ@!8q*9^mgdEV4^#eUMR1W{782Ny*z% zo)VUoB8mnE@K9#o)|-)5;+Zk03kQT75T2{{QHS@!2LxA-Lztm+?X8e+Jc4H(*d+;NFUA(<`QHCp)c6lJ+|$tDG6?!bFaatsu=O;Y?pX* z8?q?XD2{tmm9d?O@nc0hGxqr46=xr41@nnt&ihw>?1~>QH{AsrmrmGG;dpr z7DV6>?$+6-J>rAbSU>yDlkNS`Jl{oZjd4F;T6&fJ@M3BqFShSHw9V4}T^`M=h;Nq> zS}n8v&?|Q85#OD`O=jF%1glFSVx$pnO5>q!}7Zr2P)#Ni?9kH#ytQFKi$6WWA9P z%>?Vld1PCstrVL?powsGW;@|S^yl_YApy@e^i1<_L%UV1Yy^8ZEV5!3G&B9E*5Lsd zW%+f*MN1rW#`qN1o@NL!$?;*6IEO`Ot*!ng^+QFO`R@K^_|R&r@8>#GLnbg@{GB;* zZ|0p>VRu+y>4gM<0taKxKQhdp`SY)A0|zViVEp^<9u-N$w!arU;SZr10V1BL#VOFa z6v@VcdjMqud4F1%CFEO>$y&&gxIFJ1N)^(Q8?}R%*hYa0Udxq=RMJ)fRL|t9YWi3! zK)55soTG|2)T_vhD#GaIy5_#9M_#bsh+MarV30&G={N$~_-?}a`;!)K8-LQcA|eCy zPtTQ=nupE^V;_YIg!eV?^raSO#tO<9s80_ZSGYThy_^z20xhq_(`@7k)l#Eaq9VF- zXym$wuSZ2rAbH{1x1=t{F?JXOB4nKb!p?Hk<%6{Y{Hg@R!grgeg#Qj7x(0gnLojTB zVUd*xw(|WgXJds93nh&UNl;vC^FU*=gz%3UGIwCqmp3e5j&u{tI?|j)B9; zt5Fbr%lC*D>Vxea_oBPFS=>bj_hM)pUXkW*j+oA&sK$hl6$jlOq;G*V_Fchq?xj7k zAFvM?y&0L46uXT>&yCgV!RhOM zlQ2=4VxBV|lDh!UZ2Bd1vQb7T>2y9TzejQmeu%(jGx}1pFa_+coq?<8zg9XFyx!C* zQhk7I)K0**wVrZf<0uZ)7r`=ofj+sFxQh^tw^+}cgjrhXQnCr(<0Dnj`u1}71Ub=p za27GLuRn`a@0lI(o3rLA(H%d^%syHO^1XMHO1Iqx$zsQBLqnAFc=P3>TZcil(M)C) zm)Zowo&0o4Itf0ro8BJTx!}DV;rVkdt8aE9g~WO-Y12U8QAAo?n^-(m27!8VBmsX& znu3jIlo}21G6z>3s#d+Z?TAw$(>Vtux{tW+0e4N2@f|%M(J1#bm*P>xjNGyQe#n}I zt8O5ZSpY|cci2KBec>GmM)CZ}>n6d;Bs2yLa8_RH_=cCJCb`ODWc4cC#7+*&crjkR zznG^(RvDhq66WFoJ@qp;CEwSbk@dG8j~fqndyW@Y7k3Zadl&-yXm5;>6&~U;x!EGG z*W z<<&9!&J&(e*YA=~(2dT5 zDz~V!?G^PsfiZFq9Uneic-_CRiRfF{*1J zD@zEUDbNh6i6BRBU6nzrmS7My?0b`I`N3*`rU1{xpl?R$z*w#6+)hne!L>PpHN3&r zt7RVFqIV}AcNslYq65M(_A(Jz{zF|uS(l1$OF#CIQxxzTUY%NDeP{q(bsVL(Ifh%sC8W}8vO7c6y3U zh~LBU?%A6zk)ZRPn?94;mIg*8M`Riejl<~#lgIme!ex`LhvU5yzTK~MC%ye$KW~Q9 zyB<#jFKX5o@!2{9PEU7}30DFxwpA0z<`+oGGL9*uJ|0)Cy*i?eF$>1HJAo6+?Pr(l zhlpeEc$ORvL*b1LDE4|iY7TlANOeCUwhMRJ&9#!Bv{1)qR?B`wTznPlN#rHCT9!3@ zeC(X3yp(*_64e$po@95(s#pQI+U6?ISy!Kns{wxpM&m@i=kJ-+==vxXOIF?{6}dGd 
z!nv+o6Lo%wRUH~RMI+^EN~6wNFJ&G%+zX}%n?D_0cBt+vv6~k0w*Z;J*1;YaS-SOU z*qZg_4|M@)4j!`37>i9#*PtFb+A20`Y2Rpuv)fn9VM^Zx-&~MLbfKbeVB9@7>gRk_Ch&0?|KN0ygiMr{In{f?q zw@m0znWmEcP^@5re$l$fF@*y|%Iv-|xTd2pfv-+d9BQnbTtd7F=%cV#if%}xmlTco zw{kRnbK@~ytn$Ebk#ELKG0Y(DutVCq6!zm#^sLf-2@d^@oS6ch>IO|smPHOa5s3Y5 zi776dE=RFm+Jy+4MWCRf6E1~B;y)Nl=a%c+AEso_X7ge`!IRbQz)jwa8yJhKrTgji z`MDRI8xmrI!O2~UU^YE?TROPz=@mJ%ov(G?Vz)D-!tI-nmYebP_2XTc<&?2#HTa8N z=9&Jdi^~fM+stloMOaZZMxX_Bxan2`Dhgxq*D7{*DZE56^+7dcw4{JI~wqv@rWjzm8~e0=Y%%T<|yRm80mZu4#j#JahY+qY3yr4T>3wR=eDjr4wZ**UnJd>` z945A%3>95-FfFadc)yV2O@l2;6C;S*?gn>yqh7~^y6%l~1ZUUaCvDw$ zt($(fo@tMs6iA5&3C5^-zRj@6>*a~68ktaKA}3Ov%jzlIXBEyUdve*Ky2M95{n-u4 z=DXGV0mi+r?#a7vgf@ZYr&3>Q8H*k7RajiB>{wecaHf3}+ty%Etb)6?3fs3lM%v(wcGVh!#RJKBW#M? zyN1sNdJdkp;g$^|^avd)@5X<86AU-5G&lnKF4ats3LkfXfEe@Bj%xdSs12@71}L^P z;J|l!CCI6YLM2Vl?hEcIm|mG~F$7q20WCO9N{(?P<%`hmrzGDKYQKMl8Dn*;#we)nAxLq7*E}vf|cr&@$L7pO)Q(Tnt=7E0~u-(|^mVpGD+h*Gb-ZDvxG9%jLD(=ToLqo7gB* zEXshk+g_5m;$sl(+R_cR+A`L``cOut4&m}=AJX77-L$EkM!*l9Va>}(dO<^IRUMoAR#<&X^2EFD!4h+;>&kf(p2R40S)=DF zB8Z%McL|QlFjDEYIZC3&Cc{?5f2jawgm^+sIT0 zFMj=8MwUVIT=}VH&3s5(1bO`7%pOwb)x19afN&+hh_y`k@TfIMQ`lvg%1DifB%Op| z!#L7J@-(a17ibUW4qM)VfT4!>#3!zM8)=hGvq82`9==%8_z}7nm1X)gBiW zW2TT+h#Sjxnwm)8B;6PqtC$;if=H-`$LW1dNjclK&NrzuKmCcIark@d1na5Dv%BO) zE{*f#MXl7CR!Vk-h0ad88)l|BSpT*xd; zn_S<&7+Fdo5q>Nkxyz9xuqt-dqEAl&@iB7!Tn^VBfgY(E>Q+4z%LD56-q-GLS9)L? 
zn7-zEf^&-A2ziaeZ<%}e#oDzyL*!0_{au_N~LeF52sp3 zoI?f`Q)xvXLs0FirYjKH?u<`gGr|d%aCs)%W3HgnW9yTRAcHf|wq<^oh&L4Lclvoy zfcoWTzK@*foGxqTs_n>hMqvv91Y!M=U*eN;;x3!+1M$Brg}o$S$wa_ddJH=aV#AaI z3#V6Vu1?NwY_FVL!T;)SDUvd*kFAo@N`~<y5+(($91lH!t_c9{@e1(lC||@9q~G9 zfpSPLrbso% z3{b?+nW)4??O*uTehCF0i50V5d8rmo>+S|>!^`W*_o$fx!4g6751Xw`m{|N_gE53* zi1C#yP6v#w`=n)znuon2Y9YAV#OG}!dzkQnARaI8qx2?C3N}9Ek`>!7Ni?SE>dq0= z)}V{(a2sHixNK(-ud>|-;ccCnvuEm-3TgdAdO4P7+*UUXO0_e~HxdD236$gZ2lt^F zTW9BC#oNp+v!FwOSwGK=*0z|sP205#HWF`yq}-dJ`kyeN;_v6U%H*<%3>bKsV95mQ z&)_w4cK#oBVW9oz$bpPHF0f;T9U5 zKSTo*N*lmMAC@nz*suAOT-uHtIK-tlj0VzTAo#2GPO%GGKR!Nqy7Zw^Z;^mILZit5 zbCo}@CG6i~VhsTZhI-8g1mD(tC-p$?=#|lLtG5aGfnUsf*z9ERZf$wL={f1iaBchd z8aihnH{qH!ly>k!{LyaSJNM?eR5T2MSq2zT6D^J!$Y`hf`P+Ns7R8NMpw2MUIPDaw zB(d1|=K)FsGaX1w*ZvHX8jS@jZwo`25%ZhO%TvSKq$v%+*a%|4{wF0l$FiA`>ge^) z#gh{dvrHH^l(gIF!$yohO31t;E#|YB`x@H-lz%CDcMV&9;i>{VaL~0XcgB(?{T&7V zcCS5qaM6H(xdRmb!JdzCQY;__x?5in^wJ=(qA3?wXOIs!{6pNFpB)C$@r_`udS9pOe}qLzxw%AmHfjK68iry;%^$|uU>u?cmD8#2x}t2R`aXW^Q(hj z%hNv`gc1Di;E!VUSJS^{(?1Mh;fVwQ_*-uM)%>sV?9b+-q<=F1XLM6jLWJ!N0Pqy{ N6NRPkZ1Ug!{U7#%BdP!Z literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 955db982f8300..ddc631532194a 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1143,3 +1143,22 @@ def test_header_with_index_col(self, engine, filename): filename, sheet_name="Sheet1", index_col=0, header=[0, 1] ) tm.assert_frame_equal(expected, result) + + def test_read_datetime_multiindex(self, engine, read_ext): + # GH 34748 + if engine == "pyxlsb": + pytest.xfail("Sheets containing datetimes not supported by pyxlsb") + + f = "test_datetime_mi" + read_ext + with pd.ExcelFile(f) as excel: + actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) + expected_column_index = pd.MultiIndex.from_tuples( + [(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))], + names=[ + 
pd.to_datetime("02/29/2020").to_pydatetime(), + pd.to_datetime("03/01/2020").to_pydatetime(), + ], + ) + expected = pd.DataFrame([], columns=expected_column_index) + + tm.assert_frame_equal(expected, actual) From b2abda45edfd74c478d6a7d98d60b061cf433f98 Mon Sep 17 00:00:00 2001 From: Alex Kirko Date: Thu, 9 Jul 2020 16:02:32 +0300 Subject: [PATCH 0308/1025] BUG: fix union_indexes not supporting sort=False for Index subclasses (#35098) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/indexes/api.py | 8 +++++++- pandas/tests/frame/test_constructors.py | 6 ++++-- pandas/tests/indexes/test_common.py | 18 ++++++++++++++++- pandas/tests/reshape/test_concat.py | 14 +++++++++++++ pandas/tests/reshape/test_melt.py | 26 ++++++++++++------------- pandas/tests/test_strings.py | 9 ++++++++- 7 files changed, 64 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index ce0668917f800..986ee371566cd 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1117,6 +1117,7 @@ Reshaping - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) - Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`) - Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements whth ``S`` dtype (:issue:`34529`) +- Bug in :meth:`DataFrame.append` leading to sorting columns even when ``sort=False`` is specified (:issue:`35092`) Sparse ^^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 4c5a70f4088ee..9849742abcfca 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -214,7 +214,13 @@ def conv(i): return result.union_many(indexes[1:]) else: for other in indexes[1:]: - result = result.union(other) + # GH 35092. 
Index.union expects sort=None instead of sort=True + # to signify that sort=True isn't fully implemented and + # legacy implementation sometimes might not sort (see GH 24959) + # In this case we currently sort in _get_combined_index + if sort: + sort = None + result = result.union(other, sort=sort) return result elif kind == "array": index = indexes[0] diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index dba243f1a339a..ab4f7781467e7 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2542,11 +2542,13 @@ def test_construct_with_two_categoricalindex_series(self): index=pd.CategoricalIndex(["f", "female", "m", "male", "unknown"]), ) result = DataFrame([s1, s2]) + # GH 35092. Extra s2 columns are now appended to s1 columns + # in original order expected = DataFrame( np.array( - [[np.nan, 39.0, np.nan, 6.0, 4.0], [2.0, 152.0, 2.0, 242.0, 150.0]] + [[39.0, 6.0, 4.0, np.nan, np.nan], [152.0, 242.0, 150.0, 2.0, 2.0]] ), - columns=["f", "female", "m", "male", "unknown"], + columns=["female", "male", "unknown", "f", "m"], ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 02a173eb4958d..c85696e02ad39 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,8 +13,9 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import CategoricalIndex, MultiIndex, RangeIndex +from pandas import CategoricalIndex, Index, MultiIndex, RangeIndex import pandas._testing as tm +from pandas.core.indexes.api import union_indexes class TestCommon: @@ -395,3 +396,18 @@ def test_astype_preserves_name(self, index, dtype, copy): assert result.names == index.names else: assert result.name == index.name + + +@pytest.mark.parametrize("arr", [[0, 1, 4, 3]]) +@pytest.mark.parametrize("dtype", ["int8", "int16", "int32", "int64"]) 
+def test_union_index_no_sort(arr, sort, dtype): + # GH 35092. Check that we don't sort with sort=False + ind1 = Index(arr[:2], dtype=dtype) + ind2 = Index(arr[2:], dtype=dtype) + + # sort is None indicates that we sort the combined index + if sort is None: + arr.sort() + expected = Index(arr, dtype=dtype) + result = union_indexes([ind1, ind2], sort=sort) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index ffeb5ff0f8aaa..ff95d8ad997a4 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2857,3 +2857,17 @@ def test_concat_frame_axis0_extension_dtypes(): result = pd.concat([df2, df1], ignore_index=True) expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64") tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("sort", [True, False]) +def test_append_sort(sort): + # GH 35092. Check that DataFrame.append respects the sort argument. + df1 = pd.DataFrame(data={0: [1, 2], 1: [3, 4]}) + df2 = pd.DataFrame(data={3: [1, 2], 2: [3, 4]}) + cols = list(df1.columns) + list(df2.columns) + if sort: + cols.sort() + + result = df1.append(df2, sort=sort).columns + expected = type(result)(cols) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index a0fa10802f860..03fda038539e2 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -691,11 +691,11 @@ def test_unbalanced(self): ) df["id"] = df.index exp_data = { - "X": ["X1", "X1", "X2", "X2"], - "A": [1.0, 3.0, 2.0, 4.0], - "B": [5.0, np.nan, 6.0, np.nan], - "id": [0, 0, 1, 1], - "year": [2010, 2011, 2010, 2011], + "X": ["X1", "X2", "X1", "X2"], + "A": [1.0, 2.0, 3.0, 4.0], + "B": [5.0, 6.0, np.nan, np.nan], + "id": [0, 1, 0, 1], + "year": [2010, 2010, 2011, 2011], } expected = pd.DataFrame(exp_data) expected = expected.set_index(["id", "year"])[["X", "A", "B"]] @@ -938,10 
+938,10 @@ def test_nonnumeric_suffix(self): ) expected = pd.DataFrame( { - "A": ["X1", "X1", "X2", "X2"], - "colname": ["placebo", "test", "placebo", "test"], - "result": [5.0, np.nan, 6.0, np.nan], - "treatment": [1.0, 3.0, 2.0, 4.0], + "A": ["X1", "X2", "X1", "X2"], + "colname": ["placebo", "placebo", "test", "test"], + "result": [5.0, 6.0, np.nan, np.nan], + "treatment": [1.0, 2.0, 3.0, 4.0], } ) expected = expected.set_index(["A", "colname"]) @@ -985,10 +985,10 @@ def test_float_suffix(self): ) expected = pd.DataFrame( { - "A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"], - "colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1], - "result": [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan], - "treatment": [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0], + "A": ["X1", "X2", "X1", "X2", "X1", "X2", "X1", "X2"], + "colname": [1.2, 1.2, 1.0, 1.0, 1.1, 1.1, 2.1, 2.1], + "result": [5.0, 6.0, 0.0, 9.0, np.nan, np.nan, np.nan, np.nan], + "treatment": [np.nan, np.nan, np.nan, np.nan, 1.0, 2.0, 3.0, 4.0], } ) expected = expected.set_index(["A", "colname"]) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d9396d70f9112..3a4e54052305e 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -636,8 +636,15 @@ def test_str_cat_align_mixed_inputs(self, join): # mixed list of indexed/unindexed u = np.array(["A", "B", "C", "D"]) expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"]) + # joint index of rhs [t, u]; u will be forced have index of s - rhs_idx = t.index & s.index if join == "inner" else t.index | s.index + # GH 35092. 
If right join, maintain order of t.index + if join == "inner": + rhs_idx = t.index & s.index + elif join == "right": + rhs_idx = t.index.union(s.index, sort=False) + else: + rhs_idx = t.index | s.index expected = expected_outer.loc[s.index.join(rhs_idx, how=join)] result = s.str.cat([t, u], join=join, na_rep="-") From 6779a97987e8767025d6ee46bac33fa0b7720123 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Thu, 9 Jul 2020 14:57:04 -0700 Subject: [PATCH 0309/1025] Json Visual Clutter cleanup (#35183) --- pandas/_libs/src/ujson/python/objToJSON.c | 165 +++------------------- 1 file changed, 19 insertions(+), 146 deletions(-) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index e841f00489887..59298522d86d1 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -143,8 +143,6 @@ typedef struct __PyObjectEncoder { enum PANDAS_FORMAT { SPLIT, RECORDS, INDEX, COLUMNS, VALUES }; -#define PRINTMARK() - int PdBlock_iterNext(JSOBJ, JSONTypeContext *); void *initObjToJSON(void) { @@ -219,13 +217,11 @@ static TypeContext *createTypeContext(void) { static PyObject *get_values(PyObject *obj) { PyObject *values = NULL; - PRINTMARK(); - - if (PyObject_TypeCheck(obj, cls_index) || PyObject_TypeCheck(obj, cls_series)) { + if (PyObject_TypeCheck(obj, cls_index) || + PyObject_TypeCheck(obj, cls_series)) { // The special cases to worry about are dt64tz and category[dt64tz]. // In both cases we want the UTC-localized datetime64 ndarray, // without going through and object array of Timestamps. 
- PRINTMARK(); values = PyObject_GetAttrString(obj, "values"); if (values == NULL) { @@ -236,7 +232,6 @@ static PyObject *get_values(PyObject *obj) { values = PyObject_CallMethod(values, "__array__", NULL); } else if (!PyArray_CheckExact(values)) { // Didn't get a numpy array, so keep trying - PRINTMARK(); Py_DECREF(values); values = NULL; } @@ -245,7 +240,6 @@ static PyObject *get_values(PyObject *obj) { if (values == NULL) { PyObject *typeRepr = PyObject_Repr((PyObject *)Py_TYPE(obj)); PyObject *repr; - PRINTMARK(); if (PyObject_HasAttrString(obj, "dtype")) { PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); repr = PyObject_Repr(dtype); @@ -369,7 +363,6 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { str = PyObject_CallMethod(obj, "isoformat", NULL); if (str == NULL) { - PRINTMARK(); *outLen = 0; if (!PyErr_Occurred()) { PyErr_SetString(PyExc_ValueError, "Failed to convert time"); @@ -397,7 +390,6 @@ static char *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, size_t *outLen) { static void NpyArr_freeItemValue(JSOBJ Py_UNUSED(_obj), JSONTypeContext *tc) { if (GET_TC(tc)->npyarr && GET_TC(tc)->itemValue != GET_TC(tc)->npyarr->array) { - PRINTMARK(); Py_XDECREF(GET_TC(tc)->itemValue); GET_TC(tc)->itemValue = NULL; } @@ -417,7 +409,6 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { obj = (PyArrayObject *)_obj; } - PRINTMARK(); npyarr = PyObject_Malloc(sizeof(NpyArrContext)); GET_TC(tc)->npyarr = npyarr; @@ -454,7 +445,6 @@ void NpyArr_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; - PRINTMARK(); if (npyarr) { NpyArr_freeItemValue(obj, tc); @@ -463,13 +453,10 @@ void NpyArr_iterEnd(JSOBJ obj, JSONTypeContext *tc) { } void NpyArrPassThru_iterBegin(JSOBJ Py_UNUSED(obj), - JSONTypeContext *Py_UNUSED(tc)) { - PRINTMARK(); -} + JSONTypeContext *Py_UNUSED(tc)) {} void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { 
NpyArrContext *npyarr = GET_TC(tc)->npyarr; - PRINTMARK(); // finished this dimension, reset the data pointer npyarr->curdim--; npyarr->dataptr -= npyarr->stride * npyarr->index[npyarr->stridedim]; @@ -483,28 +470,24 @@ void NpyArrPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; - PRINTMARK(); if (PyErr_Occurred()) { return 0; } if (npyarr->index[npyarr->stridedim] >= npyarr->dim) { - PRINTMARK(); return 0; } NpyArr_freeItemValue(obj, tc); if (PyArray_ISDATETIME(npyarr->array)) { - PRINTMARK(); GET_TC(tc)->itemValue = obj; Py_INCREF(obj); ((PyObjectEncoder *)tc->encoder)->npyType = PyArray_TYPE(npyarr->array); ((PyObjectEncoder *)tc->encoder)->npyValue = npyarr->dataptr; ((PyObjectEncoder *)tc->encoder)->npyCtxtPassthru = npyarr; } else { - PRINTMARK(); GET_TC(tc)->itemValue = npyarr->getitem(npyarr->dataptr, npyarr->array); } @@ -515,16 +498,13 @@ int NpyArr_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; - PRINTMARK(); if (PyErr_Occurred()) { - PRINTMARK(); return 0; } if (npyarr->curdim >= npyarr->ndim || npyarr->index[npyarr->stridedim] >= npyarr->dim) { - PRINTMARK(); // innermost dimension, start retrieving item values GET_TC(tc)->iterNext = NpyArr_iterNextItem; return NpyArr_iterNextItem(_obj, tc); @@ -545,7 +525,6 @@ int NpyArr_iterNext(JSOBJ _obj, JSONTypeContext *tc) { } JSOBJ NpyArr_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PRINTMARK(); return GET_TC(tc)->itemValue; } @@ -553,7 +532,6 @@ char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, size_t *outLen) { NpyArrContext *npyarr = GET_TC(tc)->npyarr; npy_intp idx; - PRINTMARK(); char *cStr; if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { @@ -580,7 +558,6 @@ char *NpyArr_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, void PdBlockPassThru_iterEnd(JSOBJ obj, 
JSONTypeContext *tc) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); if (blkCtxt->transpose) { blkCtxt->colIdx++; @@ -593,7 +570,6 @@ void PdBlockPassThru_iterEnd(JSOBJ obj, JSONTypeContext *tc) { int PdBlock_iterNextItem(JSOBJ obj, JSONTypeContext *tc) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); if (blkCtxt->colIdx >= blkCtxt->ncols) { return 0; @@ -610,7 +586,6 @@ char *PdBlock_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, NpyArrContext *npyarr = blkCtxt->npyCtxts[0]; npy_intp idx; char *cStr; - PRINTMARK(); if (GET_TC(tc)->iterNext == PdBlock_iterNextItem) { idx = blkCtxt->colIdx - 1; @@ -633,7 +608,6 @@ char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, NpyArrContext *npyarr = blkCtxt->npyCtxts[blkCtxt->colIdx]; npy_intp idx; char *cStr; - PRINTMARK(); if (GET_TC(tc)->iterNext == NpyArr_iterNextItem) { idx = npyarr->index[npyarr->stridedim] - 1; @@ -650,7 +624,6 @@ char *PdBlock_iterGetName_Transpose(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; NpyArrContext *npyarr; - PRINTMARK(); if (PyErr_Occurred() || ((JSONObjectEncoder *)tc->encoder)->errorMsg) { return 0; @@ -675,7 +648,6 @@ int PdBlock_iterNext(JSOBJ obj, JSONTypeContext *tc) { void PdBlockPassThru_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PdBlockContext *blkCtxt = GET_TC(tc)->pdblock; - PRINTMARK(); if (blkCtxt->transpose) { // if transposed we exhaust each column before moving to the next @@ -697,7 +669,6 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { npy_int64 colIdx; npy_intp idx; - PRINTMARK(); obj = (PyObject *)_obj; GET_TC(tc)->iterGetName = GET_TC(tc)->transpose @@ -744,8 +715,8 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->iterNext = NpyArr_iterNextNone; return; } else if (!PyTuple_Check(blocks)) { - PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!"); - goto 
BLKRET; + PyErr_SetString(PyExc_TypeError, "blocks must be a tuple!"); + goto BLKRET; } // force transpose so each NpyArrContext strides down its column @@ -836,7 +807,6 @@ void PdBlock_iterEnd(JSOBJ obj, JSONTypeContext *tc) { PdBlockContext *blkCtxt; NpyArrContext *npyarr; int i; - PRINTMARK(); GET_TC(tc)->itemValue = NULL; npyarr = GET_TC(tc)->npyarr; @@ -948,7 +918,7 @@ JSOBJ Set_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { } char *Set_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc), - size_t *Py_UNUSED(outLen)) { + size_t *Py_UNUSED(outLen)) { return NULL; } @@ -961,7 +931,6 @@ void Dir_iterBegin(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->attrList = PyObject_Dir(obj); GET_TC(tc)->index = 0; GET_TC(tc)->size = PyList_GET_SIZE(GET_TC(tc)->attrList); - PRINTMARK(); } void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { @@ -976,7 +945,6 @@ void Dir_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { } Py_DECREF((PyObject *)GET_TC(tc)->attrList); - PRINTMARK(); } int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { @@ -1007,7 +975,6 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { attrStr = PyBytes_AS_STRING(attr); if (attrStr[0] == '_') { - PRINTMARK(); Py_DECREF(attr); continue; } @@ -1016,14 +983,12 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { if (itemValue == NULL) { PyErr_Clear(); Py_DECREF(attr); - PRINTMARK(); continue; } if (PyCallable_Check(itemValue)) { Py_DECREF(itemValue); Py_DECREF(attr); - PRINTMARK(); continue; } @@ -1031,7 +996,6 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->itemValue = itemValue; GET_TC(tc)->index++; - PRINTMARK(); itemName = attr; break; } @@ -1046,18 +1010,15 @@ int Dir_iterNext(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->itemValue = itemValue; GET_TC(tc)->index++; - PRINTMARK(); return 1; } JSOBJ Dir_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PRINTMARK(); return GET_TC(tc)->itemValue; } char *Dir_iterGetName(JSOBJ Py_UNUSED(obj), 
JSONTypeContext *tc, size_t *outLen) { - PRINTMARK(); *outLen = PyBytes_GET_SIZE(GET_TC(tc)->itemName); return PyBytes_AS_STRING(GET_TC(tc)->itemName); } @@ -1073,7 +1034,6 @@ void List_iterBegin(JSOBJ obj, JSONTypeContext *tc) { int List_iterNext(JSOBJ obj, JSONTypeContext *tc) { if (GET_TC(tc)->index >= GET_TC(tc)->size) { - PRINTMARK(); return 0; } @@ -1102,7 +1062,6 @@ void Index_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } - PRINTMARK(); } int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { @@ -1123,18 +1082,14 @@ int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 0; } } else { - PRINTMARK(); return 0; } GET_TC(tc)->index++; - PRINTMARK(); return 1; } -void Index_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) { - PRINTMARK(); -} +void Index_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *Py_UNUSED(tc)) {} JSOBJ Index_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->itemValue; @@ -1157,7 +1112,6 @@ void Series_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } - PRINTMARK(); } int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { @@ -1181,19 +1135,16 @@ int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) { return 0; } } else { - PRINTMARK(); return 0; } GET_TC(tc)->index++; - PRINTMARK(); return 1; } void Series_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; enc->outputFormat = enc->originalOutputFormat; - PRINTMARK(); } JSOBJ Series_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { @@ -1217,7 +1168,6 @@ void DataFrame_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (!GET_TC(tc)->cStr) { PyErr_NoMemory(); } - PRINTMARK(); } int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { @@ -1246,19 +1196,16 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) { GET_TC(tc)->itemValue = obj; } } else { - PRINTMARK(); return 0; } 
GET_TC(tc)->index++; - PRINTMARK(); return 1; } void DataFrame_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { PyObjectEncoder *enc = (PyObjectEncoder *)tc->encoder; enc->outputFormat = enc->originalOutputFormat; - PRINTMARK(); } JSOBJ DataFrame_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { @@ -1278,7 +1225,6 @@ char *DataFrame_iterGetName(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc, //============================================================================= void Dict_iterBegin(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->index = 0; - PRINTMARK(); } int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { @@ -1291,7 +1237,6 @@ int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { if (!PyDict_Next((PyObject *)GET_TC(tc)->dictObj, &GET_TC(tc)->index, &GET_TC(tc)->itemName, &GET_TC(tc)->itemValue)) { - PRINTMARK(); return 0; } @@ -1305,7 +1250,6 @@ int Dict_iterNext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { } else { Py_INCREF(GET_TC(tc)->itemName); } - PRINTMARK(); return 1; } @@ -1315,7 +1259,6 @@ void Dict_iterEnd(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { GET_TC(tc)->itemName = NULL; } Py_DECREF(GET_TC(tc)->dictObj); - PRINTMARK(); } JSOBJ Dict_iterGetValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { @@ -1366,7 +1309,6 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, char *dataptr, *cLabel; int type_num; NPY_DATETIMEUNIT base = enc->datetimeUnit; - PRINTMARK(); if (!labels) { return 0; @@ -1425,8 +1367,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, 1000000000LL; // nanoseconds per second } else { // datetime.* objects don't follow above rules - nanosecVal = - PyDateTimeToEpoch(item, NPY_FR_ns); + nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); } } } @@ -1503,7 +1444,6 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, void Object_invokeDefaultHandler(PyObject *obj, PyObjectEncoder *enc) { PyObject *tmpObj = NULL; - PRINTMARK(); tmpObj 
= PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL); if (!PyErr_Occurred()) { if (tmpObj == NULL) { @@ -1524,7 +1464,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { double val; npy_int64 value; int unit; - PRINTMARK(); tc->prv = NULL; @@ -1537,11 +1476,9 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { enc = (PyObjectEncoder *)tc->encoder; if (PyBool_Check(obj)) { - PRINTMARK(); tc->type = (obj == Py_True) ? JT_TRUE : JT_FALSE; return; } else if (obj == Py_None) { - PRINTMARK(); tc->type = JT_NULL; return; } @@ -1554,7 +1491,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { tc->prv = pc; if (PyTypeNum_ISDATETIME(enc->npyType)) { - PRINTMARK(); int64_t longVal; PyArray_VectorUnaryFunc *castfunc = PyArray_GetCastFunc(PyArray_DescrFromType(enc->npyType), NPY_INT64); @@ -1564,12 +1500,10 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } castfunc(enc->npyValue, &longVal, 1, NULL, NULL); if (longVal == get_nat()) { - PRINTMARK(); tc->type = JT_NULL; } else { if (enc->datetimeIso) { - PRINTMARK(); if (enc->npyType == NPY_TIMEDELTA) { pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; } else { @@ -1580,7 +1514,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->longValue = longVal; tc->type = JT_UTF8; } else { - PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; GET_TC(tc)->longValue = NpyDateTimeToEpoch(longVal, base); @@ -1597,30 +1530,24 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (PyIter_Check(obj) || (PyArray_Check(obj) && !PyArray_CheckScalar(obj))) { - PRINTMARK(); goto ISITERABLE; } if (PyLong_Check(obj)) { - PRINTMARK(); tc->type = JT_LONG; int overflow = 0; GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow); int err; err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred(); - if (overflow){ - PRINTMARK(); + if (overflow) { tc->type = JT_BIGNUM; - } - else if (err) { - 
PRINTMARK(); + } else if (err) { goto INVALID; } - + return; } else if (PyFloat_Check(obj)) { - PRINTMARK(); val = PyFloat_AS_DOUBLE(obj); if (npy_isnan(val) || npy_isinf(val)) { tc->type = JT_NULL; @@ -1630,80 +1557,61 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } return; } else if (PyBytes_Check(obj)) { - PRINTMARK(); pc->PyTypeToUTF8 = PyBytesToUTF8; tc->type = JT_UTF8; return; } else if (PyUnicode_Check(obj)) { - PRINTMARK(); pc->PyTypeToUTF8 = PyUnicodeToUTF8; tc->type = JT_UTF8; return; } else if (PyObject_TypeCheck(obj, type_decimal)) { - PRINTMARK(); GET_TC(tc)->doubleValue = PyFloat_AsDouble(obj); tc->type = JT_DOUBLE; return; } else if (PyDateTime_Check(obj) || PyDate_Check(obj)) { if (PyObject_TypeCheck(obj, cls_nat)) { - PRINTMARK(); tc->type = JT_NULL; return; } - PRINTMARK(); if (enc->datetimeIso) { - PRINTMARK(); pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; tc->type = JT_UTF8; } else { - PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = - PyDateTimeToEpoch(obj, base); + GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; } else if (PyTime_Check(obj)) { - PRINTMARK(); pc->PyTypeToUTF8 = PyTimeToJSON; tc->type = JT_UTF8; return; } else if (PyArray_IsScalar(obj, Datetime)) { - PRINTMARK(); if (((PyDatetimeScalarObject *)obj)->obval == get_nat()) { - PRINTMARK(); tc->type = JT_NULL; return; } - PRINTMARK(); if (enc->datetimeIso) { - PRINTMARK(); pc->PyTypeToUTF8 = PyDateTimeToIsoCallback; tc->type = JT_UTF8; } else { - PRINTMARK(); NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; - GET_TC(tc)->longValue = - PyDateTimeToEpoch(obj, base); + GET_TC(tc)->longValue = PyDateTimeToEpoch(obj, base); tc->type = JT_LONG; } return; } else if (PyDelta_Check(obj)) { if (PyObject_HasAttrString(obj, "value")) { - PRINTMARK(); value = get_long_attr(obj, "value"); } else { - PRINTMARK(); value = total_seconds(obj) * 1000000000LL; // 
nanoseconds per second } - PRINTMARK(); if (value == get_nat()) { - PRINTMARK(); tc->type = JT_NULL; return; } else if (enc->datetimeIso) { @@ -1718,7 +1626,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { exc = PyErr_Occurred(); if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - PRINTMARK(); goto INVALID; } @@ -1727,7 +1634,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { GET_TC(tc)->longValue = value; return; } else if (PyArray_IsScalar(obj, Integer)) { - PRINTMARK(); tc->type = JT_LONG; PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_INT64)); @@ -1735,19 +1641,16 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { exc = PyErr_Occurred(); if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { - PRINTMARK(); goto INVALID; } return; } else if (PyArray_IsScalar(obj, Bool)) { - PRINTMARK(); PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->longValue), PyArray_DescrFromType(NPY_BOOL)); tc->type = (GET_TC(tc)->longValue) ? 
JT_TRUE : JT_FALSE; return; } else if (PyArray_IsScalar(obj, Float) || PyArray_IsScalar(obj, Double)) { - PRINTMARK(); PyArray_CastScalarToCtype(obj, &(GET_TC(tc)->doubleValue), PyArray_DescrFromType(NPY_DOUBLE)); tc->type = JT_DOUBLE; @@ -1758,7 +1661,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { obj); goto INVALID; } else if (PyObject_TypeCheck(obj, cls_na)) { - PRINTMARK(); tc->type = JT_NULL; return; } @@ -1767,7 +1669,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { if (PyObject_TypeCheck(obj, cls_index)) { if (enc->outputFormat == SPLIT) { - PRINTMARK(); tc->type = JT_OBJECT; pc->iterBegin = Index_iterBegin; pc->iterEnd = Index_iterEnd; @@ -1779,7 +1680,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->newObj = get_values(obj); if (pc->newObj) { - PRINTMARK(); tc->type = JT_ARRAY; pc->iterBegin = NpyArr_iterBegin; pc->iterEnd = NpyArr_iterEnd; @@ -1793,7 +1693,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } else if (PyObject_TypeCheck(obj, cls_series)) { if (enc->outputFormat == SPLIT) { - PRINTMARK(); tc->type = JT_OBJECT; pc->iterBegin = Series_iterBegin; pc->iterEnd = Series_iterEnd; @@ -1809,7 +1708,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - PRINTMARK(); tc->type = JT_OBJECT; tmpObj = PyObject_GetAttrString(obj, "index"); if (!tmpObj) { @@ -1827,7 +1725,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } } else { - PRINTMARK(); tc->type = JT_ARRAY; } pc->iterBegin = NpyArr_iterBegin; @@ -1838,7 +1735,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } else if (PyArray_Check(obj)) { if (enc->npyCtxtPassthru) { - PRINTMARK(); pc->npyarr = enc->npyCtxtPassthru; tc->type = (pc->npyarr->columnLabels ? 
JT_OBJECT : JT_ARRAY); @@ -1852,7 +1748,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } - PRINTMARK(); tc->type = JT_ARRAY; pc->iterBegin = NpyArr_iterBegin; pc->iterEnd = NpyArr_iterEnd; @@ -1862,7 +1757,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } else if (PyObject_TypeCheck(obj, cls_dataframe)) { if (enc->blkCtxtPassthru) { - PRINTMARK(); pc->pdblock = enc->blkCtxtPassthru; tc->type = (pc->pdblock->npyCtxts[0]->columnLabels ? JT_OBJECT : JT_ARRAY); @@ -1878,7 +1772,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } if (enc->outputFormat == SPLIT) { - PRINTMARK(); tc->type = JT_OBJECT; pc->iterBegin = DataFrame_iterBegin; pc->iterEnd = DataFrame_iterEnd; @@ -1888,7 +1781,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } - PRINTMARK(); if (is_simple_frame(obj)) { pc->iterBegin = NpyArr_iterBegin; pc->iterEnd = NpyArr_iterEnd; @@ -1908,10 +1800,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetValue = NpyArr_iterGetValue; if (enc->outputFormat == VALUES) { - PRINTMARK(); tc->type = JT_ARRAY; } else if (enc->outputFormat == RECORDS) { - PRINTMARK(); tc->type = JT_ARRAY; tmpObj = PyObject_GetAttrString(obj, "columns"); if (!tmpObj) { @@ -1930,7 +1820,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } } else if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { - PRINTMARK(); tc->type = JT_OBJECT; tmpObj = (enc->outputFormat == INDEX ? 
PyObject_GetAttrString(obj, "index") @@ -1973,7 +1862,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } if (enc->outputFormat == COLUMNS) { - PRINTMARK(); pc->transpose = 1; } } else { @@ -1981,7 +1869,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } return; } else if (PyDict_Check(obj)) { - PRINTMARK(); tc->type = JT_OBJECT; pc->iterBegin = Dict_iterBegin; pc->iterEnd = Dict_iterEnd; @@ -1993,7 +1880,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } else if (PyList_Check(obj)) { - PRINTMARK(); tc->type = JT_ARRAY; pc->iterBegin = List_iterBegin; pc->iterEnd = List_iterEnd; @@ -2002,7 +1888,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetName = List_iterGetName; return; } else if (PyTuple_Check(obj)) { - PRINTMARK(); tc->type = JT_ARRAY; pc->iterBegin = Tuple_iterBegin; pc->iterEnd = Tuple_iterEnd; @@ -2011,7 +1896,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { pc->iterGetName = Tuple_iterGetName; return; } else if (PyAnySet_Check(obj)) { - PRINTMARK(); tc->type = JT_ARRAY; pc->iterBegin = Set_iterBegin; pc->iterEnd = Set_iterEnd; @@ -2041,7 +1925,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { return; } - PRINTMARK(); tc->type = JT_OBJECT; pc->iterBegin = Dict_iterBegin; pc->iterEnd = Dict_iterEnd; @@ -2059,7 +1942,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } - PRINTMARK(); tc->type = JT_OBJECT; pc->iterBegin = Dir_iterBegin; pc->iterEnd = Dir_iterEnd; @@ -2076,7 +1958,6 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { } void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { - PRINTMARK(); if (tc->prv) { Py_XDECREF(GET_TC(tc)->newObj); GET_TC(tc)->newObj = NULL; @@ -2105,16 +1986,16 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) { return GET_TC(tc)->doubleValue; } -const char *Object_getBigNumStringValue(JSOBJ obj, 
JSONTypeContext *tc, - size_t *_outLen) { - PyObject* repr = PyObject_Str(obj); - const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *) _outLen); - char* bytes = PyObject_Malloc(*_outLen + 1); +const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, + size_t *_outLen) { + PyObject *repr = PyObject_Str(obj); + const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen); + char *bytes = PyObject_Malloc(*_outLen + 1); memcpy(bytes, str, *_outLen + 1); GET_TC(tc)->cStr = bytes; Py_DECREF(repr); - + return GET_TC(tc)->cStr; } @@ -2200,8 +2081,6 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, pyEncoder.outputFormat = COLUMNS; pyEncoder.defaultHandler = 0; - PRINTMARK(); - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOOi", kwlist, &oinput, &oensureAscii, &idoublePrecision, &oencodeHTMLChars, &sOrient, &sdateFormat, @@ -2274,16 +2153,12 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, encoder->indent = indent; pyEncoder.originalOutputFormat = pyEncoder.outputFormat; - PRINTMARK(); ret = JSON_EncodeObject(oinput, encoder, buffer, sizeof(buffer)); - PRINTMARK(); if (PyErr_Occurred()) { - PRINTMARK(); return NULL; } if (encoder->errorMsg) { - PRINTMARK(); if (ret != buffer) { encoder->free(ret); } @@ -2297,7 +2172,5 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args, encoder->free(ret); } - PRINTMARK(); - return newobj; } From 1f380550b5e1badd399c2306cb2621da3af5d89d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 9 Jul 2020 15:00:13 -0700 Subject: [PATCH 0310/1025] REF: remove always-UTC arg from tz_convert, tz_convert_single (#35154) --- asv_bench/benchmarks/tslibs/tz_convert.py | 21 ++++-- pandas/_libs/tslibs/__init__.py | 4 +- pandas/_libs/tslibs/offsets.pyx | 5 +- pandas/_libs/tslibs/timestamps.pyx | 6 +- pandas/_libs/tslibs/tzconversion.pxd | 2 +- pandas/_libs/tslibs/tzconversion.pyx | 68 +++++-------------- pandas/core/arrays/datetimes.py | 4 +- 
.../tests/scalar/timestamp/test_timezones.py | 2 +- pandas/tests/tslibs/test_api.py | 2 +- pandas/tests/tslibs/test_conversion.py | 9 +-- pandas/tseries/frequencies.py | 5 +- 11 files changed, 52 insertions(+), 76 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/tz_convert.py b/asv_bench/benchmarks/tslibs/tz_convert.py index 2a1f559bdf6d4..c2c90024ca5bd 100644 --- a/asv_bench/benchmarks/tslibs/tz_convert.py +++ b/asv_bench/benchmarks/tslibs/tz_convert.py @@ -1,16 +1,23 @@ import numpy as np from pytz import UTC -from pandas._libs.tslibs.tzconversion import tz_convert, tz_localize_to_utc +from pandas._libs.tslibs.tzconversion import tz_localize_to_utc from .tslib import _sizes, _tzs +try: + old_sig = False + from pandas._libs.tslibs.tzconversion import tz_convert_from_utc +except ImportError: + old_sig = True + from pandas._libs.tslibs.tzconversion import tz_convert as tz_convert_from_utc + class TimeTZConvert: - params = ( + params = [ _sizes, [x for x in _tzs if x is not None], - ) + ] param_names = ["size", "tz"] def setup(self, size, tz): @@ -21,7 +28,13 @@ def time_tz_convert_from_utc(self, size, tz): # effectively: # dti = DatetimeIndex(self.i8data, tz=tz) # dti.tz_localize(None) - tz_convert(self.i8data, UTC, tz) + if size >= 10 ** 6 and str(tz) == "tzlocal()": + # asv fill will because each call takes 8+seconds + return + if old_sig: + tz_convert_from_utc(self.i8data, UTC, tz) + else: + tz_convert_from_utc(self.i8data, tz) def time_tz_localize_to_utc(self, size, tz): # effectively: diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index c2f3478a50ab4..6fe6fa0a13c34 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -19,7 +19,7 @@ "ints_to_pytimedelta", "get_resolution", "Timestamp", - "tz_convert_single", + "tz_convert_from_utc_single", "to_offset", "Tick", "BaseOffset", @@ -34,7 +34,7 @@ from .resolution import Resolution from .timedeltas import Timedelta, delta_to_nanoseconds, 
ints_to_pytimedelta from .timestamps import Timestamp -from .tzconversion import tz_convert_single +from .tzconversion import tz_convert_from_utc_single from .vectorized import ( dt64arr_to_periodarr, get_resolution, diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index fb07e3fe7547e..0f9280ae92d39 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -46,8 +46,7 @@ from pandas._libs.tslibs.np_datetime cimport ( dt64_to_dtstruct, pydate_to_dtstruct, ) -from pandas._libs.tslibs.timezones cimport utc_pytz as UTC -from pandas._libs.tslibs.tzconversion cimport tz_convert_single +from pandas._libs.tslibs.tzconversion cimport tz_convert_from_utc_single from .dtypes cimport PeriodDtypeCode from .timedeltas cimport delta_to_nanoseconds @@ -264,7 +263,7 @@ cdef _to_dt64D(dt): # equiv `Timestamp(dt).value` or `dt.timestamp() * 10**9` nanos = getattr(dt, "nanosecond", 0) i8 = convert_datetime_to_tsobject(dt, tz=None, nanos=nanos).value - dt = tz_convert_single(i8, UTC, dt.tzinfo) + dt = tz_convert_from_utc_single(i8, dt.tzinfo) dt = np.int64(dt).astype('datetime64[ns]') else: dt = np.datetime64(dt) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index a2dacd9d36b14..8cef685933863 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -59,7 +59,7 @@ from pandas._libs.tslibs.timezones cimport ( get_timezone, tz_compare, ) from pandas._libs.tslibs.tzconversion cimport ( - tz_convert_single, + tz_convert_from_utc_single, tz_localize_to_utc_single, ) @@ -1309,7 +1309,7 @@ default 'raise' else: if tz is None: # reset tz - value = tz_convert_single(self.value, UTC, self.tz) + value = tz_convert_from_utc_single(self.value, self.tz) return Timestamp(value, tz=tz, freq=self.freq) else: raise TypeError( @@ -1391,7 +1391,7 @@ default 'raise' tzobj = self.tzinfo value = self.value if tzobj is not None: - value = tz_convert_single(value, UTC, tzobj) + 
value = tz_convert_from_utc_single(value, tzobj) # setup components dt64_to_dtstruct(value, &dts) diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 7d102868256de..1990afd77a8fb 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -3,7 +3,7 @@ from numpy cimport int64_t cdef int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz, bint* fold=*) -cpdef int64_t tz_convert_single(int64_t val, tzinfo tz1, tzinfo tz2) +cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz) cdef int64_t tz_localize_to_utc_single( int64_t val, tzinfo tz, object ambiguous=*, object nonexistent=* ) except? -1 diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 98c40e109dbab..a6afd47d93479 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -366,17 +366,16 @@ cdef int64_t tz_convert_utc_to_tzlocal(int64_t utc_val, tzinfo tz, bint* fold=NU return _tz_convert_tzlocal_utc(utc_val, tz, to_utc=False, fold=fold) -cpdef int64_t tz_convert_single(int64_t val, tzinfo tz1, tzinfo tz2): +cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): """ - Convert the val (in i8) from timezone1 to timezone2 + Convert the val (in i8) from UTC to tz - This is a single timezone version of tz_convert + This is a single value version of tz_convert_from_utc. 
Parameters ---------- val : int64 - tz1 : tzinfo - tz2 : tzinfo + tz : tzinfo Returns ------- @@ -384,38 +383,27 @@ cpdef int64_t tz_convert_single(int64_t val, tzinfo tz1, tzinfo tz2): """ cdef: int64_t arr[1] - bint to_utc = is_utc(tz2) - tzinfo tz - - # See GH#17734 We should always be converting either from UTC or to UTC - assert is_utc(tz1) or to_utc if val == NPY_NAT: return val - if to_utc: - tz = tz1 - else: - tz = tz2 - if is_utc(tz): return val elif is_tzlocal(tz): - return _tz_convert_tzlocal_utc(val, tz, to_utc=to_utc) + return _tz_convert_tzlocal_utc(val, tz, to_utc=False) else: arr[0] = val - return _tz_convert_dst(arr, tz, to_utc=to_utc)[0] + return _tz_convert_dst(arr, tz)[0] -def tz_convert(int64_t[:] vals, tzinfo tz1, tzinfo tz2): +def tz_convert_from_utc(int64_t[:] vals, tzinfo tz): """ - Convert the values (in i8) from timezone1 to timezone2 + Convert the values (in i8) from UTC to tz Parameters ---------- vals : int64 ndarray - tz1 : tzinfo - tz2 : tzinfo + tz : tzinfo Returns ------- @@ -423,36 +411,24 @@ def tz_convert(int64_t[:] vals, tzinfo tz1, tzinfo tz2): """ cdef: int64_t[:] converted - bint to_utc = is_utc(tz2) - tzinfo tz - - # See GH#17734 We should always be converting from UTC; otherwise - # should use tz_localize_to_utc. - assert is_utc(tz1) if len(vals) == 0: return np.array([], dtype=np.int64) - if to_utc: - tz = tz1 - else: - tz = tz2 - - converted = _tz_convert_one_way(vals, tz, to_utc=to_utc) + converted = _tz_convert_from_utc(vals, tz) return np.array(converted, dtype=np.int64) @cython.boundscheck(False) @cython.wraparound(False) -cdef int64_t[:] _tz_convert_one_way(int64_t[:] vals, tzinfo tz, bint to_utc): +cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): """ Convert the given values (in i8) either to UTC or from UTC. 
Parameters ---------- vals : int64 ndarray - tz1 : tzinfo - to_utc : bool + tz : tzinfo Returns ------- @@ -472,9 +448,9 @@ cdef int64_t[:] _tz_convert_one_way(int64_t[:] vals, tzinfo tz, bint to_utc): if val == NPY_NAT: converted[i] = NPY_NAT else: - converted[i] = _tz_convert_tzlocal_utc(val, tz, to_utc) + converted[i] = _tz_convert_tzlocal_utc(val, tz, to_utc=False) else: - converted = _tz_convert_dst(vals, tz, to_utc) + converted = _tz_convert_dst(vals, tz) return converted @@ -565,9 +541,7 @@ cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True, @cython.boundscheck(False) @cython.wraparound(False) -cdef int64_t[:] _tz_convert_dst( - const int64_t[:] values, tzinfo tz, bint to_utc=True, -): +cdef int64_t[:] _tz_convert_dst(const int64_t[:] values, tzinfo tz): """ tz_convert for non-UTC non-tzlocal cases where we have to check DST transitions pointwise. @@ -576,8 +550,6 @@ cdef int64_t[:] _tz_convert_dst( ---------- values : ndarray[int64_t] tz : tzinfo - to_utc : bool - True if converting _to_ UTC, False if converting _from_ utc Returns ------- @@ -607,10 +579,7 @@ cdef int64_t[:] _tz_convert_dst( if v == NPY_NAT: result[i] = v else: - if to_utc: - result[i] = v - delta - else: - result[i] = v + delta + result[i] = v + delta else: # Previously, this search was done pointwise to try and benefit @@ -629,9 +598,6 @@ cdef int64_t[:] _tz_convert_dst( # it elsewhere? raise ValueError("First time before start of DST info") - if to_utc: - result[i] = v - deltas[pos[i]] - else: - result[i] = v + deltas[pos[i]] + result[i] = v + deltas[pos[i]] return result diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8eac45cdedaec..5038df85c9160 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -728,7 +728,7 @@ def _local_timestamps(self): This is used to calculate time-of-day information as if the timestamps were timezone-naive. 
""" - return tzconversion.tz_convert(self.asi8, timezones.UTC, self.tz) + return tzconversion.tz_convert_from_utc(self.asi8, self.tz) def tz_convert(self, tz): """ @@ -960,7 +960,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise"): if self.tz is not None: if tz is None: - new_dates = tzconversion.tz_convert(self.asi8, timezones.UTC, self.tz) + new_dates = tzconversion.tz_convert_from_utc(self.asi8, self.tz) else: raise TypeError("Already tz-aware, use tz_convert to convert.") else: diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 9611c827be6fe..83764aa184392 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -334,7 +334,7 @@ def test_timestamp_to_datetime_tzoffset(self): def test_timestamp_constructor_near_dst_boundary(self): # GH#11481 & GH#15777 # Naive string timestamps were being localized incorrectly - # with tz_convert_single instead of tz_localize_to_utc + # with tz_convert_from_utc_single instead of tz_localize_to_utc for tz in ["Europe/Brussels", "Europe/Prague"]: result = Timestamp("2015-10-25 01:00", tz=tz) diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 957706fcb460e..ccaceb7e6f906 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -47,7 +47,7 @@ def test_namespace(): "delta_to_nanoseconds", "ints_to_pytimedelta", "localize_pydatetime", - "tz_convert_single", + "tz_convert_from_utc_single", "to_offset", ] diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 5a16fea47e90d..b35940c6bb95b 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -12,9 +12,9 @@ def _compare_utc_to_local(tz_didx): def f(x): - return tzconversion.tz_convert_single(x, UTC, tz_didx.tz) + return tzconversion.tz_convert_from_utc_single(x, tz_didx.tz) - result = 
tzconversion.tz_convert(tz_didx.asi8, UTC, tz_didx.tz) + result = tzconversion.tz_convert_from_utc(tz_didx.asi8, tz_didx.tz) expected = np.vectorize(f)(tz_didx.asi8) tm.assert_numpy_array_equal(result, expected) @@ -22,9 +22,6 @@ def f(x): def _compare_local_to_utc(tz_didx, naive_didx): # Check that tz_localize behaves the same vectorized and pointwise. - def f(x): - return tzconversion.tz_convert_single(x, tz_didx.tz, UTC) - err1 = err2 = None try: result = tzconversion.tz_localize_to_utc(naive_didx.asi8, tz_didx.tz) @@ -71,7 +68,7 @@ def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq): ], ) def test_tz_convert_corner(arr): - result = tzconversion.tz_convert(arr, UTC, timezones.maybe_get_tz("Asia/Tokyo")) + result = tzconversion.tz_convert_from_utc(arr, timezones.maybe_get_tz("Asia/Tokyo")) tm.assert_numpy_array_equal(result, arr) diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index f94c8ef6550a5..23e08c7550646 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -21,7 +21,6 @@ ) from pandas._libs.tslibs.parsing import get_rule_month from pandas._libs.tslibs.resolution import month_position_check -from pandas._libs.tslibs.timezones import UTC from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( @@ -198,7 +197,9 @@ def __init__(self, index, warn: bool = True): # the timezone so they are in local time if hasattr(index, "tz"): if index.tz is not None: - self.i8values = tzconversion.tz_convert(self.i8values, UTC, index.tz) + self.i8values = tzconversion.tz_convert_from_utc( + self.i8values, index.tz + ) self.warn = warn From ef432aa194547b8ea90b98ce37c0bd8f441a94c7 Mon Sep 17 00:00:00 2001 From: Robin to Roxel <35864265+r-toroxel@users.noreply.github.com> Date: Fri, 10 Jul 2020 00:05:33 +0200 Subject: [PATCH 0311/1025] TST Verifiy that dropna returns none when called inplace (#35179) (#35181) --- pandas/tests/frame/test_missing.py | 36 
++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 7cb7115276f71..9bf5d24085697 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -24,14 +24,16 @@ def test_dropEmptyRows(self, float_frame): smaller_frame = frame.dropna(how="all") # check that original was preserved tm.assert_series_equal(frame["foo"], original) - inplace_frame1.dropna(how="all", inplace=True) + return_value = inplace_frame1.dropna(how="all", inplace=True) tm.assert_series_equal(smaller_frame["foo"], expected) tm.assert_series_equal(inplace_frame1["foo"], expected) + assert return_value is None smaller_frame = frame.dropna(how="all", subset=["foo"]) - inplace_frame2.dropna(how="all", subset=["foo"], inplace=True) + return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True) tm.assert_series_equal(smaller_frame["foo"], expected) tm.assert_series_equal(inplace_frame2["foo"], expected) + assert return_value is None def test_dropIncompleteRows(self, float_frame): N = len(float_frame.index) @@ -45,18 +47,20 @@ def test_dropIncompleteRows(self, float_frame): smaller_frame = frame.dropna() tm.assert_series_equal(frame["foo"], original) - inp_frame1.dropna(inplace=True) + return_value = inp_frame1.dropna(inplace=True) exp = Series(mat[5:], index=float_frame.index[5:], name="foo") tm.assert_series_equal(smaller_frame["foo"], exp) tm.assert_series_equal(inp_frame1["foo"], exp) + assert return_value is None samesize_frame = frame.dropna(subset=["bar"]) tm.assert_series_equal(frame["foo"], original) assert (frame["bar"] == 5).all() - inp_frame2.dropna(subset=["bar"], inplace=True) + return_value = inp_frame2.dropna(subset=["bar"], inplace=True) tm.assert_index_equal(samesize_frame.index, float_frame.index) tm.assert_index_equal(inp_frame2.index, float_frame.index) + assert return_value is None def test_dropna(self): df = 
DataFrame(np.random.randn(6, 4)) @@ -65,31 +69,35 @@ def test_dropna(self): dropped = df.dropna(axis=1) expected = df.loc[:, [0, 1, 3]] inp = df.copy() - inp.dropna(axis=1, inplace=True) + return_value = inp.dropna(axis=1, inplace=True) tm.assert_frame_equal(dropped, expected) tm.assert_frame_equal(inp, expected) + assert return_value is None dropped = df.dropna(axis=0) expected = df.loc[list(range(2, 6))] inp = df.copy() - inp.dropna(axis=0, inplace=True) + return_value = inp.dropna(axis=0, inplace=True) tm.assert_frame_equal(dropped, expected) tm.assert_frame_equal(inp, expected) + assert return_value is None # threshold dropped = df.dropna(axis=1, thresh=5) expected = df.loc[:, [0, 1, 3]] inp = df.copy() - inp.dropna(axis=1, thresh=5, inplace=True) + return_value = inp.dropna(axis=1, thresh=5, inplace=True) tm.assert_frame_equal(dropped, expected) tm.assert_frame_equal(inp, expected) + assert return_value is None dropped = df.dropna(axis=0, thresh=4) expected = df.loc[range(2, 6)] inp = df.copy() - inp.dropna(axis=0, thresh=4, inplace=True) + return_value = inp.dropna(axis=0, thresh=4, inplace=True) tm.assert_frame_equal(dropped, expected) tm.assert_frame_equal(inp, expected) + assert return_value is None dropped = df.dropna(axis=1, thresh=4) tm.assert_frame_equal(dropped, df) @@ -100,9 +108,10 @@ def test_dropna(self): # subset dropped = df.dropna(axis=0, subset=[0, 1, 3]) inp = df.copy() - inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) + return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True) tm.assert_frame_equal(dropped, df) tm.assert_frame_equal(inp, df) + assert return_value is None # all dropped = df.dropna(axis=1, how="all") @@ -126,12 +135,14 @@ def test_drop_and_dropna_caching(self): df2 = df.copy() df["A"].dropna() tm.assert_series_equal(df["A"], original) - df["A"].dropna(inplace=True) + return_value = df["A"].dropna(inplace=True) tm.assert_series_equal(df["A"], expected) + assert return_value is None df2["A"].drop([1]) 
tm.assert_series_equal(df2["A"], original) - df2["A"].drop([1], inplace=True) + return_value = df2["A"].drop([1], inplace=True) tm.assert_series_equal(df2["A"], original.drop([1])) + assert return_value is None def test_dropna_corner(self, float_frame): # bad input @@ -251,8 +262,9 @@ def test_fillna_different_dtype(self): ) tm.assert_frame_equal(result, expected) - df.fillna({2: "foo"}, inplace=True) + return_value = df.fillna({2: "foo"}, inplace=True) tm.assert_frame_equal(df, expected) + assert return_value is None def test_fillna_limit_and_value(self): # limit and value From 49d4a526f2f505a95afe20d4420f964fb3098f31 Mon Sep 17 00:00:00 2001 From: Song Wenhao Date: Fri, 10 Jul 2020 06:06:36 +0800 Subject: [PATCH 0312/1025] DOC: fix code snippets for generic indexing (#35175) Co-authored-by: maxime.song --- doc/source/user_guide/cookbook.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 50b946999092a..49487ac327e73 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -219,8 +219,8 @@ There are 2 explicit slicing methods, with a third general case df.loc['bar':'kar'] # Label # Generic - df.iloc[0:3] - df.loc['bar':'kar'] + df[0:3] + df['bar':'kar'] Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. 
From c2da40adaa67a56bef80cd07f59645d023d68946 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 9 Jul 2020 15:08:16 -0700 Subject: [PATCH 0313/1025] PERF: periodarr_to_dt64arr (#35171) --- asv_bench/benchmarks/tslibs/period.py | 38 ++++++++++++++++++++++++++- pandas/_libs/tslibs/period.pyx | 31 ++++++++++++++++++---- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 9156c4aa90ea0..1a2c89b48c665 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -2,10 +2,15 @@ Period benchmarks that rely only on tslibs. See benchmarks.period for Period benchmarks that rely on other parts fo pandas. """ -from pandas import Period + +import numpy as np + +from pandas._libs.tslibs.period import Period, periodarr_to_dt64arr from pandas.tseries.frequencies import to_offset +from .tslib import _sizes + class PeriodProperties: @@ -68,3 +73,34 @@ def setup(self, freq, is_offset): def time_period_constructor(self, freq, is_offset): Period("2012-06-01", freq=freq) + + +class TimePeriodArrToDT64Arr: + params = [ + _sizes, + [ + 1000, + 1011, # Annual - November End + 2000, + 2011, # Quarterly - November End + 3000, + 4000, + 4006, # Weekly - Saturday End + 5000, + 6000, + 7000, + 8000, + 9000, + 10000, + 11000, + 12000, + ], + ] + param_names = ["size", "freq"] + + def setup(self, size, freq): + arr = np.arange(10, dtype="i8").repeat(size // 10) + self.i8values = arr + + def time_periodarray_to_dt64arr(self, size, freq): + periodarr_to_dt64arr(self.i8values, freq) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e6ba1968797ed..e992b20b12db2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -54,6 +54,7 @@ from pandas._libs.tslibs.ccalendar cimport ( get_days_in_month, ) from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS +from pandas._libs.tslibs.conversion import 
ensure_datetime64ns from pandas._libs.tslibs.dtypes cimport ( PeriodDtypeBase, @@ -943,14 +944,34 @@ def periodarr_to_dt64arr(const int64_t[:] periodarr, int freq): int64_t[:] out Py_ssize_t i, l - l = len(periodarr) + if freq < 6000: # i.e. FR_DAY, hard-code to avoid need to cast + l = len(periodarr) + out = np.empty(l, dtype="i8") - out = np.empty(l, dtype='i8') + # We get here with freqs that do not correspond to a datetime64 unit + for i in range(l): + out[i] = period_ordinal_to_dt64(periodarr[i], freq) - for i in range(l): - out[i] = period_ordinal_to_dt64(periodarr[i], freq) + return out.base # .base to access underlying np.ndarray - return out.base # .base to access underlying np.ndarray + else: + # Short-circuit for performance + if freq == FR_NS: + return periodarr.base + + if freq == FR_US: + dta = periodarr.base.view("M8[us]") + elif freq == FR_MS: + dta = periodarr.base.view("M8[ms]") + elif freq == FR_SEC: + dta = periodarr.base.view("M8[s]") + elif freq == FR_MIN: + dta = periodarr.base.view("M8[m]") + elif freq == FR_HR: + dta = periodarr.base.view("M8[h]") + elif freq == FR_DAY: + dta = periodarr.base.view("M8[D]") + return ensure_datetime64ns(dta) cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end): From 03fdde991310f82c5f5207d8c9a3b7259c6cdc94 Mon Sep 17 00:00:00 2001 From: Rik-de-Kort Date: Fri, 10 Jul 2020 01:34:44 +0200 Subject: [PATCH 0314/1025] ENH: Add optional argument index to pd.melt to maintain index values (#33659) --- doc/source/user_guide/reshaping.rst | 16 +++++++++++ doc/source/whatsnew/v1.1.0.rst | 2 ++ pandas/core/frame.py | 4 ++- pandas/core/reshape/melt.py | 10 +++++-- pandas/core/shared_docs.py | 16 +++++++++++ pandas/tests/reshape/test_melt.py | 41 +++++++++++++++++++++++++++++ 6 files changed, 86 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index c476e33b8ddde..aa6bf44547040 100644 --- a/doc/source/user_guide/reshaping.rst +++ 
b/doc/source/user_guide/reshaping.rst @@ -296,6 +296,22 @@ For instance, cheese.melt(id_vars=['first', 'last']) cheese.melt(id_vars=['first', 'last'], var_name='quantity') +When transforming a DataFrame using :func:`~pandas.melt`, the index will be ignored. The original index values can be kept around by setting the ``ignore_index`` parameter to ``False`` (default is ``True``). This will however duplicate them. + +.. versionadded:: 1.1.0 + +.. ipython:: python + + index = pd.MultiIndex.from_tuples([('person', 'A'), ('person', 'B')]) + cheese = pd.DataFrame({'first': ['John', 'Mary'], + 'last': ['Doe', 'Bo'], + 'height': [5.5, 6.0], + 'weight': [130, 150]}, + index=index) + cheese + cheese.melt(id_vars=['first', 'last']) + cheese.melt(id_vars=['first', 'last'], ignore_index=False) + Another way to transform is to use the :func:`~pandas.wide_to_long` panel data convenience function. It is less flexible than :func:`~pandas.melt`, but more user-friendly. diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 986ee371566cd..dc1ef12890233 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -297,6 +297,7 @@ Other enhancements This can be used to set a custom compression level, e.g., ``df.to_csv(path, compression={'method': 'gzip', 'compresslevel': 1}`` (:issue:`33196`) +- :meth:`melt` has gained an ``ignore_index`` (default ``True``) argument that, if set to ``False``, prevents the method from dropping the index (:issue:`17440`). 
- :meth:`Series.update` now accepts objects that can be coerced to a :class:`Series`, such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`) - :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) @@ -1168,3 +1169,4 @@ Other Contributors ~~~~~~~~~~~~ + diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 87041341ac3a6..3d2200cb45c6e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2145,7 +2145,7 @@ def to_stata( from pandas.io.stata import StataWriter117 as statawriter # type: ignore else: # versions 118 and 119 # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import StataWriterUTF8 as statawriter # type:ignore + from pandas.io.stata import StataWriterUTF8 as statawriter # type: ignore kwargs: Dict[str, Any] = {} if version is None or version >= 117: @@ -7105,6 +7105,7 @@ def melt( var_name=None, value_name="value", col_level=None, + ignore_index=True, ) -> "DataFrame": return melt( @@ -7114,6 +7115,7 @@ def melt( var_name=var_name, value_name=value_name, col_level=col_level, + ignore_index=ignore_index, ) # ---------------------------------------------------------------------- diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 923b9e7462d8b..1ba6854a79265 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -14,6 +14,7 @@ import pandas.core.common as com from pandas.core.indexes.api import Index, MultiIndex from pandas.core.reshape.concat import concat +from pandas.core.reshape.util import _tile_compat from pandas.core.shared_docs import _shared_docs from pandas.core.tools.numeric import to_numeric @@ -32,8 +33,8 @@ def melt( var_name=None, value_name="value", col_level=None, + ignore_index: bool = True, ) -> "DataFrame": - # 
TODO: what about the existing index? # If multiindex, gather names of columns on all level for checking presence # of `id_vars` and `value_vars` if isinstance(frame.columns, MultiIndex): @@ -132,7 +133,12 @@ def melt( # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N) - return frame._constructor(mdata, columns=mcolumns) + result = frame._constructor(mdata, columns=mcolumns) + + if not ignore_index: + result.index = _tile_compat(frame.index, K) + + return result @deprecate_kwarg(old_arg_name="label", new_arg_name=None) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 1894f551afea5..b81942f062b19 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -28,6 +28,11 @@ Name to use for the 'value' column. col_level : int or str, optional If columns are a MultiIndex then use this level to melt. + ignore_index : bool, default True + If True, original index is ignored. If False, the original index is retained. + Index labels will be repeated as necessary. + + .. 
versionadded:: 1.1.0 Returns ------- @@ -78,6 +83,17 @@ 1 b B 3 2 c B 5 + Original index values can be kept around: + + >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) + A variable value + 0 a B 1 + 1 b B 3 + 2 c B 5 + 0 a C 2 + 1 b C 4 + 2 c C 6 + If you have multi-index columns: >>> df.columns = [list('ABC'), list('DEF')] diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 03fda038539e2..241721432bbf9 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -357,6 +357,47 @@ def test_melt_mixed_int_str_value_vars(self): expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]}) tm.assert_frame_equal(result, expected) + def test_ignore_index(self): + # GH 17440 + df = DataFrame({"foo": [0], "bar": [1]}, index=["first"]) + result = melt(df, ignore_index=False) + expected = DataFrame( + {"variable": ["foo", "bar"], "value": [0, 1]}, index=["first", "first"] + ) + tm.assert_frame_equal(result, expected) + + def test_ignore_multiindex(self): + # GH 17440 + index = pd.MultiIndex.from_tuples( + [("first", "second"), ("first", "third")], names=["baz", "foobar"] + ) + df = DataFrame({"foo": [0, 1], "bar": [2, 3]}, index=index) + result = melt(df, ignore_index=False) + + expected_index = pd.MultiIndex.from_tuples( + [("first", "second"), ("first", "third")] * 2, names=["baz", "foobar"] + ) + expected = DataFrame( + {"variable": ["foo"] * 2 + ["bar"] * 2, "value": [0, 1, 2, 3]}, + index=expected_index, + ) + + tm.assert_frame_equal(result, expected) + + def test_ignore_index_name_and_type(self): + # GH 17440 + index = pd.Index(["foo", "bar"], dtype="category", name="baz") + df = DataFrame({"x": [0, 1], "y": [2, 3]}, index=index) + result = melt(df, ignore_index=False) + + expected_index = pd.Index(["foo", "bar"] * 2, dtype="category", name="baz") + expected = DataFrame( + {"variable": ["x", "x", "y", "y"], "value": [0, 1, 2, 3]}, + index=expected_index, + ) + + 
tm.assert_frame_equal(result, expected) + class TestLreshape: def test_pairs(self): From b67e155be83591ad43eff65a0d9c36e6f0a7b1d8 Mon Sep 17 00:00:00 2001 From: Jihwan Song Date: Thu, 9 Jul 2020 19:35:58 -0400 Subject: [PATCH 0315/1025] PERF: to speed up rendering of styler (#34863) --- asv_bench/asv.conf.json | 1 + asv_bench/benchmarks/io/style.py | 34 +++++++++++++++++++++++++++ doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/formats/style.py | 18 ++++++++++---- pandas/tests/io/formats/test_style.py | 13 +++------- 5 files changed, 52 insertions(+), 15 deletions(-) create mode 100644 asv_bench/benchmarks/io/style.py diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 7c10a2d17775a..4583fac85b776 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -53,6 +53,7 @@ "xlwt": [], "odfpy": [], "pytest": [], + "jinja2": [], // If using Windows with python 2.7 and want to build using the // mingw toolchain (rather than MSVC), uncomment the following line. // "libpython": [], diff --git a/asv_bench/benchmarks/io/style.py b/asv_bench/benchmarks/io/style.py new file mode 100644 index 0000000000000..4fc07bbabda06 --- /dev/null +++ b/asv_bench/benchmarks/io/style.py @@ -0,0 +1,34 @@ +import numpy as np + +from pandas import DataFrame + + +class RenderApply: + + params = [[12, 24, 36], [12, 120]] + param_names = ["cols", "rows"] + + def setup(self, cols, rows): + self.df = DataFrame( + np.random.randn(rows, cols), + columns=[f"float_{i+1}" for i in range(cols)], + index=[f"row_{i+1}" for i in range(rows)], + ) + self._style_apply() + + def time_render(self, cols, rows): + self.st.render() + + def peakmem_apply(self, cols, rows): + self._style_apply() + + def peakmem_render(self, cols, rows): + self.st.render() + + def _style_apply(self): + def _apply_func(s): + return [ + "background-color: lightcyan" if s.name == "row_1" else "" for v in s + ] + + self.st = self.df.style.apply(_apply_func, axis=1) diff --git a/doc/source/whatsnew/v1.1.0.rst 
b/doc/source/whatsnew/v1.1.0.rst index dc1ef12890233..5473b7c1523f3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -854,6 +854,7 @@ Performance improvements - Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`) - Performance improvement in arithmetic operations (sub, add, mul, div) for MultiIndex (:issue:`34297`) - Performance improvement in `DataFrame[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`) +- Significant performance improvement of :meth:`io.formats.style.Styler.render` when styles are added in various ways, such as with :meth:`io.formats.style.Styler.apply`, :meth:`io.formats.style.Styler.applymap` or :meth:`io.formats.style.Styler.bar` (:issue:`19917`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 6250e99252928..d11144938eb26 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -561,11 +561,19 @@ def _update_ctx(self, attrs: DataFrame) -> None: Whitespace shouldn't matter and the final trailing ';' shouldn't matter.
""" - for row_label, v in attrs.iterrows(): - for col_label, col in v.items(): - i = self.index.get_indexer([row_label])[0] - j = self.columns.get_indexer([col_label])[0] - for pair in col.rstrip(";").split(";"): + coli = {k: i for i, k in enumerate(self.columns)} + rowi = {k: i for i, k in enumerate(self.index)} + for jj in range(len(attrs.columns)): + cn = attrs.columns[jj] + j = coli[cn] + for rn, c in attrs[[cn]].itertuples(): + if not c: + continue + c = c.rstrip(";") + if not c: + continue + i = rowi[rn] + for pair in c.split(";"): self.ctx[(i, j)].append(pair) def _copy(self, deepcopy: bool = False) -> "Styler": diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index ec4614538004c..9c6910637fa7e 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -405,9 +405,10 @@ def f(x): result = self.df.style.where(f, style1)._compute().ctx expected = { - (r, c): [style1 if f(self.df.loc[row, col]) else ""] + (r, c): [style1] for r, row in enumerate(self.df.index) for c, col in enumerate(self.df.columns) + if f(self.df.loc[row, col]) } assert result == expected @@ -966,7 +967,6 @@ def test_bar_align_mid_nans(self): "transparent 25.0%, #d65f5f 25.0%, " "#d65f5f 50.0%, transparent 50.0%)", ], - (1, 0): [""], (0, 1): [ "width: 10em", " height: 80%", @@ -994,7 +994,6 @@ def test_bar_align_zero_nans(self): "transparent 50.0%, #d65f5f 50.0%, " "#d65f5f 75.0%, transparent 75.0%)", ], - (1, 0): [""], (0, 1): [ "width: 10em", " height: 80%", @@ -1091,7 +1090,7 @@ def test_format_with_bad_na_rep(self): def test_highlight_null(self, null_color="red"): df = pd.DataFrame({"A": [0, np.nan]}) result = df.style.highlight_null()._compute().ctx - expected = {(0, 0): [""], (1, 0): ["background-color: red"]} + expected = {(1, 0): ["background-color: red"]} assert result == expected def test_highlight_null_subset(self): @@ -1104,9 +1103,7 @@ def test_highlight_null_subset(self): .ctx ) expected = { - (0, 
0): [""], (1, 0): ["background-color: red"], - (0, 1): [""], (1, 1): ["background-color: green"], } assert result == expected @@ -1219,8 +1216,6 @@ def test_highlight_max(self): expected = { (1, 0): ["background-color: yellow"], (1, 1): ["background-color: yellow"], - (0, 1): [""], - (0, 0): [""], } assert result == expected @@ -1228,8 +1223,6 @@ def test_highlight_max(self): expected = { (0, 1): ["background-color: yellow"], (1, 1): ["background-color: yellow"], - (0, 0): [""], - (1, 0): [""], } assert result == expected From ec090883ab60f9ca974c49669ae0950af136f868 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 9 Jul 2020 16:40:14 -0700 Subject: [PATCH 0316/1025] REF: re-use get_firstbday, get_lastbday in fields.pyx (#35199) --- pandas/_libs/tslibs/ccalendar.pxd | 2 + pandas/_libs/tslibs/ccalendar.pyx | 49 ++++++++++++++++ pandas/_libs/tslibs/fields.pyx | 77 ++++++-------------------- pandas/_libs/tslibs/offsets.pyx | 53 +++--------------- pandas/tests/tslibs/test_liboffsets.py | 5 +- 5 files changed, 78 insertions(+), 108 deletions(-) diff --git a/pandas/_libs/tslibs/ccalendar.pxd b/pandas/_libs/tslibs/ccalendar.pxd index 41cc477413607..4eb5188b8a04b 100644 --- a/pandas/_libs/tslibs/ccalendar.pxd +++ b/pandas/_libs/tslibs/ccalendar.pxd @@ -10,6 +10,8 @@ cpdef int32_t get_days_in_month(int year, Py_ssize_t month) nogil cpdef int32_t get_week_of_year(int year, int month, int day) nogil cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil cpdef int32_t get_day_of_year(int year, int month, int day) nogil +cpdef int get_lastbday(int year, int month) nogil +cpdef int get_firstbday(int year, int month) nogil cdef int64_t DAY_NANOS cdef int64_t HOUR_NANOS diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index de8fd3911e946..00cecd25e5225 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -241,3 +241,52 @@ cpdef int32_t get_day_of_year(int year, int month, int day) 
nogil: day_of_year = mo_off + day return day_of_year + + +# --------------------------------------------------------------------- +# Business Helpers + +cpdef int get_lastbday(int year, int month) nogil: + """ + Find the last day of the month that is a business day. + + Parameters + ---------- + year : int + month : int + + Returns + ------- + last_bday : int + """ + cdef: + int wkday, days_in_month + + wkday = dayofweek(year, month, 1) + days_in_month = get_days_in_month(year, month) + return days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0) + + +cpdef int get_firstbday(int year, int month) nogil: + """ + Find the first day of the month that is a business day. + + Parameters + ---------- + year : int + month : int + + Returns + ------- + first_bday : int + """ + cdef: + int first, wkday + + wkday = dayofweek(year, month, 1) + first = 1 + if wkday == 5: # on Saturday + first = 3 + elif wkday == 6: # on Sunday + first = 2 + return first diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 5ea7c0b6c5d02..03e4188fd06ef 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -19,6 +19,8 @@ from pandas._libs.tslibs.ccalendar cimport ( get_days_in_month, is_leapyear, dayofweek, get_week_of_year, get_day_of_year, get_iso_calendar, iso_calendar_t, month_offset, + get_firstbday, + get_lastbday, ) from pandas._libs.tslibs.np_datetime cimport ( npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, @@ -137,9 +139,7 @@ def get_start_end_field(const int64_t[:] dtindex, str field, int end_month = 12 int start_month = 1 ndarray[int8_t] out - bint isleap npy_datetimestruct dts - int mo_off, dom, doy, dow, ldom out = np.zeros(count, dtype='int8') @@ -172,10 +172,8 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - dom = dts.day - dow = dayofweek(dts.year, dts.month, dts.day) - if (dom == 1 and dow < 5) or (dom <= 3 and dow == 0): + if dts.day == 
get_firstbday(dts.year, dts.month): out[i] = 1 else: @@ -185,9 +183,8 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - dom = dts.day - if dom == 1: + if dts.day == 1: out[i] = 1 elif field == 'is_month_end': @@ -198,15 +195,8 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - isleap = is_leapyear(dts.year) - mo_off = month_offset[isleap * 13 + dts.month - 1] - dom = dts.day - doy = mo_off + dom - ldom = month_offset[isleap * 13 + dts.month] - dow = dayofweek(dts.year, dts.month, dts.day) - - if (ldom == doy and dow < 5) or ( - dow == 4 and (ldom - doy <= 2)): + + if dts.day == get_lastbday(dts.year, dts.month): out[i] = 1 else: @@ -216,13 +206,8 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - isleap = is_leapyear(dts.year) - mo_off = month_offset[isleap * 13 + dts.month - 1] - dom = dts.day - doy = mo_off + dom - ldom = month_offset[isleap * 13 + dts.month] - if ldom == doy: + if dts.day == get_days_in_month(dts.year, dts.month): out[i] = 1 elif field == 'is_quarter_start': @@ -233,11 +218,9 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - dom = dts.day - dow = dayofweek(dts.year, dts.month, dts.day) if ((dts.month - start_month) % 3 == 0) and ( - (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): + dts.day == get_firstbday(dts.year, dts.month)): out[i] = 1 else: @@ -247,9 +230,8 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - dom = dts.day - if ((dts.month - start_month) % 3 == 0) and dom == 1: + if ((dts.month - start_month) % 3 == 0) and dts.day == 1: out[i] = 1 elif field == 'is_quarter_end': @@ -260,16 +242,9 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - isleap = is_leapyear(dts.year) - mo_off = 
month_offset[isleap * 13 + dts.month - 1] - dom = dts.day - doy = mo_off + dom - ldom = month_offset[isleap * 13 + dts.month] - dow = dayofweek(dts.year, dts.month, dts.day) if ((dts.month - end_month) % 3 == 0) and ( - (ldom == doy and dow < 5) or ( - dow == 4 and (ldom - doy <= 2))): + dts.day == get_lastbday(dts.year, dts.month)): out[i] = 1 else: @@ -279,13 +254,9 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - isleap = is_leapyear(dts.year) - mo_off = month_offset[isleap * 13 + dts.month - 1] - dom = dts.day - doy = mo_off + dom - ldom = month_offset[isleap * 13 + dts.month] - if ((dts.month - end_month) % 3 == 0) and (ldom == doy): + if ((dts.month - end_month) % 3 == 0) and ( + dts.day == get_days_in_month(dts.year, dts.month)): out[i] = 1 elif field == 'is_year_start': @@ -296,11 +267,9 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - dom = dts.day - dow = dayofweek(dts.year, dts.month, dts.day) if (dts.month == start_month) and ( - (dom == 1 and dow < 5) or (dom <= 3 and dow == 0)): + dts.day == get_firstbday(dts.year, dts.month)): out[i] = 1 else: @@ -310,9 +279,8 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - dom = dts.day - if (dts.month == start_month) and dom == 1: + if (dts.month == start_month) and dts.day == 1: out[i] = 1 elif field == 'is_year_end': @@ -323,16 +291,9 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - isleap = is_leapyear(dts.year) - dom = dts.day - mo_off = month_offset[isleap * 13 + dts.month - 1] - doy = mo_off + dom - dow = dayofweek(dts.year, dts.month, dts.day) - ldom = month_offset[isleap * 13 + dts.month] if (dts.month == end_month) and ( - (ldom == doy and dow < 5) or ( - dow == 4 and (ldom - doy <= 2))): + dts.day == get_lastbday(dts.year, dts.month)): out[i] = 1 else: @@ -342,13 
+303,9 @@ def get_start_end_field(const int64_t[:] dtindex, str field, continue dt64_to_dtstruct(dtindex[i], &dts) - isleap = is_leapyear(dts.year) - mo_off = month_offset[isleap * 13 + dts.month - 1] - dom = dts.day - doy = mo_off + dom - ldom = month_offset[isleap * 13 + dts.month] - if (dts.month == end_month) and (ldom == doy): + if (dts.month == end_month) and ( + dts.day == get_days_in_month(dts.year, dts.month)): out[i] = 1 else: diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 0f9280ae92d39..4429ff083f350 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -34,7 +34,13 @@ from pandas._libs.tslibs.util cimport ( from pandas._libs.tslibs.ccalendar import ( MONTH_ALIASES, MONTH_TO_CAL_NUM, weekday_to_int, int_to_weekday, ) -from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, get_days_in_month, dayofweek +from pandas._libs.tslibs.ccalendar cimport ( + DAY_NANOS, + dayofweek, + get_days_in_month, + get_firstbday, + get_lastbday, +) from pandas._libs.tslibs.conversion cimport ( convert_datetime_to_tsobject, localize_pydatetime, @@ -177,51 +183,6 @@ cdef _wrap_timedelta_result(result): # --------------------------------------------------------------------- # Business Helpers -cpdef int get_lastbday(int year, int month) nogil: - """ - Find the last day of the month that is a business day. - - Parameters - ---------- - year : int - month : int - - Returns - ------- - last_bday : int - """ - cdef: - int wkday, days_in_month - - wkday = dayofweek(year, month, 1) - days_in_month = get_days_in_month(year, month) - return days_in_month - max(((wkday + days_in_month - 1) % 7) - 4, 0) - - -cpdef int get_firstbday(int year, int month) nogil: - """ - Find the first day of the month that is a business day. 
- - Parameters - ---------- - year : int - month : int - - Returns - ------- - first_bday : int - """ - cdef: - int first, wkday - - wkday = dayofweek(year, month, 1) - first = 1 - if wkday == 5: # on Saturday - first = 3 - elif wkday == 6: # on Sunday - first = 2 - return first - cdef _get_calendar(weekmask, holidays, calendar): """Generate busdaycalendar""" diff --git a/pandas/tests/tslibs/test_liboffsets.py b/pandas/tests/tslibs/test_liboffsets.py index 206a604788c7e..6a514d2cc8713 100644 --- a/pandas/tests/tslibs/test_liboffsets.py +++ b/pandas/tests/tslibs/test_liboffsets.py @@ -5,6 +5,7 @@ import pytest +from pandas._libs.tslibs.ccalendar import get_firstbday, get_lastbday import pandas._libs.tslibs.offsets as liboffsets from pandas._libs.tslibs.offsets import roll_qtrday @@ -25,7 +26,7 @@ def day_opt(request): ) def test_get_last_bday(dt, exp_week_day, exp_last_day): assert dt.weekday() == exp_week_day - assert liboffsets.get_lastbday(dt.year, dt.month) == exp_last_day + assert get_lastbday(dt.year, dt.month) == exp_last_day @pytest.mark.parametrize( @@ -37,7 +38,7 @@ def test_get_last_bday(dt, exp_week_day, exp_last_day): ) def test_get_first_bday(dt, exp_week_day, exp_first_day): assert dt.weekday() == exp_week_day - assert liboffsets.get_firstbday(dt.year, dt.month) == exp_first_day + assert get_firstbday(dt.year, dt.month) == exp_first_day @pytest.mark.parametrize( From 91e1af61ba81312704adeb850d0468a65aba04cb Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 9 Jul 2020 18:40:59 -0500 Subject: [PATCH 0317/1025] TST: update gbq service account key (#35090) Re-enable gbq integration tests. 
--- ci/travis_encrypt_gbq.sh | 2 +- ci/travis_gbq.json.enc | Bin 2352 -> 2352 bytes ci/travis_gbq_config.txt | 4 ++-- ci/travis_process_gbq_encryption.sh | 2 +- pandas/tests/io/test_gbq.py | 1 - 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh index e404ca73a405e..7d5692d9520af 100755 --- a/ci/travis_encrypt_gbq.sh +++ b/ci/travis_encrypt_gbq.sh @@ -19,7 +19,7 @@ if [[ ! -f $GBQ_JSON_FILE ]]; then fi echo "Encrypting $GBQ_JSON_FILE..." -read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file $GBQ_JSON_FILE \ +read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file -r pandas-dev/pandas $GBQ_JSON_FILE \ travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key"); echo "Adding your secure key to travis_gbq_config.txt ..." diff --git a/ci/travis_gbq.json.enc b/ci/travis_gbq.json.enc index c2a33bbd6f26383bd7e8a7a504e626284efb5fd0..6e0b6cee4048c70f9073dcdd75ed5008f2792c3e 100644 GIT binary patch literal 2352 zcmV-03D5R{crO*E{ek(-gWC#PQIGk*$iR%q>rF77-%gDxF9P?pb2rMmlMwIEx10ls z#PDGCjA$!JdSdpjZG0M3l|}#dZLrLz#@4cSS0@@t%glX;4Cgv+_M276eKlp=I~~f^ z%6@(WxCT-1IIF_t#kN*1>H;&?w(8kIry&7Tdh|#i_k}B$0Lp8?q^ZOjMAio zG~}090`&R9oHeK4@|G&WTDA(#J*5+tL_alee&0doZ1hX5f@K8!7VIsiIO5l>kw^9q zHn4oO0?|c@mcj5wmdg|43cpet_4yqeDXy!}NfQQGCFW$_mE<29(#wYNVMR-`L8-M*R?IybrG!?W#@hMr>cIK;v$yroE9pO zQb@b&_g0DR5b>Ol)3;QB7*Z!!fHm55$>4o?0mIAa#TnN^Z4+p*Z2;Uht1Zl|*rlcp z*#kXKpFu2=Bw99Fbjr(aYg?9R+)-dM`5PD1$*T49gous*frnsg;r)s@MWtfmI zLR)mg$U70>i+&b0$x35T{UM;)svKX@sgC|~I+K)zgn>}uRF879K)QH&m zerHq)M13h6y8+U!+qDEVikv~<3&=R%X5!0?8Pdh3wg)x#a&7x`4gtX^6*R2&c;VKa zdpBUe%+Dn*I@#gZRu6GaTw`v!Ph~`Q>mZNj{9M<-TLMv3o;1QnF3xQ~hU$?D>B{%^W@(yzk{L|L0Xpdv| zo7BMZuK&hCS$9&8Q=k&QJn2XFqZ4C_F{9tZ_pA*9ss|A1QA~O0Ea!z|$`u|9Hl90A zf_}IdJ2twW*p(j@gm+O|S9qmIVXHw?XzpNJ8u})9!g?TIsZLUtlXhS;A<%(E zY#(@-7UEcnq3v4wM}m4=aI?u;!jjj zj{y#}T1&-pta-xV|iAe;L~Y6)e#{Eu7vJi zFp&<(a5>&})lUM*kvbD&+Um68btHJ+xIu++nsaXPh27xV+P^ex6D-Z=2D|j84i#mb 
zZqTb^l=ev7M^2nCI#zDVrGrbW^Yw$dAuZa1i+P=ciW>ier!oKjISql9Uj{#phN)iFh?Y%cJSFg*e! ze_#xuad%2365^;n%ib)9#*k;gMLNT6!;F~oSampPoUpRW3zm9o>yK^Uw+M8)QQpf~ zLl90Q=5qY}n>;0t&D3@xwW23aEg4XD{t5mE zIFdFeWVp`GC21lVFbUk!8BQgXZxA&Sxt09e7T87sIY!^xJ4Z>lKgUwc+ zvv}f5AT7*14C5&?^hI{^#16^|h%X|SZ94E?Jt|o4N`fE)3k9EnIjr$F?aeuFdtRS* z2gch7Ir;~5tV{mT2Uf=$grPU^K(9E{K0D$DB0Nh3xvFf$dgKw9DVCcnb*v0jbOE8K zBmerx!Lrv;`r$+b(UpLuo(-FL75{zf|9zd84^~VGP}Z}(tRacc#0uU*V>!tY!P(Cx zWBEf+U^92gvLpwO?>nK_+U8pY2!Yc#3DmCbir{6AQFn}1?V)Qes1CNplnsBm)HRD%vCy=<|CT!dVU1CsBx#;b6?#-0=_@dkZT^22Z^ zq?UCZwVZJt-)oW5ODgD}S~a=`*&wFL9_ZVv5w8RQYG~OcOhttjso=0=)M{-={bTL( z%)F1VdvGBRuSxy2p5QBL9%yo%P8xJ>7VOF;$)fiFCLzLX{1Yok!WC>sT9?P!dKf!DcSq0>CvDP}E}q#rZl(V#J)t_VWJyG`)-e(B#o1bQ{dxyI=%t~g%{cKO z>L=mbyw_5R|2<(Xlz||b{4QL4?5PTpgA^Erkks{2PHY3+$Li1a{yu@o@%_B`7kiBI zuresPtxqLPM#g9yq-VL~uIfvGsh?KLx_cfB-Rf+q%se7im@7L>85(tpFm8P{@GMtm zg_5nF^uPL^iS^Z5mq_w1A;1+-aKa2C?vw@dAsG2lk!HTklD}D`R$Pf3?U;Q16kd7) zv2|W#d+=%H5w#%R4&sH7RS9ML_Kl*=MmkM_CEb z@Y+575%-IiF+LL|GVjPCqWQ&&4!Vesv%QrmSL}?=s7Z>YrW{7DHeu)i{I2cDGU4*Y zpL~~3vie3HB$lmO81RrnZhG!+TepN}ke9`&R&N^zYcWDJ=}|=$2vHQfkjZPddu-OU zSp~gP7(vSAjg>i=Du-%GL&cs-ob$y0Vdng~-RC-&AWWY*&tHYgo(hr;&}5Z+BnI&Q zcW9#;(+J!?=nhWKZMdyhnJ2V<4U-vKM7uMKAC|M0vASbw5OR#wHgI)KJf1D2HMNie>v8Wv Wjonm%xU9sD!uKl)As^5BMSjuVmYSFV literal 2352 zcmV-03D5QoiY_vZjh&7QCFrhKcFBG@`zj6HxkUamBtL*$SOfIYLQAnP$$?HCW-UzE zqY3S}bS_tytBr;XZgqTWlqlC0A?TtDDzJS4<-4yF+82AKZYaOSzyy z)LIN&*Phn|s>u2rH)V_1hyj-xu@)mBOg%_tj5_Sz6kyK>B5Gj0bp;~khYB=Ul|&X? 
zUFSM`<{}P#4_#PMfT#y?P!&Q=azAz#tG@DOU=aLF%RTb9pTg+mwrTZ+`_vBO5^xdb zCk{k&n*k1|x?M-4M;q$_?J$Z=GMNDL*;ETHrT|OpFalF9aJ;1NN8;rz^YfzF2c#MtNZvI;NuIJQ-M<=GHh=X9{ian$nm(H@?nOf1bgG`&RpLSr<5g9xf z2teKs?kATag6a+LsF}ejFjmcfSCRZKh(1~}uiJ(Qc@Q;)ValsMLtF!2X$O%Cb z2KMdb?&ns7GPy+RSdg<1=+QLqzgq74x1J+)2!4_{d|gtTVv9I=qfT>YNLb!NjSeg= zF|Qh88XA3rHR)>wth;QO_M(&hfA8)$QEpGgANx7DK|J`dW)T_`Xz_E!NK^R8RZg$y zc5}UIuDBt}n1#0!5GPf8Jbgag71LqHsVxL^@1qNIX|Dy=0vXV0(4^j2t$?ktEZdd5 zu_ckdLNK1WUPlJaR4^MLsqCIlhr=wrO2O}*qt8Z*MskXFh93(O!7RnBrwEDnT<`it5D0Mb#*2bx#aqC@LEJC=x_>Rx<|ygktaBRpWD z4#{MIj?XI%F|f1Z!qi;RP!vt6Ble@nmfAd}TzlXws1BJ)f5{5gri+aezIomN6ImrH zx}$i#tM@W$hzh(j)Gt+D=6S|?h}()_-~|h%S3)QyM`7f{Yf{v>p$dbYb8XdaAwacm zYIgF03~bBRJ?Q|Rm{AoSq^LSBkDa|`3tNoi02mXu+-Du+k_EUwoHMFk922)^pS;_D6#vtq~4S z0+*&E9tblkhvce%@L*}odrsPg ze1D(imA!lhnI7E+EDFG9720>Y4#l_d;0oNsr)BvjIN8`WGnc1$a?%?ycY8#Jhm$-C3s{t9ZH!5Tdr>`t41 zT)!t07R`S+w73>s@5X;v4d{Zrz<~%E?>$ry4A?zF{TOsf3y|_$p=_p^7 zyHtMEaO`#lEy8g>>v{%h!1*z-W`(rGI}x7M3P7v}4?u6$pF9q$Z>h4+;M|XMMXn-` zt;L)h+N2X->u!;3$*+|@qIVFK-FHTOWzOKyOMLi?7uHQUumZzC>x@c?*cS{IeR9pz z%j|yMgIP(6EQpB4%%ANMRmAGv^MZ8l-{UC8Un6k3C~MltE7?VC^N!9xT725P)|Gtf z&Y(8ua0ZUJO(-Sc>1rq^R0ra;Wa5&>w$UCFV36KRm<$T^2(h&JMd-wYacGQvViWbN z;Sj}nB6rj56!|*PGf00&z+`c`4W3nX4V>s9=aCW8AGAn)EiROzk#ku76;QET`eHgm z(nw)$QzY5E$?_QwzB-{3OpF_c;7(A1@_v7pYaO5JgoY(y&*&O#VUKi8dkA)N#1BEo z^s5wOm{@=f>c|t#|7>EeQqHh!uRXjICpE`%G!Z+Zt<^J-#-9iG(VG#%Nv?sI+ zbc`m4USJyzcgu?tl;%C}Ez6G@|f#&^hF+`g-yrj{hmY4yhlk+b#gV44cV?S5r%;?ge?g z#lzI?kuY1oXLg&XxdkBG8g*9plC**(x1xRs!fCuZZfAb#o*pyTq1{n<-CM+4c6lHo zqhwh;eK)Jl1X}YUP)?=oto!8X%qgNi1g>n7$x+*H3lrxcs&2-MENP(#=M;+oe_zRD zmCP_qF1Fe;UFgs(|6U79ig}b`dz4{4Eh38)&RvnO=3V=+bB@oe8weiJM6CJ5c%GQ-iz&#q=Du>_LJKa?c5%>1J4;MeQNYk^_$~ z;|WA1#Nz81yr8Jafys`4PisrSy?Jw~yQrKw#cLkq4Jq8We*d_mk#2#X^w3p=gJB>* z#!GJ%sBPy+SR&x<$od^Zj0! 
zidEfbN|w72WG4PR*<}{0X+HTW38KvQlnKe|LO@K*{nS!xOGu^})|VMf4R={d{^$ZY Wc%~RC+CiWM`BrrE1b(~# diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt index 0b28cdedbd0d7..dc857c450331c 100644 --- a/ci/travis_gbq_config.txt +++ b/ci/travis_gbq_config.txt @@ -1,2 +1,2 @@ -TRAVIS_IV_ENV=encrypted_1d9d7b1f171b_iv -TRAVIS_KEY_ENV=encrypted_1d9d7b1f171b_key +TRAVIS_IV_ENV=encrypted_e05c934e101e_iv +TRAVIS_KEY_ENV=encrypted_e05c934e101e_key diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh index 9967d40e49f0a..fccf8e1e8deff 100755 --- a/ci/travis_process_gbq_encryption.sh +++ b/ci/travis_process_gbq_encryption.sh @@ -7,7 +7,7 @@ if [[ -n ${SERVICE_ACCOUNT_KEY} ]]; then elif [[ -n ${!TRAVIS_IV_ENV} ]]; then openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \ -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d; - export GBQ_PROJECT_ID='pandas-travis'; + export GBQ_PROJECT_ID='pandas-gbq-tests'; echo 'Successfully decrypted gbq credentials' fi diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 870d78ef1c533..df107259d38cd 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -148,7 +148,6 @@ def mock_read_gbq(sql, **kwargs): @pytest.mark.single -@pytest.mark.xfail(reason="skipping gbq integration for now, xref #34779") class TestToGBQIntegrationWithServiceAccountKeyPath: @pytest.fixture() def gbq_dataset(self): From 33517bc876b9f92e36b977c156fb7750abc6dba0 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 9 Jul 2020 19:41:55 -0400 Subject: [PATCH 0318/1025] TST: df.loc[:, 'col'] returning a view, but df.loc[df.index, 'col'] returning a copy (#34996) --- pandas/tests/indexing/test_loc.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 47980e88f76d4..30b13b6ea9fce 100644 --- a/pandas/tests/indexing/test_loc.py +++ 
b/pandas/tests/indexing/test_loc.py @@ -894,6 +894,22 @@ def test_identity_slice_returns_new_object(self): original_series[:3] = [7, 8, 9] assert all(sliced_series[:3] == [7, 8, 9]) + def test_loc_copy_vs_view(self): + # GH 15631 + x = DataFrame(zip(range(3), range(3)), columns=["a", "b"]) + + y = x.copy() + q = y.loc[:, "a"] + q += 2 + + tm.assert_frame_equal(x, y) + + z = x.copy() + q = z.loc[x.index, "a"] + q += 2 + + tm.assert_frame_equal(x, z) + def test_loc_uint64(self): # GH20722 # Test whether loc accept uint64 max value as index. From b738ff977448044d63a4a0003cbb8cc4b81a9ff9 Mon Sep 17 00:00:00 2001 From: Ayappan Date: Fri, 10 Jul 2020 05:24:08 +0530 Subject: [PATCH 0319/1025] No fastcall attribute in POWER platform (#35083) --- pandas/_libs/src/ujson/lib/ultrajson.h | 8 +------- pandas/_libs/src/ujson/lib/ultrajsondec.c | 22 +++++++++++----------- pandas/_libs/src/ujson/lib/ultrajsonenc.c | 4 ++-- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 69284e1c3f2ab..757cabdbbc730 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -94,7 +94,7 @@ typedef __int64 JSLONG; #define EXPORTFUNCTION __declspec(dllexport) #define FASTCALL_MSVC __fastcall -#define FASTCALL_ATTR + #define INLINE_PREFIX static __inline #else @@ -108,12 +108,6 @@ typedef uint32_t JSUINT32; #define FASTCALL_MSVC -#if !defined __x86_64__ && !defined __aarch64__ -#define FASTCALL_ATTR __attribute__((fastcall)) -#else -#define FASTCALL_ATTR -#endif - #define INLINE_PREFIX static inline typedef uint8_t JSUINT8; diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 36eb170f8048f..81327fd9efb06 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -68,7 +68,7 @@ struct DecoderState { JSONObjectDecoder *dec; }; -JSOBJ FASTCALL_MSVC decode_any(struct 
DecoderState *ds) FASTCALL_ATTR; +JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds); typedef JSOBJ (*PFN_DECODER)(struct DecoderState *ds); static JSOBJ SetError(struct DecoderState *ds, int offset, @@ -99,7 +99,7 @@ double createDouble(double intNeg, double intValue, double frcValue, return (intValue + (frcValue * g_pow10[frcDecimalCount])) * intNeg; } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { +JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { char *end; double value; errno = 0; @@ -114,7 +114,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decodePreciseFloat(struct DecoderState *ds) { return ds->dec->newDouble(ds->prv, value); } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { +JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { int intNeg = 1; int mantSize = 0; JSUINT64 intValue; @@ -340,7 +340,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { pow(10.0, expValue * expNeg)); } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) { +JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) { char *offset = ds->start; offset++; @@ -356,7 +356,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_true(struct DecoderState *ds) { return SetError(ds, -1, "Unexpected character found when decoding 'true'"); } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) { +JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) { char *offset = ds->start; offset++; @@ -373,7 +373,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_false(struct DecoderState *ds) { return SetError(ds, -1, "Unexpected character found when decoding 'false'"); } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) { +JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) { char *offset = ds->start; offset++; @@ -389,7 +389,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_null(struct DecoderState *ds) { return SetError(ds, -1, "Unexpected 
character found when decoding 'null'"); } -FASTCALL_ATTR void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) { +void FASTCALL_MSVC SkipWhitespace(struct DecoderState *ds) { char *offset; for (offset = ds->start; (ds->end - offset) > 0; offset++) { @@ -677,7 +677,7 @@ static const JSUINT8 g_decoderLookup[256] = { DS_UTFLENERROR, }; -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) { +JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) { JSUTF16 sur[2] = {0}; int iSur = 0; int index; @@ -957,7 +957,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string(struct DecoderState *ds) { } } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) { +JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) { JSOBJ itemValue; JSOBJ newObj; int len; @@ -1021,7 +1021,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_array(struct DecoderState *ds) { } } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) { +JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) { JSOBJ itemName; JSOBJ itemValue; JSOBJ newObj; @@ -1104,7 +1104,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_object(struct DecoderState *ds) { } } -FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { +JSOBJ FASTCALL_MSVC decode_any(struct DecoderState *ds) { for (;;) { switch (*ds->start) { case '\"': diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c index 51aa39a16920e..5343999c369f7 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c +++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c @@ -393,7 +393,7 @@ void Buffer_Realloc(JSONObjectEncoder *enc, size_t cbNeeded) { enc->end = enc->start + newSize; } -FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC +INLINE_PREFIX void FASTCALL_MSVC Buffer_AppendShortHexUnchecked(char *outputOffset, unsigned short value) { *(outputOffset++) = g_hexChars[(value & 0xf000) >> 12]; *(outputOffset++) = g_hexChars[(value & 0x0f00) >> 8]; @@ -722,7 
+722,7 @@ int Buffer_EscapeStringValidated(JSOBJ obj, JSONObjectEncoder *enc, #define Buffer_AppendCharUnchecked(__enc, __chr) *((__enc)->offset++) = __chr; -FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, +INLINE_PREFIX void FASTCALL_MSVC strreverse(char *begin, char *end) { char aux; while (end > begin) aux = *end, *end-- = *begin, *begin++ = aux; From 62550ac2cc79c0c54ecca7fea90f97a96d972547 Mon Sep 17 00:00:00 2001 From: Vishwam Pandya Date: Thu, 9 Jul 2020 20:08:26 -0400 Subject: [PATCH 0320/1025] TST: category isin on frame (#34363) --- pandas/tests/frame/methods/test_isin.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index 79ea70a38f145..35d45bd00131b 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -189,3 +189,18 @@ def test_isin_empty_datetimelike(self): tm.assert_frame_equal(result, expected) result = df1_td.isin(df3) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + pd.DataFrame({"a": [1, 2, 3]}, dtype="category"), + pd.Series([1, 2, 3], dtype="category"), + ], + ) + def test_isin_category_frame(self, values): + # GH#34256 + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + expected = DataFrame({"a": [True, True, True], "b": [False, False, False]}) + + result = df.isin(values) + tm.assert_frame_equal(result, expected) From 7533354cf7aa896b12563941fa077783947d951a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Jul 2020 05:13:19 -0700 Subject: [PATCH 0321/1025] PERF: MonthOffset.apply_index (#35195) --- pandas/_libs/tslibs/offsets.pyx | 27 ++++++++++---------- pandas/tests/tseries/offsets/test_offsets.py | 6 ----- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4429ff083f350..b0c6648514e99 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ 
b/pandas/_libs/tslibs/offsets.pyx @@ -558,7 +558,7 @@ cdef class BaseOffset: def _get_offset_day(self, other: datetime) -> int: # subclass must implement `_day_opt`; calling from the base class - # will raise NotImplementedError. + # will implicitly assume day_opt = "business_end", see get_day_of_month. cdef: npy_datetimestruct dts pydate_to_dtstruct(other, &dts) @@ -3611,7 +3611,6 @@ def shift_months(const int64_t[:] dtindex, int months, object day_opt=None): out[i] = dtstruct_to_dt64(&dts) elif day_opt in ["start", "end", "business_start", "business_end"]: _shift_months(dtindex, out, count, months, day_opt) - else: raise ValueError("day must be None, 'start', 'end', " "'business_start', or 'business_end'") @@ -3801,7 +3800,7 @@ def shift_month(stamp: datetime, months: int, day_opt: object=None) -> datetime: return stamp.replace(year=year, month=month, day=day) -cdef inline int get_day_of_month(npy_datetimestruct* dts, day_opt) nogil except? -1: +cdef inline int get_day_of_month(npy_datetimestruct* dts, str day_opt) nogil: """ Find the day in `other`'s month that satisfies a DateOffset's is_on_offset policy, as described by the `day_opt` argument. @@ -3827,27 +3826,23 @@ cdef inline int get_day_of_month(npy_datetimestruct* dts, day_opt) nogil except? >>> get_day_of_month(other, 'end') 30 + Notes + ----- + Caller is responsible for ensuring one of the four accepted day_opt values + is passed. """ - cdef: - int days_in_month if day_opt == "start": return 1 elif day_opt == "end": - days_in_month = get_days_in_month(dts.year, dts.month) - return days_in_month + return get_days_in_month(dts.year, dts.month) elif day_opt == "business_start": # first business day of month return get_firstbday(dts.year, dts.month) - elif day_opt == "business_end": + else: + # i.e. 
day_opt == "business_end": # last business day of month return get_lastbday(dts.year, dts.month) - elif day_opt is not None: - raise ValueError(day_opt) - elif day_opt is None: - # Note: unlike `shift_month`, get_day_of_month does not - # allow day_opt = None - raise NotImplementedError cpdef int roll_convention(int other, int n, int compare) nogil: @@ -3901,6 +3896,10 @@ def roll_qtrday(other: datetime, n: int, month: int, cdef: int months_since npy_datetimestruct dts + + if day_opt not in ["start", "end", "business_start", "business_end"]: + raise ValueError(day_opt) + pydate_to_dtstruct(other, &dts) if modby == 12: diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 784c04f225630..cffaa7b43d0cf 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -4310,12 +4310,6 @@ def test_all_offset_classes(self, tup): # --------------------------------------------------------------------- -def test_get_offset_day_error(): - # subclass of _BaseOffset must override _day_opt attribute, or we should - # get a NotImplementedError - - with pytest.raises(NotImplementedError): - DateOffset()._get_offset_day(datetime.now()) def test_valid_default_arguments(offset_types): From fe565eb87a4462b359b6eaf0bc9b7f1c04c4018b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 10 Jul 2020 14:14:10 +0200 Subject: [PATCH 0322/1025] BUG: fix IntegerArray astype with copy=True/False (#34931) * BUG: fix IntegerArray astype with copy=True/False * fix mypy * return self for same dtype and copy=False * whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/arrays/integer.py | 20 +++++++----- pandas/core/arrays/masked.py | 11 +++++++ pandas/tests/arrays/integer/test_dtypes.py | 38 ++++++++++++++++++++++ 4 files changed, 62 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5473b7c1523f3..5dff6d729479a 
100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1143,6 +1143,7 @@ ExtensionArray - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) - Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`) - Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) +- Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`). Other ^^^^^ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 7be7ef3637ee5..b5cb681812939 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -448,18 +448,22 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if incompatible type with an IntegerDtype, equivalent of same_kind casting """ - from pandas.core.arrays.boolean import BooleanDtype + from pandas.core.arrays.masked import BaseMaskedDtype from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) - # if we are astyping to an existing IntegerDtype we can fastpath - if isinstance(dtype, _IntegerDtype): - result = self._data.astype(dtype.numpy_dtype, copy=False) - return dtype.construct_array_type()(result, mask=self._mask, copy=False) - elif isinstance(dtype, BooleanDtype): - result = self._data.astype("bool", copy=False) - return dtype.construct_array_type()(result, mask=self._mask, copy=False) + # if the dtype is exactly the same, we can fastpath + if self.dtype == dtype: + # return the same object for copy=False + return self.copy() if copy else self + # if we are astyping to another nullable masked dtype, we can fastpath + if isinstance(dtype, BaseMaskedDtype): + data = self._data.astype(dtype.numpy_dtype, copy=copy) + # mask is copied depending on whether the data was copied, and + # not directly depending on the `copy` keyword + mask = self._mask if data is self._data else 
self._mask.copy() + return dtype.construct_array_type()(data, mask, copy=False) elif isinstance(dtype, StringDtype): return dtype.construct_array_type()._from_sequence(self, copy=False) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 28add129825d1..235840d6d201e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -40,6 +40,17 @@ class BaseMaskedDtype(ExtensionDtype): def numpy_dtype(self) -> np.dtype: raise AbstractMethodError + @classmethod + def construct_array_type(cls) -> Type["BaseMaskedArray"]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + raise NotImplementedError + class BaseMaskedArray(ExtensionArray, ExtensionOpsMixin): """ diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index cafe9e47a18f4..67efa4cb2ce4a 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -144,6 +144,44 @@ def test_astype(all_data): tm.assert_series_equal(result, expected) +def test_astype_copy(): + arr = pd.array([1, 2, 3, None], dtype="Int64") + orig = pd.array([1, 2, 3, None], dtype="Int64") + + # copy=True -> ensure both data and mask are actual copies + result = arr.astype("Int64", copy=True) + assert result is not arr + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + # copy=False + result = arr.astype("Int64", copy=False) + assert result is arr + assert np.shares_memory(result._data, arr._data) + assert np.shares_memory(result._mask, arr._mask) + result[0] = 10 + assert arr[0] == 10 + result[0] = pd.NA + assert arr[0] is pd.NA + + # astype to different dtype -> always needs a copy -> even with copy=False + # we need to ensure that also the mask is actually copied + arr = 
pd.array([1, 2, 3, None], dtype="Int64") + orig = pd.array([1, 2, 3, None], dtype="Int64") + + result = arr.astype("Int32", copy=False) + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + def test_astype_to_larger_numpy(): a = pd.array([1, 2], dtype="Int32") result = a.astype("int64") From b26ff6af9964eb20b9a7422d7e3d526f0428cdf0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Jul 2020 05:15:21 -0700 Subject: [PATCH 0323/1025] CLN: tighten types to get_rule_month (#35205) --- pandas/_libs/tslibs/parsing.pxd | 2 +- pandas/_libs/tslibs/parsing.pyx | 14 ++++++-------- pandas/_libs/tslibs/period.pyx | 9 +++++---- pandas/core/arrays/period.py | 3 ++- pandas/tests/tslibs/test_libfrequencies.py | 12 ++++++------ 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pxd b/pandas/_libs/tslibs/parsing.pxd index 6e826cd4c6602..9c9262beaafad 100644 --- a/pandas/_libs/tslibs/parsing.pxd +++ b/pandas/_libs/tslibs/parsing.pxd @@ -1,2 +1,2 @@ -cpdef str get_rule_month(object source, str default=*) +cpdef str get_rule_month(str source) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 3a1af9fdb1e8f..92654f3b587e5 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -284,7 +284,7 @@ def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None): cdef parse_datetime_string_with_reso( - str date_string, object freq=None, bint dayfirst=False, bint yearfirst=False, + str date_string, str freq=None, bint dayfirst=False, bint yearfirst=False, ): """ Parse datetime string and try to identify its resolution. 
@@ -438,6 +438,7 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default, if freq is not None: # TODO: hack attack, #1228 + freq = getattr(freq, "freqstr", freq) try: mnum = c_MONTH_NUMBERS[get_rule_month(freq)] + 1 except (KeyError, ValueError): @@ -1020,15 +1021,14 @@ def concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True): return result -# TODO: `default` never used? -cpdef str get_rule_month(object source, str default="DEC"): +cpdef str get_rule_month(str source): """ Return starting month of given freq, default is December. Parameters ---------- - source : object - default : str, default "DEC" + source : str + Derived from `freq.rule_code` or `freq.freqstr`. Returns ------- @@ -1042,10 +1042,8 @@ cpdef str get_rule_month(object source, str default="DEC"): >>> get_rule_month('A-JAN') 'JAN' """ - if is_offset_object(source): - source = source.freqstr source = source.upper() if "-" not in source: - return default + return "DEC" else: return source.split("-")[1] diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e992b20b12db2..20961c6da56bd 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -2440,13 +2440,13 @@ cdef int64_t _ordinal_from_fields(int year, int month, quarter, int day, BaseOffset freq): base = freq_to_dtype_code(freq) if quarter is not None: - year, month = quarter_to_myear(year, quarter, freq) + year, month = quarter_to_myear(year, quarter, freq.freqstr) return period_ordinal(year, month, day, hour, minute, second, 0, 0, base) -def quarter_to_myear(year: int, quarter: int, freq): +def quarter_to_myear(year: int, quarter: int, freqstr: str): """ A quarterly frequency defines a "year" which may not coincide with the calendar-year. 
Find the calendar-year and calendar-month associated @@ -2456,7 +2456,8 @@ def quarter_to_myear(year: int, quarter: int, freq): ---------- year : int quarter : int - freq : DateOffset + freqstr : str + Equivalent to freq.freqstr Returns ------- @@ -2470,7 +2471,7 @@ def quarter_to_myear(year: int, quarter: int, freq): if quarter <= 0 or quarter > 4: raise ValueError('Quarter must be 1 <= q <= 4') - mnum = c_MONTH_NUMBERS[get_rule_month(freq)] + 1 + mnum = c_MONTH_NUMBERS[get_rule_month(freqstr)] + 1 month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: year -= 1 diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index b336371655466..8d5cb12d60e4d 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -1034,9 +1034,10 @@ def _range_from_fields( if base != FreqGroup.FR_QTR: raise AssertionError("base must equal FR_QTR") + freqstr = freq.freqstr year, quarter = _make_field_arrays(year, quarter) for y, q in zip(year, quarter): - y, m = libperiod.quarter_to_myear(y, q, freq) + y, m = libperiod.quarter_to_myear(y, q, freqstr) val = libperiod.period_ordinal(y, m, 1, 1, 1, 1, 0, 0, base) ordinals.append(val) else: diff --git a/pandas/tests/tslibs/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py index 993f2f4c8ef10..83f28f6b5dc01 100644 --- a/pandas/tests/tslibs/test_libfrequencies.py +++ b/pandas/tests/tslibs/test_libfrequencies.py @@ -9,19 +9,19 @@ "obj,expected", [ ("W", "DEC"), - (offsets.Week(), "DEC"), + (offsets.Week().freqstr, "DEC"), ("D", "DEC"), - (offsets.Day(), "DEC"), + (offsets.Day().freqstr, "DEC"), ("Q", "DEC"), - (offsets.QuarterEnd(startingMonth=12), "DEC"), + (offsets.QuarterEnd(startingMonth=12).freqstr, "DEC"), ("Q-JAN", "JAN"), - (offsets.QuarterEnd(startingMonth=1), "JAN"), + (offsets.QuarterEnd(startingMonth=1).freqstr, "JAN"), ("A-DEC", "DEC"), ("Y-DEC", "DEC"), - (offsets.YearEnd(), "DEC"), + (offsets.YearEnd().freqstr, "DEC"), ("A-MAY", "MAY"), ("Y-MAY", "MAY"), - 
(offsets.YearEnd(month=5), "MAY"), + (offsets.YearEnd(month=5).freqstr, "MAY"), ], ) def test_get_rule_month(obj, expected): From 60f6e8cc6f6c8d3ca1c4a73d9a23aa59c08ddfc4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Jul 2020 05:16:27 -0700 Subject: [PATCH 0324/1025] REF: remove libresolution (#35198) --- pandas/_libs/tslibs/__init__.py | 2 +- pandas/_libs/tslibs/fields.pyx | 40 ++++++++++++++++++++++ pandas/_libs/tslibs/resolution.pyx | 53 ------------------------------ pandas/core/arrays/datetimes.py | 4 +-- pandas/tests/tslibs/test_api.py | 1 - pandas/tseries/frequencies.py | 3 +- setup.py | 5 --- 7 files changed, 44 insertions(+), 64 deletions(-) delete mode 100644 pandas/_libs/tslibs/resolution.pyx diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 6fe6fa0a13c34..0ae4cc97d07e3 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -27,11 +27,11 @@ from . import dtypes from .conversion import localize_pydatetime +from .dtypes import Resolution from .nattype import NaT, NaTType, iNaT, is_null_datetimelike, nat_strings from .np_datetime import OutOfBoundsDatetime from .offsets import BaseOffset, Tick, to_offset from .period import IncompatibleFrequency, Period -from .resolution import Resolution from .timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta from .timestamps import Timestamp from .tzconversion import tz_convert_from_utc_single diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 03e4188fd06ef..1d1f900bc18b3 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -73,6 +73,46 @@ def build_field_sarray(const int64_t[:] dtindex): return out +def month_position_check(fields, weekdays): + cdef: + int32_t daysinmonth, y, m, d + bint calendar_end = True + bint business_end = True + bint calendar_start = True + bint business_start = True + bint cal + int32_t[:] years = fields["Y"] + int32_t[:] months = 
fields["M"] + int32_t[:] days = fields["D"] + + for y, m, d, wd in zip(years, months, days, weekdays): + if calendar_start: + calendar_start &= d == 1 + if business_start: + business_start &= d == 1 or (d <= 3 and wd == 0) + + if calendar_end or business_end: + daysinmonth = get_days_in_month(y, m) + cal = d == daysinmonth + if calendar_end: + calendar_end &= cal + if business_end: + business_end &= cal or (daysinmonth - d < 3 and wd == 4) + elif not calendar_start and not business_start: + break + + if calendar_end: + return "ce" + elif business_end: + return "be" + elif calendar_start: + return "cs" + elif business_start: + return "bs" + else: + return None + + @cython.wraparound(False) @cython.boundscheck(False) def get_date_name_field(const int64_t[:] dtindex, str field, object locale=None): diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx deleted file mode 100644 index d2861d8e9fe8d..0000000000000 --- a/pandas/_libs/tslibs/resolution.pyx +++ /dev/null @@ -1,53 +0,0 @@ - -import numpy as np -from numpy cimport int32_t - -from pandas._libs.tslibs.dtypes import Resolution -from pandas._libs.tslibs.ccalendar cimport get_days_in_month - - -# ---------------------------------------------------------------------- -# Frequency Inference - -def month_position_check(fields, weekdays): - cdef: - int32_t daysinmonth, y, m, d - bint calendar_end = True - bint business_end = True - bint calendar_start = True - bint business_start = True - bint cal - int32_t[:] years - int32_t[:] months - int32_t[:] days - - years = fields['Y'] - months = fields['M'] - days = fields['D'] - - for y, m, d, wd in zip(years, months, days, weekdays): - if calendar_start: - calendar_start &= d == 1 - if business_start: - business_start &= d == 1 or (d <= 3 and wd == 0) - - if calendar_end or business_end: - daysinmonth = get_days_in_month(y, m) - cal = d == daysinmonth - if calendar_end: - calendar_end &= cal - if business_end: - business_end &= cal or 
(daysinmonth - d < 3 and wd == 4) - elif not calendar_start and not business_start: - break - - if calendar_end: - return 'ce' - elif business_end: - return 'be' - elif calendar_start: - return 'cs' - elif business_start: - return 'bs' - else: - return None diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 5038df85c9160..7058ed3682d59 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -7,6 +7,7 @@ from pandas._libs import lib, tslib from pandas._libs.tslibs import ( NaT, + Resolution, Timestamp, conversion, fields, @@ -15,7 +16,6 @@ ints_to_pydatetime, is_date_array_normalized, normalize_i8_timestamps, - resolution as libresolution, timezones, to_offset, tzconversion, @@ -533,7 +533,7 @@ def is_normalized(self): return is_date_array_normalized(self.asi8, self.tz) @property # NB: override with cache_readonly in immutable subclasses - def _resolution_obj(self) -> libresolution.Resolution: + def _resolution_obj(self) -> Resolution: return get_resolution(self.asi8, self.tz) # ---------------------------------------------------------------- diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index ccaceb7e6f906..036037032031a 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -16,7 +16,6 @@ def test_namespace(): "offsets", "parsing", "period", - "resolution", "strptime", "vectorized", "timedeltas", diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 23e08c7550646..f80ff1a53cd69 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -12,7 +12,7 @@ MONTHS, int_to_weekday, ) -from pandas._libs.tslibs.fields import build_field_sarray +from pandas._libs.tslibs.fields import build_field_sarray, month_position_check from pandas._libs.tslibs.offsets import ( # noqa:F401 DateOffset, Day, @@ -20,7 +20,6 @@ to_offset, ) from pandas._libs.tslibs.parsing import get_rule_month -from 
pandas._libs.tslibs.resolution import month_position_check from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import ( diff --git a/setup.py b/setup.py index 1885546e001fe..aebbdbf4d1e96 100755 --- a/setup.py +++ b/setup.py @@ -319,7 +319,6 @@ class CheckSDist(sdist_class): "pandas/_libs/tslibs/conversion.pyx", "pandas/_libs/tslibs/fields.pyx", "pandas/_libs/tslibs/offsets.pyx", - "pandas/_libs/tslibs/resolution.pyx", "pandas/_libs/tslibs/parsing.pyx", "pandas/_libs/tslibs/tzconversion.pyx", "pandas/_libs/tslibs/vectorized.pyx", @@ -639,10 +638,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "depends": tseries_depends, "sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"], }, - "_libs.tslibs.resolution": { - "pyxfile": "_libs/tslibs/resolution", - "depends": tseries_depends, - }, "_libs.tslibs.strptime": { "pyxfile": "_libs/tslibs/strptime", "depends": tseries_depends, From 1a5a1d2ed35e3d4d77ed32d095a8b375747e20e4 Mon Sep 17 00:00:00 2001 From: rjfs Date: Fri, 10 Jul 2020 14:53:59 +0100 Subject: [PATCH 0325/1025] BUG: fix read_excel error for header=None and index_col as list #31783 (#35035) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/excel/_base.py | 4 +++- pandas/tests/io/excel/test_readers.py | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5dff6d729479a..798a3d838ef7e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1053,6 +1053,7 @@ I/O - Bug in :meth:`~HDFStore.create_table` now raises an error when `column` argument was not specified in `data_columns` on input (:issue:`28156`) - :meth:`read_json` now could read line-delimited json file from a file url while `lines` and `chunksize` are set. 
- Bug in :meth:`DataFrame.to_sql` when reading DataFrames with ``-np.inf`` entries with MySQL now has a more explicit ``ValueError`` (:issue:`34431`) +- Bug in :meth:`read_excel` that was raising a ``TypeError`` when ``header=None`` and ``index_col`` given as list (:issue:`31783`) - Bug in "meth"`read_excel` where datetime values are used in the header in a `MultiIndex` (:issue:`34748`) Plotting diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 4fa4f158e9c3c..2a12f779230b2 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -468,7 +468,9 @@ def parse( if is_list_like(index_col): # Forward fill values for MultiIndex index. - if not is_list_like(header): + if header is None: + offset = 0 + elif not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index ddc631532194a..b610c5ec3a838 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -968,6 +968,19 @@ def test_deprecated_kwargs(self, read_ext): pd.read_excel("test1" + read_ext) + def test_no_header_with_list_index_col(self, read_ext): + # GH 31783 + file_name = "testmultiindex" + read_ext + data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)] + idx = pd.MultiIndex.from_tuples( + [("A", "A"), ("key", "val"), (1, 2), (1, 2)], names=(0, 1) + ) + expected = pd.DataFrame(data, index=idx, columns=(2, 3)) + result = pd.read_excel( + file_name, sheet_name="index_col_none", index_col=[0, 1], header=None + ) + tm.assert_frame_equal(expected, result) + class TestExcelFileRead: @pytest.fixture(autouse=True) From e7758e5484b7b34c092f034ea942f4ca71733509 Mon Sep 17 00:00:00 2001 From: Jon Thielen Date: Fri, 10 Jul 2020 09:55:38 -0500 Subject: [PATCH 0326/1025] Add xarray copyright notice to comply with reuse under Apache License (#35213) --- LICENSES/XARRAY_LICENSE | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/LICENSES/XARRAY_LICENSE b/LICENSES/XARRAY_LICENSE index 37ec93a14fdcd..6bafeb9d3d80e 100644 --- a/LICENSES/XARRAY_LICENSE +++ b/LICENSES/XARRAY_LICENSE @@ -1,3 +1,7 @@ +Copyright 2014-2019, xarray Developers + +-------------------------------------------------------------------------------- + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ From 7926003c2120bad2510bb7c339fd00fa695b98cc Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Fri, 10 Jul 2020 21:16:27 +0200 Subject: [PATCH 0327/1025] ENH: Implement Keyword Aggregation for DataFrame.agg and Series.agg (#29116) --- pandas/core/aggregation.py | 162 +++++++++++++++++- pandas/core/frame.py | 12 +- pandas/core/groupby/generic.py | 22 +-- pandas/core/series.py | 7 +- pandas/tests/frame/apply/__init__.py | 0 .../frame/apply/test_apply_relabeling.py | 104 +++++++++++ .../test_frame_apply.py} | 0 pandas/tests/series/apply/__init__.py | 0 .../series/apply/test_apply_relabeling.py | 33 ++++ .../test_series_apply.py} | 0 10 files changed, 317 insertions(+), 23 deletions(-) create mode 100644 pandas/tests/frame/apply/__init__.py create mode 100644 pandas/tests/frame/apply/test_apply_relabeling.py rename pandas/tests/frame/{test_apply.py => apply/test_frame_apply.py} (100%) create mode 100644 pandas/tests/series/apply/__init__.py create mode 100644 pandas/tests/series/apply/test_apply_relabeling.py rename pandas/tests/series/{test_apply.py => apply/test_series_apply.py} (100%) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 838722f60b380..16c4a9f862d79 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -5,12 +5,99 @@ from collections import defaultdict from functools import partial -from typing import Any, Callable, DefaultDict, List, Sequence, Tuple, Union +from typing import ( + Any, + Callable, + DefaultDict, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) + +from pandas._typing import Label from pandas.core.dtypes.common import 
is_dict_like, is_list_like +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index +from pandas.core.series import FrameOrSeriesUnion, Series + +# types of `func` kwarg for DataFrame.aggregate and Series.aggregate +AggFuncTypeBase = Union[Callable, str] +AggFuncType = Union[ + AggFuncTypeBase, + List[AggFuncTypeBase], + Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], +] + + +def reconstruct_func( + func: Optional[AggFuncType], **kwargs, +) -> Tuple[ + bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]], +]: + """ + This is the internal function to reconstruct func given if there is relabeling + or not and also normalize the keyword to get new order of columns. + + If named aggregation is applied, `func` will be None, and kwargs contains the + column and aggregation function information to be parsed; + If named aggregation is not applied, `func` is either string (e.g. 'min') or + Callable, or list of them (e.g. ['min', np.max]), or the dictionary of column name + and str/Callable/list of them (e.g. {'A': 'min'}, or {'A': [np.min, lambda x: x]}) + + If relabeling is True, will return relabeling, reconstructed func, column + names, and the reconstructed order of columns. + If relabeling is False, the columns and order will be None. + + Parameters + ---------- + func: agg function (e.g. 'min' or Callable) or list of agg functions + (e.g. ['min', np.max]) or dictionary (e.g. {'A': ['min', np.max]}). 
+ **kwargs: dict, kwargs used in is_multi_agg_with_relabel and + normalize_keyword_aggregation function for relabelling + + Returns + ------- + relabelling: bool, if there is relabelling or not + func: normalized and mangled func + columns: list of column names + order: list of columns indices + + Examples + -------- + >>> reconstruct_func(None, **{"foo": ("col", "min")}) + (True, defaultdict(None, {'col': ['min']}), ('foo',), array([0])) + + >>> reconstruct_func("min") + (False, 'min', None, None) + """ + relabeling = func is None and is_multi_agg_with_relabel(**kwargs) + columns: Optional[List[str]] = None + order: Optional[List[int]] = None + + if not relabeling: + if isinstance(func, list) and len(func) > len(set(func)): + + # GH 28426 will raise error if duplicated function names are used and + # there is no reassigned name + raise SpecificationError( + "Function names must be unique if there is no new column names " + "assigned" + ) + elif func is None: + # nicer error message + raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") + + if relabeling: + func, columns, order = normalize_keyword_aggregation(kwargs) + func = maybe_mangle_lambdas(func) + + return relabeling, func, columns, order def is_multi_agg_with_relabel(**kwargs) -> bool: @@ -198,6 +285,79 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: return mangled_aggspec +def relabel_result( + result: FrameOrSeriesUnion, + func: Dict[str, List[Union[Callable, str]]], + columns: Tuple, + order: List[int], +) -> Dict[Label, Series]: + """Internal function to reorder result if relabelling is True for + dataframe.agg, and return the reordered result in dict. + + Parameters: + ---------- + result: Result from aggregation + func: Dict of (column name, funcs) + columns: New columns name for relabelling + order: New order for relabelling + + Examples: + --------- + >>> result = DataFrame({"A": [np.nan, 2, np.nan], + ...
"C": [6, np.nan, np.nan], "B": [np.nan, 4, 2.5]}) # doctest: +SKIP + >>> funcs = {"A": ["max"], "C": ["max"], "B": ["mean", "min"]} + >>> columns = ("foo", "aab", "bar", "dat") + >>> order = [0, 1, 2, 3] + >>> _relabel_result(result, func, columns, order) # doctest: +SKIP + dict(A=Series([2.0, NaN, NaN, NaN], index=["foo", "aab", "bar", "dat"]), + C=Series([NaN, 6.0, NaN, NaN], index=["foo", "aab", "bar", "dat"]), + B=Series([NaN, NaN, 2.5, 4.0], index=["foo", "aab", "bar", "dat"])) + """ + reordered_indexes = [ + pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) + ] + reordered_result_in_dict: Dict[Label, Series] = {} + idx = 0 + + reorder_mask = not isinstance(result, Series) and len(result.columns) > 1 + for col, fun in func.items(): + s = result[col].dropna() + + # In the `_aggregate`, the callable names are obtained and used in `result`, and + # these names are ordered alphabetically. e.g. + # C2 C1 + # <lambda> 1 NaN + # amax NaN 4.0 + # max NaN 4.0 + # sum 18.0 6.0 + # Therefore, the order of functions for each column could be shuffled + # accordingly so need to get the callable name if it is not parsed names, and + # reorder the aggregated result for each column. + # e.g. if df.agg(c1=("C2", sum), c2=("C2", lambda x: min(x))), correct order is + # [sum, <lambda>], but in `result`, it will be [<lambda>, sum], and we need to + # reorder so that aggregated values map to their functions regarding the order. + + # However there is only one column being used for aggregation, not need to + # reorder since the index is not sorted, and keep as is in `funcs`, e.g. + # A + # min 1.0 + # mean 1.5 + # mean 1.5 + if reorder_mask: + fun = [ + com.get_callable_name(f) if not isinstance(f, str) else f for f in fun + ] + col_idx_order = Index(s.index).get_indexer(fun) + s = s[col_idx_order] + + # assign the new user-provided "named aggregation" as index names, and reindex + # it based on the whole user-provided names.
+ s.index = reordered_indexes[idx : idx + len(fun)] + reordered_result_in_dict[col] = s.reindex(columns, copy=False) + idx = idx + len(fun) + return reordered_result_in_dict + + def validate_func_kwargs( kwargs: dict, ) -> Tuple[List[str], List[Union[str, Callable[..., Any]]]]: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3d2200cb45c6e..10539ab74b4aa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -114,6 +114,7 @@ from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor +from pandas.core.aggregation import reconstruct_func, relabel_result from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor @@ -7301,9 +7302,11 @@ def _gotitem( examples=_agg_examples_doc, versionadded="\n.. versionadded:: 0.20.0\n", ) - def aggregate(self, func, axis=0, *args, **kwargs): + def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) + relabeling, func, columns, order = reconstruct_func(func, **kwargs) + result = None try: result, how = self._aggregate(func, axis=axis, *args, **kwargs) @@ -7315,6 +7318,13 @@ def aggregate(self, func, axis=0, *args, **kwargs): raise exc from err if result is None: return self.apply(func, axis=axis, args=args, **kwargs) + + if relabeling: + # This is to keep the order to columns occurrence unchanged, and also + # keep the order of new columns occurrence unchanged + result_in_dict = relabel_result(result, func, columns, order) + result = DataFrame(result_in_dict, index=columns) + return result def _aggregate(self, arg, axis=0, *args, **kwargs): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ebb9d82766c1b..7f2eac520264d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -55,9 +55,8 @@ from pandas.core.dtypes.missing 
import isna, notna from pandas.core.aggregation import ( - is_multi_agg_with_relabel, maybe_mangle_lambdas, - normalize_keyword_aggregation, + reconstruct_func, validate_func_kwargs, ) import pandas.core.algorithms as algorithms @@ -937,24 +936,7 @@ def aggregate( self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs ): - relabeling = func is None and is_multi_agg_with_relabel(**kwargs) - if relabeling: - func, columns, order = normalize_keyword_aggregation(kwargs) - - kwargs = {} - elif isinstance(func, list) and len(func) > len(set(func)): - - # GH 28426 will raise error if duplicated function names are used and - # there is no reassigned name - raise SpecificationError( - "Function names must be unique if there is no new column " - "names assigned" - ) - elif func is None: - # nicer error message - raise TypeError("Must provide 'func' or tuples of '(column, aggfunc).") - - func = maybe_mangle_lambdas(func) + relabeling, func, columns, order = reconstruct_func(func, **kwargs) if engine == "numba": return self._python_agg_general( diff --git a/pandas/core/series.py b/pandas/core/series.py index 6c1d21e4526cf..9a633079b8c1d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4016,9 +4016,14 @@ def _gotitem(self, key, ndim, subset=None) -> "Series": examples=_agg_examples_doc, versionadded="\n.. 
versionadded:: 0.20.0\n", ) - def aggregate(self, func, axis=0, *args, **kwargs): + def aggregate(self, func=None, axis=0, *args, **kwargs): # Validate the axis parameter self._get_axis_number(axis) + + # if func is None, will switch to user-provided "named aggregation" kwargs + if func is None: + func = dict(kwargs.items()) + result, how = self._aggregate(func, *args, **kwargs) if result is None: diff --git a/pandas/tests/frame/apply/__init__.py b/pandas/tests/frame/apply/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/frame/apply/test_apply_relabeling.py b/pandas/tests/frame/apply/test_apply_relabeling.py new file mode 100644 index 0000000000000..965f69753bdc7 --- /dev/null +++ b/pandas/tests/frame/apply/test_apply_relabeling.py @@ -0,0 +1,104 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +class TestDataFrameNamedAggregate: + def test_agg_relabel(self): + # GH 26513 + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + + # simplest case with one column, one func + result = df.agg(foo=("B", "sum")) + expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"])) + tm.assert_frame_equal(result, expected) + + # test on same column with different methods + result = df.agg(foo=("B", "sum"), bar=("B", "min")) + expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"])) + + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_multi_columns_multi_methods(self): + # GH 26513, test on multiple columns with multiple methods + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg( + foo=("A", "sum"), + bar=("B", "mean"), + cat=("A", "min"), + dat=("B", "max"), + f=("A", "max"), + g=("C", "min"), + ) + expected = pd.DataFrame( + { + "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan], + "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan], + "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0], + }, + 
index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_relabel_partial_functions(self): + # GH 26513, test on partial, functools or more complex cases + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]}) + result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min)) + expected = pd.DataFrame( + {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=("A", min), + bar=("A", np.min), + cat=("B", max), + dat=("C", "min"), + f=("B", np.sum), + kk=("B", lambda x: min(x)), + ) + expected = pd.DataFrame( + { + "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan], + "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0], + "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan], + }, + index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_namedtuple(self): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + result = df.agg( + foo=pd.NamedAgg("B", "sum"), + bar=pd.NamedAgg("B", min), + cat=pd.NamedAgg(column="B", aggfunc="count"), + fft=pd.NamedAgg("B", aggfunc="max"), + ) + + expected = pd.DataFrame( + {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"]) + ) + tm.assert_frame_equal(result, expected) + + result = df.agg( + foo=pd.NamedAgg("A", "min"), + bar=pd.NamedAgg(column="B", aggfunc="max"), + cat=pd.NamedAgg(column="A", aggfunc="max"), + ) + expected = pd.DataFrame( + {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]}, + index=pd.Index(["foo", "bar", "cat"]), + ) + tm.assert_frame_equal(result, expected) + + def test_agg_raises(self): + # GH 26513 + df = pd.DataFrame({"A": [0, 1], "B": [1, 2]}) + msg = "Must provide" + + with pytest.raises(TypeError, match=msg): + df.agg() diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/apply/test_frame_apply.py similarity index 100% rename from 
pandas/tests/frame/test_apply.py rename to pandas/tests/frame/apply/test_frame_apply.py diff --git a/pandas/tests/series/apply/__init__.py b/pandas/tests/series/apply/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/series/apply/test_apply_relabeling.py b/pandas/tests/series/apply/test_apply_relabeling.py new file mode 100644 index 0000000000000..0b8d2c4e1f26d --- /dev/null +++ b/pandas/tests/series/apply/test_apply_relabeling.py @@ -0,0 +1,33 @@ +import pandas as pd +import pandas._testing as tm + + +class TestNamedAggregation: + def test_relabel_no_duplicated_method(self): + # this is to test there is no duplicated method used in agg + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df["A"].agg(foo="sum") + expected = df["A"].agg({"foo": "sum"}) + tm.assert_series_equal(result, expected) + + result = df["B"].agg(foo="min", bar="max") + expected = df["B"].agg({"foo": "min", "bar": "max"}) + tm.assert_series_equal(result, expected) + + result = df["B"].agg(foo=sum, bar=min, cat="max") + expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"}) + tm.assert_series_equal(result, expected) + + def test_relabel_duplicated_method(self): + # this is to test with nested renaming, duplicated method can be used + # if they are assigned with different new names + df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]}) + + result = df["A"].agg(foo="sum", bar="sum") + expected = pd.Series([6, 6], index=["foo", "bar"], name="A") + tm.assert_series_equal(result, expected) + + result = df["B"].agg(foo=min, bar="min") + expected = pd.Series([1, 1], index=["foo", "bar"], name="B") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/apply/test_series_apply.py similarity index 100% rename from pandas/tests/series/test_apply.py rename to pandas/tests/series/apply/test_series_apply.py From 70cf4f1b41d39bcabafcafc2a287633f31a9da3e Mon Sep 17 00:00:00 2001 
From: Robin to Roxel <35864265+r-toroxel@users.noreply.github.com> Date: Fri, 10 Jul 2020 21:32:34 +0200 Subject: [PATCH 0328/1025] Tst return none inplace series (#35210) --- pandas/tests/series/indexing/test_indexing.py | 6 ++-- .../series/methods/test_drop_duplicates.py | 33 ++++++++++++------- pandas/tests/series/methods/test_fillna.py | 3 +- pandas/tests/series/methods/test_replace.py | 24 +++++++++----- .../tests/series/methods/test_reset_index.py | 3 +- .../tests/series/methods/test_sort_values.py | 3 +- pandas/tests/series/methods/test_truncate.py | 3 +- pandas/tests/series/test_api.py | 18 ++++++---- pandas/tests/series/test_datetime_values.py | 3 +- pandas/tests/series/test_missing.py | 15 ++++++--- 10 files changed, 74 insertions(+), 37 deletions(-) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 737e21af9242f..3ed25b8bca566 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -736,14 +736,16 @@ def test_append_timedelta_does_not_cast(td): def test_underlying_data_conversion(): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) - df.set_index(["a", "b", "c"], inplace=True) + return_value = df.set_index(["a", "b", "c"], inplace=True) + assert return_value is None s = Series([1], index=[(2, 2, 2)]) df["val"] = 0 df df["val"].update(s) expected = DataFrame(dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0])) - expected.set_index(["a", "b", "c"], inplace=True) + return_value = expected.set_index(["a", "b", "c"], inplace=True) + assert return_value is None tm.assert_frame_equal(df, expected) # GH 3970 diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index a4532ebb3d8c5..40651c4342e8a 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -22,7 +22,8 @@ def 
test_drop_duplicates(any_numpy_dtype, keep, expected): tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) + return_value = sc.drop_duplicates(keep=keep, inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc[~expected]) @@ -40,8 +41,9 @@ def test_drop_duplicates_bool(keep, expected): tm.assert_series_equal(tc.duplicated(keep=keep), expected) tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) + return_value = sc.drop_duplicates(keep=keep, inplace=True) tm.assert_series_equal(sc, tc[~expected]) + assert return_value is None @pytest.mark.parametrize("values", [[], list(range(5))]) @@ -84,21 +86,24 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered): tm.assert_series_equal(tc1.duplicated(), expected) tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) sc = tc1.copy() - sc.drop_duplicates(inplace=True) + return_value = sc.drop_duplicates(inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc1[~expected]) expected = Series([False, False, True, False]) tm.assert_series_equal(tc1.duplicated(keep="last"), expected) tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected]) sc = tc1.copy() - sc.drop_duplicates(keep="last", inplace=True) + return_value = sc.drop_duplicates(keep="last", inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc1[~expected]) expected = Series([False, False, True, True]) tm.assert_series_equal(tc1.duplicated(keep=False), expected) tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) sc = tc1.copy() - sc.drop_duplicates(keep=False, inplace=True) + return_value = sc.drop_duplicates(keep=False, inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc1[~expected]) # Test case 2 @@ -113,21 +118,24 @@ def 
test_drop_duplicates_categorical_non_bool(self, dtype, ordered): tm.assert_series_equal(tc2.duplicated(), expected) tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) sc = tc2.copy() - sc.drop_duplicates(inplace=True) + return_value = sc.drop_duplicates(inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc2[~expected]) expected = Series([False, True, True, False, False, False, False]) tm.assert_series_equal(tc2.duplicated(keep="last"), expected) tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected]) sc = tc2.copy() - sc.drop_duplicates(keep="last", inplace=True) + return_value = sc.drop_duplicates(keep="last", inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc2[~expected]) expected = Series([False, True, True, False, True, True, False]) tm.assert_series_equal(tc2.duplicated(keep=False), expected) tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) sc = tc2.copy() - sc.drop_duplicates(keep=False, inplace=True) + return_value = sc.drop_duplicates(keep=False, inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc2[~expected]) def test_drop_duplicates_categorical_bool(self, ordered): @@ -141,19 +149,22 @@ def test_drop_duplicates_categorical_bool(self, ordered): tm.assert_series_equal(tc.duplicated(), expected) tm.assert_series_equal(tc.drop_duplicates(), tc[~expected]) sc = tc.copy() - sc.drop_duplicates(inplace=True) + return_value = sc.drop_duplicates(inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc[~expected]) expected = Series([True, True, False, False]) tm.assert_series_equal(tc.duplicated(keep="last"), expected) tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected]) sc = tc.copy() - sc.drop_duplicates(keep="last", inplace=True) + return_value = sc.drop_duplicates(keep="last", inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc[~expected]) expected = Series([True, True, True, True]) 
tm.assert_series_equal(tc.duplicated(keep=False), expected) tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) sc = tc.copy() - sc.drop_duplicates(keep=False, inplace=True) + return_value = sc.drop_duplicates(keep=False, inplace=True) + assert return_value is None tm.assert_series_equal(sc, tc[~expected]) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index c34838be24fc1..80b8271e16e7a 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -67,7 +67,8 @@ def test_fillna_numeric_inplace(self): x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"]) y = x.copy() - y.fillna(value=0, inplace=True) + return_value = y.fillna(value=0, inplace=True) + assert return_value is None expected = x.fillna(value=0) tm.assert_series_equal(y, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 8f57cf3191d5d..11802c59a29da 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -13,7 +13,8 @@ def test_replace(self, datetime_series): ser[6:10] = 0 # replace list with a single value - ser.replace([np.nan], -1, inplace=True) + return_value = ser.replace([np.nan], -1, inplace=True) + assert return_value is None exp = ser.fillna(-1) tm.assert_series_equal(ser, exp) @@ -48,7 +49,8 @@ def test_replace(self, datetime_series): tm.assert_series_equal(rs, rs2) # replace inplace - ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + assert return_value is None assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() @@ -124,7 +126,8 @@ def test_replace_with_single_list(self): tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4])) s = ser.copy() - s.replace([1, 2, 3], inplace=True) + return_value = s.replace([1, 2, 3], inplace=True) + assert return_value is 
None tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4])) # make sure things don't get corrupted when fillna call fails @@ -134,7 +137,8 @@ def test_replace_with_single_list(self): r"\(bfill\)\. Got crash_cymbal" ) with pytest.raises(ValueError, match=msg): - s.replace([1, 2, 3], inplace=True, method="crash_cymbal") + return_value = s.replace([1, 2, 3], inplace=True, method="crash_cymbal") + assert return_value is None tm.assert_series_equal(s, ser) def test_replace_with_empty_list(self): @@ -156,7 +160,8 @@ def test_replace_mixed_types(self): def check_replace(to_rep, val, expected): sc = s.copy() r = s.replace(to_rep, val) - sc.replace(to_rep, val, inplace=True) + return_value = sc.replace(to_rep, val, inplace=True) + assert return_value is None tm.assert_series_equal(expected, r) tm.assert_series_equal(expected, sc) @@ -242,7 +247,8 @@ def test_replace2(self): tm.assert_series_equal(rs, rs2) # replace inplace - ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True) + assert return_value is None assert (ser[:5] == -1).all() assert (ser[6:10] == -1).all() assert (ser[20:30] == -1).all() @@ -325,11 +331,13 @@ def test_replace_categorical_single(self): tm.assert_series_equal(expected, result) assert c[2] != "foo" # ensure non-inplace call does not alter original - c.replace(c[2], "foo", inplace=True) + return_value = c.replace(c[2], "foo", inplace=True) + assert return_value is None tm.assert_series_equal(expected, c) first_value = c[0] - c.replace(c[1], c[0], inplace=True) + return_value = c.replace(c[1], c[0], inplace=True) + assert return_value is None assert c[0] == c[1] == first_value # test replacing with existing value def test_replace_with_no_overflowerror(self): diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py index 597b43a370ef5..1474bb95f4af2 100644 --- a/pandas/tests/series/methods/test_reset_index.py +++ 
b/pandas/tests/series/methods/test_reset_index.py @@ -22,7 +22,8 @@ def test_reset_index(self): # check inplace s = ser.reset_index(drop=True) s2 = ser - s2.reset_index(drop=True, inplace=True) + return_value = s2.reset_index(drop=True, inplace=True) + assert return_value is None tm.assert_series_equal(s, s2) # level diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py index b32c59b4daa0d..b49e39d4592ea 100644 --- a/pandas/tests/series/methods/test_sort_values.py +++ b/pandas/tests/series/methods/test_sort_values.py @@ -65,7 +65,8 @@ def test_sort_values(self, datetime_series): # inplace=True ts = datetime_series.copy() - ts.sort_values(ascending=False, inplace=True) + return_value = ts.sort_values(ascending=False, inplace=True) + assert return_value is None tm.assert_series_equal(ts, datetime_series.sort_values(ascending=False)) tm.assert_index_equal( ts.index, datetime_series.sort_values(ascending=False).index diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index 8a2c62cee7e24..7c82edbaec177 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -136,7 +136,8 @@ def test_truncate_multiindex(self): df = pd.DataFrame.from_dict( {"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]} ) - df.set_index(["L1", "L2"], inplace=True) + return_value = df.set_index(["L1", "L2"], inplace=True) + assert return_value is None expected = df.col tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index 042841bb4e019..b174eb0e42776 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -179,7 +179,8 @@ def test_constructor_dict_timedelta_index(self): def test_sparse_accessor_updates_on_inplace(self): s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]") - s.drop([0, 1], inplace=True) + return_value = 
s.drop([0, 1], inplace=True) + assert return_value is None assert s.sparse.density == 1.0 def test_tab_completion(self): @@ -459,7 +460,8 @@ def f(x): def test_str_accessor_updates_on_inplace(self): s = pd.Series(list("abc")) - s.drop([0], inplace=True) + return_value = s.drop([0], inplace=True) + assert return_value is None assert len(s.str.lower()) == 2 def test_str_attribute(self): @@ -548,7 +550,8 @@ def test_cat_accessor(self): assert not s.cat.ordered, False exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"]) - s.cat.set_categories(["b", "a"], inplace=True) + return_value = s.cat.set_categories(["b", "a"], inplace=True) + assert return_value is None tm.assert_categorical_equal(s.values, exp) res = s.cat.set_categories(["b", "a"]) @@ -579,8 +582,10 @@ def test_cat_accessor_no_new_attributes(self): def test_cat_accessor_updates_on_inplace(self): s = Series(list("abc")).astype("category") - s.drop(0, inplace=True) - s.cat.remove_unused_categories(inplace=True) + return_value = s.drop(0, inplace=True) + assert return_value is None + return_value = s.cat.remove_unused_categories(inplace=True) + assert return_value is None assert len(s.cat.categories) == 2 def test_categorical_delegations(self): @@ -614,7 +619,8 @@ def test_categorical_delegations(self): assert s.cat.ordered s = s.cat.as_unordered() assert not s.cat.ordered - s.cat.as_ordered(inplace=True) + return_value = s.cat.as_ordered(inplace=True) + assert return_value is None assert s.cat.ordered # reorder diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 0fd51b8828bc5..d2ad9c8c398ea 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -625,7 +625,8 @@ def test_dt_accessor_invalid(self, ser): def test_dt_accessor_updates_on_inplace(self): s = Series(pd.date_range("2018-01-01", periods=10)) s[2] = None - s.fillna(pd.Timestamp("2018-01-01"), inplace=True) + return_value = 
s.fillna(pd.Timestamp("2018-01-01"), inplace=True) + assert return_value is None result = s.dt.date assert result[0] == result[2] diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 162778e372426..0144e4257efe0 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -453,7 +453,8 @@ def test_fillna_downcast(self): def test_fillna_int(self): s = Series(np.random.randint(-100, 100, 50)) - s.fillna(method="ffill", inplace=True) + return_value = s.fillna(method="ffill", inplace=True) + assert return_value is None tm.assert_series_equal(s.fillna(method="ffill", inplace=False), s) def test_categorical_nan_equality(self): @@ -680,7 +681,8 @@ def test_dropna_empty(self): s = Series([], dtype=object) assert len(s.dropna()) == 0 - s.dropna(inplace=True) + return_value = s.dropna(inplace=True) + assert return_value is None assert len(s) == 0 # invalid axis @@ -729,7 +731,8 @@ def test_dropna_no_nan(self): assert result is not s s2 = s.copy() - s2.dropna(inplace=True) + return_value = s2.dropna(inplace=True) + assert return_value is None tm.assert_series_equal(s2, s) def test_dropna_intervals(self): @@ -775,7 +778,8 @@ def test_pad_nan(self): [np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float ) - x.fillna(method="pad", inplace=True) + return_value = x.fillna(method="pad", inplace=True) + assert return_value is None expected = Series( [np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float @@ -799,7 +803,8 @@ def test_dropna_preserve_name(self, datetime_series): assert result.name == datetime_series.name name = datetime_series.name ts = datetime_series.copy() - ts.dropna(inplace=True) + return_value = ts.dropna(inplace=True) + assert return_value is None assert ts.name == name def test_series_fillna_limit(self): From 1bbcc8807d1aba22d84abfc81296c52b0683ba57 Mon Sep 17 00:00:00 2001 From: Gim Seng <26968986+gimseng@users.noreply.github.com> Date: Fri, 10 Jul 2020 
20:33:13 +0100 Subject: [PATCH 0329/1025] Fix strange behavior when precision display option is zero (#20359) (#35212) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/formats/format.py | 17 +++++++++++------ pandas/tests/io/formats/test_format.py | 9 +++++++++ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 798a3d838ef7e..d3724112ef455 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1018,6 +1018,7 @@ MultiIndex I/O ^^^ +- Bug in print-out when ``display.precision`` is zero. (:issue:`20359`) - Bug in :meth:`read_json` where integer overflow was occurring when json contains big number strings. (:issue:`30320`) - `read_csv` will now raise a ``ValueError`` when the arguments `header` and `prefix` both are not `None`. (:issue:`27394`) - Bug in :meth:`DataFrame.to_json` was raising ``NotFoundError`` when ``path_or_buf`` was an S3 URI (:issue:`28375`) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 66be1cedbc3bf..b4e1ebe93fb0e 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1384,9 +1384,9 @@ def format_values_with(float_format): if self.fixed_width: if is_complex: - result = _trim_zeros_complex(values, na_rep) + result = _trim_zeros_complex(values, self.decimal, na_rep) else: - result = _trim_zeros_float(values, na_rep) + result = _trim_zeros_float(values, self.decimal, na_rep) return np.asarray(result, dtype="object") return values @@ -1756,19 +1756,21 @@ def just(x): return result -def _trim_zeros_complex(str_complexes: np.ndarray, na_rep: str = "NaN") -> List[str]: +def _trim_zeros_complex( + str_complexes: np.ndarray, decimal: str = ".", na_rep: str = "NaN" +) -> List[str]: """ Separates the real and imaginary parts from the complex number, and executes the _trim_zeros_float method on each of those. 
""" return [ - "".join(_trim_zeros_float(re.split(r"([j+-])", x), na_rep)) + "".join(_trim_zeros_float(re.split(r"([j+-])", x), decimal, na_rep)) for x in str_complexes ] def _trim_zeros_float( - str_floats: Union[np.ndarray, List[str]], na_rep: str = "NaN" + str_floats: Union[np.ndarray, List[str]], decimal: str = ".", na_rep: str = "NaN" ) -> List[str]: """ Trims zeros, leaving just one before the decimal points if need be. @@ -1780,8 +1782,11 @@ def _is_number(x): def _cond(values): finite = [x for x in values if _is_number(x)] + has_decimal = [decimal in x for x in finite] + return ( len(finite) > 0 + and all(has_decimal) and all(x.endswith("0") for x in finite) and not (any(("e" in x) or ("E" in x) for x in finite)) ) @@ -1790,7 +1795,7 @@ def _cond(values): trimmed = [x[:-1] if _is_number(x) else x for x in trimmed] # leave one 0 after the decimal points if need be. - return [x + "0" if x.endswith(".") and _is_number(x) else x for x in trimmed] + return [x + "0" if x.endswith(decimal) and _is_number(x) else x for x in trimmed] def _has_names(index: Index) -> bool: diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 0a79f2321c432..4413c5145cd41 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2918,6 +2918,15 @@ def test_format(self): assert result[0] == " 12.0" assert result[1] == " 0.0" + def test_output_display_precision_trailing_zeroes(self): + # Issue #20359: trimming zeros while there is no decimal point + + # Happens when display precision is set to zero + with pd.option_context("display.precision", 0): + s = pd.Series([840.0, 4200.0]) + expected_output = "0 840\n1 4200\ndtype: float64" + assert str(s) == expected_output + def test_output_significant_digits(self): # Issue #9764 From 8ee3dfefb6f56c97c2e946bca6f1652340df5343 Mon Sep 17 00:00:00 2001 From: Zeb Nicholls Date: Sat, 11 Jul 2020 06:33:17 +1000 Subject: [PATCH 0330/1025] DOC: Add pint pandas 
ecosystem docs (#35170) --- doc/source/ecosystem.rst | 10 ++++++++++ web/pandas/community/ecosystem.md | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 72e24e34bc5c1..b02d4abd3ddf8 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -421,6 +421,14 @@ found in NumPy or pandas, which work well with pandas' data containers. Cyberpandas provides an extension type for storing arrays of IP Addresses. These arrays can be stored inside pandas' Series and DataFrame. +`Pint-Pandas`_ +~~~~~~~~~~~~~~ + +`Pint-Pandas <https://github.com/hgrecco/pint-pandas>` provides an extension type for +storing numeric arrays with units. These arrays can be stored inside pandas' +Series and DataFrame. Operations between Series and DataFrame columns which +use pint's extension array are then units aware. + .. _ecosystem.accessors: Accessors @@ -436,6 +444,7 @@ Library Accessor Classes Description `cyberpandas`_ ``ip`` ``Series`` Provides common operations for working with IP addresses. `pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library. `pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series. +`pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames. =============== ========== ========================= =============================================================== .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest @@ -443,3 +452,4 @@ Library Accessor Classes Description .. _Altair: https://altair-viz.github.io/ .. _pandas_path: https://github.com/drivendataorg/pandas-path/ .. _pathlib.Path: https://docs.python.org/3/library/pathlib.html +..
_pint-pandas: https://github.com/hgrecco/pint-pandas diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index 715a84c1babc6..be109ea53eb7d 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -353,13 +353,22 @@ Cyberpandas provides an extension type for storing arrays of IP Addresses. These arrays can be stored inside pandas' Series and DataFrame. +### [Pint-Pandas](https://github.com/hgrecco/pint-pandas) + +Pint-Pandas provides an extension type for storing numeric arrays with units. +These arrays can be stored inside pandas' Series and DataFrame. Operations +between Series and DataFrame columns which use pint's extension array are then +units aware. + ## Accessors A directory of projects providing `extension accessors `. This is for users to discover new accessors and for library authors to coordinate on the namespace. - | Library | Accessor | Classes | - | ------------------------------------------------------------|----------|-----------------------| - | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) | `ip` | `Series` | - | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | + | Library | Accessor | Classes | + | --------------------------------------------------------------|----------|-----------------------| + | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) | `ip` | `Series` | + | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | + | [pandas_path](https://github.com/drivendataorg/pandas-path/) | `path` | `Index`, `Series` | + | [pint-pandas](https://github.com/hgrecco/pint-pandas) | `pint` | `Series`, `DataFrame` | From 00fafa0d56b99e89ba20e9bd9f154c6c2850c5b2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Jul 2020 15:19:55 -0700 Subject: [PATCH 0331/1025] BUG: ensure_timedelta64ns overflows (#34448) --- .../reference/general_utility_functions.rst | 1 + pandas/_libs/tslibs/__init__.py | 3 
+- pandas/_libs/tslibs/conversion.pyx | 40 +++++++++++++++++-- pandas/core/internals/blocks.py | 3 +- pandas/errors/__init__.py | 2 +- pandas/tests/tslibs/test_api.py | 1 + pandas/tests/tslibs/test_conversion.py | 15 ++++++- 7 files changed, 57 insertions(+), 8 deletions(-) diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index 72a84217323ab..c1759110b94ad 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -43,6 +43,7 @@ Exceptions and warnings errors.NullFrequencyError errors.NumbaUtilError errors.OutOfBoundsDatetime + errors.OutOfBoundsTimedelta errors.ParserError errors.ParserWarning errors.PerformanceWarning diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 0ae4cc97d07e3..7723140e3eab1 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -7,6 +7,7 @@ "nat_strings", "is_null_datetimelike", "OutOfBoundsDatetime", + "OutOfBoundsTimedelta", "IncompatibleFrequency", "Period", "Resolution", @@ -26,7 +27,7 @@ ] from . import dtypes -from .conversion import localize_pydatetime +from .conversion import OutOfBoundsTimedelta, localize_pydatetime from .dtypes import Resolution from .nattype import NaT, NaTType, iNaT, is_null_datetimelike, nat_strings from .np_datetime import OutOfBoundsDatetime diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 31d2d0e9572f5..85da7a60a029a 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -51,6 +51,15 @@ DT64NS_DTYPE = np.dtype('M8[ns]') TD64NS_DTYPE = np.dtype('m8[ns]') +class OutOfBoundsTimedelta(ValueError): + """ + Raised when encountering a timedelta value that cannot be represented + as a timedelta64[ns]. 
+ """ + # Timedelta analogue to OutOfBoundsDatetime + pass + + # ---------------------------------------------------------------------- # Unit Conversion Helpers @@ -228,11 +237,34 @@ def ensure_timedelta64ns(arr: ndarray, copy: bool=True): Returns ------- - result : ndarray with dtype timedelta64[ns] - + ndarray[timedelta64[ns]] """ - return arr.astype(TD64NS_DTYPE, copy=copy) - # TODO: check for overflows when going from a lower-resolution to nanos + assert arr.dtype.kind == "m", arr.dtype + + if arr.dtype == TD64NS_DTYPE: + return arr.copy() if copy else arr + + # Re-use the datetime64 machinery to do an overflow-safe `astype` + dtype = arr.dtype.str.replace("m8", "M8") + dummy = arr.view(dtype) + try: + dt64_result = ensure_datetime64ns(dummy, copy) + except OutOfBoundsDatetime as err: + # Re-write the exception in terms of timedelta64 instead of dt64 + + # Find the value that we are going to report as causing an overflow + tdmin = arr.min() + tdmax = arr.max() + if np.abs(tdmin) >= np.abs(tdmax): + bad_val = tdmin + else: + bad_val = tdmax + + raise OutOfBoundsTimedelta( + f"Out of bounds for nanosecond {arr.dtype.name} {bad_val}" + ) + + return dt64_result.view(TD64NS_DTYPE) # ---------------------------------------------------------------------- diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d8779dae7c384..6a4b3318d3aa7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2302,7 +2302,8 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): def __init__(self, values, placement, ndim=None): if values.dtype != TD64NS_DTYPE: - values = conversion.ensure_timedelta64ns(values) + # e.g. 
non-nano or int64 + values = TimedeltaArray._from_sequence(values)._data if isinstance(values, TimedeltaArray): values = values._data assert isinstance(values, np.ndarray), type(values) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index e3427d93f3d84..6ac3004d29996 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -6,7 +6,7 @@ from pandas._config.config import OptionError -from pandas._libs.tslibs import OutOfBoundsDatetime +from pandas._libs.tslibs import OutOfBoundsDatetime, OutOfBoundsTimedelta class NullFrequencyError(ValueError): diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 036037032031a..eca444c9ceb34 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -32,6 +32,7 @@ def test_namespace(): "is_null_datetimelike", "nat_strings", "OutOfBoundsDatetime", + "OutOfBoundsTimedelta", "Period", "IncompatibleFrequency", "Resolution", diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index b35940c6bb95b..4f184b78f34a1 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -4,7 +4,13 @@ import pytest from pytz import UTC -from pandas._libs.tslibs import conversion, iNaT, timezones, tzconversion +from pandas._libs.tslibs import ( + OutOfBoundsTimedelta, + conversion, + iNaT, + timezones, + tzconversion, +) from pandas import Timestamp, date_range import pandas._testing as tm @@ -89,6 +95,13 @@ def test_ensure_datetime64ns_bigendian(): tm.assert_numpy_array_equal(result, expected) +def test_ensure_timedelta64ns_overflows(): + arr = np.arange(10).astype("m8[Y]") * 100 + msg = r"Out of bounds for nanosecond timedelta64\[Y\] 900" + with pytest.raises(OutOfBoundsTimedelta, match=msg): + conversion.ensure_timedelta64ns(arr) + + class SubDatetime(datetime): pass From 0fba7652f53aedd561d53d542840e7ab4137699e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Jul 2020 
15:20:26 -0700 Subject: [PATCH 0332/1025] asvs for normalize functions (#35221) --- asv_bench/benchmarks/tslibs/normalize.py | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 asv_bench/benchmarks/tslibs/normalize.py diff --git a/asv_bench/benchmarks/tslibs/normalize.py b/asv_bench/benchmarks/tslibs/normalize.py new file mode 100644 index 0000000000000..7d4e0556f4d96 --- /dev/null +++ b/asv_bench/benchmarks/tslibs/normalize.py @@ -0,0 +1,32 @@ +try: + from pandas._libs.tslibs import normalize_i8_timestamps, is_date_array_normalized +except ImportError: + from pandas._libs.tslibs.conversion import ( + normalize_i8_timestamps, + is_date_array_normalized, + ) + +import pandas as pd + +from .tslib import _sizes, _tzs + + +class Normalize: + params = [ + _sizes, + _tzs, + ] + param_names = ["size", "tz"] + + def setup(self, size, tz): + # use an array that will have is_date_array_normalized give True, + # so we do not short-circuit early. + dti = pd.date_range("2016-01-01", periods=10, tz=tz).repeat(size // 10) + self.i8data = dti.asi8 + + def time_normalize_i8_timestamps(self, size, tz): + normalize_i8_timestamps(self.i8data, tz) + + def time_is_date_array_normalized(self, size, tz): + # TODO: cases with different levels of short-circuiting + is_date_array_normalized(self.i8data, tz) From 8214ff5c3974988bda7fef3d65faa2bfea5a18dc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Jul 2020 15:20:51 -0700 Subject: [PATCH 0333/1025] CLN: remove unused freq kwarg in libparsing (#35167) --- pandas/_libs/tslibs/parsing.pyx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 92654f3b587e5..c4f369d0d3b3f 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -197,7 +197,6 @@ cdef inline bint does_string_look_like_time(str parse_string): def parse_datetime_string( str date_string, - object freq=None, bint dayfirst=False, 
bint yearfirst=False, **kwargs, @@ -228,7 +227,7 @@ def parse_datetime_string( return dt try: - dt, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq) + dt, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq=None) return dt except DateParseError: raise @@ -265,9 +264,6 @@ def parse_time_string(arg: str, freq=None, dayfirst=None, yearfirst=None): ------- datetime, datetime/dateutil.parser._result, str """ - if not isinstance(arg, str): - raise TypeError("parse_time_string argument must be str") - if is_offset_object(freq): freq = freq.rule_code From 62ed26a9b9bb82f7d50857be3de777d8300df197 Mon Sep 17 00:00:00 2001 From: Evan Kanter <34668740+evank28@users.noreply.github.com> Date: Fri, 10 Jul 2020 18:22:50 -0400 Subject: [PATCH 0334/1025] Update 02_read_write.rst (#35222) --- doc/source/getting_started/intro_tutorials/02_read_write.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index 12fa2a1e094d6..c6c6bfefc4303 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -151,7 +151,7 @@ named *passengers* instead of the default *Sheet1*. By setting -The equivalent read function :meth:`~DataFrame.to_excel` will reload the data to a +The equivalent read function :meth:`~DataFrame.read_excel` will reload the data to a ``DataFrame``: .. 
ipython:: python From 9327f673a2fd31f87b35e1af621697329a852fff Mon Sep 17 00:00:00 2001 From: rhshadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 10 Jul 2020 18:46:41 -0400 Subject: [PATCH 0335/1025] BUG: transform with nunique should have dtype int64 (#35152) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/common.py | 21 ++++++++++++++++++- pandas/core/groupby/generic.py | 30 ++++++++++------------------ pandas/tests/groupby/test_nunique.py | 8 ++++++++ 4 files changed, 40 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d3724112ef455..a4c107ddefd7b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1089,6 +1089,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) +- Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/common.py b/pandas/core/common.py index b4f726f4e59a9..e7260a9923ee0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -5,10 +5,11 @@ """ from collections import abc, defaultdict +import contextlib from datetime import datetime, timedelta from functools import partial import inspect -from typing import Any, Collection, Iterable, List, Union +from typing import Any, Collection, Iterable, Iterator, List, Union import warnings import numpy as np @@ -502,3 +503,21 @@ def convert_to_list_like( return list(values) return [values] + + +@contextlib.contextmanager +def temp_setattr(obj, attr: str, value) -> 
Iterator[None]: + """Temporarily set attribute on an object. + + Args: + obj: Object whose attribute will be modified. + attr: Attribute to modify. + value: Value to temporarily set attribute to. + + Yields: + obj with modified attribute. + """ + old_value = getattr(obj, attr) + setattr(obj, attr, value) + yield obj + setattr(obj, attr, old_value) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7f2eac520264d..1f49ee2b0b665 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -500,8 +500,10 @@ def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): # If func is a reduction, we need to broadcast the # result to the whole group. Compute func result # and deal with possible broadcasting below. - result = getattr(self, func)(*args, **kwargs) - return self._transform_fast(result, func) + # Temporarily set observed for dealing with categoricals. + with com.temp_setattr(self, "observed", True): + result = getattr(self, func)(*args, **kwargs) + return self._transform_fast(result) def _transform_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs @@ -554,17 +556,14 @@ def _transform_general( result.index = self._selected_obj.index return result - def _transform_fast(self, result, func_nm: str) -> Series: + def _transform_fast(self, result) -> Series: """ fast version of transform, only applicable to builtin/cythonizable functions """ ids, _, ngroup = self.grouper.group_info result = result.reindex(self.grouper.result_index, copy=False) - cast = self._transform_should_cast(func_nm) out = algorithms.take_1d(result._values, ids) - if cast: - out = maybe_cast_result(out, self.obj, how=func_nm) return self.obj._constructor(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): @@ -1465,25 +1464,23 @@ def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): # If func is a reduction, we need to 
broadcast the # result to the whole group. Compute func result # and deal with possible broadcasting below. - result = getattr(self, func)(*args, **kwargs) + # Temporarily set observed for dealing with categoricals. + with com.temp_setattr(self, "observed", True): + result = getattr(self, func)(*args, **kwargs) if isinstance(result, DataFrame) and result.columns.equals( self._obj_with_exclusions.columns ): - return self._transform_fast(result, func) + return self._transform_fast(result) return self._transform_general( func, engine=engine, engine_kwargs=engine_kwargs, *args, **kwargs ) - def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: + def _transform_fast(self, result: DataFrame) -> DataFrame: """ Fast transform path for aggregations """ - # if there were groups with no observations (Categorical only?) - # try casting data to original dtype - cast = self._transform_should_cast(func_nm) - obj = self._obj_with_exclusions # for each col, reshape to to size of original frame @@ -1492,12 +1489,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: result = result.reindex(self.grouper.result_index, copy=False) output = [] for i, _ in enumerate(result.columns): - res = algorithms.take_1d(result.iloc[:, i].values, ids) - # TODO: we have no test cases that get here with EA dtypes; - # maybe_cast_result may not be needed if EAs never get here - if cast: - res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm) - output.append(res) + output.append(algorithms.take_1d(result.iloc[:, i].values, ids)) return self.obj._constructor._from_arrays( output, columns=result.columns, index=obj.index diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py index 1475b1ce2907c..c3347b7ae52f3 100644 --- a/pandas/tests/groupby/test_nunique.py +++ b/pandas/tests/groupby/test_nunique.py @@ -167,3 +167,11 @@ def test_nunique_preserves_column_level_names(): result = test.groupby([0, 0, 0]).nunique() expected = 
pd.DataFrame([2], columns=test.columns) tm.assert_frame_equal(result, expected) + + +def test_nunique_transform_with_datetime(): + # GH 35109 - transform with nunique on datetimes results in integers + df = pd.DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"]) + result = df.groupby([0, 0, 1])["date"].transform("nunique") + expected = pd.Series([2, 2, 1], name="date") + tm.assert_series_equal(result, expected) From 3d727116c9d26996bf72fe3c701afb954fef1afc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 10 Jul 2020 16:54:34 -0700 Subject: [PATCH 0336/1025] REF: move registry, Registry to dtypes.base (#34830) --- pandas/api/extensions/__init__.py | 2 +- pandas/core/arrays/integer.py | 2 +- pandas/core/arrays/sparse/dtype.py | 3 +- pandas/core/arrays/string_.py | 3 +- pandas/core/construction.py | 2 +- pandas/core/dtypes/base.py | 91 +++++++++++++++++++++++++++++- pandas/core/dtypes/common.py | 2 +- pandas/core/dtypes/dtypes.py | 91 +----------------------------- pandas/tests/arrays/test_array.py | 2 +- pandas/tests/arrays/test_period.py | 3 +- pandas/tests/dtypes/test_dtypes.py | 2 +- 11 files changed, 101 insertions(+), 102 deletions(-) diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index 3019dd0e9b371..401e7081d2422 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -4,7 +4,7 @@ from pandas._libs.lib import no_default -from pandas.core.dtypes.dtypes import ExtensionDtype, register_extension_dtype +from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype from pandas.core.accessor import ( register_dataframe_accessor, diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b5cb681812939..b0958af41158c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -10,6 +10,7 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly +from pandas.core.dtypes.base 
import register_extension_dtype from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, @@ -21,7 +22,6 @@ is_object_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna from pandas.core import ops diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index b3da9cbeb44af..ccf2825162f51 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -9,7 +9,7 @@ from pandas._typing import Dtype, DtypeObj from pandas.errors import PerformanceWarning -from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype from pandas.core.dtypes.cast import astype_nansafe from pandas.core.dtypes.common import ( is_bool_dtype, @@ -19,7 +19,6 @@ is_string_dtype, pandas_dtype, ) -from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna, na_value_for_dtype if TYPE_CHECKING: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index ac501a8afbe09..5104e3f12f5b4 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -5,9 +5,8 @@ from pandas._libs import lib, missing as libmissing -from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype from pandas.core.dtypes.common import pandas_dtype -from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9ac661f97a56e..6c58698989e96 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -15,6 +15,7 @@ from pandas._libs.tslibs import IncompatibleFrequency, OutOfBoundsDatetime from pandas._typing import AnyArrayLike, 
ArrayLike, Dtype, DtypeObj +from pandas.core.dtypes.base import ExtensionDtype, registry from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, @@ -36,7 +37,6 @@ is_object_dtype, is_timedelta64_ns_dtype, ) -from pandas.core.dtypes.dtypes import ExtensionDtype, registry from pandas.core.dtypes.generic import ( ABCExtensionArray, ABCIndexClass, diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 2d81dd4d884a3..07c73876954d0 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -2,7 +2,7 @@ Extend pandas with custom array types. """ -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Type, Union import numpy as np @@ -352,3 +352,92 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: return self else: return None + + +def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: + """ + Register an ExtensionType with pandas as class decorator. + + .. versionadded:: 0.24.0 + + This enables operations like ``.astype(name)`` for the name + of the ExtensionDtype. + + Returns + ------- + callable + A class decorator. + + Examples + -------- + >>> from pandas.api.extensions import register_extension_dtype + >>> from pandas.api.extensions import ExtensionDtype + >>> @register_extension_dtype + ... class MyExtensionDtype(ExtensionDtype): + ... name = "myextension" + """ + registry.register(cls) + return cls + + +class Registry: + """ + Registry for dtype inference. + + The registry allows one to map a string repr of a extension + dtype to an extension dtype. The string alias can be used in several + places, including + + * Series and Index constructors + * :meth:`pandas.array` + * :meth:`pandas.Series.astype` + + Multiple extension types can be registered. + These are tried in order. 
+ """ + + def __init__(self): + self.dtypes: List[Type[ExtensionDtype]] = [] + + def register(self, dtype: Type[ExtensionDtype]) -> None: + """ + Parameters + ---------- + dtype : ExtensionDtype class + """ + if not issubclass(dtype, ExtensionDtype): + raise ValueError("can only register pandas extension dtypes") + + self.dtypes.append(dtype) + + def find( + self, dtype: Union[Type[ExtensionDtype], str] + ) -> Optional[Type[ExtensionDtype]]: + """ + Parameters + ---------- + dtype : Type[ExtensionDtype] or str + + Returns + ------- + return the first matching dtype, otherwise return None + """ + if not isinstance(dtype, str): + dtype_type = dtype + if not isinstance(dtype, type): + dtype_type = type(dtype) + if issubclass(dtype_type, ExtensionDtype): + return dtype + + return None + + for dtype_type in self.dtypes: + try: + return dtype_type.construct_from_string(dtype) + except TypeError: + pass + + return None + + +registry = Registry() diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 9e960375e9bf4..a2ca4d84b2bf6 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -11,13 +11,13 @@ from pandas._libs.tslibs import conversion from pandas._typing import ArrayLike, DtypeObj +from pandas.core.dtypes.base import registry from pandas.core.dtypes.dtypes import ( CategoricalDtype, DatetimeTZDtype, ExtensionDtype, IntervalDtype, PeriodDtype, - registry, ) from pandas.core.dtypes.generic import ABCCategorical, ABCIndexClass from pandas.core.dtypes.inference import ( # noqa:F401 diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a9d2430717e4f..22480fbc47508 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -24,7 +24,7 @@ from pandas._libs.tslibs.offsets import BaseOffset from pandas._typing import DtypeObj, Ordered -from pandas.core.dtypes.base import ExtensionDtype +from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype from 
pandas.core.dtypes.generic import ABCCategoricalIndex, ABCIndexClass from pandas.core.dtypes.inference import is_bool, is_list_like @@ -40,95 +40,6 @@ str_type = str -def register_extension_dtype(cls: Type[ExtensionDtype]) -> Type[ExtensionDtype]: - """ - Register an ExtensionType with pandas as class decorator. - - .. versionadded:: 0.24.0 - - This enables operations like ``.astype(name)`` for the name - of the ExtensionDtype. - - Returns - ------- - callable - A class decorator. - - Examples - -------- - >>> from pandas.api.extensions import register_extension_dtype - >>> from pandas.api.extensions import ExtensionDtype - >>> @register_extension_dtype - ... class MyExtensionDtype(ExtensionDtype): - ... pass - """ - registry.register(cls) - return cls - - -class Registry: - """ - Registry for dtype inference. - - The registry allows one to map a string repr of a extension - dtype to an extension dtype. The string alias can be used in several - places, including - - * Series and Index constructors - * :meth:`pandas.array` - * :meth:`pandas.Series.astype` - - Multiple extension types can be registered. - These are tried in order. 
- """ - - def __init__(self): - self.dtypes: List[Type[ExtensionDtype]] = [] - - def register(self, dtype: Type[ExtensionDtype]) -> None: - """ - Parameters - ---------- - dtype : ExtensionDtype class - """ - if not issubclass(dtype, ExtensionDtype): - raise ValueError("can only register pandas extension dtypes") - - self.dtypes.append(dtype) - - def find( - self, dtype: Union[Type[ExtensionDtype], str] - ) -> Optional[Type[ExtensionDtype]]: - """ - Parameters - ---------- - dtype : Type[ExtensionDtype] or str - - Returns - ------- - return the first matching dtype, otherwise return None - """ - if not isinstance(dtype, str): - dtype_type = dtype - if not isinstance(dtype, type): - dtype_type = type(dtype) - if issubclass(dtype_type, ExtensionDtype): - return dtype - - return None - - for dtype_type in self.dtypes: - try: - return dtype_type.construct_from_string(dtype) - except TypeError: - pass - - return None - - -registry = Registry() - - class PandasExtensionDtype(ExtensionDtype): """ A np.dtype duck-typed class, suitable for holding a custom dtype. 
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index ad6e6e4a98057..a0525aa511ee2 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -5,7 +5,7 @@ import pytest import pytz -from pandas.core.dtypes.dtypes import registry +from pandas.core.dtypes.base import registry import pandas as pd import pandas._testing as tm diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 27e6334788284..8887dd0278afe 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -5,7 +5,8 @@ from pandas._libs.tslibs.period import IncompatibleFrequency import pandas.util._test_decorators as td -from pandas.core.dtypes.dtypes import PeriodDtype, registry +from pandas.core.dtypes.base import registry +from pandas.core.dtypes.dtypes import PeriodDtype import pandas as pd import pandas._testing as tm diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index b1fe673e9e2f1..a58dc5e5ec74a 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -4,6 +4,7 @@ import pytest import pytz +from pandas.core.dtypes.base import registry from pandas.core.dtypes.common import ( is_bool_dtype, is_categorical, @@ -22,7 +23,6 @@ DatetimeTZDtype, IntervalDtype, PeriodDtype, - registry, ) import pandas as pd From 1c6e5c002cc425398c7e5e7528d569aa4a9d684f Mon Sep 17 00:00:00 2001 From: Justin Essert Date: Fri, 10 Jul 2020 21:02:54 -0500 Subject: [PATCH 0337/1025] DF.__setitem__ creates extension column when given extension scalar (#34875) * Bugfix to make DF.__setitem__ create extension column instead of object column when given an extension scalar * removed bad whitespace * Apply suggestions from code review Checking if extension dtype via built in function instead of manually Co-authored-by: Tom Augspurger * added missing : * modified cast_extension_scalar_to_array test to include an Interval type * added 
user-facing test for extension type bug * fixed pep8 issues * added note about bug in setting series to scalar extension type * corrected order of imports * corrected order of imports * fixed black formatting errors * removed extra comma * updated cast_scalar_to_arr to support tuple shape for extension dtype * removed unneeded code * added coverage for datetime with timezone in extension_array test * added TODO * correct line that was too long * fixed dtype issue with tz test * creating distinct arrays for each column * resolving mypy error * added docstring info and test * removed unneeded import * flattened else case in init * refactored extension type column fix * reverted docstring changes * reverted docstring changes * removed unneeded imports * reverted test changes * fixed construct_1d_arraylike bug * reorganized if statements * moved what's new statement to correct file * created new test for period df construction * added assert_frame_equal to period_data test * Using pandas array instead of df constructor for better test Co-authored-by: Joris Van den Bossche * changed wording * pylint fixes * parameterized test and added comment * removed extra comma * parameterized test * renamed test Co-authored-by: Tom Augspurger Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 48 ++++++++++++++----- pandas/tests/frame/indexing/test_setitem.py | 33 ++++++++++++- .../tests/frame/methods/test_combine_first.py | 8 +++- pandas/tests/frame/test_constructors.py | 28 ++++++++++- 5 files changed, 103 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a4c107ddefd7b..eb4075927b6aa 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1146,6 +1146,7 @@ ExtensionArray - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) - Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument 
for aggregations on nullable boolean dtypes (:issue:`34051`) - Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) +- Bug where :class:`DataFrame` column set to scalar extension type was considered an object type rather than the extension type (:issue:`34832`) - Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`). Other diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 10539ab74b4aa..cfe5621fec14e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -520,25 +520,43 @@ def __init__( mgr = init_ndarray(data, index, columns, dtype=dtype, copy=copy) else: mgr = init_dict({}, index, columns, dtype=dtype) + # For data is scalar else: - try: - arr = np.array(data, dtype=dtype, copy=copy) - except (ValueError, TypeError) as err: - exc = TypeError( - "DataFrame constructor called with " - f"incompatible data and dtype: {err}" - ) - raise exc from err + if index is None or columns is None: + raise ValueError("DataFrame constructor not properly called!") + + if not dtype: + dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) + + # For data is a scalar extension dtype + if is_extension_array_dtype(dtype): + + values = [ + construct_1d_arraylike_from_scalar(data, len(index), dtype) + for _ in range(len(columns)) + ] + mgr = arrays_to_mgr(values, columns, index, columns, dtype=None) + else: + # Attempt to coerce to a numpy array + try: + arr = np.array(data, dtype=dtype, copy=copy) + except (ValueError, TypeError) as err: + exc = TypeError( + "DataFrame constructor called with " + f"incompatible data and dtype: {err}" + ) + raise exc from err + + if arr.ndim != 0: + raise ValueError("DataFrame constructor not properly called!") - if arr.ndim == 0 and index is not None and columns is not None: values = cast_scalar_to_array( (len(index), len(columns)), data, dtype=dtype ) + mgr = init_ndarray( values, index, columns, dtype=values.dtype, copy=False ) - else: - raise 
ValueError("DataFrame constructor not properly called!") NDFrame.__init__(self, mgr) @@ -3740,7 +3758,13 @@ def reindexer(value): infer_dtype, _ = infer_dtype_from_scalar(value, pandas_dtype=True) # upcast - value = cast_scalar_to_array(len(self.index), value) + if is_extension_array_dtype(infer_dtype): + value = construct_1d_arraylike_from_scalar( + value, len(self.index), infer_dtype + ) + else: + value = cast_scalar_to_array(len(self.index), value) + value = maybe_cast_to_datetime(value, infer_dtype) # return internal types directly diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 8fcdae95fbab5..9bb5338f1e07f 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -1,7 +1,18 @@ import numpy as np import pytest -from pandas import Categorical, DataFrame, Index, Series, Timestamp, date_range +from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype + +from pandas import ( + Categorical, + DataFrame, + Index, + Interval, + Period, + Series, + Timestamp, + date_range, +) import pandas._testing as tm from pandas.core.arrays import SparseArray @@ -150,3 +161,23 @@ def test_setitem_dict_preserves_dtypes(self): "c": float(b), } tm.assert_frame_equal(df, expected) + + @pytest.mark.parametrize( + "obj,dtype", + [ + (Period("2020-01"), PeriodDtype("M")), + (Interval(left=0, right=5), IntervalDtype("int64")), + ( + Timestamp("2011-01-01", tz="US/Eastern"), + DatetimeTZDtype(tz="US/Eastern"), + ), + ], + ) + def test_setitem_extension_types(self, obj, dtype): + # GH: 34832 + expected = DataFrame({"idx": [1, 2, 3], "obj": Series([obj] * 3, dtype=dtype)}) + + df = DataFrame({"idx": [1, 2, 3]}) + df["obj"] = obj + + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index 7715cb1cb6eec..78f265d32f8df 100644 --- 
a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -199,12 +199,14 @@ def test_combine_first_timezone(self): columns=["UTCdatetime", "abc"], data=data1, index=pd.date_range("20140627", periods=1), + dtype="object", ) data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC") df2 = pd.DataFrame( columns=["UTCdatetime", "xyz"], data=data2, index=pd.date_range("20140628", periods=1), + dtype="object", ) res = df2[["UTCdatetime"]].combine_first(df1) exp = pd.DataFrame( @@ -217,10 +219,14 @@ def test_combine_first_timezone(self): }, columns=["UTCdatetime", "abc"], index=pd.date_range("20140627", periods=2, freq="D"), + dtype="object", ) - tm.assert_frame_equal(res, exp) assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]" assert res["abc"].dtype == "datetime64[ns, UTC]" + # Need to cast all to "obejct" because combine_first does not retain dtypes: + # GH Issue 7509 + res = res.astype("object") + tm.assert_frame_equal(res, exp) # see gh-10567 dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC") diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index ab4f7781467e7..64ae29e6de63c 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -14,13 +14,16 @@ from pandas.compat.numpy import _np_version_under1p19 from pandas.core.dtypes.common import is_integer_dtype +from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype import pandas as pd from pandas import ( Categorical, DataFrame, Index, + Interval, MultiIndex, + Period, RangeIndex, Series, Timedelta, @@ -700,7 +703,7 @@ def create_data(constructor): tm.assert_frame_equal(result_timedelta, expected) tm.assert_frame_equal(result_Timedelta, expected) - def test_constructor_period(self): + def test_constructor_period_dict(self): # PeriodIndex a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M") b = pd.PeriodIndex(["2012-02-01", 
"2012-03-01", "NaT"], freq="D") @@ -713,6 +716,29 @@ def test_constructor_period(self): assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype + @pytest.mark.parametrize( + "data,dtype", + [ + (Period("2020-01"), PeriodDtype("M")), + (Interval(left=0, right=5), IntervalDtype("int64")), + ( + Timestamp("2011-01-01", tz="US/Eastern"), + DatetimeTZDtype(tz="US/Eastern"), + ), + ], + ) + def test_constructor_extension_scalar_data(self, data, dtype): + # GH 34832 + df = DataFrame(index=[0, 1], columns=["a", "b"], data=data) + + assert df["a"].dtype == dtype + assert df["b"].dtype == dtype + + arr = pd.array([data] * 2, dtype=dtype) + expected = DataFrame({"a": arr, "b": arr}) + + tm.assert_frame_equal(df, expected) + def test_nested_dict_frame_constructor(self): rng = pd.period_range("1/1/2000", periods=5) df = DataFrame(np.random.randn(10, 5), columns=rng) From 292a0ddfee8f06abc5800b3c1db034ed9c2f26ff Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Sat, 11 Jul 2020 04:10:19 +0200 Subject: [PATCH 0338/1025] TYPING/DOC: Move custom type to _typing and add whatsnew (#35220) * remove \n from docstring * fix issue 17038 * revert change * revert change * move defined typing and add whatsnew --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_typing.py | 8 ++++++++ pandas/core/aggregation.py | 10 +--------- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index eb4075927b6aa..5f93e08d51baa 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -332,6 +332,7 @@ Other enhancements - :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similarly to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). 
- :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable boolean dtype (:issue:`34859`) - :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) +- :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) .. --------------------------------------------------------------------------- diff --git a/pandas/_typing.py b/pandas/_typing.py index 4892abc5f6f51..8e98833ad37f7 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -96,3 +96,11 @@ # DataFrame::sort_index, among others ValueKeyFunc = Optional[Callable[["Series"], Union["Series", AnyArrayLike]]] IndexKeyFunc = Optional[Callable[["Index"], Union["Index", AnyArrayLike]]] + +# types of `func` kwarg for DataFrame.aggregate and Series.aggregate +AggFuncTypeBase = Union[Callable, str] +AggFuncType = Union[ + AggFuncTypeBase, + List[AggFuncTypeBase], + Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], +] diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 16c4a9f862d79..891048ae82dfd 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -17,7 +17,7 @@ Union, ) -from pandas._typing import Label +from pandas._typing import AggFuncType, Label from pandas.core.dtypes.common import is_dict_like, is_list_like @@ -26,14 +26,6 @@ from pandas.core.indexes.api import Index from pandas.core.series import FrameOrSeriesUnion, Series -# types of `func` kwarg for DataFrame.aggregate and Series.aggregate -AggFuncTypeBase = Union[Callable, str] -AggFuncType = Union[ - AggFuncTypeBase, - List[AggFuncTypeBase], - Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], -] - def reconstruct_func( func: Optional[AggFuncType], **kwargs, From 4cd1f111977654aa8dac7529e85c58561a9a913f Mon Sep 17 00:00:00 2001 From: Paul 
Sanders Date: Sat, 11 Jul 2020 04:51:36 +0000 Subject: [PATCH 0339/1025] TST: added test for groupby/apply timezone-aware with copy (#35225) --- pandas/tests/groupby/test_apply.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 1945647ced08f..aa10f44670361 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -995,3 +995,18 @@ def test_apply_function_with_indexing_return_column(): result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean()) expected = DataFrame({"foo1": ["one", "three", "two"], "foo2": [3.0, 4.0, 4.0]}) tm.assert_frame_equal(result, expected) + + +def test_apply_with_timezones_aware(): + # GH: 27212 + + dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2 + index_no_tz = pd.DatetimeIndex(dates) + index_tz = pd.DatetimeIndex(dates, tz="UTC") + df1 = pd.DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz}) + df2 = pd.DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz}) + + result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) + result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) + + tm.assert_frame_equal(result1, result2) From f1f1ffd0fb51edac6908b1a9292afa9d152e67bd Mon Sep 17 00:00:00 2001 From: Robin to Roxel <35864265+r-toroxel@users.noreply.github.com> Date: Sat, 11 Jul 2020 12:44:56 +0200 Subject: [PATCH 0340/1025] TST add corner cases in test_constructors (#35216) * TST add test case to drop_duplicates for inplace=True * CLN PEP-8 * TST move to existing test * CLN remove parenthesis * TST test from_tuple corner cases * add comment * CLN run black formatting * CLN refactor case as separate test --- pandas/tests/frame/test_constructors.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py 
b/pandas/tests/frame/test_constructors.py index 64ae29e6de63c..bfff58d05007f 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1204,6 +1204,13 @@ def test_constructor_list_of_odicts(self): expected = DataFrame(index=[0]) tm.assert_frame_equal(result, expected) + def test_constructor_single_row(self): + data = [OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]])] + + result = DataFrame(data) + expected = DataFrame.from_dict(dict(zip([0], data)), orient="index") + tm.assert_frame_equal(result, expected.reindex(result.index)) + def test_constructor_ordered_dict_preserve_order(self): # see gh-13304 expected = DataFrame([[2, 1]], columns=["b", "a"]) @@ -1519,16 +1526,17 @@ def test_from_dict_columns_parameter(self): ) @pytest.mark.parametrize( - "data_dict, keys", + "data_dict, keys, orient", [ - ([{("a",): 1}, {("a",): 2}], [("a",)]), - ([OrderedDict([(("a",), 1), (("b",), 2)])], [("a",), ("b",)]), - ([{("a", "b"): 1}], [("a", "b")]), + ({}, [], "index"), + ([{("a",): 1}, {("a",): 2}], [("a",)], "columns"), + ([OrderedDict([(("a",), 1), (("b",), 2)])], [("a",), ("b",)], "columns"), + ([{("a", "b"): 1}], [("a", "b")], "columns"), ], ) - def test_constructor_from_dict_tuples(self, data_dict, keys): + def test_constructor_from_dict_tuples(self, data_dict, keys, orient): # GH 16769 - df = DataFrame.from_dict(data_dict) + df = DataFrame.from_dict(data_dict, orient) result = df.columns expected = Index(keys, dtype="object", tupleize_cols=False) From f5d59ef57114f2eb490fbe16fa4f73aca489f4df Mon Sep 17 00:00:00 2001 From: Robin to Roxel <35864265+r-toroxel@users.noreply.github.com> Date: Sun, 12 Jul 2020 14:05:02 +0200 Subject: [PATCH 0341/1025] Tst verify return none in tests/frame (#35232) --- .../tests/frame/indexing/test_categorical.py | 5 +- pandas/tests/frame/indexing/test_mask.py | 6 +- pandas/tests/frame/indexing/test_where.py | 42 +++-- pandas/tests/frame/methods/test_clip.py | 3 +- 
pandas/tests/frame/methods/test_drop.py | 9 +- .../tests/frame/methods/test_interpolate.py | 12 +- pandas/tests/frame/methods/test_rename.py | 3 +- .../tests/frame/methods/test_rename_axis.py | 6 +- pandas/tests/frame/methods/test_replace.py | 163 +++++++++++++----- .../tests/frame/methods/test_reset_index.py | 6 +- pandas/tests/frame/methods/test_set_index.py | 3 +- pandas/tests/frame/methods/test_sort_index.py | 15 +- .../tests/frame/methods/test_sort_values.py | 27 ++- pandas/tests/frame/test_block_internals.py | 3 +- pandas/tests/frame/test_query_eval.py | 18 +- pandas/tests/frame/test_reshape.py | 3 +- pandas/tests/frame/test_to_csv.py | 3 +- 17 files changed, 230 insertions(+), 97 deletions(-) diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index d94dc8d2ffe00..cfc22b9b18729 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -326,7 +326,10 @@ def test_assigning_ops(self): df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf) exp_fancy = exp_multi_row.copy() - exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True) + return_value = exp_fancy["cats"].cat.set_categories( + ["a", "b", "c"], inplace=True + ) + assert return_value is None df[df["cats"] == "c"] = ["b", 2] # category c is kept in .categories diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py index 30db6110efc80..23f3a18881782 100644 --- a/pandas/tests/frame/indexing/test_mask.py +++ b/pandas/tests/frame/indexing/test_mask.py @@ -36,12 +36,14 @@ def test_mask_inplace(self): rdf = df.copy() - rdf.where(cond, inplace=True) + return_value = rdf.where(cond, inplace=True) + assert return_value is None tm.assert_frame_equal(rdf, df.where(cond)) tm.assert_frame_equal(rdf, df.mask(~cond)) rdf = df.copy() - rdf.where(cond, -df, inplace=True) + return_value = rdf.where(cond, -df, inplace=True) + assert return_value is 
None tm.assert_frame_equal(rdf, df.where(cond, -df)) tm.assert_frame_equal(rdf, df.mask(~cond, -df)) diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py index 24eb424bd5735..d114a3178b686 100644 --- a/pandas/tests/frame/indexing/test_where.py +++ b/pandas/tests/frame/indexing/test_where.py @@ -162,7 +162,8 @@ def _check_set(df, cond, check_dtypes=True): econd = cond.reindex_like(df).fillna(True) expected = dfi.mask(~econd) - dfi.where(cond, np.nan, inplace=True) + return_value = dfi.where(cond, np.nan, inplace=True) + assert return_value is None tm.assert_frame_equal(dfi, expected) # dtypes (and confirm upcasts)x @@ -303,7 +304,8 @@ def test_where_bug(self): tm.assert_frame_equal(result, expected) result = df.copy() - result.where(result > 2, np.nan, inplace=True) + return_value = result.where(result > 2, np.nan, inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) def test_where_bug_mixed(self, sint_dtype): @@ -324,7 +326,8 @@ def test_where_bug_mixed(self, sint_dtype): tm.assert_frame_equal(result, expected) result = df.copy() - result.where(result > 2, np.nan, inplace=True) + return_value = result.where(result > 2, np.nan, inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) def test_where_bug_transposition(self): @@ -417,7 +420,8 @@ def create(): result = df.where(pd.notna(df), df.mean(), axis="columns") tm.assert_frame_equal(result, expected) - df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") + return_value = df.where(pd.notna(df), df.mean(), inplace=True, axis="columns") + assert return_value is None tm.assert_frame_equal(df, expected) df = create().fillna(0) @@ -453,7 +457,8 @@ def test_where_axis(self): tm.assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s, axis="index", inplace=True) + return_value = result.where(mask, s, axis="index", inplace=True) + assert return_value is None 
tm.assert_frame_equal(result, expected) expected = DataFrame([[0, 1], [0, 1]], dtype="float64") @@ -461,7 +466,8 @@ def test_where_axis(self): tm.assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s, axis="columns", inplace=True) + return_value = result.where(mask, s, axis="columns", inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) # Upcast needed @@ -474,7 +480,8 @@ def test_where_axis(self): tm.assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s, axis="index", inplace=True) + return_value = result.where(mask, s, axis="index", inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) expected = DataFrame([[0, np.nan], [0, np.nan]]) @@ -488,7 +495,8 @@ def test_where_axis(self): } ) result = df.copy() - result.where(mask, s, axis="columns", inplace=True) + return_value = result.where(mask, s, axis="columns", inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) # Multiple dtypes (=> multiple Blocks) @@ -511,7 +519,8 @@ def test_where_axis(self): tm.assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s1, axis="columns", inplace=True) + return_value = result.where(mask, s1, axis="columns", inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) result = df.where(mask, s2, axis="index") @@ -521,7 +530,8 @@ def test_where_axis(self): tm.assert_frame_equal(result, expected) result = df.copy() - result.where(mask, s2, axis="index", inplace=True) + return_value = result.where(mask, s2, axis="index", inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) # DataFrame vs DataFrame @@ -534,10 +544,12 @@ def test_where_axis(self): result = df.where(mask, d1, axis="index") tm.assert_frame_equal(result, expected) result = df.copy() - result.where(mask, d1, inplace=True) + return_value = result.where(mask, d1, inplace=True) + assert return_value is None 
tm.assert_frame_equal(result, expected) result = df.copy() - result.where(mask, d1, inplace=True, axis="index") + return_value = result.where(mask, d1, inplace=True, axis="index") + assert return_value is None tm.assert_frame_equal(result, expected) d2 = df.copy().drop(1, axis=1) @@ -549,10 +561,12 @@ def test_where_axis(self): result = df.where(mask, d2, axis="columns") tm.assert_frame_equal(result, expected) result = df.copy() - result.where(mask, d2, inplace=True) + return_value = result.where(mask, d2, inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) result = df.copy() - result.where(mask, d2, inplace=True, axis="columns") + return_value = result.where(mask, d2, inplace=True, axis="columns") + assert return_value is None tm.assert_frame_equal(result, expected) def test_where_callable(self): diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py index 34727da3b95ae..ca62b56664518 100644 --- a/pandas/tests/frame/methods/test_clip.py +++ b/pandas/tests/frame/methods/test_clip.py @@ -22,7 +22,8 @@ def test_inplace_clip(self, float_frame): median = float_frame.median().median() frame_copy = float_frame.copy() - frame_copy.clip(upper=median, lower=median, inplace=True) + return_value = frame_copy.clip(upper=median, lower=median, inplace=True) + assert return_value is None assert not (frame_copy.values != median).any() def test_dataframe_clip(self): diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 177d10cdbf615..aa44a2427dc8f 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -70,8 +70,10 @@ def test_drop_names(self): df_dropped_b = df.drop("b") df_dropped_e = df.drop("e", axis=1) df_inplace_b, df_inplace_e = df.copy(), df.copy() - df_inplace_b.drop("b", inplace=True) - df_inplace_e.drop("e", axis=1, inplace=True) + return_value = df_inplace_b.drop("b", inplace=True) + assert return_value is 
None + return_value = df_inplace_e.drop("e", axis=1, inplace=True) + assert return_value is None for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e): assert obj.index.name == "first" assert obj.columns.name == "second" @@ -148,7 +150,8 @@ def test_drop(self): # GH#5628 df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) expected = df[~(df.b > 0)] - df.drop(labels=df[df.b > 0].index, inplace=True) + return_value = df.drop(labels=df[df.b > 0].index, inplace=True) + assert return_value is None tm.assert_frame_equal(df, expected) def test_drop_multiindex_not_lexsorted(self): diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index facb116646573..ddb5723e7bd3e 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -246,11 +246,13 @@ def test_interp_inplace(self): df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) result = df.copy() - result["a"].interpolate(inplace=True) + return_value = result["a"].interpolate(inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) result = df.copy() - result["a"].interpolate(inplace=True, downcast="infer") + return_value = result["a"].interpolate(inplace=True, downcast="infer") + assert return_value is None tm.assert_frame_equal(result, expected.astype("int64")) def test_interp_inplace_row(self): @@ -259,7 +261,8 @@ def test_interp_inplace_row(self): {"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]} ) expected = result.interpolate(method="linear", axis=1, inplace=False) - result.interpolate(method="linear", axis=1, inplace=True) + return_value = result.interpolate(method="linear", axis=1, inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) def test_interp_ignore_all_good(self): @@ -297,7 +300,8 @@ def test_interp_time_inplace_axis(self, axis): expected = 
DataFrame(index=idx, columns=idx, data=data) result = expected.interpolate(axis=0, method="time") - expected.interpolate(axis=0, method="time", inplace=True) + return_value = expected.interpolate(axis=0, method="time", inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("axis_name, axis_number", [("index", 0), ("columns", 1)]) diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index ffad526d3f4d1..eb908e9472fe2 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -150,7 +150,8 @@ def test_rename_inplace(self, float_frame): c_id = id(float_frame["C"]) float_frame = float_frame.copy() - float_frame.rename(columns={"C": "foo"}, inplace=True) + return_value = float_frame.rename(columns={"C": "foo"}, inplace=True) + assert return_value is None assert "C" not in float_frame assert "foo" in float_frame diff --git a/pandas/tests/frame/methods/test_rename_axis.py b/pandas/tests/frame/methods/test_rename_axis.py index 9b964d842526c..3339119841813 100644 --- a/pandas/tests/frame/methods/test_rename_axis.py +++ b/pandas/tests/frame/methods/test_rename_axis.py @@ -10,14 +10,16 @@ def test_rename_axis_inplace(self, float_frame): # GH#15704 expected = float_frame.rename_axis("foo") result = float_frame.copy() - no_return = result.rename_axis("foo", inplace=True) + return_value = no_return = result.rename_axis("foo", inplace=True) + assert return_value is None assert no_return is None tm.assert_frame_equal(result, expected) expected = float_frame.rename_axis("bar", axis=1) result = float_frame.copy() - no_return = result.rename_axis("bar", axis=1, inplace=True) + return_value = no_return = result.rename_axis("bar", axis=1, inplace=True) + assert return_value is None assert no_return is None tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_replace.py 
b/pandas/tests/frame/methods/test_replace.py index 498f7f7790514..ea72a3d8fef4d 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -27,7 +27,8 @@ def test_replace_inplace(self, datetime_frame, float_string_frame): datetime_frame["A"][-5:] = np.nan tsframe = datetime_frame.copy() - tsframe.replace(np.nan, 0, inplace=True) + return_value = tsframe.replace(np.nan, 0, inplace=True) + assert return_value is None tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) # mixed type @@ -40,7 +41,8 @@ def test_replace_inplace(self, datetime_frame, float_string_frame): tm.assert_frame_equal(result, expected) tsframe = datetime_frame.copy() - tsframe.replace([np.nan], [0], inplace=True) + return_value = tsframe.replace([np.nan], [0], inplace=True) + assert return_value is None tm.assert_frame_equal(tsframe, datetime_frame.fillna(0)) def test_regex_replace_scalar(self, mix_ab): @@ -117,18 +119,21 @@ def test_regex_replace_scalar_inplace(self, mix_ab): # regex -> value # obj frame res = dfobj.copy() - res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + return_value = res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + assert return_value is None tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + return_value = res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True) + assert return_value is None tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) + return_value = res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) + assert return_value is None objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) @@ -136,7 +141,8 @@ def test_regex_replace_scalar_inplace(self, mix_ab): # with mixed res = dfmix.copy() - res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) + 
return_value = res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True) + assert return_value is None mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) @@ -144,18 +150,27 @@ def test_regex_replace_scalar_inplace(self, mix_ab): # everything with compiled regexs as well res = dfobj.copy() - res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True) + return_value = res.replace( + re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True + ) + assert return_value is None tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True) + return_value = res.replace( + re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True + ) + assert return_value is None tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True) + return_value = res.replace( + re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True + ) + assert return_value is None objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) @@ -163,25 +178,31 @@ def test_regex_replace_scalar_inplace(self, mix_ab): # with mixed res = dfmix.copy() - res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True) + return_value = res.replace( + re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True + ) + assert return_value is None mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) tm.assert_frame_equal(res, expec) res = dfobj.copy() - res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) + return_value = res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) + assert return_value is None tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) + return_value = res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True) 
+ assert return_value is None tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) + return_value = res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) + assert return_value is None objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) @@ -189,7 +210,8 @@ def test_regex_replace_scalar_inplace(self, mix_ab): # with mixed res = dfmix.copy() - res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) + return_value = res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True) + assert return_value is None mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) @@ -197,18 +219,27 @@ def test_regex_replace_scalar_inplace(self, mix_ab): # everything with compiled regexs as well res = dfobj.copy() - res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True) + return_value = res.replace( + regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True + ) + assert return_value is None tm.assert_frame_equal(dfobj, res.fillna(".")) # mixed res = dfmix.copy() - res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True) + return_value = res.replace( + regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True + ) + assert return_value is None tm.assert_frame_equal(dfmix, res.fillna(".")) # regex -> regex # obj frame res = dfobj.copy() - res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True) + return_value = res.replace( + regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True + ) + assert return_value is None objc = obj.copy() objc["a"] = ["a", "b", "...", "..."] expec = DataFrame(objc) @@ -216,7 +247,10 @@ def test_regex_replace_scalar_inplace(self, mix_ab): # with mixed res = dfmix.copy() - res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True) + return_value = res.replace( + regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", 
inplace=True + ) + assert return_value is None mixc = mix_ab.copy() mixc["b"] = ["a", "b", "...", "..."] expec = DataFrame(mixc) @@ -290,7 +324,8 @@ def test_regex_replace_list_obj_inplace(self): to_replace_res = [r"\s*\.\s*", r"e|f|g"] values = [np.nan, "crap"] res = dfobj.copy() - res.replace(to_replace_res, values, inplace=True, regex=True) + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None expec = DataFrame( { "a": ["a", "b", np.nan, np.nan], @@ -304,7 +339,8 @@ def test_regex_replace_list_obj_inplace(self): to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"] values = [r"\1\1", r"\1_crap"] res = dfobj.copy() - res.replace(to_replace_res, values, inplace=True, regex=True) + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None expec = DataFrame( { "a": ["a", "b", "..", ".."], @@ -319,7 +355,8 @@ def test_regex_replace_list_obj_inplace(self): to_replace_res = [r"\s*(\.)\s*", r"e"] values = [r"\1\1", r"crap"] res = dfobj.copy() - res.replace(to_replace_res, values, inplace=True, regex=True) + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None expec = DataFrame( { "a": ["a", "b", "..", ".."], @@ -332,7 +369,8 @@ def test_regex_replace_list_obj_inplace(self): to_replace_res = [r"\s*(\.)\s*", r"e"] values = [r"\1\1", r"crap"] res = dfobj.copy() - res.replace(value=values, regex=to_replace_res, inplace=True) + return_value = res.replace(value=values, regex=to_replace_res, inplace=True) + assert return_value is None expec = DataFrame( { "a": ["a", "b", "..", ".."], @@ -391,7 +429,8 @@ def test_regex_replace_list_mixed_inplace(self, mix_ab): to_replace_res = [r"\s*\.\s*", r"a"] values = [np.nan, "crap"] res = dfmix.copy() - res.replace(to_replace_res, values, inplace=True, regex=True) + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None expec = 
DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]}) tm.assert_frame_equal(res, expec) @@ -399,7 +438,8 @@ def test_regex_replace_list_mixed_inplace(self, mix_ab): to_replace_res = [r"\s*(\.)\s*", r"(a|b)"] values = [r"\1\1", r"\1_crap"] res = dfmix.copy() - res.replace(to_replace_res, values, inplace=True, regex=True) + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]}) tm.assert_frame_equal(res, expec) @@ -408,14 +448,16 @@ def test_regex_replace_list_mixed_inplace(self, mix_ab): to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.copy() - res.replace(to_replace_res, values, inplace=True, regex=True) + return_value = res.replace(to_replace_res, values, inplace=True, regex=True) + assert return_value is None expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) tm.assert_frame_equal(res, expec) to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"] values = [r"\1\1", r"crap", r"\1_crap"] res = dfmix.copy() - res.replace(regex=to_replace_res, value=values, inplace=True) + return_value = res.replace(regex=to_replace_res, value=values, inplace=True) + assert return_value is None expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]}) tm.assert_frame_equal(res, expec) @@ -430,7 +472,10 @@ def test_regex_replace_dict_mixed(self, mix_abc): # frame res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) res2 = dfmix.copy() - res2.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True) + return_value = res2.replace( + {"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True + ) + assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} ) @@ -441,7 +486,10 @@ def test_regex_replace_dict_mixed(self, mix_abc): # whole frame res = dfmix.replace({"b": r"\s*(\.)\s*"}, 
{"b": r"\1ty"}, regex=True) res2 = dfmix.copy() - res2.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True) + return_value = res2.replace( + {"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True + ) + assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} ) @@ -450,7 +498,10 @@ def test_regex_replace_dict_mixed(self, mix_abc): res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}) res2 = dfmix.copy() - res2.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True) + return_value = res2.replace( + regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True + ) + assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]} ) @@ -464,13 +515,15 @@ def test_regex_replace_dict_mixed(self, mix_abc): ) res = dfmix.replace("a", {"b": np.nan}, regex=True) res2 = dfmix.copy() - res2.replace("a", {"b": np.nan}, regex=True, inplace=True) + return_value = res2.replace("a", {"b": np.nan}, regex=True, inplace=True) + assert return_value is None tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) res = dfmix.replace("a", {"b": np.nan}, regex=True) res2 = dfmix.copy() - res2.replace(regex="a", value={"b": np.nan}, inplace=True) + return_value = res2.replace(regex="a", value={"b": np.nan}, inplace=True) + assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]} ) @@ -483,9 +536,13 @@ def test_regex_replace_dict_nested(self, mix_abc): res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True) res2 = dfmix.copy() res4 = dfmix.copy() - res2.replace({"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True) + return_value = res2.replace( + {"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True + ) + assert return_value is None res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}}) - res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, 
inplace=True) + return_value = res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True) + assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} ) @@ -519,8 +576,14 @@ def test_regex_replace_list_to_scalar(self, mix_abc): res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True) res2 = df.copy() res3 = df.copy() - res2.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True) - res3.replace(regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True) + return_value = res2.replace( + [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True + ) + assert return_value is None + return_value = res3.replace( + regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True + ) + assert return_value is None tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) tm.assert_frame_equal(res3, expec) @@ -530,9 +593,11 @@ def test_regex_replace_str_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace(r"\s*\.\s*", 0, regex=True) res2 = df.copy() - res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True) + return_value = res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True) + assert return_value is None res3 = df.copy() - res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) + return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True) + assert return_value is None expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]}) tm.assert_frame_equal(res, expec) tm.assert_frame_equal(res2, expec) @@ -542,9 +607,11 @@ def test_regex_replace_regex_list_to_numeric(self, mix_abc): df = DataFrame(mix_abc) res = df.replace([r"\s*\.\s*", "b"], 0, regex=True) res2 = df.copy() - res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True) + return_value = res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True) + assert return_value is None res3 = df.copy() - res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True) + return_value = res3.replace(regex=[r"\s*\.\s*", 
"b"], value=0, inplace=True) + assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]} ) @@ -558,9 +625,11 @@ def test_regex_replace_series_of_regexes(self, mix_abc): s2 = Series({"b": np.nan}) res = df.replace(s1, s2, regex=True) res2 = df.copy() - res2.replace(s1, s2, inplace=True, regex=True) + return_value = res2.replace(s1, s2, inplace=True, regex=True) + assert return_value is None res3 = df.copy() - res3.replace(regex=s1, value=s2, inplace=True) + return_value = res3.replace(regex=s1, value=s2, inplace=True) + assert return_value is None expec = DataFrame( {"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]} ) @@ -714,7 +783,8 @@ def test_replace_mixed(self, float_string_frame): result = df.replace(0, 0.5) tm.assert_frame_equal(result, expected) - df.replace(0, 0.5, inplace=True) + return_value = df.replace(0, 0.5, inplace=True) + assert return_value is None tm.assert_frame_equal(df, expected) # int block splitting @@ -942,7 +1012,8 @@ def test_replace_input_formats_listlike(self): result = df.replace(to_rep, values) expected = df.copy() for i in range(len(to_rep)): - expected.replace(to_rep[i], values[i], inplace=True) + return_value = expected.replace(to_rep[i], values[i], inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) msg = r"Replacement lists must match in length\. 
Expecting 3 got 2" @@ -969,7 +1040,8 @@ def test_replace_input_formats_scalar(self): result = df.replace(to_rep, -1) expected = df.copy() for i in range(len(to_rep)): - expected.replace(to_rep[i], -1, inplace=True) + return_value = expected.replace(to_rep[i], -1, inplace=True) + assert return_value is None tm.assert_frame_equal(result, expected) def test_replace_limit(self): @@ -1321,7 +1393,8 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): with pytest.raises(AssertionError, match=msg): # ensure non-inplace call does not affect original tm.assert_frame_equal(df, expected) - df.replace(replace_dict, 3, inplace=True) + return_value = df.replace(replace_dict, 3, inplace=True) + assert return_value is None tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index cf0bbe144caa5..da4bfa9be4881 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -119,7 +119,8 @@ def test_reset_index(self, float_frame): # test resetting in place df = float_frame.copy() resetted = float_frame.reset_index() - df.reset_index(inplace=True) + return_value = df.reset_index(inplace=True) + assert return_value is None tm.assert_frame_equal(df, resetted, check_names=False) df = float_frame.reset_index().set_index(["index", "A", "B"]) @@ -137,7 +138,8 @@ def test_reset_index_name(self): ) assert df.reset_index().index.name is None assert df.reset_index(drop=True).index.name is None - df.reset_index(inplace=True) + return_value = df.reset_index(inplace=True) + assert return_value is None assert df.index.name is None def test_reset_index_level(self): diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 5f62697cc3e43..ebe7eabd53b46 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py 
@@ -137,7 +137,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys): if inplace: result = df.copy() - result.set_index(keys, drop=drop, inplace=True) + return_value = result.set_index(keys, drop=drop, inplace=True) + assert return_value is None else: result = df.set_index(keys, drop=drop) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 543d87485d3c4..5216c3be116e0 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -218,25 +218,29 @@ def test_sort_index_inplace(self): unordered = frame.loc[[3, 2, 4, 1]] a_id = id(unordered["A"]) df = unordered.copy() - df.sort_index(inplace=True) + return_value = df.sort_index(inplace=True) + assert return_value is None expected = frame tm.assert_frame_equal(df, expected) assert a_id != id(df["A"]) df = unordered.copy() - df.sort_index(ascending=False, inplace=True) + return_value = df.sort_index(ascending=False, inplace=True) + assert return_value is None expected = frame[::-1] tm.assert_frame_equal(df, expected) # axis=1 unordered = frame.loc[:, ["D", "B", "C", "A"]] df = unordered.copy() - df.sort_index(axis=1, inplace=True) + return_value = df.sort_index(axis=1, inplace=True) + assert return_value is None expected = frame tm.assert_frame_equal(df, expected) df = unordered.copy() - df.sort_index(axis=1, ascending=False, inplace=True) + return_value = df.sort_index(axis=1, ascending=False, inplace=True) + assert return_value is None expected = frame.iloc[:, ::-1] tm.assert_frame_equal(df, expected) @@ -589,7 +593,8 @@ def test_sort_index_level2(self): # inplace rs = frame.copy() - rs.sort_index(level=0, inplace=True) + return_value = rs.sort_index(level=0, inplace=True) + assert return_value is None tm.assert_frame_equal(rs, frame.sort_index(level=0)) def test_sort_index_level_large_cardinality(self): diff --git a/pandas/tests/frame/methods/test_sort_values.py 
b/pandas/tests/frame/methods/test_sort_values.py index 1275da01eace9..c60e7e3b1bdb6 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -77,22 +77,28 @@ def test_sort_values_inplace(self): ) sorted_df = frame.copy() - sorted_df.sort_values(by="A", inplace=True) + return_value = sorted_df.sort_values(by="A", inplace=True) + assert return_value is None expected = frame.sort_values(by="A") tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() - sorted_df.sort_values(by=1, axis=1, inplace=True) + return_value = sorted_df.sort_values(by=1, axis=1, inplace=True) + assert return_value is None expected = frame.sort_values(by=1, axis=1) tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() - sorted_df.sort_values(by="A", ascending=False, inplace=True) + return_value = sorted_df.sort_values(by="A", ascending=False, inplace=True) + assert return_value is None expected = frame.sort_values(by="A", ascending=False) tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() - sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True) + return_value = sorted_df.sort_values( + by=["A", "B"], ascending=False, inplace=True + ) + assert return_value is None expected = frame.sort_values(by=["A", "B"], ascending=False) tm.assert_frame_equal(sorted_df, expected) @@ -544,17 +550,24 @@ def test_sort_values_inplace_key(self, sort_by_key): ) sorted_df = frame.copy() - sorted_df.sort_values(by="A", inplace=True, key=sort_by_key) + return_value = sorted_df.sort_values(by="A", inplace=True, key=sort_by_key) + assert return_value is None expected = frame.sort_values(by="A", key=sort_by_key) tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() - sorted_df.sort_values(by=1, axis=1, inplace=True, key=sort_by_key) + return_value = sorted_df.sort_values( + by=1, axis=1, inplace=True, key=sort_by_key + ) + assert return_value is None expected = frame.sort_values(by=1, axis=1, 
key=sort_by_key) tm.assert_frame_equal(sorted_df, expected) sorted_df = frame.copy() - sorted_df.sort_values(by="A", ascending=False, inplace=True, key=sort_by_key) + return_value = sorted_df.sort_values( + by="A", ascending=False, inplace=True, key=sort_by_key + ) + assert return_value is None expected = frame.sort_values(by="A", ascending=False, key=sort_by_key) tm.assert_frame_equal(sorted_df, expected) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index d5554860c034d..c9fec3215d57f 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -64,7 +64,8 @@ def test_consolidate(self, float_frame): float_frame["F"] = 8.0 assert len(float_frame._mgr.blocks) == 3 - float_frame._consolidate(inplace=True) + return_value = float_frame._consolidate(inplace=True) + assert return_value is None assert len(float_frame._mgr.blocks) == 1 def test_consolidate_inplace(self, float_frame): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 98a2a33822e3b..628b955a1de92 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -413,7 +413,8 @@ def test_date_index_query(self): df = DataFrame(np.random.randn(n, 3)) df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) - df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) + assert return_value is None res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] tm.assert_frame_equal(res, expec) @@ -425,7 +426,8 @@ def test_date_index_query_with_NaT(self): df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT - df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", 
inplace=True, drop=True) + assert return_value is None res = df.query("index < 20130101 < dates3", engine=engine, parser=parser) expec = df[(df.index < "20130101") & ("20130101" < df.dates3)] tm.assert_frame_equal(res, expec) @@ -438,7 +440,8 @@ def test_date_index_query_with_NaT_duplicates(self): d["dates3"] = date_range("1/1/2014", periods=n) df = DataFrame(d) df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT - df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) + assert return_value is None res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser) expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)] tm.assert_frame_equal(res, expec) @@ -759,7 +762,8 @@ def test_date_index_query(self): df = DataFrame(np.random.randn(n, 3)) df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) - df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) + assert return_value is None res = df.query( "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser ) @@ -773,7 +777,8 @@ def test_date_index_query_with_NaT(self): df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) df.iloc[0, 0] = pd.NaT - df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) + assert return_value is None res = df.query( "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser ) @@ -787,7 +792,8 @@ def test_date_index_query_with_NaT_duplicates(self): df["dates1"] = date_range("1/1/2012", periods=n) df["dates3"] = date_range("1/1/2014", periods=n) df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT - df.set_index("dates1", inplace=True, drop=True) + return_value = df.set_index("dates1", inplace=True, drop=True) + assert return_value is None msg = r"'BoolOp' nodes are not implemented" 
with pytest.raises(NotImplementedError, match=msg): df.query("index < 20130101 < dates3", engine=engine, parser=parser) diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 1634baacf6d6e..6a8f1e7c1aca2 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -473,7 +473,8 @@ def test_stack_ints(self): ) df_named = df.copy() - df_named.columns.set_names(range(3), inplace=True) + return_value = df_named.columns.set_names(range(3), inplace=True) + assert return_value is None tm.assert_frame_equal( df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1) diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py index 2b7b3af8f4705..db7347bb863a5 100644 --- a/pandas/tests/frame/test_to_csv.py +++ b/pandas/tests/frame/test_to_csv.py @@ -570,7 +570,8 @@ def test_to_csv_headers(self): from_df.to_csv(path, index=False, header=["X", "Y"]) recons = self.read_csv(path) - recons.reset_index(inplace=True) + return_value = recons.reset_index(inplace=True) + assert return_value is None tm.assert_frame_equal(to_df, recons) def test_to_csv_multiindex(self, float_frame, datetime_frame): From 7e117d4b4082f262afc9fcd75ced504b52e5210b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Jul 2020 05:35:00 -0700 Subject: [PATCH 0342/1025] CLN: annotate (#35242) --- pandas/_libs/hashing.pyx | 2 +- pandas/_libs/tslib.pyx | 21 +++++++++++---------- pandas/_libs/tslibs/strptime.pyx | 24 ++++++++++++------------ 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 2d859db22ea23..a98820ca57895 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -15,7 +15,7 @@ DEF dROUNDS = 4 @cython.boundscheck(False) -def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): +def hash_object_array(ndarray[object] arr, str key, str encoding="utf8"): """ Parameters ---------- diff --git 
a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index d70d0378a2621..35d5cd8f1e275 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -5,6 +5,7 @@ from cpython.datetime cimport ( PyDateTime_Check, PyDateTime_IMPORT, datetime, + tzinfo, ) # import datetime C API PyDateTime_IMPORT @@ -93,8 +94,8 @@ def _test_parse_iso8601(ts: str): @cython.boundscheck(False) def format_array_from_datetime( ndarray[int64_t] values, - object tz=None, - object format=None, + tzinfo tz=None, + str format=None, object na_rep=None ): """ @@ -103,8 +104,8 @@ def format_array_from_datetime( Parameters ---------- values : a 1-d i8 array - tz : the timezone (or None) - format : optional, default is None + tz : tzinfo or None, default None + format : str or None, default None a strftime capable string na_rep : optional, default is None a nat format @@ -360,7 +361,7 @@ cpdef array_to_datetime( str errors='raise', bint dayfirst=False, bint yearfirst=False, - object utc=None, + bint utc=False, bint require_iso8601=False ): """ @@ -386,7 +387,7 @@ cpdef array_to_datetime( dayfirst parsing behavior when encountering datetime strings yearfirst : bool, default False yearfirst parsing behavior when encountering datetime strings - utc : bool, default None + utc : bool, default False indicator whether the dates should be UTC require_iso8601 : bool, default False indicator whether the datetime string should be iso8601 @@ -412,7 +413,7 @@ cpdef array_to_datetime( bint is_same_offsets _TSObject _ts int64_t value - int out_local=0, out_tzoffset=0 + int out_local = 0, out_tzoffset = 0 float offset_seconds, tz_offset set out_tzoffset_vals = set() bint string_to_dts_failed @@ -659,7 +660,7 @@ cdef array_to_datetime_object( ndarray[object] values, str errors, bint dayfirst=False, - bint yearfirst=False + bint yearfirst=False, ): """ Fall back function for array_to_datetime @@ -671,7 +672,7 @@ cdef array_to_datetime_object( ---------- values : ndarray of object date-like objects to convert - 
errors : str, default 'raise' + errors : str error behavior when parsing dayfirst : bool, default False dayfirst parsing behavior when encountering datetime strings @@ -684,7 +685,7 @@ cdef array_to_datetime_object( """ cdef: Py_ssize_t i, n = len(values) - object val, + object val bint is_ignore = errors == 'ignore' bint is_coerce = errors == 'coerce' bint is_raise = errors == 'raise' diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 884578df3e00b..660b582f73e6e 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -5,7 +5,7 @@ import locale import calendar import re -from cpython cimport datetime +from cpython.datetime cimport date, tzinfo from _thread import allocate_lock as _thread_allocate_lock @@ -291,20 +291,20 @@ def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise' elif iso_year != -1 and iso_week != -1: year, julian = _calc_julian_from_V(iso_year, iso_week, weekday + 1) - # Cannot pre-calculate datetime.date() since can change in Julian + # Cannot pre-calculate date() since can change in Julian # calculation and thus could have different value for the day of the wk # calculation. try: if julian == -1: # Need to add 1 to result since first day of the year is 1, not # 0. - ordinal = datetime.date(year, month, day).toordinal() - julian = ordinal - datetime.date(year, 1, 1).toordinal() + 1 + ordinal = date(year, month, day).toordinal() + julian = ordinal - date(year, 1, 1).toordinal() + 1 else: # Assume that if they bothered to include Julian day it will # be accurate. 
- datetime_result = datetime.date.fromordinal( - (julian - 1) + datetime.date(year, 1, 1).toordinal()) + datetime_result = date.fromordinal( + (julian - 1) + date(year, 1, 1).toordinal()) year = datetime_result.year month = datetime_result.month day = datetime_result.day @@ -314,7 +314,7 @@ def array_strptime(object[:] values, object fmt, bint exact=True, errors='raise' continue raise if weekday == -1: - weekday = datetime.date(year, month, day).weekday() + weekday = date(year, month, day).weekday() dts.year = year dts.month = month @@ -652,7 +652,7 @@ cdef int _calc_julian_from_U_or_W(int year, int week_of_year, cdef: int first_weekday, week_0_length, days_to_week - first_weekday = datetime.date(year, 1, 1).weekday() + first_weekday = date(year, 1, 1).weekday() # If we are dealing with the %U directive (week starts on Sunday), it's # easier to just shift the view to Sunday being the first day of the # week. @@ -695,18 +695,18 @@ cdef (int, int) _calc_julian_from_V(int iso_year, int iso_week, int iso_weekday) cdef: int correction, ordinal - correction = datetime.date(iso_year, 1, 4).isoweekday() + 3 + correction = date(iso_year, 1, 4).isoweekday() + 3 ordinal = (iso_week * 7) + iso_weekday - correction # ordinal may be negative or 0 now, which means the date is in the previous # calendar year if ordinal < 1: - ordinal += datetime.date(iso_year, 1, 1).toordinal() + ordinal += date(iso_year, 1, 1).toordinal() iso_year -= 1 - ordinal -= datetime.date(iso_year, 1, 1).toordinal() + ordinal -= date(iso_year, 1, 1).toordinal() return iso_year, ordinal -cdef parse_timezone_directive(str z): +cdef tzinfo parse_timezone_directive(str z): """ Parse the '%z' directive and return a pytz.FixedOffset From ad7e7a6ff47ffb4a9040b68c7970eb14c081f5cd Mon Sep 17 00:00:00 2001 From: SurajH1 <53505344+SurajH1@users.noreply.github.com> Date: Mon, 13 Jul 2020 07:55:11 -0500 Subject: [PATCH 0343/1025] TST: concat(..., copy=False) with datetime tz-aware data raises ValueError (#33458) 
Co-authored-by: Simon Hawkins --- pandas/tests/dtypes/test_concat.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 1fbbd3356ae13..5a9ad732792ea 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -88,3 +88,14 @@ def test_concat_mismatched_categoricals_with_empty(): result = _concat.concat_compat([ser1._values, ser2._values]) expected = pd.concat([ser1, ser2])._values tm.assert_categorical_equal(result, expected) + + +@pytest.mark.parametrize("copy", [True, False]) +def test_concat_single_dataframe_tz_aware(copy): + # https://github.com/pandas-dev/pandas/issues/25257 + df = pd.DataFrame( + {"timestamp": [pd.Timestamp("2020-04-08 09:00:00.709949+0000", tz="UTC")]} + ) + expected = df.copy() + result = pd.concat([df], copy=copy) + tm.assert_frame_equal(result, expected) From 5454d097e216002f12ea72e8f4235a18cc4153af Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 13 Jul 2020 10:26:59 -0500 Subject: [PATCH 0344/1025] CI: Ignore setuptools distutils warning (#35260) --- ci/deps/azure-36-32bit.yaml | 2 +- ci/deps/azure-36-locale.yaml | 2 +- ci/deps/azure-36-locale_slow.yaml | 2 +- ci/deps/azure-36-minimum_versions.yaml | 2 +- ci/deps/azure-36-slow.yaml | 2 +- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-37-numpydev.yaml | 2 +- ci/deps/azure-macos-36.yaml | 2 +- ci/deps/azure-windows-36.yaml | 2 +- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/travis-36-cov.yaml | 2 +- ci/deps/travis-36-locale.yaml | 2 +- ci/deps/travis-37-arm64.yaml | 2 +- ci/deps/travis-37.yaml | 2 +- ci/deps/travis-38.yaml | 2 +- environment.yml | 2 +- pandas/tests/util/test_show_versions.py | 4 ++++ requirements-dev.txt | 2 +- 18 files changed, 21 insertions(+), 17 deletions(-) diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml index 15704cf0d5427..2dc53f8181ac4 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-36-32bit.yaml 
@@ -23,4 +23,4 @@ dependencies: - pip - pip: - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index a9b9a5a47ccf5..d31015fde4741 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index c086b3651afc3..23121b985492e 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index f5af7bcf36189..9f66f82720b5b 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython=0.29.16 - - pytest=5.0.1 + - pytest>=5.0.1, <6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-36-slow.yaml b/ci/deps/azure-36-slow.yaml index 87bad59fa4873..0a6d1d13c8549 100644 --- a/ci/deps/azure-36-slow.yaml +++ b/ci/deps/azure-36-slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 81e336cf1ed7f..714e1100b1e1a 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 5cb58756a6ac1..451fb5884a4af 
100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.7.* # tools - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index eeea249a19ca1..81a27465f9e61 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.6.* # tools - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 548660cabaa67..4d7e1d821037b 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 5bbd0e2795d7e..34fca631df6c1 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 177e0d3f4c0af..5f5ea8034cddf 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-cov # this is only needed in the coverage build diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 03a1e751b6a86..6bc4aba733ee5 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - 
hypothesis>=3.58.0 diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index 5cb53489be225..f434a03609b26 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.13 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index e896233aac63c..aaf706d61fe5c 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml index b879c0f81dab2..ac39a223cd086 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/travis-38.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/environment.yml b/environment.yml index 80dbffebf6b9d..32ff8c91cb69c 100644 --- a/environment.yml +++ b/environment.yml @@ -51,7 +51,7 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0rc0 - pytest-cov - pytest-xdist>=1.21 - pytest-asyncio diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index e36ea662fac8b..04e841c05e44a 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -21,6 +21,10 @@ # pandas_datareader "ignore:pandas.util.testing is deprecated:FutureWarning" ) +@pytest.mark.filterwarnings( + # https://github.com/pandas-dev/pandas/issues/35252 + "ignore:Distutils:UserWarning" +) def test_show_versions(capsys): # gh-32041 pd.show_versions() diff --git a/requirements-dev.txt b/requirements-dev.txt index 886f400caf44f..3cda38d4b72f5 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -32,7 +32,7 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto 
-pytest>=5.0.1 +pytest>=5.0.1,<6.0.0rc0 pytest-cov pytest-xdist>=1.21 pytest-asyncio From 332944033466ae71b0d480bda159f2fdc1412135 Mon Sep 17 00:00:00 2001 From: avinashpancham <44933366+avinashpancham@users.noreply.github.com> Date: Mon, 13 Jul 2020 18:36:11 +0200 Subject: [PATCH 0345/1025] TST: GH20676 Verify equals operator for list of Numpy arrays (#35237) --- pandas/tests/generic/test_generic.py | 37 +-------------- pandas/tests/series/methods/test_equals.py | 55 ++++++++++++++++++++++ 2 files changed, 56 insertions(+), 36 deletions(-) create mode 100644 pandas/tests/series/methods/test_equals.py diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 94747a52136c4..5e66925a38ec6 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -8,7 +8,7 @@ from pandas.core.dtypes.common import is_scalar import pandas as pd -from pandas import DataFrame, MultiIndex, Series, date_range +from pandas import DataFrame, Series, date_range import pandas._testing as tm import pandas.core.common as com @@ -785,26 +785,6 @@ def test_depr_take_kwarg_is_copy(self, is_copy): s.take([0, 1], is_copy=is_copy) def test_equals(self): - s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) - s2 = s1.copy() - assert s1.equals(s2) - - s1[1] = 99 - assert not s1.equals(s2) - - # NaNs compare as equal - s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3]) - s2 = s1.copy() - assert s1.equals(s2) - - s2[0] = 9.9 - assert not s1.equals(s2) - - idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")]) - s1 = Series([1, 2, np.nan], index=idx) - s2 = s1.copy() - assert s1.equals(s2) - # Add object dtype column with nans index = np.random.random(10) df1 = DataFrame(np.random.random(10), index=index, columns=["floats"]) @@ -857,21 +837,6 @@ def test_equals(self): df2 = df1.set_index(["floats"], append=True) assert df3.equals(df2) - # GH 8437 - a = pd.Series([False, np.nan]) - b = pd.Series([False, np.nan]) - c = 
pd.Series(index=range(2), dtype=object) - d = c.copy() - e = c.copy() - f = c.copy() - c[:-1] = d[:-1] = e[0] = f[0] = False - assert a.equals(a) - assert a.equals(b) - assert a.equals(c) - assert a.equals(d) - assert a.equals(e) - assert e.equals(f) - def test_pipe(self): df = DataFrame({"A": [1, 2, 3]}) f = lambda x, y: x ** y diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py new file mode 100644 index 0000000000000..600154adfcda3 --- /dev/null +++ b/pandas/tests/series/methods/test_equals.py @@ -0,0 +1,55 @@ +import numpy as np +import pytest + +from pandas import MultiIndex, Series + + +@pytest.mark.parametrize( + "arr, idx", + [ + ([1, 2, 3, 4], [0, 2, 1, 3]), + ([1, np.nan, 3, np.nan], [0, 2, 1, 3]), + ( + [1, np.nan, 3, np.nan], + MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c"), (3, "c")]), + ), + ], +) +def test_equals(arr, idx): + s1 = Series(arr, index=idx) + s2 = s1.copy() + assert s1.equals(s2) + + s1[1] = 9 + assert not s1.equals(s2) + + +def test_equals_list_array(): + # GH20676 Verify equals operator for list of Numpy arrays + arr = np.array([1, 2]) + s1 = Series([arr, arr]) + s2 = s1.copy() + assert s1.equals(s2) + + # TODO: Series equals should also work between single value and list + # s1[1] = 9 + # assert not s1.equals(s2) + + +def test_equals_false_negative(): + # GH8437 Verify false negative behavior of equals function for dtype object + arr = [False, np.nan] + s1 = Series(arr) + s2 = s1.copy() + s3 = Series(index=range(2), dtype=object) + s4 = s3.copy() + s5 = s3.copy() + s6 = s3.copy() + + s3[:-1] = s4[:-1] = s5[0] = s6[0] = False + assert s1.equals(s1) + assert s1.equals(s2) + assert s1.equals(s3) + assert s1.equals(s4) + assert s1.equals(s5) + assert s5.equals(s6) From 0dd400464d2c778dec0215904c1c549eb504a277 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 13 Jul 2020 09:42:46 -0700 Subject: [PATCH 0346/1025] REF: make tz_convert match pattern eslewhere (#35255) --- 
pandas/_libs/tslibs/tzconversion.pyx | 115 +++++++++++---------------- 1 file changed, 45 insertions(+), 70 deletions(-) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index a6afd47d93479..606639af16a18 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -382,7 +382,10 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): converted: int64 """ cdef: - int64_t arr[1] + int64_t delta + int64_t[:] deltas + ndarray[int64_t, ndim=1] trans + intp_t pos if val == NPY_NAT: return val @@ -391,9 +394,14 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): return val elif is_tzlocal(tz): return _tz_convert_tzlocal_utc(val, tz, to_utc=False) + elif is_fixed_offset(tz): + _, deltas, _ = get_dst_info(tz) + delta = deltas[0] + return val + delta else: - arr[0] = val - return _tz_convert_dst(arr, tz)[0] + trans, deltas, _ = get_dst_info(tz) + pos = trans.searchsorted(val, side="right") - 1 + return val + deltas[pos] def tz_convert_from_utc(int64_t[:] vals, tzinfo tz): @@ -435,9 +443,12 @@ cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): converted : ndarray[int64_t] """ cdef: - int64_t[:] converted + int64_t[:] converted, deltas Py_ssize_t i, n = len(vals) - int64_t val + int64_t val, delta + intp_t[:] pos + ndarray[int64_t] trans + str typ if is_utc(tz): converted = vals @@ -450,7 +461,35 @@ cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): else: converted[i] = _tz_convert_tzlocal_utc(val, tz, to_utc=False) else: - converted = _tz_convert_dst(vals, tz) + converted = np.empty(n, dtype=np.int64) + + trans, deltas, typ = get_dst_info(tz) + + if typ not in ["pytz", "dateutil"]: + # FixedOffset, we know len(deltas) == 1 + delta = deltas[0] + + for i in range(n): + val = vals[i] + if val == NPY_NAT: + converted[i] = val + else: + converted[i] = val + delta + + else: + pos = trans.searchsorted(vals, side="right") - 1 + + for i in range(n): + 
val = vals[i] + if val == NPY_NAT: + converted[i] = val + else: + if pos[i] < 0: + # TODO: How is this reached? Should we be checking for + # it elsewhere? + raise ValueError("First time before start of DST info") + + converted[i] = val + deltas[pos[i]] return converted @@ -537,67 +576,3 @@ cdef int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, bint to_utc=True, return val - delta else: return val + delta - - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef int64_t[:] _tz_convert_dst(const int64_t[:] values, tzinfo tz): - """ - tz_convert for non-UTC non-tzlocal cases where we have to check - DST transitions pointwise. - - Parameters - ---------- - values : ndarray[int64_t] - tz : tzinfo - - Returns - ------- - result : ndarray[int64_t] - """ - cdef: - Py_ssize_t n = len(values) - Py_ssize_t i - intp_t[:] pos - int64_t[:] result = np.empty(n, dtype=np.int64) - ndarray[int64_t] trans - int64_t[:] deltas - int64_t v, delta - str typ - - # tz is assumed _not_ to be tzlocal; that should go - # through _tz_convert_tzlocal_utc - - trans, deltas, typ = get_dst_info(tz) - - if typ not in ["pytz", "dateutil"]: - # FixedOffset, we know len(deltas) == 1 - delta = deltas[0] - - for i in range(n): - v = values[i] - if v == NPY_NAT: - result[i] = v - else: - result[i] = v + delta - - else: - # Previously, this search was done pointwise to try and benefit - # from getting to skip searches for iNaTs. However, it seems call - # overhead dominates the search time so doing it once in bulk - # is substantially faster (GH#24603) - pos = trans.searchsorted(values, side="right") - 1 - - for i in range(n): - v = values[i] - if v == NPY_NAT: - result[i] = v - else: - if pos[i] < 0: - # TODO: How is this reached? Should we be checking for - # it elsewhere? 
- raise ValueError("First time before start of DST info") - - result[i] = v + deltas[pos[i]] - - return result From 2af28189f72b4577c10aab05e6f7c0e61a82362d Mon Sep 17 00:00:00 2001 From: Robin to Roxel <35864265+r-toroxel@users.noreply.github.com> Date: Mon, 13 Jul 2020 22:36:04 +0200 Subject: [PATCH 0347/1025] TST verify return none inplace in tests/indexing (#35230) --- pandas/tests/frame/test_constructors.py | 6 ++++-- pandas/tests/indexing/multiindex/test_indexing_slow.py | 9 ++++++--- pandas/tests/indexing/multiindex/test_ix.py | 3 ++- pandas/tests/indexing/multiindex/test_sorted.py | 6 ++++-- pandas/tests/indexing/multiindex/test_xs.py | 6 ++++-- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index bfff58d05007f..17ac2307b9da6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1208,8 +1208,10 @@ def test_constructor_single_row(self): data = [OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]])] result = DataFrame(data) - expected = DataFrame.from_dict(dict(zip([0], data)), orient="index") - tm.assert_frame_equal(result, expected.reindex(result.index)) + expected = DataFrame.from_dict(dict(zip([0], data)), orient="index").reindex( + result.index + ) + tm.assert_frame_equal(result, expected) def test_constructor_ordered_dict_preserve_order(self): # see gh-13304 diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index ea4453b8dd6eb..be193e0854d8d 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -34,12 +34,15 @@ def validate(mi, df, key): right = df[mask].copy() if i + 1 != len(key): # partial key - right.drop(cols[: i + 1], axis=1, inplace=True) - right.set_index(cols[i + 1 : -1], inplace=True) + return_value = right.drop(cols[: i + 1], axis=1, 
inplace=True) + assert return_value is None + return_value = right.set_index(cols[i + 1 : -1], inplace=True) + assert return_value is None tm.assert_frame_equal(mi.loc[key[: i + 1]], right) else: # full key - right.set_index(cols[:-1], inplace=True) + return_value = right.set_index(cols[:-1], inplace=True) + assert return_value is None if len(right) == 1: # single hit right = Series( right["jolia"].values, name=right.index[0], index=["jolia"] diff --git a/pandas/tests/indexing/multiindex/test_ix.py b/pandas/tests/indexing/multiindex/test_ix.py index 01b0b392d52a3..abf989324e4a5 100644 --- a/pandas/tests/indexing/multiindex/test_ix.py +++ b/pandas/tests/indexing/multiindex/test_ix.py @@ -35,7 +35,8 @@ def test_loc_general(self): tm.assert_frame_equal(df.loc[key], df.iloc[2:]) # this is ok - df.sort_index(inplace=True) + return_value = df.sort_index(inplace=True) + assert return_value is None res = df.loc[key] # col has float dtype, result should be Float64Index diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index fdeb3ce95b0bb..572cb9da405d1 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -43,8 +43,10 @@ def test_frame_getitem_not_sorted2(self, key): df2 = df.set_index(["col1", "col2"]) df2_original = df2.copy() - df2.index.set_levels(["b", "d", "a"], level="col1", inplace=True) - df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True) + return_value = df2.index.set_levels(["b", "d", "a"], level="col1", inplace=True) + assert return_value is None + return_value = df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True) + assert return_value is None assert not df2.index.is_lexsorted() assert not df2.index.is_monotonic diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index ff748d755c063..b807795b9c309 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ 
b/pandas/tests/indexing/multiindex/test_xs.py @@ -237,9 +237,11 @@ def test_series_getitem_multiindex_xs_by_label(): [("a", "one"), ("a", "two"), ("b", "one"), ("b", "two")] ) s = Series([1, 2, 3, 4], index=idx) - s.index.set_names(["L1", "L2"], inplace=True) + return_value = s.index.set_names(["L1", "L2"], inplace=True) + assert return_value is None expected = Series([1, 3], index=["a", "b"]) - expected.index.set_names(["L1"], inplace=True) + return_value = expected.index.set_names(["L1"], inplace=True) + assert return_value is None result = s.xs("one", level="L2") tm.assert_series_equal(result, expected) From 1bbccfaf6f4f9c6794dc54c6b8b1e39bcd6d4082 Mon Sep 17 00:00:00 2001 From: Thomas Smith Date: Mon, 13 Jul 2020 23:22:45 +0100 Subject: [PATCH 0348/1025] BUG: ValueError on groupby with categoricals (#35253) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/generic.py | 6 ++- pandas/tests/groupby/test_categorical.py | 50 ++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f93e08d51baa..85b29a58a1f15 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1091,6 +1091,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) +- Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 
1f49ee2b0b665..093e1d4ab3942 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1058,7 +1058,11 @@ def _cython_agg_blocks( # reductions; see GH#28949 obj = obj.iloc[:, 0] - s = get_groupby(obj, self.grouper) + # Create SeriesGroupBy with observed=True so that it does + # not try to add missing categories if grouping over multiple + # Categoricals. This will done by later self._reindex_output() + # Doing it here creates an error. See GH#34951 + s = get_groupby(obj, self.grouper, observed=True) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) except TypeError: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 118d928ac02f4..7e4513da37dc9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1669,3 +1669,53 @@ def test_categorical_transform(): expected["status"] = expected["status"].astype(delivery_status_type) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["first", "last"]) +def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals( + func: str, observed: bool +): + # GH 34951 + cat = pd.Categorical([0, 0, 1, 1]) + val = [0, 1, 1, 0] + df = pd.DataFrame({"a": cat, "b": cat, "c": val}) + + idx = pd.Categorical([0, 1]) + idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"]) + expected_dict = { + "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), + "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), + } + + expected = expected_dict[func] + if observed: + expected = expected.dropna().astype(np.int64) + + srs_grp = df.groupby(["a", "b"], observed=observed)["c"] + result = getattr(srs_grp, func)() + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("func", ["first", "last"]) +def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( + func: str, observed: bool +): + # GH 34951 + cat = pd.Categorical([0, 0, 1, 1]) + val = 
[0, 1, 1, 0] + df = pd.DataFrame({"a": cat, "b": cat, "c": val}) + + idx = pd.Categorical([0, 1]) + idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"]) + expected_dict = { + "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), + "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), + } + + expected = expected_dict[func].to_frame() + if observed: + expected = expected.dropna().astype(np.int64) + + df_grp = df.groupby(["a", "b"], observed=observed) + result = getattr(df_grp, func)() + tm.assert_frame_equal(result, expected) From c35f4d19df8142d34fa480c2d2bf99a593573d56 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 14 Jul 2020 14:56:49 +0100 Subject: [PATCH 0349/1025] CI: pin pytest in minimum verions (#35274) --- ci/deps/azure-36-minimum_versions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-36-minimum_versions.yaml index 9f66f82720b5b..f5af7bcf36189 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-36-minimum_versions.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython=0.29.16 - - pytest>=5.0.1, <6.0.0rc0 + - pytest=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines From 6e830f852cba9b7df4d1ef810b3c4e799fa99e71 Mon Sep 17 00:00:00 2001 From: Lewis Cowles Date: Tue, 14 Jul 2020 18:05:58 +0100 Subject: [PATCH 0350/1025] Improved error message for invalid construction (#35190) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/internals/construction.py | 7 ++++++- pandas/tests/extension/base/setitem.py | 5 ++++- pandas/tests/frame/indexing/test_indexing.py | 9 ++++++--- pandas/tests/frame/indexing/test_setitem.py | 5 ++++- 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 85b29a58a1f15..a460ad247e152 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1170,6 +1170,7 @@ Other - Bug in :class:`Tick` 
comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) - Bug in :class:`Tick` multiplication raising ``TypeError`` when multiplying by a float (:issue:`34486`) - Passing a `set` as `names` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) +- Improved error message for invalid construction of list when creating a new index (:issue:`35190`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 4b9db810dead0..2d4163e0dee89 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -744,7 +744,12 @@ def sanitize_index(data, index: Index): through a non-Index. """ if len(data) != len(index): - raise ValueError("Length of values does not match length of index") + raise ValueError( + "Length of values " + f"({len(data)}) " + "does not match length of index " + f"({len(index)})" + ) if isinstance(data, np.ndarray): diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index bfa53ad02525b..a4e6fc0f78cbb 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -244,7 +244,10 @@ def test_setitem_expand_with_extension(self, data): def test_setitem_frame_invalid_length(self, data): df = pd.DataFrame({"A": [1] * len(data)}) - xpr = "Length of values does not match length of index" + xpr = ( + rf"Length of values \({len(data[:5])}\) " + rf"does not match length of index \({len(df)}\)" + ) with pytest.raises(ValueError, match=xpr): df["B"] = data[:5] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 3fa3c9303806f..d27487dfb8aaa 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ 
b/pandas/tests/frame/indexing/test_indexing.py @@ -160,10 +160,13 @@ def test_setitem_list(self, float_frame): msg = "Columns must be same length as key" with pytest.raises(ValueError, match=msg): data[["A"]] = float_frame[["A", "B"]] - - msg = "Length of values does not match length of index" + newcolumndata = range(len(data.index) - 1) + msg = ( + rf"Length of values \({len(newcolumndata)}\) " + rf"does not match length of index \({len(data)}\)" + ) with pytest.raises(ValueError, match=msg): - data["A"] = range(len(data.index) - 1) + data["A"] = newcolumndata df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=np.int_) df.loc[1, ["tt1", "tt2"]] = [1, 2] diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 9bb5338f1e07f..c5945edfd3127 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -117,7 +117,10 @@ def test_setitem_wrong_length_categorical_dtype_raises(self): cat = Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"]) df = DataFrame(range(10), columns=["bar"]) - msg = "Length of values does not match length of index" + msg = ( + rf"Length of values \({len(cat)}\) " + rf"does not match length of index \({len(df)}\)" + ) with pytest.raises(ValueError, match=msg): df["foo"] = cat From 7331cb658602e797f4f493a5ac73bbf3b4fdf25c Mon Sep 17 00:00:00 2001 From: MBrouns Date: Tue, 14 Jul 2020 19:08:44 +0200 Subject: [PATCH 0351/1025] Deprecate `center` on `df.expanding` (#34887) --- doc/source/whatsnew/v1.1.0.rst | 3 +++ pandas/core/generic.py | 12 +++++++++++- pandas/core/window/expanding.py | 2 +- .../test_moments_consistency_expanding.py | 10 +++++----- pandas/tests/window/test_api.py | 11 ++--------- pandas/tests/window/test_expanding.py | 16 ++++++++++++++++ 6 files changed, 38 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a460ad247e152..56c816eecb5ca 100644 --- 
a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -825,6 +825,9 @@ Deprecations precision through the ``rtol``, and ``atol`` parameters, thus deprecating the ``check_less_precise`` parameter. (:issue:`13357`). - :func:`DataFrame.melt` accepting a value_name that already exists is deprecated, and will be removed in a future version (:issue:`34731`) +- the ``center`` keyword in the :meth:`DataFrame.expanding` function is deprecated and will be removed in a future version (:issue:`20647`) + + .. --------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 571fcc67f3bb5..ece4281af3208 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10500,8 +10500,18 @@ def rolling( cls.rolling = rolling @doc(Expanding) - def expanding(self, min_periods=1, center=False, axis=0): + def expanding(self, min_periods=1, center=None, axis=0): axis = self._get_axis_number(axis) + if center is not None: + warnings.warn( + "The `center` argument on `expanding` " + "will be removed in the future", + FutureWarning, + stacklevel=2, + ) + else: + center = False + return Expanding(self, min_periods=min_periods, center=center, axis=axis) cls.expanding = expanding diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index bbc19fad8b799..8267cd4f0971e 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -57,7 +57,7 @@ class Expanding(_Rolling_and_Expanding): _attributes = ["min_periods", "center", "axis"] - def __init__(self, obj, min_periods=1, center=False, axis=0, **kwargs): + def __init__(self, obj, min_periods=1, center=None, axis=0, **kwargs): super().__init__(obj=obj, min_periods=min_periods, center=center, axis=axis) @property diff --git a/pandas/tests/window/moments/test_moments_consistency_expanding.py b/pandas/tests/window/moments/test_moments_consistency_expanding.py index ee3579d76d1db..3ec91dcb60610 100644 
--- a/pandas/tests/window/moments/test_moments_consistency_expanding.py +++ b/pandas/tests/window/moments/test_moments_consistency_expanding.py @@ -119,8 +119,8 @@ def test_expanding_corr_pairwise(frame): ids=["sum", "mean", "max", "min"], ) def test_expanding_func(func, static_comp, has_min_periods, series, frame, nan_locs): - def expanding_func(x, min_periods=1, center=False, axis=0): - exp = x.expanding(min_periods=min_periods, center=center, axis=axis) + def expanding_func(x, min_periods=1, axis=0): + exp = x.expanding(min_periods=min_periods, axis=axis) return getattr(exp, func)() _check_expanding( @@ -166,7 +166,7 @@ def test_expanding_apply_consistency( with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning, + "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning ) # test consistency between expanding_xyz() and either (a) # expanding_apply of Series.xyz(), or (b) expanding_apply of @@ -267,7 +267,7 @@ def test_expanding_consistency(consistency_data, min_periods): # with empty/0-length Series/DataFrames with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning, + "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning ) # test consistency between different expanding_* moments @@ -454,7 +454,7 @@ def test_expanding_cov_pairwise_diff_length(): def test_expanding_corr_pairwise_diff_length(): # GH 7512 df1 = DataFrame( - [[1, 2], [3, 2], [3, 4]], columns=["A", "B"], index=Index(range(3), name="bar"), + [[1, 2], [3, 2], [3, 4]], columns=["A", "B"], index=Index(range(3), name="bar") ) df1a = DataFrame( [[1, 2], [3, 4]], index=Index([0, 2], name="bar"), columns=["A", "B"] diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 33fb79d98a324..2c3d8b4608806 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ 
-107,10 +107,7 @@ def test_agg(): with pytest.raises(SpecificationError, match=msg): r.aggregate( - { - "A": {"mean": "mean", "sum": "sum"}, - "B": {"mean2": "mean", "sum2": "sum"}, - } + {"A": {"mean": "mean", "sum": "sum"}, "B": {"mean2": "mean", "sum2": "sum"}} ) result = r.aggregate({"A": ["mean", "std"], "B": ["mean", "std"]}) @@ -191,11 +188,7 @@ def test_count_nonnumeric_types(): "dt_nat", "periods_nat", ] - dt_nat_col = [ - Timestamp("20170101"), - Timestamp("20170203"), - Timestamp(None), - ] + dt_nat_col = [Timestamp("20170101"), Timestamp("20170203"), Timestamp(None)] df = DataFrame( { diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 30d65ebe84a1f..146eca07c523e 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -16,6 +16,9 @@ def test_doc_string(): df.expanding(2).sum() +@pytest.mark.filterwarnings( + "ignore:The `center` argument on `expanding` will be removed in the future" +) def test_constructor(which): # GH 12669 @@ -213,3 +216,16 @@ def test_iter_expanding_series(ser, expected, min_periods): for (expected, actual) in zip(expected, ser.expanding(min_periods)): tm.assert_series_equal(actual, expected) + + +def test_center_deprecate_warning(): + # GH 20647 + df = pd.DataFrame() + with tm.assert_produces_warning(FutureWarning): + df.expanding(center=True) + + with tm.assert_produces_warning(FutureWarning): + df.expanding(center=False) + + with tm.assert_produces_warning(None): + df.expanding() From 9ff6ff334d94b6503a894f9f0a06e329bfe359c2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jul 2020 12:09:41 -0500 Subject: [PATCH 0352/1025] Move API changes to appropriate sections (#35273) --- doc/source/whatsnew/v1.1.0.rst | 71 ++++++++++++++++------------------ 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 56c816eecb5ca..088f1d1946fa9 100644 --- 
a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -276,6 +276,10 @@ change, as ``fsspec`` will still bring in the same packages as before. Other enhancements ^^^^^^^^^^^^^^^^^^ +- Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). +- Added :meth:`DataFrame.value_counts` (:issue:`5377`) +- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. +- Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`) - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) @@ -336,10 +340,12 @@ Other enhancements .. --------------------------------------------------------------------------- -.. _whatsnew_110.api: +.. _whatsnew_110.notable_bug_fixes: + +Notable bug fixes +~~~~~~~~~~~~~~~~~ -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +These are bug fixes that might have notable behavior changes. ``MultiIndex.get_indexer`` interprets `method` argument differently ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -403,7 +409,7 @@ And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'` - -.. _whatsnew_110.api_breaking.indexing_raises_key_errors: +.. _whatsnew_110.notable_bug_fixes.indexing_raises_key_errors: Failed Label-Based Lookups Always Raise KeyError ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -473,7 +479,10 @@ key and type of :class:`Index`. These now consistently raise ``KeyError`` (:iss ... KeyError: Timestamp('1970-01-01 00:00:00') -.. 
_whatsnew_110.api_breaking.indexing_int_multiindex_raises_key_errors: + +Similarly, :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) + +.. _whatsnew_110.notable_bug_fixes.indexing_int_multiindex_raises_key_errors: Failed Integer Lookups on MultiIndex Raise KeyError ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -531,7 +540,7 @@ those integer keys is not present in the first level of the index (:issue:`33539 .. --------------------------------------------------------------------------- -.. _whatsnew_110.api_breaking.assignment_to_multiple_columns: +.. _whatsnew_110.notable_bug_fixes.assignment_to_multiple_columns: Assignment to multiple columns of a DataFrame when some columns do not exist ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -562,7 +571,7 @@ Assignment to multiple columns of a :class:`DataFrame` when some of the columns df[['a', 'c']] = 1 df -.. _whatsnew_110.api_breaking.groupby_consistency: +.. _whatsnew_110.notable_bug_fixes.groupby_consistency: Consistency across groupby reductions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -628,7 +637,7 @@ The method :meth:`core.DataFrameGroupBy.size` would previously ignore ``as_index df.groupby("a", as_index=False).size() -.. _whatsnew_110.api_breaking.apply_applymap_first_once: +.. 
_whatsnew_110.notable_bug_fixes.apply_applymap_first_once: apply and applymap on ``DataFrame`` evaluates first row/column only once ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -673,34 +682,6 @@ Other API changes - :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) -- Added :meth:`DataFrame.value_counts` (:issue:`5377`) -- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) -- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) -- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) -- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. -- Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`) -- Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). -- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. - Previously an ``AttributeError`` was raised (:issue:`31126`) -- :meth:`DataFrame.xs` now raises a ``TypeError`` if a ``level`` keyword is supplied and the axis is not a :class:`MultiIndex`. 
- Previously an ``AttributeError`` was raised (:issue:`33610`) -- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) - now raise a ``TypeError`` if a not-accepted keyword argument is passed into it. - Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) -- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) -- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) -- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) -- Combining a ``Categorical`` with integer categories and which contains missing values - with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` - will now result in a float column instead of an object dtyped column (:issue:`33607`) -- :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) -- :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) -- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. -- :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` - (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) -- :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) -- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) -- The :class:`DataFrame` constructor no longer accepts a list of ``DataFrame`` objects. Because of changes to NumPy, ``DataFrame`` objects are now consistently treated as 2D objects, so a list of ``DataFrames`` is considered 3D, and no longer acceptible for the ``DataFrame`` constructor (:issue:`32289`). Increased minimum versions for dependencies @@ -871,6 +852,8 @@ Bug fixes Categorical ^^^^^^^^^^^ +- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) +- Combining a ``Categorical`` with integer categories and which contains missing values with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` will now result in a float column instead of an object dtyped column (:issue:`33607`) - Bug where :func:`merge` was unable to join on non-unique categorical indices (:issue:`28189`) - Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`) - Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`) @@ -880,6 +863,10 @@ Categorical Datetimelike ^^^^^^^^^^^^ +- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` 
instead of incorrectly using ``int64`` (:issue:`32255`) +- :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) +- :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) +- :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) - Bug in :class:`Timestamp` where constructing :class:`Timestamp` from ambiguous epoch time and calling constructor again changed :meth:`Timestamp.value` property (:issue:`24329`) - :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`) - Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`) @@ -947,7 +934,7 @@ Strings - Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`). - Fixed issue where taking ``min`` or ``max`` of a ``StringArray`` or ``Series`` with ``StringDtype`` type would raise. (:issue:`31746`) - Bug in :meth:`Series.str.cat` returning ``NaN`` output when other had :class:`Index` type (:issue:`33425`) - +- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. Interval ^^^^^^^^ @@ -956,6 +943,8 @@ Interval Indexing ^^^^^^^^ + +- :meth:`DataFrame.xs` now raises a ``TypeError`` if a ``level`` keyword is supplied and the axis is not a :class:`MultiIndex`. 
Previously an ``AttributeError`` was raised (:issue:`33610`) - Bug in slicing on a :class:`DatetimeIndex` with a partial-timestamp dropping high-resolution indices near the end of a year, quarter, or month (:issue:`31064`) - Bug in :meth:`PeriodIndex.get_loc` treating higher-resolution strings differently from :meth:`PeriodIndex.get_value` (:issue:`31172`) - Bug in :meth:`Series.at` and :meth:`DataFrame.at` not matching ``.loc`` behavior when looking up an integer in a :class:`Float64Index` (:issue:`31329`) @@ -999,6 +988,8 @@ Missing MultiIndex ^^^^^^^^^^ + +- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. Previously an ``AttributeError`` was raised (:issue:`31126`) - Bug in :meth:`Dataframe.loc` when used with a :class:`MultiIndex`. The returned values were not in the same order as the given inputs (:issue:`22797`) .. ipython:: python @@ -1060,6 +1051,7 @@ I/O - Bug in :meth:`DataFrame.to_sql` when reading DataFrames with ``-np.inf`` entries with MySQL now has a more explicit ``ValueError`` (:issue:`34431`) - Bug in :meth:`read_excel` that was raising a ``TypeError`` when ``header=None`` and ``index_col`` given as list (:issue:`31783`) - Bug in "meth"`read_excel` where datetime values are used in the header in a `MultiIndex` (:issue:`34748`) +- :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) Plotting ^^^^^^^^ @@ -1075,6 +1067,8 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) +- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) now raise a ``TypeError`` if a not-accepted keyword argument is passed into it. Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`Groupby.transform` was returning the wrong result when grouping by multiple keys of which some were categorical and others not (:issue:`32494`) @@ -1156,6 +1150,9 @@ ExtensionArray Other ^^^^^ + +- The :class:`DataFrame` constructor no longer accepts a list of ``DataFrame`` objects. Because of changes to NumPy, ``DataFrame`` objects are now consistently treated as 2D objects, so a list of ``DataFrames`` is considered 3D, and no longer acceptible for the ``DataFrame`` constructor (:issue:`32289`). 
+- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) - Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) - Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`) From b02fb661a2cde23b79052a7823cd51e1f37e4402 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 14 Jul 2020 18:15:06 +0100 Subject: [PATCH 0353/1025] TYP: Add type hints to pd.read_html (#34291) --- pandas/io/html.py | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/pandas/io/html.py b/pandas/io/html.py index c4ffe332e3020..3193f52d239f1 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -8,7 +8,9 @@ import numbers import os import re +from typing import Dict, List, Optional, Pattern, Sequence, Union +from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError, EmptyDataError from pandas.util._decorators import deprecate_nonkeyword_arguments @@ -16,6 +18,7 @@ from pandas.core.dtypes.common import is_list_like from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.frame import DataFrame from pandas.io.common import is_url, urlopen, validate_header_arg from pandas.io.formats.printing import pprint_thing @@ -924,22 +927,22 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs): @deprecate_nonkeyword_arguments(version="2.0") def read_html( - io, - match=".+", - flavor=None, - header=None, - index_col=None, - 
skiprows=None, - attrs=None, - parse_dates=False, - thousands=",", - encoding=None, - decimal=".", - converters=None, + io: FilePathOrBuffer, + match: Union[str, Pattern] = ".+", + flavor: Optional[str] = None, + header: Optional[Union[int, Sequence[int]]] = None, + index_col: Optional[Union[int, Sequence[int]]] = None, + skiprows: Optional[Union[int, Sequence[int], slice]] = None, + attrs: Optional[Dict[str, str]] = None, + parse_dates: bool = False, + thousands: Optional[str] = ",", + encoding: Optional[str] = None, + decimal: str = ".", + converters: Optional[Dict] = None, na_values=None, - keep_default_na=True, - displayed_only=True, -): + keep_default_na: bool = True, + displayed_only: bool = True, +) -> List[DataFrame]: r""" Read HTML tables into a ``list`` of ``DataFrame`` objects. @@ -958,26 +961,26 @@ def read_html( This value is converted to a regular expression so that there is consistent behavior between Beautiful Soup and lxml. - flavor : str or None + flavor : str, optional The parsing engine to use. 'bs4' and 'html5lib' are synonymous with each other, they are both there for backwards compatibility. The default of ``None`` tries to use ``lxml`` to parse and if that fails it falls back on ``bs4`` + ``html5lib``. - header : int or list-like or None, optional + header : int or list-like, optional The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to make the columns headers. - index_col : int or list-like or None, optional + index_col : int or list-like, optional The column (or list of columns) to use to create the index. - skiprows : int or list-like or slice or None, optional + skiprows : int, list-like or slice, optional Number of rows to skip after parsing the column integer. 0-based. If a sequence of integers or a slice is given, will skip the rows indexed by that sequence. Note that a single element sequence means 'skip the nth row' whereas an integer means 'skip n rows'. 
- attrs : dict or None, optional + attrs : dict, optional This is a dictionary of attributes that you can pass to use to identify the table in the HTML. These are not checked for validity before being passed to lxml or Beautiful Soup. However, these attributes must be @@ -1005,7 +1008,7 @@ def read_html( thousands : str, optional Separator to use to parse thousands. Defaults to ``','``. - encoding : str or None, optional + encoding : str, optional The encoding used to decode the web page. Defaults to ``None``.``None`` preserves the previous encoding behavior, which depends on the underlying parser library (e.g., the parser library will try to use From b16abadca26051214cc0ba23a8eda18a92fe7f4f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 14 Jul 2020 10:16:29 -0700 Subject: [PATCH 0354/1025] ASV: dt64arr_to_periodarr (#35244) --- asv_bench/benchmarks/tslibs/period.py | 60 +++++++++++++++++++-------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 1a2c89b48c665..849e8ec864ac2 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -9,7 +9,12 @@ from pandas.tseries.frequencies import to_offset -from .tslib import _sizes +from .tslib import _sizes, _tzs + +try: + from pandas._libs.tslibs.vectorized import dt64arr_to_periodarr +except ImportError: + from pandas._libs.tslibs.period import dt64arr_to_periodarr class PeriodProperties: @@ -75,26 +80,29 @@ def time_period_constructor(self, freq, is_offset): Period("2012-06-01", freq=freq) +_freq_ints = [ + 1000, + 1011, # Annual - November End + 2000, + 2011, # Quarterly - November End + 3000, + 4000, + 4006, # Weekly - Saturday End + 5000, + 6000, + 7000, + 8000, + 9000, + 10000, + 11000, + 12000, +] + + class TimePeriodArrToDT64Arr: params = [ _sizes, - [ - 1000, - 1011, # Annual - November End - 2000, - 2011, # Quarterly - November End - 3000, - 4000, - 4006, # Weekly - 
Saturday End - 5000, - 6000, - 7000, - 8000, - 9000, - 10000, - 11000, - 12000, - ], + _freq_ints, ] param_names = ["size", "freq"] @@ -104,3 +112,19 @@ def setup(self, size, freq): def time_periodarray_to_dt64arr(self, size, freq): periodarr_to_dt64arr(self.i8values, freq) + + +class TimeDT64ArrToPeriodArr: + params = [ + _sizes, + _freq_ints, + _tzs, + ] + param_names = ["size", "freq", "tz"] + + def setup(self, size, freq, tz): + arr = np.arange(10, dtype="i8").repeat(size // 10) + self.i8values = arr + + def time_dt64arr_to_periodarr(self, size, freq, tz): + dt64arr_to_periodarr(self.i8values, freq, tz) From ee5f5ecab2398b2f6f987b6b2d81c2dd46c904c3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jul 2020 15:21:56 -0500 Subject: [PATCH 0355/1025] API: Make describe changes backwards compatible (#34798) --- doc/source/whatsnew/v1.1.0.rst | 10 +-- pandas/core/generic.py | 54 +++++++++++++++-- pandas/tests/frame/methods/test_describe.py | 64 +++++++++++++++++++- pandas/tests/series/methods/test_describe.py | 42 ++++++++++++- 4 files changed, 153 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 088f1d1946fa9..cfac916157649 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -280,6 +280,7 @@ Other enhancements - Added :meth:`DataFrame.value_counts` (:issue:`5377`) - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. 
- Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`) +- :meth:`~DataFrame.describe` now includes a ``datetime_is_numeric`` keyword to control how datetime columns are summarized (:issue:`30164`, :issue:`34798`) - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) @@ -675,15 +676,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once df.apply(func, axis=1) -.. _whatsnew_110.api.other: - -Other API changes -^^^^^^^^^^^^^^^^^ - -- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` - will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) - - Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ece4281af3208..eb55369d83593 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9711,7 +9711,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries: return np.abs(self) def describe( - self: FrameOrSeries, percentiles=None, include=None, exclude=None + self: FrameOrSeries, + percentiles=None, + include=None, + exclude=None, + datetime_is_numeric=False, ) -> FrameOrSeries: """ Generate descriptive statistics. @@ -9757,6 +9761,12 @@ def describe( ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To exclude pandas categorical columns, use ``'category'`` - None (default) : The result will exclude nothing. + datetime_is_numeric : bool, default False + Whether to treat datetime dtypes as numeric. This affects statistics + calculated for the column. 
For DataFrame input, this also + controls whether datetime columns are included by default. + + .. versionadded:: 1.1.0 Returns ------- @@ -9834,7 +9844,7 @@ def describe( ... np.datetime64("2010-01-01"), ... np.datetime64("2010-01-01") ... ]) - >>> s.describe() + >>> s.describe(datetime_is_numeric=True) count 3 mean 2006-09-01 08:00:00 min 2000-01-01 00:00:00 @@ -9992,8 +10002,37 @@ def describe_categorical_1d(data): dtype = None if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] - names += ["top", "freq"] - result += [top, freq] + if is_datetime64_any_dtype(data.dtype): + if self.ndim == 1: + stacklevel = 4 + else: + stacklevel = 5 + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=stacklevel, + ) + tz = data.dt.tz + asint = data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) + else: + top = top.tz_localize(tz) + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] + else: + names += ["top", "freq"] + result += [top, freq] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency @@ -10019,7 +10058,7 @@ def describe_1d(data): return describe_categorical_1d(data) elif is_numeric_dtype(data): return describe_numeric_1d(data) - elif is_datetime64_any_dtype(data.dtype): + elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: return describe_timestamp_1d(data) elif is_timedelta64_dtype(data.dtype): return describe_numeric_1d(data) @@ -10030,7 +10069,10 @@ def describe_1d(data): return describe_1d(self) elif (include is None) and (exclude is 
None): # when some numerics are found, keep only numerics - data = self.select_dtypes(include=[np.number]) + default_include = [np.number] + if datetime_is_numeric: + default_include.append("datetime") + data = self.select_dtypes(include=default_include) if len(data.columns) == 0: data = self elif include == "all": diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index b61d0d28e2fba..0b70bead375da 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -267,7 +267,69 @@ def test_describe_tz_values(self, tz_naive_fixture): }, index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"], ) - result = df.describe(include="all") + result = df.describe(include="all", datetime_is_numeric=True) + tm.assert_frame_equal(result, expected) + + def test_datetime_is_numeric_includes_datetime(self): + df = pd.DataFrame({"a": pd.date_range("2012", periods=3), "b": [1, 2, 3]}) + result = df.describe(datetime_is_numeric=True) + expected = pd.DataFrame( + { + "a": [ + 3, + pd.Timestamp("2012-01-02"), + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01T12:00:00"), + pd.Timestamp("2012-01-02"), + pd.Timestamp("2012-01-02T12:00:00"), + pd.Timestamp("2012-01-03"), + np.nan, + ], + "b": [3, 2, 1, 1.5, 2, 2.5, 3, 1], + }, + index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"], + ) + tm.assert_frame_equal(result, expected) + + def test_describe_tz_values2(self): + tz = "CET" + s1 = Series(range(5)) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s2 = Series(date_range(start, end, tz=tz)) + df = pd.DataFrame({"s1": s1, "s2": s2}) + + s1_ = s1.describe() + s2_ = pd.Series( + [ + 5, + 5, + s2.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + ], + index=["count", "unique", "top", "freq", "first", "last"], + ) + idx = [ + "count", + "unique", + "top", + "freq", + "first", + "last", + "mean", + "std", + "min", + "25%", + 
"50%", + "75%", + "max", + ] + expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx] + + with tm.assert_produces_warning(FutureWarning): + result = df.describe(include="all") tm.assert_frame_equal(result, expected) def test_describe_percentiles_integer_idx(self): diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index 4e59c6995f4f2..a15dc0751aa7d 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -83,7 +83,7 @@ def test_describe_with_tz(self, tz_naive_fixture): start = Timestamp(2018, 1, 1) end = Timestamp(2018, 1, 5) s = Series(date_range(start, end, tz=tz), name=name) - result = s.describe() + result = s.describe(datetime_is_numeric=True) expected = Series( [ 5, @@ -98,3 +98,43 @@ def test_describe_with_tz(self, tz_naive_fixture): index=["count", "mean", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) + + def test_describe_with_tz_warns(self): + name = tz = "CET" + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s = Series(date_range(start, end, tz=tz), name=name) + + with tm.assert_produces_warning(FutureWarning): + result = s.describe() + + expected = Series( + [ + 5, + 5, + s.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + ], + name=name, + index=["count", "unique", "top", "freq", "first", "last"], + ) + tm.assert_series_equal(result, expected) + + def test_datetime_is_numeric_includes_datetime(self): + s = Series(date_range("2012", periods=3)) + result = s.describe(datetime_is_numeric=True) + expected = Series( + [ + 3, + Timestamp("2012-01-02"), + Timestamp("2012-01-01"), + Timestamp("2012-01-01T12:00:00"), + Timestamp("2012-01-02"), + Timestamp("2012-01-02T12:00:00"), + Timestamp("2012-01-03"), + ], + index=["count", "mean", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) From 85b832a23ae6bdba60ecce9472d19ac9fc3c0552 
Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 14 Jul 2020 21:34:55 +0100 Subject: [PATCH 0356/1025] BUG: aggregations were getting overwritten if they had the same name (#30858) * :bug: aggregations were getting overwritten if they had the same name --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/generic.py | 15 +++-- .../tests/groupby/aggregate/test_aggregate.py | 58 +++++++++++++++++++ 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cfac916157649..3faca9c8868ca 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1093,6 +1093,7 @@ Reshaping - Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`) - :meth:`DataFrame.pivot` can now take lists for ``index`` and ``columns`` arguments (:issue:`21425`) - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`) +- Bug in :meth:`SeriesGroupBy.aggregate` was resulting in aggregations being overwritten when they shared the same name (:issue:`30880`) - Bug where :meth:`Index.astype` would lose the name attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`) - :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`) - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if ``to_replace`` is not an expected type. 
Previously the ``replace`` would fail silently (:issue:`18634`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 093e1d4ab3942..94dc216c82f55 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -278,7 +278,7 @@ def aggregate( if isinstance(ret, dict): from pandas import concat - ret = concat(ret, axis=1) + ret = concat(ret.values(), axis=1, keys=[key.label for key in ret.keys()]) return ret agg = aggregate @@ -307,8 +307,8 @@ def _aggregate_multiple_funcs(self, arg): arg = zip(columns, arg) - results = {} - for name, func in arg: + results: Dict[base.OutputKey, Union[Series, DataFrame]] = {} + for idx, (name, func) in enumerate(arg): obj = self # reset the cache so that we @@ -317,13 +317,14 @@ def _aggregate_multiple_funcs(self, arg): obj = copy.copy(obj) obj._reset_cache() obj._selection = name - results[name] = obj.aggregate(func) + results[base.OutputKey(label=name, position=idx)] = obj.aggregate(func) if any(isinstance(x, DataFrame) for x in results.values()): # let higher level handle return results - return self.obj._constructor_expanddim(results, columns=columns) + output = self._wrap_aggregated_output(results) + return self.obj._constructor_expanddim(output, columns=columns) def _wrap_series_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, @@ -354,10 +355,12 @@ def _wrap_series_output( if len(output) > 1: result = self.obj._constructor_expanddim(indexed_output, index=index) result.columns = columns - else: + elif not columns.empty: result = self.obj._constructor( indexed_output[0], index=index, name=columns[0] ) + else: + result = self.obj._constructor_expanddim() return result diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index dbd713a0af4cf..bf465635c0085 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -2,10 +2,13 
@@ test .agg behavior / note that .apply is tested generally in test_groupby.py """ import functools +from functools import partial import numpy as np import pytest +from pandas.errors import PerformanceWarning + from pandas.core.dtypes.common import is_integer_dtype import pandas as pd @@ -252,6 +255,61 @@ def test_agg_multiple_functions_maintain_order(df): tm.assert_index_equal(result.columns, exp_cols) +def test_agg_multiple_functions_same_name(): + # GH 30880 + df = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) + result = df.resample("3T").agg( + {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} + ) + expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) + expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")]) + expected_values = np.array( + [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] + ).T + expected = pd.DataFrame( + expected_values, columns=expected_columns, index=expected_index + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_multiple_functions_same_name_with_ohlc_present(): + # GH 30880 + # ohlc expands dimensions, so different test to the above is required. 
+ df = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) + result = df.resample("3T").agg( + {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]} + ) + expected_index = pd.date_range("1/1/2012", freq="3T", periods=6) + expected_columns = pd.MultiIndex.from_tuples( + [ + ("A", "ohlc", "open"), + ("A", "ohlc", "high"), + ("A", "ohlc", "low"), + ("A", "ohlc", "close"), + ("A", "quantile", "A"), + ("A", "quantile", "A"), + ] + ) + non_ohlc_expected_values = np.array( + [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]] + ).T + expected_values = np.hstack([df.resample("3T").A.ohlc(), non_ohlc_expected_values]) + expected = pd.DataFrame( + expected_values, columns=expected_columns, index=expected_index + ) + # PerformanceWarning is thrown by `assert col in right` in assert_frame_equal + with tm.assert_produces_warning(PerformanceWarning): + tm.assert_frame_equal(result, expected) + + def test_multiple_functions_tuples_and_non_tuples(df): # #1359 funcs = [("foo", "mean"), "std"] From 5f27ec3364400a45de450fe250ac3779bffc49ee Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 14 Jul 2020 23:32:48 +0100 Subject: [PATCH 0357/1025] CLN: remove kwargs in Index.format (#35122) --- pandas/core/indexes/base.py | 9 +++++++-- pandas/core/indexes/datetimelike.py | 22 ++++++++++++++++++++- pandas/core/indexes/multi.py | 30 ++++++++++++++++------------- pandas/core/indexes/range.py | 2 +- pandas/io/formats/format.py | 3 +-- pandas/io/formats/html.py | 1 + 6 files changed, 48 insertions(+), 19 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2f12a2e4c27ea..3dbee7d0929cb 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -902,7 +902,12 @@ def _mpl_repr(self): # how to represent ourselves to matplotlib return self.values - def format(self, name: bool = False, formatter=None, **kwargs): + def 
format( + self, + name: bool = False, + formatter: Optional[Callable] = None, + na_rep: str_t = "NaN", + ) -> List[str_t]: """ Render a string representation of the Index. """ @@ -917,7 +922,7 @@ def format(self, name: bool = False, formatter=None, **kwargs): if formatter is not None: return header + list(self.map(formatter)) - return self._format_with_header(header, **kwargs) + return self._format_with_header(header, na_rep=na_rep) def _format_with_header(self, header, na_rep="NaN") -> List[str_t]: from pandas.io.formats.format import format_array diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 7be6aa50fa16b..15a7e25238983 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -9,7 +9,7 @@ from pandas._libs import NaT, Timedelta, iNaT, join as libjoin, lib from pandas._libs.tslibs import BaseOffset, Resolution, Tick, timezones from pandas._libs.tslibs.parsing import DateParseError -from pandas._typing import Label +from pandas._typing import Callable, Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly, doc @@ -338,6 +338,26 @@ def argmax(self, axis=None, skipna=True, *args, **kwargs): # -------------------------------------------------------------------- # Rendering Methods + def format( + self, + name: bool = False, + formatter: Optional[Callable] = None, + na_rep: str = "NaT", + date_format: Optional[str] = None, + ) -> List[str]: + """ + Render a string representation of the Index. 
+ """ + header = [] + if name: + fmt_name = ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + header.append(fmt_name) + + if formatter is not None: + return header + list(self.map(formatter)) + + return self._format_with_header(header, na_rep=na_rep, date_format=date_format) + def _format_with_header(self, header, na_rep="NaT", date_format=None) -> List[str]: return header + list( self._format_native_types(na_rep=na_rep, date_format=date_format) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 15db6c51a1f2f..235da89083d0a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2,6 +2,7 @@ from typing import ( TYPE_CHECKING, Any, + Callable, Hashable, Iterable, List, @@ -1231,13 +1232,17 @@ def _format_native_types(self, na_rep="nan", **kwargs): def format( self, - space=2, + name: Optional[bool] = None, + formatter: Optional[Callable] = None, + na_rep: Optional[str] = None, + names: bool = False, + space: int = 2, sparsify=None, - adjoin=True, - names=False, - na_rep=None, - formatter=None, - ): + adjoin: bool = True, + ) -> List: + if name is not None: + names = name + if len(self) == 0: return [] @@ -1265,13 +1270,13 @@ def format( stringified_levels.append(formatted) result_levels = [] - for lev, name in zip(stringified_levels, self.names): + for lev, lev_name in zip(stringified_levels, self.names): level = [] if names: level.append( - pprint_thing(name, escape_chars=("\t", "\r", "\n")) - if name is not None + pprint_thing(lev_name, escape_chars=("\t", "\r", "\n")) + if lev_name is not None else "" ) @@ -1283,10 +1288,9 @@ def format( if sparsify: sentinel = "" - # GH3547 - # use value of sparsify as sentinel, unless it's an obvious - # "Truthy" value - if sparsify not in [True, 1]: + # GH3547 use value of sparsify as sentinel if it's "Falsey" + assert isinstance(sparsify, bool) or sparsify is lib.no_default + if sparsify in [False, lib.no_default]: sentinel = sparsify # little bit of a kludge 
job for #1217 result_levels = _sparsify( diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 6d9fd6efe54a3..e5e98039ff77b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -198,7 +198,7 @@ def _format_data(self, name=None): return None def _format_with_header(self, header, na_rep="NaN") -> List[str]: - return header + list(map(pprint_thing, self._range)) + return header + [pprint_thing(x) for x in self._range] # -------------------------------------------------------------------- _deprecation_message = ( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b4e1ebe93fb0e..b2cd8e9319791 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -330,9 +330,8 @@ def _get_footer(self) -> str: def _get_formatted_index(self) -> Tuple[List[str], bool]: index = self.tr_series.index - is_multi = isinstance(index, MultiIndex) - if is_multi: + if isinstance(index, MultiIndex): have_header = any(name for name in index.names) fmt_index = index.format(names=True) else: diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 7ea2417ceb24b..13f0ab1e8a52c 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -442,6 +442,7 @@ def _write_hierarchical_rows( frame = self.fmt.tr_frame nrows = len(frame) + assert isinstance(frame.index, MultiIndex) idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) idx_values = list(zip(*idx_values)) From e551fea60a8c3b96365f0afae0842973b158ee9f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 Jul 2020 07:24:39 -0500 Subject: [PATCH 0358/1025] Revert "BUG: fix union_indexes not supporting sort=False for Index subclasses (#35098)" (#35277) This reverts commit c21be0562a33d149b62735fc82aff80e4d5942f5. 
--- doc/source/whatsnew/v1.1.0.rst | 1 - pandas/core/indexes/api.py | 8 +------- pandas/tests/frame/test_constructors.py | 6 ++---- pandas/tests/indexes/test_common.py | 18 +---------------- pandas/tests/reshape/test_concat.py | 14 ------------- pandas/tests/reshape/test_melt.py | 26 ++++++++++++------------- pandas/tests/test_strings.py | 9 +-------- 7 files changed, 18 insertions(+), 64 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 3faca9c8868ca..dc1e7523046d5 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1114,7 +1114,6 @@ Reshaping - Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) - Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`) - Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements whth ``S`` dtype (:issue:`34529`) -- Bug in :meth:`DataFrame.append` leading to sorting columns even when ``sort=False`` is specified (:issue:`35092`) Sparse ^^^^^^ diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 9849742abcfca..4c5a70f4088ee 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -214,13 +214,7 @@ def conv(i): return result.union_many(indexes[1:]) else: for other in indexes[1:]: - # GH 35092. 
Index.union expects sort=None instead of sort=True - # to signify that sort=True isn't fully implemented and - # legacy implementation sometimes might not sort (see GH 24959) - # In this case we currently sort in _get_combined_index - if sort: - sort = None - result = result.union(other, sort=sort) + result = result.union(other) return result elif kind == "array": index = indexes[0] diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 17ac2307b9da6..1631342c359c1 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2578,13 +2578,11 @@ def test_construct_with_two_categoricalindex_series(self): index=pd.CategoricalIndex(["f", "female", "m", "male", "unknown"]), ) result = DataFrame([s1, s2]) - # GH 35092. Extra s2 columns are now appended to s1 columns - # in original order expected = DataFrame( np.array( - [[39.0, 6.0, 4.0, np.nan, np.nan], [152.0, 242.0, 150.0, 2.0, 2.0]] + [[np.nan, 39.0, np.nan, 6.0, 4.0], [2.0, 152.0, 2.0, 242.0, 150.0]] ), - columns=["female", "male", "unknown", "f", "m"], + columns=["f", "female", "m", "male", "unknown"], ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index c85696e02ad39..02a173eb4958d 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,9 +13,8 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import CategoricalIndex, Index, MultiIndex, RangeIndex +from pandas import CategoricalIndex, MultiIndex, RangeIndex import pandas._testing as tm -from pandas.core.indexes.api import union_indexes class TestCommon: @@ -396,18 +395,3 @@ def test_astype_preserves_name(self, index, dtype, copy): assert result.names == index.names else: assert result.name == index.name - - -@pytest.mark.parametrize("arr", [[0, 1, 4, 3]]) -@pytest.mark.parametrize("dtype", 
["int8", "int16", "int32", "int64"]) -def test_union_index_no_sort(arr, sort, dtype): - # GH 35092. Check that we don't sort with sort=False - ind1 = Index(arr[:2], dtype=dtype) - ind2 = Index(arr[2:], dtype=dtype) - - # sort is None indicates that we sort the combined index - if sort is None: - arr.sort() - expected = Index(arr, dtype=dtype) - result = union_indexes([ind1, ind2], sort=sort) - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index ff95d8ad997a4..ffeb5ff0f8aaa 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2857,17 +2857,3 @@ def test_concat_frame_axis0_extension_dtypes(): result = pd.concat([df2, df1], ignore_index=True) expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64") tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("sort", [True, False]) -def test_append_sort(sort): - # GH 35092. Check that DataFrame.append respects the sort argument. 
- df1 = pd.DataFrame(data={0: [1, 2], 1: [3, 4]}) - df2 = pd.DataFrame(data={3: [1, 2], 2: [3, 4]}) - cols = list(df1.columns) + list(df2.columns) - if sort: - cols.sort() - - result = df1.append(df2, sort=sort).columns - expected = type(result)(cols) - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 241721432bbf9..2b75a1ec6ca6e 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -732,11 +732,11 @@ def test_unbalanced(self): ) df["id"] = df.index exp_data = { - "X": ["X1", "X2", "X1", "X2"], - "A": [1.0, 2.0, 3.0, 4.0], - "B": [5.0, 6.0, np.nan, np.nan], - "id": [0, 1, 0, 1], - "year": [2010, 2010, 2011, 2011], + "X": ["X1", "X1", "X2", "X2"], + "A": [1.0, 3.0, 2.0, 4.0], + "B": [5.0, np.nan, 6.0, np.nan], + "id": [0, 0, 1, 1], + "year": [2010, 2011, 2010, 2011], } expected = pd.DataFrame(exp_data) expected = expected.set_index(["id", "year"])[["X", "A", "B"]] @@ -979,10 +979,10 @@ def test_nonnumeric_suffix(self): ) expected = pd.DataFrame( { - "A": ["X1", "X2", "X1", "X2"], - "colname": ["placebo", "placebo", "test", "test"], - "result": [5.0, 6.0, np.nan, np.nan], - "treatment": [1.0, 2.0, 3.0, 4.0], + "A": ["X1", "X1", "X2", "X2"], + "colname": ["placebo", "test", "placebo", "test"], + "result": [5.0, np.nan, 6.0, np.nan], + "treatment": [1.0, 3.0, 2.0, 4.0], } ) expected = expected.set_index(["A", "colname"]) @@ -1026,10 +1026,10 @@ def test_float_suffix(self): ) expected = pd.DataFrame( { - "A": ["X1", "X2", "X1", "X2", "X1", "X2", "X1", "X2"], - "colname": [1.2, 1.2, 1.0, 1.0, 1.1, 1.1, 2.1, 2.1], - "result": [5.0, 6.0, 0.0, 9.0, np.nan, np.nan, np.nan, np.nan], - "treatment": [np.nan, np.nan, np.nan, np.nan, 1.0, 2.0, 3.0, 4.0], + "A": ["X1", "X1", "X1", "X1", "X2", "X2", "X2", "X2"], + "colname": [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1], + "result": [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan], + "treatment": [np.nan, 1.0, np.nan, 3.0, 
np.nan, 2.0, np.nan, 4.0], } ) expected = expected.set_index(["A", "colname"]) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index 3a4e54052305e..d9396d70f9112 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -636,15 +636,8 @@ def test_str_cat_align_mixed_inputs(self, join): # mixed list of indexed/unindexed u = np.array(["A", "B", "C", "D"]) expected_outer = Series(["aaA", "bbB", "c-C", "ddD", "-e-"]) - # joint index of rhs [t, u]; u will be forced have index of s - # GH 35092. If right join, maintain order of t.index - if join == "inner": - rhs_idx = t.index & s.index - elif join == "right": - rhs_idx = t.index.union(s.index, sort=False) - else: - rhs_idx = t.index | s.index + rhs_idx = t.index & s.index if join == "inner" else t.index | s.index expected = expected_outer.loc[s.index.join(rhs_idx, how=join)] result = s.str.cat([t, u], join=join, na_rep="-") From ce1adc78f8f8fb099b3457964b26d3ee40be63c9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 Jul 2020 07:25:33 -0500 Subject: [PATCH 0359/1025] Fixed apply_index (#35165) --- doc/source/reference/offset_frequency.rst | 18 ++++ doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 99 +++++++++++++++++--- pandas/core/arrays/datetimes.py | 2 +- pandas/tests/tseries/offsets/test_offsets.py | 7 +- 5 files changed, 110 insertions(+), 17 deletions(-) diff --git a/doc/source/reference/offset_frequency.rst b/doc/source/reference/offset_frequency.rst index 1b63253cde2c5..e6271a7806706 100644 --- a/doc/source/reference/offset_frequency.rst +++ b/doc/source/reference/offset_frequency.rst @@ -33,6 +33,7 @@ Methods :toctree: api/ DateOffset.apply + DateOffset.apply_index DateOffset.copy DateOffset.isAnchored DateOffset.onOffset @@ -117,6 +118,7 @@ Methods :toctree: api/ BusinessHour.apply + BusinessHour.apply_index BusinessHour.copy BusinessHour.isAnchored BusinessHour.onOffset @@ -201,6 +203,7 @@ Methods :toctree: api/ 
CustomBusinessHour.apply + CustomBusinessHour.apply_index CustomBusinessHour.copy CustomBusinessHour.isAnchored CustomBusinessHour.onOffset @@ -401,6 +404,7 @@ Methods :toctree: api/ CustomBusinessMonthEnd.apply + CustomBusinessMonthEnd.apply_index CustomBusinessMonthEnd.copy CustomBusinessMonthEnd.isAnchored CustomBusinessMonthEnd.onOffset @@ -447,6 +451,7 @@ Methods :toctree: api/ CustomBusinessMonthBegin.apply + CustomBusinessMonthBegin.apply_index CustomBusinessMonthBegin.copy CustomBusinessMonthBegin.isAnchored CustomBusinessMonthBegin.onOffset @@ -586,6 +591,7 @@ Methods :toctree: api/ WeekOfMonth.apply + WeekOfMonth.apply_index WeekOfMonth.copy WeekOfMonth.isAnchored WeekOfMonth.onOffset @@ -622,6 +628,7 @@ Methods :toctree: api/ LastWeekOfMonth.apply + LastWeekOfMonth.apply_index LastWeekOfMonth.copy LastWeekOfMonth.isAnchored LastWeekOfMonth.onOffset @@ -938,6 +945,7 @@ Methods :toctree: api/ FY5253.apply + FY5253.apply_index FY5253.copy FY5253.get_rule_code_suffix FY5253.get_year_end @@ -977,6 +985,7 @@ Methods :toctree: api/ FY5253Quarter.apply + FY5253Quarter.apply_index FY5253Quarter.copy FY5253Quarter.get_rule_code_suffix FY5253Quarter.get_weeks @@ -1013,6 +1022,7 @@ Methods :toctree: api/ Easter.apply + Easter.apply_index Easter.copy Easter.isAnchored Easter.onOffset @@ -1053,6 +1063,7 @@ Methods Tick.is_on_offset Tick.__call__ Tick.apply + Tick.apply_index Day --- @@ -1087,6 +1098,7 @@ Methods Day.is_on_offset Day.__call__ Day.apply + Day.apply_index Hour ---- @@ -1121,6 +1133,7 @@ Methods Hour.is_on_offset Hour.__call__ Hour.apply + Hour.apply_index Minute ------ @@ -1155,6 +1168,7 @@ Methods Minute.is_on_offset Minute.__call__ Minute.apply + Minute.apply_index Second ------ @@ -1189,6 +1203,7 @@ Methods Second.is_on_offset Second.__call__ Second.apply + Second.apply_index Milli ----- @@ -1223,6 +1238,7 @@ Methods Milli.is_on_offset Milli.__call__ Milli.apply + Milli.apply_index Micro ----- @@ -1257,6 +1273,7 @@ Methods Micro.is_on_offset 
Micro.__call__ Micro.apply + Micro.apply_index Nano ---- @@ -1291,6 +1308,7 @@ Methods Nano.is_on_offset Nano.__call__ Nano.apply + Nano.apply_index .. _api.frequencies: diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index dc1e7523046d5..85661c0507caa 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -787,6 +787,7 @@ Deprecations - :meth:`DatetimeIndex.week` and `DatetimeIndex.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeIndex.isocalendar().week` instead (:issue:`33595`) - :meth:`DatetimeArray.week` and `DatetimeArray.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeArray.isocalendar().week` instead (:issue:`33595`) - :meth:`DateOffset.__call__` is deprecated and will be removed in a future version, use ``offset + other`` instead (:issue:`34171`) +- :meth:`~BusinessDay.apply_index` is deprecated and will be removed in a future version. Use ``offset + other`` instead (:issue:`34580`) - :meth:`DataFrame.tshift` and :meth:`Series.tshift` are deprecated and will be removed in a future version, use :meth:`DataFrame.shift` and :meth:`Series.shift` instead (:issue:`11631`) - Indexing an :class:`Index` object with a float key is deprecated, and will raise an ``IndexError`` in the future. 
You can manually convert to an integer key diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index b0c6648514e99..9a7ca15a2a1c2 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -86,19 +86,38 @@ cdef bint _is_normalized(datetime dt): return True +def apply_wrapper_core(func, self, other) -> ndarray: + result = func(self, other) + result = np.asarray(result) + + if self.normalize: + # TODO: Avoid circular/runtime import + from .vectorized import normalize_i8_timestamps + result = normalize_i8_timestamps(result.view("i8"), None) + + return result + + def apply_index_wraps(func): # Note: normally we would use `@functools.wraps(func)`, but this does # not play nicely with cython class methods - def wrapper(self, other) -> np.ndarray: + def wrapper(self, other): # other is a DatetimeArray + result = apply_wrapper_core(func, self, other) + result = type(other)(result) + warnings.warn("'Offset.apply_index(other)' is deprecated. " + "Use 'offset + other' instead.", FutureWarning) + return result - result = func(self, other) - result = np.asarray(result) + return wrapper - if self.normalize: - # TODO: Avoid circular/runtime import - from .vectorized import normalize_i8_timestamps - result = normalize_i8_timestamps(result.view("i8"), None) + +def apply_array_wraps(func): + # Note: normally we would use `@functools.wraps(func)`, but this does + # not play nicely with cython class methods + def wrapper(self, other) -> np.ndarray: + # other is a DatetimeArray + result = apply_wrapper_core(func, self, other) return result # do @functools.wraps(func) manually since it doesn't work on cdef funcs @@ -515,6 +534,10 @@ cdef class BaseOffset: raises NotImplementedError for offsets without a vectorized implementation. + .. deprecated:: 1.1.0 + + Use ``offset + dtindex`` instead. 
+ Parameters ---------- index : DatetimeIndex @@ -522,12 +545,25 @@ cdef class BaseOffset: Returns ------- DatetimeIndex + + Raises + ------ + NotImplementedError + When the specific offset subclass does not have a vectorized + implementation. """ raise NotImplementedError( f"DateOffset subclass {type(self).__name__} " "does not have a vectorized implementation" ) + @apply_array_wraps + def _apply_array(self, dtarr): + raise NotImplementedError( + f"DateOffset subclass {type(self).__name__} " + "does not have a vectorized implementation" + ) + def rollback(self, dt) -> datetime: """ Roll provided date backward to next offset only if not on offset. @@ -992,7 +1028,11 @@ cdef class RelativeDeltaOffset(BaseOffset): ------- ndarray[datetime64[ns]] """ - dt64other = np.asarray(dtindex) + return self._apply_array(dtindex) + + @apply_array_wraps + def _apply_array(self, dtarr): + dt64other = np.asarray(dtarr) kwds = self.kwds relativedelta_fast = { "years", @@ -1321,7 +1361,11 @@ cdef class BusinessDay(BusinessMixin): @apply_index_wraps def apply_index(self, dtindex): - i8other = dtindex.view("i8") + return self._apply_array(dtindex) + + @apply_array_wraps + def _apply_array(self, dtarr): + i8other = dtarr.view("i8") return shift_bdays(i8other, self.n) def is_on_offset(self, dt: datetime) -> bool: @@ -1804,8 +1848,12 @@ cdef class YearOffset(SingleConstructorOffset): @apply_index_wraps def apply_index(self, dtindex): + return self._apply_array(dtindex) + + @apply_array_wraps + def _apply_array(self, dtarr): shifted = shift_quarters( - dtindex.view("i8"), self.n, self.month, self._day_opt, modby=12 + dtarr.view("i8"), self.n, self.month, self._day_opt, modby=12 ) return shifted @@ -1957,8 +2005,12 @@ cdef class QuarterOffset(SingleConstructorOffset): @apply_index_wraps def apply_index(self, dtindex): + return self._apply_array(dtindex) + + @apply_array_wraps + def _apply_array(self, dtarr): shifted = shift_quarters( - dtindex.view("i8"), self.n, self.startingMonth, 
self._day_opt + dtarr.view("i8"), self.n, self.startingMonth, self._day_opt ) return shifted @@ -2072,7 +2124,11 @@ cdef class MonthOffset(SingleConstructorOffset): @apply_index_wraps def apply_index(self, dtindex): - shifted = shift_months(dtindex.view("i8"), self.n, self._day_opt) + return self._apply_array(dtindex) + + @apply_array_wraps + def _apply_array(self, dtarr): + shifted = shift_months(dtarr.view("i8"), self.n, self._day_opt) return shifted cpdef __setstate__(self, state): @@ -2209,8 +2265,14 @@ cdef class SemiMonthOffset(SingleConstructorOffset): @cython.wraparound(False) @cython.boundscheck(False) def apply_index(self, dtindex): + return self._apply_array(dtindex) + + @apply_array_wraps + @cython.wraparound(False) + @cython.boundscheck(False) + def _apply_array(self, dtarr): cdef: - int64_t[:] i8other = dtindex.view("i8") + int64_t[:] i8other = dtarr.view("i8") Py_ssize_t i, count = len(i8other) int64_t val int64_t[:] out = np.empty(count, dtype="i8") @@ -2368,12 +2430,16 @@ cdef class Week(SingleConstructorOffset): @apply_index_wraps def apply_index(self, dtindex): + return self._apply_array(dtindex) + + @apply_array_wraps + def _apply_array(self, dtarr): if self.weekday is None: td = timedelta(days=7 * self.n) td64 = np.timedelta64(td, "ns") - return dtindex + td64 + return dtarr + td64 else: - i8other = dtindex.view("i8") + i8other = dtarr.view("i8") return self._end_apply_index(i8other) @cython.wraparound(False) @@ -3146,6 +3212,9 @@ cdef class CustomBusinessDay(BusinessDay): def apply_index(self, dtindex): raise NotImplementedError + def _apply_array(self, dtarr): + raise NotImplementedError + def is_on_offset(self, dt: datetime) -> bool: if self.normalize and not _is_normalized(dt): return False diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 7058ed3682d59..d674b1c476d2c 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -683,7 +683,7 @@ def _add_offset(self, offset): values 
= self.tz_localize(None) else: values = self - result = offset.apply_index(values) + result = offset._apply_array(values) result = DatetimeArray._simple_new(result) result = result.tz_localize(self.tz) diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index cffaa7b43d0cf..8c51908c547f4 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -3663,14 +3663,19 @@ def test_offset(self, case): @pytest.mark.parametrize("case", offset_cases) def test_apply_index(self, case): + # https://github.com/pandas-dev/pandas/issues/34580 offset, cases = case s = DatetimeIndex(cases.keys()) + exp = DatetimeIndex(cases.values()) + with tm.assert_produces_warning(None): # GH#22535 check that we don't get a FutureWarning from adding # an integer array to PeriodIndex result = offset + s + tm.assert_index_equal(result, exp) - exp = DatetimeIndex(cases.values()) + with tm.assert_produces_warning(FutureWarning): + result = offset.apply_index(s) tm.assert_index_equal(result, exp) on_offset_cases = [ From 6fe7cffc79530e364bf19646877f5b53e5a1c775 Mon Sep 17 00:00:00 2001 From: Christos Petropoulos Date: Wed, 15 Jul 2020 14:27:21 +0200 Subject: [PATCH 0360/1025] Place the calculation of mask prior to the calls of comp in replace_list to improve performance (#35229) --- pandas/core/internals/managers.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c82670106d3b6..d5947726af7fd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -596,18 +596,22 @@ def replace_list( # figure out our mask apriori to avoid repeated replacements values = self.as_array() - def comp(s, regex=False): + def comp(s: Scalar, mask: np.ndarray, regex: bool = False): """ Generate a bool array by perform an equality check, or perform an element-wise regular 
expression matching """ if isna(s): - return isna(values) + return ~mask s = com.maybe_box_datetimelike(s) - return _compare_or_regex_search(values, s, regex) + return _compare_or_regex_search(values, s, regex, mask) - masks = [comp(s, regex) for s in src_list] + # Calculate the mask once, prior to the call of comp + # in order to avoid repeating the same computations + mask = ~isna(values) + + masks = [comp(s, mask, regex) for s in src_list] result_blocks = [] src_len = len(src_list) - 1 @@ -1895,7 +1899,7 @@ def _merge_blocks( def _compare_or_regex_search( - a: ArrayLike, b: Scalar, regex: bool = False + a: ArrayLike, b: Scalar, regex: bool = False, mask: Optional[ArrayLike] = None ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -1908,6 +1912,7 @@ def _compare_or_regex_search( a : array_like b : scalar regex : bool, default False + mask : array_like or None (default) Returns ------- @@ -1941,7 +1946,7 @@ def _check_comparison_types( ) # GH#32621 use mask to avoid comparing to NAs - if isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): + if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): mask = np.reshape(~(isna(a)), a.shape) if isinstance(a, np.ndarray): a = a[mask] @@ -1953,7 +1958,7 @@ def _check_comparison_types( result = op(a) - if isinstance(result, np.ndarray): + if isinstance(result, np.ndarray) and mask is not None: # The shape of the mask can differ to that of the result # since we may compare only a subset of a's or b's elements tmp = np.zeros(mask.shape, dtype=np.bool_) From a38b9386ddea2e8315c47e8db8dc7994097dfd22 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 15 Jul 2020 05:28:03 -0700 Subject: [PATCH 0361/1025] ENH: Add compute.use_numba configuration for automatically using numba (#35182) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/config_init.py | 17 +++++++++ pandas/core/groupby/generic.py | 23 ++++++------ 
pandas/core/groupby/groupby.py | 9 +++-- pandas/core/groupby/ops.py | 7 ++-- pandas/core/util/numba_.py | 13 +++++++ pandas/core/window/rolling.py | 37 +++++++++----------- pandas/tests/groupby/aggregate/test_numba.py | 17 ++++++++- pandas/tests/groupby/transform/test_numba.py | 17 ++++++++- pandas/tests/util/test_numba.py | 12 +++++++ pandas/tests/window/test_numba.py | 14 +++++++- 11 files changed, 124 insertions(+), 43 deletions(-) create mode 100644 pandas/tests/util/test_numba.py diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 85661c0507caa..4b893bcd0c87d 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -338,6 +338,7 @@ Other enhancements - :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable boolean dtype (:issue:`34859`) - :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) +- ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 54d23fe8829e6..86f6be77bc505 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -52,6 +52,20 @@ def use_numexpr_cb(key): expressions.set_use_numexpr(cf.get_option(key)) +use_numba_doc = """ +: bool + Use the numba engine option for select operations if it is installed, + the default is False + Valid values: False,True +""" + + +def use_numba_cb(key): + from pandas.core.util import numba_ + + numba_.set_use_numba(cf.get_option(key)) + + with cf.config_prefix("compute"): cf.register_option( "use_bottleneck", @@ -63,6 +77,9 @@ def use_numexpr_cb(key): cf.register_option( "use_numexpr", True, use_numexpr_doc, validator=is_bool, cb=use_numexpr_cb ) + cf.register_option( + "use_numba", False, use_numba_doc, validator=is_bool, cb=use_numba_cb + ) # # options from the "display" namespace diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 94dc216c82f55..b9a8f3d5a5176 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -80,6 +80,7 @@ from pandas.core.util.numba_ import ( NUMBA_FUNC_CACHE, generate_numba_func, + maybe_use_numba, split_for_numba, ) @@ -227,9 +228,7 @@ def apply(self, func, *args, **kwargs): @doc( _agg_template, examples=_agg_examples_doc, klass="Series", ) - def aggregate( - self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs - ): + def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): relabeling = func is None columns = None @@ -483,7 +482,7 @@ def _aggregate_named(self, func, *args, **kwargs): @Substitution(klass="Series") @Appender(_transform_template) - def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): func = self._get_cython_func(func) or func if not isinstance(func, 
str): @@ -515,7 +514,7 @@ def _transform_general( Transform with a non-str `func`. """ - if engine == "numba": + if maybe_use_numba(engine): numba_func, cache_key = generate_numba_func( func, engine_kwargs, kwargs, "groupby_transform" ) @@ -525,7 +524,7 @@ def _transform_general( results = [] for name, group in self: object.__setattr__(group, "name", name) - if engine == "numba": + if maybe_use_numba(engine): values, index = split_for_numba(group) res = numba_func(values, index, *args) if cache_key not in NUMBA_FUNC_CACHE: @@ -934,13 +933,11 @@ class DataFrameGroupBy(GroupBy[DataFrame]): @doc( _agg_template, examples=_agg_examples_doc, klass="DataFrame", ) - def aggregate( - self, func=None, *args, engine="cython", engine_kwargs=None, **kwargs - ): + def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): relabeling, func, columns, order = reconstruct_func(func, **kwargs) - if engine == "numba": + if maybe_use_numba(engine): return self._python_agg_general( func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) @@ -1385,7 +1382,7 @@ def _transform_general( applied = [] obj = self._obj_with_exclusions gen = self.grouper.get_iterator(obj, axis=self.axis) - if engine == "numba": + if maybe_use_numba(engine): numba_func, cache_key = generate_numba_func( func, engine_kwargs, kwargs, "groupby_transform" ) @@ -1395,7 +1392,7 @@ def _transform_general( for name, group in gen: object.__setattr__(group, "name", name) - if engine == "numba": + if maybe_use_numba(engine): values, index = split_for_numba(group) res = numba_func(values, index, *args) if cache_key not in NUMBA_FUNC_CACHE: @@ -1446,7 +1443,7 @@ def _transform_general( @Substitution(klass="DataFrame") @Appender(_transform_template) - def transform(self, func, *args, engine="cython", engine_kwargs=None, **kwargs): + def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): # optimized transforms func = self._get_cython_func(func) or func diff --git 
a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d039b715b3c08..65483abbd2a6e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -65,6 +65,7 @@ class providing the base-class of operations. from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter +from pandas.core.util.numba_ import maybe_use_numba _common_see_also = """ See Also @@ -286,9 +287,10 @@ class providing the base-class of operations. .. versionchanged:: 1.1.0 *args Positional arguments to pass to func -engine : str, default 'cython' +engine : str, default None * ``'cython'`` : Runs the function through C-extensions from cython. * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` .. versionadded:: 1.1.0 engine_kwargs : dict, default None @@ -393,9 +395,10 @@ class providing the base-class of operations. .. versionchanged:: 1.1.0 *args Positional arguments to pass to func -engine : str, default 'cython' +engine : str, default None * ``'cython'`` : Runs the function through C-extensions from cython. * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` .. 
versionadded:: 1.1.0 engine_kwargs : dict, default None @@ -1063,7 +1066,7 @@ def _python_agg_general( # agg_series below assumes ngroups > 0 continue - if engine == "numba": + if maybe_use_numba(engine): result, counts = self.grouper.agg_series( obj, func, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 74db87f46c5e2..3aaeef3b63760 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -58,6 +58,7 @@ from pandas.core.util.numba_ import ( NUMBA_FUNC_CACHE, generate_numba_func, + maybe_use_numba, split_for_numba, ) @@ -620,7 +621,7 @@ def agg_series( # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 - if engine == "numba": + if maybe_use_numba(engine): return self._aggregate_series_pure_python( obj, func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) @@ -678,7 +679,7 @@ def _aggregate_series_pure_python( **kwargs, ): - if engine == "numba": + if maybe_use_numba(engine): numba_func, cache_key = generate_numba_func( func, engine_kwargs, kwargs, "groupby_agg" ) @@ -691,7 +692,7 @@ def _aggregate_series_pure_python( splitter = get_splitter(obj, group_index, ngroups, axis=0) for label, group in splitter: - if engine == "numba": + if maybe_use_numba(engine): values, index = split_for_numba(group) res = numba_func(values, index, *args) if cache_key not in NUMBA_FUNC_CACHE: diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index c3f60ea7cc217..c9b7943478cdd 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -10,9 +10,22 @@ from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError +GLOBAL_USE_NUMBA: bool = False NUMBA_FUNC_CACHE: Dict[Tuple[Callable, str], Callable] = dict() +def maybe_use_numba(engine: Optional[str]) -> bool: + """Signal whether to use numba routines.""" + return engine == "numba" or (engine is None and GLOBAL_USE_NUMBA) + + +def set_use_numba(enable: bool = False) -> None: + 
global GLOBAL_USE_NUMBA + if enable: + import_optional_dependency("numba") + GLOBAL_USE_NUMBA = enable + + def check_kwargs_and_nopython( kwargs: Optional[Dict] = None, nopython: Optional[bool] = None ) -> None: diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 8cb53ebd92214..48953f6a75487 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -39,7 +39,7 @@ import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.indexes.api import Index, MultiIndex, ensure_index -from pandas.core.util.numba_ import NUMBA_FUNC_CACHE +from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba from pandas.core.window.common import ( WindowGroupByMixin, _doc_template, @@ -1298,10 +1298,11 @@ def count(self): objects instead. If you are just applying a NumPy reduction function this will achieve much better performance. - engine : str, default 'cython' + engine : str, default None * ``'cython'`` : Runs rolling apply through C-extensions from cython. * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` .. 
versionadded:: 1.0.0 @@ -1357,18 +1358,7 @@ def apply( if not is_bool(raw): raise ValueError("raw parameter must be `True` or `False`") - if engine == "cython": - if engine_kwargs is not None: - raise ValueError("cython engine does not accept engine_kwargs") - # Cython apply functions handle center, so don't need to use - # _apply's center handling - window = self._get_window() - offset = calculate_center_offset(window) if self.center else 0 - apply_func = self._generate_cython_apply_func( - args, kwargs, raw, offset, func - ) - center = False - elif engine == "numba": + if maybe_use_numba(engine): if raw is False: raise ValueError("raw must be `True` when using the numba engine") cache_key = (func, "rolling_apply") @@ -1380,6 +1370,17 @@ def apply( args, kwargs, func, engine_kwargs ) center = self.center + elif engine in ("cython", None): + if engine_kwargs is not None: + raise ValueError("cython engine does not accept engine_kwargs") + # Cython apply functions handle center, so don't need to use + # _apply's center handling + window = self._get_window() + offset = calculate_center_offset(window) if self.center else 0 + apply_func = self._generate_cython_apply_func( + args, kwargs, raw, offset, func + ) + center = False else: raise ValueError("engine must be either 'numba' or 'cython'") @@ -2053,13 +2054,7 @@ def count(self): @Substitution(name="rolling") @Appender(_shared_docs["apply"]) def apply( - self, - func, - raw=False, - engine="cython", - engine_kwargs=None, - args=None, - kwargs=None, + self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None, ): return super().apply( func, diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 726d79535184a..690694b0e66f5 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -4,7 +4,7 @@ from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td -from pandas 
import DataFrame +from pandas import DataFrame, option_context import pandas._testing as tm from pandas.core.util.numba_ import NUMBA_FUNC_CACHE @@ -113,3 +113,18 @@ def func_2(values, index): result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs) expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython") tm.assert_equal(result, expected) + + +@td.skip_if_no("numba", "0.46.0") +def test_use_global_config(): + def func_1(values, index): + return np.mean(values) - 3.4 + + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + ) + grouped = data.groupby(0) + expected = grouped.agg(func_1, engine="numba") + with option_context("compute.use_numba", True): + result = grouped.agg(func_1, engine=None) + tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 9a4015ac983c5..ee482571e644d 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -3,7 +3,7 @@ from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td -from pandas import DataFrame +from pandas import DataFrame, option_context import pandas._testing as tm from pandas.core.util.numba_ import NUMBA_FUNC_CACHE @@ -112,3 +112,18 @@ def func_2(values, index): result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs) expected = grouped.transform(lambda x: x + 1, engine="cython") tm.assert_equal(result, expected) + + +@td.skip_if_no("numba", "0.46.0") +def test_use_global_config(): + def func_1(values, index): + return values + 1 + + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + ) + grouped = data.groupby(0) + expected = grouped.transform(func_1, engine="numba") + with option_context("compute.use_numba", True): + result = grouped.transform(func_1, engine=None) + tm.assert_frame_equal(expected, result) 
diff --git a/pandas/tests/util/test_numba.py b/pandas/tests/util/test_numba.py new file mode 100644 index 0000000000000..27b68ff0f6044 --- /dev/null +++ b/pandas/tests/util/test_numba.py @@ -0,0 +1,12 @@ +import pytest + +import pandas.util._test_decorators as td + +from pandas import option_context + + +@td.skip_if_installed("numba") +def test_numba_not_installed_option_context(): + with pytest.raises(ImportError, match="Missing optional"): + with option_context("compute.use_numba", True): + pass diff --git a/pandas/tests/window/test_numba.py b/pandas/tests/window/test_numba.py index 7e049af0ca1f8..35bdb972a7bc0 100644 --- a/pandas/tests/window/test_numba.py +++ b/pandas/tests/window/test_numba.py @@ -3,7 +3,7 @@ import pandas.util._test_decorators as td -from pandas import Series +from pandas import Series, option_context import pandas._testing as tm from pandas.core.util.numba_ import NUMBA_FUNC_CACHE @@ -75,3 +75,15 @@ def func_2(x): ) expected = roll.apply(func_1, engine="cython", raw=True) tm.assert_series_equal(result, expected) + + +@td.skip_if_no("numba", "0.46.0") +def test_use_global_config(): + def f(x): + return np.mean(x) + 2 + + s = Series(range(10)) + with option_context("compute.use_numba", True): + result = s.rolling(2).apply(f, engine=None, raw=True) + expected = s.rolling(2).apply(f, engine="numba", raw=True) + tm.assert_series_equal(expected, result) From c91be7e6320883bbfea1bdebc3923a73e64c1515 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quang=20Nguy=E1=BB=85n?= <30631476+quangngd@users.noreply.github.com> Date: Wed, 15 Jul 2020 19:29:53 +0700 Subject: [PATCH 0362/1025] ENH: Add index option to_markdown() (#33091) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 15 ++++++++- pandas/core/series.py | 13 ++++++-- pandas/tests/io/formats/test_to_markdown.py | 35 +++++++++++++++++++++ 4 files changed, 61 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 
4b893bcd0c87d..814dbe999d5c1 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -335,6 +335,7 @@ Other enhancements - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`). - :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) - :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similarly to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). +- :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` now accept ``index`` argument as an alias for tabulate's ``showindex`` (:issue:`32667`) - :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable boolean dtype (:issue:`34859`) - :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cfe5621fec14e..0268d19e00b97 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2236,10 +2236,23 @@ def to_feather(self, path, **kwargs) -> None: """, ) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + self, + buf: Optional[IO[str]] = None, + mode: Optional[str] = None, + index: bool = True, + **kwargs, ) -> Optional[str]: + if "showindex" in kwargs: + warnings.warn( + "'showindex' is deprecated. Only 'index' will be used " + "in a future version. 
Use 'index' to silence this warning.", + FutureWarning, + stacklevel=2, + ) + kwargs.setdefault("headers", "keys") kwargs.setdefault("tablefmt", "pipe") + kwargs.setdefault("showindex", index) tabulate = import_optional_dependency("tabulate") result = tabulate.tabulate(self, **kwargs) if buf is None: diff --git a/pandas/core/series.py b/pandas/core/series.py index 9a633079b8c1d..ef3be854bc3bb 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1419,7 +1419,11 @@ def to_string( ), ) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + self, + buf: Optional[IO[str]] = None, + mode: Optional[str] = None, + index: bool = True, + **kwargs, ) -> Optional[str]: """ Print {klass} in Markdown-friendly format. @@ -1432,6 +1436,11 @@ def to_markdown( Buffer to write to. If None, the output is returned as a string. mode : str, optional Mode in which file is opened. + index : bool, optional, default True + Add index (row) labels. + + .. versionadded:: 1.1.0 + **kwargs These parameters will be passed to `tabulate \ `_. 
@@ -1467,7 +1476,7 @@ def to_markdown( | 3 | quetzal | +----+----------+ """ - return self.to_frame().to_markdown(buf, mode, **kwargs) + return self.to_frame().to_markdown(buf, mode, index, **kwargs) # ---------------------------------------------------------------------- diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py index 8893e4294353f..5223b313fef4f 100644 --- a/pandas/tests/io/formats/test_to_markdown.py +++ b/pandas/tests/io/formats/test_to_markdown.py @@ -3,6 +3,7 @@ import pytest import pandas as pd +import pandas._testing as tm pytest.importorskip("tabulate") @@ -53,3 +54,37 @@ def test_no_buf(capsys): assert ( result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" ) + + +@pytest.mark.parametrize("index", [True, False, None]) +@pytest.mark.parametrize("showindex", [True, False, None]) +def test_index(index, showindex): + # GH 32667 + kwargs = {} + if index is not None: + kwargs["index"] = index + if showindex is not None: + kwargs["showindex"] = showindex + + df = pd.DataFrame([1, 2, 3]) + yes_index_result = ( + "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |" + ) + no_index_result = "| 0 |\n|----:|\n| 1 |\n| 2 |\n| 3 |" + + warning = FutureWarning if "showindex" in kwargs else None + with tm.assert_produces_warning(warning): + result = df.to_markdown(**kwargs) + + if "showindex" in kwargs: + # give showindex higher priority if specified + if showindex: + expected = yes_index_result + else: + expected = no_index_result + else: + if index in [True, None]: + expected = yes_index_result + else: + expected = no_index_result + assert result == expected From 443605d1904c786a658fbd706be322be8d1ecbf9 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 15 Jul 2020 09:43:44 -0400 Subject: [PATCH 0363/1025] WIP CI: MacPython failing TestPandasContainer.test_to_json_large_numbers (#35184) --- pandas/tests/io/json/test_pandas.py | 21 
+++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 10f49b9b81528..97b53a6e66575 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1250,23 +1250,32 @@ def test_to_json_large_numbers(self, bigNum): json = series.to_json() expected = '{"articleId":' + str(bigNum) + "}" assert json == expected - # GH 20599 + + df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) + json = df.to_json() + expected = '{"0":{"articleId":' + str(bigNum) + "}}" + assert json == expected + + @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) + @pytest.mark.skipif(sys.maxsize <= 2 ** 32, reason="GH-35279") + def test_read_json_large_numbers(self, bigNum): + # GH20599 + + series = Series(bigNum, dtype=object, index=["articleId"]) + json = '{"articleId":' + str(bigNum) + "}" with pytest.raises(ValueError): json = StringIO(json) result = read_json(json) tm.assert_series_equal(series, result) df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0]) - json = df.to_json() - expected = '{"0":{"articleId":' + str(bigNum) + "}}" - assert json == expected - # GH 20599 + json = '{"0":{"articleId":' + str(bigNum) + "}}" with pytest.raises(ValueError): json = StringIO(json) result = read_json(json) tm.assert_frame_equal(df, result) - def test_read_json_large_numbers(self): + def test_read_json_large_numbers2(self): # GH18842 json = '{"articleId": "1404366058080022500245"}' json = StringIO(json) From 56b81ee3ef5e2e0ac23d978ca5f40c8c1d414273 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 15 Jul 2020 13:04:47 -0500 Subject: [PATCH 0364/1025] xfail 32-bit testsp (#35289) --- pandas/compat/__init__.py | 1 + pandas/tests/io/json/test_pandas.py | 4 ++-- pandas/tests/io/json/test_ujson.py | 1 + pandas/tests/resample/test_resampler_grouper.py | 3 ++- 4 files changed, 6 insertions(+), 3 
deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index f7bb73b916ce0..b5a1dc2b2fb94 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -18,6 +18,7 @@ PY38 = sys.version_info >= (3, 8) PY39 = sys.version_info >= (3, 9) PYPY = platform.python_implementation() == "PyPy" +IS64 = sys.maxsize > 2 ** 32 # ---------------------------------------------------------------------------- diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 97b53a6e66575..c4db0170ecc90 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -13,7 +13,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, DatetimeIndex, Series, Timestamp, read_json +from pandas import DataFrame, DatetimeIndex, Series, Timestamp, compat, read_json import pandas._testing as tm _seriesd = tm.getSeriesData() @@ -1257,7 +1257,7 @@ def test_to_json_large_numbers(self, bigNum): assert json == expected @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) - @pytest.mark.skipif(sys.maxsize <= 2 ** 32, reason="GH-35279") + @pytest.mark.skipif(not compat.IS64, reason="GH-35279") def test_read_json_large_numbers(self, bigNum): # GH20599 diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 952c583040360..f969cbca9f427 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -561,6 +561,7 @@ def test_encode_long_conversion(self): assert long_input == ujson.decode(output) @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) + @pytest.mark.xfail(not compat.IS64, reason="GH-35288") def test_dumps_ints_larger_than_maxsize(self, bigNum): # GH34395 bigNum = sys.maxsize + 1 diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index cbf3a778f9ae0..b36b11582c1ec 100644 --- 
a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -6,7 +6,7 @@ from pandas.util._test_decorators import async_mark import pandas as pd -from pandas import DataFrame, Series, Timestamp +from pandas import DataFrame, Series, Timestamp, compat import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -317,6 +317,7 @@ def test_resample_groupby_with_label(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(not compat.IS64, reason="GH-35148") def test_consistency_with_window(): # consistent return values with window From 14dd9e7f69d5f77ced9a26ad64c349dc65cbecae Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Wed, 15 Jul 2020 20:07:51 +0100 Subject: [PATCH 0365/1025] BUG/TST: Read from Public s3 Bucket Without Creds (#34877) * Public Bucket Read Test --- pandas/io/common.py | 34 +++++++++++++++++++++++++++++++--- pandas/tests/io/test_s3.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 51323c5ff3ef5..32ec088f00d88 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -202,9 +202,37 @@ def get_filepath_or_buffer( filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://") fsspec = import_optional_dependency("fsspec") - file_obj = fsspec.open( - filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) - ).open() + # If botocore is installed we fallback to reading with anon=True + # to allow reads from public buckets + err_types_to_retry_with_anon: List[Any] = [] + try: + import_optional_dependency("botocore") + from botocore.exceptions import ClientError, NoCredentialsError + + err_types_to_retry_with_anon = [ + ClientError, + NoCredentialsError, + PermissionError, + ] + except ImportError: + pass + + try: + file_obj = fsspec.open( + filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + ).open() + # GH 34626 Reads from Public Buckets without 
Credentials needs anon=True + except tuple(err_types_to_retry_with_anon): + if storage_options is None: + storage_options = {"anon": True} + else: + # don't mutate user input. + storage_options = dict(storage_options) + storage_options["anon"] = True + file_obj = fsspec.open( + filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + ).open() + return file_obj, encoding, compression, True if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index a76be9465f62a..5e0f7edf4d8ae 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -1,8 +1,12 @@ from io import BytesIO +import os import pytest +import pandas.util._test_decorators as td + from pandas import read_csv +import pandas._testing as tm def test_streaming_s3_objects(): @@ -15,3 +19,30 @@ def test_streaming_s3_objects(): for el in data: body = StreamingBody(BytesIO(el), content_length=len(el)) read_csv(body) + + +@tm.network +@td.skip_if_no("s3fs") +def test_read_without_creds_from_pub_bucket(): + # GH 34626 + # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt + result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3) + assert len(result) == 3 + + +@tm.network +@td.skip_if_no("s3fs") +def test_read_with_creds_from_pub_bucke(): + # Ensure we can read from a public bucket with credentials + # GH 34626 + # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt + + with tm.ensure_safe_environment_variables(): + # temporary workaround as moto fails for botocore >= 1.11 otherwise, + # see https://github.com/spulec/moto/issues/1924 & 1952 + os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") + os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") + df = read_csv( + "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None, + ) + assert len(df) == 5 From 92209c7b87685fef64dccbc4463fa4324b1dc7dc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: 
Wed, 15 Jul 2020 16:09:14 -0500 Subject: [PATCH 0366/1025] CI: xfail failing 32-bit tests (#35295) * CI: xfail failing 32-bit tests https://github.com/pandas-dev/pandas/issues/35294 --- pandas/tests/window/test_api.py | 5 +++-- pandas/tests/window/test_apply.py | 3 ++- pandas/tests/window/test_grouper.py | 8 +++++++- pandas/tests/window/test_timeseries_window.py | 3 +++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 2c3d8b4608806..28e27791cad35 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -6,7 +6,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Index, Series, Timestamp, concat +from pandas import DataFrame, Index, Series, Timestamp, compat, concat import pandas._testing as tm from pandas.core.base import SpecificationError @@ -277,7 +277,7 @@ def test_preserve_metadata(): @pytest.mark.parametrize( "func,window_size,expected_vals", [ - ( + pytest.param( "rolling", 2, [ @@ -289,6 +289,7 @@ def test_preserve_metadata(): [35.0, 40.0, 60.0, 40.0], [60.0, 80.0, 85.0, 80], ], + marks=pytest.mark.xfail(not compat.IS64, reason="GH-35294"), ), ( "expanding", diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index bc38634da8941..2aaf6af103e98 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -4,7 +4,7 @@ from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, compat, date_range import pandas._testing as tm @@ -142,6 +142,7 @@ def test_invalid_kwargs_nopython(): @pytest.mark.parametrize("args_kwargs", [[None, {"par": 10}], [(10,), None]]) +@pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_apply_args_kwargs(args_kwargs): # GH 33433 def 
foo(x, par): diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 5b2687271f9d6..744ca264e91d9 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Series +from pandas import DataFrame, Series, compat import pandas._testing as tm from pandas.core.groupby.groupby import get_groupby @@ -23,6 +23,7 @@ def test_mutated(self): g = get_groupby(self.frame, by="A", mutated=True) assert g.mutated + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_getitem(self): g = self.frame.groupby("A") g_mutated = get_groupby(self.frame, by="A", mutated=True) @@ -55,6 +56,7 @@ def test_getitem_multiple(self): result = r.B.count() tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling(self): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -72,6 +74,7 @@ def test_rolling(self): @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] ) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_quantile(self, interpolation): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -102,6 +105,7 @@ def func(x): expected = g.apply(func) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_apply(self, raw): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -111,6 +115,7 @@ def test_rolling_apply(self, raw): expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_apply_mutability(self): # GH 14013 df = pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) @@ -192,6 +197,7 @@ def test_expanding_apply(self, raw): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("expected_value,raw_value", 
[[1.0, True], [0.0, False]]) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling(self, expected_value, raw_value): # GH 31754 diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 8aa4d7103e48a..90f919d5565b0 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -7,6 +7,7 @@ MultiIndex, Series, Timestamp, + compat, date_range, to_datetime, ) @@ -656,6 +657,7 @@ def agg_by_day(x): tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_monotonic(self): # GH 15130 @@ -685,6 +687,7 @@ def test_groupby_monotonic(self): result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_non_monotonic(self): # GH 13966 (similar to #15130, closed by #15175) From d27e870ac75a03ff8de7515a6c4a91e4871ef15e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 15 Jul 2020 15:17:57 -0700 Subject: [PATCH 0367/1025] BUG: Use correct ExtensionArray reductions in DataFrame reductions (#35254) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/frame.py | 27 ++++++++++++++------ pandas/tests/arrays/integer/test_function.py | 9 +++++++ pandas/tests/frame/test_analytics.py | 23 +++++++++++++++++ 4 files changed, 52 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 814dbe999d5c1..36433a45ede9f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -916,6 +916,7 @@ Numeric - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) - Bug in :class:`DataFrame` and :class:`Series` addition and 
subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) +- Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0268d19e00b97..b63467f08cdaa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -118,6 +118,7 @@ from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor +from pandas.core.construction import extract_array from pandas.core.generic import NDFrame, _shared_docs from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, ensure_index, ensure_index_from_sequences @@ -8512,7 +8513,14 @@ def _count_level(self, level, axis=0, numeric_only=False): return result def _reduce( - self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds + self, + op, + name: str, + axis=0, + skipna=True, + numeric_only=None, + filter_type=None, + **kwds, ): assert filter_type is None or filter_type == "bool", filter_type @@ -8544,8 +8552,11 @@ def _reduce( labels = self._get_agg_axis(axis) constructor = self._constructor - def f(x): - return op(x, axis=axis, skipna=skipna, **kwds) + def func(values): + if is_extension_array_dtype(values.dtype): + return extract_array(values)._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=axis, skipna=skipna, **kwds) def _get_data(axis_matters): if filter_type is None: @@ -8592,7 +8603,7 @@ def blk_func(values): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out - if not self._is_homogeneous_type: + if not self._is_homogeneous_type or self._mgr.any_extension_types: # try to avoid self.values call if filter_type is None and axis == 0 and len(self) > 0: @@ -8612,7 +8623,7 @@ def blk_func(values): from 
pandas.core.apply import frame_apply opa = frame_apply( - self, func=f, result_type="expand", ignore_failures=True + self, func=func, result_type="expand", ignore_failures=True ) result = opa.get_result() if result.ndim == self.ndim: @@ -8624,7 +8635,7 @@ def blk_func(values): values = data.values try: - result = f(values) + result = func(values) except TypeError: # e.g. in nanops trying to convert strs to float @@ -8635,7 +8646,7 @@ def blk_func(values): values = data.values with np.errstate(all="ignore"): - result = f(values) + result = func(values) else: if numeric_only: @@ -8646,7 +8657,7 @@ def blk_func(values): else: data = self values = data.values - result = f(values) + result = func(values) if filter_type == "bool" and is_object_dtype(values) and axis is None: # work around https://github.com/numpy/numpy/issues/10489 diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index 44c3077228e80..a81434339fdae 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected): assert result == expected +@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"]) +def test_dataframe_reductions(op): + # https://github.com/pandas-dev/pandas/pull/32867 + # ensure the integers are not cast to float during reductions + df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")}) + result = df.max() + assert isinstance(result["a"], np.int64) + + # TODO(jreback) - these need testing / are broken # shift diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index db8bb5ca3c437..9d6b9f39a0578 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1303,3 +1303,26 @@ def test_preserve_timezone(self, initial: str, method): df = DataFrame([expected]) result = getattr(df, method)(axis=1) tm.assert_series_equal(result, expected) + + +def 
test_mixed_frame_with_integer_sum(): + # https://github.com/pandas-dev/pandas/issues/34520 + df = pd.DataFrame([["a", 1]], columns=list("ab")) + df = df.astype({"b": "Int64"}) + result = df.sum() + expected = pd.Series(["a", 1], index=["a", "b"]) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("numeric_only", [True, False, None]) +@pytest.mark.parametrize("method", ["min", "max"]) +def test_minmax_extensionarray(method, numeric_only): + # https://github.com/pandas-dev/pandas/issues/32651 + int64_info = np.iinfo("int64") + ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype()) + df = DataFrame({"Int64": ser}) + result = getattr(df, method)(numeric_only=numeric_only) + expected = Series( + [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object") + ) + tm.assert_series_equal(result, expected) From 9b138c57a3030fa82607aa675891ae2468dfff4c Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Thu, 16 Jul 2020 00:19:12 +0200 Subject: [PATCH 0368/1025] BUG: Fix droped result column in groupby with as_index False (#33247) --- doc/source/whatsnew/v1.1.0.rst | 37 +++++++++++++++++++ pandas/core/groupby/generic.py | 8 ++-- .../tests/groupby/aggregate/test_aggregate.py | 35 ++++++++++++++++++ 3 files changed, 76 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 36433a45ede9f..90534c00df621 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -640,6 +640,43 @@ The method :meth:`core.DataFrameGroupBy.size` would previously ignore ``as_index df.groupby("a", as_index=False).size() +.. 
_whatsnew_110.api_breaking.groupby_results_lost_as_index_false: + +:meth:`DataFrameGroupby.agg` lost results with ``as_index`` ``False`` when relabeling columns +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously :meth:`DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was +set to ``False`` and the result columns were relabeled. In this case he result values were replaced with +the previous index (:issue:`32240`). + +.. ipython:: python + + df = pd.DataFrame({"key": ["x", "y", "z", "x", "y", "z"], + "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}) + df + +*Previous behavior*: + +.. code-block:: ipython + + In [2]: grouped = df.groupby("key", as_index=False) + In [3]: result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + In [4]: result + Out[4]: + min_val + 0 x + 1 y + 2 z + +*New behavior*: + +.. ipython:: python + + grouped = df.groupby("key", as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + result + + .. 
_whatsnew_110.notable_bug_fixes.apply_applymap_first_once: apply and applymap on ``DataFrame`` evaluates first row/column only once diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b9a8f3d5a5176..1d14361757e4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -975,16 +975,16 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) [self._selected_obj.columns.name] * result.columns.nlevels ).droplevel(-1) - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) - result.index = np.arange(len(result)) - if relabeling: # used reordered index of columns result = result.iloc[:, order] result.columns = columns + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) + result.index = np.arange(len(result)) + return result._convert(datetime=True) agg = aggregate diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index bf465635c0085..40a20c8210052 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -795,6 +795,41 @@ def test_groupby_aggregate_empty_key_empty_return(): tm.assert_frame_equal(result, expected) +def test_grouby_agg_loses_results_with_as_index_false_relabel(): + # GH 32240: When the aggregate function relabels column names and + # as_index=False is specified, the results are dropped. 
+ + df = pd.DataFrame( + {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]} + ) + + grouped = df.groupby("key", as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + expected = pd.DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]}) + tm.assert_frame_equal(result, expected) + + +def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex(): + # GH 32240: When the aggregate function relabels column names and + # as_index=False is specified, the results are dropped. Check if + # multiindex is returned in the right order + + df = pd.DataFrame( + { + "key": ["x", "y", "x", "y", "x", "x"], + "key1": ["a", "b", "c", "b", "a", "c"], + "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75], + } + ) + + grouped = df.groupby(["key", "key1"], as_index=False) + result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min")) + expected = pd.DataFrame( + {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]} + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)] ) From 08c64320fc5d866eecb97d565c9548512f0ce744 Mon Sep 17 00:00:00 2001 From: avinashpancham <44933366+avinashpancham@users.noreply.github.com> Date: Thu, 16 Jul 2020 00:24:05 +0200 Subject: [PATCH 0369/1025] TST: Verify filtering operations on DataFrames with categorical Series (#35233) --- pandas/tests/frame/indexing/test_categorical.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py index cfc22b9b18729..314de5bdd8146 100644 --- a/pandas/tests/frame/indexing/test_categorical.py +++ b/pandas/tests/frame/indexing/test_categorical.py @@ -394,3 +394,14 @@ def test_loc_indexing_preserves_index_category_dtype(self): result = df.loc[["a"]].index.levels[0] tm.assert_index_equal(result, expected) + + def 
test_categorical_filtering(self): + # GH22609 Verify filtering operations on DataFrames with categorical Series + df = pd.DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"]) + df["b"] = df.b.astype("category") + + result = df.where(df.a > 0) + expected = df.copy() + expected.loc[0, :] = np.nan + + tm.assert_equal(result, expected) From af57febfccb33e3f1de486c0c19fcac75606f7c8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 16 Jul 2020 00:25:23 +0200 Subject: [PATCH 0370/1025] BUG/API: other object type check in Series/DataFrame.equals (#34402) --- doc/source/whatsnew/v1.1.0.rst | 2 ++ pandas/core/generic.py | 2 +- pandas/tests/frame/test_subclass.py | 8 ++++++++ pandas/tests/series/test_subclass.py | 8 ++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 90534c00df621..f62e78831616b 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1200,6 +1200,8 @@ Other - Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) - Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`) - :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) +- Bug in :meth:`DataFrame.equals` and :meth:`Series.equals` in allowing subclasses + to be equal (:issue:`34402`). 
- Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) - Bug in :class:`Tick` multiplication raising ``TypeError`` when multiplying by a float (:issue:`34486`) - Passing a `set` as `names` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index eb55369d83593..e46fde1f59f16 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1278,7 +1278,7 @@ def equals(self, other): >>> df.equals(different_data_type) False """ - if not isinstance(other, self._constructor): + if not (isinstance(other, type(self)) or isinstance(self, type(other))): return False return self._mgr.equals(other._mgr) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 08920cf7fceeb..2b462d5a10c51 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -696,3 +696,11 @@ def test_idxmax_preserves_subclass(self): df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) result = df.idxmax() assert isinstance(result, tm.SubclassedSeries) + + def test_equals_subclass(self): + # https://github.com/pandas-dev/pandas/pull/34402 + # allow subclass in both directions + df1 = pd.DataFrame({"a": [1, 2, 3]}) + df2 = tm.SubclassedDataFrame({"a": [1, 2, 3]}) + assert df1.equals(df2) + assert df2.equals(df1) diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py index a596ed49c1df2..86330b7cc6993 100644 --- a/pandas/tests/series/test_subclass.py +++ b/pandas/tests/series/test_subclass.py @@ -51,3 +51,11 @@ def test_explode(self): s = tm.SubclassedSeries([[1, 2, 3], "foo", [], [3, 4]]) result = s.explode() assert isinstance(result, tm.SubclassedSeries) + + def test_equals(self): + # 
https://github.com/pandas-dev/pandas/pull/34402 + # allow subclass in both directions + s1 = pd.Series([1, 2, 3]) + s2 = tm.SubclassedSeries([1, 2, 3]) + assert s1.equals(s2) + assert s2.equals(s1) From e79493225a5617faf969556faf65c496208b9dc1 Mon Sep 17 00:00:00 2001 From: Paul Sanders Date: Wed, 15 Jul 2020 21:31:45 -0400 Subject: [PATCH 0371/1025] TST: added tests for sparse and date range quantiles (#35236) * TST: added tests for sparse and date range quantiles * TST: updating quantile arguments --- pandas/tests/frame/methods/test_quantile.py | 39 ++++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 0eec30cbc5c67..0b8f1e0495155 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -7,14 +7,29 @@ class TestDataFrameQuantile: - def test_quantile_sparse(self): + @pytest.mark.parametrize( + "df,expected", + [ + [ + pd.DataFrame( + { + 0: pd.Series(pd.arrays.SparseArray([1, 2])), + 1: pd.Series(pd.arrays.SparseArray([3, 4])), + } + ), + pd.Series([1.5, 3.5], name=0.5), + ], + [ + pd.DataFrame(pd.Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")), + pd.Series([1.0], name=0.5), + ], + ], + ) + def test_quantile_sparse(self, df, expected): # GH#17198 - s = pd.Series(pd.arrays.SparseArray([1, 2])) - s1 = pd.Series(pd.arrays.SparseArray([3, 4])) - df = pd.DataFrame({0: s, 1: s1}) + # GH#24600 result = df.quantile() - expected = pd.Series([1.5, 3.5], name=0.5) tm.assert_series_equal(result, expected) def test_quantile(self, datetime_frame): @@ -59,6 +74,20 @@ def test_quantile(self, datetime_frame): expected = Series([3.0, 4.0], index=[0, 1], name=0.5) tm.assert_series_equal(result, expected) + def test_quantile_date_range(self): + # GH 2460 + + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + ser = pd.Series(dti) + df = pd.DataFrame(ser) + + result = 
df.quantile(numeric_only=False) + expected = pd.Series( + ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]" + ) + + tm.assert_series_equal(result, expected) + def test_quantile_axis_mixed(self): # mixed on axis=1 From 301617507c79a96363a1c3f8f10b554e7c53fbd8 Mon Sep 17 00:00:00 2001 From: Gabriel Tutui Date: Wed, 15 Jul 2020 22:51:18 -0300 Subject: [PATCH 0372/1025] Add date overflow message to tz_localize (#32967) (#35187) * Add date overflow message to tz_localize (#32967) * Delay evaluating timestamp --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/_libs/tslibs/conversion.pyx | 15 ++++++++++++--- pandas/tests/scalar/timestamp/test_timezones.py | 9 ++++++++- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f62e78831616b..97099ce73354f 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -922,6 +922,7 @@ Datetimelike resolution which converted to object dtype instead of coercing to ``datetime64[ns]`` dtype when within the timestamp bounds (:issue:`34843`). - The ``freq`` keyword in :class:`Period`, :func:`date_range`, :func:`period_range`, :func:`pd.tseries.frequencies.to_offset` no longer allows tuples, pass as string instead (:issue:`34703`) +- ``OutOfBoundsDatetime`` issues an improved error message when timestamp is out of implementation bounds. 
(:issue:`32967`) Timedelta ^^^^^^^^^ diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 85da7a60a029a..8cc3d25e86340 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -639,11 +639,20 @@ cdef inline check_overflows(_TSObject obj): # GH#12677 if obj.dts.year == 1677: if not (obj.value < 0): - raise OutOfBoundsDatetime + from pandas._libs.tslibs.timestamps import Timestamp + fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} " + f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}") + raise OutOfBoundsDatetime( + f"Converting {fmt} underflows past {Timestamp.min}" + ) elif obj.dts.year == 2262: if not (obj.value > 0): - raise OutOfBoundsDatetime - + from pandas._libs.tslibs.timestamps import Timestamp + fmt = (f"{obj.dts.year}-{obj.dts.month:02d}-{obj.dts.day:02d} " + f"{obj.dts.hour:02d}:{obj.dts.min:02d}:{obj.dts.sec:02d}") + raise OutOfBoundsDatetime( + f"Converting {fmt} overflows past {Timestamp.max}" + ) # ---------------------------------------------------------------------- # Localization diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py index 83764aa184392..f05f2054b2483 100644 --- a/pandas/tests/scalar/timestamp/test_timezones.py +++ b/pandas/tests/scalar/timestamp/test_timezones.py @@ -21,9 +21,12 @@ class TestTimestampTZOperations: # Timestamp.tz_localize def test_tz_localize_pushes_out_of_bounds(self): - msg = "^$" # GH#12677 # tz_localize that pushes away from the boundary is OK + msg = ( + f"Converting {Timestamp.min.strftime('%Y-%m-%d %H:%M:%S')} " + f"underflows past {Timestamp.min}" + ) pac = Timestamp.min.tz_localize("US/Pacific") assert pac.value > Timestamp.min.value pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value @@ -31,6 +34,10 @@ def test_tz_localize_pushes_out_of_bounds(self): Timestamp.min.tz_localize("Asia/Tokyo") # tz_localize that pushes away from the boundary is OK + 
msg = ( + f"Converting {Timestamp.max.strftime('%Y-%m-%d %H:%M:%S')} " + f"overflows past {Timestamp.max}" + ) tokyo = Timestamp.max.tz_localize("Asia/Tokyo") assert tokyo.value < Timestamp.max.value tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value From 4987066b268e874a5e403df14da1694fd46d1c22 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 16 Jul 2020 10:55:13 +0100 Subject: [PATCH 0373/1025] BUG: Inconsistent behavior in Index.difference (#35231) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/indexes/numeric.py | 22 ----------------- pandas/tests/indexes/test_numeric.py | 37 ++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 97099ce73354f..9f0318bfdb895 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -954,6 +954,7 @@ Numeric - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) +- Bug in :meth:`Index.difference` incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`) - Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`) Conversion diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 5020a25c88ff4..731907993d08f 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -400,28 +400,6 @@ def _format_native_types( ) return formatter.get_result_as_array() - def equals(self, other) -> bool: - """ - Determines if two Index objects contain the same elements. 
- """ - if self is other: - return True - - if not isinstance(other, Index): - return False - - # need to compare nans locations and make sure that they are the same - # since nans don't compare equal this is a bit tricky - try: - if not isinstance(other, Float64Index): - other = self._constructor(other) - if not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape: - return False - left, right = self._values, other._values - return ((left == right) | (self._isnan & other._isnan)).all() - except (TypeError, ValueError): - return False - def __contains__(self, other: Any) -> bool: hash(other) if super().__contains__(other): diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 33de0800658f2..a7c5734ef9b02 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -239,6 +239,19 @@ def test_equals_numeric(self): i2 = Float64Index([1.0, np.nan]) assert i.equals(i2) + @pytest.mark.parametrize( + "other", + ( + Int64Index([1, 2]), + Index([1.0, 2.0], dtype=object), + Index([1, 2], dtype=object), + ), + ) + def test_equals_numeric_other_index_type(self, other): + i = Float64Index([1.0, 2.0]) + assert i.equals(other) + assert other.equals(i) + @pytest.mark.parametrize( "vals", [ @@ -635,3 +648,27 @@ def test_uint_index_does_not_convert_to_float64(): tm.assert_index_equal(result.index, expected) tm.assert_equal(result, series[:3]) + + +def test_float64_index_equals(): + # https://github.com/pandas-dev/pandas/issues/35217 + float_index = pd.Index([1.0, 2, 3]) + string_index = pd.Index(["1", "2", "3"]) + + result = float_index.equals(string_index) + assert result is False + + result = string_index.equals(float_index) + assert result is False + + +def test_float64_index_difference(): + # https://github.com/pandas-dev/pandas/issues/35217 + float_index = pd.Index([1.0, 2, 3]) + string_index = pd.Index(["1", "2", "3"]) + + result = float_index.difference(string_index) + 
tm.assert_index_equal(result, float_index) + + result = string_index.difference(float_index) + tm.assert_index_equal(result, string_index) From f7473c61a66d933bbdf6d3f122f08b7da0c77a83 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 Jul 2020 06:17:06 -0500 Subject: [PATCH 0374/1025] Fix indexing, reindex on all-sparse SparseArray. (#35287) --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/arrays/sparse/array.py | 19 +++++++++------ pandas/core/internals/blocks.py | 5 +--- pandas/tests/arrays/sparse/test_array.py | 5 ++++ pandas/tests/extension/base/getitem.py | 28 ---------------------- pandas/tests/extension/test_sparse.py | 5 ---- pandas/tests/frame/indexing/test_sparse.py | 20 ++++++++++++++++ 7 files changed, 39 insertions(+), 45 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 9f0318bfdb895..de3a05a2ccdfb 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1163,7 +1163,7 @@ Sparse - Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`) - Bug in :meth:`arrays.SparseArray.from_spmatrix` wrongly read scipy sparse matrix (:issue:`31991`) - Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`) -- Bug where :class:`DataFrame` containing :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) +- Bug where :class:`DataFrame` containing an all-sparse :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. 
Previously it used ``fill_value``'s string representation (:issue:`34352`) - Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`) - Bug in :meth:`arrays.SparseArray` was returning the incorrect type when indexing a sparse dataframe with an iterable (:issue:`34526`, :issue:`34540`) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index b18a58da3950f..1d675b54a9c62 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -862,21 +862,26 @@ def _take_with_fill(self, indices, fill_value=None) -> np.ndarray: else: raise IndexError("cannot do a non-empty take from an empty axes.") + # sp_indexer may be -1 for two reasons + # 1.) we took for an index of -1 (new) + # 2.) we took a value that was self.fill_value (old) sp_indexer = self.sp_index.lookup_array(indices) + new_fill_indices = indices == -1 + old_fill_indices = (sp_indexer == -1) & ~new_fill_indices - if self.sp_index.npoints == 0: + if self.sp_index.npoints == 0 and old_fill_indices.all(): + # We've looked up all valid points on an all-sparse array. + taken = np.full( + sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype + ) + + elif self.sp_index.npoints == 0: # Avoid taking from the empty self.sp_values _dtype = np.result_type(self.dtype.subtype, type(fill_value)) taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype) else: taken = self.sp_values.take(sp_indexer) - # sp_indexer may be -1 for two reasons - # 1.) we took for an index of -1 (new) - # 2.) we took a value that was self.fill_value (old) - new_fill_indices = indices == -1 - old_fill_indices = (sp_indexer == -1) & ~new_fill_indices - # Fill in two steps. 
# Old fill values # New fill values diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6a4b3318d3aa7..cc0f09ced7399 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1636,10 +1636,7 @@ def _holder(self): @property def fill_value(self): # Used in reindex_indexer - if is_sparse(self.values): - return self.values.dtype.fill_value - else: - return self.values.dtype.na_value + return self.values.dtype.na_value @property def _can_hold_na(self): diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index d0cdec712f39d..04215bfe1bedb 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -281,6 +281,11 @@ def test_take(self): exp = SparseArray(np.take(self.arr_data, [0, 1, 2])) tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp) + def test_take_all_empty(self): + a = pd.array([0, 0], dtype=pd.SparseDtype("int64")) + result = a.take([0, 1], allow_fill=True, fill_value=np.nan) + tm.assert_sp_array_equal(a, result) + def test_take_fill_value(self): data = np.array([1, np.nan, 0, 3, 0]) sparse = SparseArray(data, fill_value=0) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 5d0ea69007e27..251376798efc3 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -399,31 +399,3 @@ def test_item(self, data): with pytest.raises(ValueError, match=msg): s.item() - - def test_boolean_mask_frame_fill_value(self, data): - # https://github.com/pandas-dev/pandas/issues/27781 - df = pd.DataFrame({"A": data}) - - mask = np.random.choice([True, False], df.shape[0]) - result = pd.isna(df.iloc[mask]["A"]) - expected = pd.isna(df["A"].iloc[mask]) - self.assert_series_equal(result, expected) - - mask = pd.Series(mask, index=df.index) - result = pd.isna(df.loc[mask]["A"]) - expected = pd.isna(df["A"].loc[mask]) - 
self.assert_series_equal(result, expected) - - def test_fancy_index_frame_fill_value(self, data): - # https://github.com/pandas-dev/pandas/issues/29563 - df = pd.DataFrame({"A": data}) - - mask = np.random.choice(df.shape[0], df.shape[0]) - result = pd.isna(df.iloc[mask]["A"]) - expected = pd.isna(df["A"].iloc[mask]) - self.assert_series_equal(result, expected) - - mask = pd.Series(mask, index=df.index) - result = pd.isna(df.loc[mask]["A"]) - expected = pd.isna(df["A"].loc[mask]) - self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 68e521b005c02..b411ca1c482a4 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -41,11 +41,6 @@ def data_for_twos(request): return SparseArray(np.ones(100) * 2) -@pytest.fixture(params=[0, np.nan]) -def data_zeros(request): - return SparseArray(np.zeros(100, dtype=int), fill_value=request.param) - - @pytest.fixture(params=[0, np.nan]) def data_missing(request): """Length 2 array with [NA, Valid]""" diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py index 876fbe212c466..04e1c8b94c4d9 100644 --- a/pandas/tests/frame/indexing/test_sparse.py +++ b/pandas/tests/frame/indexing/test_sparse.py @@ -49,3 +49,23 @@ def test_locindexer_from_spmatrix(self, spmatrix_t, dtype): result = df.loc[itr_idx].dtypes.values expected = np.full(cols, SparseDtype(dtype, fill_value=0)) tm.assert_numpy_array_equal(result, expected) + + def test_reindex(self): + # https://github.com/pandas-dev/pandas/issues/35286 + df = pd.DataFrame( + {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))} + ) + result = df.reindex([0, 2]) + expected = pd.DataFrame( + { + "A": [0.0, np.nan], + "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)), + }, + index=[0, 2], + ) + tm.assert_frame_equal(result, expected) + + def test_all_sparse(self): + df = pd.DataFrame({"A": 
pd.array([0, 0], dtype=pd.SparseDtype("int64"))}) + result = df.loc[[0, 1]] + tm.assert_frame_equal(result, df) From 81ae0a9ccc9f74269ef7847e553b7f4b8fee0890 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 Jul 2020 08:04:53 -0500 Subject: [PATCH 0375/1025] TST: xfail more 32-bits (#35304) --- pandas/tests/window/test_rolling.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 8d72e2cb92ca9..bea239a245a4f 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -7,7 +7,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Series, date_range +from pandas import DataFrame, Series, compat, date_range import pandas._testing as tm from pandas.core.window import Rolling @@ -150,6 +150,7 @@ def test_closed_one_entry(func): @pytest.mark.parametrize("func", ["min", "max"]) +@pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_closed_one_entry_groupby(func): # GH24718 ser = pd.DataFrame( @@ -682,6 +683,7 @@ def test_iter_rolling_datetime(expected, expected_index, window): ), ], ) +@pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_positional_argument(grouping, _index, raw): # GH 34605 From cc5f4b8791343797f2f0b7cdac4c71a10a4fc8d0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 Jul 2020 09:56:08 -0500 Subject: [PATCH 0376/1025] Revert BUG: Ensure same index is returned for slow and fast path in groupby.apply #31613 (#35306) xref https://github.com/pandas-dev/pandas/pull/34998 --- doc/source/whatsnew/v1.1.0.rst | 4 ---- pandas/_libs/reduction.pyx | 2 +- pandas/tests/groupby/test_apply.py | 8 ++++++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index de3a05a2ccdfb..cee3680b4bf65 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ 
-1114,10 +1114,6 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) -- Bug in :meth:`core.groupby.DataFrameGroupBy.apply` where the output index shape for functions returning a DataFrame which is equally indexed - to the input DataFrame is inconsistent. An internal heuristic to detect index mutation would behave differently for equal but not identical - indices. In particular, the result index shape might change if a copy of the input would be returned. - The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`) - Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. 
(:issue:`34422`) - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 97c491776f831..a01e0c5705dcf 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -366,7 +366,7 @@ def apply_frame_axis0(object frame, object f, object names, # Need to infer if low level index slider will cause segfaults require_slow_apply = i == 0 and piece is chunk try: - if not piece.index.equals(chunk.index): + if not piece.index is chunk.index: mutated = True except AttributeError: # `piece` might not have an index, could be e.g. an int diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index aa10f44670361..5a1268bfb03db 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -211,6 +211,7 @@ def test_group_apply_once_per_group2(capsys): assert result == expected +@pytest.mark.xfail(reason="GH-34998") def test_apply_fast_slow_identical(): # GH 31613 @@ -234,9 +235,11 @@ def fast(group): "func", [ lambda x: x, - lambda x: x[:], + pytest.param(lambda x: x[:], marks=pytest.mark.xfail(reason="GH-34998")), lambda x: x.copy(deep=False), - lambda x: x.copy(deep=True), + pytest.param( + lambda x: x.copy(deep=True), marks=pytest.mark.xfail(reason="GH-34998") + ), ], ) def test_groupby_apply_identity_maybecopy_index_identical(func): @@ -997,6 +1000,7 @@ def test_apply_function_with_indexing_return_column(): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(reason="GH-34998") def test_apply_with_timezones_aware(): # GH: 27212 From d620a10cc5e36576379e7104364f9c8fe67521bd Mon Sep 17 00:00:00 2001 From: vivikelapoutre <31180320+vivikelapoutre@users.noreply.github.com> Date: Thu, 16 Jul 2020 18:08:54 +0200 Subject: [PATCH 0377/1025] BUG: Groupby 
with as_index=True causes incorrect summarization (#34906) * add test * PR comments * attempt to make the code cleaner --- pandas/tests/groupby/test_function.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 6f19ec40c2520..e693962e57ac3 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -85,6 +85,24 @@ def test_max_min_non_numeric(): assert "ss" in result +def test_min_date_with_nans(): + # GH26321 + dates = pd.to_datetime( + pd.Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d" + ).dt.date + df = pd.DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates}) + + result = df.groupby("b", as_index=False)["c"].min()["c"] + expected = pd.to_datetime( + pd.Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d" + ).dt.date + tm.assert_series_equal(result, expected) + + result = df.groupby("b")["c"].min() + expected.index.name = "b" + tm.assert_series_equal(result, expected) + + def test_intercept_builtin_sum(): s = Series([1.0, 2.0, np.nan, 3.0]) grouped = s.groupby([0, 1, 2, 2]) From d48738520f76e44ac3d2d1ba4ecd4a5396c69379 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 Jul 2020 14:38:59 -0500 Subject: [PATCH 0378/1025] TST: Remove deprecated use of apply_index (#35298) --- .../offsets/test_offsets_properties.py | 29 ------------------- 1 file changed, 29 deletions(-) diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index 81465e733da85..ca14b202ef888 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -12,7 +12,6 @@ from hypothesis import assume, given, strategies as st from hypothesis.extra.dateutil import timezones as dateutil_timezones from hypothesis.extra.pytz import timezones as pytz_timezones -import 
pytest import pandas as pd from pandas import Timestamp @@ -95,34 +94,6 @@ def test_on_offset_implementations(dt, offset): assert offset.is_on_offset(dt) == (compare == dt) -@pytest.mark.xfail( - reason="res_v2 below is incorrect, needs to use the " - "commented-out version with tz_localize. " - "But with that fix in place, hypothesis then " - "has errors in timezone generation." -) -@given(gen_yqm_offset, gen_date_range) -def test_apply_index_implementations(offset, rng): - # offset.apply_index(dti)[i] should match dti[i] + offset - assume(offset.n != 0) # TODO: test for that case separately - - # rng = pd.date_range(start='1/1/2000', periods=100000, freq='T') - ser = pd.Series(rng) - - res = rng + offset - res_v2 = offset.apply_index(rng) - # res_v2 = offset.apply_index(rng.tz_localize(None)).tz_localize(rng.tz) - assert (res == res_v2).all() - - assert res[0] == rng[0] + offset - assert res[-1] == rng[-1] + offset - res2 = ser + offset - # apply_index is only for indexes, not series, so no res2_v2 - assert res2.iloc[0] == ser.iloc[0] + offset - assert res2.iloc[-1] == ser.iloc[-1] + offset - # TODO: Check randomly assorted entries, not just first/last - - @given(gen_yqm_offset) def test_shift_across_dst(offset): # GH#18319 check that 1) timezone is correctly normalized and From 6277a8f44476c59d9b4b46e6d95f77c2f298a56a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 Jul 2020 17:43:07 -0500 Subject: [PATCH 0379/1025] pin numpy (#35312) --- environment.yml | 3 ++- requirements-dev.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 32ff8c91cb69c..53106906a52cb 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,8 @@ channels: - conda-forge dependencies: # required - - numpy>=1.15 + # Pin numpy<1.19 until MPL 3.3.0 is released. 
+ - numpy>=1.15,<1.19.0 - python=3 - python-dateutil>=2.7.3 - pytz diff --git a/requirements-dev.txt b/requirements-dev.txt index 3cda38d4b72f5..1ec998ffa72d4 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # This file is auto-generated from environment.yml, do not modify. # See that file for comments about the need/usage of each dependency. -numpy>=1.15 +numpy>=1.15,<1.19.0 python-dateutil>=2.7.3 pytz asv From 475142a9f6d086529ac035bf7ac57408c0475e41 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 16 Jul 2020 23:44:18 +0100 Subject: [PATCH 0380/1025] CLN: consistent EA._reduce signatures (#35308) --- pandas/core/arrays/base.py | 2 +- pandas/core/arrays/categorical.py | 4 ++-- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/sparse/array.py | 2 +- pandas/core/arrays/string_.py | 2 +- pandas/tests/extension/arrow/arrays.py | 4 ++-- pandas/tests/extension/decimal/array.py | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 32a2a30fcfd43..2553a65aed07b 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1120,7 +1120,7 @@ def _concat_same_type( # of objects _can_hold_na = True - def _reduce(self, name, skipna=True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): """ Return a scalar result of performing the reduction operation. 
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1fedfa70cc469..db9cfd9d7fc59 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2076,11 +2076,11 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: return result # reduction ops # - def _reduce(self, name, axis=0, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): func = getattr(self, name, None) if func is None: raise TypeError(f"Categorical cannot perform the operation {name}") - return func(**kwargs) + return func(skipna=skipna, **kwargs) @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") def min(self, skipna=True, **kwargs): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a306268cd8ede..ee4d43fdb3bc2 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1552,7 +1552,7 @@ def __isub__(self, other): # -------------------------------------------------------------- # Reductions - def _reduce(self, name, axis=0, skipna=True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): op = getattr(self, name, None) if op: return op(skipna=skipna, **kwargs) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 1d675b54a9c62..d8db196e4b92f 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1164,7 +1164,7 @@ def nonzero(self): # Reductions # ------------------------------------------------------------------------ - def _reduce(self, name, skipna=True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): method = getattr(self, name, None) if method is None: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5104e3f12f5b4..fddd3af858f77 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -291,7 +291,7 @@ def astype(self, dtype, copy=True): 
return super().astype(dtype, copy) - def _reduce(self, name, skipna=True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): if name in ["min", "max"]: return getattr(self, name)(skipna=skipna) diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index 29cfe1e0fe606..8a18f505058bc 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -162,14 +162,14 @@ def _concat_same_type(cls, to_concat): def __invert__(self): return type(self).from_scalars(~self._data.to_pandas()) - def _reduce(self, method, skipna=True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): if skipna: arr = self[~self.isna()] else: arr = self try: - op = getattr(arr, method) + op = getattr(arr, name) except AttributeError as err: raise TypeError from err return op(**kwargs) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 4d5be75ff8200..2fbeec8dd8378 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -174,7 +174,7 @@ def _formatter(self, boxed=False): def _concat_same_type(cls, to_concat): return cls(np.concatenate([x._data for x in to_concat])) - def _reduce(self, name, skipna=True, **kwargs): + def _reduce(self, name: str, skipna: bool = True, **kwargs): if skipna: # If we don't have any NAs, we can ignore skipna From 466a04b3a70d7025c30b5e43df434c6c0f854736 Mon Sep 17 00:00:00 2001 From: Karthik Mathur <22126205+mathurk1@users.noreply.github.com> Date: Thu, 16 Jul 2020 17:45:57 -0500 Subject: [PATCH 0381/1025] TST add test for dtype consistency with pd replace #23305 (#35234) --- pandas/tests/frame/methods/test_replace.py | 80 ++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ea72a3d8fef4d..a3f056dbf9648 100644 --- 
a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1493,3 +1493,83 @@ def test_replace_period_ignore_float(self): result = df.replace(1.0, 0.0) expected = pd.DataFrame({"Per": [pd.Period("2020-01")] * 3}) tm.assert_frame_equal(expected, result) + + def test_replace_value_category_type(self): + """ + Test for #23305: to ensure category dtypes are maintained + after replace with direct values + """ + + # create input data + input_dict = { + "col1": [1, 2, 3, 4], + "col2": ["a", "b", "c", "d"], + "col3": [1.5, 2.5, 3.5, 4.5], + "col4": ["cat1", "cat2", "cat3", "cat4"], + "col5": ["obj1", "obj2", "obj3", "obj4"], + } + # explicitly cast columns as category and order them + input_df = pd.DataFrame(data=input_dict).astype( + {"col2": "category", "col4": "category"} + ) + input_df["col2"] = input_df["col2"].cat.reorder_categories( + ["a", "b", "c", "d"], ordered=True + ) + input_df["col4"] = input_df["col4"].cat.reorder_categories( + ["cat1", "cat2", "cat3", "cat4"], ordered=True + ) + + # create expected dataframe + expected_dict = { + "col1": [1, 2, 3, 4], + "col2": ["a", "b", "c", "z"], + "col3": [1.5, 2.5, 3.5, 4.5], + "col4": ["cat1", "catX", "cat3", "cat4"], + "col5": ["obj9", "obj2", "obj3", "obj4"], + } + # explicitly cast columns as category and order them + expected = pd.DataFrame(data=expected_dict).astype( + {"col2": "category", "col4": "category"} + ) + expected["col2"] = expected["col2"].cat.reorder_categories( + ["a", "b", "c", "z"], ordered=True + ) + expected["col4"] = expected["col4"].cat.reorder_categories( + ["cat1", "catX", "cat3", "cat4"], ordered=True + ) + + # replace values in input dataframe + input_df = input_df.replace("d", "z") + input_df = input_df.replace("obj1", "obj9") + result = input_df.replace("cat2", "catX") + + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail( + reason="category dtype gets changed to object type after replace, see #35268", + strict=True, + ) + def 
test_replace_dict_category_type(self, input_category_df, expected_category_df): + """ + Test to ensure category dtypes are maintained + after replace with dict values + """ + + # create input dataframe + input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]} + # explicitly cast columns as category + input_df = pd.DataFrame(data=input_dict).astype( + {"col1": "category", "col2": "category", "col3": "category"} + ) + + # create expected dataframe + expected_dict = {"col1": ["z"], "col2": ["obj9"], "col3": ["catX"]} + # explicitly cast columns as category + expected = pd.DataFrame(data=expected_dict).astype( + {"col1": "category", "col2": "category", "col3": "category"} + ) + + # replace values in input dataframe using a dict + result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) + + tm.assert_frame_equal(result, expected) From 5fb3add1c9f9460c941dd7aca2de513c3482b94a Mon Sep 17 00:00:00 2001 From: rhshadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 16 Jul 2020 18:48:29 -0400 Subject: [PATCH 0382/1025] BUG: DataFrameGroupBy.quantile raises for non-numeric dtypes rather than dropping columns (#34756) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/groupby.py | 17 ++++++++++++++--- pandas/tests/groupby/test_quantile.py | 8 ++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index cee3680b4bf65..7b28eb79e433c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1118,6 +1118,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) +- Bug in 
:meth:`core.groupby.DataFrameGroupBy.quantile` raises ``TypeError`` for non-numeric types rather than dropping columns (:issue:`27892`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 65483abbd2a6e..ac45222625569 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2403,7 +2403,7 @@ def _get_cythonized_result( signature needs_2d : bool, default False Whether the values and result of the Cython call signature - are at least 2-dimensional. + are 2-dimensional. min_count : int, default None When not None, min_count for the Cython call needs_mask : bool, default False @@ -2419,7 +2419,9 @@ def _get_cythonized_result( Function should return a tuple where the first element is the values to be passed to Cython and the second element is an optional type which the values should be converted to after being returned - by the Cython operation. Raises if `needs_values` is False. + by the Cython operation. This function is also responsible for + raising a TypeError if the values have an invalid type. Raises + if `needs_values` is False. post_processing : function, default None Function to be applied to result of Cython function. 
Should accept an array of values as the first argument and type inferences as its @@ -2451,6 +2453,7 @@ def _get_cythonized_result( output: Dict[base.OutputKey, np.ndarray] = {} base_func = getattr(libgroupby, how) + error_msg = "" for idx, obj in enumerate(self._iterate_slices()): name = obj.name values = obj._values @@ -2477,7 +2480,11 @@ def _get_cythonized_result( if needs_values: vals = values if pre_processing: - vals, inferences = pre_processing(vals) + try: + vals, inferences = pre_processing(vals) + except TypeError as e: + error_msg = str(e) + continue if needs_2d: vals = vals.reshape((-1, 1)) vals = vals.astype(cython_dtype, copy=False) @@ -2509,6 +2516,10 @@ def _get_cythonized_result( key = base.OutputKey(label=name, position=idx) output[key] = result + # error_msg is "" on a frame/series with no rows or columns + if len(output) == 0 and error_msg != "": + raise TypeError(error_msg) + if aggregate: return self._wrap_aggregated_output(output) else: diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 8cfd8035502c3..9338742195bfe 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -232,3 +232,11 @@ def test_groupby_quantile_nullable_array(values, q): expected = pd.Series(true_quantiles * 2, index=idx, name="b") tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]]) +def test_groupby_quantile_skips_invalid_dtype(q): + df = pd.DataFrame({"a": [1], "b": [2.0], "c": ["x"]}) + result = df.groupby("a").quantile(q) + expected = df.groupby("a")[["b"]].quantile(q) + tm.assert_frame_equal(result, expected) From e0d4faee3bca6174c60ed61fb26aa9f9f3084675 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 16 Jul 2020 17:49:13 -0500 Subject: [PATCH 0383/1025] Fixed reindexing arith with duplicates (#35303) Closes https://github.com/pandas-dev/pandas/issues/35194 --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/ops/__init__.py
| 23 +++++++++++++++++++---- pandas/tests/frame/test_arithmetic.py | 9 +++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 7b28eb79e433c..6434fe4042ffc 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -953,6 +953,7 @@ Numeric - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`) - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) +- Bug in arithmetic operations between ``DataFrame`` objects with non-overlapping columns with duplicate labels causing an infinite loop (:issue:`35194`) - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) - Bug in :meth:`Index.difference` incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`) - Bug in :class:`DataFrame` reductions (e.g. 
``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 5dd94a8af74ac..60f3d23aaed13 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -17,6 +17,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna +from pandas.core import algorithms from pandas.core.construction import extract_array from pandas.core.ops.array_ops import ( arithmetic_op, @@ -562,10 +563,12 @@ def _frame_arith_method_with_reindex( DataFrame """ # GH#31623, only operate on shared columns - cols = left.columns.intersection(right.columns) + cols, lcols, rcols = left.columns.join( + right.columns, how="inner", level=None, return_indexers=True + ) - new_left = left[cols] - new_right = right[cols] + new_left = left.iloc[:, lcols] + new_right = right.iloc[:, rcols] result = op(new_left, new_right) # Do the join on the columns instead of using _align_method_FRAME @@ -573,7 +576,19 @@ def _frame_arith_method_with_reindex( join_columns, _, _ = left.columns.join( right.columns, how="outer", level=None, return_indexers=True ) - return result.reindex(join_columns, axis=1) + + if result.columns.has_duplicates: + # Avoid reindexing with a duplicate axis. 
+ # https://github.com/pandas-dev/pandas/issues/35194 + indexer, _ = result.columns.get_indexer_non_unique(join_columns) + indexer = algorithms.unique1d(indexer) + result = result._reindex_with_indexers( + {1: [join_columns, indexer]}, allow_dups=True + ) + else: + result = result.reindex(join_columns, axis=1) + + return result def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index a6b0ece58b095..e17357e9845b5 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1552,3 +1552,12 @@ def test_dataframe_operation_with_non_numeric_types(df, col_dtype): expected = expected.astype({"b": col_dtype}) result = df + pd.Series([-1.0], index=list("a")) tm.assert_frame_equal(result, expected) + + +def test_arith_reindex_with_duplicates(): + # https://github.com/pandas-dev/pandas/issues/35194 + df1 = pd.DataFrame(data=[[0]], columns=["second"]) + df2 = pd.DataFrame(data=[[0, 0, 0]], columns=["first", "second", "second"]) + result = df1 + df2 + expected = pd.DataFrame([[np.nan, 0, 0]], columns=["first", "second", "second"]) + tm.assert_frame_equal(result, expected) From 630e2737e3784a3c1aee005f386fd6e0c59c2a31 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 16 Jul 2020 23:50:38 +0100 Subject: [PATCH 0384/1025] BUG: DataFrame.append with empty DataFrame and Series with tz-aware datetime value allocated object column (#35038) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/dtypes/concat.py | 4 ++-- pandas/core/internals/concat.py | 2 +- pandas/tests/reshape/test_concat.py | 29 ++++++++++++++++++----------- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 6434fe4042ffc..df41b24ee5097 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -922,6 +922,7 @@ Datetimelike resolution which 
converted to object dtype instead of coercing to ``datetime64[ns]`` dtype when within the timestamp bounds (:issue:`34843`). - The ``freq`` keyword in :class:`Period`, :func:`date_range`, :func:`period_range`, :func:`pd.tseries.frequencies.to_offset` no longer allows tuples, pass as string instead (:issue:`34703`) +- Bug in :meth:`DataFrame.append` when appending a :class:`Series` containing a scalar tz-aware :class:`Timestamp` to an empty :class:`DataFrame` resulted in an object column instead of datetime64[ns, tz] dtype (:issue:`35038`) - ``OutOfBoundsDatetime`` issues an improved error message when timestamp is out of implementation bounds. (:issue:`32967`) Timedelta diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 4b7c818f487ac..9902016475b22 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -152,11 +152,11 @@ def is_nonempty(x) -> bool: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] - if isinstance(to_concat[0], ExtensionArray): + if isinstance(to_concat[0], ExtensionArray) and axis == 0: cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: - return np.concatenate(to_concat) + return np.concatenate(to_concat, axis=axis) elif _contains_datetime or "timedelta" in typs: return concat_datetime(to_concat, axis=axis, typs=typs) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 2cc7461986c8f..2c0d4931a7bf2 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -333,7 +333,7 @@ def _concatenate_join_units(join_units, concat_axis, copy): # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] - concat_values = concat_compat(to_concat, axis=concat_axis) + concat_values = 
concat_compat(to_concat, axis=0) if not isinstance(concat_values, ExtensionArray): # if the result of concat is not an EA but an ndarray, reshape to # 2D to put it a non-EA Block diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index ffeb5ff0f8aaa..0159fabd04d59 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1087,20 +1087,27 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc()) s = Series({"date": date, "a": 1.0, "b": 2.0}) df = DataFrame(columns=["c", "d"]) - result = df.append(s, ignore_index=True) - # n.b. it's not clear to me that expected is correct here. - # It's possible that the `date` column should have - # datetime64[ns, tz] dtype for both result and expected. - # that would be more consistent with new columns having - # their own dtype (float for a and b, datetime64ns, tz for date). + result_a = df.append(s, ignore_index=True) expected = DataFrame( - [[np.nan, np.nan, 1.0, 2.0, date]], - columns=["c", "d", "a", "b", "date"], - dtype=object, + [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"] ) # These columns get cast to object after append - expected["a"] = expected["a"].astype(float) - expected["b"] = expected["b"].astype(float) + expected["c"] = expected["c"].astype(object) + expected["d"] = expected["d"].astype(object) + tm.assert_frame_equal(result_a, expected) + + expected = DataFrame( + [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"] + ) + expected["c"] = expected["c"].astype(object) + expected["d"] = expected["d"].astype(object) + + result_b = result_a.append(s, ignore_index=True) + tm.assert_frame_equal(result_b, expected) + + # column order is different + expected = expected[["c", "d", "date", "a", "b"]] + result = df.append([s, s], ignore_index=True) tm.assert_frame_equal(result, expected) From ecc42e1e280af0eaf045dffb1d56727cb1c140bb 
Mon Sep 17 00:00:00 2001 From: Matthias Bussonnier Date: Fri, 17 Jul 2020 03:25:50 -0700 Subject: [PATCH 0385/1025] DOC: extra closing parens make example invalid. (#35316) --- pandas/_libs/lib.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 37d83a73c6597..5ecbb2c3ffd35 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -983,7 +983,7 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: False >>> is_list_like(np.array([2])) True - >>> is_list_like(np.array(2))) + >>> is_list_like(np.array(2)) False """ return c_is_list_like(obj, allow_sets) From bd9832d022ac967be6b0d6ebdea93fb4791520cc Mon Sep 17 00:00:00 2001 From: Joseph Gulian Date: Fri, 17 Jul 2020 06:37:13 -0400 Subject: [PATCH 0386/1025] TST: Adds CategoricalIndex DataFrame from_records test (GH32805) (#35055) * TST: Adds CategoricalIndex DataFrame from_records test (GH32805) * TST: Reformats CategoricalIndex DataFrame from_records test * TST: Removes comma from CategoricalIndex DataFrame from_records test * TST: Isorts tests/frame/test_constructor.py imports --- pandas/tests/frame/test_constructors.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1631342c359c1..a4ed548264d39 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -19,6 +19,7 @@ import pandas as pd from pandas import ( Categorical, + CategoricalIndex, DataFrame, Index, Interval, @@ -2509,6 +2510,18 @@ def test_from_records_series_list_dict(self): result = DataFrame.from_records(data) tm.assert_frame_equal(result, expected) + def test_from_records_series_categorical_index(self): + # GH 32805 + index = CategoricalIndex( + [pd.Interval(-20, -10), pd.Interval(-10, 0), pd.Interval(0, 10)] + ) + series_of_dicts = pd.Series([{"a": 1}, {"a": 2}, {"b": 3}], index=index) + frame = 
pd.DataFrame.from_records(series_of_dicts, index=index) + expected = DataFrame( + {"a": [1, 2, np.NaN], "b": [np.NaN, np.NaN, 3]}, index=index + ) + tm.assert_frame_equal(frame, expected) + def test_frame_from_records_utc(self): rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)} From 8fefe76cbd01da28e554584b33e7afb8ef10eb17 Mon Sep 17 00:00:00 2001 From: willbowditch <14288042+willbowditch@users.noreply.github.com> Date: Fri, 17 Jul 2020 11:58:29 +0100 Subject: [PATCH 0387/1025] Infer compression if file extension is uppercase (#35164) * Infer compression even if file extension is uppercase * Add uppercase extensions to infer compression tests * Update whatsnew * Add PR number --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/io/common.py | 2 +- pandas/tests/io/test_common.py | 12 +++++++++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index df41b24ee5097..bdb844ded59b7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1086,6 +1086,7 @@ I/O - Bug in :meth:`~HDFStore.create_table` now raises an error when `column` argument was not specified in `data_columns` on input (:issue:`28156`) - :meth:`read_json` now could read line-delimited json file from a file url while `lines` and `chunksize` are set. - Bug in :meth:`DataFrame.to_sql` when reading DataFrames with ``-np.inf`` entries with MySQL now has a more explicit ``ValueError`` (:issue:`34431`) +- Bug where capitalised files extensions were not decompressed by read_* functions (:issue:`35164`) - Bug in :meth:`read_excel` that was raising a ``TypeError`` when ``header=None`` and ``index_col`` given as list (:issue:`31783`) - Bug in "meth"`read_excel` where datetime values are used in the header in a `MultiIndex` (:issue:`34748`) - :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) diff --git a/pandas/io/common.py b/pandas/io/common.py index 32ec088f00d88..bd77a1e69c138 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -339,7 +339,7 @@ def infer_compression( # Infer compression from the filename/URL extension for compression, extension in _compression_to_extension.items(): - if filepath_or_buffer.endswith(extension): + if filepath_or_buffer.lower().endswith(extension): return compression return None diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e2f4ae04c1f9f..dde38eb55ea7f 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -87,7 +87,17 @@ def test_stringify_path_fspath(self): @pytest.mark.parametrize( "extension,expected", - [("", None), (".gz", "gzip"), (".bz2", "bz2"), (".zip", "zip"), (".xz", "xz")], + [ + ("", None), + (".gz", "gzip"), + (".bz2", "bz2"), + (".zip", "zip"), + (".xz", "xz"), + (".GZ", "gzip"), + (".BZ2", "bz2"), + (".ZIP", "zip"), + (".XZ", "xz"), + ], ) @pytest.mark.parametrize("path_type", path_types) def test_infer_compression_from_path(self, extension, expected, path_type): From 7fda1f7d7f07558cf38a88d96d0e59e5c0bfda99 Mon Sep 17 00:00:00 2001 From: rhshadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 17 Jul 2020 08:33:04 -0400 Subject: [PATCH 0388/1025] CLN/DOC: DataFrame.to_parquet supports file-like objects (#35235) --- pandas/core/frame.py | 21 +++++++++++++-------- pandas/io/parquet.py | 29 +++++++++++++++++------------ 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b63467f08cdaa..f52341ed782d8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -19,6 +19,7 @@ IO, TYPE_CHECKING, Any, + AnyStr, Dict, FrozenSet, Hashable, @@ -2266,11 +2267,11 
@@ def to_markdown( @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") def to_parquet( self, - path, - engine="auto", - compression="snappy", - index=None, - partition_cols=None, + path: FilePathOrBuffer[AnyStr], + engine: str = "auto", + compression: Optional[str] = "snappy", + index: Optional[bool] = None, + partition_cols: Optional[List[str]] = None, **kwargs, ) -> None: """ @@ -2283,9 +2284,12 @@ def to_parquet( Parameters ---------- - path : str - File path or Root Directory path. Will be used as Root Directory - path while writing a partitioned dataset. + path : str or file-like object + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handler + (e.g. via builtin open function) or io.BytesIO. The engine + fastparquet does not accept file-like objects. .. versionchanged:: 1.0.0 @@ -2312,6 +2316,7 @@ def to_parquet( partition_cols : list, optional, default None Column names by which to partition the dataset. Columns are partitioned in the order they are given. + Must be None if path is not a string. .. 
versionadded:: 0.24.0 diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a0c9242684f0f..8c4b63767ac06 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -1,8 +1,9 @@ """ parquet compat """ -from typing import Any, Dict, Optional +from typing import Any, AnyStr, Dict, List, Optional from warnings import catch_warnings +from pandas._typing import FilePathOrBuffer from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError @@ -85,10 +86,10 @@ def __init__(self): def write( self, df: DataFrame, - path, - compression="snappy", + path: FilePathOrBuffer[AnyStr], + compression: Optional[str] = "snappy", index: Optional[bool] = None, - partition_cols=None, + partition_cols: Optional[List[str]] = None, **kwargs, ): self.validate_dataframe(df) @@ -213,11 +214,11 @@ def read(self, path, columns=None, **kwargs): def to_parquet( df: DataFrame, - path, + path: FilePathOrBuffer[AnyStr], engine: str = "auto", - compression="snappy", + compression: Optional[str] = "snappy", index: Optional[bool] = None, - partition_cols=None, + partition_cols: Optional[List[str]] = None, **kwargs, ): """ @@ -226,9 +227,12 @@ def to_parquet( Parameters ---------- df : DataFrame - path : str - File path or Root Directory path. Will be used as Root Directory path - while writing a partitioned dataset. + path : str or file-like object + If a string, it will be used as Root Directory path + when writing a partitioned dataset. By file-like object, + we refer to objects with a write() method, such as a file handler + (e.g. via builtin open function) or io.BytesIO. The engine + fastparquet does not accept file-like objects. .. versionchanged:: 0.24.0 @@ -251,8 +255,9 @@ def to_parquet( .. versionadded:: 0.24.0 partition_cols : str or list, optional, default None - Column names by which to partition the dataset - Columns are partitioned in the order they are given + Column names by which to partition the dataset. 
+ Columns are partitioned in the order they are given. + Must be None if path is not a string. .. versionadded:: 0.24.0 From 843fcec06c7612a20c27146f471225189d791d90 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jul 2020 08:25:27 -0500 Subject: [PATCH 0389/1025] CI: Skip test for 3.7.0 exactly (#35310) xref https://github.com/pandas-dev/pandas/issues/35309 --- pandas/tests/io/json/test_json_table_schema.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index df64af6ac2265..22b4ec189a0f1 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -1,6 +1,7 @@ """Tests for Table Schema integration.""" from collections import OrderedDict import json +import sys import numpy as np import pytest @@ -671,6 +672,7 @@ class TestTableOrientReader: {"bools": [True, False, False, True]}, ], ) + @pytest.mark.skipif(sys.version_info[:3] == (3, 7, 0), reason="GH-35309") def test_read_json_table_orient(self, index_nm, vals, recwarn): df = DataFrame(vals, index=pd.Index(range(4), name=index_nm)) out = df.to_json(orient="table") From 803bd342edd6dd33b81c7227251d2a573b6e2604 Mon Sep 17 00:00:00 2001 From: Saul Shanabrook Date: Fri, 17 Jul 2020 12:24:14 -0400 Subject: [PATCH 0390/1025] Remove deprecated warn kwarg for matplotlib use (#35323) The warn param has been deprecated in 3.2.1: https://matplotlib.org/3.2.1/api/prev_api_changes/api_changes_3.2.0.html#matplotlib-use --- pandas/util/_test_decorators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 25394dc6775d8..a4a1d83177c50 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -94,7 +94,7 @@ def safe_import(mod_name: str, min_version: Optional[str] = None): def _skip_if_no_mpl(): mod = safe_import("matplotlib") if mod: - mod.use("Agg", 
warn=True) + mod.use("Agg") else: return True From 1461f95a4122dc01b13f3b5c136014fb3a22a96d Mon Sep 17 00:00:00 2001 From: Steffen Schmitz Date: Fri, 17 Jul 2020 18:26:46 +0200 Subject: [PATCH 0391/1025] BUG: asymmetric error bars for series (GH9536) (#34514) --- doc/source/user_guide/visualization.rst | 2 +- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/plotting/_matplotlib/core.py | 16 +++++++++++++++- pandas/tests/plotting/test_series.py | 20 ++++++++++++++++++++ 4 files changed, 37 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 27826e7cde9e1..5bc87bca87211 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -1425,7 +1425,7 @@ Horizontal and vertical error bars can be supplied to the ``xerr`` and ``yerr`` * As a ``str`` indicating which of the columns of plotting :class:`DataFrame` contain the error values. * As raw values (``list``, ``tuple``, or ``np.ndarray``). Must be the same length as the plotting :class:`DataFrame`/:class:`Series`. -Asymmetrical error bars are also supported, however raw error values must be provided in this case. For a ``M`` length :class:`Series`, a ``Mx2`` array should be provided indicating lower and upper (or left and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors should be in a ``Mx2xN`` array. +Asymmetrical error bars are also supported, however raw error values must be provided in this case. For a ``N`` length :class:`Series`, a ``2xN`` array should be provided indicating lower and upper (or left and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors should be in a ``Mx2xN`` array. Here is an example of one way to easily plot group means with standard deviations from the raw data. 
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bdb844ded59b7..d74c1bca61de8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -340,6 +340,7 @@ Other enhancements - :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`) +- :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for `yerr` and/or `xerr`, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) .. --------------------------------------------------------------------------- diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index e510f7140519a..353bc8a8936a5 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -770,6 +770,12 @@ def _parse_errorbars(self, label, err): DataFrame/dict: error values are paired with keys matching the key in the plotted DataFrame str: the name of the column within the plotted DataFrame + + Asymmetrical error bars are also supported, however raw error values + must be provided in this case. For a ``N`` length :class:`Series`, a + ``2xN`` array should be provided indicating lower and upper (or left + and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors + should be in a ``Mx2xN`` array. 
""" if err is None: return None @@ -810,7 +816,15 @@ def match_labels(data, e): err_shape = err.shape # asymmetrical error bars - if err.ndim == 3: + if isinstance(self.data, ABCSeries) and err_shape[0] == 2: + err = np.expand_dims(err, 0) + err_shape = err.shape + if err_shape[2] != len(self.data): + raise ValueError( + "Asymmetrical error bars should be provided " + f"with the shape (2, {len(self.data)})" + ) + elif isinstance(self.data, ABCDataFrame) and err.ndim == 3: if ( (err_shape[0] != self.nseries) or (err_shape[1] != 2) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 64da98f57676f..316ca6ce91af7 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -729,6 +729,26 @@ def test_dup_datetime_index_plot(self): s = Series(values, index=index) _check_plot_works(s.plot) + def test_errorbar_asymmetrical(self): + # GH9536 + s = Series(np.arange(10), name="x") + err = np.random.rand(2, 10) + + ax = s.plot(yerr=err, xerr=err) + + result = np.vstack([i.vertices[:, 1] for i in ax.collections[1].get_paths()]) + expected = (err.T * np.array([-1, 1])) + s.to_numpy().reshape(-1, 1) + tm.assert_numpy_array_equal(result, expected) + + msg = ( + "Asymmetrical error bars should be provided " + f"with the shape \\(2, {len(s)}\\)" + ) + with pytest.raises(ValueError, match=msg): + s.plot(yerr=np.random.rand(2, 11)) + + tm.close() + @pytest.mark.slow def test_errorbar_plot(self): From b3c0abbdebeefc67b7c8fc378c547bc37da9a633 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jul 2020 11:30:40 -0500 Subject: [PATCH 0392/1025] Doc fixups (#35327) --- doc/source/reference/groupby.rst | 1 + doc/source/reference/window.rst | 1 + doc/source/whatsnew/v1.1.0.rst | 95 +++++++++++++++----------------- 3 files changed, 46 insertions(+), 51 deletions(-) diff --git a/doc/source/reference/groupby.rst b/doc/source/reference/groupby.rst index 76cb53559f334..ccf130d03418c 100644 --- 
a/doc/source/reference/groupby.rst +++ b/doc/source/reference/groupby.rst @@ -128,6 +128,7 @@ The following methods are available only for ``SeriesGroupBy`` objects. .. autosummary:: :toctree: api/ + SeriesGroupBy.hist SeriesGroupBy.nlargest SeriesGroupBy.nsmallest SeriesGroupBy.nunique diff --git a/doc/source/reference/window.rst b/doc/source/reference/window.rst index d7e6405a3732b..611c0e0f7f160 100644 --- a/doc/source/reference/window.rst +++ b/doc/source/reference/window.rst @@ -86,3 +86,4 @@ Base class for defining custom window boundaries. api.indexers.BaseIndexer api.indexers.FixedForwardWindowIndexer + api.indexers.VariableOffsetWindowIndexer diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index d74c1bca61de8..b9ef1aae9a801 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -17,7 +17,7 @@ Enhancements KeyErrors raised by loc specify missing labels ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously, if labels were missing for a loc call, a KeyError was raised stating that this was no longer supported. +Previously, if labels were missing for a ``.loc`` call, a KeyError was raised stating that this was no longer supported. Now the error message also includes a list of the missing labels (max 10 items, display width 80 characters). See :issue:`34272`. @@ -124,8 +124,6 @@ compatibility (:issue:`3729`) The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. -.. versionadded:: 1.1.0 - .. _whatsnew_110.key_sorting: @@ -281,17 +279,17 @@ Other enhancements - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. 
- Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`) - :meth:`~DataFrame.describe` now includes a ``datetime_is_numeric`` keyword to control how datetime columns are summarized (:issue:`30164`, :issue:`34798`) -- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) -- :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) -- When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) -- `OptionError` is now exposed in `pandas.errors` (:issue:`27553`) -- Add :meth:`ExtensionArray.argmax` and :meth:`ExtensionArray.argmin` (:issue:`24382`) +- :class:`~pandas.io.formats.style.Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) +- :meth:`~pandas.io.formats.style.Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) +- When writing directly to a sqlite connection :meth:`DataFrame.to_sql` now supports the ``multi`` method (:issue:`29921`) +- :class:`pandas.errors.OptionError` is now exposed in ``pandas.errors`` (:issue:`27553`) +- Add :meth:`api.extensions.ExtensionArray.argmax` and :meth:`api.extensions.ExtensionArray.argmin` (:issue:`24382`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). 
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) -- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- :class:`Series.dt` and :class:`DatatimeIndex` now have an `isocalendar` method that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). +- :meth:`Index.union` will now raise ``RuntimeWarning`` for :class:`MultiIndex` objects if the object inside are unsortable. Pass ``sort=False`` to suppress this warning (:issue:`33015`) +- Added :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). - The :meth:`DataFrame.to_feather` method now supports additional keyword arguments (e.g. to set the compression) that are added in pyarrow 0.17 (:issue:`33422`). @@ -305,32 +303,31 @@ Other enhancements - :meth:`melt` has gained an ``ignore_index`` (default ``True``) argument that, if set to ``False``, prevents the method from dropping the index (:issue:`17440`). 
- :meth:`Series.update` now accepts objects that can be coerced to a :class:`Series`, such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`) -- :meth:`~pandas.core.groupby.GroupBy.transform` and :meth:`~pandas.core.groupby.GroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) +- :meth:`~pandas.core.groupby.DataFrameGroupBy.transform` and :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) - :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`) -- :class:`~pandas.core.groupby.generic.DataFrameGroupBy` and :class:`~pandas.core.groupby.generic.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`) +- :class:`~pandas.core.groupby.DataFrameGroupBy` and :class:`~pandas.core.groupby.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within groups (:issue:`31775`) - :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`) -- The ``ExtensionArray`` class has now an :meth:`~pandas.arrays.ExtensionArray.equals` - method, similarly to :meth:`Series.equals` (:issue:`27081`). -- The minimum suppported dta version has increased to 105 in :meth:`~pandas.io.stata.read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`). -- :meth:`~pandas.core.frame.DataFrame.to_stata` supports compression using the ``compression`` +- Added :meth:`api.extensions.ExtensionArray.equals` to the extension array interface, similar to :meth:`Series.equals` (:issue:`27081`). 
+- The minimum supported dta version has increased to 105 in :meth:`read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`). +- :meth:`~DataFrame.to_stata` supports compression using the ``compression`` keyword argument. Compression can either be inferred or explicitly set using a string or a dictionary containing both the method and any additional arguments that are passed to the compression library. Compression was also added to the low-level Stata-file writers :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). -- :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). +- :meth:`HDFStore.put` now accepts a ``track_times`` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). - :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts `xlabel` and `ylabel` parameters to present labels on x and y axis (:issue:`9093`). -- Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`) +- Make :class:`pandas.core.window.rolling.Rolling` and :class:`pandas.core.window.expanding.Expanding` iterable(:issue:`11704`) - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) -- :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). -- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`). -- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. 
(:issue:`6279`) +- :meth:`~pandas.core.groupby.DataFrameGroupBy.groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). +- :meth:`read_json` now accepts an ``nrows`` parameter. (:issue:`33916`). +- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example combining a nullable integer column with a numpy integer column will no longer result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`, :issue:`34095`). -- :meth:`~pandas.io.gbq.read_gbq` now allows to disable progress bar (:issue:`33360`). -- :meth:`~pandas.io.gbq.read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). +- :meth:`read_gbq` now allows to disable progress bar (:issue:`33360`). +- :meth:`read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). - :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`). - :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) @@ -351,7 +348,7 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. 
-``MultiIndex.get_indexer`` interprets `method` argument differently +``MultiIndex.get_indexer`` interprets ``method`` argument correctly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This restores the behavior of :meth:`MultiIndex.get_indexer` with ``method='backfill'`` or ``method='pad'`` to the behavior before pandas 0.23.0. In particular, MultiIndexes are treated as a list of tuples and padding or backfilling is done with respect to the ordering of these lists of tuples (:issue:`29896`). @@ -411,8 +408,6 @@ And the differences in reindexing ``df`` with ``mi_2`` and using ``method='pad'` df.reindex(mi_2, method='pad') -- - .. _whatsnew_110.notable_bug_fixes.indexing_raises_key_errors: Failed Label-Based Lookups Always Raise KeyError @@ -522,8 +517,10 @@ those integer keys is not present in the first level of the index (:issue:`33539 .. ipython:: python - left_df = pd.DataFrame({'animal': ['dog', 'pig'], 'max_speed': [40, 11]}) - right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], 'max_speed': [80, 11]}) + left_df = pd.DataFrame({'animal': ['dog', 'pig'], + 'max_speed': [40, 11]}) + right_df = pd.DataFrame({'animal': ['quetzal', 'pig'], + 'max_speed': [80, 11]}) left_df right_df @@ -622,7 +619,7 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma df.groupby("a", as_index=False).nunique() -The method :meth:`core.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a `DataFrame` instead of a `Series`. (:issue:`32599`) +The method :meth:`~pandas.core.groupby.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a `DataFrame` instead of a `Series`. (:issue:`32599`) *Previous behavior*: @@ -643,11 +640,11 @@ The method :meth:`core.DataFrameGroupBy.size` would previously ignore ``as_index .. 
_whatsnew_110.api_breaking.groupby_results_lost_as_index_false: -:meth:`DataFrameGroupby.agg` lost results with ``as_index`` ``False`` when relabeling columns -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:meth:`~pandas.core.groupby.DataFrameGroupby.agg` lost results with ``as_index=False`` when relabeling columns +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously :meth:`DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was -set to ``False`` and the result columns were relabeled. In this case he result values were replaced with +Previously :meth:`~pandas.core.groupby.DataFrameGroupby.agg` lost the result columns, when the ``as_index`` option was +set to ``False`` and the result columns were relabeled. In this case the result values were replaced with the previous index (:issue:`32240`). .. ipython:: python @@ -812,7 +809,7 @@ Deprecations positional arguments is deprecated since version 1.1. All other arguments should be given as keyword arguments (:issue:`27573`). -- Passing any arguments but `path_or_buf` (the first one) to +- Passing any arguments but ``path_or_buf`` (the first one) to :func:`read_json` as positional arguments is deprecated since version 1.1. All other arguments should be given as keyword arguments (:issue:`27573`). @@ -821,23 +818,20 @@ Deprecations positional arguments is deprecated since version 1.1. All other arguments should be given as keyword arguments (:issue:`27573`). 
-- :func:`pandas.api.types.is_categorical` is deprecated and will be removed in a future version; use `:func:pandas.api.types.is_categorical_dtype` instead (:issue:`33385`) +- :func:`pandas.api.types.is_categorical` is deprecated and will be removed in a future version; use :func:`pandas.api.types.is_categorical_dtype` instead (:issue:`33385`) - :meth:`Index.get_value` is deprecated and will be removed in a future version (:issue:`19728`) - :meth:`Series.dt.week` and `Series.dt.weekofyear` are deprecated and will be removed in a future version, use :meth:`Series.dt.isocalendar().week` instead (:issue:`33595`) -- :meth:`DatetimeIndex.week` and `DatetimeIndex.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeIndex.isocalendar().week` instead (:issue:`33595`) -- :meth:`DatetimeArray.week` and `DatetimeArray.weekofyear` are deprecated and will be removed in a future version, use :meth:`DatetimeArray.isocalendar().week` instead (:issue:`33595`) +- :meth:`DatetimeIndex.week` and ``DatetimeIndex.weekofyear`` are deprecated and will be removed in a future version, use ``DatetimeIndex.isocalendar().week`` instead (:issue:`33595`) +- :meth:`DatetimeArray.week` and ``DatetimeArray.weekofyear`` are deprecated and will be removed in a future version, use ``DatetimeArray.isocalendar().week`` instead (:issue:`33595`) - :meth:`DateOffset.__call__` is deprecated and will be removed in a future version, use ``offset + other`` instead (:issue:`34171`) -- :meth:`~BusinessDay.apply_index` is deprecated and will be removed in a future version. Use ``offset + other`` instead (:issue:`34580`) +- :meth:`~pandas.tseries.offsets.BusinessDay.apply_index` is deprecated and will be removed in a future version. 
Use ``offset + other`` instead (:issue:`34580`) - :meth:`DataFrame.tshift` and :meth:`Series.tshift` are deprecated and will be removed in a future version, use :meth:`DataFrame.shift` and :meth:`Series.shift` instead (:issue:`11631`) - Indexing an :class:`Index` object with a float key is deprecated, and will raise an ``IndexError`` in the future. You can manually convert to an integer key instead (:issue:`34191`). -- The ``squeeze`` keyword in the ``groupby`` function is deprecated and will be removed in a future version (:issue:`32380`) -- The ``tz`` keyword in :meth:`Period.to_timestamp` is deprecated and will be removed in a future version; use `per.to_timestamp(...).tz_localize(tz)`` instead (:issue:`34522`) +- The ``squeeze`` keyword in :meth:`~DataFrame.groupby` is deprecated and will be removed in a future version (:issue:`32380`) +- The ``tz`` keyword in :meth:`Period.to_timestamp` is deprecated and will be removed in a future version; use ``per.to_timestamp(...).tz_localize(tz)`` instead (:issue:`34522`) - :meth:`DatetimeIndex.to_perioddelta` is deprecated and will be removed in a future version. Use ``index - index.to_period(freq).to_timestamp()`` instead (:issue:`34853`) -- :meth:`util.testing.assert_almost_equal` now accepts both relative and absolute - precision through the ``rtol``, and ``atol`` parameters, thus deprecating the - ``check_less_precise`` parameter. (:issue:`13357`). 
- :func:`DataFrame.melt` accepting a value_name that already exists is deprecated, and will be removed in a future version (:issue:`34731`) - the ``center`` keyword in the :meth:`DataFrame.expanding` function is deprecated and will be removed in a future version (:issue:`20647`) @@ -930,10 +924,10 @@ Timedelta ^^^^^^^^^ - Bug in constructing a :class:`Timedelta` with a high precision integer that would round the :class:`Timedelta` components (:issue:`31354`) -- Bug in dividing ``np.nan`` or ``None`` by :class:`Timedelta`` incorrectly returning ``NaT`` (:issue:`31869`) +- Bug in dividing ``np.nan`` or ``None`` by :class:`Timedelta` incorrectly returning ``NaT`` (:issue:`31869`) - Timedeltas now understand ``µs`` as identifier for microsecond (:issue:`32899`) - :class:`Timedelta` string representation now includes nanoseconds, when nanoseconds are non-zero (:issue:`9309`) -- Bug in comparing a :class:`Timedelta`` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`) +- Bug in comparing a :class:`Timedelta` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`) - Bug in :func:`timedelta_range` that produced an extra point on a edge case (:issue:`30353`, :issue:`33498`) - Bug in :meth:`DataFrame.resample` that produced an extra point on a edge case (:issue:`30353`, :issue:`13022`, :issue:`33498`) - Bug in :meth:`DataFrame.resample` that ignored the ``loffset`` argument when dealing with timedelta (:issue:`7687`, :issue:`33498`) @@ -972,12 +966,11 @@ Strings - Bug in the :meth:`~Series.astype` method when converting "string" dtype data to nullable integer dtype (:issue:`32450`). - Fixed issue where taking ``min`` or ``max`` of a ``StringArray`` or ``Series`` with ``StringDtype`` type would raise. 
(:issue:`31746`) - Bug in :meth:`Series.str.cat` returning ``NaN`` output when other had :class:`Index` type (:issue:`33425`) -- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. +- :func:`pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. Interval ^^^^^^^^ - Bug in :class:`IntervalArray` incorrectly allowing the underlying data to be changed when setting values (:issue:`32782`) -- Indexing ^^^^^^^^ @@ -1002,8 +995,8 @@ Indexing - Bug in :meth:`DataFrame.copy` _item_cache not invalidated after copy causes post-copy value updates to not be reflected (:issue:`31784`) - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` throwing an error when a ``datetime64[ns, tz]`` value is provided (:issue:`32395`) - Bug in `Series.__getitem__` with an integer key and a :class:`MultiIndex` with leading integer level failing to raise ``KeyError`` if the key is not present in the first level (:issue:`33355`) -- Bug in :meth:`DataFrame.iloc` when slicing a single column-:class:`DataFrame`` with ``ExtensionDtype`` (e.g. ``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`) -- Bug in :meth:`DatetimeIndex.insert` and :meth:`TimedeltaIndex.insert` causing index ``freq`` to be lost when setting an element into an empty :class:`Series` (:issue:33573`) +- Bug in :meth:`DataFrame.iloc` when slicing a single column-:class:`DataFrame` with ``ExtensionDtype`` (e.g. 
``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`) +- Bug in :meth:`DatetimeIndex.insert` and :meth:`TimedeltaIndex.insert` causing index ``freq`` to be lost when setting an element into an empty :class:`Series` (:issue:`33573`) - Bug in :meth:`Series.__setitem__` with an :class:`IntervalIndex` and a list-like key of integers (:issue:`33473`) - Bug in :meth:`Series.__getitem__` allowing missing labels with ``np.ndarray``, :class:`Index`, :class:`Series` indexers but not ``list``, these now all raise ``KeyError`` (:issue:`33646`) - Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` where index was assumed to be monotone increasing (:issue:`33756`) @@ -1089,7 +1082,7 @@ I/O - Bug in :meth:`DataFrame.to_sql` when reading DataFrames with ``-np.inf`` entries with MySQL now has a more explicit ``ValueError`` (:issue:`34431`) - Bug where capitalised files extensions were not decompressed by read_* functions (:issue:`35164`) - Bug in :meth:`read_excel` that was raising a ``TypeError`` when ``header=None`` and ``index_col`` given as list (:issue:`31783`) -- Bug in "meth"`read_excel` where datetime values are used in the header in a `MultiIndex` (:issue:`34748`) +- Bug in :meth:`read_excel` where datetime values are used in the header in a `MultiIndex` (:issue:`34748`) - :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) Plotting @@ -1196,7 +1189,7 @@ Other - Bug in :class:`DataFrame` when initiating a frame with lists and assign ``columns`` with nested list for ``MultiIndex`` (:issue:`32173`) - Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) - Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`). -- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:32538`) +- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`) - Getting a missing attribute in a query/eval string raises the correct ``AttributeError`` (:issue:`32408`) - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) - Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) From f5b1119111a6339fe2e5809171ad30df34b50434 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jul 2020 12:32:55 -0500 Subject: [PATCH 0393/1025] more fixups (#35329) --- doc/source/whatsnew/v1.1.0.rst | 35 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b9ef1aae9a801..285cfdfc4c431 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -274,6 +274,8 @@ change, as ``fsspec`` will still bring in the same packages as before. 
Other enhancements ^^^^^^^^^^^^^^^^^^ +- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`) +- :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) - Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). - Added :meth:`DataFrame.value_counts` (:issue:`5377`) - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. @@ -919,6 +921,9 @@ Datetimelike - The ``freq`` keyword in :class:`Period`, :func:`date_range`, :func:`period_range`, :func:`pd.tseries.frequencies.to_offset` no longer allows tuples, pass as string instead (:issue:`34703`) - Bug in :meth:`DataFrame.append` when appending a :class:`Series` containing a scalar tz-aware :class:`Timestamp` to an empty :class:`DataFrame` resulted in an object column instead of datetime64[ns, tz] dtype (:issue:`35038`) - ``OutOfBoundsDatetime`` issues an improved error message when timestamp is out of implementation bounds. (:issue:`32967`) +- Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`) +- Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) +- Bug in :class:`Tick` multiplication raising ``TypeError`` when multiplying by a float (:issue:`34486`) Timedelta ^^^^^^^^^ @@ -937,7 +942,6 @@ Timezones ^^^^^^^^^ - Bug in :func:`to_datetime` with ``infer_datetime_format=True`` where timezone names (e.g. ``UTC``) would not be parsed correctly (:issue:`33133`) -- Numeric @@ -953,12 +957,17 @@ Numeric - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) - Bug in :meth:`Index.difference` incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`) - Bug in :class:`DataFrame` reductions (e.g. 
``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`) +- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) Conversion ^^^^^^^^^^ - Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`) - Bug in :class:`Timedelta` construction with large nanoseconds keyword value (:issue:`32402`) - Bug in :class:`DataFrame` construction where sets would be duplicated rather than raising (:issue:`32582`) +- The :class:`DataFrame` constructor no longer accepts a list of ``DataFrame`` objects. Because of changes to NumPy, ``DataFrame`` objects are now consistently treated as 2D objects, so a list of ``DataFrames`` is considered 3D, and no longer acceptible for the ``DataFrame`` constructor (:issue:`32289`). +- Bug in :class:`DataFrame` when initiating a frame with lists and assign ``columns`` with nested list for ``MultiIndex`` (:issue:`32173`) +- Improved error message for invalid construction of list when creating a new index (:issue:`35190`) + Strings ^^^^^^^ @@ -1016,6 +1025,7 @@ Missing - :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`fillna` (:issue:`12918`, :issue:`29146`) - Bug in :meth:`DataFrame.interpolate` when called on a DataFrame with column names of string type was throwing a ValueError. The method is no independing of the type of column names (:issue:`33956`) - passing :class:`NA` will into a format string using format specs will now work. 
For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``""`` (:issue:`34740`) +- Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) MultiIndex ^^^^^^^^^^ @@ -1044,6 +1054,7 @@ MultiIndex I/O ^^^ +- Passing a `set` as `names` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) - Bug in print-out when ``display.precision`` is zero. (:issue:`20359`) - Bug in :meth:`read_json` where integer overflow was occurring when json contains big number strings. (:issue:`30320`) - `read_csv` will now raise a ``ValueError`` when the arguments `header` and `prefix` both are not `None`. (:issue:`27394`) @@ -1084,6 +1095,7 @@ I/O - Bug in :meth:`read_excel` that was raising a ``TypeError`` when ``header=None`` and ``index_col`` given as list (:issue:`31783`) - Bug in :meth:`read_excel` where datetime values are used in the header in a `MultiIndex` (:issue:`34748`) - :func:`read_excel` no longer takes ``**kwds`` arguments. 
This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) +- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) Plotting ^^^^^^^^ @@ -1095,6 +1107,7 @@ Plotting - Bug in :meth:`DataFrame.plot.scatter` that when adding multiple plots with different ``cmap``, colorbars alway use the first ``cmap`` (:issue:`33389`) - Bug in :meth:`DataFrame.plot.scatter` was adding a colorbar to the plot even if the argument `c` was assigned to a column containing color names (:issue:`34316`) - Bug in :meth:`pandas.plotting.bootstrap_plot` was causing cluttered axes and overlapping labels (:issue:`34905`) +- Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1140,6 +1153,7 @@ Reshaping - Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) - :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`) - Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) +- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) - Bug in :meth:`DataFrame.corrwith()`, :meth:`DataFrame.memory_usage()`, :meth:`DataFrame.dot()`, :meth:`DataFrame.idxmin()`, :meth:`DataFrame.idxmax()`, :meth:`DataFrame.duplicated()`, :meth:`DataFrame.isin()`, :meth:`DataFrame.count()`, :meth:`Series.explode()`, :meth:`Series.asof()` and 
:meth:`DataFrame.asof()` not @@ -1180,28 +1194,12 @@ ExtensionArray Other ^^^^^ -- The :class:`DataFrame` constructor no longer accepts a list of ``DataFrame`` objects. Because of changes to NumPy, ``DataFrame`` objects are now consistently treated as 2D objects, so a list of ``DataFrames`` is considered 3D, and no longer acceptible for the ``DataFrame`` constructor (:issue:`32289`). -- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) -- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` - instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) - Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`) -- Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`) -- Bug in :class:`DataFrame` when initiating a frame with lists and assign ``columns`` with nested list for ``MultiIndex`` (:issue:`32173`) -- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) - Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`). 
-- :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`) - Getting a missing attribute in a query/eval string raises the correct ``AttributeError`` (:issue:`32408`) - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) -- Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) - Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) -- Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`) -- :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) -- Bug in :meth:`DataFrame.equals` and :meth:`Series.equals` in allowing subclasses - to be equal (:issue:`34402`). -- Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) -- Bug in :class:`Tick` multiplication raising ``TypeError`` when multiplying by a float (:issue:`34486`) -- Passing a `set` as `names` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) -- Improved error message for invalid construction of list when creating a new index (:issue:`35190`) +- Bug in :meth:`DataFrame.equals` and :meth:`Series.equals` in allowing subclasses to be equal (:issue:`34402`). .. --------------------------------------------------------------------------- @@ -1210,3 +1208,4 @@ Other Contributors ~~~~~~~~~~~~ +.. 
contributors:: v1.0.5..v1.1.0|HEAD From 5be977fbc1a5ab00f4c9ad860a547ffe72996bf0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 17 Jul 2020 13:49:27 -0500 Subject: [PATCH 0394/1025] Optimize array_equivalent for NDFrame.equals (#35328) --- pandas/core/dtypes/missing.py | 96 +++++++++++++++++++---------- pandas/core/internals/managers.py | 4 +- pandas/tests/dtypes/test_missing.py | 59 ++++++++++++++---- 3 files changed, 113 insertions(+), 46 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 75188ad5b00eb..8551ce9f14e6c 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -355,7 +355,9 @@ def _isna_compat(arr, fill_value=np.nan) -> bool: return True -def array_equivalent(left, right, strict_nan: bool = False) -> bool: +def array_equivalent( + left, right, strict_nan: bool = False, dtype_equal: bool = False +) -> bool: """ True if two arrays, left and right, have equal non-NaN elements, and NaNs in corresponding locations. False otherwise. It is assumed that left and @@ -368,6 +370,12 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: left, right : ndarrays strict_nan : bool, default False If True, consider NaN and None to be different. + dtype_equal : bool, default False + Whether `left` and `right` are known to have the same dtype + according to `is_dtype_equal`. Some methods like `BlockManager.equals`. + require that the dtypes match. Setting this to ``True`` can improve + performance, but will give different results for arrays that are + equal but different dtypes. 
Returns ------- @@ -391,43 +399,28 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: if left.shape != right.shape: return False + if dtype_equal: + # fastpath when we require that the dtypes match (Block.equals) + if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): + return _array_equivalent_float(left, right) + elif is_datetimelike_v_numeric(left.dtype, right.dtype): + return False + elif needs_i8_conversion(left.dtype): + return _array_equivalent_datetimelike(left, right) + elif is_string_dtype(left.dtype): + # TODO: fastpath for pandas' StringDtype + return _array_equivalent_object(left, right, strict_nan) + else: + return np.array_equal(left, right) + + # Slow path when we allow comparing different dtypes. # Object arrays can contain None, NaN and NaT. # string dtypes must be come to this path for NumPy 1.7.1 compat if is_string_dtype(left.dtype) or is_string_dtype(right.dtype): - - if not strict_nan: - # isna considers NaN and None to be equivalent. - return lib.array_equivalent_object( - ensure_object(left.ravel()), ensure_object(right.ravel()) - ) - - for left_value, right_value in zip(left, right): - if left_value is NaT and right_value is not NaT: - return False - - elif left_value is libmissing.NA and right_value is not libmissing.NA: - return False - - elif isinstance(left_value, float) and np.isnan(left_value): - if not isinstance(right_value, float) or not np.isnan(right_value): - return False - else: - try: - if np.any(np.asarray(left_value != right_value)): - return False - except TypeError as err: - if "Cannot compare tz-naive" in str(err): - # tzawareness compat failure, see GH#28507 - return False - elif "boolean value of NA is ambiguous" in str(err): - return False - raise - return True + return _array_equivalent_object(left, right, strict_nan) # NaNs can occur in float and complex arrays. 
if is_float_dtype(left.dtype) or is_complex_dtype(left.dtype): - - # empty if not (np.prod(left.shape) and np.prod(right.shape)): return True return ((left == right) | (isna(left) & isna(right))).all() @@ -452,6 +445,45 @@ def array_equivalent(left, right, strict_nan: bool = False) -> bool: return np.array_equal(left, right) +def _array_equivalent_float(left, right): + return ((left == right) | (np.isnan(left) & np.isnan(right))).all() + + +def _array_equivalent_datetimelike(left, right): + return np.array_equal(left.view("i8"), right.view("i8")) + + +def _array_equivalent_object(left, right, strict_nan): + if not strict_nan: + # isna considers NaN and None to be equivalent. + return lib.array_equivalent_object( + ensure_object(left.ravel()), ensure_object(right.ravel()) + ) + + for left_value, right_value in zip(left, right): + if left_value is NaT and right_value is not NaT: + return False + + elif left_value is libmissing.NA and right_value is not libmissing.NA: + return False + + elif isinstance(left_value, float) and np.isnan(left_value): + if not isinstance(right_value, float) or not np.isnan(right_value): + return False + else: + try: + if np.any(np.asarray(left_value != right_value)): + return False + except TypeError as err: + if "Cannot compare tz-naive" in str(err): + # tzawareness compat failure, see GH#28507 + return False + elif "boolean value of NA is ambiguous" in str(err): + return False + raise + return True + + def _infer_fill_value(val): """ infer the fill value for the nan/NaT from the provided diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d5947726af7fd..895385b170c91 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1436,7 +1436,7 @@ def equals(self, other: "BlockManager") -> bool: return array_equivalent(left, right) for i in range(len(self.items)): - # Check column-wise, return False if any column doesnt match + # Check column-wise, return False if any column 
doesn't match left = self.iget_values(i) right = other.iget_values(i) if not is_dtype_equal(left.dtype, right.dtype): @@ -1445,7 +1445,7 @@ def equals(self, other: "BlockManager") -> bool: if not left.equals(right): return False else: - if not array_equivalent(left, right): + if not array_equivalent(left, right, dtype_equal=True): return False return True diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index f9a854c5778a2..04dde08de082d 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -300,50 +300,80 @@ def test_period(self): tm.assert_series_equal(notna(s), ~exp) -def test_array_equivalent(): - assert array_equivalent(np.array([np.nan, np.nan]), np.array([np.nan, np.nan])) +@pytest.mark.parametrize("dtype_equal", [True, False]) +def test_array_equivalent(dtype_equal): assert array_equivalent( - np.array([np.nan, 1, np.nan]), np.array([np.nan, 1, np.nan]) + np.array([np.nan, np.nan]), np.array([np.nan, np.nan]), dtype_equal=dtype_equal + ) + assert array_equivalent( + np.array([np.nan, 1, np.nan]), + np.array([np.nan, 1, np.nan]), + dtype_equal=dtype_equal, ) assert array_equivalent( np.array([np.nan, None], dtype="object"), np.array([np.nan, None], dtype="object"), + dtype_equal=dtype_equal, ) # Check the handling of nested arrays in array_equivalent_object assert array_equivalent( np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"), + dtype_equal=dtype_equal, ) assert array_equivalent( np.array([np.nan, 1 + 1j], dtype="complex"), np.array([np.nan, 1 + 1j], dtype="complex"), + dtype_equal=dtype_equal, ) assert not array_equivalent( np.array([np.nan, 1 + 1j], dtype="complex"), np.array([np.nan, 1 + 2j], dtype="complex"), + dtype_equal=dtype_equal, + ) + assert not array_equivalent( + np.array([np.nan, 1, np.nan]), + np.array([np.nan, 2, np.nan]), + dtype_equal=dtype_equal, + ) + assert 
not array_equivalent( + np.array(["a", "b", "c", "d"]), np.array(["e", "e"]), dtype_equal=dtype_equal + ) + assert array_equivalent( + Float64Index([0, np.nan]), Float64Index([0, np.nan]), dtype_equal=dtype_equal ) assert not array_equivalent( - np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan]) + Float64Index([0, np.nan]), Float64Index([1, np.nan]), dtype_equal=dtype_equal + ) + assert array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal + ) + assert not array_equivalent( + DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal + ) + assert array_equivalent( + TimedeltaIndex([0, np.nan]), + TimedeltaIndex([0, np.nan]), + dtype_equal=dtype_equal, ) - assert not array_equivalent(np.array(["a", "b", "c", "d"]), np.array(["e", "e"])) - assert array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan])) - assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan])) - assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan])) - assert not array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan])) - assert array_equivalent(TimedeltaIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) assert not array_equivalent( - TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan]) + TimedeltaIndex([0, np.nan]), + TimedeltaIndex([1, np.nan]), + dtype_equal=dtype_equal, ) assert array_equivalent( DatetimeIndex([0, np.nan], tz="US/Eastern"), DatetimeIndex([0, np.nan], tz="US/Eastern"), + dtype_equal=dtype_equal, ) assert not array_equivalent( DatetimeIndex([0, np.nan], tz="US/Eastern"), DatetimeIndex([1, np.nan], tz="US/Eastern"), + dtype_equal=dtype_equal, ) + # The rest are not dtype_equal assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern"), ) assert not array_equivalent( DatetimeIndex([0, np.nan], tz="CET"), @@ -353,6 
+383,11 @@ def test_array_equivalent(): assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) +def test_array_equivalent_different_dtype_but_equal(): + # Unclear if this is exposed anywhere in the public-facing API + assert array_equivalent(np.array([1, 2]), np.array([1.0, 2.0])) + + @pytest.mark.parametrize( "lvalue, rvalue", [ From 5bd458108937f562259909a6f499e43ed2df6441 Mon Sep 17 00:00:00 2001 From: salem3358 <46143571+salem3358@users.noreply.github.com> Date: Sat, 18 Jul 2020 01:49:53 +0700 Subject: [PATCH 0395/1025] Fix AttributeError when groupby as_index=False on empty DataFrame (#35324) --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/generic.py | 17 +++++++++++------ pandas/tests/groupby/test_groupby.py | 8 ++++++++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 285cfdfc4c431..43d1244c15d8a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1130,6 +1130,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.quantile` raises ``TypeError`` for non-numeric types rather than dropping columns (:issue:`27892`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) +- Bug in :meth:`DataFrame.groupby` raising an ``AttributeError`` when selecting a column and aggregating with ``as_index=False`` (:issue:`35246`). 
- Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) Reshaping diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1d14361757e4a..ec7b14f27c5a1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -963,18 +963,23 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) # try to treat as if we are passing a list try: result = self._aggregate_multiple_funcs([func], _axis=self.axis) - except ValueError as err: - if "no results" not in str(err): - # raised directly by _aggregate_multiple_funcs - raise - result = self._aggregate_frame(func) - else: + # select everything except for the last level, which is the one # containing the name of the function(s), see GH 32040 result.columns = result.columns.rename( [self._selected_obj.columns.name] * result.columns.nlevels ).droplevel(-1) + except ValueError as err: + if "no results" not in str(err): + # raised directly by _aggregate_multiple_funcs + raise + result = self._aggregate_frame(func) + except AttributeError: + # catch exception from line 969 + # (Series does not have attribute "columns"), see GH 35246 + result = self._aggregate_frame(func) + if relabeling: # used reordered index of columns diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0d040b8e6955a..ebce5b0ef0a66 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -605,6 +605,14 @@ def test_as_index_select_column(): tm.assert_series_equal(result, expected) +def test_groupby_as_index_select_column_sum_empty_df(): + # GH 35246 + df = DataFrame(columns=["A", "B", "C"]) + left = df.groupby(by="A", as_index=False)["B"].sum() + assert type(left) is DataFrame + assert left.to_dict() == {"A": {}, "B": {}} + + def test_groupby_as_index_agg(df): grouped = df.groupby("A", 
as_index=False) From 403f459db464a188523ae33913fb3bfd702c3bf8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 20 Jul 2020 17:17:50 -0500 Subject: [PATCH 0396/1025] CI: pin matplotlib for doc build (#35358) --- ci/deps/azure-37-locale.yaml | 2 +- environment.yml | 2 +- requirements-dev.txt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 714e1100b1e1a..77aae791a47c1 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -18,7 +18,7 @@ dependencies: - ipython - jinja2 - lxml - - matplotlib + - matplotlib <3.3.0 - moto - nomkl - numexpr diff --git a/environment.yml b/environment.yml index 53106906a52cb..53222624619de 100644 --- a/environment.yml +++ b/environment.yml @@ -73,7 +73,7 @@ dependencies: - ipykernel - ipython>=7.11.1 - jinja2 # pandas.Styler - - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot + - matplotlib>=2.2.2,<3.3.0 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 - scipy>=1.2 - numba>=0.46.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index 1ec998ffa72d4..0c024d1b54637 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -47,7 +47,7 @@ bottleneck>=1.2.1 ipykernel ipython>=7.11.1 jinja2 -matplotlib>=2.2.2 +matplotlib>=2.2.2,<3.3.0 numexpr>=2.6.8 scipy>=1.2 numba>=0.46.0 From d026e01cca2cb3d7afb4e36f0fd09bbc88e8002e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 21 Jul 2020 12:20:00 +0100 Subject: [PATCH 0397/1025] REGR: MultiIndex Indexing (#35353) * REGR: MultiIndex Indexing * add test --- pandas/core/indexes/base.py | 5 ++++- pandas/tests/indexing/multiindex/test_loc.py | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3dbee7d0929cb..986d6323e704e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3397,7 +3397,10 @@ def 
_reindex_non_unique(self, target): new_indexer = np.arange(len(self.take(indexer))) new_indexer[~check] = -1 - new_index = Index(new_labels, name=self.name) + if isinstance(self, ABCMultiIndex): + new_index = type(self).from_tuples(new_labels, names=self.names) + else: + new_index = Index(new_labels, name=self.name) return new_index, indexer, new_indexer # -------------------------------------------------------------------- diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index f0cbdbe8d0564..63983f45d7832 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -491,3 +491,22 @@ def test_loc_datetime_mask_slicing(): ), ) tm.assert_series_equal(result, expected) + + +def test_loc_with_mi_indexer(): + # https://github.com/pandas-dev/pandas/issues/35351 + df = DataFrame( + data=[["a", 1], ["a", 0], ["b", 1], ["c", 2]], + index=MultiIndex.from_tuples( + [(0, 1), (1, 0), (1, 1), (1, 1)], names=["index", "date"] + ), + columns=["author", "price"], + ) + idx = MultiIndex.from_tuples([(0, 1), (1, 1)], names=["index", "date"]) + result = df.loc[idx, :] + expected = DataFrame( + [["a", 1], ["b", 1], ["c", 2]], + index=MultiIndex.from_tuples([(0, 1), (1, 1), (1, 1)], names=["index", "date"]), + columns=["author", "price"], + ) + tm.assert_frame_equal(result, expected) From ebb135f0f42374f278d7f483ca2572cbbf0b2356 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 22 Jul 2020 03:45:40 -0700 Subject: [PATCH 0398/1025] Change defaults for rolling/expanding.apply engine kwargs to None (#35374) * Change defaults for rolling/expanding.apply engine kwargs to None * Add whatsnew Co-authored-by: Matt Roeschke --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/window/expanding.py | 2 +- pandas/core/window/rolling.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 
43d1244c15d8a..55e2a810e6fc3 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -338,7 +338,7 @@ Other enhancements - :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable boolean dtype (:issue:`34859`) - :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) -- ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`) +- ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) - :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for `yerr` and/or `xerr`, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 8267cd4f0971e..ce4ab2f98c23d 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -137,7 +137,7 @@ def apply( self, func, raw: bool = False, - engine: str = "cython", + engine: Optional[str] = None, engine_kwargs: Optional[Dict[str, bool]] = None, args=None, kwargs=None, diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 48953f6a75487..445f179248226 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1344,7 +1344,7 @@ def apply( self, func, raw: bool = False, - engine: str = "cython", + engine: Optional[str] = None, engine_kwargs: Optional[Dict] = None, args: Optional[Tuple] = None, kwargs: Optional[Dict] = None, From 3e8b3864bbb8eddf5a92c8b2958a7cee62f0d2ec Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 23 Jul 2020 14:10:33 -0500 Subject: [PATCH 0399/1025] PKG: Set max pin for Cython (#35396) We know that pandas doesn't work with Cython 3.0 (https://github.com/pandas-dev/pandas/issues/34213, https://github.com/pandas-dev/pandas/issues/34014) This sets the maximum supported version of Cython in our pyproject.toml to ensure that pandas 1.1.0 can continue to be built from source without Cython pre-installed after Cython 3.0 is released. 
--- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index aaebcff8e4c1e..f282f2a085000 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "setuptools", "wheel", - "Cython>=0.29.16", # Note: sync with setup.py + "Cython>=0.29.16,<3", # Note: sync with setup.py "numpy==1.15.4; python_version=='3.6' and platform_system!='AIX'", "numpy==1.15.4; python_version=='3.7' and platform_system!='AIX'", "numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'", From e83850546103ddee3ed9cd7af093cd5ae5c806d5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 23 Jul 2020 17:06:03 -0500 Subject: [PATCH 0400/1025] Matplotlib 3.3 compatibility fixups (#35393) --- ci/deps/azure-37-locale.yaml | 2 +- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/plotting/_matplotlib/boxplot.py | 5 ++++ pandas/plotting/_matplotlib/compat.py | 1 + pandas/plotting/_matplotlib/converter.py | 34 +++++----------------- pandas/tests/plotting/test_converter.py | 6 ++-- pandas/tests/plotting/test_datetimelike.py | 14 +++++---- pandas/tests/plotting/test_frame.py | 1 + pandas/tests/plotting/test_series.py | 10 +++++-- 9 files changed, 34 insertions(+), 40 deletions(-) diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 77aae791a47c1..4dbb6a5344976 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -18,7 +18,7 @@ dependencies: - ipython - jinja2 - lxml - - matplotlib <3.3.0 + - matplotlib>=3.3.0 - moto - nomkl - numexpr diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 55e2a810e6fc3..b1ab54a608748 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -274,6 +274,7 @@ change, as ``fsspec`` will still bring in the same packages as before. 
Other enhancements ^^^^^^^^^^^^^^^^^^ +- Compatibility with matplotlib 3.3.0 (:issue:`34850`) - :meth:`IntegerArray.astype` now supports ``datetime64`` dtype (:issue:`32538`) - :class:`IntegerArray` now implements the ``sum`` operation (:issue:`33172`) - Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 4b79bef41d025..53ef97bbe9a72 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -299,6 +299,11 @@ def plot_group(keys, values, ax): if fontsize is not None: ax.tick_params(axis="both", labelsize=fontsize) if kwds.get("vert", 1): + ticks = ax.get_xticks() + if len(ticks) != len(keys): + i, remainder = divmod(len(ticks), len(keys)) + assert remainder == 0, remainder + keys *= i ax.set_xticklabels(keys, rotation=rot) else: ax.set_yticklabels(keys, rotation=rot) diff --git a/pandas/plotting/_matplotlib/compat.py b/pandas/plotting/_matplotlib/compat.py index f2c5032112bc9..7f107f18eca25 100644 --- a/pandas/plotting/_matplotlib/compat.py +++ b/pandas/plotting/_matplotlib/compat.py @@ -21,3 +21,4 @@ def inner(): _mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) _mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) _mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge) +_mpl_ge_3_3_0 = _mpl_version("3.3.0", operator.ge) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 05377e0c240b9..8f2080658e63e 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -16,7 +16,6 @@ from pandas._libs.tslibs.offsets import BaseOffset from pandas.core.dtypes.common import ( - is_datetime64_ns_dtype, is_float, is_float_dtype, is_integer, @@ -246,19 +245,6 @@ def get_datevalue(date, freq): raise ValueError(f"Unrecognizable date '{date}'") -def _dt_to_float_ordinal(dt): - """ - Convert :mod:`datetime` to the Gregorian date as UTC float days, 
- preserving hours, minutes, seconds and microseconds. Return value - is a :func:`float`. - """ - if isinstance(dt, (np.ndarray, Index, Series)) and is_datetime64_ns_dtype(dt): - base = dates.epoch2num(dt.asi8 / 1.0e9) - else: - base = dates.date2num(dt) - return base - - # Datetime Conversion class DatetimeConverter(dates.DateConverter): @staticmethod @@ -274,15 +260,11 @@ def convert(values, unit, axis): def _convert_1d(values, unit, axis): def try_parse(values): try: - return _dt_to_float_ordinal(tools.to_datetime(values)) + return dates.date2num(tools.to_datetime(values)) except Exception: return values - if isinstance(values, (datetime, pydt.date)): - return _dt_to_float_ordinal(values) - elif isinstance(values, np.datetime64): - return _dt_to_float_ordinal(Timestamp(values)) - elif isinstance(values, pydt.time): + if isinstance(values, (datetime, pydt.date, np.datetime64, pydt.time)): return dates.date2num(values) elif is_integer(values) or is_float(values): return values @@ -303,12 +285,10 @@ def try_parse(values): try: values = tools.to_datetime(values) - if isinstance(values, Index): - values = _dt_to_float_ordinal(values) - else: - values = [_dt_to_float_ordinal(x) for x in values] except Exception: - values = _dt_to_float_ordinal(values) + pass + + values = dates.date2num(values) return values @@ -411,8 +391,8 @@ def __call__(self): interval = self._get_interval() freq = f"{interval}L" tz = self.tz.tzname(None) - st = _from_ordinal(dates.date2num(dmin)) # strip tz - ed = _from_ordinal(dates.date2num(dmax)) + st = dmin.replace(tzinfo=None) + ed = dmin.replace(tzinfo=None) all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object) try: diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py index df2c9ecbd7a0a..b2eeb649276d5 100644 --- a/pandas/tests/plotting/test_converter.py +++ b/pandas/tests/plotting/test_converter.py @@ -27,6 +27,7 @@ pass pytest.importorskip("matplotlib.pyplot") +dates = 
pytest.importorskip("matplotlib.dates") def test_registry_mpl_resets(): @@ -146,7 +147,7 @@ def test_convert_accepts_unicode(self): def test_conversion(self): rs = self.dtc.convert(["2012-1-1"], None, None)[0] - xp = datetime(2012, 1, 1).toordinal() + xp = dates.date2num(datetime(2012, 1, 1)) assert rs == xp rs = self.dtc.convert("2012-1-1", None, None) @@ -155,9 +156,6 @@ def test_conversion(self): rs = self.dtc.convert(date(2012, 1, 1), None, None) assert rs == xp - rs = self.dtc.convert(datetime(2012, 1, 1).toordinal(), None, None) - assert rs == xp - rs = self.dtc.convert("2012-1-1", None, None) assert rs == xp diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index 201856669103a..ecf378d4fc04a 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -331,7 +331,7 @@ def test_freq_with_no_period_alias(self): bts = tm.makeTimeSeries(5).asfreq(freq) _, ax = self.plt.subplots() bts.plot(ax=ax) - assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].toordinal() + idx = ax.get_lines()[0].get_xdata() msg = "freq not specified and cannot be inferred" with pytest.raises(ValueError, match=msg): @@ -1279,6 +1279,8 @@ def test_mpl_nopandas(self): @pytest.mark.slow def test_irregular_ts_shared_ax_xlim(self): # GH 2960 + from pandas.plotting._matplotlib.converter import DatetimeConverter + ts = tm.makeTimeSeries()[:20] ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] @@ -1289,8 +1291,8 @@ def test_irregular_ts_shared_ax_xlim(self): # check that axis limits are correct left, right = ax.get_xlim() - assert left <= ts_irregular.index.min().toordinal() - assert right >= ts_irregular.index.max().toordinal() + assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) @pytest.mark.slow def test_secondary_y_non_ts_xlim(self): @@ -1345,6 +1347,8 @@ def 
test_secondary_y_mixed_freq_ts_xlim(self): @pytest.mark.slow def test_secondary_y_irregular_ts_xlim(self): # GH 3490 - irregular-timeseries with secondary y + from pandas.plotting._matplotlib.converter import DatetimeConverter + ts = tm.makeTimeSeries()[:20] ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]] @@ -1356,8 +1360,8 @@ def test_secondary_y_irregular_ts_xlim(self): ts_irregular[:5].plot(ax=ax) left, right = ax.get_xlim() - assert left <= ts_irregular.index.min().toordinal() - assert right >= ts_irregular.index.max().toordinal() + assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax) + assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax) def test_plot_outofbounds_datetime(self): # 2579 - checking this does not raise diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 3d85e79b15c4c..317a994bd9a32 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -1563,6 +1563,7 @@ def test_boxplot(self): ax.xaxis.get_ticklocs(), np.arange(1, len(numeric_cols) + 1) ) assert len(ax.lines) == self.bp_n_objects * len(numeric_cols) + tm.close() axes = series.plot.box(rot=40) self._check_ticks_props(axes, xrot=40, yrot=0) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 316ca6ce91af7..151bb3bed7207 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -274,12 +274,14 @@ def test_rotation(self): self._check_ticks_props(axes, xrot=30) def test_irregular_datetime(self): + from pandas.plotting._matplotlib.converter import DatetimeConverter + rng = date_range("1/1/2000", "3/1/2000") rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]] ser = Series(randn(len(rng)), rng) _, ax = self.plt.subplots() ax = ser.plot(ax=ax) - xp = datetime(1999, 1, 1).toordinal() + xp = DatetimeConverter.convert(datetime(1999, 1, 1), "", ax) ax.set_xlim("1/1/1999", "1/1/2001") assert xp == 
ax.get_xlim()[0] @@ -684,11 +686,13 @@ def test_kind_both_ways(self): kinds = ( plotting.PlotAccessor._common_kinds + plotting.PlotAccessor._series_kinds ) - _, ax = self.plt.subplots() for kind in kinds: - + _, ax = self.plt.subplots() s.plot(kind=kind, ax=ax) + self.plt.close() + _, ax = self.plt.subplots() getattr(s.plot, kind)() + self.plt.close() @pytest.mark.slow def test_invalid_plot_data(self): From f5612df809189ec512102f9ac6b2054d0ff5ed21 Mon Sep 17 00:00:00 2001 From: alm Date: Fri, 24 Jul 2020 15:46:54 +0300 Subject: [PATCH 0401/1025] Remove leftover partial-result code (#35397) --- pandas/core/apply.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index d4be660939773..733dbeed34b72 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -252,33 +252,20 @@ def apply_broadcast(self, target: "DataFrame") -> "DataFrame": return result def apply_standard(self): - - # partial result that may be returned from reduction - partial_result = None - - # compute the result using the series generator, - # use the result computed while trying to reduce if available. 
- results, res_index = self.apply_series_generator(partial_result) + results, res_index = self.apply_series_generator() # wrap results return self.wrap_results(results, res_index) - def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"]: + def apply_series_generator(self) -> Tuple[ResType, "Index"]: series_gen = self.series_generator res_index = self.result_index results = {} - # If a partial result was already computed, - # use it instead of running on the first element again - series_gen_enumeration = enumerate(series_gen) - if partial_result is not None: - i, v = next(series_gen_enumeration) - results[i] = partial_result - if self.ignore_failures: successes = [] - for i, v in series_gen_enumeration: + for i, v in enumerate(series_gen): try: results[i] = self.f(v) except Exception: @@ -292,7 +279,7 @@ def apply_series_generator(self, partial_result=None) -> Tuple[ResType, "Index"] else: with option_context("mode.chained_assignment", None): - for i, v in series_gen_enumeration: + for i, v in enumerate(series_gen): # ignore SettingWithCopy here in case the user mutates results[i] = self.f(v) if isinstance(results[i], ABCSeries): From 69f4fba1dc3f699f3f33f2b72f213eac2ad6aa07 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 24 Jul 2020 09:06:26 -0400 Subject: [PATCH 0402/1025] DOC: Fixed formatting and errors in whatsnew v1.1.0 (#35398) --- doc/source/whatsnew/v1.1.0.rst | 305 ++++++++++++++++----------------- 1 file changed, 152 insertions(+), 153 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b1ab54a608748..c2a4abbea107c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -42,8 +42,8 @@ For example, the below now works: .. 
_whatsnew_110.period_index_partial_string_slicing: -Nonmonotonic PeriodIndex Partial String Slicing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Non-monotonic PeriodIndex Partial String Slicing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :class:`PeriodIndex` now supports partial string slicing for non-monotonic indexes, mirroring :class:`DatetimeIndex` behavior (:issue:`31096`) @@ -130,7 +130,7 @@ The default setting of ``dropna`` argument is ``True`` which means ``NA`` are no Sorting with keys ^^^^^^^^^^^^^^^^^ -We've added a ``key`` argument to the DataFrame and Series sorting methods, including +We've added a ``key`` argument to the :class:`DataFrame` and :class:`Series` sorting methods, including :meth:`DataFrame.sort_values`, :meth:`DataFrame.sort_index`, :meth:`Series.sort_values`, and :meth:`Series.sort_index`. The ``key`` can be any callable function which is applied column-by-column to each column used for sorting, before sorting is performed (:issue:`27237`). @@ -215,14 +215,14 @@ For example: Grouper and resample now supports the arguments origin and offset ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`Grouper` and :class:`DataFrame.resample` now supports the arguments ``origin`` and ``offset``. It let the user control the timestamp on which to adjust the grouping. (:issue:`31809`) +:class:`Grouper` and :meth:`DataFrame.resample` now supports the arguments ``origin`` and ``offset``. It let the user control the timestamp on which to adjust the grouping. (:issue:`31809`) The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can now specify a fixed timestamp with the argument ``origin``. 
-Two arguments are now deprecated (more information in the documentation of :class:`DataFrame.resample`): +Two arguments are now deprecated (more information in the documentation of :meth:`DataFrame.resample`): - ``base`` should be replaced by ``offset``. -- ``loffset`` should be replaced by directly adding an offset to the index DataFrame after being resampled. +- ``loffset`` should be replaced by directly adding an offset to the index :class:`DataFrame` after being resampled. Small example of the use of ``origin``: @@ -248,7 +248,7 @@ Resample using a fixed origin: ts.resample('17min', origin='epoch').sum() ts.resample('17min', origin='2000-01-01').sum() -If needed you can adjust the bins with the argument ``offset`` (a Timedelta) that would be added to the default ``origin``. +If needed you can adjust the bins with the argument ``offset`` (a :class:`Timedelta`) that would be added to the default ``origin``. For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. @@ -286,10 +286,10 @@ Other enhancements - :meth:`~pandas.io.formats.style.Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) - When writing directly to a sqlite connection :meth:`DataFrame.to_sql` now supports the ``multi`` method (:issue:`29921`) - :class:`pandas.errors.OptionError` is now exposed in ``pandas.errors`` (:issue:`27553`) -- Add :meth:`api.extensions.ExtensionArray.argmax` and :meth:`api.extensions.ExtensionArray.argmin` (:issue:`24382`) +- Added :meth:`api.extensions.ExtensionArray.argmax` and :meth:`api.extensions.ExtensionArray.argmin` (:issue:`24382`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) -- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` 
(:issue:`32806`). +- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the :class:`Series`, similar to `re.fullmatch` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`Index.union` will now raise ``RuntimeWarning`` for :class:`MultiIndex` objects if the object inside are unsortable. Pass ``sort=False`` to suppress this warning (:issue:`33015`) - Added :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). @@ -306,37 +306,37 @@ Other enhancements - :meth:`melt` has gained an ``ignore_index`` (default ``True``) argument that, if set to ``False``, prevents the method from dropping the index (:issue:`17440`). - :meth:`Series.update` now accepts objects that can be coerced to a :class:`Series`, such as ``dict`` and ``list``, mirroring the behavior of :meth:`DataFrame.update` (:issue:`33215`) -- :meth:`~pandas.core.groupby.DataFrameGroupBy.transform` and :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` has gained ``engine`` and ``engine_kwargs`` arguments that supports executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) +- :meth:`~pandas.core.groupby.DataFrameGroupBy.transform` and :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` have gained ``engine`` and ``engine_kwargs`` arguments that support executing functions with ``Numba`` (:issue:`32854`, :issue:`33388`) - :meth:`~pandas.core.resample.Resampler.interpolate` now supports SciPy interpolation method :class:`scipy.interpolate.CubicSpline` as method ``cubicspline`` (:issue:`33670`) - :class:`~pandas.core.groupby.DataFrameGroupBy` and :class:`~pandas.core.groupby.SeriesGroupBy` now implement the ``sample`` method for doing random sampling within 
groups (:issue:`31775`) - :meth:`DataFrame.to_numpy` now supports the ``na_value`` keyword to control the NA sentinel in the output array (:issue:`33820`) -- Added :class:`api.extension.ExtensionArray.equals` to the extension array interface, similarl to :meth:`Series.equals` (:issue:`27081`). -- The minimum supported dta version has increased to 105 in :meth:`read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`). +- Added :class:`api.extension.ExtensionArray.equals` to the extension array interface, similar to :meth:`Series.equals` (:issue:`27081`) +- The minimum supported dta version has increased to 105 in :func:`read_stata` and :class:`~pandas.io.stata.StataReader` (:issue:`26667`). - :meth:`~DataFrame.to_stata` supports compression using the ``compression`` keyword argument. Compression can either be inferred or explicitly set using a string or a dictionary containing both the method and any additional arguments that are passed to the compression library. Compression was also added to the low-level Stata-file writers :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). -- :meth:`HDFStore.put` now accepts a ``track_times`` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`). +- :meth:`HDFStore.put` now accepts a ``track_times`` parameter. This parameter is passed to the ``create_table`` method of ``PyTables`` (:issue:`32682`). - :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts `xlabel` and `ylabel` parameters to present labels on x and y axis (:issue:`9093`). -- Make :class:`pandas.core.window.rolling.Rolling` and :class:`pandas.core.window.expanding.Expanding` iterable(:issue:`11704`) -- Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). 
+- Made :class:`pandas.core.window.rolling.Rolling` and :class:`pandas.core.window.expanding.Expanding` iterable(:issue:`11704`) +- Made ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) - :meth:`~pandas.core.groupby.DataFrameGroupBy.groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`). -- :meth:`read_json` now accepts an ``nrows`` parameter. (:issue:`33916`). -- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`)0 +- :func:`read_json` now accepts an ``nrows`` parameter. (:issue:`33916`). +- :meth:`DataFrame.hist`, :meth:`Series.hist`, :meth:`core.groupby.DataFrameGroupBy.hist`, and :meth:`core.groupby.SeriesGroupBy.hist` have gained the ``legend`` argument. Set to True to show a legend in the histogram. (:issue:`6279`) - :func:`concat` and :meth:`~DataFrame.append` now preserve extension dtypes, for example combining a nullable integer column with a numpy integer column will no longer result in object dtype but preserve the integer dtype (:issue:`33607`, :issue:`34339`, :issue:`34095`). -- :meth:`read_gbq` now allows to disable progress bar (:issue:`33360`). -- :meth:`read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). -- :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ddof to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). +- :func:`read_gbq` now allows to disable progress bar (:issue:`33360`). +- :func:`read_gbq` now supports the ``max_results`` kwarg from ``pandas-gbq`` (:issue:`34639`). 
+- :meth:`DataFrame.cov` and :meth:`Series.cov` now support a new parameter ``ddof`` to support delta degrees of freedom as in the corresponding numpy methods (:issue:`34611`). - :meth:`DataFrame.to_html` and :meth:`DataFrame.to_string`'s ``col_space`` parameter now accepts a list or dict to change only some specific columns' width (:issue:`28917`). - :meth:`DataFrame.to_excel` can now also write OpenOffice spreadsheet (.ods) files (:issue:`27222`) -- :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similarly to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). +- :meth:`~Series.explode` now accepts ``ignore_index`` to reset the index, similar to :meth:`pd.concat` or :meth:`DataFrame.sort_values` (:issue:`34932`). - :meth:`DataFrame.to_markdown` and :meth:`Series.to_markdown` now accept ``index`` argument as an alias for tabulate's ``showindex`` (:issue:`32667`) -- :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable boolean dtype (:issue:`34859`) +- :meth:`read_csv` now accepts string values like "0", "0.0", "1", "1.0" as convertible to the nullable Boolean dtype (:issue:`34859`) - :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. 
(:issue:`26513`) - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) @@ -417,7 +417,7 @@ Failed Label-Based Lookups Always Raise KeyError ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Label lookups ``series[key]``, ``series.loc[key]`` and ``frame.loc[key]`` -used to raises either ``KeyError`` or ``TypeError`` depending on the type of +used to raise either ``KeyError`` or ``TypeError`` depending on the type of key and type of :class:`Index`. These now consistently raise ``KeyError`` (:issue:`31867`) .. ipython:: python @@ -488,7 +488,7 @@ Similarly, :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` Failed Integer Lookups on MultiIndex Raise KeyError ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Indexing with integers with a :class:`MultiIndex` that has a integer-dtype +Indexing with integers with a :class:`MultiIndex` that has an integer-dtype first level incorrectly failed to raise ``KeyError`` when one or more of those integer keys is not present in the first level of the index (:issue:`33539`) @@ -516,7 +516,7 @@ those integer keys is not present in the first level of the index (:issue:`33539 :meth:`DataFrame.merge` preserves right frame's row order ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`) +:meth:`DataFrame.merge` now preserves the right frame's row order when executing a right merge (:issue:`27453`) .. ipython:: python @@ -549,7 +549,7 @@ those integer keys is not present in the first level of the index (:issue:`33539 Assignment to multiple columns of a DataFrame when some columns do not exist ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. 
Now, new columns would be constructed with the right values. (:issue:`13658`) +Assignment to multiple columns of a :class:`DataFrame` when some of the columns do not exist would previously assign the values to the last column. Now, new columns will be constructed with the right values. (:issue:`13658`) .. ipython:: python @@ -622,7 +622,7 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma df.groupby("a", as_index=False).nunique() -The method :meth:`~pandas.core.groupby.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a `DataFrame` instead of a `Series`. (:issue:`32599`) +The method :meth:`~pandas.core.groupby.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a :class:`DataFrame` instead of a :class:`Series`. (:issue:`32599`) *Previous behavior*: @@ -797,37 +797,36 @@ Development Changes Deprecations ~~~~~~~~~~~~ -- Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`) +- Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated and will raise in a future version. Either convert the list to a tuple, or pass the slice directly instead (:issue:`31333`) -- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`) +- :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include ``datetime64`` and ``datetime64tz`` columns in a future version (:issue:`29941`) - Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. 
Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`) -- :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) +- :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` and will raise in a future version (:issue:`32515`) - :meth:`Categorical.to_dense` is deprecated and will be removed in a future version, use ``np.asarray(cat)`` instead (:issue:`32639`) - The ``fastpath`` keyword in the ``SingleBlockManager`` constructor is deprecated and will be removed in a future version (:issue:`33092`) - Providing ``suffixes`` as a ``set`` in :func:`pandas.merge` is deprecated. Provide a tuple instead (:issue:`33740`, :issue:`34741`). -- Indexing a series with a multi-dimensional indexer like ``[:, None]`` to return an ndarray now raises a ``FutureWarning``. Convert to a NumPy array before indexing instead (:issue:`27837`) +- Indexing a :class:`Series` with a multi-dimensional indexer like ``[:, None]`` to return an ``ndarray`` now raises a ``FutureWarning``. Convert to a NumPy array before indexing instead (:issue:`27837`) - :meth:`Index.is_mixed` is deprecated and will be removed in a future version, check ``index.inferred_type`` directly instead (:issue:`32922`) -- Passing any arguments but the first one to :func:`read_html` as - positional arguments is deprecated since version 1.1. All other +- Passing any arguments but the first one to :func:`read_html` as + positional arguments is deprecated. All other arguments should be given as keyword arguments (:issue:`27573`). - Passing any arguments but ``path_or_buf`` (the first one) to - :func:`read_json` as positional arguments is deprecated since - version 1.1. All other arguments should be given as keyword - arguments (:issue:`27573`). + :func:`read_json` as positional arguments is deprecated. All + other arguments should be given as keyword arguments (:issue:`27573`). 
-- Passing any arguments but the first 2 to :func:`read_excel` as - positional arguments is deprecated since version 1.1. All other +- Passing any arguments but the first two to :func:`read_excel` as + positional arguments is deprecated. All other arguments should be given as keyword arguments (:issue:`27573`). - :func:`pandas.api.types.is_categorical` is deprecated and will be removed in a future version; use :func:`pandas.api.types.is_categorical_dtype` instead (:issue:`33385`) - :meth:`Index.get_value` is deprecated and will be removed in a future version (:issue:`19728`) -- :meth:`Series.dt.week` and `Series.dt.weekofyear` are deprecated and will be removed in a future version, use :meth:`Series.dt.isocalendar().week` instead (:issue:`33595`) +- :meth:`Series.dt.week` and :meth:`Series.dt.weekofyear` are deprecated and will be removed in a future version, use :meth:`Series.dt.isocalendar().week` instead (:issue:`33595`) - :meth:`DatetimeIndex.week` and ``DatetimeIndex.weekofyear`` are deprecated and will be removed in a future version, use ``DatetimeIndex.isocalendar().week`` instead (:issue:`33595`) - :meth:`DatetimeArray.week` and ``DatetimeArray.weekofyear`` are deprecated and will be removed in a future version, use ``DatetimeArray.isocalendar().week`` instead (:issue:`33595`) - :meth:`DateOffset.__call__` is deprecated and will be removed in a future version, use ``offset + other`` instead (:issue:`34171`) -- :meth:`~pandas.tseries.offsets.BusinessDay.apply_index` is deprecated and will be removed in a future version. Use ``offset + other`` instead (:issue:`34580`) +- :meth:`~pandas.tseries.offsets.BusinessDay.apply_index` is deprecated and will be removed in a future version. 
Use ``offset + other`` instead (:issue:`34580`) - :meth:`DataFrame.tshift` and :meth:`Series.tshift` are deprecated and will be removed in a future version, use :meth:`DataFrame.shift` and :meth:`Series.shift` instead (:issue:`11631`) - Indexing an :class:`Index` object with a float key is deprecated, and will raise an ``IndexError`` in the future. You can manually convert to an integer key @@ -835,8 +834,8 @@ Deprecations - The ``squeeze`` keyword in :meth:`~DataFrame.groupby` is deprecated and will be removed in a future version (:issue:`32380`) - The ``tz`` keyword in :meth:`Period.to_timestamp` is deprecated and will be removed in a future version; use ``per.to_timestamp(...).tz_localize(tz)`` instead (:issue:`34522`) - :meth:`DatetimeIndex.to_perioddelta` is deprecated and will be removed in a future version. Use ``index - index.to_period(freq).to_timestamp()`` instead (:issue:`34853`) -- :func:`DataFrame.melt` accepting a value_name that already exists is deprecated, and will be removed in a future version (:issue:`34731`) -- the ``center`` keyword in the :meth:`DataFrame.expanding` function is deprecated and will be removed in a future version (:issue:`20647`) +- :meth:`DataFrame.melt` accepting a ``value_name`` that already exists is deprecated, and will be removed in a future version (:issue:`34731`) +- The ``center`` keyword in the :meth:`DataFrame.expanding` function is deprecated and will be removed in a future version (:issue:`20647`) @@ -851,7 +850,7 @@ Performance improvements - Performance improvement in :class:`Timedelta` constructor (:issue:`30543`) - Performance improvement in :class:`Timestamp` constructor (:issue:`30543`) - Performance improvement in flex arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=0`` (:issue:`31296`) -- Performance improvement in arithmetic ops between :class:`DataFrame` and :class:`Series` with ``axis=1`` (:issue:`33600`) +- Performance improvement in arithmetic ops between :class:`DataFrame` 
and :class:`Series` with ``axis=1`` (:issue:`33600`) - The internal index method :meth:`~Index._shallow_copy` now copies cached attributes over to the new index, avoiding creating these again on the new index. This can speed up many operations that depend on creating copies of existing indexes (:issue:`28584`, :issue:`32640`, :issue:`32669`) @@ -861,14 +860,14 @@ Performance improvements :issue:`32825`, :issue:`32826`, :issue:`32856`, :issue:`32858`). - Performance improvement for groupby methods :meth:`~pandas.core.groupby.groupby.Groupby.first` and :meth:`~pandas.core.groupby.groupby.Groupby.last` (:issue:`34178`) -- Performance improvement in :func:`factorize` for nullable (integer and boolean) dtypes (:issue:`33064`). +- Performance improvement in :func:`factorize` for nullable (integer and Boolean) dtypes (:issue:`33064`). - Performance improvement when constructing :class:`Categorical` objects (:issue:`33921`) - Fixed performance regression in :func:`pandas.qcut` and :func:`pandas.cut` (:issue:`33921`) -- Performance improvement in reductions (sum, prod, min, max) for nullable (integer and boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). +- Performance improvement in reductions (``sum``, ``prod``, ``min``, ``max``) for nullable (integer and Boolean) dtypes (:issue:`30982`, :issue:`33261`, :issue:`33442`). 
- Performance improvement in arithmetic operations between two :class:`DataFrame` objects (:issue:`32779`) - Performance improvement in :class:`pandas.core.groupby.RollingGroupby` (:issue:`34052`) -- Performance improvement in arithmetic operations (sub, add, mul, div) for MultiIndex (:issue:`34297`) -- Performance improvement in `DataFrame[bool_indexer]` when `bool_indexer` is a list (:issue:`33924`) +- Performance improvement in arithmetic operations (``sub``, ``add``, ``mul``, ``div``) for :class:`MultiIndex` (:issue:`34297`) +- Performance improvement in ``DataFrame[bool_indexer]`` when ``bool_indexer`` is a ``list`` (:issue:`33924`) - Significant performance improvement of :meth:`io.formats.style.Styler.render` with styles added with various ways such as :meth:`io.formats.style.Styler.apply`, :meth:`io.formats.style.Styler.applymap` or :meth:`io.formats.style.Styler.bar` (:issue:`19917`) .. --------------------------------------------------------------------------- @@ -883,12 +882,12 @@ Categorical ^^^^^^^^^^^ - Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) -- Combining a ``Categorical`` with integer categories and which contains missing values with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` will now result in a float column instead of an object dtyped column (:issue:`33607`) +- Combining a :class:`Categorical` with integer categories and which contains missing values with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` will now result in a float column instead of an object dtype column (:issue:`33607`) - Bug where :func:`merge` was unable to join on non-unique categorical indices (:issue:`28189`) - Bug when passing categorical data to :class:`Index` constructor along with ``dtype=object`` incorrectly returning a :class:`CategoricalIndex` instead of object-dtype :class:`Index` (:issue:`32167`) - 
Bug where :class:`Categorical` comparison operator ``__ne__`` would incorrectly evaluate to ``False`` when either element was missing (:issue:`32276`) - :meth:`Categorical.fillna` now accepts :class:`Categorical` ``other`` argument (:issue:`32420`) -- Repr of :class:`Categorical` was not distinguishing between int and str (:issue:`33676`) +- Repr of :class:`Categorical` was not distinguishing between ``int`` and ``str`` (:issue:`33676`) Datetimelike ^^^^^^^^^^^^ @@ -897,30 +896,30 @@ Datetimelike - :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) - :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) - :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) -- Bug in :class:`Timestamp` where constructing :class:`Timestamp` from ambiguous epoch time and calling constructor again changed :meth:`Timestamp.value` property (:issue:`24329`) +- Bug in :class:`Timestamp` where constructing a :class:`Timestamp` from ambiguous epoch time and calling constructor again changed the :meth:`Timestamp.value` property (:issue:`24329`) - :meth:`DatetimeArray.searchsorted`, :meth:`TimedeltaArray.searchsorted`, :meth:`PeriodArray.searchsorted` not recognizing non-pandas scalars and incorrectly raising ``ValueError`` instead of ``TypeError`` (:issue:`30950`) - Bug in :class:`Timestamp` where constructing :class:`Timestamp` with dateutil timezone less than 128 nanoseconds before daylight saving time switch from winter to summer would result in nonexistent time (:issue:`31043`) - Bug in :meth:`Period.to_timestamp`, :meth:`Period.start_time` with microsecond frequency returning a timestamp one nanosecond earlier than the correct time (:issue:`31475`) -- :class:`Timestamp` raising confusing error message when year, month or day is missing 
(:issue:`31200`) -- Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtyped inputs (:issue:`32668`) +- :class:`Timestamp` raised a confusing error message when year, month or day is missing (:issue:`31200`) +- Bug in :class:`DatetimeIndex` constructor incorrectly accepting ``bool``-dtype inputs (:issue:`32668`) - Bug in :meth:`DatetimeIndex.searchsorted` not accepting a ``list`` or :class:`Series` as its argument (:issue:`32762`) - Bug where :meth:`PeriodIndex` raised when passed a :class:`Series` of strings (:issue:`26109`) -- Bug in :class:`Timestamp` arithmetic when adding or subtracting a ``np.ndarray`` with ``timedelta64`` dtype (:issue:`33296`) -- Bug in :meth:`DatetimeIndex.to_period` not infering the frequency when called with no arguments (:issue:`33358`) -- Bug in :meth:`DatetimeIndex.tz_localize` incorrectly retaining ``freq`` in some cases where the original freq is no longer valid (:issue:`30511`) +- Bug in :class:`Timestamp` arithmetic when adding or subtracting an ``np.ndarray`` with ``timedelta64`` dtype (:issue:`33296`) +- Bug in :meth:`DatetimeIndex.to_period` not inferring the frequency when called with no arguments (:issue:`33358`) +- Bug in :meth:`DatetimeIndex.tz_localize` incorrectly retaining ``freq`` in some cases where the original ``freq`` is no longer valid (:issue:`30511`) - Bug in :meth:`DatetimeIndex.intersection` losing ``freq`` and timezone in some cases (:issue:`33604`) - Bug in :meth:`DatetimeIndex.get_indexer` where incorrect output would be returned for mixed datetime-like targets (:issue:`33741`) - Bug in :class:`DatetimeIndex` addition and subtraction with some types of :class:`DateOffset` objects incorrectly retaining an invalid ``freq`` attribute (:issue:`33779`) - Bug in :class:`DatetimeIndex` where setting the ``freq`` attribute on an index could silently change the ``freq`` attribute on another index viewing the same data (:issue:`33552`) -- :meth:`DataFrame.min`/:meth:`DataFrame.max` not returning 
consistent result with :meth:`Series.min`/:meth:`Series.max` when called on objects initialized with empty :func:`pd.to_datetime` +- :meth:`DataFrame.min` and :meth:`DataFrame.max` were not returning consistent results with :meth:`Series.min` and :meth:`Series.max` when called on objects initialized with empty :func:`pd.to_datetime` - Bug in :meth:`DatetimeIndex.intersection` and :meth:`TimedeltaIndex.intersection` with results not having the correct ``name`` attribute (:issue:`33904`) - Bug in :meth:`DatetimeArray.__setitem__`, :meth:`TimedeltaArray.__setitem__`, :meth:`PeriodArray.__setitem__` incorrectly allowing values with ``int64`` dtype to be silently cast (:issue:`33717`) - Bug in subtracting :class:`TimedeltaIndex` from :class:`Period` incorrectly raising ``TypeError`` in some cases where it should succeed and ``IncompatibleFrequency`` in some cases where it should raise ``TypeError`` (:issue:`33883`) -- Bug in constructing a Series or Index from a read-only NumPy array with non-ns +- Bug in constructing a :class:`Series` or :class:`Index` from a read-only NumPy array with non-ns resolution which converted to object dtype instead of coercing to ``datetime64[ns]`` dtype when within the timestamp bounds (:issue:`34843`). 
- The ``freq`` keyword in :class:`Period`, :func:`date_range`, :func:`period_range`, :func:`pd.tseries.frequencies.to_offset` no longer allows tuples, pass as string instead (:issue:`34703`) -- Bug in :meth:`DataFrame.append` when appending a :class:`Series` containing a scalar tz-aware :class:`Timestamp` to an empty :class:`DataFrame` resulted in an object column instead of datetime64[ns, tz] dtype (:issue:`35038`) +- Bug in :meth:`DataFrame.append` when appending a :class:`Series` containing a scalar tz-aware :class:`Timestamp` to an empty :class:`DataFrame` resulted in an object column instead of ``datetime64[ns, tz]`` dtype (:issue:`35038`) - ``OutOfBoundsDatetime`` issues an improved error message when timestamp is out of implementation bounds. (:issue:`32967`) - Bug in :meth:`AbstractHolidayCalendar.holidays` when no rules were defined (:issue:`31415`) - Bug in :class:`Tick` comparisons raising ``TypeError`` when comparing against timedelta-like objects (:issue:`34088`) @@ -931,13 +930,13 @@ Timedelta - Bug in constructing a :class:`Timedelta` with a high precision integer that would round the :class:`Timedelta` components (:issue:`31354`) - Bug in dividing ``np.nan`` or ``None`` by :class:`Timedelta` incorrectly returning ``NaT`` (:issue:`31869`) -- Timedeltas now understand ``µs`` as identifier for microsecond (:issue:`32899`) +- :class:`Timedelta` now understands ``µs`` as an identifier for microsecond (:issue:`32899`) - :class:`Timedelta` string representation now includes nanoseconds, when nanoseconds are non-zero (:issue:`9309`) -- Bug in comparing a :class:`Timedelta` object against a ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`) +- Bug in comparing a :class:`Timedelta` object against an ``np.ndarray`` with ``timedelta64`` dtype incorrectly viewing all entries as unequal (:issue:`33441`) - Bug in :func:`timedelta_range` that produced an extra point on a edge case (:issue:`30353`, :issue:`33498`) - 
Bug in :meth:`DataFrame.resample` that produced an extra point on a edge case (:issue:`30353`, :issue:`13022`, :issue:`33498`) - Bug in :meth:`DataFrame.resample` that ignored the ``loffset`` argument when dealing with timedelta (:issue:`7687`, :issue:`33498`) -- Bug in :class:`Timedelta` and `pandas.to_timedelta` that ignored `unit`-argument for string input (:issue:`12136`) +- Bug in :class:`Timedelta` and :func:`pandas.to_timedelta` that ignored the ``unit`` argument for string input (:issue:`12136`) Timezones ^^^^^^^^^ @@ -948,24 +947,24 @@ Timezones Numeric ^^^^^^^ - Bug in :meth:`DataFrame.floordiv` with ``axis=0`` not treating division-by-zero like :meth:`Series.floordiv` (:issue:`31271`) -- Bug in :meth:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently fails (:issue:`32394`) -- Bug in :meth:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`) +- Bug in :func:`to_numeric` with string argument ``"uint64"`` and ``errors="coerce"`` silently fails (:issue:`32394`) +- Bug in :func:`to_numeric` with ``downcast="unsigned"`` fails for empty data (:issue:`32493`) - Bug in :meth:`DataFrame.mean` with ``numeric_only=False`` and either ``datetime64`` dtype or ``PeriodDtype`` column incorrectly raising ``TypeError`` (:issue:`32426`) - Bug in :meth:`DataFrame.count` with ``level="foo"`` and index level ``"foo"`` containing NaNs causes segmentation fault (:issue:`21824`) - Bug in :meth:`DataFrame.diff` with ``axis=1`` returning incorrect results with mixed dtypes (:issue:`32995`) - Bug in :meth:`DataFrame.corr` and :meth:`DataFrame.cov` raising when handling nullable integer columns with ``pandas.NA`` (:issue:`33803`) -- Bug in arithmetic operations between ``DataFrame`` objects with non-overlapping columns with duplicate labels causing an infinite loop (:issue:`35194`) +- Bug in arithmetic operations between :class:`DataFrame` objects with non-overlapping columns with duplicate labels causing an infinite loop 
(:issue:`35194`) - Bug in :class:`DataFrame` and :class:`Series` addition and subtraction between object-dtype objects and ``datetime64`` dtype objects (:issue:`33824`) -- Bug in :meth:`Index.difference` incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`) +- Bug in :meth:`Index.difference` giving incorrect results when comparing a :class:`Float64Index` and object :class:`Index` (:issue:`35217`) - Bug in :class:`DataFrame` reductions (e.g. ``df.min()``, ``df.max()``) with ``ExtensionArray`` dtypes (:issue:`34520`, :issue:`32651`) -- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) +- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raise a ValueError if ``limit_direction`` is ``'forward'`` or ``'both'`` and ``method`` is ``'backfill'`` or ``'bfill'`` or ``limit_direction`` is ``'backward'`` or ``'both'`` and ``method`` is ``'pad'`` or ``'ffill'`` (:issue:`34746`) Conversion ^^^^^^^^^^ - Bug in :class:`Series` construction from NumPy array with big-endian ``datetime64`` dtype (:issue:`29684`) - Bug in :class:`Timedelta` construction with large nanoseconds keyword value (:issue:`32402`) - Bug in :class:`DataFrame` construction where sets would be duplicated rather than raising (:issue:`32582`) -- The :class:`DataFrame` constructor no longer accepts a list of ``DataFrame`` objects. Because of changes to NumPy, ``DataFrame`` objects are now consistently treated as 2D objects, so a list of ``DataFrames`` is considered 3D, and no longer acceptible for the ``DataFrame`` constructor (:issue:`32289`). +- The :class:`DataFrame` constructor no longer accepts a list of :class:`DataFrame` objects. 
Because of changes to NumPy, :class:`DataFrame` objects are now consistently treated as 2D objects, so a list of :class:`DataFrame` objects is considered 3D, and no longer acceptable for the :class:`DataFrame` constructor (:issue:`32289`). - Bug in :class:`DataFrame` when initiating a frame with lists and assign ``columns`` with nested list for ``MultiIndex`` (:issue:`32173`) - Improved error message for invalid construction of list when creating a new index (:issue:`35190`) @@ -996,42 +995,42 @@ Indexing - Bug in :meth:`DataFrame.at` when either columns or index is non-unique (:issue:`33041`) - Bug in :meth:`Series.loc` and :meth:`DataFrame.loc` when indexing with an integer key on a object-dtype :class:`Index` that is not all-integers (:issue:`31905`) - Bug in :meth:`DataFrame.iloc.__setitem__` on a :class:`DataFrame` with duplicate columns incorrectly setting values for all matching columns (:issue:`15686`, :issue:`22036`) -- Bug in :meth:`DataFrame.loc:` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) +- Bug in :meth:`DataFrame.loc` and :meth:`Series.loc` with a :class:`DatetimeIndex`, :class:`TimedeltaIndex`, or :class:`PeriodIndex` incorrectly allowing lookups of non-matching datetime-like dtypes (:issue:`32650`) - Bug in :meth:`Series.__getitem__` indexing with non-standard scalars, e.g. 
``np.dtype`` (:issue:`32684`) -- Bug in :class:`Index` constructor where an unhelpful error message was raised for ``numpy`` scalars (:issue:`33017`) +- Bug in :class:`Index` constructor where an unhelpful error message was raised for NumPy scalars (:issue:`33017`) - Bug in :meth:`DataFrame.lookup` incorrectly raising an ``AttributeError`` when ``frame.index`` or ``frame.columns`` is not unique; this will now raise a ``ValueError`` with a helpful error message (:issue:`33041`) - Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`32831`) - Bug in :class:`Interval` where a :class:`Timedelta` could not be added or subtracted from a :class:`Timestamp` interval (:issue:`32023`) -- Bug in :meth:`DataFrame.copy` _item_cache not invalidated after copy causes post-copy value updates to not be reflected (:issue:`31784`) +- Bug in :meth:`DataFrame.copy` not invalidating _item_cache after copy caused post-copy value updates to not be reflected (:issue:`31784`) - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` throwing an error when a ``datetime64[ns, tz]`` value is provided (:issue:`32395`) -- Bug in `Series.__getitem__` with an integer key and a :class:`MultiIndex` with leading integer level failing to raise ``KeyError`` if the key is not present in the first level (:issue:`33355`) -- Bug in :meth:`DataFrame.iloc` when slicing a single column-:class:`DataFrame` with ``ExtensionDtype`` (e.g. ``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`) +- Bug in :meth:`Series.__getitem__` with an integer key and a :class:`MultiIndex` with leading integer level failing to raise ``KeyError`` if the key is not present in the first level (:issue:`33355`) +- Bug in :meth:`DataFrame.iloc` when slicing a single column :class:`DataFrame` with ``ExtensionDtype`` (e.g. 
``df.iloc[:, :1]``) returning an invalid result (:issue:`32957`) - Bug in :meth:`DatetimeIndex.insert` and :meth:`TimedeltaIndex.insert` causing index ``freq`` to be lost when setting an element into an empty :class:`Series` (:issue:`33573`) - Bug in :meth:`Series.__setitem__` with an :class:`IntervalIndex` and a list-like key of integers (:issue:`33473`) - Bug in :meth:`Series.__getitem__` allowing missing labels with ``np.ndarray``, :class:`Index`, :class:`Series` indexers but not ``list``, these now all raise ``KeyError`` (:issue:`33646`) - Bug in :meth:`DataFrame.truncate` and :meth:`Series.truncate` where index was assumed to be monotone increasing (:issue:`33756`) -- Indexing with a list of strings representing datetimes failed on :class:`DatetimeIndex` or :class:`PeriodIndex`(:issue:`11278`) +- Indexing with a list of strings representing datetimes failed on :class:`DatetimeIndex` or :class:`PeriodIndex` (:issue:`11278`) - Bug in :meth:`Series.at` when used with a :class:`MultiIndex` would raise an exception on valid inputs (:issue:`26989`) - Bug in :meth:`DataFrame.loc` with dictionary of values changes columns with dtype of ``int`` to ``float`` (:issue:`34573`) -- Bug in :meth:`Series.loc` when used with a :class:`MultiIndex` would raise an IndexingError when accessing a None value (:issue:`34318`) +- Bug in :meth:`Series.loc` when used with a :class:`MultiIndex` would raise an ``IndexingError`` when accessing a ``None`` value (:issue:`34318`) - Bug in :meth:`DataFrame.reset_index` and :meth:`Series.reset_index` would not preserve data types on an empty :class:`DataFrame` or :class:`Series` with a :class:`MultiIndex` (:issue:`19602`) - Bug in :class:`Series` and :class:`DataFrame` indexing with a ``time`` key on a :class:`DatetimeIndex` with ``NaT`` entries (:issue:`35114`) Missing ^^^^^^^ -- Calling :meth:`fillna` on an empty Series now correctly returns a shallow copied object. 
The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). -- Bug in :meth:`replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ``<NA>`` was raising a ``TypeError``. The method now handles this by ignoring ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`) -- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ``<NA>`` for all ``False`` or all ``True`` values using the nulllable boolean dtype and with ``skipna=False`` (:issue:`33253`) -- Clarified documentation on interpolate with method =akima. The ``der`` parameter must be scalar or None (:issue:`33426`) -- :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns lead to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`fillna` (:issue:`12918`, :issue:`29146`) -- Bug in :meth:`DataFrame.interpolate` when called on a DataFrame with column names of string type was throwing a ValueError. The method is no independing of the type of column names (:issue:`33956`) -- passing :class:`NA` will into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``"<NA>"`` (:issue:`34740`) +- Calling :meth:`fillna` on an empty :class:`Series` now correctly returns a shallow copied object. The behaviour is now consistent with :class:`Index`, :class:`DataFrame` and a non-empty :class:`Series` (:issue:`32543`). +- Bug in :meth:`Series.replace` when argument ``to_replace`` is of type dict/list and is used on a :class:`Series` containing ``<NA>`` was raising a ``TypeError``.
The method now handles this by ignoring ``<NA>`` values when doing the comparison for the replacement (:issue:`32621`) +- Bug in :meth:`~Series.any` and :meth:`~Series.all` incorrectly returning ``<NA>`` for all ``False`` or all ``True`` values using the nullable Boolean dtype and with ``skipna=False`` (:issue:`33253`) +- Clarified documentation on interpolate with ``method=akima``. The ``der`` parameter must be scalar or ``None`` (:issue:`33426`) +- :meth:`DataFrame.interpolate` uses the correct axis convention now. Previously interpolating along columns led to interpolation along indices and vice versa. Furthermore interpolating with methods ``pad``, ``ffill``, ``bfill`` and ``backfill`` are identical to using these methods with :meth:`DataFrame.fillna` (:issue:`12918`, :issue:`29146`) +- Bug in :meth:`DataFrame.interpolate` when called on a :class:`DataFrame` with column names of string type was throwing a ``ValueError``. The method is now independent of the type of the column names (:issue:`33956`) +- Passing :class:`NA` into a format string using format specs will now work. For example ``"{:.1f}".format(pd.NA)`` would previously raise a ``ValueError``, but will now return the string ``"<NA>"`` (:issue:`34740`) - Bug in :meth:`Series.map` not raising on invalid ``na_action`` (:issue:`32815`) MultiIndex ^^^^^^^^^^ -- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. Previously an ``AttributeError`` was raised (:issue:`31126`) +- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. Previously an ``AttributeError`` was raised (:issue:`31126`) - Bug in :meth:`Dataframe.loc` when used with a :class:`MultiIndex`. The returned values were not in the same order as the given inputs (:issue:`22797`) ..
ipython:: python @@ -1051,128 +1050,128 @@ MultiIndex # Common elements are now guaranteed to be ordered by the left side left.intersection(right, sort=False) -- Bug when joining 2 Multi-indexes, without specifying level with different columns. Return-indexers parameter is ignored. (:issue:`34074`) +- Bug when joining two :class:`MultiIndex` without specifying level with different columns. Return-indexers parameter was ignored. (:issue:`34074`) I/O ^^^ -- Passing a `set` as `names` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) +- Passing a ``set`` as ``names`` argument to :func:`pandas.read_csv`, :func:`pandas.read_table`, or :func:`pandas.read_fwf` will raise ``ValueError: Names should be an ordered collection.`` (:issue:`34946`) - Bug in print-out when ``display.precision`` is zero. (:issue:`20359`) -- Bug in :meth:`read_json` where integer overflow was occurring when json contains big number strings. (:issue:`30320`) -- `read_csv` will now raise a ``ValueError`` when the arguments `header` and `prefix` both are not `None`. (:issue:`27394`) +- Bug in :func:`read_json` where integer overflow was occurring when json contains big number strings. (:issue:`30320`) +- :func:`read_csv` will now raise a ``ValueError`` when the arguments ``header`` and ``prefix`` both are not ``None``. (:issue:`27394`) - Bug in :meth:`DataFrame.to_json` was raising ``NotFoundError`` when ``path_or_buf`` was an S3 URI (:issue:`28375`) - Bug in :meth:`DataFrame.to_parquet` overwriting pyarrow's default for ``coerce_timestamps``; following pyarrow's default allows writing nanosecond timestamps with ``version="2.0"`` (:issue:`31652`). 
-- Bug in :meth:`read_csv` was raising `TypeError` when `sep=None` was used in combination with `comment` keyword (:issue:`31396`) -- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a DataFrame in Python 3 from fixed format written in Python 2 (:issue:`31750`) +- Bug in :func:`read_csv` was raising ``TypeError`` when ``sep=None`` was used in combination with ``comment`` keyword (:issue:`31396`) +- Bug in :class:`HDFStore` that caused it to set to ``int64`` the dtype of a ``datetime64`` column when reading a :class:`DataFrame` in Python 3 from fixed format written in Python 2 (:issue:`31750`) - :func:`read_sas()` now handles dates and datetimes larger than :attr:`Timestamp.max` returning them as :class:`datetime.datetime` objects (:issue:`20927`) - Bug in :meth:`DataFrame.to_json` where ``Timedelta`` objects would not be serialized correctly with ``date_format="iso"`` (:issue:`28256`) -- :func:`read_csv` will raise a ``ValueError`` when the column names passed in `parse_dates` are missing in the Dataframe (:issue:`31251`) -- Bug in :meth:`read_excel` where a UTF-8 string with a high surrogate would cause a segmentation violation (:issue:`23809`) -- Bug in :meth:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) -- Bug in :meth:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) -- Bug in :meth:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) -- Bug in :meth:`read_csv` was raising an ``IndexError`` when header=None and 2 extra data columns -- Bug in :meth:`read_sas` was raising an ``AttributeError`` when reading files from Google Cloud Storage (issue:`33069`) - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) -- Bug in :meth:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells. (:issue:`32207`) -- Bug in :meth:`read_json` was raising ``TypeError`` when reading a list of booleans into a Series. (:issue:`31464`) -- Bug in :func:`pandas.io.json.json_normalize` where location specified by `record_path` doesn't point to an array. (:issue:`26284`) +- :func:`read_csv` will raise a ``ValueError`` when the column names passed in ``parse_dates`` are missing in the :class:`DataFrame` (:issue:`31251`) +- Bug in :func:`read_excel` where a UTF-8 string with a high surrogate
would cause a segmentation violation (:issue:`23809`) +- Bug in :func:`read_csv` was causing a file descriptor leak on an empty file (:issue:`31488`) +- Bug in :func:`read_csv` was causing a segfault when there were blank lines between the header and data rows (:issue:`28071`) +- Bug in :func:`read_csv` was raising a misleading exception on a permissions issue (:issue:`23784`) +- Bug in :func:`read_csv` was raising an ``IndexError`` when ``header=None`` and two extra data columns +- Bug in :func:`read_sas` was raising an ``AttributeError`` when reading files from Google Cloud Storage (:issue:`33069`) - Bug in :meth:`DataFrame.to_sql` where an ``AttributeError`` was raised when saving an out of bounds date (:issue:`26761`) -- Bug in :meth:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells. (:issue:`32207`) -- Bug in :meth:`read_json` was raising ``TypeError`` when reading a list of booleans into a Series. (:issue:`31464`) -- Bug in :func:`pandas.io.json.json_normalize` where location specified by `record_path` doesn't point to an array. (:issue:`26284`) +- Bug in :func:`read_excel` did not correctly handle multiple embedded spaces in OpenDocument text cells. (:issue:`32207`) +- Bug in :func:`read_json` was raising ``TypeError`` when reading a ``list`` of Booleans into a :class:`Series`. (:issue:`31464`) +- Bug in :func:`pandas.io.json.json_normalize` where location specified by ``record_path`` doesn't point to an array. 
(:issue:`26284`) - :func:`pandas.read_hdf` has a more explicit error message when loading an unsupported HDF file (:issue:`9539`) -- Bug in :meth:`~DataFrame.read_feather` was raising an `ArrowIOError` when reading an s3 or http file path (:issue:`29055`) -- Bug in :meth:`~DataFrame.to_excel` could not handle the column name `render` and was raising an ``KeyError`` (:issue:`34331`) -- Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the `%` character and no parameters were present (:issue:`34211`) -- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`) -- :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) -- `TypeError` exceptions raised by :meth:`read_csv` and :meth:`read_table` were showing as ``parser_f`` when an unexpected keyword argument was passed (:issue:`25648`) -- Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) -- Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`) -- Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the min_itemsize parameter is set (:issue:`11238`) -- Bug in :meth:`~HDFStore.create_table` now raises an error when `column` argument was not specified in `data_columns` on input (:issue:`28156`) -- :meth:`read_json` now could read line-delimited json file from a file url while `lines` and `chunksize` are set. 
+- Bug in :meth:`~DataFrame.read_feather` was raising an ``ArrowIOError`` when reading an s3 or http file path (:issue:`29055`) +- Bug in :meth:`~DataFrame.to_excel` could not handle the column name ``render`` and was raising an ``KeyError`` (:issue:`34331`) +- Bug in :meth:`~SQLDatabase.execute` was raising a ``ProgrammingError`` for some DB-API drivers when the SQL statement contained the ``%`` character and no parameters were present (:issue:`34211`) +- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with different dtypes when reading data using an iterator. (:issue:`31544`) +- :meth:`HDFStore.keys` has now an optional ``include`` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`) +- ``TypeError`` exceptions raised by :func:`read_csv` and :func:`read_table` were showing as ``parser_f`` when an unexpected keyword argument was passed (:issue:`25648`) +- Bug in :func:`read_excel` for ODS files removes 0.0 values (:issue:`27222`) +- Bug in :func:`ujson.encode` was raising an ``OverflowError`` with numbers larger than ``sys.maxsize`` (:issue:`34395`) +- Bug in :meth:`HDFStore.append_to_multiple` was raising a ``ValueError`` when the ``min_itemsize`` parameter is set (:issue:`11238`) +- Bug in :meth:`~HDFStore.create_table` now raises an error when ``column`` argument was not specified in ``data_columns`` on input (:issue:`28156`) +- :func:`read_json` now could read line-delimited json file from a file url while ``lines`` and ``chunksize`` are set. 
- Bug in :meth:`DataFrame.to_sql` when reading DataFrames with ``-np.inf`` entries with MySQL now has a more explicit ``ValueError`` (:issue:`34431`) - Bug where capitalised files extensions were not decompressed by read_* functions (:issue:`35164`) -- Bug in :meth:`read_excel` that was raising a ``TypeError`` when ``header=None`` and ``index_col`` given as list (:issue:`31783`) -- Bug in :meth:`read_excel` where datetime values are used in the header in a `MultiIndex` (:issue:`34748`) -- :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) -- Bug in :meth:`DataFrame.to_records` incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) +- Bug in :meth:`read_excel` that was raising a ``TypeError`` when ``header=None`` and ``index_col`` is given as a ``list`` (:issue:`31783`) +- Bug in :func:`read_excel` where datetime values are used in the header in a :class:`MultiIndex` (:issue:`34748`) +- :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in the keyword argument ``chunksize`` now raises a ``TypeError`` (previously raised a ``NotImplementedError``), while passing in the keyword argument ``encoding`` now raises a ``TypeError`` (:issue:`34464`) +- Bug in :meth:`DataFrame.to_records` was incorrectly losing timezone information in timezone-aware ``datetime64`` columns (:issue:`32535`) Plotting ^^^^^^^^ -- :func:`.plot` for line/bar now accepts color by dictonary (:issue:`8193`). +- :meth:`DataFrame.plot` for line/bar now accepts color by dictionary (:issue:`8193`). 
- Bug in :meth:`DataFrame.plot.hist` where weights are not working for multiple columns (:issue:`33173`) -- Bug in :meth:`DataFrame.boxplot` and :meth:`DataFrame.plot.boxplot` lost color attributes of ``medianprops``, ``whiskerprops``, ``capprops`` and ``medianprops`` (:issue:`30346`) +- Bug in :meth:`DataFrame.boxplot` and :meth:`DataFrame.plot.boxplot` lost color attributes of ``medianprops``, ``whiskerprops``, ``capprops`` and ``boxprops`` (:issue:`30346`) - Bug in :meth:`DataFrame.hist` where the order of ``column`` argument was ignored (:issue:`29235`) -- Bug in :meth:`DataFrame.plot.scatter` that when adding multiple plots with different ``cmap``, colorbars alway use the first ``cmap`` (:issue:`33389`) -- Bug in :meth:`DataFrame.plot.scatter` was adding a colorbar to the plot even if the argument `c` was assigned to a column containing color names (:issue:`34316`) +- Bug in :meth:`DataFrame.plot.scatter` that when adding multiple plots with different ``cmap``, colorbars always use the first ``cmap`` (:issue:`33389`) +- Bug in :meth:`DataFrame.plot.scatter` was adding a colorbar to the plot even if the argument ``c`` was assigned to a column containing color names (:issue:`34316`) - Bug in :meth:`pandas.plotting.bootstrap_plot` was causing cluttered axes and overlapping labels (:issue:`34905`) - Bug in :meth:`DataFrame.plot.scatter` caused an error when plotting variable marker sizes (:issue:`32904`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) -- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) now raise a ``TypeError`` if a not-accepted keyword argument is passed into it. 
Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) -- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) -- Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) +- Using a :class:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :class:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) +- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) now raise a ``TypeError`` if a non-accepted keyword argument is passed into it. Previously an ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) +- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted, has duplicates, and the applied ``func`` does not mutate passed in objects (:issue:`30667`) +- Bug in :meth:`DataFrameGroupBy.transform` produces an incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`Groupby.transform` was returning the wrong result when grouping by multiple keys of which some were categorical and others not (:issue:`32494`) -- Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by column contains NaNs (:issue:`32841`) -- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) +- Bug in :meth:`GroupBy.count` causes segmentation fault when grouped-by columns contain NaNs (:issue:`32841`) +- Bug in :meth:`DataFrame.groupby` and 
:meth:`Series.groupby` produces inconsistent type when aggregating Boolean :class:`Series` (:issue:`32894`) - Bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` where a large negative number would be returned when the number of non-null values was below ``min_count`` for nullable integer dtypes (:issue:`32861`) -- Bug in :meth:`SeriesGroupBy.quantile` raising on nullable integers (:issue:`33136`) +- Bug in :meth:`SeriesGroupBy.quantile` was raising on nullable integers (:issue:`33136`) - Bug in :meth:`DataFrame.resample` where an ``AmbiguousTimeError`` would be raised when the resulting timezone aware :class:`DatetimeIndex` had a DST transition at midnight (:issue:`25758`) - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) -- Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. (:issue:`34422`) -- Bug in :meth:`DataFrame.groupby` lost index, when one of the ``agg`` keys referenced an empty list (:issue:`32580`) +- Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of :class:`SeriesGroupBy` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. 
(:issue:`34422`) +- Bug in :meth:`DataFrame.groupby` lost the name of the :class:`Index` when one of the ``agg`` keys referenced an empty list (:issue:`32580`) - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) -- Bug in :meth:`core.groupby.DataFrameGroupBy.quantile` raises ``TypeError`` for non-numeric types rather than dropping columns (:issue:`27892`) +- Bug in :meth:`core.groupby.DataFrameGroupBy.quantile` raised ``TypeError`` for non-numeric types rather than dropping the columns (:issue:`27892`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) - Bug in :meth:`DataFrame.groupby` raising an ``AttributeError`` when selecting a column and aggregating with ``as_index=False`` (:issue:`35246`). -- Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) +- Bug in :meth:`DataFrameGroupBy.first` and :meth:`DataFrameGroupBy.last` that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) Reshaping ^^^^^^^^^ -- Bug effecting all numeric and boolean reduction methods not returning subclassed data type. (:issue:`25596`) -- Bug in :meth:`DataFrame.pivot_table` when only MultiIndexed columns is set (:issue:`17038`) -- Bug in :meth:`DataFrame.unstack` and :meth:`Series.unstack` can take tuple names in MultiIndexed data (:issue:`19966`) +- Bug affecting all numeric and Boolean reduction methods not returning subclassed data type.
(:issue:`25596`) +- Bug in :meth:`DataFrame.pivot_table` when only :class:`MultiIndex` columns is set (:issue:`17038`) +- Bug in :meth:`DataFrame.unstack` and :meth:`Series.unstack` can take tuple names in :class:`MultiIndex` data (:issue:`19966`) - Bug in :meth:`DataFrame.pivot_table` when ``margin`` is ``True`` and only ``column`` is defined (:issue:`31016`) -- Fix incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) -- Bug in :func:`crosstab` when inputs are two Series and have tuple names, the output will keep dummy MultiIndex as columns. (:issue:`18321`) +- Fixed incorrect error message in :meth:`DataFrame.pivot` when ``columns`` is set to ``None``. (:issue:`30924`) +- Bug in :func:`crosstab` when inputs are two :class:`Series` and have tuple names, the output will keep a dummy :class:`MultiIndex` as columns. (:issue:`18321`) - :meth:`DataFrame.pivot` can now take lists for ``index`` and ``columns`` arguments (:issue:`21425`) - Bug in :func:`concat` where the resulting indices are not copied when ``copy=True`` (:issue:`29879`) - Bug in :meth:`SeriesGroupBy.aggregate` was resulting in aggregations being overwritten when they shared the same name (:issue:`30880`) -- Bug where :meth:`Index.astype` would lose the name attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`) -- :meth:`Series.append` will now raise a ``TypeError`` when passed a DataFrame or a sequence containing Dataframe (:issue:`31413`) +- Bug where :meth:`Index.astype` would lose the :attr:`name` attribute when converting from ``Float64Index`` to ``Int64Index``, or when casting to an ``ExtensionArray`` dtype (:issue:`32013`) +- :meth:`Series.append` will now raise a ``TypeError`` when passed a :class:`DataFrame` or a sequence containing :class:`DataFrame` (:issue:`31413`) - :meth:`DataFrame.replace` and :meth:`Series.replace` will raise a ``TypeError`` if
``to_replace`` is not an expected type. Previously the ``replace`` would fail silently (:issue:`18634`) -- Bug on inplace operation of a Series that was adding a column to the DataFrame from where it was originally dropped from (using inplace=True) (:issue:`30484`) +- Bug on inplace operation of a :class:`Series` that was adding a column to the :class:`DataFrame` from where it was originally dropped from (using ``inplace=True``) (:issue:`30484`) - Bug in :meth:`DataFrame.apply` where callback was called with :class:`Series` parameter even though ``raw=True`` requested. (:issue:`32423`) - Bug in :meth:`DataFrame.pivot_table` losing timezone information when creating a :class:`MultiIndex` level from a column with timezone-aware dtype (:issue:`32558`) -- Bug in :meth:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) -- :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregating non-existant column (:issue:`32755`) -- Bug in :meth:`DataFrame.unstack` when MultiIndexed columns and MultiIndexed rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) -- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: Can only append a dict if ignore_index=True`` instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) +- Bug in :func:`concat` where when passing a non-dict mapping as ``objs`` would raise a ``TypeError`` (:issue:`32863`) +- :meth:`DataFrame.agg` now provides more descriptive ``SpecificationError`` message when attempting to aggregate a non-existent column (:issue:`32755`) +- Bug in :meth:`DataFrame.unstack` when :class:`MultiIndex` columns and :class:`MultiIndex` rows were used (:issue:`32624`, :issue:`24729` and :issue:`28306`) +- Appending a dictionary to a :class:`DataFrame` without passing ``ignore_index=True`` will raise ``TypeError: 
Can only append a dict if ignore_index=True`` instead of ``TypeError: Can only append a Series if ignore_index=True or if the Series has a name`` (:issue:`30871`) - Bug in :meth:`DataFrame.corrwith()`, :meth:`DataFrame.memory_usage()`, :meth:`DataFrame.dot()`, :meth:`DataFrame.idxmin()`, :meth:`DataFrame.idxmax()`, :meth:`DataFrame.duplicated()`, :meth:`DataFrame.isin()`, :meth:`DataFrame.count()`, :meth:`Series.explode()`, :meth:`Series.asof()` and :meth:`DataFrame.asof()` not returning subclassed types. (:issue:`31331`) -- Bug in :func:`concat` was not allowing for concatenation of ``DataFrame`` and ``Series`` with duplicate keys (:issue:`33654`) -- Bug in :func:`cut` raised an error when non-unique labels (:issue:`33141`) +- Bug in :func:`concat` was not allowing for concatenation of :class:`DataFrame` and :class:`Series` with duplicate keys (:issue:`33654`) +- Bug in :func:`cut` raised an error when the argument ``labels`` contains duplicates (:issue:`33141`) - Ensure only named functions can be used in :func:`eval()` (:issue:`32460`) -- Bug in :func:`Dataframe.aggregate` and :func:`Series.aggregate` was causing recursive loop in some cases (:issue:`34224`) -- Fixed bug in :func:`melt` where melting MultiIndex columns with ``col_level`` > 0 would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) -- Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`) -- Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements whth ``S`` dtype (:issue:`34529`) +- Bug in :meth:`DataFrame.aggregate` and :meth:`Series.aggregate` was causing a recursive loop in some cases (:issue:`34224`) +- Fixed bug in :func:`melt` where melting :class:`MultiIndex` columns with ``col_level > 0`` would raise a ``KeyError`` on ``id_vars`` (:issue:`34129`) +- Bug in :meth:`Series.where` with an empty :class:`Series` and empty ``cond`` having non-bool dtype (:issue:`34592`) +- Fixed regression
where :meth:`DataFrame.apply` would raise ``ValueError`` for elements with ``S`` dtype (:issue:`34529`) Sparse ^^^^^^ - Creating a :class:`SparseArray` from timezone-aware dtype will issue a warning before dropping timezone information, instead of doing so silently (:issue:`32501`) - Bug in :meth:`arrays.SparseArray.from_spmatrix` wrongly read scipy sparse matrix (:issue:`31991`) -- Bug in :meth:`Series.sum` with ``SparseArray`` raises ``TypeError`` (:issue:`25777`) +- Bug in :meth:`Series.sum` with ``SparseArray`` raised a ``TypeError`` (:issue:`25777`) - Bug where :class:`DataFrame` containing an all-sparse :class:`SparseArray` filled with ``NaN`` when indexed by a list-like (:issue:`27781`, :issue:`29563`) - The repr of :class:`SparseDtype` now includes the repr of its ``fill_value`` attribute. Previously it used ``fill_value``'s string representation (:issue:`34352`) - Bug where empty :class:`DataFrame` could not be cast to :class:`SparseDtype` (:issue:`33113`) @@ -1182,23 +1181,23 @@ ExtensionArray ^^^^^^^^^^^^^^ - Fixed bug where :meth:`Series.value_counts` would raise on empty input of ``Int64`` dtype (:issue:`33317`) -- Fixed bug in :func:`concat` when concatenating DataFrames with non-overlaping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`) +- Fixed bug in :func:`concat` when concatenating :class:`DataFrame` objects with non-overlapping columns resulting in object-dtype columns rather than preserving the extension dtype (:issue:`27692`, :issue:`33027`) - Fixed bug where :meth:`StringArray.isna` would return ``False`` for NA values when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`33655`) - Fixed bug in :class:`Series` construction with EA dtype and index but no data or scalar data fails (:issue:`26469`) - Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`). 
- Fixed bug where :meth:`Series.update` would raise a ``ValueError`` for ``ExtensionArray`` dtypes with missing values (:issue:`33980`) - Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`) -- Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`) -- Fixed bug that `DataFrame(columns=.., dtype='string')` would fail (:issue:`27953`, :issue:`33623`) +- Fixed bug where :meth:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable Boolean dtypes (:issue:`34051`) +- Fixed bug where the constructor of :class:`DataFrame` with ``dtype='string'`` would fail (:issue:`27953`, :issue:`33623`) - Bug where :class:`DataFrame` column set to scalar extension type was considered an object type rather than the extension type (:issue:`34832`) -- Fixed bug in ``IntegerArray.astype`` to correctly copy the mask as well (:issue:`34931`). +- Fixed bug in :meth:`IntegerArray.astype` to correctly copy the mask as well (:issue:`34931`). Other ^^^^^ - Set operations on an object-dtype :class:`Index` now always return object-dtype results (:issue:`31401`) -- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if left object is a different subclass with ``check_series_type=True`` (:issue:`32670`). -- Getting a missing attribute in a query/eval string raises the correct ``AttributeError`` (:issue:`32408`) +- Fixed :func:`pandas.testing.assert_series_equal` to correctly raise if the ``left`` argument is a different subclass with ``check_series_type=True`` (:issue:`32670`). 
+- Getting a missing attribute in a :meth:`DataFrame.query` or :meth:`DataFrame.eval` string raises the correct ``AttributeError`` (:issue:`32408`) - Fixed bug in :func:`pandas.testing.assert_series_equal` where dtypes were checked for ``Interval`` and ``ExtensionArray`` operands when ``check_dtype`` was ``False`` (:issue:`32747`) - Bug in :meth:`DataFrame.__dir__` caused a segfault when using unicode surrogates in a column name (:issue:`25509`) - Bug in :meth:`DataFrame.equals` and :meth:`Series.equals` in allowing subclasses to be equal (:issue:`34402`). From b7bf64e7a2ff726bf8b3e10f96924b13fc360a80 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 27 Jul 2020 20:00:06 +0200 Subject: [PATCH 0403/1025] REGR: revert ExtensionBlock.set to be in-place (#35271) * REGR: revert ExtensionBlock.set to be in-place Co-authored-by: Simon Hawkins Co-authored-by: Tom Augspurger --- doc/source/whatsnew/v1.1.0.rst | 1 - pandas/core/internals/blocks.py | 2 +- pandas/tests/indexing/test_iloc.py | 1 + pandas/tests/indexing/test_indexing.py | 10 ++++++++++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c2a4abbea107c..04a816b50103c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -999,7 +999,6 @@ Indexing - Bug in :meth:`Series.__getitem__` indexing with non-standard scalars, e.g. 
``np.dtype`` (:issue:`32684`) - Bug in :class:`Index` constructor where an unhelpful error message was raised for NumPy scalars (:issue:`33017`) - Bug in :meth:`DataFrame.lookup` incorrectly raising an ``AttributeError`` when ``frame.index`` or ``frame.columns`` is not unique; this will now raise a ``ValueError`` with a helpful error message (:issue:`33041`) -- Bug in :meth:`DataFrame.iloc.__setitem__` creating a new array instead of overwriting ``Categorical`` values in-place (:issue:`32831`) - Bug in :class:`Interval` where a :class:`Timedelta` could not be added or subtracted from a :class:`Timestamp` interval (:issue:`32023`) - Bug in :meth:`DataFrame.copy` not invalidating _item_cache after copy caused post-copy value updates to not be reflected (:issue:`31784`) - Fixed regression in :meth:`DataFrame.loc` and :meth:`Series.loc` throwing an error when a ``datetime64[ns, tz]`` value is provided (:issue:`32395`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index cc0f09ced7399..6ca6eca1ff829 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1589,7 +1589,7 @@ def should_store(self, value: ArrayLike) -> bool: def set(self, locs, values): assert locs.tolist() == [0] - self.values[:] = values + self.values = values def putmask( self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index c5f40102874dd..4fae01ec710fd 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -694,6 +694,7 @@ def test_series_indexing_zerodim_np_array(self): result = s.iloc[np.array(0)] assert result == 1 + @pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/33457") def test_iloc_setitem_categorical_updates_inplace(self): # Mixed dtype ensures we go through take_split_path in setitem_with_indexer cat = pd.Categorical(["A", "B", "C"]) diff --git 
a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index ced70069dd955..5b7f013d5de31 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -1100,3 +1100,13 @@ def test_long_text_missing_labels_inside_loc_error_message_limited(): error_message_regex = "long_missing_label_text_0.*\\\\n.*long_missing_label_text_1" with pytest.raises(KeyError, match=error_message_regex): s.loc[["a", "c"] + missing_labels] + + +def test_setitem_categorical(): + # https://github.com/pandas-dev/pandas/issues/35369 + df = pd.DataFrame({"h": pd.Series(list("mn")).astype("category")}) + df.h = df.h.cat.reorder_categories(["n", "m"]) + expected = pd.DataFrame( + {"h": pd.Categorical(["m", "n"]).reorder_categories(["n", "m"])} + ) + tm.assert_frame_equal(df, expected) From 73ad5477f32174ca974368f1eb038c78e9fd0279 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 28 Jul 2020 08:11:26 -0500 Subject: [PATCH 0404/1025] DOC: 1.1.0 release date (#35435) * DOC: 1.1.0 release date --- doc/source/whatsnew/v1.1.0.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 04a816b50103c..a49b29d691692 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_110: -What's new in 1.1.0 (??) ------------------------- +What's new in 1.1.0 (July 28, 2020) +----------------------------------- These are the changes in pandas 1.1.0. See :ref:`release` for a full changelog including other versions of pandas. 
From 1df84c2f8617a4f3c79018e7fa5c4269b1e8908d Mon Sep 17 00:00:00 2001 From: Florian Roscheck <9593883+flrs@users.noreply.github.com> Date: Wed, 29 Jul 2020 02:12:21 -0700 Subject: [PATCH 0405/1025] DOC: Fix small spelling mistake in style docs (#35355) --- doc/source/user_guide/style.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index fd8dda4fe365e..77a1fef28f373 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -141,7 +141,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this case, the cell's style depends only on it's own value.\n", + "In this case, the cell's style depends only on its own value.\n", "That means we should use the `Styler.applymap` method which works elementwise." ] }, From 91b1952d483e202e1a6d74ca5d60cc674d656b1f Mon Sep 17 00:00:00 2001 From: Ty Mick Date: Wed, 29 Jul 2020 05:13:53 -0400 Subject: [PATCH 0406/1025] Improve dropna subset example (#35337) --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f52341ed782d8..3f634c1e6e1ff 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4966,9 +4966,10 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): Define in which columns to look for missing values. - >>> df.dropna(subset=['name', 'born']) + >>> df.dropna(subset=['name', 'toy']) name toy born 1 Batman Batmobile 1940-04-25 + 2 Catwoman Bullwhip NaT Keep the DataFrame with valid entries in the same variable. 
From f7748e0a6e9771a546d2913eaa7ef34d7bca72d9 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 29 Jul 2020 04:15:17 -0500 Subject: [PATCH 0407/1025] CI: resolve conflict mypy and isort (#35339) --- pandas/tests/test_algos.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a080bf0feaebc..6c6bdb6b1b2bd 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -6,7 +6,8 @@ from numpy.random import RandomState import pytest -from pandas._libs import algos as libalgos, groupby as libgroupby, hashtable as ht +from pandas._libs import algos as libalgos, hashtable as ht +from pandas._libs.groupby import group_var_float32, group_var_float64 from pandas.compat.numpy import np_array_datetime64_compat import pandas.util._test_decorators as td @@ -1493,7 +1494,7 @@ def test_group_var_constant(self): class TestGroupVarFloat64(GroupVarTestMixin): __test__ = True - algo = staticmethod(libgroupby.group_var_float64) + algo = staticmethod(group_var_float64) dtype = np.float64 rtol = 1e-5 @@ -1516,7 +1517,7 @@ def test_group_var_large_inputs(self): class TestGroupVarFloat32(GroupVarTestMixin): __test__ = True - algo = staticmethod(libgroupby.group_var_float32) + algo = staticmethod(group_var_float32) dtype = np.float32 rtol = 1e-2 From b999a7696ad1269d69d0348ff95475116f745293 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 29 Jul 2020 10:16:24 +0100 Subject: [PATCH 0408/1025] TYP: remove # type: ignore for unpacking compression_args (#35344) --- pandas/io/common.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index bd77a1e69c138..6ac8051f35b6f 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -267,8 +267,8 @@ def file_path_to_url(path: str) -> str: def get_compression_method( - compression: Optional[Union[str, Mapping[str, str]]] -) -> Tuple[Optional[str], 
Dict[str, str]]: + compression: Optional[Union[str, Mapping[str, Any]]] +) -> Tuple[Optional[str], Dict[str, Any]]: """ Simplifies a compression argument to a compression method string and a mapping containing additional arguments. @@ -282,21 +282,23 @@ def get_compression_method( Returns ------- tuple of ({compression method}, Optional[str] - {compression arguments}, Dict[str, str]) + {compression arguments}, Dict[str, Any]) Raises ------ ValueError on mapping missing 'method' key """ + compression_method: Optional[str] if isinstance(compression, Mapping): compression_args = dict(compression) try: - compression = compression_args.pop("method") + compression_method = compression_args.pop("method") except KeyError as err: raise ValueError("If mapping, compression must have key 'method'") from err else: compression_args = {} - return compression, compression_args + compression_method = compression + return compression_method, compression_args def infer_compression( @@ -434,28 +436,19 @@ def get_handle( if compression: - # GH33398 the type ignores here seem related to mypy issue #5382; - # it may be possible to remove them once that is resolved. 
- # GZ Compression if compression == "gzip": if is_path: - f = gzip.open( - path_or_buf, mode, **compression_args # type: ignore - ) + f = gzip.open(path_or_buf, mode, **compression_args) else: - f = gzip.GzipFile( - fileobj=path_or_buf, **compression_args # type: ignore - ) + f = gzip.GzipFile(fileobj=path_or_buf, **compression_args) # BZ Compression elif compression == "bz2": if is_path: - f = bz2.BZ2File( - path_or_buf, mode, **compression_args # type: ignore - ) + f = bz2.BZ2File(path_or_buf, mode, **compression_args) else: - f = bz2.BZ2File(path_or_buf, **compression_args) # type: ignore + f = bz2.BZ2File(path_or_buf, **compression_args) # ZIP Compression elif compression == "zip": From 0cd023aed86c6762f79e1df8e35eb0cddfa8a5ed Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 29 Jul 2020 05:20:32 -0400 Subject: [PATCH 0409/1025] DOC: whatsnew for 1.2 (#35315) --- doc/source/whatsnew/index.rst | 8 ++ doc/source/whatsnew/v1.2.0.rst | 168 +++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) create mode 100644 doc/source/whatsnew/v1.2.0.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index ad5bb5a5b2d72..17043b83f03df 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -10,6 +10,14 @@ This is the list of changes to pandas between each release. For full details, see the `commit logs `_. For install and upgrade instructions, see :ref:`install`. +Version 1.2 +----------- + +.. toctree:: + :maxdepth: 2 + + v1.2.0 + Version 1.1 ----------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst new file mode 100644 index 0000000000000..2066858e5de86 --- /dev/null +++ b/doc/source/whatsnew/v1.2.0.rst @@ -0,0 +1,168 @@ +.. _whatsnew_120: + +What's new in 1.2.0 (??) +------------------------ + +These are the changes in pandas 1.2.0. See :ref:`release` for a full changelog +including other versions of pandas. 
+ +{{ header }} + +.. --------------------------------------------------------------------------- + +Enhancements +~~~~~~~~~~~~ + +.. _whatsnew_120.enhancements.other: + +Other enhancements +^^^^^^^^^^^^^^^^^^ + +- +- + + +.. --------------------------------------------------------------------------- + +.. _whatsnew_120.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + + +.. _whatsnew_120.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_120.bug_fixes: + +Bug fixes +~~~~~~~~~ + + +Categorical +^^^^^^^^^^^ + +- +- + +Datetimelike +^^^^^^^^^^^^ +- +- + +Timedelta +^^^^^^^^^ + +- +- + +Timezones +^^^^^^^^^ + +- +- + + +Numeric +^^^^^^^ +- +- + +Conversion +^^^^^^^^^^ + +- +- + +Strings +^^^^^^^ + +- +- + + +Interval +^^^^^^^^ + +- +- + +Indexing +^^^^^^^^ + +- +- + +Missing +^^^^^^^ + +- +- + +MultiIndex +^^^^^^^^^^ + +- +- + +I/O +^^^ + +- +- + +Plotting +^^^^^^^^ + +- +- + +Groupby/resample/rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- +- + + +Reshaping +^^^^^^^^^ + +- +- + +Sparse +^^^^^^ + +- +- + +ExtensionArray +^^^^^^^^^^^^^^ + +- +- + + +Other +^^^^^ +- +- + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_120.contributors: + +Contributors +~~~~~~~~~~~~ \ No newline at end of file From 1cba955a6955237eb8328c3a0577a294de6ff618 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 29 Jul 2020 08:45:32 -0500 Subject: [PATCH 0410/1025] DOC: Start 1.1.1 (#35452) --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.1.1.rst | 54 ++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 doc/source/whatsnew/v1.1.1.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 17043b83f03df..a280a981c789b 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.1 v1.1.0 Version 1.0 diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst new file mode 100644 index 0000000000000..443589308ad4c --- /dev/null +++ b/doc/source/whatsnew/v1.1.1.rst @@ -0,0 +1,54 @@ +.. _whatsnew_111: + +What's new in 1.1.1 (?) +----------------------- + +These are the changes in pandas 1.1.1. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ + +- +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.bug_fixes: + +Bug fixes +~~~~~~~~~ + +**Datetimelike** + +- +- + +**Numeric** + +- +- + +**Plotting** + +- + +**Indexing** + +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_111.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v1.1.0..v1.1.1|HEAD From d71b3dd5cdc573b2178d0bcc6c770f453eba397e Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 29 Jul 2020 09:33:02 -0500 Subject: [PATCH 0411/1025] CLN: resolve isort mypy import conflict (#35134) (#35386) --- pandas/tests/groupby/transform/test_transform.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index cdaf27e214d80..c09f35526a6bf 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas._libs import groupby +from pandas._libs.groupby import group_cumprod_float64, group_cumsum from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype @@ -545,14 +545,14 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): def test_cython_group_transform_cumsum(any_real_dtype): # see gh-4095 dtype = np.dtype(any_real_dtype).type - pd_op, np_op = groupby.group_cumsum, np.cumsum + pd_op, np_op = group_cumsum, np.cumsum _check_cython_group_transform_cumulative(pd_op, np_op, dtype) def test_cython_group_transform_cumprod(): # see gh-4095 dtype = np.float64 - pd_op, np_op = groupby.group_cumprod_float64, np.cumproduct + pd_op, np_op = group_cumprod_float64, np.cumproduct _check_cython_group_transform_cumulative(pd_op, np_op, dtype) @@ -567,13 +567,13 @@ def test_cython_group_transform_algos(): data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64") actual = np.zeros_like(data) actual.fill(np.nan) - groupby.group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) + group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike) expected = np.array([1, 2, 6, np.nan, 24], dtype="float64") tm.assert_numpy_array_equal(actual[:, 0], expected) actual = np.zeros_like(data) actual.fill(np.nan) - groupby.group_cumsum(actual, data, 
labels, ngroups, is_datetimelike) + group_cumsum(actual, data, labels, ngroups, is_datetimelike) expected = np.array([1, 3, 6, np.nan, 10], dtype="float64") tm.assert_numpy_array_equal(actual[:, 0], expected) @@ -581,7 +581,7 @@ def test_cython_group_transform_algos(): is_datetimelike = True data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None] actual = np.zeros_like(data, dtype="int64") - groupby.group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) + group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike) expected = np.array( [ np.timedelta64(1, "ns"), From 6cc919ffda726de8447537a05f8aa251552f30a0 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 29 Jul 2020 09:33:43 -0500 Subject: [PATCH 0412/1025] isort mypy import confilict (#35380) --- pandas/tests/reshape/merge/test_join.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py index c33443e24b268..d4d4c4190417e 100644 --- a/pandas/tests/reshape/merge/test_join.py +++ b/pandas/tests/reshape/merge/test_join.py @@ -2,7 +2,7 @@ from numpy.random import randn import pytest -from pandas._libs import join as libjoin +from pandas._libs.join import inner_join, left_outer_join import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, concat, merge @@ -48,7 +48,7 @@ def test_cython_left_outer_join(self): right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 - ls, rs = libjoin.left_outer_join(left, right, max_group) + ls, rs = left_outer_join(left, right, max_group) exp_ls = left.argsort(kind="mergesort") exp_rs = right.argsort(kind="mergesort") @@ -70,7 +70,7 @@ def test_cython_right_outer_join(self): right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64) max_group = 5 - rs, ls = libjoin.left_outer_join(right, left, max_group) + rs, ls = left_outer_join(right, left, max_group) exp_ls = left.argsort(kind="mergesort") exp_rs = 
right.argsort(kind="mergesort") @@ -116,7 +116,7 @@ def test_cython_inner_join(self): right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64) max_group = 5 - ls, rs = libjoin.inner_join(left, right, max_group) + ls, rs = inner_join(left, right, max_group) exp_ls = left.argsort(kind="mergesort") exp_rs = right.argsort(kind="mergesort") From 384938d21a35a019320da132860904cf9082e03a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 29 Jul 2020 16:43:10 +0100 Subject: [PATCH 0413/1025] DOC: update Python support policy (#35454) --- doc/source/development/policies.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index 1031bbfc46457..a564afc408df9 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -52,6 +52,6 @@ Python support ~~~~~~~~~~~~~~ pandas will only drop support for specific Python versions (e.g. 3.6.x, 3.7.x) in -pandas **major** releases. +pandas **major** or **minor** releases. .. 
_SemVer: https://semver.org From 9fa80b378755cd4f49d561e7c4e676ea5ef25ec2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 29 Jul 2020 16:59:28 +0100 Subject: [PATCH 0414/1025] CI: Unpin pytest (#35272) --- ci/deps/azure-36-32bit.yaml | 2 +- ci/deps/azure-36-locale.yaml | 2 +- ci/deps/azure-36-locale_slow.yaml | 2 +- ci/deps/azure-36-slow.yaml | 2 +- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-37-numpydev.yaml | 2 +- ci/deps/azure-macos-36.yaml | 2 +- ci/deps/azure-windows-36.yaml | 2 +- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/travis-36-cov.yaml | 2 +- ci/deps/travis-36-locale.yaml | 2 +- ci/deps/travis-37-arm64.yaml | 2 +- ci/deps/travis-37.yaml | 2 +- ci/deps/travis-38.yaml | 2 +- environment.yml | 2 +- pandas/_testing.py | 6 ++---- pandas/tests/groupby/test_categorical.py | 10 ++++------ pandas/util/_test_decorators.py | 18 ++++++++++-------- requirements-dev.txt | 2 +- setup.cfg | 2 +- 20 files changed, 33 insertions(+), 35 deletions(-) diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-36-32bit.yaml index 2dc53f8181ac4..15704cf0d5427 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-36-32bit.yaml @@ -23,4 +23,4 @@ dependencies: - pip - pip: - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index d31015fde4741..a9b9a5a47ccf5 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-36-locale_slow.yaml index 23121b985492e..c086b3651afc3 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-36-locale_slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git 
a/ci/deps/azure-36-slow.yaml b/ci/deps/azure-36-slow.yaml index 0a6d1d13c8549..87bad59fa4873 100644 --- a/ci/deps/azure-36-slow.yaml +++ b/ci/deps/azure-36-slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 4dbb6a5344976..6f64c81f299d1 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-37-numpydev.yaml index 451fb5884a4af..5cb58756a6ac1 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-37-numpydev.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.7.* # tools - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-36.yaml index 81a27465f9e61..eeea249a19ca1 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-36.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.6.* # tools - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-36.yaml index 4d7e1d821037b..548660cabaa67 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-36.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 34fca631df6c1..5bbd0e2795d7e 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - 
pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 5f5ea8034cddf..177e0d3f4c0af 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-cov # this is only needed in the coverage build diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 6bc4aba733ee5..03a1e751b6a86 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index f434a03609b26..5cb53489be225 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.13 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index aaf706d61fe5c..e896233aac63c 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml index ac39a223cd086..b879c0f81dab2 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/travis-38.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/environment.yml b/environment.yml index 53222624619de..3b088ca511be9 100644 --- a/environment.yml +++ b/environment.yml @@ -52,7 +52,7 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 - - pytest>=5.0.1,<6.0.0rc0 + - pytest>=5.0.1 - 
pytest-cov - pytest-xdist>=1.21 - pytest-asyncio diff --git a/pandas/_testing.py b/pandas/_testing.py index fc6df7a95e348..1cf9304ed2715 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -9,7 +9,7 @@ from shutil import rmtree import string import tempfile -from typing import Any, Callable, List, Optional, Type, Union, cast +from typing import Any, Callable, ContextManager, List, Optional, Type, Union, cast import warnings import zipfile @@ -2880,9 +2880,7 @@ def convert_rows_list_to_csv_str(rows_list: List[str]): return expected -def external_error_raised( - expected_exception: Type[Exception], -) -> Callable[[Type[Exception], None], None]: +def external_error_raised(expected_exception: Type[Exception],) -> ContextManager: """ Helper function to mark pytest.raises that have an external error message. diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 7e4513da37dc9..0d447a70b540d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1294,9 +1294,7 @@ def test_get_nonexistent_category(): ) -def test_series_groupby_on_2_categoricals_unobserved( - reduction_func: str, observed: bool, request -): +def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed, request): # GH 17605 if reduction_func == "ngroup": pytest.skip("ngroup is not truly a reduction") @@ -1326,7 +1324,7 @@ def test_series_groupby_on_2_categoricals_unobserved( def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( - reduction_func: str, request + reduction_func, request ): # GH 17605 # Tests whether the unobserved categories in the result contain 0 or NaN @@ -1374,7 +1372,7 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( assert np.issubdtype(result.dtype, np.integer) -def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func: str): +def 
test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func): # GH 23865 # GH 27075 # Ensure that df.groupby, when 'by' is two pd.Categorical variables, @@ -1402,7 +1400,7 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_fun @pytest.mark.parametrize("observed", [False, None]) def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( - reduction_func: str, observed: bool, request + reduction_func, observed, request ): # GH 23865 # GH 27075 diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index a4a1d83177c50..bdf633839b2cd 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -120,7 +120,9 @@ def _skip_if_no_scipy() -> bool: ) -def skip_if_installed(package: str) -> Callable: +# TODO: return type, _pytest.mark.structures.MarkDecorator is not public +# https://github.com/pytest-dev/pytest/issues/7469 +def skip_if_installed(package: str): """ Skip a test if a package is installed. @@ -134,7 +136,9 @@ def skip_if_installed(package: str) -> Callable: ) -def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: +# TODO: return type, _pytest.mark.structures.MarkDecorator is not public +# https://github.com/pytest-dev/pytest/issues/7469 +def skip_if_no(package: str, min_version: Optional[str] = None): """ Generic function to help skip tests when required packages are not present on the testing system. 
@@ -196,14 +200,12 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable: ) -def skip_if_np_lt( - ver_str: str, reason: Optional[str] = None, *args, **kwds -) -> Callable: +# TODO: return type, _pytest.mark.structures.MarkDecorator is not public +# https://github.com/pytest-dev/pytest/issues/7469 +def skip_if_np_lt(ver_str: str, *args, reason: Optional[str] = None): if reason is None: reason = f"NumPy {ver_str} or greater required" - return pytest.mark.skipif( - _np_version < LooseVersion(ver_str), reason=reason, *args, **kwds - ) + return pytest.mark.skipif(_np_version < LooseVersion(ver_str), *args, reason=reason) def parametrize_fixture_doc(*args): diff --git a/requirements-dev.txt b/requirements-dev.txt index 0c024d1b54637..7bf3df176b378 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -32,7 +32,7 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto -pytest>=5.0.1,<6.0.0rc0 +pytest>=5.0.1 pytest-cov pytest-xdist>=1.21 pytest-asyncio diff --git a/setup.cfg b/setup.cfg index 00af7f6f1b79a..ee5725e36d193 100644 --- a/setup.cfg +++ b/setup.cfg @@ -105,7 +105,7 @@ known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER known_first_party = pandas -known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,validate_unwanted_patterns,yaml,odf +known_third_party = announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,validate_unwanted_patterns,yaml,odf multi_line_output = 3 include_trailing_comma = True force_grid_wrap = 0 From 8c7b1cb59ad4103c116971784ddacbe23b009da5 Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 30 
Jul 2020 19:45:45 +0100 Subject: [PATCH 0415/1025] MAINT: Use float arange when required or intended (#35477) --- pandas/tests/window/test_base_indexer.py | 4 ++-- pandas/tests/window/test_ewm.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 4a0212e890d3a..2300d8dd5529b 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -140,7 +140,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed): ) def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs): # GH 32865 - values = np.arange(10) + values = np.arange(10.0) values[5] = 100.0 indexer = FixedForwardWindowIndexer(window_size=3) @@ -177,7 +177,7 @@ def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs) @pytest.mark.parametrize("constructor", [Series, DataFrame]) def test_rolling_forward_skewness(constructor): - values = np.arange(10) + values = np.arange(10.0) values[5] = 100.0 indexer = FixedForwardWindowIndexer(window_size=5) diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 12c314d5e9ec9..69cd1d1ba069c 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -108,7 +108,7 @@ def test_ewma_halflife_without_times(halflife_with_times): @pytest.mark.parametrize("min_periods", [0, 2]) def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): halflife = halflife_with_times - data = np.arange(10) + data = np.arange(10.0) data[::2] = np.nan df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() From 25ec4eb63d5601f83737f36425b4b59f31273e39 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Sat, 1 Aug 2020 03:43:00 -0500 Subject: [PATCH 0416/1025] CI: unpin isort 5 (#35470) --- 
asv_bench/benchmarks/frame_ctor.py | 2 +- asv_bench/benchmarks/gil.py | 8 +- asv_bench/benchmarks/io/parsers.py | 2 +- asv_bench/benchmarks/tslibs/normalize.py | 2 +- ci/code_checks.sh | 2 +- doc/source/development/contributing.rst | 4 +- environment.yml | 2 +- pandas/_config/config.py | 4 +- pandas/_libs/algos.pyx | 10 +-- pandas/_libs/groupby.pyx | 48 +++++++++--- pandas/_libs/hashing.pyx | 7 +- pandas/_libs/hashtable.pyx | 75 +++++++++---------- pandas/_libs/index.pyx | 8 +- pandas/_libs/internals.pyx | 3 + pandas/_libs/interval.pyx | 16 ++-- pandas/_libs/join.pyx | 9 ++- pandas/_libs/lib.pyx | 48 ++++++------ pandas/_libs/missing.pyx | 16 ++-- pandas/_libs/ops.pyx | 9 +-- pandas/_libs/parsers.pyx | 65 ++++++++++------ pandas/_libs/reduction.pyx | 9 ++- pandas/_libs/reshape.pyx | 2 + pandas/_libs/sparse.pyx | 15 +++- pandas/_libs/testing.pyx | 7 +- pandas/_libs/tslib.pyx | 19 ++--- pandas/_libs/tslibs/ccalendar.pyx | 2 +- pandas/_libs/tslibs/conversion.pyx | 58 +++++++++----- pandas/_libs/tslibs/fields.pyx | 29 ++++--- pandas/_libs/tslibs/nattype.pyx | 22 +++--- pandas/_libs/tslibs/np_datetime.pyx | 6 +- pandas/_libs/tslibs/offsets.pyx | 43 +++++++---- pandas/_libs/tslibs/parsing.pyx | 41 +++++----- pandas/_libs/tslibs/period.pyx | 62 +++++++-------- pandas/_libs/tslibs/strptime.pyx | 17 +++-- pandas/_libs/tslibs/timedeltas.pyx | 38 ++++++---- pandas/_libs/tslibs/timestamps.pyx | 62 ++++++++------- pandas/_libs/tslibs/timezones.pyx | 12 ++- pandas/_libs/tslibs/tzconversion.pyx | 18 +++-- pandas/_libs/tslibs/vectorized.pyx | 9 ++- pandas/_libs/window/aggregations.pyx | 9 ++- pandas/_libs/window/indexers.pyx | 3 +- pandas/_libs/writers.pyx | 2 +- pandas/_testing.py | 2 +- pandas/_typing.py | 10 ++- pandas/compat/pickle_compat.py | 2 +- pandas/core/apply.py | 2 +- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/datetimelike.py | 2 +- pandas/core/arrays/integer.py | 1 + pandas/core/arrays/interval.py | 1 + pandas/core/arrays/period.py | 1 + 
pandas/core/arrays/sparse/accessor.py | 7 +- pandas/core/config_init.py | 6 +- pandas/core/construction.py | 10 +-- pandas/core/dtypes/cast.py | 3 +- pandas/core/dtypes/dtypes.py | 12 ++- pandas/core/frame.py | 6 +- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/grouper.py | 1 - pandas/core/indexes/base.py | 2 +- pandas/core/internals/ops.py | 2 +- pandas/core/strings.py | 6 +- pandas/core/tools/datetimes.py | 5 +- pandas/core/util/hashing.py | 2 +- pandas/io/clipboard/__init__.py | 16 ++-- pandas/io/excel/_base.py | 2 +- pandas/io/excel/_odfreader.py | 4 +- pandas/io/excel/_openpyxl.py | 2 +- pandas/io/excel/_xlrd.py | 4 +- pandas/io/formats/format.py | 2 +- pandas/io/formats/style.py | 2 +- pandas/io/html.py | 2 +- pandas/io/pytables.py | 2 +- pandas/io/sas/sas.pyx | 2 +- pandas/io/sql.py | 16 ++-- pandas/plotting/_matplotlib/core.py | 2 +- pandas/plotting/_matplotlib/timeseries.py | 2 +- pandas/tests/api/test_api.py | 3 +- pandas/tests/arrays/interval/test_interval.py | 4 + pandas/tests/arrays/test_period.py | 3 + pandas/tests/frame/test_analytics.py | 4 +- .../tests/indexes/datetimes/test_datetime.py | 1 + .../indexing/multiindex/test_indexing_slow.py | 2 +- pandas/tests/io/test_fsspec.py | 2 +- pandas/tests/io/test_gcs.py | 9 +-- pandas/tests/io/test_sql.py | 4 +- pandas/tests/plotting/common.py | 4 +- pandas/tests/plotting/test_frame.py | 8 +- pandas/tests/plotting/test_hist_method.py | 3 +- pandas/tests/plotting/test_misc.py | 10 ++- pandas/tests/plotting/test_series.py | 3 +- pandas/tests/series/indexing/test_datetime.py | 2 +- pandas/tests/series/methods/test_asof.py | 2 +- pandas/tests/series/test_arithmetic.py | 2 +- pandas/tests/test_downstream.py | 2 +- pandas/util/_doctools.py | 2 +- requirements-dev.txt | 2 +- 97 files changed, 612 insertions(+), 432 deletions(-) diff --git a/asv_bench/benchmarks/frame_ctor.py b/asv_bench/benchmarks/frame_ctor.py index dc6f45f810f3d..e0a2257b0ca1f 100644 --- a/asv_bench/benchmarks/frame_ctor.py +++ 
b/asv_bench/benchmarks/frame_ctor.py @@ -6,7 +6,7 @@ from .pandas_vb_common import tm try: - from pandas.tseries.offsets import Nano, Hour + from pandas.tseries.offsets import Hour, Nano except ImportError: # For compatibility with older versions from pandas.core.datetools import * # noqa diff --git a/asv_bench/benchmarks/gil.py b/asv_bench/benchmarks/gil.py index e266d871f5bc6..5d9070de92ec7 100644 --- a/asv_bench/benchmarks/gil.py +++ b/asv_bench/benchmarks/gil.py @@ -7,14 +7,14 @@ try: from pandas import ( - rolling_median, + rolling_kurt, + rolling_max, rolling_mean, + rolling_median, rolling_min, - rolling_max, - rolling_var, rolling_skew, - rolling_kurt, rolling_std, + rolling_var, ) have_rolling_methods = True diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index ec3eddfff7184..5390056ba36f2 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -2,8 +2,8 @@ try: from pandas._libs.tslibs.parsing import ( - concat_date_cols, _does_string_look_like_datetime, + concat_date_cols, ) except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) diff --git a/asv_bench/benchmarks/tslibs/normalize.py b/asv_bench/benchmarks/tslibs/normalize.py index 7d4e0556f4d96..9a206410d8775 100644 --- a/asv_bench/benchmarks/tslibs/normalize.py +++ b/asv_bench/benchmarks/tslibs/normalize.py @@ -1,5 +1,5 @@ try: - from pandas._libs.tslibs import normalize_i8_timestamps, is_date_array_normalized + from pandas._libs.tslibs import is_date_array_normalized, normalize_i8_timestamps except ImportError: from pandas._libs.tslibs.conversion import ( normalize_i8_timestamps, diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7b12de387d648..69ce0f1adce22 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -121,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # Imports - Check formatting using isort see setup.cfg for settings MSG='Check import format using isort' ; echo 
$MSG - ISORT_CMD="isort --quiet --recursive --check-only pandas asv_bench scripts" + ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts" if [[ "$GITHUB_ACTIONS" == "true" ]]; then eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) else diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index b85e9403038ab..1b0e36e7b6933 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -751,7 +751,7 @@ Imports are alphabetically sorted within these sections. As part of :ref:`Continuous Integration ` checks we run:: - isort --recursive --check-only pandas + isort --check-only pandas to check that imports are correctly formatted as per the `setup.cfg`. @@ -770,8 +770,6 @@ You should run:: to automatically format imports correctly. This will modify your local copy of the files. -The `--recursive` flag can be passed to sort all files in a directory. - Alternatively, you can run a command similar to what was suggested for ``black`` and ``flake8`` :ref:`right above `:: git diff upstream/master --name-only -- "*.py" | xargs -r isort diff --git a/environment.yml b/environment.yml index 3b088ca511be9..9efb995e29497 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ dependencies: - flake8<3.8.0 # temporary pin, GH#34150 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - - isort=4.3.21 # check that imports are in the right order + - isort>=5.2.1 # check that imports are in the right order - mypy=0.730 - pycodestyle # used by flake8 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index f5e16cddeb04c..d7b73a0a685d3 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -442,8 +442,8 @@ def register_option( ValueError if `validator` is specified and `defval` is not a valid value. 
""" - import tokenize import keyword + import tokenize key = key.lower() @@ -660,8 +660,8 @@ def _build_option_description(k: str) -> str: def pp_options_list(keys: Iterable[str], width=80, _print: bool = False): """ Builds a concise listing of available options, grouped by prefix """ - from textwrap import wrap from itertools import groupby + from textwrap import wrap def pp(name: str, ks: Iterable[str]) -> List[str]: pfx = "- " + name + ".[" if name else "" diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 6b6ead795584f..7e90a8cc681ef 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1,11 +1,12 @@ import cython from cython import Py_ssize_t -from libc.stdlib cimport malloc, free -from libc.string cimport memmove from libc.math cimport fabs, sqrt +from libc.stdlib cimport free, malloc +from libc.string cimport memmove import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_FLOAT32, @@ -31,12 +32,11 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() cimport pandas._libs.util as util -from pandas._libs.util cimport numeric, get_nat - from pandas._libs.khash cimport ( kh_destroy_int64, kh_get_int64, @@ -46,7 +46,7 @@ from pandas._libs.khash cimport ( kh_resize_int64, khiter_t, ) - +from pandas._libs.util cimport get_nat, numeric import pandas._libs.missing as missing diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7c57e6ee9dbfd..38cb973d6dde9 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1,27 +1,51 @@ import cython from cython import Py_ssize_t -from cython cimport floating -from libc.stdlib cimport malloc, free +from cython cimport floating +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport (ndarray, - int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, - uint32_t, uint64_t, float32_t, float64_t, complex64_t, complex128_t) +from numpy cimport ( + complex64_t, + complex128_t, + float32_t, + float64_t, + 
int8_t, + int16_t, + int32_t, + int64_t, + ndarray, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) from numpy.math cimport NAN -cnp.import_array() -from pandas._libs.util cimport numeric, get_nat +cnp.import_array() -from pandas._libs.algos cimport (swap, TiebreakEnumType, TIEBREAK_AVERAGE, - TIEBREAK_MIN, TIEBREAK_MAX, TIEBREAK_FIRST, - TIEBREAK_DENSE) -from pandas._libs.algos import (take_2d_axis1_float64_float64, - groupsort_indexer, tiebreakers) +from pandas._libs.algos cimport ( + TIEBREAK_AVERAGE, + TIEBREAK_DENSE, + TIEBREAK_FIRST, + TIEBREAK_MAX, + TIEBREAK_MIN, + TiebreakEnumType, + swap, +) +from pandas._libs.util cimport get_nat, numeric + +from pandas._libs.algos import ( + groupsort_indexer, + take_2d_axis1_float64_float64, + tiebreakers, +) from pandas._libs.missing cimport checknull + cdef int64_t NPY_NAT = get_nat() _int64_max = np.iinfo(np.int64).max diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index a98820ca57895..f2af04d91a3e3 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -2,10 +2,13 @@ # at https://github.com/veorq/SipHash import cython -from libc.stdlib cimport malloc, free + +from libc.stdlib cimport free, malloc import numpy as np -from numpy cimport ndarray, uint8_t, uint32_t, uint64_t, import_array + +from numpy cimport import_array, ndarray, uint8_t, uint32_t, uint64_t + import_array() from pandas._libs.util cimport is_nan diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index c3dcbb942d7fe..ffaf6d6505955 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -1,60 +1,57 @@ cimport cython - -from cpython.ref cimport PyObject, Py_INCREF -from cpython.mem cimport PyMem_Malloc, PyMem_Free - -from libc.stdlib cimport malloc, free +from cpython.mem cimport PyMem_Free, PyMem_Malloc +from cpython.ref cimport Py_INCREF, PyObject +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint32_t, 
float64_t +from numpy cimport float64_t, ndarray, uint8_t, uint32_t from numpy.math cimport NAN + cnp.import_array() +from pandas._libs cimport util from pandas._libs.khash cimport ( - khiter_t, - kh_str_t, - kh_init_str, - kh_put_str, - kh_exist_str, - kh_get_str, - kh_destroy_str, - kh_resize_str, - kh_put_strbox, - kh_get_strbox, - kh_init_strbox, - kh_int64_t, - kh_init_int64, - kh_resize_int64, + kh_destroy_float64, kh_destroy_int64, - kh_get_int64, + kh_destroy_pymap, + kh_destroy_str, + kh_destroy_uint64, + kh_exist_float64, kh_exist_int64, - kh_put_int64, + kh_exist_pymap, + kh_exist_str, + kh_exist_uint64, kh_float64_t, - kh_exist_float64, - kh_put_float64, - kh_init_float64, kh_get_float64, - kh_destroy_float64, - kh_resize_float64, - kh_resize_uint64, - kh_exist_uint64, - kh_destroy_uint64, - kh_put_uint64, + kh_get_int64, + kh_get_pymap, + kh_get_str, + kh_get_strbox, kh_get_uint64, - kh_init_uint64, - kh_destroy_pymap, - kh_exist_pymap, + kh_init_float64, + kh_init_int64, kh_init_pymap, - kh_get_pymap, + kh_init_str, + kh_init_strbox, + kh_init_uint64, + kh_int64_t, + kh_put_float64, + kh_put_int64, kh_put_pymap, + kh_put_str, + kh_put_strbox, + kh_put_uint64, + kh_resize_float64, + kh_resize_int64, kh_resize_pymap, + kh_resize_str, + kh_resize_uint64, + kh_str_t, + khiter_t, ) - - -from pandas._libs cimport util - from pandas._libs.missing cimport checknull diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 35c4b73b47695..d6659cc1895b1 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -1,6 +1,7 @@ import warnings import numpy as np + cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,17 +17,16 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - +from pandas._libs.hashtable cimport HashTable from pandas._libs.tslibs.nattype cimport c_NaT as NaT from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.timestamps cimport _Timestamp from 
pandas._libs.tslibs.timedeltas cimport _Timedelta - -from pandas._libs.hashtable cimport HashTable +from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs import algos, hashtable as _hash from pandas._libs.missing import checknull diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 8b4b490f49b12..4f27fde52414a 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -5,12 +5,15 @@ from cython import Py_ssize_t from cpython.slice cimport PySlice_GetIndicesEx + cdef extern from "Python.h": Py_ssize_t PY_SSIZE_T_MAX import numpy as np + cimport numpy as cnp from numpy cimport NPY_INT64, int64_t + cnp.import_array() from pandas._libs.algos import ensure_int64 diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 95881ebf1385c..6867e8aba7411 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -1,7 +1,8 @@ import numbers from operator import le, lt -from cpython.datetime cimport PyDelta_Check, PyDateTime_IMPORT +from cpython.datetime cimport PyDateTime_IMPORT, PyDelta_Check + PyDateTime_IMPORT from cpython.object cimport ( @@ -16,8 +17,8 @@ from cpython.object cimport ( import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_QUICKSORT, @@ -30,22 +31,21 @@ from numpy cimport ( ndarray, uint64_t, ) + cnp.import_array() from pandas._libs cimport util - from pandas._libs.hashtable cimport Int64Vector +from pandas._libs.tslibs.timedeltas cimport _Timedelta +from pandas._libs.tslibs.timestamps cimport _Timestamp +from pandas._libs.tslibs.timezones cimport tz_compare from pandas._libs.tslibs.util cimport ( - is_integer_object, is_float_object, + is_integer_object, is_timedelta64_object, ) -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timedeltas cimport _Timedelta - _VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) diff 
--git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index 54892a7e4bc77..13c7187923473 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -1,7 +1,7 @@ import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp from numpy cimport ( float32_t, @@ -16,6 +16,7 @@ from numpy cimport ( uint32_t, uint64_t, ) + cnp.import_array() from pandas._libs.algos import ( @@ -640,7 +641,11 @@ def outer_join_indexer(ndarray[join_t] left, ndarray[join_t] right): # ---------------------------------------------------------------------- from pandas._libs.hashtable cimport ( - HashTable, PyObjectHashTable, UInt64HashTable, Int64HashTable) + HashTable, + Int64HashTable, + PyObjectHashTable, + UInt64HashTable, +) ctypedef fused asof_t: uint8_t diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5ecbb2c3ffd35..5fa91ffee8ea8 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -5,23 +5,24 @@ import warnings import cython from cython import Py_ssize_t -from cpython.object cimport PyObject_RichCompareBool, Py_EQ -from cpython.ref cimport Py_INCREF -from cpython.tuple cimport PyTuple_SET_ITEM, PyTuple_New -from cpython.iterator cimport PyIter_Check -from cpython.sequence cimport PySequence_Check -from cpython.number cimport PyNumber_Check - from cpython.datetime cimport ( - PyDateTime_Check, PyDate_Check, - PyTime_Check, - PyDelta_Check, + PyDateTime_Check, PyDateTime_IMPORT, + PyDelta_Check, + PyTime_Check, ) +from cpython.iterator cimport PyIter_Check +from cpython.number cimport PyNumber_Check +from cpython.object cimport Py_EQ, PyObject_RichCompareBool +from cpython.ref cimport Py_INCREF +from cpython.sequence cimport PySequence_Check +from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM + PyDateTime_IMPORT import numpy as np + cimport numpy as cnp from numpy cimport ( NPY_OBJECT, @@ -39,6 +40,7 @@ from numpy cimport ( uint8_t, uint64_t, ) + cnp.import_array() cdef extern from "numpy/arrayobject.h": @@ -63,28 +65,23 @@ 
cdef extern from "src/parse_helper.h": int floatify(object, float64_t *result, int *maybe_int) except -1 from pandas._libs cimport util -from pandas._libs.util cimport is_nan, UINT64_MAX, INT64_MAX, INT64_MIN +from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX, is_nan from pandas._libs.tslib import array_to_datetime -from pandas._libs.tslibs.nattype cimport ( - NPY_NAT, - c_NaT as NaT, - checknull_with_nat, -) -from pandas._libs.tslibs.conversion cimport convert_to_tsobject -from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 -from pandas._libs.tslibs.timezones cimport tz_compare -from pandas._libs.tslibs.period cimport is_period_object -from pandas._libs.tslibs.offsets cimport is_offset_object from pandas._libs.missing cimport ( + C_NA, checknull, - isnaobj, is_null_datetime64, is_null_timedelta64, - C_NA, + isnaobj, ) - +from pandas._libs.tslibs.conversion cimport convert_to_tsobject +from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT, checknull_with_nat +from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.period cimport is_period_object +from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64 +from pandas._libs.tslibs.timezones cimport tz_compare # constants that will be compared to potentially arbitrarily large # python int @@ -1317,8 +1314,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: if not isinstance(value, list): value = list(value) - from pandas.core.dtypes.cast import ( - construct_1d_object_array_from_listlike) + from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike values = construct_1d_object_array_from_listlike(value) # make contiguous diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index fdd06fe631b97..760fab3781fd4 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -1,27 +1,25 @@ -import cython -from cython import Py_ssize_t - import numbers +import cython +from cython import Py_ssize_t 
import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t + cnp.import_array() from pandas._libs cimport util - - -from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value from pandas._libs.tslibs.nattype cimport ( c_NaT as NaT, checknull_with_nat, is_null_datetimelike, ) -from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value +from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op from pandas.compat import is_platform_32bit - cdef: float64_t INF = np.inf float64_t NEGINF = -INF diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 658600cdfbe6c..d1f897d237c1b 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -10,18 +10,17 @@ from cpython.object cimport ( PyObject_RichCompareBool, ) - import cython from cython import Py_ssize_t - import numpy as np -from numpy cimport ndarray, uint8_t, import_array -import_array() +from numpy cimport import_array, ndarray, uint8_t + +import_array() -from pandas._libs.util cimport UINT8_MAX, is_nan from pandas._libs.missing cimport checknull +from pandas._libs.util cimport UINT8_MAX, is_nan @cython.wraparound(False) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6ffb036e01595..fa77af6bd5a25 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1,6 +1,8 @@ # Copyright (c) 2012, Lambda Foundry, Inc. 
# See LICENSE for the license import bz2 +from csv import QUOTE_MINIMAL, QUOTE_NONE, QUOTE_NONNUMERIC +from errno import ENOENT import gzip import io import os @@ -9,17 +11,14 @@ import time import warnings import zipfile -from csv import QUOTE_MINIMAL, QUOTE_NONNUMERIC, QUOTE_NONE -from errno import ENOENT - from libc.stdlib cimport free -from libc.string cimport strncpy, strlen, strcasecmp +from libc.string cimport strcasecmp, strlen, strncpy import cython from cython import Py_ssize_t from cpython.bytes cimport PyBytes_AsString, PyBytes_FromString -from cpython.exc cimport PyErr_Occurred, PyErr_Fetch +from cpython.exc cimport PyErr_Fetch, PyErr_Occurred from cpython.object cimport PyObject from cpython.ref cimport Py_XDECREF from cpython.unicode cimport PyUnicode_AsUTF8String, PyUnicode_Decode @@ -30,37 +29,59 @@ cdef extern from "Python.h": import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, uint8_t, uint64_t, int64_t, float64_t +from numpy cimport float64_t, int64_t, ndarray, uint8_t, uint64_t + cnp.import_array() from pandas._libs cimport util -from pandas._libs.util cimport UINT64_MAX, INT64_MAX, INT64_MIN +from pandas._libs.util cimport INT64_MAX, INT64_MIN, UINT64_MAX + import pandas._libs.lib as lib from pandas._libs.khash cimport ( - khiter_t, - kh_str_t, kh_init_str, kh_put_str, kh_exist_str, - kh_get_str, kh_destroy_str, - kh_float64_t, kh_get_float64, kh_destroy_float64, - kh_put_float64, kh_init_float64, kh_resize_float64, - kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox, + kh_destroy_float64, + kh_destroy_str, + kh_destroy_str_starts, kh_destroy_strbox, - kh_str_starts_t, kh_put_str_starts_item, kh_init_str_starts, - kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts) + kh_exist_str, + kh_float64_t, + kh_get_float64, + kh_get_str, + kh_get_str_starts_item, + kh_get_strbox, + kh_init_float64, + kh_init_str, + kh_init_str_starts, + kh_init_strbox, + kh_put_float64, + kh_put_str, + kh_put_str_starts_item, 
+ kh_put_strbox, + kh_resize_float64, + kh_resize_str_starts, + kh_str_starts_t, + kh_str_t, + kh_strbox_t, + khiter_t, +) + +from pandas.compat import _get_lzma_file, _import_lzma +from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning from pandas.core.dtypes.common import ( + is_bool_dtype, is_categorical_dtype, - is_integer_dtype, is_float_dtype, - is_bool_dtype, is_object_dtype, is_datetime64_dtype, - pandas_dtype, is_extension_array_dtype) + is_extension_array_dtype, + is_float_dtype, + is_integer_dtype, + is_object_dtype, + pandas_dtype, +) from pandas.core.dtypes.concat import union_categoricals -from pandas.compat import _import_lzma, _get_lzma_file -from pandas.errors import (ParserError, DtypeWarning, - EmptyDataError, ParserWarning) - lzma = _import_lzma() cdef: diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index a01e0c5705dcf..7b36bc8baf891 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -2,15 +2,18 @@ from copy import copy from cython import Py_ssize_t -from libc.stdlib cimport malloc, free +from libc.stdlib cimport free, malloc import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t +from numpy cimport int64_t, ndarray + cnp.import_array() from pandas._libs cimport util -from pandas._libs.lib import maybe_convert_objects, is_scalar + +from pandas._libs.lib import is_scalar, maybe_convert_objects cdef _check_result_array(object obj, Py_ssize_t cnt): diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index da4dd00027395..5c6c15fb50fed 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -16,7 +16,9 @@ from numpy cimport ( ) import numpy as np + cimport numpy as cnp + cnp.import_array() from pandas._libs.lib cimport c_is_list_like diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 7c9575d921dc9..321d7c374d8ec 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -1,9 +1,18 @@ import 
cython - import numpy as np + cimport numpy as cnp -from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t, - float64_t, float32_t) +from numpy cimport ( + float32_t, + float64_t, + int8_t, + int16_t, + int32_t, + int64_t, + ndarray, + uint8_t, +) + cnp.import_array() diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 785a4d1f8b923..64fc8d615ea9c 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -1,13 +1,16 @@ import math import numpy as np + from numpy cimport import_array + import_array() from pandas._libs.util cimport is_array -from pandas.core.dtypes.missing import isna, array_equivalent from pandas.core.dtypes.common import is_dtype_equal +from pandas.core.dtypes.missing import array_equivalent, isna + cdef NUMERIC_TYPES = ( bool, @@ -129,6 +132,7 @@ cpdef assert_almost_equal(a, b, if not isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) @@ -181,6 +185,7 @@ cpdef assert_almost_equal(a, b, elif isiterable(b): from pandas._testing import assert_class_equal + # classes can't be the same, to raise error assert_class_equal(a, b, obj=obj) diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 35d5cd8f1e275..e4128af62d06d 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -7,23 +7,20 @@ from cpython.datetime cimport ( datetime, tzinfo, ) + # import datetime C API PyDateTime_IMPORT cimport numpy as cnp from numpy cimport float64_t, int64_t, ndarray + import numpy as np + cnp.import_array() import pytz -from pandas._libs.util cimport ( - is_datetime64_object, - is_float_object, - is_integer_object, -) - from pandas._libs.tslibs.np_datetime cimport ( _string_to_dts, check_dts_bounds, @@ -34,9 +31,9 @@ from pandas._libs.tslibs.np_datetime cimport ( pydate_to_dt64, pydatetime_to_dt64, ) +from pandas._libs.util cimport is_datetime64_object, is_float_object, is_integer_object from 
pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime - from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.conversion cimport ( @@ -45,22 +42,18 @@ from pandas._libs.tslibs.conversion cimport ( convert_datetime_to_tsobject, get_datetime64_nanos, ) - from pandas._libs.tslibs.nattype cimport ( NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, ) - from pandas._libs.tslibs.timestamps cimport _Timestamp -from pandas._libs.tslibs.timestamps import Timestamp -from pandas._libs.tslibs.tzconversion cimport ( - tz_localize_to_utc_single, -) +from pandas._libs.tslibs.timestamps import Timestamp # Note: this is the only non-tslibs intra-pandas dependency here from pandas._libs.missing cimport checknull_with_nat_and_na +from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single def _test_parse_iso8601(ts: str): diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 00cecd25e5225..6cce2f5e1fd95 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -5,7 +5,7 @@ Cython implementations of functions resembling the stdlib calendar module import cython -from numpy cimport int64_t, int32_t +from numpy cimport int32_t, int64_t # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 8cc3d25e86340..adf1dfbc1ac72 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -1,44 +1,68 @@ import cython - import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int32_t, intp_t, ndarray +from numpy cimport int32_t, int64_t, intp_t, ndarray + cnp.import_array() import pytz # stdlib datetime imports -from cpython.datetime cimport (datetime, time, tzinfo, - PyDateTime_Check, PyDate_Check, - PyDateTime_IMPORT) + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + 
datetime, + time, + tzinfo, +) + PyDateTime_IMPORT from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, pandas_datetime_to_datetimestruct, - _string_to_dts, npy_datetime, dt64_to_dtstruct, dtstruct_to_dt64, - get_datetime64_unit, get_datetime64_value, pydatetime_to_dt64, - NPY_DATETIMEUNIT, NPY_FR_ns) -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime + NPY_DATETIMEUNIT, + NPY_FR_ns, + _string_to_dts, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + get_datetime64_unit, + get_datetime64_value, + npy_datetime, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, + pydatetime_to_dt64, +) -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_integer_object, is_float_object) +from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.timezones cimport ( - is_utc, is_tzlocal, is_fixed_offset, get_utcoffset, get_dst_info, - maybe_get_tz, tz_compare, + get_dst_info, + get_utcoffset, + is_fixed_offset, + is_tzlocal, + is_utc, + maybe_get_tz, + tz_compare, utc_pytz as UTC, ) +from pandas._libs.tslibs.util cimport ( + is_datetime64_object, + is_float_object, + is_integer_object, +) + from pandas._libs.tslibs.parsing import parse_datetime_string from pandas._libs.tslibs.nattype cimport ( NPY_NAT, - checknull_with_nat, c_NaT as NaT, c_nat_strings as nat_strings, + checknull_with_nat, ) - from pandas._libs.tslibs.tzconversion cimport ( tz_convert_utc_to_tzlocal, tz_localize_to_utc_single, diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index 1d1f900bc18b3..16fa05c3801c6 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -6,26 +6,37 @@ from locale import LC_TIME import cython from cython import Py_ssize_t - import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, int32_t, int8_t, uint32_t +from numpy cimport int8_t, int32_t, int64_t, 
ndarray, uint32_t + cnp.import_array() from pandas._config.localization import set_locale -from pandas._libs.tslibs.ccalendar import MONTHS_FULL, DAYS_FULL +from pandas._libs.tslibs.ccalendar import DAYS_FULL, MONTHS_FULL + from pandas._libs.tslibs.ccalendar cimport ( - get_days_in_month, is_leapyear, dayofweek, get_week_of_year, - get_day_of_year, get_iso_calendar, iso_calendar_t, - month_offset, + dayofweek, + get_day_of_year, + get_days_in_month, get_firstbday, + get_iso_calendar, get_lastbday, + get_week_of_year, + is_leapyear, + iso_calendar_t, + month_offset, ) -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, pandas_timedeltastruct, dt64_to_dtstruct, - td64_to_tdstruct) from pandas._libs.tslibs.nattype cimport NPY_NAT +from pandas._libs.tslibs.np_datetime cimport ( + dt64_to_dtstruct, + npy_datetimestruct, + pandas_timedeltastruct, + td64_to_tdstruct, +) + from pandas._libs.tslibs.strptime import LocaleTime diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 264013f928d22..73df51832d700 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -1,3 +1,10 @@ +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, +) from cpython.object cimport ( Py_EQ, Py_GE, @@ -8,28 +15,19 @@ from cpython.object cimport ( PyObject_RichCompare, ) -from cpython.datetime cimport ( - PyDateTime_Check, - PyDateTime_IMPORT, - PyDelta_Check, - datetime, - timedelta, -) PyDateTime_IMPORT from cpython.version cimport PY_MINOR_VERSION import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() -from pandas._libs.tslibs.np_datetime cimport ( - get_datetime64_value, - get_timedelta64_value, -) cimport pandas._libs.tslibs.util as util - +from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value # ---------------------------------------------------------------------- # Constants diff --git 
a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 31cc55ad981bb..12aaaf4ce3977 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -1,5 +1,3 @@ -from cpython.object cimport Py_EQ, Py_NE, Py_GE, Py_GT, Py_LT, Py_LE - from cpython.datetime cimport ( PyDateTime_DATE_GET_HOUR, PyDateTime_DATE_GET_MICROSECOND, @@ -10,11 +8,15 @@ from cpython.datetime cimport ( PyDateTime_GET_YEAR, PyDateTime_IMPORT, ) +from cpython.object cimport Py_EQ, Py_GE, Py_GT, Py_LE, Py_LT, Py_NE + PyDateTime_IMPORT from numpy cimport int64_t + from pandas._libs.tslibs.util cimport get_c_string_buf_and_size + cdef extern from "src/datetime/np_datetime.h": int cmp_npy_datetimestruct(npy_datetimestruct *a, npy_datetimestruct *b) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 9a7ca15a2a1c2..ac2725fc58aee 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -1,39 +1,51 @@ -import cython - import operator import re import time from typing import Any import warnings -from cpython.datetime cimport (PyDateTime_IMPORT, - PyDateTime_Check, - PyDate_Check, - PyDelta_Check, - datetime, timedelta, date, - time as dt_time) + +import cython + +from cpython.datetime cimport ( + PyDate_Check, + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + date, + datetime, + time as dt_time, + timedelta, +) + PyDateTime_IMPORT -from dateutil.relativedelta import relativedelta from dateutil.easter import easter - +from dateutil.relativedelta import relativedelta import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() # TODO: formalize having _libs.properties "above" tslibs in the dependency structure + from pandas._libs.properties import cache_readonly from pandas._libs.tslibs cimport util from pandas._libs.tslibs.util cimport ( - is_integer_object, is_datetime64_object, is_float_object, + is_integer_object, ) from 
pandas._libs.tslibs.ccalendar import ( - MONTH_ALIASES, MONTH_TO_CAL_NUM, weekday_to_int, int_to_weekday, + MONTH_ALIASES, + MONTH_TO_CAL_NUM, + int_to_weekday, + weekday_to_int, ) + from pandas._libs.tslibs.ccalendar cimport ( DAY_NANOS, dayofweek, @@ -47,17 +59,20 @@ from pandas._libs.tslibs.conversion cimport ( ) from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, pydate_to_dtstruct, ) from pandas._libs.tslibs.tzconversion cimport tz_convert_from_utc_single from .dtypes cimport PeriodDtypeCode from .timedeltas cimport delta_to_nanoseconds + from .timedeltas import Timedelta + from .timestamps cimport _Timestamp + from .timestamps import Timestamp # --------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index c4f369d0d3b3f..8429aebbd85b8 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -9,39 +9,44 @@ from libc.string cimport strchr import cython from cython import Py_ssize_t -from cpython.object cimport PyObject_Str - from cpython.datetime cimport datetime, datetime_new, import_datetime, tzinfo +from cpython.object cimport PyObject_Str from cpython.version cimport PY_VERSION_HEX + import_datetime() import numpy as np + cimport numpy as cnp -from numpy cimport (PyArray_GETITEM, PyArray_ITER_DATA, PyArray_ITER_NEXT, - PyArray_IterNew, flatiter, float64_t) +from numpy cimport ( + PyArray_GETITEM, + PyArray_ITER_DATA, + PyArray_ITER_NEXT, + PyArray_IterNew, + flatiter, + float64_t, +) + cnp.import_array() # dateutil compat -from dateutil.tz import (tzoffset, - tzlocal as _dateutil_tzlocal, - tzutc as _dateutil_tzutc, - tzstr as _dateutil_tzstr) + +from dateutil.parser import DEFAULTPARSER, parse as du_parse from dateutil.relativedelta import relativedelta -from dateutil.parser 
import DEFAULTPARSER -from dateutil.parser import parse as du_parse +from dateutil.tz import ( + tzlocal as _dateutil_tzlocal, + tzoffset, + tzstr as _dateutil_tzstr, + tzutc as _dateutil_tzutc, +) from pandas._config import get_option from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS -from pandas._libs.tslibs.nattype cimport ( - c_nat_strings as nat_strings, - c_NaT as NaT, -) -from pandas._libs.tslibs.util cimport ( - is_array, - get_c_string_buf_and_size, -) +from pandas._libs.tslibs.nattype cimport c_NaT as NaT, c_nat_strings as nat_strings from pandas._libs.tslibs.offsets cimport is_offset_object +from pandas._libs.tslibs.util cimport get_c_string_buf_and_size, is_array + cdef extern from "../src/headers/portable.h": int getdigit_ascii(char c, int default) nogil diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 20961c6da56bd..86b6533f5caf5 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1,96 +1,98 @@ import warnings -from cpython.object cimport PyObject_RichCompareBool, Py_EQ, Py_NE +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompareBool +from numpy cimport import_array, int64_t, ndarray -from numpy cimport int64_t, import_array, ndarray import numpy as np + import_array() from libc.stdlib cimport free, malloc +from libc.string cimport memset, strlen from libc.time cimport strftime, tm -from libc.string cimport strlen, memset import cython from cpython.datetime cimport ( - datetime, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, PyDelta_Check, + datetime, ) + # import datetime C API PyDateTime_IMPORT from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, - dtstruct_to_dt64, - dt64_to_dtstruct, - pandas_datetime_to_datetimestruct, - check_dts_bounds, NPY_DATETIMEUNIT, NPY_FR_D, NPY_FR_us, + check_dts_bounds, + dt64_to_dtstruct, + dtstruct_to_dt64, + npy_datetimestruct, + pandas_datetime_to_datetimestruct, ) + cdef extern from 
"src/datetime/np_datetime.h": int64_t npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT fr, npy_datetimestruct *d) nogil cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.timedeltas import Timedelta -from pandas._libs.tslibs.timedeltas cimport ( - delta_to_nanoseconds, - is_any_td_scalar, -) +from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.ccalendar cimport ( + c_MONTH_NUMBERS, dayofweek, get_day_of_year, - is_leapyear, - get_week_of_year, get_days_in_month, + get_week_of_year, + is_leapyear, ) -from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.conversion import ensure_datetime64ns from pandas._libs.tslibs.dtypes cimport ( - PeriodDtypeBase, - FR_UND, FR_ANN, - FR_QTR, - FR_MTH, - FR_WK, FR_BUS, FR_DAY, FR_HR, FR_MIN, - FR_SEC, FR_MS, - FR_US, + FR_MTH, FR_NS, + FR_QTR, + FR_SEC, + FR_UND, + FR_US, + FR_WK, + PeriodDtypeBase, attrname_to_abbrevs, ) - from pandas._libs.tslibs.parsing cimport get_rule_month + from pandas._libs.tslibs.parsing import parse_time_string + from pandas._libs.tslibs.nattype cimport ( - _nat_scalar_rules, NPY_NAT, - is_null_datetimelike, + _nat_scalar_rules, c_NaT as NaT, c_nat_strings as nat_strings, + is_null_datetimelike, ) from pandas._libs.tslibs.offsets cimport ( BaseOffset, - to_offset, - is_tick_object, is_offset_object, + is_tick_object, + to_offset, ) -from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG +from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG cdef: enum: diff --git a/pandas/_libs/tslibs/strptime.pyx b/pandas/_libs/tslibs/strptime.pyx index 660b582f73e6e..d2690be905a68 100644 --- a/pandas/_libs/tslibs/strptime.pyx +++ b/pandas/_libs/tslibs/strptime.pyx @@ -1,27 +1,30 @@ """Strptime-related classes and functions. 
""" -import time -import locale import calendar +import locale import re +import time from cpython.datetime cimport date, tzinfo from _thread import allocate_lock as _thread_allocate_lock +import numpy as np import pytz -import numpy as np from numpy cimport int64_t -from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, dtstruct_to_dt64, npy_datetimestruct) - from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_nat_strings as nat_strings, + checknull_with_nat, ) +from pandas._libs.tslibs.np_datetime cimport ( + check_dts_bounds, + dtstruct_to_dt64, + npy_datetimestruct, +) + cdef dict _parse_code_table = {'y': 0, 'Y': 1, diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 8f3a599bf107c..ee32ed53a908b 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -2,39 +2,47 @@ import collections import cython -from cpython.object cimport Py_NE, Py_EQ, PyObject_RichCompare +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare import numpy as np + cimport numpy as cnp from numpy cimport int64_t, ndarray + cnp.import_array() -from cpython.datetime cimport (timedelta, - PyDateTime_Check, PyDelta_Check, - PyDateTime_IMPORT) +from cpython.datetime cimport ( + PyDateTime_Check, + PyDateTime_IMPORT, + PyDelta_Check, + timedelta, +) + PyDateTime_IMPORT cimport pandas._libs.tslibs.util as util -from pandas._libs.tslibs.util cimport ( - is_timedelta64_object, is_datetime64_object, is_integer_object, - is_float_object, is_array -) - from pandas._libs.tslibs.base cimport ABCTimestamp - from pandas._libs.tslibs.conversion cimport cast_from_unit - -from pandas._libs.tslibs.np_datetime cimport ( - cmp_scalar, td64_to_tdstruct, pandas_timedeltastruct) - from pandas._libs.tslibs.nattype cimport ( - checknull_with_nat, NPY_NAT, c_NaT as NaT, c_nat_strings as nat_strings, + checknull_with_nat, +) +from pandas._libs.tslibs.np_datetime cimport ( + cmp_scalar, + 
pandas_timedeltastruct, + td64_to_tdstruct, ) from pandas._libs.tslibs.offsets cimport is_tick_object +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, + is_integer_object, + is_timedelta64_object, +) # ---------------------------------------------------------------------- # Constants diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 8cef685933863..bddfc30d86a53 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -9,54 +9,66 @@ shadows the python class, where we do any heavy lifting. import warnings import numpy as np + cimport numpy as cnp -from numpy cimport int64_t, int8_t, uint8_t, ndarray -cnp.import_array() +from numpy cimport int8_t, int64_t, ndarray, uint8_t -from cpython.object cimport (PyObject_RichCompareBool, PyObject_RichCompare, - Py_EQ, Py_NE) +cnp.import_array() -from cpython.datetime cimport ( - datetime, - time, - tzinfo, - tzinfo as tzinfo_type, # alias bc `tzinfo` is a kwarg below +from cpython.datetime cimport ( # alias bc `tzinfo` is a kwarg below PyDateTime_Check, + PyDateTime_IMPORT, PyDelta_Check, PyTZInfo_Check, - PyDateTime_IMPORT, -) -PyDateTime_IMPORT - -from pandas._libs.tslibs.util cimport ( - is_datetime64_object, is_float_object, is_integer_object, - is_timedelta64_object, is_array, + datetime, + time, + tzinfo as tzinfo_type, ) +from cpython.object cimport Py_EQ, Py_NE, PyObject_RichCompare, PyObject_RichCompareBool -from pandas._libs.tslibs.base cimport ABCTimestamp +PyDateTime_IMPORT from pandas._libs.tslibs cimport ccalendar - +from pandas._libs.tslibs.base cimport ABCTimestamp from pandas._libs.tslibs.conversion cimport ( _TSObject, - convert_to_tsobject, convert_datetime_to_tsobject, + convert_to_tsobject, normalize_i8_stamp, ) -from pandas._libs.tslibs.fields import get_start_end_field, get_date_name_field +from pandas._libs.tslibs.util cimport ( + is_array, + is_datetime64_object, + is_float_object, 
+ is_integer_object, + is_timedelta64_object, +) + +from pandas._libs.tslibs.fields import get_date_name_field, get_start_end_field + from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( - check_dts_bounds, npy_datetimestruct, dt64_to_dtstruct, + check_dts_bounds, cmp_scalar, + dt64_to_dtstruct, + npy_datetimestruct, pydatetime_to_dt64, ) + from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime -from pandas._libs.tslibs.offsets cimport to_offset, is_offset_object -from pandas._libs.tslibs.timedeltas cimport is_any_td_scalar, delta_to_nanoseconds + +from pandas._libs.tslibs.offsets cimport is_offset_object, to_offset +from pandas._libs.tslibs.timedeltas cimport delta_to_nanoseconds, is_any_td_scalar + from pandas._libs.tslibs.timedeltas import Timedelta + from pandas._libs.tslibs.timezones cimport ( - is_utc, maybe_get_tz, treat_tz_as_pytz, utc_pytz as UTC, - get_timezone, tz_compare, + get_timezone, + is_utc, + maybe_get_tz, + treat_tz_as_pytz, + tz_compare, + utc_pytz as UTC, ) from pandas._libs.tslibs.tzconversion cimport ( tz_convert_from_utc_single, diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index a8c785704d8e8..b82291a71057e 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -1,27 +1,31 @@ from datetime import timezone + from cpython.datetime cimport datetime, timedelta, tzinfo # dateutil compat + from dateutil.tz import ( gettz as dateutil_gettz, tzfile as _dateutil_tzfile, tzlocal as _dateutil_tzlocal, tzutc as _dateutil_tzutc, ) - - -from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo import pytz +from pytz.tzinfo import BaseTzInfo as _pytz_BaseTzInfo + UTC = pytz.utc import numpy as np + cimport numpy as cnp from numpy cimport int64_t + cnp.import_array() # ---------------------------------------------------------------------- -from pandas._libs.tslibs.util cimport is_integer_object, get_nat +from 
pandas._libs.tslibs.util cimport get_nat, is_integer_object + cdef int64_t NPY_NAT = get_nat() cdef tzinfo utc_stdlib = timezone.utc diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 606639af16a18..2b148cd8849f1 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -5,21 +5,27 @@ import cython from cython import Py_ssize_t from cpython.datetime cimport ( - PyDateTime_IMPORT, PyDelta_Check, datetime, timedelta, tzinfo) + PyDateTime_IMPORT, + PyDelta_Check, + datetime, + timedelta, + tzinfo, +) + PyDateTime_IMPORT -import pytz from dateutil.tz import tzutc - import numpy as np +import pytz + cimport numpy as cnp -from numpy cimport ndarray, int64_t, uint8_t, intp_t +from numpy cimport int64_t, intp_t, ndarray, uint8_t + cnp.import_array() from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, HOUR_NANOS from pandas._libs.tslibs.nattype cimport NPY_NAT -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, dt64_to_dtstruct) +from pandas._libs.tslibs.np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from pandas._libs.tslibs.timezones cimport ( get_dst_info, get_utcoffset, diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index c8f8daf6724c2..bdc00f6c6e21a 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -1,18 +1,21 @@ import cython -from cpython.datetime cimport datetime, date, time, tzinfo +from cpython.datetime cimport date, datetime, time, tzinfo import numpy as np + from numpy cimport int64_t, intp_t, ndarray from .conversion cimport normalize_i8_stamp + from .dtypes import Resolution + from .nattype cimport NPY_NAT, c_NaT as NaT -from .np_datetime cimport npy_datetimestruct, dt64_to_dtstruct +from .np_datetime cimport dt64_to_dtstruct, npy_datetimestruct from .offsets cimport to_offset from .period cimport get_period_ordinal from .timestamps cimport create_timestamp_from_ts -from 
.timezones cimport is_utc, is_tzlocal, get_dst_info +from .timezones cimport get_dst_info, is_tzlocal, is_utc from .tzconversion cimport tz_convert_utc_to_tzlocal # ------------------------------------------------------------------------- diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 362d0e6263697..3ec4547d223ce 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -2,13 +2,15 @@ import cython from cython import Py_ssize_t -from libcpp.deque cimport deque -from libc.stdlib cimport malloc, free +from libc.stdlib cimport free, malloc +from libcpp.deque cimport deque import numpy as np + cimport numpy as cnp -from numpy cimport ndarray, int64_t, float64_t, float32_t, uint8_t +from numpy cimport float32_t, float64_t, int64_t, ndarray, uint8_t + cnp.import_array() @@ -22,6 +24,7 @@ from pandas._libs.algos import is_monotonic from pandas._libs.util cimport numeric + cdef extern from "../src/skiplist.h": ctypedef struct node_t: node_t **next diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 8a1e7feb57ace..9af1159a805ec 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -1,7 +1,8 @@ # cython: boundscheck=False, wraparound=False, cdivision=True import numpy as np -from numpy cimport ndarray, int64_t + +from numpy cimport int64_t, ndarray # Cython routines for window indexers diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 2d5b31d7ccbcf..40c39aabb7a7a 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -5,8 +5,8 @@ from cpython.bytes cimport PyBytes_GET_SIZE from cpython.unicode cimport PyUnicode_GET_SIZE import numpy as np -from numpy cimport ndarray, uint8_t +from numpy cimport ndarray, uint8_t ctypedef fused pandas_string: str diff --git a/pandas/_testing.py b/pandas/_testing.py index 1cf9304ed2715..a020fbff3553a 100644 --- a/pandas/_testing.py +++ 
b/pandas/_testing.py @@ -535,7 +535,7 @@ def rands(nchars): def close(fignum=None): - from matplotlib.pyplot import get_fignums, close as _close + from matplotlib.pyplot import close as _close, get_fignums if fignum is None: for fignum in get_fignums(): diff --git a/pandas/_typing.py b/pandas/_typing.py index 8e98833ad37f7..76ec527e6e258 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -24,13 +24,15 @@ # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: from pandas._libs import Period, Timedelta, Timestamp # noqa: F401 - from pandas.core.arrays.base import ExtensionArray # noqa: F401 + from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 - from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.generic import NDFrame # noqa: F401 + from pandas import Interval # noqa: F401 - from pandas.core.series import Series # noqa: F401 + from pandas.core.arrays.base import ExtensionArray # noqa: F401 from pandas.core.frame import DataFrame # noqa: F401 + from pandas.core.generic import NDFrame # noqa: F401 + from pandas.core.indexes.base import Index # noqa: F401 + from pandas.core.series import Series # noqa: F401 # array-like diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 0484de3fa165d..015b203a60256 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -14,7 +14,7 @@ from pandas import Index if TYPE_CHECKING: - from pandas import Series, DataFrame + from pandas import DataFrame, Series def load_reduce(self): diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 733dbeed34b72..6b8d7dc35fe95 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -15,7 +15,7 @@ from pandas.core.construction import create_series_with_explicit_dtype if TYPE_CHECKING: - from pandas import DataFrame, Series, Index + from pandas import DataFrame, Index, Series ResType = Dict[int, Any] diff --git a/pandas/core/arrays/categorical.py 
b/pandas/core/arrays/categorical.py index db9cfd9d7fc59..6e5c7bc699962 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -520,7 +520,7 @@ def _from_inferred_categories( ------- Categorical """ - from pandas import Index, to_numeric, to_datetime, to_timedelta + from pandas import Index, to_datetime, to_numeric, to_timedelta cats = Index(inferred_categories) known_categories = ( @@ -1403,7 +1403,7 @@ def value_counts(self, dropna=True): -------- Series.value_counts """ - from pandas import Series, CategoricalIndex + from pandas import CategoricalIndex, Series code, cat = self._codes, self.categories ncat, mask = len(cat), 0 <= code diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ee4d43fdb3bc2..c6945e2f78b5a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -959,7 +959,7 @@ def value_counts(self, dropna=False): ------- Series """ - from pandas import Series, Index + from pandas import Index, Series if dropna: values = self[~self.isna()]._data diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index b0958af41158c..57df067c7b16e 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -116,6 +116,7 @@ def __from_arrow__( Construct IntegerArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask pyarrow_type = pyarrow.from_numpy_dtype(self.type) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index c861d25afd13f..ed2437cc061bd 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1105,6 +1105,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. 
""" import pyarrow + from pandas.core.arrays._arrow_utils import ArrowIntervalType try: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 8d5cb12d60e4d..fe78481d99d30 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -300,6 +300,7 @@ def __arrow_array__(self, type=None): Convert myself into a pyarrow Array. """ import pyarrow + from pandas.core.arrays._arrow_utils import ArrowPeriodType if type is not None: diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8a30d2b954b55..da8d695c59b9e 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -87,8 +87,8 @@ def from_coo(cls, A, dense_index=False): 1 0 3.0 dtype: Sparse[float64, nan] """ - from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series from pandas import Series + from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series result = _coo_to_sparse_series(A, dense_index=dense_index) result = Series(result.array, index=result.index, copy=False) @@ -253,9 +253,10 @@ def from_spmatrix(cls, data, index=None, columns=None): 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ - from pandas import DataFrame from pandas._libs.sparse import IntIndex + from pandas import DataFrame + data = data.tocsc() index, columns = cls._prep_index(data, index, columns) n_rows, n_columns = data.shape @@ -354,8 +355,8 @@ def density(self) -> float: @staticmethod def _prep_index(data, index, columns): - import pandas.core.indexes.base as ibase from pandas.core.indexes.api import ensure_index + import pandas.core.indexes.base as ibase N, K = data.shape if index is None: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 86f6be77bc505..2b2431149e230 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -662,8 +662,10 @@ def register_plotting_backend_cb(key): def register_converter_cb(key): - from pandas.plotting import 
register_matplotlib_converters - from pandas.plotting import deregister_matplotlib_converters + from pandas.plotting import ( + deregister_matplotlib_converters, + register_matplotlib_converters, + ) if cf.get_option(key): register_matplotlib_converters() diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 6c58698989e96..47f10f1f65f4a 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -48,9 +48,9 @@ import pandas.core.common as com if TYPE_CHECKING: - from pandas.core.series import Series # noqa: F401 - from pandas.core.indexes.api import Index # noqa: F401 from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas.core.indexes.api import Index # noqa: F401 + from pandas.core.series import Series # noqa: F401 def array( @@ -255,14 +255,14 @@ def array( ValueError: Cannot pass scalar '1' to 'pandas.array'. """ from pandas.core.arrays import ( - period_array, BooleanArray, + DatetimeArray, IntegerArray, IntervalArray, PandasArray, - DatetimeArray, - TimedeltaArray, StringArray, + TimedeltaArray, + period_array, ) if lib.is_scalar(data): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6b84f0e81f48b..228329898b6a4 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1244,6 +1244,7 @@ def try_datetime(v): # if so coerce to a DatetimeIndex; if they are not the same, # then these stay as object dtype, xref GH19671 from pandas._libs.tslibs import conversion + from pandas import DatetimeIndex try: @@ -1303,8 +1304,8 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ - from pandas.core.tools.timedeltas import to_timedelta from pandas.core.tools.datetimes import to_datetime + from pandas.core.tools.timedeltas import to_timedelta if dtype is not None: if isinstance(dtype, str): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 
22480fbc47508..8350e136417b1 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -30,12 +30,13 @@ if TYPE_CHECKING: import pyarrow # noqa: F401 + + from pandas import Categorical # noqa: F401 from pandas.core.arrays import ( # noqa: F401 + DatetimeArray, IntervalArray, PeriodArray, - DatetimeArray, ) - from pandas import Categorical # noqa: F401 str_type = str @@ -391,12 +392,13 @@ def __repr__(self) -> str_type: @staticmethod def _hash_categories(categories, ordered: Ordered = True) -> int: + from pandas.core.dtypes.common import DT64NS_DTYPE, is_datetime64tz_dtype + from pandas.core.util.hashing import ( - hash_array, _combine_hash_arrays, + hash_array, hash_tuples, ) - from pandas.core.dtypes.common import is_datetime64tz_dtype, DT64NS_DTYPE if len(categories) and isinstance(categories[0], tuple): # assumes if any individual category is a tuple, then all our. ATM @@ -939,6 +941,7 @@ def __from_arrow__( Construct PeriodArray from pyarrow Array/ChunkedArray. """ import pyarrow # noqa: F811 + from pandas.core.arrays import PeriodArray from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -1136,6 +1139,7 @@ def __from_arrow__( Construct IntervalArray from pyarrow Array/ChunkedArray. 
""" import pyarrow # noqa: F811 + from pandas.core.arrays import IntervalArray if isinstance(array, pyarrow.Array): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3f634c1e6e1ff..79627e43d78c2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -150,6 +150,7 @@ if TYPE_CHECKING: from pandas.core.groupby.generic import DataFrameGroupBy + from pandas.io.formats.style import Styler # --------------------------------------------------------------------- @@ -5205,8 +5206,9 @@ def duplicated( 4 True dtype: bool """ + from pandas._libs.hashtable import _SIZE_HINT_LIMIT, duplicated_int64 + from pandas.core.sorting import get_group_index - from pandas._libs.hashtable import duplicated_int64, _SIZE_HINT_LIMIT if self.empty: return self._constructor_sliced(dtype=bool) @@ -7868,8 +7870,8 @@ def join( def _join_compat( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False ): - from pandas.core.reshape.merge import merge from pandas.core.reshape.concat import concat + from pandas.core.reshape.merge import merge if isinstance(other, Series): if other.name is None: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec7b14f27c5a1..c50b753cf3293 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -681,8 +681,8 @@ def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - from pandas.core.reshape.tile import cut from pandas.core.reshape.merge import _get_join_indexers + from pandas.core.reshape.tile import cut if bins is not None and not np.iterable(bins): # scalar bins cannot be done at top level diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 67003dffb90bb..8239a792c65dd 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -237,7 +237,6 @@ def __new__(cls, *args, **kwargs): # core/groupby/grouper.py::Grouper # raising these warnings from TimeGrouper directly would fail the 
test: # tests/resample/test_deprecated.py::test_deprecating_on_loffset_and_base - # hacky way to set the stacklevel: if cls is TimeGrouper it means # that the call comes from a pandas internal call of resample, # otherwise it comes from pd.Grouper diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 986d6323e704e..1be381e38b157 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5731,9 +5731,9 @@ def _maybe_cast_data_without_dtype(subarr): """ # Runtime import needed bc IntervalArray imports Index from pandas.core.arrays import ( + DatetimeArray, IntervalArray, PeriodArray, - DatetimeArray, TimedeltaArray, ) diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index fd9a9a5ef6c93..6eedf72726acb 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -5,8 +5,8 @@ from pandas._typing import ArrayLike if TYPE_CHECKING: - from pandas.core.internals.managers import BlockManager # noqa:F401 from pandas.core.internals.blocks import Block # noqa:F401 + from pandas.core.internals.managers import BlockManager # noqa:F401 def operate_blockwise( diff --git a/pandas/core/strings.py b/pandas/core/strings.py index a1db7742916de..6702bf519c52e 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -155,7 +155,7 @@ def _map_stringarray( an ndarray. """ - from pandas.arrays import IntegerArray, StringArray, BooleanArray + from pandas.arrays import BooleanArray, IntegerArray, StringArray mask = isna(arr) @@ -2186,7 +2186,7 @@ def _wrap_result( returns_string=True, ): - from pandas import Index, Series, MultiIndex + from pandas import Index, MultiIndex, Series # for category, we do the stuff on the categories, so blow it up # to the full series again @@ -2292,7 +2292,7 @@ def _get_series_list(self, others): list of Series Others transformed into list of Series. 
""" - from pandas import Series, DataFrame + from pandas import DataFrame, Series # self._orig is either Series or Index idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 0adab143f6052..7aac2f793f61a 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -53,9 +53,10 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from pandas import Series # noqa:F401 from pandas._libs.tslibs.nattype import NaTType # noqa:F401 + from pandas import Series # noqa:F401 + # --------------------------------------------------------------------- # types used in annotations @@ -876,7 +877,7 @@ def _assemble_from_unit_mappings(arg, errors, tz): ------- Series """ - from pandas import to_timedelta, to_numeric, DataFrame + from pandas import DataFrame, to_numeric, to_timedelta arg = DataFrame(arg) if not arg.columns.is_unique: diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 1b56b6d5a46fa..d79b9f4092325 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -275,7 +275,7 @@ def hash_array( # then hash and rename categories. We allow skipping the categorization # when the values are known/likely to be unique. 
if categorize: - from pandas import factorize, Categorical, Index + from pandas import Categorical, Index, factorize codes, categories = factorize(vals, sort=False) cat = Categorical(codes, Index(categories), ordered=False, fastpath=True) diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index 40bff5a75709b..d16955a98b62f 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -311,17 +311,17 @@ def init_windows_clipboard(): global HGLOBAL, LPVOID, DWORD, LPCSTR, INT global HWND, HINSTANCE, HMENU, BOOL, UINT, HANDLE from ctypes.wintypes import ( - HGLOBAL, - LPVOID, + BOOL, DWORD, - LPCSTR, - INT, - HWND, + HANDLE, + HGLOBAL, HINSTANCE, HMENU, - BOOL, + HWND, + INT, + LPCSTR, + LPVOID, UINT, - HANDLE, ) windll = ctypes.windll @@ -528,8 +528,8 @@ def determine_clipboard(): # Setup for the MAC OS X platform: if os.name == "mac" or platform.system() == "Darwin": try: - import Foundation # check if pyobjc is installed import AppKit + import Foundation # check if pyobjc is installed except ImportError: return init_osx_pbcopy_clipboard() else: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2a12f779230b2..b1bbda4a4b7e0 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -834,8 +834,8 @@ class ExcelFile: from pandas.io.excel._odfreader import _ODFReader from pandas.io.excel._openpyxl import _OpenpyxlReader - from pandas.io.excel._xlrd import _XlrdReader from pandas.io.excel._pyxlsb import _PyxlsbReader + from pandas.io.excel._xlrd import _XlrdReader _engines = { "xlrd": _XlrdReader, diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 85ec9afaaec25..44abaf5d3b3c9 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -191,9 +191,9 @@ def _get_cell_string_value(self, cell) -> str: Find and decode OpenDocument text:s tags that represent a run length encoded sequence of space characters. 
""" - from odf.element import Text, Element - from odf.text import S, P + from odf.element import Element, Text from odf.namespaces import TEXTNS + from odf.text import P, S text_p = P().qname text_s = S().qname diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 0696d82e51f34..03a30cbd62f9a 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -225,7 +225,7 @@ def _convert_to_fill(cls, fill_dict): ------- fill : openpyxl.styles.Fill """ - from openpyxl.styles import PatternFill, GradientFill + from openpyxl.styles import GradientFill, PatternFill _pattern_fill_key_map = { "patternType": "fill_type", diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 8f7d3b1368fc7..af82c15fd6b66 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -48,11 +48,11 @@ def get_sheet_by_index(self, index): def get_sheet_data(self, sheet, convert_float): from xlrd import ( - xldate, + XL_CELL_BOOLEAN, XL_CELL_DATE, XL_CELL_ERROR, - XL_CELL_BOOLEAN, XL_CELL_NUMBER, + xldate, ) epoch1904 = self.book.datemode diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index b2cd8e9319791..296fc341bf817 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -72,7 +72,7 @@ from pandas.io.formats.printing import adjoin, justify, pprint_thing if TYPE_CHECKING: - from pandas import Series, DataFrame, Categorical + from pandas import Categorical, DataFrame, Series FormattersType = Union[ List[Callable], Tuple[Callable, ...], Mapping[Union[str, int], Callable] diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index d11144938eb26..fd1efa2d1b668 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -42,8 +42,8 @@ try: - import matplotlib.pyplot as plt from matplotlib import colors + import matplotlib.pyplot as plt has_mpl = True except ImportError: diff --git a/pandas/io/html.py b/pandas/io/html.py index 3193f52d239f1..8354cf413814e 
100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -707,8 +707,8 @@ def _build_doc(self): -------- pandas.io.html._HtmlFrameParser._build_doc """ - from lxml.html import parse, fromstring, HTMLParser from lxml.etree import XMLSyntaxError + from lxml.html import HTMLParser, fromstring, parse parser = HTMLParser(recover=True, encoding=self.encoding) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b67a1c5781d91..e0df4c29e543e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -57,7 +57,7 @@ from pandas.io.formats.printing import adjoin, pprint_thing if TYPE_CHECKING: - from tables import File, Node, Col # noqa:F401 + from tables import Col, File, Node # noqa:F401 # versioning attribute diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx index 0038e39e2ffcc..17b41fd2b4379 100644 --- a/pandas/io/sas/sas.pyx +++ b/pandas/io/sas/sas.pyx @@ -1,8 +1,8 @@ # cython: profile=False # cython: boundscheck=False, initializedcheck=False from cython import Py_ssize_t - import numpy as np + import pandas.io.sas.sas_constants as const ctypedef signed long long int64_t diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 9177696ca13d6..c87391eaa62b1 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -937,7 +937,7 @@ def _get_column_names_and_types(self, dtype_mapper): return column_names_and_types def _create_table_setup(self): - from sqlalchemy import Table, Column, PrimaryKeyConstraint + from sqlalchemy import Column, PrimaryKeyConstraint, Table column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type) @@ -1026,15 +1026,15 @@ def _sqlalchemy_type(self, col): col_type = lib.infer_dtype(col, skipna=True) from sqlalchemy.types import ( + TIMESTAMP, BigInteger, - Integer, - Float, - Text, Boolean, - DateTime, Date, + DateTime, + Float, + Integer, + Text, Time, - TIMESTAMP, ) if col_type == "datetime64" or col_type == "datetime": @@ -1079,7 +1079,7 @@ def _sqlalchemy_type(self, col): return Text def 
_get_dtype(self, sqltype): - from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP + from sqlalchemy.types import TIMESTAMP, Boolean, Date, DateTime, Float, Integer if isinstance(sqltype, Float): return float @@ -1374,7 +1374,7 @@ def to_sql( dtype = {col_name: dtype for col_name in frame} if dtype is not None: - from sqlalchemy.types import to_instance, TypeEngine + from sqlalchemy.types import TypeEngine, to_instance for col, my_type in dtype.items(): if not isinstance(to_instance(my_type), TypeEngine): diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 353bc8a8936a5..b490e07e43753 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1149,8 +1149,8 @@ def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): @classmethod def _ts_plot(cls, ax, x, data, style=None, **kwds): from pandas.plotting._matplotlib.timeseries import ( - _maybe_resample, _decorate_axes, + _maybe_resample, format_dateaxis, ) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 8f3571cf13cbc..95f9fbf3995ed 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -24,7 +24,7 @@ from pandas.tseries.frequencies import get_period_alias, is_subperiod, is_superperiod if TYPE_CHECKING: - from pandas import Series, Index # noqa:F401 + from pandas import Index, Series # noqa:F401 # --------------------------------------------------------------------- diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index ecd20796b6f21..caa348d3a1fb9 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -267,9 +267,10 @@ def test_sparsearray(): def test_np(): - import numpy as np import warnings + import numpy as np + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) assert (pd.np.arange(0, 10) == np.arange(0, 
10)).all() diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index d517eaaec68d2..0176755b54dd1 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -142,6 +142,7 @@ def test_repr(): @pyarrow_skip def test_arrow_extension_type(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType p1 = ArrowIntervalType(pa.int64(), "left") @@ -158,6 +159,7 @@ def test_arrow_extension_type(): @pyarrow_skip def test_arrow_array(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType intervals = pd.interval_range(1, 5, freq=1).array @@ -187,6 +189,7 @@ def test_arrow_array(): @pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0]) @@ -221,6 +224,7 @@ def test_arrow_array_missing(): ) def test_arrow_table_roundtrip(breaks): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowIntervalType arr = IntervalArray.from_breaks(breaks) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 8887dd0278afe..0d81e8e733842 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -359,6 +359,7 @@ def test_arrow_extension_type(): ) def test_arrow_array(data, freq): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType periods = period_array(data, freq=freq) @@ -384,6 +385,7 @@ def test_arrow_array(data, freq): @pyarrow_skip def test_arrow_array_missing(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 2, 3], freq="D") @@ -399,6 +401,7 @@ def test_arrow_array_missing(): @pyarrow_skip def test_arrow_table_roundtrip(): import pyarrow as pa + from pandas.core.arrays._arrow_utils import ArrowPeriodType arr = PeriodArray([1, 
2, 3], freq="D") diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 9d6b9f39a0578..52a1e3aae9058 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -287,7 +287,7 @@ def test_stat_op_api(self, float_frame, float_string_frame): assert_stat_op_api("median", float_frame, float_string_frame) try: - from scipy.stats import skew, kurtosis # noqa:F401 + from scipy.stats import kurtosis, skew # noqa:F401 assert_stat_op_api("skew", float_frame, float_string_frame) assert_stat_op_api("kurt", float_frame, float_string_frame) @@ -370,7 +370,7 @@ def kurt(x): ) try: - from scipy import skew, kurtosis # noqa:F401 + from scipy import kurtosis, skew # noqa:F401 assert_stat_op_calc("skew", skewness, float_frame_with_na) assert_stat_op_calc("kurt", kurt, float_frame_with_na) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index ec4162f87010f..7bb1d98086a91 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -59,6 +59,7 @@ def test_reindex_with_same_tz(self): def test_time_loc(self): # GH8667 from datetime import time + from pandas._libs.index import _SIZE_CUTOFF ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64) diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py index be193e0854d8d..d8e56661b7d61 100644 --- a/pandas/tests/indexing/multiindex/test_indexing_slow.py +++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py @@ -15,7 +15,7 @@ def test_multiindex_get_loc(): # GH7724, GH2646 with warnings.catch_warnings(record=True): # test indexing into a multi-index before & past the lexsort depth - from numpy.random import randint, choice, randn + from numpy.random import choice, randint, randn cols = ["jim", "joe", "jolie", "joline", "jolia"] diff --git a/pandas/tests/io/test_fsspec.py 
b/pandas/tests/io/test_fsspec.py index c397a61616c1c..d64e2d1933ace 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -37,8 +37,8 @@ def test_read_csv(cleared_fs): def test_reasonable_error(monkeypatch, cleared_fs): - from fsspec.registry import known_implementations from fsspec import registry + from fsspec.registry import known_implementations registry.target.clear() with pytest.raises(ValueError) as e: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4d93119ffa3f5..eacf4fa08545d 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -11,8 +11,7 @@ @td.skip_if_no("gcsfs") def test_read_csv_gcs(monkeypatch): - from fsspec import AbstractFileSystem - from fsspec import registry + from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state @@ -37,8 +36,7 @@ def open(*args, **kwargs): @td.skip_if_no("gcsfs") def test_to_csv_gcs(monkeypatch): - from fsspec import AbstractFileSystem - from fsspec import registry + from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state df1 = DataFrame( @@ -76,8 +74,7 @@ def mock_get_filepath_or_buffer(*args, **kwargs): @td.skip_if_no("gcsfs") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" - from fsspec import AbstractFileSystem - from fsspec import registry + from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state df1 = DataFrame( diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 0991fae39138e..29b787d39c09d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -48,10 +48,10 @@ try: import sqlalchemy - import sqlalchemy.schema - import sqlalchemy.sql.sqltypes as sqltypes from sqlalchemy.ext import declarative from sqlalchemy.orm import session as sa_session + import sqlalchemy.schema + import 
sqlalchemy.sql.sqltypes as sqltypes SQLALCHEMY_INSTALLED = True except ImportError: diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 896d3278cdde1..3b1ff233c5ec1 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -13,7 +13,6 @@ from pandas import DataFrame, Series import pandas._testing as tm - """ This is a common base class used for various plotting tests """ @@ -24,6 +23,7 @@ class TestPlotBase: def setup_method(self, method): import matplotlib as mpl + from pandas.plotting._matplotlib import compat mpl.rcdefaults() @@ -187,8 +187,8 @@ def _check_colors( Series used for color grouping key used for andrew_curves, parallel_coordinates, radviz test """ + from matplotlib.collections import Collection, LineCollection, PolyCollection from matplotlib.lines import Line2D - from matplotlib.collections import Collection, PolyCollection, LineCollection conv = self.colorconverter if linecolors is not None: diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 317a994bd9a32..ee43e5d7072fe 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2408,8 +2408,8 @@ def test_specified_props_kwd_plot_box(self, props, expected): assert result[expected][0].get_color() == "C1" def test_default_color_cycle(self): - import matplotlib.pyplot as plt import cycler + import matplotlib.pyplot as plt colors = list("rgbk") plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors) @@ -2953,8 +2953,8 @@ def _check(axes): @td.skip_if_no_scipy def test_memory_leak(self): """ Check that every plot type gets properly collected. 
""" - import weakref import gc + import weakref results = {} for kind in plotting.PlotAccessor._all_kinds: @@ -3032,8 +3032,8 @@ def test_df_subplots_patterns_minorticks(self): @pytest.mark.slow def test_df_gridspec_patterns(self): # GH 10819 - import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec + import matplotlib.pyplot as plt ts = Series(np.random.randn(10), index=date_range("1/1/2000", periods=10)) @@ -3422,9 +3422,9 @@ def test_xlabel_ylabel_dataframe_subplots( def _generate_4_axes_via_gridspec(): - import matplotlib.pyplot as plt import matplotlib as mpl import matplotlib.gridspec # noqa + import matplotlib.pyplot as plt gs = mpl.gridspec.GridSpec(2, 2) ax_tl = plt.subplot(gs[0, 0]) diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py index b6a6c326c3df3..34c881855d16a 100644 --- a/pandas/tests/plotting/test_hist_method.py +++ b/pandas/tests/plotting/test_hist_method.py @@ -101,7 +101,7 @@ def test_hist_layout_with_by(self): @pytest.mark.slow def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf + from matplotlib.pyplot import gcf, subplot x = Series(randn(2)) y = Series(randn(2)) @@ -352,6 +352,7 @@ class TestDataFrameGroupByPlots(TestPlotBase): @pytest.mark.slow def test_grouped_hist_legacy(self): from matplotlib.patches import Rectangle + from pandas.plotting._matplotlib.hist import _grouped_hist df = DataFrame(randn(500, 2), columns=["A", "B"]) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 75eeede472fe9..f5c1c58f3f7ed 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -131,9 +131,10 @@ def test_scatter_matrix_axis(self): @pytest.mark.slow def test_andrews_curves(self, iris): - from pandas.plotting import andrews_curves from matplotlib import cm + from pandas.plotting import andrews_curves + df = iris _check_plot_works(andrews_curves, frame=df, class_column="Name") @@ -206,9 +207,10 @@ 
def test_andrews_curves(self, iris): @pytest.mark.slow def test_parallel_coordinates(self, iris): - from pandas.plotting import parallel_coordinates from matplotlib import cm + from pandas.plotting import parallel_coordinates + df = iris ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") @@ -279,9 +281,10 @@ def test_parallel_coordinates_with_sorted_labels(self): @pytest.mark.slow def test_radviz(self, iris): - from pandas.plotting import radviz from matplotlib import cm + from pandas.plotting import radviz + df = iris _check_plot_works(radviz, frame=df, class_column="Name") @@ -397,6 +400,7 @@ def test_get_standard_colors_no_appending(self): # Make sure not to add more colors so that matplotlib can cycle # correctly. from matplotlib import cm + from pandas.plotting._matplotlib.style import _get_standard_colors color_before = cm.gnuplot(range(5)) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 151bb3bed7207..cc00626e992f3 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -452,7 +452,7 @@ def test_hist_layout_with_by(self): @pytest.mark.slow def test_hist_no_overlap(self): - from matplotlib.pyplot import subplot, gcf + from matplotlib.pyplot import gcf, subplot x = Series(randn(2)) y = Series(randn(2)) @@ -827,6 +827,7 @@ def test_standard_colors(self): @pytest.mark.slow def test_standard_colors_all(self): import matplotlib.colors as colors + from pandas.plotting._matplotlib.style import _get_standard_colors # multiple colors like mediumaquamarine diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 0b34fab7b80b1..088f8681feb99 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -11,7 +11,6 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas._testing as tm - """ Also test support for 
datetime64[ns] in Series / DataFrame """ @@ -166,6 +165,7 @@ def test_getitem_setitem_datetime_tz_pytz(): def test_getitem_setitem_datetime_tz_dateutil(): from dateutil.tz import tzutc + from pandas._libs.tslibs.timezones import dateutil_gettz as gettz tz = ( diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py index 19caf4eccf748..4b4ef5ea046be 100644 --- a/pandas/tests/series/methods/test_asof.py +++ b/pandas/tests/series/methods/test_asof.py @@ -90,7 +90,7 @@ def test_with_nan(self): tm.assert_series_equal(result, expected) def test_periodindex(self): - from pandas import period_range, PeriodIndex + from pandas import PeriodIndex, period_range # array or list or dates N = 50 diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 5c8a0d224c4f9..ef2bafd4ea2ad 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -195,8 +195,8 @@ def test_add_with_duplicate_index(self): tm.assert_series_equal(result, expected) def test_add_na_handling(self): - from decimal import Decimal from datetime import date + from decimal import Decimal s = Series( [Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)] diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index e718a6b759963..b32c5e91af295 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -90,7 +90,7 @@ def test_statsmodels(): def test_scikit_learn(df): sklearn = import_module("sklearn") # noqa - from sklearn import svm, datasets + from sklearn import datasets, svm digits = datasets.load_digits() clf = svm.SVC(gamma=0.001, C=100.0) diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py index f413490764124..3a8a1a3144269 100644 --- a/pandas/util/_doctools.py +++ b/pandas/util/_doctools.py @@ -53,8 +53,8 @@ def plot(self, left, right, labels=None, vertical: bool = True): vertical : bool, default True If 
True, use vertical layout. If False, use horizontal layout. """ - import matplotlib.pyplot as plt import matplotlib.gridspec as gridspec + import matplotlib.pyplot as plt if not isinstance(left, list): left = [left] diff --git a/requirements-dev.txt b/requirements-dev.txt index 7bf3df176b378..c0dd77cd73ddc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -11,7 +11,7 @@ cpplint flake8<3.8.0 flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 -isort==4.3.21 +isort>=5.2.1 mypy==0.730 pycodestyle gitpython From 5a48bda4386b9a7e5242eb88e76155709ce4e7f6 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 1 Aug 2020 13:46:27 +0100 Subject: [PATCH 0417/1025] CI: xfail numpy-dev (#35502) --- pandas/tests/indexes/common.py | 12 +++++++++++- pandas/tests/indexing/test_loc.py | 3 +++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c8b780455f862..f5b9f4a401e60 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -5,6 +5,7 @@ import pytest from pandas._libs import iNaT +from pandas.compat.numpy import _is_numpy_dev from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_datetime64tz_dtype @@ -417,7 +418,7 @@ def test_set_ops_error_cases(self, case, method, index): with pytest.raises(TypeError, match=msg): getattr(index, method)(case) - def test_intersection_base(self, index): + def test_intersection_base(self, index, request): if isinstance(index, CategoricalIndex): return @@ -434,6 +435,15 @@ def test_intersection_base(self, index): # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: + # https://github.com/pandas-dev/pandas/issues/35481 + if ( + _is_numpy_dev + and isinstance(case, Series) + and isinstance(index, UInt64Index) + ): + mark = pytest.mark.xfail(reason="gh-35481") + request.node.add_marker(mark) + result = first.intersection(case) assert 
tm.equalContents(result, second) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 30b13b6ea9fce..193800fae751f 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -5,6 +5,8 @@ import numpy as np import pytest +from pandas.compat.numpy import _is_numpy_dev + import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range import pandas._testing as tm @@ -945,6 +947,7 @@ def test_loc_setitem_empty_append(self): df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(_is_numpy_dev, reason="gh-35481") def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe From 8c650b6372bb72603428d050a8b9f2c68647586d Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sat, 1 Aug 2020 09:10:24 -0400 Subject: [PATCH 0418/1025] DOC: Expanded Using a Docker Container section (#35345) (#35379) --- doc/source/development/contributing.rst | 32 +++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 1b0e36e7b6933..4ffd1d586a99a 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -153,14 +153,38 @@ to build the documentation locally before pushing your changes. Using a Docker container ~~~~~~~~~~~~~~~~~~~~~~~~ -Instead of manually setting up a development environment, you can use Docker to -automatically create the environment with just several commands. Pandas provides a `DockerFile` -in the root directory to build a Docker image with a full pandas development environment. +Instead of manually setting up a development environment, you can use `Docker +`_ to automatically create the environment with just several +commands. Pandas provides a `DockerFile` in the root directory to build a Docker image +with a full pandas development environment. 
-Even easier, you can use the DockerFile to launch a remote session with Visual Studio Code, +**Docker Commands** + +Pass your GitHub username in the `DockerFile` to use your own fork:: + + # Build the image pandas-yourname-env + docker build --tag pandas-yourname-env . + # Run a container and bind your local forked repo, pandas-yourname, to the container + docker run -it --rm -v path-to-pandas-yourname:/home/pandas-yourname pandas-yourname-env + +Even easier, you can integrate Docker with the following IDEs: + +**Visual Studio Code** + +You can use the DockerFile to launch a remote session with Visual Studio Code, a popular free IDE, using the `.devcontainer.json` file. See https://code.visualstudio.com/docs/remote/containers for details. +**PyCharm (Professional)** + +Enable Docker support and use the Services tool window to build and manage images as well as +run and interact with containers. +See https://www.jetbrains.com/help/pycharm/docker.html for details. + +Note that you might need to rebuild the C extensions if/when you merge with upstream/master using:: + + python setup.py build_ext --inplace -j 4 + .. _contributing.dev_c: Installing a C compiler From 10c31127133245699680fdf109f5fcb74c5fc02d Mon Sep 17 00:00:00 2001 From: Adam Spannbauer Date: Sat, 1 Aug 2020 13:05:54 -0400 Subject: [PATCH 0419/1025] DOC: add note on str cons to read_sql (#35503) --- pandas/io/sql.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/sql.py b/pandas/io/sql.py index c87391eaa62b1..51888e5021d80 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -439,7 +439,8 @@ def read_sql( con : SQLAlchemy connectable, str, or sqlite3 connection Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible - for engine disposal and connection closure for the SQLAlchemy connectable. 
See + for engine disposal and connection closure for the SQLAlchemy connectable; str + connections are closed automatically. See `here `_. index_col : str or list of str, optional, default: None Column(s) to set as index(MultiIndex). From 41848ee2514168a6bc04c0226edd18a54eef0194 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Sat, 1 Aug 2020 13:10:09 -0400 Subject: [PATCH 0420/1025] BUG: date_range doesn't propagate ambigous=False to tz_localize (#35302) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arrays/datetimes.py | 4 +- .../indexes/datetimes/test_constructors.py | 59 +++++++++++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2066858e5de86..b16ca0a80c5b4 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -71,7 +71,7 @@ Timedelta Timezones ^^^^^^^^^ -- +- Bug in :func:`date_range` was raising AmbiguousTimeError for valid input with `ambiguous=False` (:issue:`35297`) - diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d674b1c476d2c..8b2bb7832b5d0 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -418,9 +418,9 @@ def _generate_range( # index is localized datetime64 array -> have to convert # start/end as well to compare if start is not None: - start = start.tz_localize(tz).asm8 + start = start.tz_localize(tz, ambiguous, nonexistent).asm8 if end is not None: - end = end.tz_localize(tz).asm8 + end = end.tz_localize(tz, ambiguous, nonexistent).asm8 else: # Create a linearly spaced date_range in local time # Nanosecond-granularity timestamps aren't always correctly diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index c150e7901c86a..9a855a1624520 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ 
b/pandas/tests/indexes/datetimes/test_constructors.py @@ -787,6 +787,65 @@ def test_construction_with_nat_and_tzlocal(self): expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT]) tm.assert_index_equal(result, expected) + def test_constructor_with_ambiguous_keyword_arg(self): + # GH 35297 + + expected = DatetimeIndex( + ["2020-11-01 01:00:00", "2020-11-02 01:00:00"], + dtype="datetime64[ns, America/New_York]", + freq="D", + ambiguous=False, + ) + + # ambiguous keyword in start + timezone = "America/New_York" + start = pd.Timestamp(year=2020, month=11, day=1, hour=1).tz_localize( + timezone, ambiguous=False + ) + result = pd.date_range(start=start, periods=2, ambiguous=False) + tm.assert_index_equal(result, expected) + + # ambiguous keyword in end + timezone = "America/New_York" + end = pd.Timestamp(year=2020, month=11, day=2, hour=1).tz_localize( + timezone, ambiguous=False + ) + result = pd.date_range(end=end, periods=2, ambiguous=False) + tm.assert_index_equal(result, expected) + + def test_constructor_with_nonexistent_keyword_arg(self): + # GH 35297 + + timezone = "Europe/Warsaw" + + # nonexistent keyword in start + start = pd.Timestamp("2015-03-29 02:30:00").tz_localize( + timezone, nonexistent="shift_forward" + ) + result = pd.date_range(start=start, periods=2, freq="H") + expected = DatetimeIndex( + [ + pd.Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), + pd.Timestamp("2015-03-29 04:00:00+02:00", tz=timezone), + ] + ) + + tm.assert_index_equal(result, expected) + + # nonexistent keyword in end + end = pd.Timestamp("2015-03-29 02:30:00").tz_localize( + timezone, nonexistent="shift_forward" + ) + result = pd.date_range(end=end, periods=2, freq="H") + expected = DatetimeIndex( + [ + pd.Timestamp("2015-03-29 01:00:00+01:00", tz=timezone), + pd.Timestamp("2015-03-29 03:00:00+02:00", tz=timezone), + ] + ) + + tm.assert_index_equal(result, expected) + def test_constructor_no_precision_raises(self): # GH-24753, GH-24739 From 
a0169458568a9568c093d03494dc1639eb317d21 Mon Sep 17 00:00:00 2001 From: Mohammad Jafar Mashhadi Date: Sun, 2 Aug 2020 04:18:01 -0600 Subject: [PATCH 0421/1025] Added alignable boolean series and its example to `.loc` docs. (#35506) --- pandas/core/indexing.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 04d1dbceb3342..dd81823055390 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -255,6 +255,8 @@ def loc(self) -> "_LocIndexer": - A boolean array of the same length as the axis being sliced, e.g. ``[True, False, True]``. + - An alignable boolean Series. The index of the key will be aligned before + masking. - A ``callable`` function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above) @@ -264,6 +266,8 @@ def loc(self) -> "_LocIndexer": ------ KeyError If any items are not found. + IndexingError + If an indexed key is passed and its index is unalignable to the frame index. See Also -------- @@ -319,6 +323,13 @@ def loc(self) -> "_LocIndexer": max_speed shield sidewinder 7 8 + Alignable boolean Series: + + >>> df.loc[pd.Series([False, True, False], + ... 
index=['viper', 'sidewinder', 'cobra'])] + max_speed shield + sidewinder 7 8 + Conditional that returns a boolean Series >>> df.loc[df['shield'] > 6] From 21693477df5db7444aa1b215953409f35aab84de Mon Sep 17 00:00:00 2001 From: Thomas Smith Date: Tue, 4 Aug 2020 00:13:37 +0100 Subject: [PATCH 0422/1025] TST: adding test for .describe() with duplicate columns (#35424) --- pandas/tests/groupby/test_function.py | 62 +++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e693962e57ac3..cbfba16223f74 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -992,6 +992,68 @@ def test_frame_describe_unstacked_format(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:" + "indexing past lexsort depth may impact performance:" + "pandas.errors.PerformanceWarning" +) +@pytest.mark.parametrize("as_index", [True, False]) +def test_describe_with_duplicate_output_column_names(as_index): + # GH 35314 + df = pd.DataFrame( + { + "a": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + }, + columns=["a", "b", "b"], + ) + + expected = ( + pd.DataFrame.from_records( + [ + ("a", "count", 3.0, 3.0), + ("a", "mean", 88.0, 99.0), + ("a", "std", 0.0, 0.0), + ("a", "min", 88.0, 99.0), + ("a", "25%", 88.0, 99.0), + ("a", "50%", 88.0, 99.0), + ("a", "75%", 88.0, 99.0), + ("a", "max", 88.0, 99.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ], + ) + .set_index([0, 1]) + .T + ) + expected.columns.names = [None, None] 
+ expected.index = pd.Index([88, 99], name="a") + + if as_index: + expected = expected.drop(columns=["a"], level=0) + else: + expected = expected.reset_index(drop=True) + + result = df.groupby("a", as_index=as_index).describe() + + tm.assert_frame_equal(result, expected) + + def test_groupby_mean_no_overflow(): # Regression test for (#22487) df = pd.DataFrame( From 7d1f59c5449ace93cd03a825f878dff91fc109ba Mon Sep 17 00:00:00 2001 From: Thomas Smith Date: Tue, 4 Aug 2020 00:19:26 +0100 Subject: [PATCH 0423/1025] TST: ensure that DataFrameGroupBy.apply does not convert datetime.date to pd.Timestamp (#35504) --- pandas/tests/groupby/test_apply.py | 32 +++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 5a1268bfb03db..525a6fe2637c3 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import date, datetime from io import StringIO import numpy as np @@ -1014,3 +1014,33 @@ def test_apply_with_timezones_aware(): result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy()) tm.assert_frame_equal(result1, result2) + + +def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): + # GH 29617 + + df = pd.DataFrame( + { + "A": ["a", "a", "a", "b"], + "B": [ + date(2020, 1, 10), + date(2020, 1, 10), + date(2020, 2, 10), + date(2020, 2, 10), + ], + "C": [1, 2, 3, 4], + }, + index=pd.Index([100, 101, 102, 103], name="idx"), + ) + + grp = df.groupby(["A", "B"]) + result = grp.apply(lambda x: x.head(1)) + + expected = df.iloc[[0, 2, 3]] + expected = expected.reset_index() + expected.index = pd.MultiIndex.from_frame(expected[["A", "B", "idx"]]) + expected = expected.drop(columns="idx") + + tm.assert_frame_equal(result, expected) + for val in result.index.levels[1]: + assert type(val) is date From 9912b51c02ce6743d8e947b77296e87975ce0db1 
Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 3 Aug 2020 16:22:03 -0700 Subject: [PATCH 0424/1025] REF: de-duplicate get_resolution (#35245) --- pandas/_libs/tslibs/vectorized.pyx | 57 +++++++++++++----------------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index bdc00f6c6e21a..b23f8255a76ac 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -211,49 +211,40 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None): int reso = RESO_DAY, curr_reso ndarray[int64_t] trans int64_t[:] deltas - Py_ssize_t[:] pos - int64_t local_val, delta + intp_t[:] pos + int64_t local_val, delta = NPY_NAT + bint use_utc = False, use_tzlocal = False, use_fixed = False if is_utc(tz) or tz is None: - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso + use_utc = True elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso + use_tzlocal = True else: - # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True delta = deltas[0] - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + delta, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso else: pos = trans.searchsorted(stamps, side="right") - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso + + for i in range(n): + if stamps[i] == NPY_NAT: + continue 
+ + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta + else: + local_val = stamps[i] + deltas[pos[i]] + + dt64_to_dtstruct(local_val, &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso return Resolution(reso) From 8f9765af1c490d1282f60ad758e9b49bd39b9daa Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 3 Aug 2020 18:29:31 -0500 Subject: [PATCH 0425/1025] REGR: Check for float in isnaobj_old (#35510) --- doc/source/whatsnew/v1.1.1.rst | 2 +- pandas/_libs/missing.pyx | 5 ++++- pandas/tests/io/parser/test_common.py | 12 +++++++++++- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 443589308ad4c..f8f655ce7b866 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -15,7 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- +- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). 
- - diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 760fab3781fd4..771e8053ac9be 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -155,7 +155,10 @@ def isnaobj_old(arr: ndarray) -> ndarray: result = np.zeros(n, dtype=np.uint8) for i in range(n): val = arr[i] - result[i] = checknull(val) or val == INF or val == NEGINF + result[i] = ( + checknull(val) + or util.is_float_object(val) and (val == INF or val == NEGINF) + ) return result.view(np.bool_) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 12e73bae40eac..5154a9ba6fdf0 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -18,7 +18,7 @@ from pandas.errors import DtypeWarning, EmptyDataError, ParserError import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex, Series, compat, concat +from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context import pandas._testing as tm from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser @@ -2179,3 +2179,13 @@ def test_read_csv_names_not_accepting_sets(all_parsers): parser = all_parsers with pytest.raises(ValueError, match="Names should be an ordered collection."): parser.read_csv(StringIO(data), names=set("QAZ")) + + +def test_read_csv_with_use_inf_as_na(all_parsers): + # https://github.com/pandas-dev/pandas/issues/35493 + parser = all_parsers + data = "1.0\nNaN\n3.0" + with option_context("use_inf_as_na", True): + result = parser.read_csv(StringIO(data), header=None) + expected = DataFrame([1.0, np.nan, 3.0]) + tm.assert_frame_equal(result, expected) From 3a0a2b972331c722b4a4237228b6abcbc07df046 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 4 Aug 2020 00:30:24 +0100 Subject: [PATCH 0426/1025] REF: remove special casing from Index.equals (always dispatch to subclass) (#35330) --- pandas/core/indexes/base.py | 19 +++++++++---------- 1 file changed, 9 
insertions(+), 10 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1be381e38b157..7ba94c76d0037 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4252,16 +4252,15 @@ def equals(self, other: Any) -> bool: if not isinstance(other, Index): return False - if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype): - # if other is not object, use other's logic for coercion - return other.equals(self) - - if isinstance(other, ABCMultiIndex): - # d-level MultiIndex can equal d-tuple Index - return other.equals(self) - - if is_extension_array_dtype(other.dtype): - # All EA-backed Index subclasses override equals + # If other is a subclass of self and defines it's own equals method, we + # dispatch to the subclass method. For instance for a MultiIndex, + # a d-level MultiIndex can equal d-tuple Index. + # Note: All EA-backed Index subclasses override equals + if ( + isinstance(other, type(self)) + and type(other) is not type(self) + and other.equals is not self.equals + ): return other.equals(self) return array_equivalent(self._values, other._values) From 33284edd21a0046c76142baee9668ec1937ad216 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 4 Aug 2020 00:40:27 +0100 Subject: [PATCH 0427/1025] CI: activate azure pipelines on 1.1.x - DO NOT MERGE (#35468) --- azure-pipelines.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e45cafc02cb61..113ad3e338952 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -1,9 +1,11 @@ # Adapted from https://github.com/numba/numba/blob/master/azure-pipelines.yml trigger: - master +- 1.1.x pr: - master +- 1.1.x variables: PYTEST_WORKERS: auto From 359d8b03d44a3eafc128cd7d01cd25d7d0889120 Mon Sep 17 00:00:00 2001 From: Thomas Smith Date: Tue, 4 Aug 2020 00:46:22 +0100 Subject: [PATCH 0428/1025] adding test for #18451 (#35494) --- pandas/tests/groupby/test_groupby.py | 14 ++++++++++++++ 1 file 
changed, 14 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ebce5b0ef0a66..8c51ebf89f5c0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2055,3 +2055,17 @@ def test_groups_repr_truncates(max_seq_items, expected): result = df.groupby(np.array(df.a)).groups.__repr__() assert result == expected + + +def test_group_on_two_row_multiindex_returns_one_tuple_key(): + # GH 18451 + df = pd.DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}]) + df = df.set_index(["a", "b"]) + + grp = df.groupby(["a", "b"]) + result = grp.indices + expected = {(1, 2): np.array([0, 1], dtype=np.int64)} + + assert len(result) == 1 + key = (1, 2) + assert (result[key] == expected[key]).all() From 98879eba38040a98be6a139b752d793e647d493b Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 4 Aug 2020 00:48:54 +0100 Subject: [PATCH 0429/1025] BUG: CategoricalIndex.format (#35440) --- doc/source/whatsnew/v1.1.1.rst | 7 +++++++ pandas/core/indexes/category.py | 12 ++++++------ pandas/core/indexes/range.py | 7 +------ pandas/tests/indexes/categorical/test_category.py | 6 ++++++ pandas/tests/indexes/common.py | 6 ++++++ pandas/tests/indexes/datetimes/test_datetimelike.py | 6 ++++++ pandas/tests/indexes/test_base.py | 7 +++++-- pandas/tests/indexes/test_numeric.py | 7 +++++++ pandas/tests/io/formats/test_format.py | 9 +++++++++ 9 files changed, 53 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index f8f655ce7b866..6a327a4fc732f 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -26,6 +26,13 @@ Fixed regressions Bug fixes ~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- Bug in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) + + 
**Datetimelike** - diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index b0b008de69a94..74b235655e345 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -20,7 +20,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import accessor from pandas.core.algorithms import take_1d @@ -348,12 +348,12 @@ def _format_attrs(self): return attrs def _format_with_header(self, header, na_rep="NaN") -> List[str]: - from pandas.io.formats.format import format_array + from pandas.io.formats.printing import pprint_thing - formatted_values = format_array( - self._values, formatter=None, na_rep=na_rep, justify="left" - ) - result = ibase.trim_front(formatted_values) + result = [ + pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep + for x in self._values + ] return header + result # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index e5e98039ff77b..eee610681087d 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, List, Optional +from typing import Any, Optional import warnings import numpy as np @@ -33,8 +33,6 @@ from pandas.core.indexes.numeric import Int64Index from pandas.core.ops.common import unpack_zerodim_and_defer -from pandas.io.formats.printing import pprint_thing - _empty_range = range(0) @@ -197,9 +195,6 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None - def _format_with_header(self, header, na_rep="NaN") -> List[str]: - return header + [pprint_thing(x) for x in self._range] - # 
-------------------------------------------------------------------- _deprecation_message = ( "RangeIndex.{} is deprecated and will be " diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index 7f30a77872bc1..8af26eef504fc 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -478,3 +478,9 @@ def test_reindex_base(self): def test_map_str(self): # See test_map.py pass + + def test_format_different_scalar_lengths(self): + # GH35439 + idx = CategoricalIndex(["aaaaaaaaa", "b"]) + expected = ["aaaaaaaaa", "b"] + assert idx.format() == expected diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index f5b9f4a401e60..3b41c4bfacf73 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -642,6 +642,12 @@ def test_equals_op(self): tm.assert_numpy_array_equal(index_a == item, expected3) tm.assert_series_equal(series_a == item, Series(expected3)) + def test_format(self): + # GH35439 + idx = self.create_index() + expected = [str(x) for x in idx] + assert idx.format() == expected + def test_hasnans_isnans(self, index): # GH 11343, added tests for hasnans / isnans if isinstance(index, MultiIndex): diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py index 7345ae3032463..a5abf2946feda 100644 --- a/pandas/tests/indexes/datetimes/test_datetimelike.py +++ b/pandas/tests/indexes/datetimes/test_datetimelike.py @@ -20,6 +20,12 @@ def index(self, request): def create_index(self) -> DatetimeIndex: return date_range("20130101", periods=5) + def test_format(self): + # GH35439 + idx = self.create_index() + expected = [f"{x:%Y-%m-%d}" for x in idx] + assert idx.format() == expected + def test_shift(self): pass # handled in test_ops diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 
eaf48421dc071..59ee88117a984 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1171,8 +1171,11 @@ def test_summary_bug(self): assert "~:{range}:0" in result assert "{other}%s" in result - def test_format(self, index): - self._check_method_works(Index.format, index) + def test_format_different_scalar_lengths(self): + # GH35439 + idx = Index(["aaaaaaaaa", "b"]) + expected = ["aaaaaaaaa", "b"] + assert idx.format() == expected def test_format_bug(self): # GH 14626 diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index a7c5734ef9b02..bfcac5d433d2c 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -21,6 +21,13 @@ def test_can_hold_identifiers(self): key = idx[0] assert idx._can_hold_identifiers_and_holds_name(key) is False + def test_format(self): + # GH35439 + idx = self.create_index() + max_width = max(len(str(x)) for x in idx) + expected = [str(x).ljust(max_width) for x in idx] + assert idx.format() == expected + def test_numeric_compat(self): pass # override Base method diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 4413c5145cd41..d3031a9e1695a 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2141,6 +2141,15 @@ def test_dict_entries(self): assert "'a': 1" in val assert "'b': 2" in val + def test_categorical_columns(self): + # GH35439 + data = [[4, 2], [3, 2], [4, 3]] + cols = ["aaaaaaaaa", "b"] + df = pd.DataFrame(data, columns=cols) + df_cat_cols = pd.DataFrame(data, columns=pd.CategoricalIndex(cols)) + + assert df.to_string() == df_cat_cols.to_string() + def test_period(self): # GH 12615 df = pd.DataFrame( From d24660680bbcfccb65fc1d9801f9e9d64cf42976 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Tue, 4 Aug 2020 08:51:51 +0100 Subject: [PATCH 0430/1025] CLN/PERF: move RangeIndex._cached_data to RangeIndex._cache (#35432) * CLN: 
move cached_data to _cache['_data'] * add GH number * flake8 cleanup --- pandas/core/indexes/range.py | 16 ++------ pandas/tests/indexes/ranges/test_range.py | 45 +++++++++++++---------- 2 files changed, 29 insertions(+), 32 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index eee610681087d..1dc4fc1e91462 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any, Optional +from typing import Any import warnings import numpy as np @@ -78,8 +78,6 @@ class RangeIndex(Int64Index): _engine_type = libindex.Int64Engine _range: range - # check whether self._data has been called - _cached_data: Optional[np.ndarray] = None # -------------------------------------------------------------------- # Constructors @@ -150,20 +148,14 @@ def _constructor(self): """ return the class to use for construction """ return Int64Index - @property + @cache_readonly def _data(self): """ An int array that for performance reasons is created only when needed. - The constructed array is saved in ``_cached_data``. This allows us to - check if the array has been created without accessing ``_data`` and - triggering the construction. + The constructed array is saved in ``_cache``. 
""" - if self._cached_data is None: - self._cached_data = np.arange( - self.start, self.stop, self.step, dtype=np.int64 - ) - return self._cached_data + return np.arange(self.start, self.stop, self.step, dtype=np.int64) @cache_readonly def _int64index(self) -> Int64Index: diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 5b6f9cb358b7d..ef4bb9a0869b0 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -137,53 +137,58 @@ def test_dtype(self): index = self.create_index() assert index.dtype == np.int64 - def test_cached_data(self): - # GH 26565, GH26617 - # Calling RangeIndex._data caches an int64 array of the same length at - # self._cached_data. This test checks whether _cached_data has been set + def test_cache(self): + # GH 26565, GH26617, GH35432 + # This test checks whether _cache has been set. + # Calling RangeIndex._cache["_data"] creates an int64 array of the same length + # as the RangeIndex and stores it in _cache. 
idx = RangeIndex(0, 100, 10) - assert idx._cached_data is None + assert idx._cache == {} repr(idx) - assert idx._cached_data is None + assert idx._cache == {} str(idx) - assert idx._cached_data is None + assert idx._cache == {} idx.get_loc(20) - assert idx._cached_data is None + assert idx._cache == {} - 90 in idx - assert idx._cached_data is None + 90 in idx # True + assert idx._cache == {} - 91 in idx - assert idx._cached_data is None + 91 in idx # False + assert idx._cache == {} idx.all() - assert idx._cached_data is None + assert idx._cache == {} idx.any() - assert idx._cached_data is None + assert idx._cache == {} df = pd.DataFrame({"a": range(10)}, index=idx) df.loc[50] - assert idx._cached_data is None + assert idx._cache == {} with pytest.raises(KeyError, match="51"): df.loc[51] - assert idx._cached_data is None + assert idx._cache == {} df.loc[10:50] - assert idx._cached_data is None + assert idx._cache == {} df.iloc[5:10] - assert idx._cached_data is None + assert idx._cache == {} - # actually calling idx._data + # idx._cache should contain a _data entry after call to idx._data + idx._data assert isinstance(idx._data, np.ndarray) - assert isinstance(idx._cached_data, np.ndarray) + assert idx._data is idx._data # check cached value is reused + assert len(idx._cache) == 4 + expected = np.arange(0, 100, 10, dtype="int64") + tm.assert_numpy_array_equal(idx._cache["_data"], expected) def test_is_monotonic(self): index = RangeIndex(0, 20, 2) From 8dede5abcca2fe7c00861528322dd8d6642aabde Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 4 Aug 2020 09:24:28 +0100 Subject: [PATCH 0431/1025] CI: activate github actions on 1.1.x (PR only) (#35467) --- .github/workflows/ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index db1fc30111a2d..149acef72db26 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,7 +4,9 @@ on: push: branches: master pull_request: - 
branches: master + branches: + - master + - 1.1.x env: ENV_FILE: environment.yml From 9f6ccb044477a37e4a63af5464dc95568e8cf32f Mon Sep 17 00:00:00 2001 From: Marc Garcia Date: Tue, 4 Aug 2020 16:20:56 +0100 Subject: [PATCH 0432/1025] WEB: Fixing whatsnew link in the home page (version was hardcoded) (#35451) --- web/pandas/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/pandas/index.html b/web/pandas/index.html index 83d0f48197033..75c797d6dd93d 100644 --- a/web/pandas/index.html +++ b/web/pandas/index.html @@ -63,7 +63,7 @@
With the support of:
{% if releases %}

Latest version: {{ releases[0].name }}

    -
  • What's new in {{ releases[0].name }}
  • +
  • What's new in {{ releases[0].name }}
  • Release date:
    {{ releases[0].published.strftime("%b %d, %Y") }}
  • Documentation (web)
  • Documentation (pdf)
  • From dc2628cd0bdd597cb5281e0dac68af8e63b686c7 Mon Sep 17 00:00:00 2001 From: Deepyaman Datta Date: Tue, 4 Aug 2020 19:00:15 -0400 Subject: [PATCH 0433/1025] Fix unnecessary pluralization (#35553) --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 79627e43d78c2..d229cd5a9d7ec 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4193,7 +4193,7 @@ def rename( Parameters ---------- mapper : dict-like or function - Dict-like or functions transformations to apply to + Dict-like or function transformations to apply to that axis' values. Use either ``mapper`` and ``axis`` to specify the axis to target with ``mapper``, or ``index`` and ``columns``. From 19338f4efad06dd0bcc937fa549827e61d337aca Mon Sep 17 00:00:00 2001 From: Sander Date: Wed, 5 Aug 2020 04:27:22 +0200 Subject: [PATCH 0434/1025] Added paragraph on creating DataFrame from list of namedtuples (#35507) --- doc/source/user_guide/dsintro.rst | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 360a14998b227..23bd44c1969a5 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -397,6 +397,32 @@ The result will be a DataFrame with the same index as the input Series, and with one column whose name is the original name of the Series (only if no other column name provided). + +.. _basics.dataframe.from_list_namedtuples: + +From a list of namedtuples +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The field names of the first ``namedtuple`` in the list determine the columns +of the ``DataFrame``. The remaining namedtuples (or tuples) are simply unpacked +and their values are fed into the rows of the ``DataFrame``. If any of those +tuples is shorter than the first ``namedtuple`` then the later columns in the +corresponding row are marked as missing values. 
If any are longer than the +first ``namedtuple``, a ``ValueError`` is raised. + +.. ipython:: python + + from collections import namedtuple + + Point = namedtuple('Point', 'x y') + + pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)]) + + Point3D = namedtuple('Point3D', 'x y z') + + pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]) + + .. _basics.dataframe.from_list_dataclasses: From a list of dataclasses From 6bb7102cdf8d61121c74921938b8019c5dd013cc Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Thu, 6 Aug 2020 13:42:56 +0100 Subject: [PATCH 0435/1025] MAINT: Fix issue in StataReader due to upstream changes (#35427) --- pandas/io/stata.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7677d8a94d521..3717a2025cf51 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1643,8 +1643,7 @@ def read( data = self._insert_strls(data) - cols_ = np.where(self.dtyplist)[0] - + cols_ = np.where([dtyp is not None for dtyp in self.dtyplist])[0] # Convert columns (if needed) to match input type ix = data.index requires_type_conversion = False From 419f547c67c37629c0b1f733c27af33f4cc9ccc7 Mon Sep 17 00:00:00 2001 From: David Li Date: Thu, 6 Aug 2020 11:11:21 -0400 Subject: [PATCH 0436/1025] BUG: handle immutable arrays in tz_convert_from_utc (#35530) (#35532) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/tslibs/tzconversion.pyx | 6 +++--- pandas/tests/tslibs/test_conversion.py | 8 ++++++++ 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b16ca0a80c5b4..304897edbb75e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -59,7 +59,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ -- +- Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Timedelta diff --git a/pandas/_libs/tslibs/tzconversion.pyx 
b/pandas/_libs/tslibs/tzconversion.pyx index 2b148cd8849f1..4c62b16d430bd 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -410,7 +410,7 @@ cpdef int64_t tz_convert_from_utc_single(int64_t val, tzinfo tz): return val + deltas[pos] -def tz_convert_from_utc(int64_t[:] vals, tzinfo tz): +def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): """ Convert the values (in i8) from UTC to tz @@ -435,7 +435,7 @@ def tz_convert_from_utc(int64_t[:] vals, tzinfo tz): @cython.boundscheck(False) @cython.wraparound(False) -cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): +cdef int64_t[:] _tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): """ Convert the given values (in i8) either to UTC or from UTC. @@ -457,7 +457,7 @@ cdef int64_t[:] _tz_convert_from_utc(int64_t[:] vals, tzinfo tz): str typ if is_utc(tz): - converted = vals + converted = vals.copy() elif is_tzlocal(tz): converted = np.empty(n, dtype=np.int64) for i in range(n): diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py index 4f184b78f34a1..87cd97f853f4d 100644 --- a/pandas/tests/tslibs/test_conversion.py +++ b/pandas/tests/tslibs/test_conversion.py @@ -78,6 +78,14 @@ def test_tz_convert_corner(arr): tm.assert_numpy_array_equal(result, arr) +def test_tz_convert_readonly(): + # GH#35530 + arr = np.array([0], dtype=np.int64) + arr.setflags(write=False) + result = tzconversion.tz_convert_from_utc(arr, UTC) + tm.assert_numpy_array_equal(result, arr) + + @pytest.mark.parametrize("copy", [True, False]) @pytest.mark.parametrize("dtype", ["M8[ns]", "M8[s]"]) def test_length_zero_copy(dtype, copy): From 79e3a40340527c9a8247f0b4041786da90a286b9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 6 Aug 2020 16:12:34 +0100 Subject: [PATCH 0437/1025] TYP: Add MyPy Error Codes (#35311) --- ci/code_checks.sh | 5 ++++ environment.yml | 1 + pandas/_config/config.py | 2 +- pandas/compat/pickle_compat.py | 6 ++--- 
pandas/core/algorithms.py | 3 ++- pandas/core/arrays/datetimelike.py | 9 +++++-- pandas/core/arrays/interval.py | 2 +- pandas/core/arrays/period.py | 4 +-- pandas/core/computation/expressions.py | 3 ++- pandas/core/computation/parsing.py | 4 ++- pandas/core/computation/pytables.py | 2 +- pandas/core/config_init.py | 2 +- pandas/core/dtypes/common.py | 6 +++-- pandas/core/dtypes/dtypes.py | 3 ++- pandas/core/dtypes/generic.py | 2 +- pandas/core/frame.py | 21 +++++++++++---- pandas/core/generic.py | 18 +++++++++---- pandas/core/indexes/base.py | 6 +++-- pandas/core/indexes/datetimelike.py | 8 ++++-- pandas/core/internals/blocks.py | 3 ++- pandas/core/ops/array_ops.py | 3 ++- pandas/core/resample.py | 2 +- pandas/core/series.py | 8 ++++-- pandas/core/tools/datetimes.py | 2 +- pandas/io/common.py | 24 ++++++++++++++++-- pandas/io/formats/html.py | 3 ++- pandas/io/formats/printing.py | 12 +++++---- pandas/io/json/_json.py | 3 ++- pandas/io/pytables.py | 9 ++++--- pandas/io/stata.py | 5 +++- pandas/plotting/_matplotlib/timeseries.py | 8 ++++-- pandas/tests/indexes/test_base.py | 31 ++++++++++++----------- pandas/tests/io/pytables/test_store.py | 6 ++--- pandas/tests/io/test_fsspec.py | 3 ++- pandas/util/_decorators.py | 14 +++++++--- requirements-dev.txt | 3 ++- setup.cfg | 1 + 37 files changed, 170 insertions(+), 77 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 69ce0f1adce22..816bb23865c04 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -230,6 +230,11 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" + # https://github.com/python/mypy/issues/7384 + # MSG='Check for missing error codes with # type: ignore' ; echo $MSG + # invgrep -R --include="*.py" -P '# type: ignore(?!\[)' pandas + # RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG invgrep -R --include=*.{py,pyx} 
'\.__class__' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/environment.yml b/environment.yml index 9efb995e29497..ed9762e5b8893 100644 --- a/environment.yml +++ b/environment.yml @@ -109,3 +109,4 @@ dependencies: - pip: - git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master - git+https://github.com/numpy/numpydoc + - pyflakes>=2.2.0 diff --git a/pandas/_config/config.py b/pandas/_config/config.py index d7b73a0a685d3..fb41b37980b2e 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -462,7 +462,7 @@ def register_option( for k in path: # NOTE: tokenize.Name is not a public constant # error: Module has no attribute "Name" [attr-defined] - if not re.match("^" + tokenize.Name + "$", k): # type: ignore + if not re.match("^" + tokenize.Name + "$", k): # type: ignore[attr-defined] raise ValueError(f"{k} is not a valid identifier") if keyword.iskeyword(k): raise ValueError(f"{k} is a python keyword") diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 015b203a60256..ef9f36705a7ee 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -64,7 +64,7 @@ class _LoadSparseSeries: # https://github.com/python/mypy/issues/1020 # error: Incompatible return type for "__new__" (returns "Series", but must return # a subtype of "_LoadSparseSeries") - def __new__(cls) -> "Series": # type: ignore + def __new__(cls) -> "Series": # type: ignore[misc] from pandas import Series warnings.warn( @@ -82,7 +82,7 @@ class _LoadSparseFrame: # https://github.com/python/mypy/issues/1020 # error: Incompatible return type for "__new__" (returns "DataFrame", but must # return a subtype of "_LoadSparseFrame") - def __new__(cls) -> "DataFrame": # type: ignore + def __new__(cls) -> "DataFrame": # type: ignore[misc] from pandas import DataFrame warnings.warn( @@ -181,7 +181,7 @@ def __new__(cls) -> "DataFrame": # type: ignore # functions for compat and uses a non-public class of the pickle module. 
# error: Name 'pkl._Unpickler' is not defined -class Unpickler(pkl._Unpickler): # type: ignore +class Unpickler(pkl._Unpickler): # type: ignore[name-defined] def find_class(self, module, name): # override superclass key = (module, name) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9e3ca4cc53363..befde7c355818 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -427,7 +427,8 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - return comps.isin(values) # type: ignore + # error: "ExtensionArray" has no attribute "isin" [attr-defined] + return comps.isin(values) # type: ignore[attr-defined] comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c6945e2f78b5a..1b5e1d81f00d6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -468,6 +468,9 @@ def _ndarray(self) -> np.ndarray: def _from_backing_data(self: _T, arr: np.ndarray) -> _T: # Note: we do not retain `freq` + # error: Unexpected keyword argument "dtype" for "NDArrayBackedExtensionArray" + # TODO: add my error code + # https://github.com/python/mypy/issues/7384 return type(self)(arr, dtype=self.dtype) # type: ignore # ------------------------------------------------------------------ @@ -809,7 +812,8 @@ def _validate_scalar( value = NaT elif isinstance(value, self._recognized_scalars): - value = self._scalar_type(value) # type: ignore + # error: Too many arguments for "object" [call-arg] + value = self._scalar_type(value) # type: ignore[call-arg] else: if msg is None: @@ -1129,7 +1133,8 @@ def resolution(self) -> str: """ Returns day, hour, minute, second, millisecond or microsecond """ - return self._resolution_obj.attrname # type: ignore + # error: Item "None" of "Optional[Any]" has no attribute "attrname" + return 
self._resolution_obj.attrname # type: ignore[union-attr] @classmethod def _validate_frequency(cls, index, freq, **kwargs): diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index ed2437cc061bd..d76e0fd628a48 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1057,7 +1057,7 @@ def mid(self): # https://github.com/python/mypy/issues/1362 # Mypy does not support decorated properties - @property # type: ignore + @property # type: ignore[misc] @Appender( _interval_shared_docs["is_non_overlapping_monotonic"] % _shared_docs_kwargs ) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index fe78481d99d30..ddaf6d39f1837 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -278,8 +278,8 @@ def _check_compatible_with(self, other, setitem: bool = False): def dtype(self) -> PeriodDtype: return self._dtype - # error: Read-only property cannot override read-write property [misc] - @property # type: ignore + # error: Read-only property cannot override read-write property + @property # type: ignore[misc] def freq(self) -> BaseOffset: """ Return the frequency object for this PeriodArray. 
diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 0e9077e6d557e..05a5538a88772 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -227,7 +227,8 @@ def evaluate(op, a, b, use_numexpr: bool = True): if op_str is not None: use_numexpr = use_numexpr and _bool_arith_check(op_str, a, b) if use_numexpr: - return _evaluate(op, op_str, a, b) # type: ignore + # error: "None" not callable + return _evaluate(op, op_str, a, b) # type: ignore[misc] return _evaluate_standard(op, op_str, a, b) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index c7c7103654a65..86e125b6b909b 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -37,7 +37,9 @@ def create_valid_python_identifier(name: str) -> str: special_characters_replacements = { char: f"_{token.tok_name[tokval]}_" # The ignore here is because of a bug in mypy that is resolved in 0.740 - for char, tokval in tokenize.EXACT_TOKEN_TYPES.items() # type: ignore + for char, tokval in ( + tokenize.EXACT_TOKEN_TYPES.items() # type: ignore[attr-defined] + ) } special_characters_replacements.update( { diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 001eb1789007f..f1b11a6869c2b 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -63,7 +63,7 @@ def _resolve_name(self): return self.name # read-only property overwriting read/write property - @property # type: ignore + @property # type: ignore[misc] def value(self): return self._value diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 2b2431149e230..0c23f1b4bcdf2 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -327,7 +327,7 @@ def is_terminal() -> bool: """ try: # error: Name 'get_ipython' is not defined - ip = get_ipython() # type: ignore + ip = get_ipython() # type: ignore[name-defined] 
except NameError: # assume standard Python interpreter in a terminal return True else: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index a2ca4d84b2bf6..73109020b1b54 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -136,11 +136,13 @@ def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: """ # TODO: GH27506 potential bug with ExtensionArrays try: - return arr.astype("int64", copy=copy, casting="safe") # type: ignore + # error: Unexpected keyword argument "casting" for "astype" + return arr.astype("int64", copy=copy, casting="safe") # type: ignore[call-arg] except TypeError: pass try: - return arr.astype("uint64", copy=copy, casting="safe") # type: ignore + # error: Unexpected keyword argument "casting" for "astype" + return arr.astype("uint64", copy=copy, casting="safe") # type: ignore[call-arg] except TypeError: if is_extension_array_dtype(arr.dtype): return arr.to_numpy(dtype="float64", na_value=np.nan) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8350e136417b1..8dc500dddeafa 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -635,7 +635,8 @@ class DatetimeTZDtype(PandasExtensionDtype): def __init__(self, unit: Union[str_type, "DatetimeTZDtype"] = "ns", tz=None): if isinstance(unit, DatetimeTZDtype): - unit, tz = unit.unit, unit.tz # type: ignore + # error: "str" has no attribute "tz" + unit, tz = unit.unit, unit.tz # type: ignore[attr-defined] if unit != "ns": if isinstance(unit, str) and tz is None: diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index 36eff214fc314..1f1017cfc1929 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -7,7 +7,7 @@ def create_pandas_abc_type(name, attr, comp): # https://github.com/python/mypy/issues/1006 # error: 'classmethod' used with a non-method - @classmethod # type: ignore + @classmethod # type: ignore[misc] def _check(cls, inst) 
-> bool: return getattr(inst, attr, "_typ") in comp diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d229cd5a9d7ec..aabdac16e9a1a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2164,10 +2164,14 @@ def to_stata( from pandas.io.stata import StataWriter as statawriter elif version == 117: # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import StataWriter117 as statawriter # type: ignore + from pandas.io.stata import ( # type: ignore[no-redef] + StataWriter117 as statawriter, + ) else: # versions 118 and 119 # mypy: Name 'statawriter' already defined (possibly by an import) - from pandas.io.stata import StataWriterUTF8 as statawriter # type: ignore + from pandas.io.stata import ( # type: ignore[no-redef] + StataWriterUTF8 as statawriter, + ) kwargs: Dict[str, Any] = {} if version is None or version >= 117: @@ -2178,7 +2182,7 @@ def to_stata( kwargs["version"] = version # mypy: Too many arguments for "StataWriter" - writer = statawriter( # type: ignore + writer = statawriter( # type: ignore[call-arg] path, self, convert_dates=convert_dates, @@ -3578,7 +3582,13 @@ def extract_unique_dtypes_from_dtypes_set( extracted_dtypes = [ unique_dtype for unique_dtype in unique_dtypes - if issubclass(unique_dtype.type, tuple(dtypes_set)) # type: ignore + # error: Argument 1 to "tuple" has incompatible type + # "FrozenSet[Union[ExtensionDtype, str, Any, Type[str], + # Type[float], Type[int], Type[complex], Type[bool]]]"; + # expected "Iterable[Union[type, Tuple[Any, ...]]]" + if issubclass( + unique_dtype.type, tuple(dtypes_set) # type: ignore[arg-type] + ) ] return extracted_dtypes @@ -5250,7 +5260,8 @@ def f(vals): # TODO: Just move the sort_values doc here. 
@Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) - def sort_values( # type: ignore[override] # NOQA # issue 27237 + # error: Signature of "sort_values" incompatible with supertype "NDFrame" + def sort_values( # type: ignore[override] self, by, axis=0, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e46fde1f59f16..42d02f37508fc 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -589,9 +589,9 @@ def swapaxes(self: FrameOrSeries, axis1, axis2, copy=True) -> FrameOrSeries: # ignore needed because of NDFrame constructor is different than # DataFrame/Series constructors. - return self._constructor(new_values, *new_axes).__finalize__( # type: ignore - self, method="swapaxes" - ) + return self._constructor( + new_values, *new_axes # type: ignore[arg-type] + ).__finalize__(self, method="swapaxes") def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: """ @@ -4011,7 +4011,11 @@ def add_prefix(self: FrameOrSeries, prefix: str) -> FrameOrSeries: f = functools.partial("{prefix}{}".format, prefix=prefix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) # type: ignore + # error: Incompatible return value type (got "Optional[FrameOrSeries]", + # expected "FrameOrSeries") + # error: Argument 1 to "rename" of "NDFrame" has incompatible type + # "**Dict[str, partial[str]]"; expected "Union[str, int, None]" + return self.rename(**mapper) # type: ignore[return-value, arg-type] def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: """ @@ -4070,7 +4074,11 @@ def add_suffix(self: FrameOrSeries, suffix: str) -> FrameOrSeries: f = functools.partial("{}{suffix}".format, suffix=suffix) mapper = {self._info_axis_name: f} - return self.rename(**mapper) # type: ignore + # error: Incompatible return value type (got "Optional[FrameOrSeries]", + # expected "FrameOrSeries") + # error: Argument 1 to "rename" of "NDFrame" has incompatible type + # "**Dict[str, partial[str]]"; expected "Union[str, 
int, None]" + return self.rename(**mapper) # type: ignore[return-value, arg-type] def sort_values( self, diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7ba94c76d0037..278999930463f 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -885,7 +885,8 @@ def _format_data(self, name=None) -> str_t: if self.inferred_type == "string": is_justify = False elif self.inferred_type == "categorical": - if is_object_dtype(self.categories): # type: ignore + # error: "Index" has no attribute "categories" + if is_object_dtype(self.categories): # type: ignore[attr-defined] is_justify = False return format_object_summary( @@ -940,7 +941,8 @@ def _format_with_header(self, header, na_rep="NaN") -> List[str_t]: if mask.any(): result = np.array(result) result[mask] = na_rep - result = result.tolist() # type: ignore + # error: "List[str]" has no attribute "tolist" + result = result.tolist() # type: ignore[attr-defined] else: result = trim_front(format_array(values, None, justify="left")) return header + result diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 15a7e25238983..0ce057d6e764a 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -54,7 +54,8 @@ def _join_i8_wrapper(joinf, with_indexers: bool = True): Create the join wrapper methods. 
""" - @staticmethod # type: ignore + # error: 'staticmethod' used with a non-method + @staticmethod # type: ignore[misc] def wrapper(left, right): if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, DatetimeLikeArrayMixin)): left = left.view("i8") @@ -95,7 +96,10 @@ class DatetimeIndexOpsMixin(ExtensionIndex): _bool_ops: List[str] = [] _field_ops: List[str] = [] - hasnans = cache_readonly(DatetimeLikeArrayMixin._hasnans.fget) # type: ignore + # error: "Callable[[Any], Any]" has no attribute "fget" + hasnans = cache_readonly( + DatetimeLikeArrayMixin._hasnans.fget # type: ignore[attr-defined] + ) _hasnans = hasnans # for index / array -agnostic code @property diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6ca6eca1ff829..3186c555b7ae1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2744,7 +2744,8 @@ def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. 
- values = values.reshape(tuple((1,) + shape)) # type: ignore + # error: "ExtensionArray" has no attribute "reshape" + values = values.reshape(tuple((1,) + shape)) # type: ignore[attr-defined] return values diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index 3379ee56b6ad0..aab10cea33632 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -349,7 +349,8 @@ def fill_bool(x, left=None): filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool res_values = na_logical_op(lvalues, rvalues, op) - res_values = filler(res_values) # type: ignore + # error: Cannot call function of unknown type + res_values = filler(res_values) # type: ignore[operator] return res_values diff --git a/pandas/core/resample.py b/pandas/core/resample.py index bfdfc65723433..e82a1d4d2cda8 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -967,7 +967,7 @@ def __init__(self, obj, *args, **kwargs): setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) # error: Too many arguments for "__init__" of "object" - super().__init__(None) # type: ignore + super().__init__(None) # type: ignore[call-arg] self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True diff --git a/pandas/core/series.py b/pandas/core/series.py index ef3be854bc3bb..9e70120f67969 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -571,7 +571,8 @@ def _values(self): """ return self._mgr.internal_values() - @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore + # error: Decorated property not supported + @Appender(base.IndexOpsMixin.array.__doc__) # type: ignore[misc] @property def array(self) -> ExtensionArray: return self._mgr._block.array_values() @@ -4921,7 +4922,10 @@ def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - new_index = 
self.index.to_timestamp(freq=freq, how=how) # type: ignore + # error: "PeriodIndex" has no attribute "to_timestamp" + new_index = self.index.to_timestamp( # type: ignore[attr-defined] + freq=freq, how=how + ) return self._constructor(new_values, index=new_index).__finalize__( self, method="to_timestamp" ) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 7aac2f793f61a..3c1fe6bacefcf 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -309,7 +309,7 @@ def _convert_listlike_datetimes( if tz == "utc": # error: Item "DatetimeIndex" of "Union[DatetimeArray, DatetimeIndex]" has # no attribute "tz_convert" - arg = arg.tz_convert(None).tz_localize(tz) # type: ignore + arg = arg.tz_convert(None).tz_localize(tz) # type: ignore[union-attr] return arg elif is_datetime64_ns_dtype(arg_dtype): diff --git a/pandas/io/common.py b/pandas/io/common.py index 6ac8051f35b6f..f39b8279fbdb0 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -121,7 +121,15 @@ def stringify_path( """ if hasattr(filepath_or_buffer, "__fspath__"): # https://github.com/python/mypy/issues/1424 - return filepath_or_buffer.__fspath__() # type: ignore + # error: Item "str" of "Union[str, Path, IO[str]]" has no attribute + # "__fspath__" [union-attr] + # error: Item "IO[str]" of "Union[str, Path, IO[str]]" has no attribute + # "__fspath__" [union-attr] + # error: Item "str" of "Union[str, Path, IO[bytes]]" has no attribute + # "__fspath__" [union-attr] + # error: Item "IO[bytes]" of "Union[str, Path, IO[bytes]]" has no + # attribute "__fspath__" [union-attr] + return filepath_or_buffer.__fspath__() # type: ignore[union-attr] elif isinstance(filepath_or_buffer, pathlib.Path): return str(filepath_or_buffer) return _expand_user(filepath_or_buffer) @@ -516,7 +524,19 @@ def get_handle( return f, handles -class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore +# error: Definition of "__exit__" in base class "ZipFile" is incompatible with 
+# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "ZipFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "__enter__" in base class "ZipFile" is incompatible with +# definition in base class "BinaryIO" [misc] +# error: Definition of "__enter__" in base class "ZipFile" is incompatible with +# definition in base class "IO" [misc] +# error: Definition of "read" in base class "ZipFile" is incompatible with +# definition in base class "BytesIO" [misc] +# error: Definition of "read" in base class "ZipFile" is incompatible with +# definition in base class "IO" [misc] +class _BytesZipFile(zipfile.ZipFile, BytesIO): # type: ignore[misc] """ Wrapper for standard library class ZipFile and allow the returned file-like handle to accept byte strings via `write` method. diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index 13f0ab1e8a52c..c89189f1e679a 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -86,8 +86,9 @@ def _get_columns_formatted_values(self) -> Iterable: return self.columns # https://github.com/python/mypy/issues/1237 + # error: Signature of "is_truncated" incompatible with supertype "TableFormatter" @property - def is_truncated(self) -> bool: # type: ignore + def is_truncated(self) -> bool: # type: ignore[override] return self.fmt.is_truncated @property diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 1cf79dc105901..23daab725ec65 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -499,7 +499,7 @@ def _justify( # error: Incompatible return value type (got "Tuple[List[Sequence[str]], # List[Sequence[str]]]", expected "Tuple[List[Tuple[str, ...]], # List[Tuple[str, ...]]]") - return head, tail # type: ignore + return head, tail # type: ignore[return-value] def format_object_attrs( @@ -524,14 +524,16 @@ def format_object_attrs( attrs: List[Tuple[str, Union[str, int]]] = [] if 
hasattr(obj, "dtype") and include_dtype: # error: "Sequence[Any]" has no attribute "dtype" - attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore + attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore[attr-defined] if getattr(obj, "name", None) is not None: # error: "Sequence[Any]" has no attribute "name" - attrs.append(("name", default_pprint(obj.name))) # type: ignore + attrs.append(("name", default_pprint(obj.name))) # type: ignore[attr-defined] # error: "Sequence[Any]" has no attribute "names" - elif getattr(obj, "names", None) is not None and any(obj.names): # type: ignore + elif getattr(obj, "names", None) is not None and any( + obj.names # type: ignore[attr-defined] + ): # error: "Sequence[Any]" has no attribute "names" - attrs.append(("names", default_pprint(obj.names))) # type: ignore + attrs.append(("names", default_pprint(obj.names))) # type: ignore[attr-defined] max_seq_items = get_option("display.max_seq_items") or len(obj) if len(obj) > max_seq_items: attrs.append(("length", len(obj))) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ff37c36962aec..0b06a26d4aa3c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -115,7 +115,8 @@ def __init__( self.obj = obj if orient is None: - orient = self._default_orient # type: ignore + # error: "Writer" has no attribute "_default_orient" + orient = self._default_orient # type: ignore[attr-defined] self.orient = orient self.date_format = date_format diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e0df4c29e543e..9f5b6041b0ffa 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2280,7 +2280,8 @@ def _get_atom(cls, values: ArrayLike) -> "Col": Get an appropriately typed and shaped pytables.Col object for values. 
""" dtype = values.dtype - itemsize = dtype.itemsize # type: ignore + # error: "ExtensionDtype" has no attribute "itemsize" + itemsize = dtype.itemsize # type: ignore[attr-defined] shape = values.shape if values.ndim == 1: @@ -3349,9 +3350,9 @@ def queryables(self) -> Dict[str, Any]: (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns) ] - return dict(d1 + d2 + d3) # type: ignore - # error: List comprehension has incompatible type - # List[Tuple[Any, None]]; expected List[Tuple[str, IndexCol]] + # error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]" + # and "List[Tuple[str, None]]") + return dict(d1 + d2 + d3) # type: ignore[operator] def index_cols(self): """ return a list of my index cols """ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 3717a2025cf51..cb23b781a7ad2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1952,7 +1952,10 @@ def _open_file_binary_write( """ if hasattr(fname, "write"): # See https://github.com/python/mypy/issues/1424 for hasattr challenges - return fname, False, None # type: ignore + # error: Incompatible return value type (got "Tuple[Union[str, Path, + # IO[Any]], bool, None]", expected "Tuple[BinaryIO, bool, Union[str, + # Mapping[str, str], None]]") + return fname, False, None # type: ignore[return-value] elif isinstance(fname, (str, Path)): # Extract compression mode as given, if dict compression_typ, compression_args = get_compression_method(compression) diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 95f9fbf3995ed..eef4276f0ed09 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -45,7 +45,10 @@ def _maybe_resample(series: "Series", ax, kwargs): if ax_freq is not None and freq != ax_freq: if is_superperiod(freq, ax_freq): # upsample input series = series.copy() - series.index = series.index.asfreq(ax_freq, how="s") # type: ignore + # error: "Index" has no 
attribute "asfreq" + series.index = series.index.asfreq( # type: ignore[attr-defined] + ax_freq, how="s" + ) freq = ax_freq elif _is_sup(freq, ax_freq): # one is weekly how = kwargs.pop("how", "last") @@ -222,7 +225,8 @@ def _get_index_freq(index: "Index") -> Optional[BaseOffset]: if freq is None: freq = getattr(index, "inferred_freq", None) if freq == "B": - weekdays = np.unique(index.dayofweek) # type: ignore + # error: "Index" has no attribute "dayofweek" + weekdays = np.unique(index.dayofweek) # type: ignore[attr-defined] if (5 in weekdays) or (6 in weekdays): freq = None diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 59ee88117a984..70eb9e502f78a 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1514,23 +1514,24 @@ def test_slice_locs_na_raises(self): @pytest.mark.parametrize( "in_slice,expected", [ + # error: Slice index must be an integer or None (pd.IndexSlice[::-1], "yxdcb"), - (pd.IndexSlice["b":"y":-1], ""), # type: ignore - (pd.IndexSlice["b"::-1], "b"), # type: ignore - (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore - (pd.IndexSlice[:"y":-1], "y"), # type: ignore - (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore - (pd.IndexSlice["y"::-4], "yb"), # type: ignore + (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc] + (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc] + (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc] + (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc] # absent labels - (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore - (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore - (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore - (pd.IndexSlice["z"::-3], "yc"), # type: ignore - (pd.IndexSlice["m"::-1], "dcb"), # type: ignore - (pd.IndexSlice[:"m":-1], "yx"), # type: ignore - (pd.IndexSlice["a":"a":-1], ""), # type: ignore - 
(pd.IndexSlice["z":"z":-1], ""), # type: ignore - (pd.IndexSlice["m":"m":-1], ""), # type: ignore + (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc] + (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc] + (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc] + (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc] + (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc] + (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc] + (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc] + (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc] ], ) def test_slice_locs_negative_step(self, in_slice, expected): diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index df014171be817..0942c79837e7c 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -1751,9 +1751,9 @@ def col(t, column): # try to index a col which isn't a data_column msg = ( - f"column string2 is not a data_column.\n" - f"In order to read column string2 you must reload the dataframe \n" - f"into HDFStore and include string2 with the data_columns argument." + "column string2 is not a data_column.\n" + "In order to read column string2 you must reload the dataframe \n" + "into HDFStore and include string2 with the data_columns argument." 
) with pytest.raises(AttributeError, match=msg): store.create_table_index("f", columns=["string2"]) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index d64e2d1933ace..a0723452ccb70 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -15,7 +15,8 @@ ) # the ignore on the following line accounts for to_csv returning Optional(str) # in general, but always str in the case we give no filename -text = df1.to_csv(index=False).encode() # type: ignore +# error: Item "None" of "Optional[str]" has no attribute "encode" +text = df1.to_csv(index=False).encode() # type: ignore[union-attr] @pytest.fixture diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py index 6135ccba1573d..f81bca7e85156 100644 --- a/pandas/util/_decorators.py +++ b/pandas/util/_decorators.py @@ -323,7 +323,8 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]: sig = inspect.Signature(params) # https://github.com/python/typing/issues/598 - func.__signature__ = sig # type: ignore + # error: "F" has no attribute "__signature__" + func.__signature__ = sig # type: ignore[attr-defined] return cast(F, wrapper) return decorate @@ -357,8 +358,12 @@ def decorator(decorated: F) -> F: for docstring in docstrings: if hasattr(docstring, "_docstring_components"): + # error: Item "str" of "Union[str, Callable[..., Any]]" has no + # attribute "_docstring_components" [union-attr] + # error: Item "function" of "Union[str, Callable[..., Any]]" + # has no attribute "_docstring_components" [union-attr] docstring_components.extend( - docstring._docstring_components # type: ignore + docstring._docstring_components # type: ignore[union-attr] ) elif isinstance(docstring, str) or docstring.__doc__: docstring_components.append(docstring) @@ -373,7 +378,10 @@ def decorator(decorated: F) -> F: ] ) - decorated._docstring_components = docstring_components # type: ignore + # error: "F" has no attribute "_docstring_components" + decorated._docstring_components = 
( # type: ignore[attr-defined] + docstring_components + ) return decorated return decorator diff --git a/requirements-dev.txt b/requirements-dev.txt index c0dd77cd73ddc..6a87b0a99a4f8 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -73,4 +73,5 @@ cftime pyreadstat tabulate>=0.8.3 git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master -git+https://github.com/numpy/numpydoc \ No newline at end of file +git+https://github.com/numpy/numpydoc +pyflakes>=2.2.0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index ee5725e36d193..84c281b756395 100644 --- a/setup.cfg +++ b/setup.cfg @@ -122,6 +122,7 @@ check_untyped_defs=True strict_equality=True warn_redundant_casts = True warn_unused_ignores = True +show_error_codes = True [mypy-pandas.tests.*] check_untyped_defs=False From c526cd44dd62ac19b092f8d4d2f40a566c008b2c Mon Sep 17 00:00:00 2001 From: Eric Wieser Date: Thu, 6 Aug 2020 16:14:43 +0100 Subject: [PATCH 0438/1025] Ensure _group_selection_context is always reset (#35572) --- pandas/core/groupby/groupby.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ac45222625569..6c8a780859939 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -463,8 +463,10 @@ def _group_selection_context(groupby): Set / reset the _group_selection_context. 
""" groupby._set_group_selection() - yield groupby - groupby._reset_group_selection() + try: + yield groupby + finally: + groupby._reset_group_selection() _KeysArgType = Union[ From 7b28ff502a82390fbc3ca0b1a1a3201f5f195901 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 6 Aug 2020 16:47:59 +0100 Subject: [PATCH 0439/1025] CLN: remove kwargs in RangeIndex.copy (#35575) --- pandas/core/indexes/range.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 1dc4fc1e91462..e9c4c301f4dca 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -385,11 +385,13 @@ def _shallow_copy(self, values=None, name: Label = no_default): return Int64Index._simple_new(values, name=name) @doc(Int64Index.copy) - def copy(self, name=None, deep=False, dtype=None, **kwargs): + def copy(self, name=None, deep=False, dtype=None, names=None): self._validate_dtype(dtype) - if name is None: - name = self.name - return self.from_range(self._range, name=name) + + new_index = self._shallow_copy() + names = self._validate_names(name=name, names=names, deep=deep) + new_index = new_index.set_names(names) + return new_index def _minmax(self, meth: str): no_steps = len(self) - 1 From 13683daa4954292346195b9435453ec5611fcd38 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Aug 2020 08:49:06 -0700 Subject: [PATCH 0440/1025] ENH: enable mul, div on Index by dispatching to Series (#34160) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/indexes/base.py | 58 +------------------ pandas/core/indexes/category.py | 2 - pandas/core/indexes/datetimes.py | 1 - pandas/core/indexes/multi.py | 35 +++++++++++ pandas/core/indexes/period.py | 1 - pandas/tests/arithmetic/test_numeric.py | 14 ----- .../indexes/categorical/test_category.py | 9 ++- pandas/tests/indexes/common.py | 33 ++++++++--- 9 files changed, 73 insertions(+), 82 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst 
b/doc/source/whatsnew/v1.2.0.rst index 304897edbb75e..82037a332e0f9 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -17,7 +17,7 @@ Enhancements Other enhancements ^^^^^^^^^^^^^^^^^^ - +- :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 278999930463f..bfdfbd35f27ad 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2377,31 +2377,10 @@ def _get_unique_index(self, dropna: bool = False): # -------------------------------------------------------------------- # Arithmetic & Logical Methods - def __add__(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - from pandas import Series - - return Index(Series(self) + other) - - def __radd__(self, other): - from pandas import Series - - return Index(other + Series(self)) - def __iadd__(self, other): # alias for __add__ return self + other - def __sub__(self, other): - return Index(np.array(self) - other) - - def __rsub__(self, other): - # wrap Series to ensure we pin name correctly - from pandas import Series - - return Index(other - Series(self)) - def __and__(self, other): return self.intersection(other) @@ -5293,38 +5272,6 @@ def _add_comparison_methods(cls): cls.__le__ = _make_comparison_op(operator.le, cls) cls.__ge__ = _make_comparison_op(operator.ge, cls) - @classmethod - def _add_numeric_methods_add_sub_disabled(cls): - """ - Add in the numeric add/sub methods to disable. - """ - cls.__add__ = make_invalid_op("__add__") - cls.__radd__ = make_invalid_op("__radd__") - cls.__iadd__ = make_invalid_op("__iadd__") - cls.__sub__ = make_invalid_op("__sub__") - cls.__rsub__ = make_invalid_op("__rsub__") - cls.__isub__ = make_invalid_op("__isub__") - - @classmethod - def _add_numeric_methods_disabled(cls): - """ - Add in numeric methods to disable other than add/sub. 
- """ - cls.__pow__ = make_invalid_op("__pow__") - cls.__rpow__ = make_invalid_op("__rpow__") - cls.__mul__ = make_invalid_op("__mul__") - cls.__rmul__ = make_invalid_op("__rmul__") - cls.__floordiv__ = make_invalid_op("__floordiv__") - cls.__rfloordiv__ = make_invalid_op("__rfloordiv__") - cls.__truediv__ = make_invalid_op("__truediv__") - cls.__rtruediv__ = make_invalid_op("__rtruediv__") - cls.__mod__ = make_invalid_op("__mod__") - cls.__divmod__ = make_invalid_op("__divmod__") - cls.__neg__ = make_invalid_op("__neg__") - cls.__pos__ = make_invalid_op("__pos__") - cls.__abs__ = make_invalid_op("__abs__") - cls.__inv__ = make_invalid_op("__inv__") - @classmethod def _add_numeric_methods_binary(cls): """ @@ -5340,11 +5287,12 @@ def _add_numeric_methods_binary(cls): cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls) cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls) - # TODO: rmod? rdivmod? cls.__mod__ = _make_arithmetic_op(operator.mod, cls) + cls.__rmod__ = _make_arithmetic_op(ops.rmod, cls) cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) cls.__divmod__ = _make_arithmetic_op(divmod, cls) + cls.__rdivmod__ = _make_arithmetic_op(ops.rdivmod, cls) cls.__mul__ = _make_arithmetic_op(operator.mul, cls) cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) @@ -5504,7 +5452,7 @@ def shape(self): return self._values.shape -Index._add_numeric_methods_disabled() +Index._add_numeric_methods() Index._add_logical_methods() Index._add_comparison_methods() diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 74b235655e345..fb283cbe02954 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -770,6 +770,4 @@ def _wrap_joined_index( return self._create_from_codes(joined, name=name) -CategoricalIndex._add_numeric_methods_add_sub_disabled() -CategoricalIndex._add_numeric_methods_disabled() 
CategoricalIndex._add_logical_methods_disabled() diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 6d2e592f024ed..f71fd0d406c54 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -842,7 +842,6 @@ def indexer_between_time( return mask.nonzero()[0] -DatetimeIndex._add_numeric_methods_disabled() DatetimeIndex._add_logical_methods_disabled() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 235da89083d0a..a6e8ec0707de7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -50,6 +50,7 @@ from pandas.core.indexes.frozen import FrozenList from pandas.core.indexes.numeric import Int64Index import pandas.core.missing as missing +from pandas.core.ops.invalid import make_invalid_op from pandas.core.sorting import ( get_group_index, indexer_from_factorized, @@ -3606,6 +3607,40 @@ def isin(self, values, level=None): return np.zeros(len(levs), dtype=np.bool_) return levs.isin(values) + @classmethod + def _add_numeric_methods_add_sub_disabled(cls): + """ + Add in the numeric add/sub methods to disable. + """ + cls.__add__ = make_invalid_op("__add__") + cls.__radd__ = make_invalid_op("__radd__") + cls.__iadd__ = make_invalid_op("__iadd__") + cls.__sub__ = make_invalid_op("__sub__") + cls.__rsub__ = make_invalid_op("__rsub__") + cls.__isub__ = make_invalid_op("__isub__") + + @classmethod + def _add_numeric_methods_disabled(cls): + """ + Add in numeric methods to disable other than add/sub. 
+ """ + cls.__pow__ = make_invalid_op("__pow__") + cls.__rpow__ = make_invalid_op("__rpow__") + cls.__mul__ = make_invalid_op("__mul__") + cls.__rmul__ = make_invalid_op("__rmul__") + cls.__floordiv__ = make_invalid_op("__floordiv__") + cls.__rfloordiv__ = make_invalid_op("__rfloordiv__") + cls.__truediv__ = make_invalid_op("__truediv__") + cls.__rtruediv__ = make_invalid_op("__rtruediv__") + cls.__mod__ = make_invalid_op("__mod__") + cls.__rmod__ = make_invalid_op("__rmod__") + cls.__divmod__ = make_invalid_op("__divmod__") + cls.__rdivmod__ = make_invalid_op("__rdivmod__") + cls.__neg__ = make_invalid_op("__neg__") + cls.__pos__ = make_invalid_op("__pos__") + cls.__abs__ = make_invalid_op("__abs__") + cls.__inv__ = make_invalid_op("__inv__") + MultiIndex._add_numeric_methods_disabled() MultiIndex._add_numeric_methods_add_sub_disabled() diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 03e11b652477f..c7199e4a28a17 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -724,7 +724,6 @@ def memory_usage(self, deep=False): return result -PeriodIndex._add_numeric_methods_disabled() PeriodIndex._add_logical_methods_disabled() diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 2155846b271fc..484f83deb0f55 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -548,20 +548,6 @@ class TestMultiplicationDivision: # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ # for non-timestamp/timedelta/period dtypes - @pytest.mark.parametrize( - "box", - [ - pytest.param( - pd.Index, - marks=pytest.mark.xfail( - reason="Index.__div__ always raises", raises=TypeError - ), - ), - pd.Series, - pd.DataFrame, - ], - ids=lambda x: x.__name__, - ) def test_divide_decimal(self, box): # resolves issue GH#9787 ser = Series([Decimal(10)]) diff --git a/pandas/tests/indexes/categorical/test_category.py 
b/pandas/tests/indexes/categorical/test_category.py index 8af26eef504fc..b325edb321ed4 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -43,7 +43,14 @@ def test_disallow_addsub_ops(self, func, op_name): # GH 10039 # set ops (+/-) raise TypeError idx = pd.Index(pd.Categorical(["a", "b"])) - msg = f"cannot perform {op_name} with this index type: CategoricalIndex" + cat_or_list = "'(Categorical|list)' and '(Categorical|list)'" + msg = "|".join( + [ + f"cannot perform {op_name} with this index type: CategoricalIndex", + "can only concatenate list", + rf"unsupported operand type\(s\) for [\+-]: {cat_or_list}", + ] + ) with pytest.raises(TypeError, match=msg): func(idx) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 3b41c4bfacf73..238ee8d304d05 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -146,22 +146,41 @@ def test_numeric_compat(self): # Check that this doesn't cover MultiIndex case, if/when it does, # we can remove multi.test_compat.test_numeric_compat assert not isinstance(idx, MultiIndex) + if type(idx) is Index: + return - with pytest.raises(TypeError, match="cannot perform __mul__"): + typ = type(idx._data).__name__ + lmsg = "|".join( + [ + rf"unsupported operand type\(s\) for \*: '{typ}' and 'int'", + "cannot perform (__mul__|__truediv__|__floordiv__) with " + f"this index type: {typ}", + ] + ) + with pytest.raises(TypeError, match=lmsg): idx * 1 - with pytest.raises(TypeError, match="cannot perform __rmul__"): + rmsg = "|".join( + [ + rf"unsupported operand type\(s\) for \*: 'int' and '{typ}'", + "cannot perform (__rmul__|__rtruediv__|__rfloordiv__) with " + f"this index type: {typ}", + ] + ) + with pytest.raises(TypeError, match=rmsg): 1 * idx - div_err = "cannot perform __truediv__" + div_err = lmsg.replace("*", "/") with pytest.raises(TypeError, match=div_err): idx / 1 - - div_err = div_err.replace(" __", " __r") 
+ div_err = rmsg.replace("*", "/") with pytest.raises(TypeError, match=div_err): 1 / idx - with pytest.raises(TypeError, match="cannot perform __floordiv__"): + + floordiv_err = lmsg.replace("*", "//") + with pytest.raises(TypeError, match=floordiv_err): idx // 1 - with pytest.raises(TypeError, match="cannot perform __rfloordiv__"): + floordiv_err = rmsg.replace("*", "//") + with pytest.raises(TypeError, match=floordiv_err): 1 // idx def test_logical_compat(self): From 50ab5d6a48999e2c9c813c1c1952e1343c93c0f3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Aug 2020 09:20:28 -0700 Subject: [PATCH 0441/1025] BUG: TDA.__floordiv__ with NaT (#35583) * BUG: TDA.__floordiv__ with NaT * whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arrays/timedeltas.py | 2 +- pandas/tests/arithmetic/test_timedelta64.py | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 82037a332e0f9..3ec53227003fd 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -64,7 +64,7 @@ Datetimelike Timedelta ^^^^^^^^^ - +- Bug in :class:`TimedeltaIndex`, :class:`Series`, and :class:`DataFrame` floor-division with ``timedelta64`` dtypes and ``NaT`` in the denominator (:issue:`35529`) - - diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index a378423df788b..99a4725c2d806 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -628,7 +628,7 @@ def __floordiv__(self, other): result = self.asi8 // other.asi8 mask = self._isnan | other._isnan if mask.any(): - result = result.astype(np.int64) + result = result.astype(np.float64) result[mask] = np.nan return result diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index f94408d657ae5..64d3d5b6d684d 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ 
b/pandas/tests/arithmetic/test_timedelta64.py @@ -1733,6 +1733,23 @@ def test_tdarr_div_length_mismatch(self, box_with_array): # ------------------------------------------------------------------ # __floordiv__, __rfloordiv__ + def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array): + # GH#35529 + box = box_with_array + + left = pd.Series([1000, 222330, 30], dtype="timedelta64[ns]") + right = pd.Series([1000, 222330, None], dtype="timedelta64[ns]") + + left = tm.box_expected(left, box) + right = tm.box_expected(right, box) + + expected = np.array([1.0, 1.0, np.nan], dtype=np.float64) + expected = tm.box_expected(expected, box) + + result = left // right + + tm.assert_equal(result, expected) + def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td): # GH#18831 td1 = Series([timedelta(minutes=5, seconds=3)] * 3) From 76cc9d0bec25aded998d192d554ad007a3eef188 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 6 Aug 2020 10:45:19 -0700 Subject: [PATCH 0442/1025] BUG: RollingGroupby respects __getitem__ (#35513) --- doc/source/whatsnew/v1.1.1.rst | 3 +-- pandas/core/window/rolling.py | 4 ++++ pandas/tests/window/test_grouper.py | 25 +++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 6a327a4fc732f..5e36bfe6b6307 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -16,8 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). -- -- +- Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 445f179248226..87bcaa7d9512f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2220,6 +2220,10 @@ def _apply( def _constructor(self): return Rolling + @cache_readonly + def _selected_obj(self): + return self._groupby._selected_obj + def _create_blocks(self, obj: FrameOrSeries): """ Split data into blocks & return conformed data. diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 744ca264e91d9..ca5a9eccea4f5 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -214,3 +214,28 @@ def foo(x): name="value", ) tm.assert_series_equal(result, expected) + + def test_groupby_subselect_rolling(self): + # GH 35486 + df = DataFrame( + {"a": [1, 2, 3, 2], "b": [4.0, 2.0, 3.0, 1.0], "c": [10, 20, 30, 20]} + ) + result = df.groupby("a")[["b"]].rolling(2).max() + expected = DataFrame( + [np.nan, np.nan, 2.0, np.nan], + columns=["b"], + index=pd.MultiIndex.from_tuples( + ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None] + ), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby("a")["b"].rolling(2).max() + expected = Series( + [np.nan, np.nan, 2.0, np.nan], + index=pd.MultiIndex.from_tuples( + ((1, 0), (2, 1), (2, 3), (3, 2)), names=["a", None] + ), + name="b", + ) + tm.assert_series_equal(result, expected) From 47b24a3cfab9a7e4dd5adcb53d0ae328d6713f7b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 6 Aug 2020 14:51:05 -0700 Subject: [PATCH 0443/1025] CLN: get_flattened_iterator (#35515) --- pandas/core/groupby/ops.py | 4 +-- pandas/core/sorting.py | 53 +++++++++++++++----------------------- 2 files changed, 23 insertions(+), 34 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3aaeef3b63760..c6b0732b04c09 100644 --- a/pandas/core/groupby/ops.py +++ 
b/pandas/core/groupby/ops.py @@ -50,7 +50,7 @@ from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, - get_flattened_iterator, + get_flattened_list, get_group_index, get_group_index_sorter, get_indexer_dict, @@ -153,7 +153,7 @@ def _get_group_keys(self): comp_ids, _, ngroups = self.group_info # provide "flattened" iterator for multi-group setting - return get_flattened_iterator(comp_ids, ngroups, self.levels, self.codes) + return get_flattened_list(comp_ids, ngroups, self.levels, self.codes) def apply(self, f: F, data: FrameOrSeries, axis: int = 0): mutated = self.mutated diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ee73aa42701b0..8bdd466ae6f33 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,5 +1,6 @@ """ miscellaneous sorting / groupby utilities """ -from typing import Callable, Optional +from collections import defaultdict +from typing import TYPE_CHECKING, Callable, DefaultDict, Iterable, List, Optional, Tuple import numpy as np @@ -18,6 +19,9 @@ import pandas.core.algorithms as algorithms from pandas.core.construction import extract_array +if TYPE_CHECKING: + from pandas.core.indexes.base import Index # noqa:F401 + _INT64_MAX = np.iinfo(np.int64).max @@ -409,7 +413,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): levels : Optional[List], if values is a MultiIndex, list of levels to apply the key to. """ - from pandas.core.indexes.api import Index + from pandas.core.indexes.api import Index # noqa:F811 if not key: return values @@ -440,36 +444,21 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): return result -class _KeyMapper: - """ - Map compressed group id -> key tuple. 
- """ - - def __init__(self, comp_ids, ngroups: int, levels, labels): - self.levels = levels - self.labels = labels - self.comp_ids = comp_ids.astype(np.int64) - - self.k = len(labels) - self.tables = [hashtable.Int64HashTable(ngroups) for _ in range(self.k)] - - self._populate_tables() - - def _populate_tables(self): - for labs, table in zip(self.labels, self.tables): - table.map(self.comp_ids, labs.astype(np.int64)) - - def get_key(self, comp_id): - return tuple( - level[table.get_item(comp_id)] - for table, level in zip(self.tables, self.levels) - ) - - -def get_flattened_iterator(comp_ids, ngroups, levels, labels): - # provide "flattened" iterator for multi-group setting - mapper = _KeyMapper(comp_ids, ngroups, levels, labels) - return [mapper.get_key(i) for i in range(ngroups)] +def get_flattened_list( + comp_ids: np.ndarray, + ngroups: int, + levels: Iterable["Index"], + labels: Iterable[np.ndarray], +) -> List[Tuple]: + """Map compressed group id -> key tuple.""" + comp_ids = comp_ids.astype(np.int64, copy=False) + arrays: DefaultDict[int, List[int]] = defaultdict(list) + for labs, level in zip(labels, levels): + table = hashtable.Int64HashTable(ngroups) + table.map(comp_ids, labs.astype(np.int64, copy=False)) + for i in range(ngroups): + arrays[i].append(level[table.get_item(i)]) + return [tuple(array) for array in arrays.values()] def get_indexer_dict(label_list, keys): From 43dc63ea880763d8ba8514e30ec71801e9284c3f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 6 Aug 2020 17:00:26 -0500 Subject: [PATCH 0444/1025] DOC: Document that read_hdf can use pickle (#35554) --- doc/source/user_guide/io.rst | 9 ++++--- pandas/io/pytables.py | 51 +++++++++++++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d4be9d802d697..cc42f952b1733 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3441,10 +3441,11 @@ for some advanced 
strategies .. warning:: - pandas requires ``PyTables`` >= 3.0.0. - There is a indexing bug in ``PyTables`` < 3.2 which may appear when querying stores using an index. - If you see a subset of results being returned, upgrade to ``PyTables`` >= 3.2. - Stores created previously will need to be rewritten using the updated version. + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle. Loading pickled data received from + untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. .. ipython:: python :suppress: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9f5b6041b0ffa..aeb7b3e044794 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -289,7 +289,15 @@ def read_hdf( Read from the store, close it if we opened it. Retrieve pandas object stored in file, optionally based on where - criteria + criteria. + + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. Parameters ---------- @@ -445,6 +453,14 @@ class HDFStore: Either Fixed or Table format. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- path : str @@ -789,6 +805,14 @@ def select( """ Retrieve pandas object stored in file, optionally based on where criteria. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. 
+ Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- key : str @@ -852,6 +876,15 @@ def select_as_coordinates( """ return the selection as an Index + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + + Parameters ---------- key : str @@ -876,6 +909,14 @@ def select_column( return a single column from the table. This is generally only useful to select an indexable + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. + Parameters ---------- key : str @@ -912,6 +953,14 @@ def select_as_multiple( """ Retrieve pandas objects from multiple tables. + .. warning:: + + Pandas uses PyTables for reading and writing HDF5 files, which allows + serializing object-dtype data with pickle when using the "fixed" format. + Loading pickled data received from untrusted sources can be unsafe. + + See: https://docs.python.org/3/library/pickle.html for more. 
+ Parameters ---------- keys : a list of the tables From 8d4070180fb7b0f8125d15d798679d636cc95a7d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Aug 2020 15:06:46 -0700 Subject: [PATCH 0445/1025] BUG: NaT.__cmp__(invalid) should raise TypeError (#35585) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/tslibs/nattype.pyx | 31 +++++++---------- pandas/tests/scalar/test_nat.py | 62 +++++++++++++++++++++++++++++++-- 3 files changed, 73 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3ec53227003fd..7168fc0078083 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -60,6 +60,7 @@ Categorical Datetimelike ^^^^^^^^^^^^ - Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) +- Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) - Timedelta diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 73df51832d700..79f50c7261905 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -107,30 +107,25 @@ cdef class _NaT(datetime): __array_priority__ = 100 def __richcmp__(_NaT self, object other, int op): - cdef: - int ndim = getattr(other, "ndim", -1) + if util.is_datetime64_object(other) or PyDateTime_Check(other): + # We treat NaT as datetime-like for this comparison + return _nat_scalar_rules[op] - if ndim == -1: + elif util.is_timedelta64_object(other) or PyDelta_Check(other): + # We treat NaT as timedelta-like for this comparison return _nat_scalar_rules[op] elif util.is_array(other): - result = np.empty(other.shape, dtype=np.bool_) - result.fill(_nat_scalar_rules[op]) + if other.dtype.kind in "mM": + result = np.empty(other.shape, dtype=np.bool_) + result.fill(_nat_scalar_rules[op]) + elif other.dtype.kind == "O": + result = np.array([PyObject_RichCompare(self, x, op) for x in other]) 
+ else: + return NotImplemented return result - elif ndim == 0: - if util.is_datetime64_object(other): - return _nat_scalar_rules[op] - else: - raise TypeError( - f"Cannot compare type {type(self).__name__} " - f"with type {type(other).__name__}" - ) - - # Note: instead of passing "other, self, _reverse_ops[op]", we observe - # that `_nat_scalar_rules` is invariant under `_reverse_ops`, - # rendering it unnecessary. - return PyObject_RichCompare(other, self, op) + return NotImplemented def __add__(self, other): if self is not c_NaT: diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index e1e2ea1a5cec8..03830019affa1 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -513,11 +513,67 @@ def test_to_numpy_alias(): assert isna(expected) and isna(result) -@pytest.mark.parametrize("other", [Timedelta(0), Timestamp(0)]) +@pytest.mark.parametrize( + "other", + [ + Timedelta(0), + Timedelta(0).to_pytimedelta(), + pytest.param( + Timedelta(0).to_timedelta64(), + marks=pytest.mark.xfail( + reason="td64 doesnt return NotImplemented, see numpy#17017" + ), + ), + Timestamp(0), + Timestamp(0).to_pydatetime(), + pytest.param( + Timestamp(0).to_datetime64(), + marks=pytest.mark.xfail( + reason="dt64 doesnt return NotImplemented, see numpy#17017" + ), + ), + Timestamp(0).tz_localize("UTC"), + NaT, + ], +) def test_nat_comparisons(compare_operators_no_eq_ne, other): # GH 26039 - assert getattr(NaT, compare_operators_no_eq_ne)(other) is False - assert getattr(other, compare_operators_no_eq_ne)(NaT) is False + opname = compare_operators_no_eq_ne + + assert getattr(NaT, opname)(other) is False + + op = getattr(operator, opname.strip("_")) + assert op(NaT, other) is False + assert op(other, NaT) is False + + +@pytest.mark.parametrize("other", [np.timedelta64(0, "ns"), np.datetime64("now", "ns")]) +def test_nat_comparisons_numpy(other): + # Once numpy#17017 is fixed and the xfailed cases in test_nat_comparisons + # pass, this 
test can be removed + assert not NaT == other + assert NaT != other + assert not NaT < other + assert not NaT > other + assert not NaT <= other + assert not NaT >= other + + +@pytest.mark.parametrize("other", ["foo", 2, 2.0]) +@pytest.mark.parametrize("op", [operator.le, operator.lt, operator.ge, operator.gt]) +def test_nat_comparisons_invalid(other, op): + # GH#35585 + assert not NaT == other + assert not other == NaT + + assert NaT != other + assert other != NaT + + with pytest.raises(TypeError): + op(NaT, other) + + with pytest.raises(TypeError): + op(other, NaT) @pytest.mark.parametrize( From b9a196546efcc08370a78a210699316fef2fd64d Mon Sep 17 00:00:00 2001 From: Thomas Smith Date: Fri, 7 Aug 2020 00:29:21 +0100 Subject: [PATCH 0446/1025] BUG: GroupBy.apply() throws erroneous ValueError with duplicate axes (#35441) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/groupby/ops.py | 8 ++++---- pandas/tests/groupby/test_apply.py | 27 ++++++++++++++++++++------- pandas/tests/groupby/test_function.py | 4 ---- 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7168fc0078083..6f173cb2fce12 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -133,6 +133,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) - - diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c6b0732b04c09..64eb413fe78fa 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -211,7 +211,7 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): # group might be modified group_axes = group.axes res = f(group) - if not _is_indexed_like(res, group_axes): + if not _is_indexed_like(res, group_axes, axis): mutated = True result_values.append(res) @@ -897,13 +897,13 @@ def 
agg_series( return grouper.get_result() -def _is_indexed_like(obj, axes) -> bool: +def _is_indexed_like(obj, axes, axis: int) -> bool: if isinstance(obj, Series): if len(axes) > 1: return False - return obj.index.equals(axes[0]) + return obj.axes[axis].equals(axes[axis]) elif isinstance(obj, DataFrame): - return obj.index.equals(axes[0]) + return obj.axes[axis].equals(axes[axis]) return False diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 525a6fe2637c3..665cd12225ad7 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -63,15 +63,8 @@ def test_apply_trivial(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail( - reason="GH#20066; function passed into apply " - "returns a DataFrame with the same index " - "as the one to create GroupBy object." -) def test_apply_trivial_fail(): # GH 20066 - # trivial apply fails if the constant dataframe has the same index - # with the one used to create GroupBy object. df = pd.DataFrame( {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=["key", "data"], @@ -1044,3 +1037,23 @@ def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): tm.assert_frame_equal(result, expected) for val in result.index.levels[1]: assert type(val) is date + + +def test_apply_by_cols_equals_apply_by_rows_transposed(): + # GH 16646 + # Operating on the columns, or transposing and operating on the rows + # should give the same result. 
There was previously a bug where the + # by_rows operation would work fine, but by_cols would throw a ValueError + + df = pd.DataFrame( + np.random.random([6, 4]), + columns=pd.MultiIndex.from_product([["A", "B"], [1, 2]]), + ) + + by_rows = df.T.groupby(axis=0, level=0).apply( + lambda x: x.droplevel(axis=0, level=0) + ) + by_cols = df.groupby(axis=1, level=0).apply(lambda x: x.droplevel(axis=1, level=0)) + + tm.assert_frame_equal(by_cols, by_rows.T) + tm.assert_frame_equal(by_cols, df) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index cbfba16223f74..42945be923fa0 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -940,10 +940,6 @@ def test_frame_describe_multikey(tsframe): groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1) result = groupedT.describe() expected = tsframe.describe().T - expected.index = pd.MultiIndex( - levels=[[0, 1], expected.index], - codes=[[0, 0, 1, 1], range(len(expected.index))], - ) tm.assert_frame_equal(result, expected) From d685cf6fc660ee3ba4baa50d51fc0ae66b91c679 Mon Sep 17 00:00:00 2001 From: Jeff Hernandez <12969559+jeff-hernandez@users.noreply.github.com> Date: Thu, 6 Aug 2020 18:37:25 -0500 Subject: [PATCH 0447/1025] DOC: Add compose ecosystem docs (#35405) --- doc/source/ecosystem.rst | 7 +++++++ web/pandas/community/ecosystem.md | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index b02d4abd3ddf8..de231e43918f8 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -80,6 +80,11 @@ ML pipeline. Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. 
+`Compose <https://github.com/FeatureLabs/compose>`__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning. + .. _ecosystem.visualization: Visualization @@ -445,6 +450,7 @@ Library Accessor Classes Description `pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library. `pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series. `pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames. +`composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing. =============== ========== ========================= =============================================================== .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest @@ -453,3 +459,4 @@ Library Accessor Classes Description .. _pandas_path: https://github.com/drivendataorg/pandas-path/ .. _pathlib.Path: https://docs.python.org/3/library/pathlib.html .. _pint-pandas: https://github.com/hgrecco/pint-pandas +.. _composeml: https://github.com/FeatureLabs/compose diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md index be109ea53eb7d..515d23afb93ec 100644 --- a/web/pandas/community/ecosystem.md +++ b/web/pandas/community/ecosystem.md @@ -42,6 +42,13 @@ datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. +### [Compose](https://github.com/FeatureLabs/compose) + +Compose is a machine learning tool for labeling data and prediction engineering.
+It allows you to structure the labeling process by parameterizing +prediction problems and transforming time-driven relational data into +target values with cutoff times that can be used for supervised learning. + ## Visualization ### [Altair](https://altair-viz.github.io/) @@ -372,3 +379,4 @@ authors to coordinate on the namespace. | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` | | [pandas_path](https://github.com/drivendataorg/pandas-path/) | `path` | `Index`, `Series` | | [pint-pandas](https://github.com/hgrecco/pint-pandas) | `pint` | `Series`, `DataFrame` | + | [composeml](https://github.com/FeatureLabs/compose) | `slice` | `DataFrame` | From f2d5ee7801d619046e7dffdad2caf88990960b40 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 Aug 2020 16:50:50 -0700 Subject: [PATCH 0448/1025] REF: implement _apply_blockwise (#35359) --- pandas/core/window/ewm.py | 27 +++------------- pandas/core/window/rolling.py | 60 ++++++++++++++++++++++------------- 2 files changed, 42 insertions(+), 45 deletions(-) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 7a2d8e84bec76..c57c434dd3040 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -12,9 +12,7 @@ from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.common import is_datetime64_ns_dtype -from pandas.core.dtypes.generic import ABCDataFrame -from pandas.core.base import DataError import pandas.core.common as common from pandas.core.window.common import _doc_template, _shared_docs, zsqrt from pandas.core.window.rolling import _flex_binary_moment, _Rolling @@ -302,30 +300,13 @@ def _apply(self, func): ------- y : same type as input argument """ - blocks, obj = self._create_blocks(self._selected_obj) - block_list = list(blocks) - - results = [] - exclude = [] - for i, b in enumerate(blocks): - try: - values = self._prep_values(b.values) - - except (TypeError, NotImplementedError) as err: - if 
isinstance(obj, ABCDataFrame): - exclude.extend(b.columns) - del block_list[i] - continue - else: - raise DataError("No numeric types to aggregate") from err + def homogeneous_func(values: np.ndarray): if values.size == 0: - results.append(values.copy()) - continue + return values.copy() + return np.apply_along_axis(func, self.axis, values) - results.append(np.apply_along_axis(func, self.axis, values)) - - return self._wrap_results(results, block_list, obj, exclude) + return self._apply_blockwise(homogeneous_func) @Substitution(name="ewm", func_name="mean") @Appender(_doc_template) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 87bcaa7d9512f..a04d68a6d6745 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -12,7 +12,7 @@ from pandas._libs.tslibs import BaseOffset, to_offset import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import Axis, FrameOrSeries, Scalar +from pandas._typing import ArrayLike, Axis, FrameOrSeries, Scalar from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -487,6 +487,38 @@ def _get_window_indexer(self, window: int) -> BaseIndexer: return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) return FixedWindowIndexer(window_size=window) + def _apply_blockwise( + self, homogeneous_func: Callable[..., ArrayLike] + ) -> FrameOrSeries: + """ + Apply the given function to the DataFrame broken down into homogeneous + sub-frames. + """ + # This isn't quite blockwise, since `blocks` is actually a collection + # of homogenenous DataFrames. 
+ blocks, obj = self._create_blocks(self._selected_obj) + + skipped: List[int] = [] + results: List[ArrayLike] = [] + exclude: List[Scalar] = [] + for i, b in enumerate(blocks): + try: + values = self._prep_values(b.values) + + except (TypeError, NotImplementedError) as err: + if isinstance(obj, ABCDataFrame): + skipped.append(i) + exclude.extend(b.columns) + continue + else: + raise DataError("No numeric types to aggregate") from err + + result = homogeneous_func(values) + results.append(result) + + block_list = [blk for i, blk in enumerate(blocks) if i not in skipped] + return self._wrap_results(results, block_list, obj, exclude) + def _apply( self, func: Callable, @@ -524,30 +556,14 @@ def _apply( """ win_type = self._get_win_type(kwargs) window = self._get_window(win_type=win_type) - - blocks, obj = self._create_blocks(self._selected_obj) - block_list = list(blocks) window_indexer = self._get_window_indexer(window) - results = [] - exclude: List[Scalar] = [] - for i, b in enumerate(blocks): - try: - values = self._prep_values(b.values) - - except (TypeError, NotImplementedError) as err: - if isinstance(obj, ABCDataFrame): - exclude.extend(b.columns) - del block_list[i] - continue - else: - raise DataError("No numeric types to aggregate") from err + def homogeneous_func(values: np.ndarray): + # calculation function if values.size == 0: - results.append(values.copy()) - continue + return values.copy() - # calculation function offset = calculate_center_offset(window) if center else 0 additional_nans = np.array([np.nan] * offset) @@ -594,9 +610,9 @@ def calc(x): if center: result = self._center_window(result, window) - results.append(result) + return result - return self._wrap_results(results, block_list, obj, exclude) + return self._apply_blockwise(homogeneous_func) def aggregate(self, func, *args, **kwargs): result, how = self._aggregate(func, *args, **kwargs) From 57e6be52476c2804fda1d79136953dfeb8ebe0b7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 6 
Aug 2020 19:20:03 -0700 Subject: [PATCH 0449/1025] BUG: validate index/data length match in DataFrame construction (#35590) * BUG: validate index/data length match in DataFrame construction * whatsnew * Other -> DataFrame --- doc/source/whatsnew/v1.1.1.rst | 4 ++++ pandas/core/internals/blocks.py | 3 --- pandas/core/internals/managers.py | 2 +- pandas/tests/frame/test_constructors.py | 6 ++++++ 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 5e36bfe6b6307..7db609fba5d68 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -50,6 +50,10 @@ Categorical - +**DataFrame** +- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) +- + .. --------------------------------------------------------------------------- .. _whatsnew_111.contributors: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3186c555b7ae1..f3286b3c20965 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -105,7 +105,6 @@ class Block(PandasObject): is_extension = False _can_hold_na = False _can_consolidate = True - _verify_integrity = True _validate_ndim = True @classmethod @@ -1525,7 +1524,6 @@ class ExtensionBlock(Block): """ _can_consolidate = False - _verify_integrity = False _validate_ndim = False is_extension = True @@ -2613,7 +2611,6 @@ def _replace_coerce( class CategoricalBlock(ExtensionBlock): __slots__ = () is_categorical = True - _verify_integrity = True _can_hold_na = True should_store = Block.should_store diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 895385b170c91..0ce2408eb003e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -312,7 +312,7 @@ def _verify_integrity(self) -> None: mgr_shape = self.shape tot_items = sum(len(x.mgr_locs) for x 
in self.blocks) for block in self.blocks: - if block._verify_integrity and block.shape[1:] != mgr_shape[1:]: + if block.shape[1:] != mgr_shape[1:]: raise construction_error(tot_items, block.shape[1:], self.axes) if len(self.items) != tot_items: raise AssertionError( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index a4ed548264d39..b78bb1c492ef4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2619,6 +2619,12 @@ class DatetimeSubclass(datetime): data = pd.DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]}) assert data.datetime.dtype == "datetime64[ns]" + def test_with_mismatched_index_length_raises(self): + # GH#33437 + dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific") + with pytest.raises(ValueError, match="Shape of passed values"): + DataFrame(dti, index=range(4)) + class TestDataFrameConstructorWithDatetimeTZ: def test_from_dict(self): From 121d3c0ad00ba5e18d65b1b365da0ec2f2a09dce Mon Sep 17 00:00:00 2001 From: cleconte987 Date: Fri, 7 Aug 2020 11:35:39 +0200 Subject: [PATCH 0450/1025] DOC: Fix heading capitalization in doc/source/whatsnew (#35368) --- doc/source/whatsnew/v0.22.0.rst | 6 +++--- doc/source/whatsnew/v0.23.0.rst | 22 ++++++++++---------- doc/source/whatsnew/v0.24.0.rst | 12 +++++------ doc/source/whatsnew/v0.24.2.rst | 1 - scripts/validate_rst_title_capitalization.py | 3 +++ 5 files changed, 23 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index 75949a90d09a6..66d3ab3305565 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_0220: -v0.22.0 (December 29, 2017) ---------------------------- +Version 0.22.0 (December 29, 2017) +---------------------------------- {{ header }} @@ -96,7 +96,7 @@ returning ``1`` instead. These changes affect :meth:`DataFrame.sum` and :meth:`DataFrame.prod` as well. 
Finally, a few less obvious places in pandas are affected by this change. -Grouping by a categorical +Grouping by a Categorical ^^^^^^^^^^^^^^^^^^^^^^^^^ Grouping by a ``Categorical`` and summing now returns ``0`` instead of diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index b9e1b5060d1da..f91d89679dad1 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -86,8 +86,8 @@ Please note that the string `index` is not supported with the round trip format, .. _whatsnew_0230.enhancements.assign_dependent: -``.assign()`` accepts dependent arguments -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Method ``.assign()`` accepts dependent arguments +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python version later than 3.6 (see also `PEP 468 `_). Later keyword arguments may now refer to earlier ones if the argument is a callable. See the @@ -244,7 +244,7 @@ documentation. If you build an extension array, publicize it on our .. _whatsnew_0230.enhancements.categorical_grouping: -New ``observed`` keyword for excluding unobserved categories in ``groupby`` +New ``observed`` keyword for excluding unobserved categories in ``GroupBy`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Grouping by a categorical includes the unobserved categories in the output. @@ -360,8 +360,8 @@ Fill all consecutive outside values in both directions .. _whatsnew_0210.enhancements.get_dummies_dtype: -``get_dummies`` now supports ``dtype`` argument -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Function ``get_dummies`` now supports ``dtype`` argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The :func:`get_dummies` now accepts a ``dtype`` argument, which specifies a dtype for the new columns. The default remains uint8. (:issue:`18330`) @@ -388,8 +388,8 @@ See the :ref:`documentation here `. (:issue:`19365`) .. 
_whatsnew_0230.enhancements.ran_inf: -``.rank()`` handles ``inf`` values when ``NaN`` are present -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Method ``.rank()`` handles ``inf`` values when ``NaN`` are present +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ In previous versions, ``.rank()`` would assign ``inf`` elements ``NaN`` as their ranks. Now ranks are calculated properly. (:issue:`6945`) @@ -587,7 +587,7 @@ If installed, we now require: .. _whatsnew_0230.api_breaking.dict_insertion_order: -Instantiation from dicts preserves dict insertion order for python 3.6+ +Instantiation from dicts preserves dict insertion order for Python 3.6+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Until Python 3.6, dicts in Python had no formally defined ordering. For Python @@ -1365,8 +1365,8 @@ MultiIndex - Bug in indexing where nested indexers having only numpy arrays are handled incorrectly (:issue:`19686`) -I/O -^^^ +IO +^^ - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) - :meth:`DataFrame.to_html` now has an option to add an id to the leading `` tag (:issue:`8496`) @@ -1403,7 +1403,7 @@ Plotting - :func:`DataFrame.plot` now supports multiple columns to the ``y`` argument (:issue:`19699`) -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug when grouping by a single column and aggregating with a class like ``list`` or ``tuple`` (:issue:`18079`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 45399792baecf..5bfaa7a5a3e6b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -277,8 +277,8 @@ For earlier versions this can be done using the following. .. 
_whatsnew_0240.enhancements.read_html: -``read_html`` Enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^ +Function ``read_html`` enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. Now it understands them, treating them as sequences of cells with the same @@ -1371,7 +1371,7 @@ the object's ``freq`` attribute (:issue:`21939`, :issue:`23878`). .. _whatsnew_0240.deprecations.integer_tz: -Passing integer data and a timezone to datetimeindex +Passing integer data and a timezone to DatetimeIndex ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The behavior of :class:`DatetimeIndex` when passed integer data and @@ -1769,8 +1769,8 @@ MultiIndex - :class:`MultiIndex` has gained the :meth:`MultiIndex.from_frame`, it allows constructing a :class:`MultiIndex` object from a :class:`DataFrame` (:issue:`22420`) - Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) -I/O -^^^ +IO +^^ - Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`) - Bug in :func:`read_csv` in which unicode column names were not being properly recognized with Python 2.x (:issue:`13253`) @@ -1827,7 +1827,7 @@ Plotting - Bug in :func:`DataFrame.plot.bar` caused bars to use multiple colors instead of a single one (:issue:`20585`) - Bug in validating color parameter caused extra color to be appended to the given color array. This happened to multiple plotting functions using matplotlib. 
(:issue:`20726`) -Groupby/resample/rolling +GroupBy/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`pandas.core.window.Rolling.min` and :func:`pandas.core.window.Rolling.max` with ``closed='left'``, a datetime-like index and only one entry in the series leading to segfault (:issue:`24718`) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index d1a893f99cff4..27e84bf0a7cd7 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -51,7 +51,6 @@ Bug fixes - Bug where calling :meth:`Series.replace` on categorical data could return a ``Series`` with incorrect dimensions (:issue:`24971`) - -- **Reshaping** diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 5de2a07381ae5..62ec6b9ef07af 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -138,6 +138,9 @@ "CategoricalDtype", "UTC", "Panel", + "False", + "Styler", + "os", } CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS} From b7af2db9cc1a59e185f1b4e75a419c0b755c8176 Mon Sep 17 00:00:00 2001 From: gabicca <33315687+gabicca@users.noreply.github.com> Date: Fri, 7 Aug 2020 12:39:31 +0100 Subject: [PATCH 0451/1025] Bug fix one element series truncate (#35547) --- doc/source/whatsnew/v1.1.1.rst | 2 +- pandas/core/generic.py | 2 +- pandas/tests/series/methods/test_truncate.py | 11 +++++++++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 7db609fba5d68..45f1015a8e7bd 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -48,7 +48,7 @@ Categorical **Indexing** -- +- Bug in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) **DataFrame** - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths 
(:issue:`33437`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 42d02f37508fc..aaf23cc198d95 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9405,7 +9405,7 @@ def truncate( if before > after: raise ValueError(f"Truncate: {after} must be after {before}") - if ax.is_monotonic_decreasing: + if len(ax) > 1 and ax.is_monotonic_decreasing: before, after = after, before slicer = [slice(None, None)] * self._AXIS_LEN diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py index 7c82edbaec177..45592f8d99b93 100644 --- a/pandas/tests/series/methods/test_truncate.py +++ b/pandas/tests/series/methods/test_truncate.py @@ -141,3 +141,14 @@ def test_truncate_multiindex(self): expected = df.col tm.assert_series_equal(result, expected) + + def test_truncate_one_element_series(self): + # GH 35544 + series = pd.Series([0.1], index=pd.DatetimeIndex(["2020-08-04"])) + before = pd.Timestamp("2020-08-02") + after = pd.Timestamp("2020-08-04") + + result = series.truncate(before=before, after=after) + + # the input Series and the expected Series are the same + tm.assert_series_equal(result, series) From 49711506ced3f314e0c465a881622a36a6499b61 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 7 Aug 2020 04:45:48 -0700 Subject: [PATCH 0452/1025] BUG: df.shift(n, axis=1) with multiple blocks (#35578) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/internals/managers.py | 25 ++++++++++++++++++++-- pandas/tests/frame/methods/test_shift.py | 27 ++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 45f1015a8e7bd..232e24c6db020 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). 
- Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) +- Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 0ce2408eb003e..4693cc193c27c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -551,6 +551,24 @@ def interpolate(self, **kwargs) -> "BlockManager": return self.apply("interpolate", **kwargs) def shift(self, periods: int, axis: int, fill_value) -> "BlockManager": + if axis == 0 and self.ndim == 2 and self.nblocks > 1: + # GH#35488 we need to watch out for multi-block cases + ncols = self.shape[0] + if periods > 0: + indexer = [-1] * periods + list(range(ncols - periods)) + else: + nper = abs(periods) + indexer = list(range(nper, ncols)) + [-1] * nper + result = self.reindex_indexer( + self.items, + indexer, + axis=0, + fill_value=fill_value, + allow_dups=True, + consolidate=False, + ) + return result + return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) def fillna(self, value, limit, inplace: bool, downcast) -> "BlockManager": @@ -1213,6 +1231,7 @@ def reindex_indexer( fill_value=None, allow_dups: bool = False, copy: bool = True, + consolidate: bool = True, ) -> T: """ Parameters @@ -1223,7 +1242,8 @@ def reindex_indexer( fill_value : object, default None allow_dups : bool, default False copy : bool, default True - + consolidate: bool, default True + Whether to consolidate inplace before reindexing. pandas-indexer with -1's only. 
""" @@ -1236,7 +1256,8 @@ def reindex_indexer( result.axes[axis] = new_axis return result - self._consolidate_inplace() + if consolidate: + self._consolidate_inplace() # some axes don't allow reindexing with dups if not allow_dups: diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 9ec029a6c4304..8f6902eca816f 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -145,6 +145,33 @@ def test_shift_duplicate_columns(self): tm.assert_frame_equal(shifted[0], shifted[1]) tm.assert_frame_equal(shifted[0], shifted[2]) + def test_shift_axis1_multiple_blocks(self): + # GH#35488 + df1 = pd.DataFrame(np.random.randint(1000, size=(5, 3))) + df2 = pd.DataFrame(np.random.randint(1000, size=(5, 2))) + df3 = pd.concat([df1, df2], axis=1) + assert len(df3._mgr.blocks) == 2 + + result = df3.shift(2, axis=1) + + expected = df3.take([-1, -1, 0, 1, 2], axis=1) + expected.iloc[:, :2] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + # Case with periods < 0 + # rebuild df3 because `take` call above consolidated + df3 = pd.concat([df1, df2], axis=1) + assert len(df3._mgr.blocks) == 2 + result = df3.shift(-2, axis=1) + + expected = df3.take([2, 3, 4, -1, -1], axis=1) + expected.iloc[:, -2:] = np.nan + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning") def test_tshift(self, datetime_frame): # TODO: remove this test when tshift deprecation is enforced From 41298f60dc0e6f4b1b53b1756b0435fea76c0b7d Mon Sep 17 00:00:00 2001 From: Eric Goddard Date: Fri, 7 Aug 2020 06:49:52 -0500 Subject: [PATCH 0453/1025] BUG: to_timedelta fails on Int64 Series with null values (#35582) --- doc/source/whatsnew/v1.1.1.rst | 5 +++++ pandas/core/arrays/timedeltas.py | 4 +++- pandas/tests/tools/test_to_timedelta.py | 13 +++++++++++++ 3 files changed, 21 insertions(+), 1 
deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 232e24c6db020..6b315e0a9d016 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -38,6 +38,11 @@ Categorical - - +**Timedelta** + +- Bug in :meth:`to_timedelta` fails when arg is a :class:`Series` with `Int64` dtype containing null values (:issue:`35574`) + + **Numeric** - diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 99a4725c2d806..3e21d01355dda 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -29,7 +29,7 @@ from pandas.core import nanops from pandas.core.algorithms import checked_add_with_arr -from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays import IntegerArray, datetimelike as dtl from pandas.core.arrays._ranges import generate_regular_range import pandas.core.common as com from pandas.core.construction import extract_array @@ -921,6 +921,8 @@ def sequence_to_td64ns(data, copy=False, unit=None, errors="raise"): elif isinstance(data, (ABCTimedeltaIndex, TimedeltaArray)): inferred_freq = data.freq data = data._data + elif isinstance(data, IntegerArray): + data = data.to_numpy("int64", na_value=tslibs.iNaT) # Convert whatever we have into timedelta64[ns] dtype if is_object_dtype(data.dtype) or is_string_dtype(data.dtype): diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index 1e193f22a6698..f68d83f7f4d58 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -166,3 +166,16 @@ def test_to_timedelta_ignore_strings_unit(self): arr = np.array([1, 2, "error"], dtype=object) result = pd.to_timedelta(arr, unit="ns", errors="ignore") tm.assert_numpy_array_equal(result, arr) + + def test_to_timedelta_nullable_int64_dtype(self): + # GH 35574 + expected = Series([timedelta(days=1), timedelta(days=2)]) + result = to_timedelta(Series([1, 2], dtype="Int64"), 
unit="days") + + tm.assert_series_equal(result, expected) + + # IntegerArray Series with nulls + expected = Series([timedelta(days=1), None]) + result = to_timedelta(Series([1, None], dtype="Int64"), unit="days") + + tm.assert_series_equal(result, expected) From 991fb3d5944bab1b8b7ae943c90ea92024f280b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 7 Aug 2020 07:53:09 -0400 Subject: [PATCH 0454/1025] support binary file handles in to_csv (#35129) --- doc/source/user_guide/io.rst | 17 ++++++ doc/source/whatsnew/v1.2.0.rst | 23 +++++++- pandas/core/generic.py | 16 ++++- pandas/io/common.py | 17 ++++-- pandas/io/formats/csvs.py | 82 +++++++++++--------------- pandas/tests/io/formats/test_to_csv.py | 36 +++++++++++ pandas/tests/io/test_common.py | 11 ++++ pandas/tests/io/test_compression.py | 16 +++++ 8 files changed, 158 insertions(+), 60 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index cc42f952b1733..ab233f653061a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1064,6 +1064,23 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: pd.read_csv('tmp.csv', parse_dates=[0]) pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0]) +Writing CSVs to binary file objects ++++++++++++++++++++++++++++++++++++ + +.. versionadded:: 1.2.0 + +``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object +opened binary mode. For this to work, it is necessary that ``mode`` +contains a "b": + +.. ipython:: python + + import io + + data = pd.DataFrame([0, 1, 2]) + buffer = io.BytesIO() + data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") + .. 
_io.float_precision: Specifying method for floating-point conversion diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6f173cb2fce12..10dfd8406b8ce 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,6 +13,25 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_120.binary_handle_to_csv: + +Support for binary file handles in ``to_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`) +with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`). +``mode`` has to contain a ``b`` for binary handles to be supported. + +For example: + +.. ipython:: python + + import io + + data = pd.DataFrame([0, 1, 2]) + buffer = io.BytesIO() + data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") + .. _whatsnew_120.enhancements.other: Other enhancements @@ -121,7 +140,7 @@ MultiIndex I/O ^^^ -- +- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) - Plotting @@ -167,4 +186,4 @@ Other .. _whatsnew_120.contributors: Contributors -~~~~~~~~~~~~ \ No newline at end of file +~~~~~~~~~~~~ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index aaf23cc198d95..441eef26bea58 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3021,13 +3021,18 @@ def to_csv( ---------- path_or_buf : str or file handle, default None File path or object, if None is provided the result is returned as - a string. If a file object is passed it should be opened with - `newline=''`, disabling universal newlines. + a string. If a non-binary file object is passed, it should be opened + with `newline=''`, disabling universal newlines. If a binary + file object is passed, `mode` needs to contain a `'b'`. .. versionchanged:: 0.24.0 Was previously named "path" for Series. + .. 
versionchanged:: 1.2.0 + + Support for binary file objects was introduced. + sep : str, default ',' String of length 1. Field delimiter for the output file. na_rep : str, default '' @@ -3056,7 +3061,8 @@ def to_csv( Python write mode, default 'w'. encoding : str, optional A string representing the encoding to use in the output file, - defaults to 'utf-8'. + defaults to 'utf-8'. `encoding` is not supported if `path_or_buf` + is a non-binary file object. compression : str or dict, default 'infer' If str, represents compression mode. If dict, value at 'method' is the compression mode. Compression mode may be any of the following @@ -3080,6 +3086,10 @@ def to_csv( supported for compression modes 'gzip' and 'bz2' as well as 'zip'. + .. versionchanged:: 1.2.0 + + Compression is supported for non-binary file objects. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC diff --git a/pandas/io/common.py b/pandas/io/common.py index f39b8279fbdb0..34e4425c657f1 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -407,8 +407,9 @@ def get_handle( memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True - whether file/buffer is in text format (csv, json, etc.), or in binary - mode (pickle, etc.). + Whether the type of the content passed to the file/buffer is string or + bytes. This is not the same as `"b" not in mode`. If a string content is + passed to a binary file/buffer, a wrapper is inserted. errors : str, default 'strict' Specifies how encoding and decoding errors are to be handled. 
See the errors argument for :func:`open` for a full list @@ -449,14 +450,14 @@ def get_handle( if is_path: f = gzip.open(path_or_buf, mode, **compression_args) else: - f = gzip.GzipFile(fileobj=path_or_buf, **compression_args) + f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args) # BZ Compression elif compression == "bz2": if is_path: f = bz2.BZ2File(path_or_buf, mode, **compression_args) else: - f = bz2.BZ2File(path_or_buf, **compression_args) + f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) # ZIP Compression elif compression == "zip": @@ -489,10 +490,14 @@ def get_handle( handles.append(f) elif is_path: - if encoding: + # Check whether the filename is to be opened in binary mode. + # Binary mode does not support 'encoding' and 'newline'. + is_binary_mode = "b" in mode + + if encoding and not is_binary_mode: # Encoding f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="") - elif is_text: + elif is_text and not is_binary_mode: # No explicit encoding f = open(path_or_buf, mode, errors="replace", newline="") else: diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 5bd51dc8351f6..b10946a20d041 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -3,11 +3,10 @@ """ import csv as csvlib -from io import StringIO +from io import StringIO, TextIOWrapper import os from typing import Hashable, List, Mapping, Optional, Sequence, Union import warnings -from zipfile import ZipFile import numpy as np @@ -159,38 +158,29 @@ def save(self) -> None: """ Create the writer & save. """ - # GH21227 internal compression is not used when file-like passed. - if self.compression and hasattr(self.path_or_buf, "write"): + # GH21227 internal compression is not used for non-binary handles. 
+ if ( + self.compression + and hasattr(self.path_or_buf, "write") + and "b" not in self.mode + ): warnings.warn( - "compression has no effect when passing file-like object as input.", + "compression has no effect when passing a non-binary object as input.", RuntimeWarning, stacklevel=2, ) - - # when zip compression is called. - is_zip = isinstance(self.path_or_buf, ZipFile) or ( - not hasattr(self.path_or_buf, "write") and self.compression == "zip" + self.compression = None + + # get a handle or wrap an existing handle to take care of 1) compression and + # 2) text -> byte conversion + f, handles = get_handle( + self.path_or_buf, + self.mode, + encoding=self.encoding, + errors=self.errors, + compression=dict(self.compression_args, method=self.compression), ) - if is_zip: - # zipfile doesn't support writing string to archive. uses string - # buffer to receive csv writing and dump into zip compression - # file handle. GH21241, GH21118 - f = StringIO() - close = False - elif hasattr(self.path_or_buf, "write"): - f = self.path_or_buf - close = False - else: - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, - errors=self.errors, - compression=dict(self.compression_args, method=self.compression), - ) - close = True - try: # Note: self.encoding is irrelevant here self.writer = csvlib.writer( @@ -206,29 +196,23 @@ def save(self) -> None: self._save() finally: - if is_zip: - # GH17778 handles zip compression separately. 
- buf = f.getvalue() - if hasattr(self.path_or_buf, "write"): - self.path_or_buf.write(buf) - else: - compression = dict(self.compression_args, method=self.compression) - - f, handles = get_handle( - self.path_or_buf, - self.mode, - encoding=self.encoding, - errors=self.errors, - compression=compression, - ) - f.write(buf) - close = True - if close: + if self.should_close: f.close() - for _fh in handles: - _fh.close() - elif self.should_close: + elif ( + isinstance(f, TextIOWrapper) + and not f.closed + and f != self.path_or_buf + and hasattr(self.path_or_buf, "write") + ): + # get_handle uses TextIOWrapper for non-binary handles. TextIOWrapper + # closes the wrapped handle if it is not detached. + f.flush() # make sure everything is written + f.detach() # makes f unusable + del f + elif f != self.path_or_buf: f.close() + for _fh in handles: + _fh.close() def _save_header(self): writer = self.writer diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 4c86e3a16b135..753b8b6eda9c5 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -607,3 +607,39 @@ def test_to_csv_errors(self, errors): ser.to_csv(path, errors=errors) # No use in reading back the data as it is not the same anymore # due to the error handling + + def test_to_csv_binary_handle(self): + """ + Binary file objects should work if 'mode' contains a 'b'. + + GH 35058 and GH 19827 + """ + df = tm.makeDataFrame() + with tm.ensure_clean() as path: + with open(path, mode="w+b") as handle: + df.to_csv(handle, mode="w+b") + tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + + def test_to_csv_encoding_binary_handle(self): + """ + Binary file objects should honor a specified encoding. 
+ + GH 23854 and GH 13068 with binary handles + """ + # example from GH 23854 + content = "a, b, 🐟".encode("utf-8-sig") + buffer = io.BytesIO(content) + df = pd.read_csv(buffer, encoding="utf-8-sig") + + buffer = io.BytesIO() + df.to_csv(buffer, mode="w+b", encoding="utf-8-sig", index=False) + buffer.seek(0) # tests whether file handle wasn't closed + assert buffer.getvalue().startswith(content) + + # example from GH 13068 + with tm.ensure_clean() as path: + with open(path, "w+b") as handle: + pd.DataFrame().to_csv(handle, mode="w+b", encoding="utf-8-sig") + + handle.seek(0) + assert handle.read().startswith(b'\xef\xbb\xbf""') diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index dde38eb55ea7f..5ce2233bc0cd0 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -378,6 +378,17 @@ def test_unknown_engine(self): with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") + def test_binary_mode(self): + """ + 'encoding' shouldn't be passed to 'open' in binary mode. + + GH 35058 + """ + with tm.ensure_clean() as path: + df = tm.makeDataFrame() + df.to_csv(path, mode="w+b") + tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + def test_is_fsspec_url(): assert icom.is_fsspec_url("gcs://pandas/somethingelse.com") diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 59c9bd0a36d3d..902a3d5d2a397 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -114,6 +114,22 @@ def test_compression_warning(compression_only): df.to_csv(f, compression=compression_only) +def test_compression_binary(compression_only): + """ + Binary file handles support compression. 
+ + GH22555 + """ + df = tm.makeDataFrame() + with tm.ensure_clean() as path: + with open(path, mode="wb") as file: + df.to_csv(file, mode="wb", compression=compression_only) + file.seek(0) # file shouldn't be closed + tm.assert_frame_equal( + df, pd.read_csv(path, index_col=0, compression=compression_only) + ) + + def test_with_missing_lzma(): """Tests if import pandas works when lzma is not present.""" # https://github.com/pandas-dev/pandas/issues/27575 From b01a6ee4f54247b8b0d22c4fbbc8959f1645fd27 Mon Sep 17 00:00:00 2001 From: Thomas Smith Date: Fri, 7 Aug 2020 16:21:11 +0100 Subject: [PATCH 0455/1025] BUG: GroupBy.count() and GroupBy.sum() incorreclty return NaN instead of 0 for missing categories (#35280) --- doc/source/whatsnew/v1.2.0.rst | 3 +- pandas/core/groupby/generic.py | 8 ++++- pandas/core/groupby/groupby.py | 16 +++++++-- pandas/tests/groupby/test_categorical.py | 46 +++++++----------------- pandas/tests/reshape/test_pivot.py | 13 +++---- 5 files changed, 42 insertions(+), 44 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 10dfd8406b8ce..260b92b5989c1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -152,6 +152,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) - - @@ -160,7 +161,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. 
Now returning ``0`` (:issue:`31422`) - Sparse diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c50b753cf3293..740463f0cf356 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1829,7 +1829,13 @@ def count(self): ) blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] - return self._wrap_agged_blocks(blocks, items=data.items) + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _wrap_agged_blocks() returns. GH 35028 + with com.temp_setattr(self, "observed", True): + result = self._wrap_agged_blocks(blocks, items=data.items) + + return self._reindex_output(result, fill_value=0) def nunique(self, dropna: bool = True): """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6c8a780859939..ed512710295d7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1536,9 +1536,19 @@ def size(self) -> FrameOrSeriesUnion: @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) def sum(self, numeric_only: bool = True, min_count: int = 0): - return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="add", npfunc=np.sum - ) + + # If we are grouping on categoricals we want unobserved categories to + # return zero, rather than the default of NaN which the reindexing in + # _agg_general() returns. 
GH #31422 + with com.temp_setattr(self, "observed", True): + result = self._agg_general( + numeric_only=numeric_only, + min_count=min_count, + alias="add", + npfunc=np.sum, + ) + + return self._reindex_output(result, fill_value=0) @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) def prod(self, numeric_only: bool = True, min_count: int = 0): diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 0d447a70b540d..c74c1529eb537 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -19,7 +19,7 @@ import pandas._testing as tm -def cartesian_product_for_groupers(result, args, names): +def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN): """ Reindex to a cartesian production for the groupers, preserving the nature (Categorical) of each grouper """ @@ -33,7 +33,7 @@ def f(a): return a index = MultiIndex.from_product(map(f, args), names=names) - return result.reindex(index).sort_index() + return result.reindex(index, fill_value=fill_value).sort_index() _results_for_groupbys_with_missing_categories = dict( @@ -309,7 +309,7 @@ def test_observed(observed): result = gb.sum() if not observed: expected = cartesian_product_for_groupers( - expected, [cat1, cat2, ["foo", "bar"]], list("ABC") + expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0 ) tm.assert_frame_equal(result, expected) @@ -319,7 +319,9 @@ def test_observed(observed): expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index) result = gb.sum() if not observed: - expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB")) + expected = cartesian_product_for_groupers( + expected, [cat1, cat2], list("AB"), fill_value=0 + ) tm.assert_frame_equal(result, expected) @@ -1189,6 +1191,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation): ).sortlevel() expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C") + if 
operation == "agg": + expected = expected.fillna(0, downcast="infer") grouped = df_cat.groupby(["A", "B"], observed=observed)["C"] result = getattr(grouped, operation)(sum) tm.assert_series_equal(result, expected) @@ -1338,15 +1342,6 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( ) request.node.add_marker(mark) - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." - ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), @@ -1367,8 +1362,11 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans( val = result.loc[idx] assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan) - # If we expect unobserved values to be zero, we also expect the dtype to be int - if zero_or_nan == 0: + # If we expect unobserved values to be zero, we also expect the dtype to be int. + # Except for .sum(). If the observed categories sum to dtype=float (i.e. their + # sums have decimals), then the zeros for the missing categories should also be + # floats. + if zero_or_nan == 0 and reduction_func != "sum": assert np.issubdtype(result.dtype, np.integer) @@ -1410,24 +1408,6 @@ def test_dataframe_groupby_on_2_categoricals_when_observed_is_false( if reduction_func == "ngroup": pytest.skip("ngroup does not return the Categories on the index") - if reduction_func == "count": # GH 35028 - mark = pytest.mark.xfail( - reason=( - "DataFrameGroupBy.count returns np.NaN for missing " - "categories, when it should return 0. See GH 35028" - ) - ) - request.node.add_marker(mark) - - if reduction_func == "sum": # GH 31422 - mark = pytest.mark.xfail( - reason=( - "sum should return 0 but currently returns NaN. " - "This is a known bug. See GH 31422." 
- ) - ) - request.node.add_marker(mark) - df = pd.DataFrame( { "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")), diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index c07a5673fe503..67b3151b0ff9c 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed): ["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1" ) expected_columns = pd.Index(["a", "b"], name="C2") - expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]]) + expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64) expected = pd.DataFrame( expected_data, index=expected_index, columns=expected_columns ) @@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed): values="Sales", index="Month", columns="Year", - dropna=observed, + observed=observed, aggfunc="sum", ) expected_columns = pd.Int64Index([2013, 2014], name="Year") expected_index = pd.CategoricalIndex( - ["January"], categories=months, ordered=False, name="Month" + months, categories=months, ordered=False, name="Month" ) + expected_data = [[320, 120]] + [[0, 0]] * 11 expected = pd.DataFrame( - [[320, 120]], index=expected_index, columns=expected_columns + expected_data, index=expected_index, columns=expected_columns ) - if not observed: - result = result.dropna().astype(np.int64) + if observed: + expected = expected.loc[["January"]] tm.assert_frame_equal(result, expected) From d8a96c3f53905bc65672b037fc326c3fa20f286e Mon Sep 17 00:00:00 2001 From: SylvainLan Date: Fri, 7 Aug 2020 17:32:26 +0200 Subject: [PATCH 0456/1025] To latex position (#35284) --- pandas/core/generic.py | 5 +++ pandas/io/formats/format.py | 2 + pandas/io/formats/latex.py | 42 +++++++++++++-------- pandas/tests/io/formats/test_to_latex.py | 48 ++++++++++++++++++++++++ 4 files changed, 82 insertions(+), 15 deletions(-) diff --git a/pandas/core/generic.py 
b/pandas/core/generic.py index 441eef26bea58..6fd55c58ece40 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2840,6 +2840,7 @@ def to_latex( multirow=None, caption=None, label=None, + position=None, ): r""" Render object to a LaTeX tabular, longtable, or nested table/tabular. @@ -2925,6 +2926,9 @@ def to_latex( This is used with ``\ref{}`` in the main ``.tex`` file. .. versionadded:: 1.0.0 + position : str, optional + The LaTeX positional argument for tables, to be placed after + ``\begin{}`` in the output. %(returns)s See Also -------- @@ -2986,6 +2990,7 @@ def to_latex( multirow=multirow, caption=caption, label=label, + position=position, ) def to_csv( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 296fc341bf817..9546f674aa124 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -931,6 +931,7 @@ def to_latex( multirow: bool = False, caption: Optional[str] = None, label: Optional[str] = None, + position: Optional[str] = None, ) -> Optional[str]: """ Render a DataFrame to a LaTeX tabular/longtable environment output. 
@@ -946,6 +947,7 @@ def to_latex( multirow=multirow, caption=caption, label=label, + position=position, ).get_result(buf=buf, encoding=encoding) def _format_col(self, i: int) -> List[str]: diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 3a3ca84642d51..5d6f0a08ef2b5 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -38,6 +38,7 @@ def __init__( multirow: bool = False, caption: Optional[str] = None, label: Optional[str] = None, + position: Optional[str] = None, ): self.fmt = formatter self.frame = self.fmt.frame @@ -50,6 +51,8 @@ def __init__( self.caption = caption self.label = label self.escape = self.fmt.escape + self.position = position + self._table_float = any(p is not None for p in (caption, label, position)) def write_result(self, buf: IO[str]) -> None: """ @@ -284,7 +287,7 @@ def _write_tabular_begin(self, buf, column_format: str): `__ e.g 'rcl' for 3 columns """ - if self.caption is not None or self.label is not None: + if self._table_float: # then write output in a nested table/tabular environment if self.caption is None: caption_ = "" @@ -296,7 +299,12 @@ def _write_tabular_begin(self, buf, column_format: str): else: label_ = f"\n\\label{{{self.label}}}" - buf.write(f"\\begin{{table}}\n\\centering{caption_}{label_}\n") + if self.position is None: + position_ = "" + else: + position_ = f"[{self.position}]" + + buf.write(f"\\begin{{table}}{position_}\n\\centering{caption_}{label_}\n") else: # then write output only in a tabular environment pass @@ -317,7 +325,7 @@ def _write_tabular_end(self, buf): """ buf.write("\\bottomrule\n") buf.write("\\end{tabular}\n") - if self.caption is not None or self.label is not None: + if self._table_float: buf.write("\\end{table}\n") else: pass @@ -337,25 +345,29 @@ def _write_longtable_begin(self, buf, column_format: str): `__ e.g 'rcl' for 3 columns """ - buf.write(f"\\begin{{longtable}}{{{column_format}}}\n") + if self.caption is None: + caption_ = "" + else: + caption_ 
= f"\\caption{{{self.caption}}}" - if self.caption is not None or self.label is not None: - if self.caption is None: - pass - else: - buf.write(f"\\caption{{{self.caption}}}") + if self.label is None: + label_ = "" + else: + label_ = f"\\label{{{self.label}}}" - if self.label is None: - pass - else: - buf.write(f"\\label{{{self.label}}}") + if self.position is None: + position_ = "" + else: + position_ = f"[{self.position}]" + buf.write( + f"\\begin{{longtable}}{position_}{{{column_format}}}\n{caption_}{label_}" + ) + if self.caption is not None or self.label is not None: # a double-backslash is required at the end of the line # as discussed here: # https://tex.stackexchange.com/questions/219138 buf.write("\\\\\n") - else: - pass @staticmethod def _write_longtable_end(buf): diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 509e5bcb33304..93ad3739e59c7 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -573,6 +573,54 @@ def test_to_latex_longtable_caption_label(self): """ assert result_cl == expected_cl + def test_to_latex_position(self): + the_position = "h" + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + # test when only the position is provided + result_p = df.to_latex(position=the_position) + + expected_p = r"""\begin{table}[h] +\centering +\begin{tabular}{lrl} +\toprule +{} & a & b \\ +\midrule +0 & 1 & b1 \\ +1 & 2 & b2 \\ +\bottomrule +\end{tabular} +\end{table} +""" + assert result_p == expected_p + + def test_to_latex_longtable_position(self): + the_position = "t" + + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + # test when only the position is provided + result_p = df.to_latex(longtable=True, position=the_position) + + expected_p = r"""\begin{longtable}[t]{lrl} +\toprule +{} & a & b \\ +\midrule +\endhead +\midrule +\multicolumn{3}{r}{{Continued on next page}} \\ +\midrule +\endfoot + +\bottomrule +\endlastfoot +0 & 1 & b1 \\ +1 & 2 & 
b2 \\ +\end{longtable} +""" + assert result_p == expected_p + def test_to_latex_escape_special_chars(self): special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"] df = DataFrame(data=special_characters) From 75cb00c70f77a5677197d5e6b581e4a83ace9cab Mon Sep 17 00:00:00 2001 From: Thomas Smith Date: Fri, 7 Aug 2020 17:56:12 +0100 Subject: [PATCH 0457/1025] BUG: GroupBy.apply() returns different results if a different GroupBy method is called first (#35314) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/groupby/groupby.py | 89 ++++++++++---------- pandas/tests/groupby/aggregate/test_other.py | 4 +- pandas/tests/groupby/test_apply.py | 29 +++++++ pandas/tests/groupby/test_grouping.py | 8 +- 5 files changed, 82 insertions(+), 50 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 260b92b5989c1..5a4fa3150e344 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -156,7 +156,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) - - - +- Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ed512710295d7..4597afeeaddbf 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -736,13 +736,12 @@ def pipe(self, func, *args, **kwargs): def _make_wrapper(self, name): assert name in self._apply_allowlist - self._set_group_selection() - - # need to setup the selection - # as are not passed directly but in the grouper - f = getattr(self._obj_with_exclusions, name) - if not isinstance(f, types.MethodType): - return self.apply(lambda self: getattr(self, name)) + with 
_group_selection_context(self): + # need to setup the selection + # as are not passed directly but in the grouper + f = getattr(self._obj_with_exclusions, name) + if not isinstance(f, types.MethodType): + return self.apply(lambda self: getattr(self, name)) f = getattr(type(self._obj_with_exclusions), name) sig = inspect.signature(f) @@ -992,28 +991,28 @@ def _agg_general( alias: str, npfunc: Callable, ): - self._set_group_selection() - - # try a cython aggregation if we can - try: - return self._cython_agg_general( - how=alias, alt=npfunc, numeric_only=numeric_only, min_count=min_count, - ) - except DataError: - pass - except NotImplementedError as err: - if "function is not implemented for this dtype" in str( - err - ) or "category dtype not supported" in str(err): - # raised in _get_cython_function, in some cases can - # be trimmed by implementing cython funcs for more dtypes + with _group_selection_context(self): + # try a cython aggregation if we can + try: + return self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) + except DataError: pass - else: - raise - - # apply a non-cython aggregation - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) - return result + except NotImplementedError as err: + if "function is not implemented for this dtype" in str( + err + ) or "category dtype not supported" in str(err): + # raised in _get_cython_function, in some cases can + # be trimmed by implementing cython funcs for more dtypes + pass + + # apply a non-cython aggregation + result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + return result def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 @@ -1940,29 +1939,31 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra nth_values = list(set(n)) nth_array = np.array(nth_values, dtype=np.intp) - self._set_group_selection() + with _group_selection_context(self): - mask_left = 
np.in1d(self._cumcount_array(), nth_array) - mask_right = np.in1d(self._cumcount_array(ascending=False) + 1, -nth_array) - mask = mask_left | mask_right + mask_left = np.in1d(self._cumcount_array(), nth_array) + mask_right = np.in1d( + self._cumcount_array(ascending=False) + 1, -nth_array + ) + mask = mask_left | mask_right - ids, _, _ = self.grouper.group_info + ids, _, _ = self.grouper.group_info - # Drop NA values in grouping - mask = mask & (ids != -1) + # Drop NA values in grouping + mask = mask & (ids != -1) - out = self._selected_obj[mask] - if not self.as_index: - return out + out = self._selected_obj[mask] + if not self.as_index: + return out - result_index = self.grouper.result_index - out.index = result_index[ids[mask]] + result_index = self.grouper.result_index + out.index = result_index[ids[mask]] - if not self.observed and isinstance(result_index, CategoricalIndex): - out = out.reindex(result_index) + if not self.observed and isinstance(result_index, CategoricalIndex): + out = out.reindex(result_index) - out = self._reindex_output(out) - return out.sort_index() if self.sort else out + out = self._reindex_output(out) + return out.sort_index() if self.sort else out # dropna is truthy if isinstance(n, valid_containers): diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 264cf40dc6984..e8cd6017a117c 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -486,13 +486,13 @@ def test_agg_timezone_round_trip(): assert ts == grouped.first()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0] + assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1] ts = df["B"].iloc[2] assert ts == grouped.last()["B"].iloc[0] # GH#27110 applying iloc should return a DataFrame - assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0] + assert ts == grouped.apply(lambda x: 
x.iloc[-1]).iloc[0, 1] def test_sum_uint64_overflow(): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 665cd12225ad7..ee38722ffb8ce 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1009,6 +1009,35 @@ def test_apply_with_timezones_aware(): tm.assert_frame_equal(result1, result2) +def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func): + # GH #34656 + # GH #34271 + df = DataFrame( + { + "a": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + } + ) + + expected = pd.DataFrame( + {"a": [264, 297], "b": [15, 6], "c": [150, 60]}, + index=pd.Index([88, 99], name="a"), + ) + + # Check output when no other methods are called before .apply() + grp = df.groupby(by="a") + result = grp.apply(sum) + tm.assert_frame_equal(result, expected) + + # Check output when another method is called before .apply() + grp = df.groupby(by="a") + args = {"nth": [0], "corrwith": [df]}.get(reduction_func, []) + _ = getattr(grp, reduction_func)(*args) + result = grp.apply(sum) + tm.assert_frame_equal(result, expected) + + def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp(): # GH 29617 diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index efcd22f9c0c82..40b4ce46e550b 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -191,13 +191,15 @@ def test_grouper_creation_bug(self): result = g.sum() tm.assert_frame_equal(result, expected) - result = g.apply(lambda x: x.sum()) - tm.assert_frame_equal(result, expected) - g = df.groupby(pd.Grouper(key="A", axis=0)) result = g.sum() tm.assert_frame_equal(result, expected) + result = g.apply(lambda x: x.sum()) + expected["A"] = [0, 2, 4] + expected = expected.loc[:, ["A", "B"]] + tm.assert_frame_equal(result, expected) + # GH14334 # pd.Grouper(key=...) 
may be passed in a list df = DataFrame( From 2bb41c1f330640eace140155730f8f28614bbdfa Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Fri, 7 Aug 2020 12:58:23 -0400 Subject: [PATCH 0458/1025] CLN: clarify TypeError for IndexSlice argument to pd.xs (#35411) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/generic.py | 5 ++++- pandas/tests/indexing/multiindex/test_xs.py | 23 ++++++++++++++++++++- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 5a4fa3150e344..15616f4a6f27c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -134,7 +134,7 @@ Missing MultiIndex ^^^^^^^^^^ -- +- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message `Expected label or tuple of labels` (:issue:`35301`) - I/O diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6fd55c58ece40..843b602a12823 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3492,7 +3492,10 @@ class animal locomotion index = self.index if isinstance(index, MultiIndex): - loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) + try: + loc, new_index = self.index.get_loc_level(key, drop_level=drop_level) + except TypeError as e: + raise TypeError(f"Expected label or tuple of labels, got {key}") from e else: loc = self.index.get_loc(key) diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py index b807795b9c309..91be1d913001b 100644 --- a/pandas/tests/indexing/multiindex/test_xs.py +++ b/pandas/tests/indexing/multiindex/test_xs.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range +from pandas import DataFrame, Index, IndexSlice, MultiIndex, Series, concat, date_range import pandas._testing as tm import pandas.core.common as com @@ -220,6 +220,27 @@ 
def test_xs_level_series_slice_not_implemented( s[2000, 3:4] +def test_xs_IndexSlice_argument_not_implemented(): + # GH 35301 + + index = MultiIndex( + levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]], + codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + ) + + series = Series(np.random.randn(6), index=index) + frame = DataFrame(np.random.randn(6, 4), index=index) + + msg = ( + "Expected label or tuple of labels, got " + r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)" + ) + with pytest.raises(TypeError, match=msg): + frame.xs(IndexSlice[("foo", "qux", 0), :]) + with pytest.raises(TypeError, match=msg): + series.xs(IndexSlice[("foo", "qux", 0), :]) + + def test_series_getitem_multiindex_xs(): # GH6258 dt = list(date_range("20130903", periods=3)) From 684c3a2611cbca7bdbf0e669346c5096bec838c4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 7 Aug 2020 11:09:30 -0700 Subject: [PATCH 0459/1025] BUG: Ensure rolling groupby doesn't segfault with center=True (#35562) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/window/indexers.py | 6 +++ pandas/tests/window/test_grouper.py | 65 +++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 6b315e0a9d016..a044a4aab284e 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 0898836ed2e0e..bc36bdca982e8 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -319,4 +319,10 @@ def get_window_bounds( end_arrays.append(window_indicies.take(end)) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) + # GH 35552: Need to adjust start and end based on the nans appended to values + # when center=True + if num_values > len(start): + offset = num_values - len(start) + start = np.concatenate([start, np.array([end[-1]] * offset)]) + end = np.concatenate([end, np.array([end[-1]] * offset)]) return start, end diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index ca5a9eccea4f5..5241b9548a442 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -215,6 +215,71 @@ def foo(x): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_center_center(self): + # GH 35552 + series = Series(range(1, 6)) + result = series.groupby(series).rolling(center=True, window=3).mean() + expected = Series( + [np.nan] * 5, + index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3), (5, 4))), + ) + tm.assert_series_equal(result, expected) + + series = Series(range(1, 5)) + result = series.groupby(series).rolling(center=True, window=3).mean() + expected = Series( + [np.nan] * 4, + index=pd.MultiIndex.from_tuples(((1, 0), (2, 1), (3, 2), (4, 3))), + ) + tm.assert_series_equal(result, expected) + + df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 6, "b": range(11)}) + result = df.groupby("a").rolling(center=True, window=3).mean() + expected = pd.DataFrame( + [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, 9, np.nan], + index=pd.MultiIndex.from_tuples( + ( + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ("a", 4), + ("b", 5), + ("b", 6), + ("b", 7), + ("b", 8), + ("b", 9), + ("b", 10), + 
), + names=["a", None], + ), + columns=["b"], + ) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({"a": ["a"] * 5 + ["b"] * 5, "b": range(10)}) + result = df.groupby("a").rolling(center=True, window=3).mean() + expected = pd.DataFrame( + [np.nan, 1, 2, 3, np.nan, np.nan, 6, 7, 8, np.nan], + index=pd.MultiIndex.from_tuples( + ( + ("a", 0), + ("a", 1), + ("a", 2), + ("a", 3), + ("a", 4), + ("b", 5), + ("b", 6), + ("b", 7), + ("b", 8), + ("b", 9), + ), + names=["a", None], + ), + columns=["b"], + ) + tm.assert_frame_equal(result, expected) + def test_groupby_subselect_rolling(self): # GH 35486 df = DataFrame( From 787f63cba5d78c7ec31d0ad43e0d368f105745fc Mon Sep 17 00:00:00 2001 From: Deepak Pandey <40865954+freakypandit@users.noreply.github.com> Date: Sat, 8 Aug 2020 00:46:39 +0530 Subject: [PATCH 0460/1025] DOC: Docstring updated for DataFrame.equals (#34508) Co-authored-by: Joris Van den Bossche --- pandas/core/generic.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 843b602a12823..87f25f578c3c6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1201,9 +1201,11 @@ def equals(self, other): This function allows two Series or DataFrames to be compared against each other to see if they have the same shape and elements. NaNs in - the same location are considered equal. The column headers do not - need to have the same type, but the elements within the columns must - be the same dtype. + the same location are considered equal. + + The row/column index do not need to have the same type, as long + as the values are considered equal. Corresponding columns must be of + the same dtype. Parameters ---------- @@ -1232,13 +1234,6 @@ def equals(self, other): numpy.array_equal : Return True if two arrays have the same shape and elements, False otherwise. 
- Notes - ----- - This function requires that the elements have the same dtype as their - respective elements in the other Series or DataFrame. However, the - column labels do not need to have the same type, as long as they are - still considered equal. - Examples -------- >>> df = pd.DataFrame({1: [10], 2: [20]}) From d58873af53e7c9c3d447f093cdcf880570a68450 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 7 Aug 2020 12:35:19 -0700 Subject: [PATCH 0461/1025] REF: use unpack_zerodim_and_defer on EA methods (#34042) --- pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/boolean.py | 19 ++++--------------- pandas/core/arrays/numpy_.py | 9 +++------ pandas/core/arrays/string_.py | 8 +++----- pandas/tests/extension/base/ops.py | 15 +++++++++++---- pandas/tests/extension/test_period.py | 6 +++++- 6 files changed, 28 insertions(+), 33 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 2553a65aed07b..921927325a144 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -22,7 +22,7 @@ from pandas.core.dtypes.cast import maybe_cast_to_extension_array from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -1273,7 +1273,7 @@ def convert_values(param): ovalues = [param] * len(self) return ovalues - if isinstance(other, (ABCSeries, ABCIndexClass)): + if isinstance(other, (ABCSeries, ABCIndexClass, ABCDataFrame)): # rely on pandas to unbox and dispatch to us return NotImplemented diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index dbce71b77a425..bd4bdc5ecb46f 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -20,7 +20,6 @@ pandas_dtype, ) from 
pandas.core.dtypes.dtypes import register_extension_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core import ops @@ -559,13 +558,10 @@ def all(self, skipna: bool = True, **kwargs): @classmethod def _create_logical_method(cls, op): + @ops.unpack_zerodim_and_defer(op.__name__) def logical_method(self, other): - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. - return NotImplemented assert op.__name__ in {"or_", "ror_", "and_", "rand_", "xor", "rxor"} - other = lib.item_from_zerodim(other) other_is_booleanarray = isinstance(other, BooleanArray) other_is_scalar = lib.is_scalar(other) mask = None @@ -605,16 +601,14 @@ def logical_method(self, other): @classmethod def _create_comparison_method(cls, op): + @ops.unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): from pandas.arrays import IntegerArray - if isinstance( - other, (ABCDataFrame, ABCSeries, ABCIndexClass, IntegerArray) - ): + if isinstance(other, IntegerArray): # Rely on pandas to unbox and dispatch to us. return NotImplemented - other = lib.item_from_zerodim(other) mask = None if isinstance(other, BooleanArray): @@ -693,13 +687,8 @@ def _maybe_mask_result(self, result, mask, other, op_name: str): def _create_arithmetic_method(cls, op): op_name = op.__name__ + @ops.unpack_zerodim_and_defer(op_name) def boolean_arithmetic_method(self, other): - - if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): - # Rely on pandas to unbox and dispatch to us. 
- return NotImplemented - - other = lib.item_from_zerodim(other) mask = None if isinstance(other, BooleanArray): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index f6dfb1f0f1e62..05f901518d82f 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -11,12 +11,11 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna from pandas import compat -from pandas.core import nanops +from pandas.core import nanops, ops from pandas.core.algorithms import searchsorted from pandas.core.array_algos import masked_reductions from pandas.core.arrays._mixins import NDArrayBackedExtensionArray @@ -436,11 +435,9 @@ def __invert__(self): @classmethod def _create_arithmetic_method(cls, op): + @ops.unpack_zerodim_and_defer(op.__name__) def arithmetic_method(self, other): - if isinstance(other, (ABCIndexClass, ABCSeries)): - return NotImplemented - - elif isinstance(other, cls): + if isinstance(other, cls): other = other._ndarray with np.errstate(all="ignore"): diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fddd3af858f77..bb55c3cdea45c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -7,7 +7,6 @@ from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype from pandas.core.dtypes.common import pandas_dtype -from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.inference import is_array_like from pandas import compat @@ -312,15 +311,14 @@ def memory_usage(self, deep=False): @classmethod def _create_arithmetic_method(cls, op): # Note: this handles both arithmetic and comparison methods. 
+ + @ops.unpack_zerodim_and_defer(op.__name__) def method(self, other): from pandas.arrays import BooleanArray assert op.__name__ in ops.ARITHMETIC_BINOPS | ops.COMPARISON_BINOPS - if isinstance(other, (ABCIndexClass, ABCSeries, ABCDataFrame)): - return NotImplemented - - elif isinstance(other, cls): + if isinstance(other, cls): other = other._ndarray mask = isna(self) | isna(other) diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py index 359acf230ce14..c93603398977e 100644 --- a/pandas/tests/extension/base/ops.py +++ b/pandas/tests/extension/base/ops.py @@ -114,10 +114,13 @@ def test_error(self, data, all_arithmetic_operators): with pytest.raises(AttributeError): getattr(data, op_name) - def test_direct_arith_with_series_returns_not_implemented(self, data): - # EAs should return NotImplemented for ops with Series. + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) + def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box): + # EAs should return NotImplemented for ops with Series/DataFrame # Pandas takes care of unboxing the series and calling the EA's op. other = pd.Series(data) + if box is pd.DataFrame: + other = other.to_frame() if hasattr(data, "__add__"): result = data.__add__(other) assert result is NotImplemented @@ -156,10 +159,14 @@ def test_compare_array(self, data, all_compare_operators): other = pd.Series([data[0]] * len(data)) self._compare_other(s, data, op_name, other) - def test_direct_arith_with_series_returns_not_implemented(self, data): - # EAs should return NotImplemented for ops with Series. + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) + def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box): + # EAs should return NotImplemented for ops with Series/DataFrame # Pandas takes care of unboxing the series and calling the EA's op. 
other = pd.Series(data) + if box is pd.DataFrame: + other = other.to_frame() + if hasattr(data, "__eq__"): result = data.__eq__(other) assert result is NotImplemented diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index b1eb276bfc227..817881e00fa99 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -126,9 +126,13 @@ def test_add_series_with_extension_array(self, data): def test_error(self): pass - def test_direct_arith_with_series_returns_not_implemented(self, data): + @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame]) + def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box): # Override to use __sub__ instead of __add__ other = pd.Series(data) + if box is pd.DataFrame: + other = other.to_frame() + result = data.__sub__(other) assert result is NotImplemented From 2099dd2ef321a17974c1f7cf0de7428472db470f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 7 Aug 2020 14:03:00 -0700 Subject: [PATCH 0462/1025] PERF: BlockManager.equals blockwise (#35357) --- pandas/core/dtypes/missing.py | 14 ++++++- pandas/core/internals/managers.py | 27 ++------------ pandas/core/internals/ops.py | 61 ++++++++++++++++++++++--------- 3 files changed, 61 insertions(+), 41 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 8551ce9f14e6c..f59bb31af2828 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -10,7 +10,7 @@ from pandas._libs import lib import pandas._libs.missing as libmissing from pandas._libs.tslibs import NaT, iNaT -from pandas._typing import DtypeObj +from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.common import ( DT64NS_DTYPE, @@ -484,6 +484,18 @@ def _array_equivalent_object(left, right, strict_nan): return True +def array_equals(left: ArrayLike, right: ArrayLike) -> bool: + """ + ExtensionArray-compatible implementation of array_equivalent. 
+ """ + if not is_dtype_equal(left.dtype, right.dtype): + return False + elif isinstance(left, ABCExtensionArray): + return left.equals(right) + else: + return array_equivalent(left, right, dtype_equal=True) + + def _infer_fill_value(val): """ infer the fill value for the nan/NaT from the provided diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4693cc193c27c..4b85f92391dce 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -19,7 +19,6 @@ from pandas.core.dtypes.common import ( DT64NS_DTYPE, is_datetimelike_v_numeric, - is_dtype_equal, is_extension_array_dtype, is_list_like, is_numeric_v_string_like, @@ -28,10 +27,9 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.dtypes.missing import array_equivalent, isna +from pandas.core.dtypes.missing import array_equals, isna import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject import pandas.core.common as com @@ -49,7 +47,7 @@ get_block_type, make_block, ) -from pandas.core.internals.ops import operate_blockwise +from pandas.core.internals.ops import blockwise_all, operate_blockwise # TODO: flexible with index=None and/or items=None @@ -1449,26 +1447,9 @@ def equals(self, other: "BlockManager") -> bool: return False left = self.blocks[0].values right = other.blocks[0].values - if not is_dtype_equal(left.dtype, right.dtype): - return False - elif isinstance(left, ExtensionArray): - return left.equals(right) - else: - return array_equivalent(left, right) + return array_equals(left, right) - for i in range(len(self.items)): - # Check column-wise, return False if any column doesn't match - left = self.iget_values(i) - right = other.iget_values(i) - if not is_dtype_equal(left.dtype, 
right.dtype): - return False - elif isinstance(left, ExtensionArray): - if not left.equals(right): - return False - else: - if not array_equivalent(left, right, dtype_equal=True): - return False - return True + return blockwise_all(self, other, array_equals) def unstack(self, unstacker, fill_value) -> "BlockManager": """ diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 6eedf72726acb..ae4892c720d5b 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, List, Tuple +from collections import namedtuple +from typing import TYPE_CHECKING, Iterator, List, Tuple import numpy as np @@ -9,13 +10,17 @@ from pandas.core.internals.managers import BlockManager # noqa:F401 -def operate_blockwise( - left: "BlockManager", right: "BlockManager", array_op -) -> "BlockManager": +BlockPairInfo = namedtuple( + "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"], +) + + +def _iter_block_pairs( + left: "BlockManager", right: "BlockManager" +) -> Iterator[BlockPairInfo]: # At this point we have already checked the parent DataFrames for # assert rframe._indexed_same(lframe) - res_blks: List["Block"] = [] for n, blk in enumerate(left.blocks): locs = blk.mgr_locs blk_vals = blk.values @@ -34,21 +39,32 @@ def operate_blockwise( right_ea = not isinstance(rblk.values, np.ndarray) lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) + info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk) + yield info - res_values = array_op(lvals, rvals) - if left_ea and not right_ea and hasattr(res_values, "reshape"): - res_values = res_values.reshape(1, -1) - nbs = rblk._split_op_result(res_values) - # Assertions are disabled for performance, but should hold: - # if right_ea or left_ea: - # assert len(nbs) == 1 - # else: - # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) +def operate_blockwise( + left: "BlockManager", right: 
"BlockManager", array_op +) -> "BlockManager": + # At this point we have already checked the parent DataFrames for + # assert rframe._indexed_same(lframe) + + res_blks: List["Block"] = [] + for lvals, rvals, locs, left_ea, right_ea, rblk in _iter_block_pairs(left, right): + res_values = array_op(lvals, rvals) + if left_ea and not right_ea and hasattr(res_values, "reshape"): + res_values = res_values.reshape(1, -1) + nbs = rblk._split_op_result(res_values) + + # Assertions are disabled for performance, but should hold: + # if right_ea or left_ea: + # assert len(nbs) == 1 + # else: + # assert res_values.shape == lvals.shape, (res_values.shape, lvals.shape) - _reset_block_mgr_locs(nbs, locs) + _reset_block_mgr_locs(nbs, locs) - res_blks.extend(nbs) + res_blks.extend(nbs) # Assertions are disabled for performance, but should hold: # slocs = {y for nb in res_blks for y in nb.mgr_locs.as_array} @@ -85,7 +101,7 @@ def _get_same_shape_values( # Require that the indexing into lvals be slice-like assert rblk.mgr_locs.is_slice_like, rblk.mgr_locs - # TODO(EA2D): with 2D EAs pnly this first clause would be needed + # TODO(EA2D): with 2D EAs only this first clause would be needed if not (left_ea or right_ea): lvals = lvals[rblk.mgr_locs.indexer, :] assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) @@ -102,3 +118,14 @@ def _get_same_shape_values( rvals = rvals[0, :] return lvals, rvals + + +def blockwise_all(left: "BlockManager", right: "BlockManager", op) -> bool: + """ + Blockwise `all` reduction. 
+ """ + for info in _iter_block_pairs(left, right): + res = op(info.lvals, info.rvals) + if not res: + return False + return True From e5698b3c71e232e1dae024c409e1b0b97db61fce Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Fri, 7 Aug 2020 17:33:05 -0400 Subject: [PATCH 0463/1025] BUG: DataFrameGroupBy.__getitem__ fails to propagate dropna (#35078) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/groupby/generic.py | 18 +++++------ pandas/tests/groupby/test_groupby_dropna.py | 34 +++++++++++++++++++++ 3 files changed, 43 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 15616f4a6f27c..74ef5178eb004 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -128,7 +128,7 @@ Indexing Missing ^^^^^^^ -- +- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) - MultiIndex diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 740463f0cf356..1fed193dba02c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -35,11 +35,11 @@ from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( + find_common_type, maybe_cast_result, maybe_cast_result_dtype, maybe_convert_objects, maybe_downcast_numeric, - maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( ensure_int64, @@ -513,7 +513,6 @@ def _transform_general( """ Transform with a non-str `func`. 
""" - if maybe_use_numba(engine): numba_func, cache_key = generate_numba_func( func, engine_kwargs, kwargs, "groupby_transform" @@ -535,24 +534,23 @@ def _transform_general( if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values - indexer = self._get_index(name) - ser = klass(res, indexer) - results.append(ser) + results.append(klass(res, index=group.index)) # check for empty "results" to avoid concat ValueError if results: from pandas.core.reshape.concat import concat - result = concat(results).sort_index() + concatenated = concat(results) + result = self._set_result_index_ordered(concatenated) else: result = self.obj._constructor(dtype=np.float64) - # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs # the cython take a different path (and casting) - dtype = self._selected_obj.dtype - if is_numeric_dtype(dtype): - result = maybe_downcast_to_dtype(result, dtype) + if is_numeric_dtype(result.dtype): + common_dtype = find_common_type([self._selected_obj.dtype, result.dtype]) + if common_dtype is result.dtype: + result = maybe_downcast_numeric(result, self._selected_obj.dtype) result.name = self._selected_obj.name result.index = self._selected_obj.index diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 1a525d306e9f5..adf62c4723526 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -162,6 +162,40 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize( + "dropna,df_expected,s_expected", + [ + pytest.param( + True, + pd.DataFrame({"B": [2, 2, 1]}), + pd.Series(data=[2, 2, 1], name="B"), + marks=pytest.mark.xfail(raises=ValueError), + ), + ( + False, + pd.DataFrame({"B": [2, 2, 1, 1]}), + pd.Series(data=[2, 2, 1, 1], name="B"), + ), + ], +) +def test_slice_groupby_then_transform(dropna, df_expected, s_expected): + # 
GH35014 + + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) + gb = df.groupby("A", dropna=dropna) + + res = gb.transform(len) + tm.assert_frame_equal(res, df_expected) + + gb_slice = gb[["B"]] + res = gb_slice.transform(len) + tm.assert_frame_equal(res, df_expected) + + gb_slice = gb["B"] + res = gb["B"].transform(len) + tm.assert_series_equal(res, s_expected) + + @pytest.mark.parametrize( "dropna, tuples, outputs", [ From 0166f08f01cf04577cc033b520388e9332f73539 Mon Sep 17 00:00:00 2001 From: attack68 <24256554+attack68@users.noreply.github.com> Date: Fri, 7 Aug 2020 23:35:21 +0200 Subject: [PATCH 0464/1025] BUG: fix styler cell_ids arg so that blank style is ignored on False (#35588) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/io/formats/style.py | 2 +- pandas/tests/io/formats/test_style.py | 6 ++++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index a044a4aab284e..ade88a6127014 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -27,6 +27,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`). 
Categorical ^^^^^^^^^^^ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index fd1efa2d1b668..584f42a6cab12 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -390,7 +390,7 @@ def format_attr(pair): "is_visible": (c not in hidden_columns), } # only add an id if the cell has a style - if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""): + if self.cell_ids or (r, c) in ctx: row_dict["id"] = "_".join(cs[1:]) row_es.append(row_dict) props = [] diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 9c6910637fa7e..3ef5157655e78 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1682,6 +1682,12 @@ def f(a, b, styler): result = styler.pipe((f, "styler"), a=1, b=2) assert result == (1, 2, styler) + def test_no_cell_ids(self): + # GH 35588 + df = pd.DataFrame(data=[[0]]) + s = Styler(df, uuid="_", cell_ids=False).render() + assert s.find('' in s.render() + @td.skip_if_no_mpl class TestStylerMatplotlibDep: From 5c776455d8b12102f6343b15a36a4e69d5d3e04a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Sep 2020 17:55:21 -0700 Subject: [PATCH 0716/1025] REF: pass setitem to unbox_scalar to de-duplicate validation (#36234) --- pandas/core/arrays/datetimelike.py | 20 +++++++++++--------- pandas/core/arrays/datetimes.py | 4 ++-- pandas/core/arrays/period.py | 6 ++++-- pandas/core/arrays/timedeltas.py | 4 ++-- pandas/tests/arrays/test_datetimelike.py | 2 +- 5 files changed, 20 insertions(+), 16 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ba5bfc108f16b..bb40cf78ea006 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -183,7 +183,7 @@ def _rebox_native(cls, value: int) -> Union[int, np.datetime64, np.timedelta64]: """ raise AbstractMethodError(cls) - def _unbox_scalar(self, value: DTScalarOrNaT) -> int: + def _unbox_scalar(self, 
value: DTScalarOrNaT, setitem: bool = False) -> int: """ Unbox the integer value of a scalar `value`. @@ -191,6 +191,8 @@ def _unbox_scalar(self, value: DTScalarOrNaT) -> int: ---------- value : Period, Timestamp, Timedelta, or NaT Depending on subclass. + setitem : bool, default False + Whether to check compatiblity with setitem strictness. Returns ------- @@ -841,6 +843,7 @@ def _validate_listlike( if is_dtype_equal(value.categories.dtype, self.dtype): # TODO: do we need equal dtype or just comparable? value = value._internal_get_values() + value = extract_array(value, extract_numpy=True) if allow_object and is_object_dtype(value.dtype): pass @@ -875,8 +878,7 @@ def _validate_setitem_value(self, value): # TODO: cast_str for consistency? value = self._validate_scalar(value, msg, cast_str=False) - self._check_compatible_with(value, setitem=True) - return self._unbox(value) + return self._unbox(value, setitem=True) def _validate_insert_value(self, value): msg = f"cannot insert {type(self).__name__} with incompatible label" @@ -886,6 +888,8 @@ def _validate_insert_value(self, value): # TODO: if we dont have compat, should we raise or astype(object)? # PeriodIndex does astype(object) return value + # Note: we do not unbox here because the caller needs boxed value + # to check for freq. def _validate_where_value(self, other): msg = f"Where requires matching dtype, not {type(other)}" @@ -893,20 +897,18 @@ def _validate_where_value(self, other): other = self._validate_scalar(other, msg) else: other = self._validate_listlike(other, "where") - self._check_compatible_with(other, setitem=True) - self._check_compatible_with(other, setitem=True) - return self._unbox(other) + return self._unbox(other, setitem=True) - def _unbox(self, other) -> Union[np.int64, np.ndarray]: + def _unbox(self, other, setitem: bool = False) -> Union[np.int64, np.ndarray]: """ Unbox either a scalar with _unbox_scalar or an instance of our own type. 
""" if lib.is_scalar(other): - other = self._unbox_scalar(other) + other = self._unbox_scalar(other, setitem=setitem) else: # same type as self - self._check_compatible_with(other) + self._check_compatible_with(other, setitem=setitem) other = other.view("i8") return other diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 9f10cc84dcfcc..56e0a9861548f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -451,11 +451,11 @@ def _generate_range( def _rebox_native(cls, value: int) -> np.datetime64: return np.int64(value).view("M8[ns]") - def _unbox_scalar(self, value): + def _unbox_scalar(self, value, setitem: bool = False): if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timestamp.") if not isna(value): - self._check_compatible_with(value) + self._check_compatible_with(value, setitem=setitem) return value.value def _scalar_from_string(self, value): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index eea11bde77030..865b1680c008a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -257,11 +257,13 @@ def _generate_range(cls, start, end, periods, freq, fields): def _rebox_native(cls, value: int) -> np.int64: return np.int64(value) - def _unbox_scalar(self, value: Union[Period, NaTType]) -> int: + def _unbox_scalar( + self, value: Union[Period, NaTType], setitem: bool = False + ) -> int: if value is NaT: return value.value elif isinstance(value, self._scalar_type): - self._check_compatible_with(value) + self._check_compatible_with(value, setitem=setitem) return value.ordinal else: raise ValueError(f"'value' should be a Period. 
Got '{value}' instead.") diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 5e3c0f2b8d876..3eaf428bc64b2 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -283,10 +283,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None): def _rebox_native(cls, value: int) -> np.timedelta64: return np.int64(value).view("m8[ns]") - def _unbox_scalar(self, value): + def _unbox_scalar(self, value, setitem: bool = False): if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timedelta.") - self._check_compatible_with(value) + self._check_compatible_with(value, setitem=setitem) return value.value def _scalar_from_string(self, value): diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index d2d3766959fbf..9d316c38082af 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -244,7 +244,7 @@ def test_searchsorted(self): # GH#29884 match numpy convention on whether NaT goes # at the end or the beginning result = arr.searchsorted(pd.NaT) - if _np_version_under1p18 or self.array_cls is PeriodArray: + if np_version_under1p18 or self.array_cls is PeriodArray: # Following numpy convention, NaT goes at the beginning # (unlike NaN which goes at the end) assert result == 0 From 19efb68ae84e6226f0f2ae663d658c09358f79e0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Sep 2020 18:33:41 -0700 Subject: [PATCH 0717/1025] STY: De-privatize imported names (#36235) --- pandas/_libs/interval.pyx | 4 +- pandas/core/arrays/_arrow_utils.py | 4 +- pandas/core/arrays/interval.py | 12 +++-- pandas/core/arrays/sparse/__init__.py | 2 +- pandas/core/arrays/sparse/array.py | 4 +- pandas/core/computation/engines.py | 2 +- pandas/core/computation/eval.py | 14 ++--- pandas/core/computation/expr.py | 4 +- pandas/core/config_init.py | 4 +- pandas/core/groupby/generic.py | 10 ++-- 
pandas/core/groupby/groupby.py | 20 +++---- pandas/core/indexes/base.py | 4 +- pandas/core/indexes/interval.py | 1 - pandas/core/reshape/merge.py | 4 +- pandas/io/formats/printing.py | 2 +- pandas/tests/arrays/sparse/test_libsparse.py | 56 +++++++++++--------- pandas/tests/computation/test_compat.py | 6 +-- pandas/tests/computation/test_eval.py | 12 ++--- 18 files changed, 88 insertions(+), 77 deletions(-) diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 931ad8326c371..f8bcbcfb158b5 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -46,7 +46,7 @@ from pandas._libs.tslibs.util cimport ( is_timedelta64_object, ) -_VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) +VALID_CLOSED = frozenset(['left', 'right', 'both', 'neither']) cdef class IntervalMixin: @@ -318,7 +318,7 @@ cdef class Interval(IntervalMixin): self._validate_endpoint(left) self._validate_endpoint(right) - if closed not in _VALID_CLOSED: + if closed not in VALID_CLOSED: raise ValueError(f"invalid option for 'closed': {closed}") if not left <= right: raise ValueError("left side of interval must be <= right side") diff --git a/pandas/core/arrays/_arrow_utils.py b/pandas/core/arrays/_arrow_utils.py index 4a33e0e841f7f..c89f5554d0715 100644 --- a/pandas/core/arrays/_arrow_utils.py +++ b/pandas/core/arrays/_arrow_utils.py @@ -4,7 +4,7 @@ import numpy as np import pyarrow -from pandas.core.arrays.interval import _VALID_CLOSED +from pandas.core.arrays.interval import VALID_CLOSED _pyarrow_version_ge_015 = LooseVersion(pyarrow.__version__) >= LooseVersion("0.15") @@ -83,7 +83,7 @@ class ArrowIntervalType(pyarrow.ExtensionType): def __init__(self, subtype, closed): # attributes need to be set first before calling # super init (as that calls serialize) - assert closed in _VALID_CLOSED + assert closed in VALID_CLOSED self._closed = closed if not isinstance(subtype, pyarrow.DataType): subtype = pyarrow.type_for_alias(str(subtype)) diff --git 
a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index d76e0fd628a48..1dbd3cfc6dca6 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -5,7 +5,12 @@ from pandas._config import get_option -from pandas._libs.interval import Interval, IntervalMixin, intervals_to_interval_bounds +from pandas._libs.interval import ( + VALID_CLOSED, + Interval, + IntervalMixin, + intervals_to_interval_bounds, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender @@ -42,7 +47,6 @@ from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index -_VALID_CLOSED = {"left", "right", "both", "neither"} _interval_shared_docs = {} _shared_docs_kwargs = dict( @@ -475,7 +479,7 @@ def _validate(self): * left and right have the same missing values * left is always below right """ - if self.closed not in _VALID_CLOSED: + if self.closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {self.closed}" raise ValueError(msg) if len(self.left) != len(self.right): @@ -1012,7 +1016,7 @@ def closed(self): ) ) def set_closed(self, closed): - if closed not in _VALID_CLOSED: + if closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {closed}" raise ValueError(msg) diff --git a/pandas/core/arrays/sparse/__init__.py b/pandas/core/arrays/sparse/__init__.py index e928db499a771..e9ff4b7d4ffc2 100644 --- a/pandas/core/arrays/sparse/__init__.py +++ b/pandas/core/arrays/sparse/__init__.py @@ -5,6 +5,6 @@ BlockIndex, IntIndex, SparseArray, - _make_index, + make_sparse_index, ) from pandas.core.arrays.sparse.dtype import SparseDtype diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 47c960dc969d6..853f7bb0b0d81 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1556,7 +1556,7 @@ def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy else: indices = 
mask.nonzero()[0].astype(np.int32) - index = _make_index(length, indices, kind) + index = make_sparse_index(length, indices, kind) sparsified_values = arr[mask] if dtype is not None: sparsified_values = astype_nansafe(sparsified_values, dtype=dtype) @@ -1564,7 +1564,7 @@ def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy return sparsified_values, index, fill_value -def _make_index(length, indices, kind): +def make_sparse_index(length, indices, kind): if kind == "block" or isinstance(kind, BlockIndex): locs, lens = splib.get_blocks(indices) diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 0cdc0f530a7f3..77a378369ca34 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -130,7 +130,7 @@ def _evaluate(self) -> None: pass -_engines: Dict[str, Type[AbstractEngine]] = { +ENGINES: Dict[str, Type[AbstractEngine]] = { "numexpr": NumExprEngine, "python": PythonEngine, } diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index f6a7935142a32..630606b4d8111 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -9,8 +9,8 @@ from pandas._libs.lib import no_default from pandas.util._validators import validate_bool_kwarg -from pandas.core.computation.engines import _engines -from pandas.core.computation.expr import Expr, _parsers +from pandas.core.computation.engines import ENGINES +from pandas.core.computation.expr import PARSERS, Expr from pandas.core.computation.parsing import tokenize_string from pandas.core.computation.scope import ensure_scope @@ -43,8 +43,8 @@ def _check_engine(engine: Optional[str]) -> str: if engine is None: engine = "numexpr" if NUMEXPR_INSTALLED else "python" - if engine not in _engines: - valid_engines = list(_engines.keys()) + if engine not in ENGINES: + valid_engines = list(ENGINES.keys()) raise KeyError( f"Invalid engine '{engine}' passed, valid engines are {valid_engines}" ) @@ 
-75,9 +75,9 @@ def _check_parser(parser: str): KeyError * If an invalid parser is passed """ - if parser not in _parsers: + if parser not in PARSERS: raise KeyError( - f"Invalid parser '{parser}' passed, valid parsers are {_parsers.keys()}" + f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}" ) @@ -341,7 +341,7 @@ def eval( parsed_expr = Expr(expr, engine=engine, parser=parser, env=env) # construct the engine and evaluate the parsed expression - eng = _engines[engine] + eng = ENGINES[engine] eng_inst = eng(parsed_expr) ret = eng_inst.evaluate() diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 8cff6abc071ca..f5897277d83bf 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -782,7 +782,7 @@ def __init__( self.env = env or Scope(level=level + 1) self.engine = engine self.parser = parser - self._visitor = _parsers[parser](self.env, self.engine, self.parser) + self._visitor = PARSERS[parser](self.env, self.engine, self.parser) self.terms = self.parse() @property @@ -814,4 +814,4 @@ def names(self): return frozenset(term.name for term in com.flatten(self.terms)) -_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} +PARSERS = {"python": PythonExprVisitor, "pandas": PandasExprVisitor} diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 0c23f1b4bcdf2..bfe20551cbcfc 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -314,9 +314,9 @@ def use_numba_cb(key): def table_schema_cb(key): - from pandas.io.formats.printing import _enable_data_resource_formatter + from pandas.io.formats.printing import enable_data_resource_formatter - _enable_data_resource_formatter(cf.get_option(key)) + enable_data_resource_formatter(cf.get_option(key)) def is_terminal() -> bool: diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 72003eab24b29..e870187fc7952 100644 --- a/pandas/core/groupby/generic.py +++ 
b/pandas/core/groupby/generic.py @@ -70,9 +70,9 @@ GroupBy, _agg_template, _apply_docs, - _group_selection_context, _transform_template, get_groupby, + group_selection_context, ) from pandas.core.groupby.numba_ import generate_numba_func, split_for_numba from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same @@ -230,7 +230,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) raise NotImplementedError( "Numba engine can only be used with a single function." ) - with _group_selection_context(self): + with group_selection_context(self): data = self._selected_obj result, index = self._aggregate_with_numba( data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs @@ -685,7 +685,7 @@ def value_counts( self, normalize=False, sort=True, ascending=False, bins=None, dropna=True ): - from pandas.core.reshape.merge import _get_join_indexers + from pandas.core.reshape.merge import get_join_indexers from pandas.core.reshape.tile import cut if bins is not None and not np.iterable(bins): @@ -787,7 +787,7 @@ def value_counts( right = [diff.cumsum() - 1, codes[-1]] - _, idx = _get_join_indexers(left, right, sort=False, how="left") + _, idx = get_join_indexers(left, right, sort=False, how="left") out = np.where(idx != -1, out[idx], 0) if sort: @@ -942,7 +942,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) raise NotImplementedError( "Numba engine can only be used with a single function." 
) - with _group_selection_context(self): + with group_selection_context(self): data = self._selected_obj result, index = self._aggregate_with_numba( data, func, *args, engine_kwargs=engine_kwargs, **kwargs diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6ef2e67030881..1e3e56f4ff09f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -459,9 +459,9 @@ def f(self): @contextmanager -def _group_selection_context(groupby: "_GroupBy"): +def group_selection_context(groupby: "_GroupBy"): """ - Set / reset the _group_selection_context. + Set / reset the group_selection_context. """ groupby._set_group_selection() try: @@ -737,7 +737,7 @@ def pipe(self, func, *args, **kwargs): def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist - with _group_selection_context(self): + with group_selection_context(self): # need to setup the selection # as are not passed directly but in the grouper f = getattr(self._obj_with_exclusions, name) @@ -868,7 +868,7 @@ def f(g): # fails on *some* columns, e.g. 
a numeric operation # on a string grouper column - with _group_selection_context(self): + with group_selection_context(self): return self._python_apply_general(f, self._selected_obj) return result @@ -994,7 +994,7 @@ def _agg_general( alias: str, npfunc: Callable, ): - with _group_selection_context(self): + with group_selection_context(self): # try a cython aggregation if we can try: return self._cython_agg_general( @@ -1499,7 +1499,7 @@ def var(self, ddof: int = 1): ) else: func = lambda x: x.var(ddof=ddof) - with _group_selection_context(self): + with group_selection_context(self): return self._python_agg_general(func) @Substitution(name="groupby") @@ -1658,7 +1658,7 @@ def ohlc(self) -> DataFrame: @doc(DataFrame.describe) def describe(self, **kwargs): - with _group_selection_context(self): + with group_selection_context(self): result = self.apply(lambda x: x.describe(**kwargs)) if self.axis == 1: return result.T @@ -1963,7 +1963,7 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra nth_values = list(set(n)) nth_array = np.array(nth_values, dtype=np.intp) - with _group_selection_context(self): + with group_selection_context(self): mask_left = np.in1d(self._cumcount_array(), nth_array) mask_right = np.in1d( @@ -2226,7 +2226,7 @@ def ngroup(self, ascending: bool = True): 5 0 dtype: int64 """ - with _group_selection_context(self): + with group_selection_context(self): index = self._selected_obj.index result = self._obj_1d_constructor(self.grouper.group_info[0], index) if not ascending: @@ -2287,7 +2287,7 @@ def cumcount(self, ascending: bool = True): 5 0 dtype: int64 """ - with _group_selection_context(self): + with group_selection_context(self): index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) return self._obj_1d_constructor(cumcounts, index) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 526dae7e256b7..8014b16d07b01 100644 --- a/pandas/core/indexes/base.py +++ 
b/pandas/core/indexes/base.py @@ -3660,7 +3660,7 @@ def _join_multi(self, other, how, return_indexers=True): return result def _join_non_unique(self, other, how="left", return_indexers=False): - from pandas.core.reshape.merge import _get_join_indexers + from pandas.core.reshape.merge import get_join_indexers # We only get here if dtypes match assert self.dtype == other.dtype @@ -3668,7 +3668,7 @@ def _join_non_unique(self, other, how="left", return_indexers=False): lvalues = self._get_engine_target() rvalues = other._get_engine_target() - left_idx, right_idx = _get_join_indexers( + left_idx, right_idx = get_join_indexers( [lvalues], [rvalues], how=how, sort=True ) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3f72577c9420e..154f41bf07928 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -59,7 +59,6 @@ if TYPE_CHECKING: from pandas import CategoricalIndex # noqa:F401 -_VALID_CLOSED = {"left", "right", "both", "neither"} _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 030dec369c2be..9f19ea9aefe09 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -859,7 +859,7 @@ def _maybe_add_join_keys(self, result, left_indexer, right_indexer): def _get_join_indexers(self): """ return the join indexers """ - return _get_join_indexers( + return get_join_indexers( self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how ) @@ -1298,7 +1298,7 @@ def _validate(self, validate: str): raise ValueError("Not a valid argument for validate") -def _get_join_indexers( +def get_join_indexers( left_keys, right_keys, sort: bool = False, how: str = "inner", **kwargs ): """ diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index edc6fbfff61d7..0d2ca83f1012e 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -243,7 +243,7 
@@ def pprint_thing_encoded( return value.encode(encoding, errors) -def _enable_data_resource_formatter(enable: bool) -> None: +def enable_data_resource_formatter(enable: bool) -> None: if "IPython" not in sys.modules: # definitely not in IPython return diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py index a2f861d378e67..2d6e657debdb2 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -8,7 +8,7 @@ from pandas import Series import pandas._testing as tm -from pandas.core.arrays.sparse import BlockIndex, IntIndex, _make_index +from pandas.core.arrays.sparse import BlockIndex, IntIndex, make_sparse_index TEST_LENGTH = 20 @@ -273,41 +273,43 @@ def test_intersect_identical(self): class TestSparseIndexCommon: def test_int_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer") + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 2 tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind="integer") + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 0 tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer") + idx = make_sparse_index( + 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer" + ) assert isinstance(idx, IntIndex) assert idx.npoints == 4 tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) def test_block_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block") + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 2 tm.assert_numpy_array_equal(idx.blocs, 
np.array([2], dtype=np.int32)) tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind="block") + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 0 tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") + idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 4 tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") + idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 3 tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32)) @@ -315,7 +317,7 @@ def test_block_internal(self): def test_lookup(self): for kind in ["integer", "block"]: - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind) assert idx.lookup(-1) == -1 assert idx.lookup(0) == -1 assert idx.lookup(1) == -1 @@ -323,12 +325,14 @@ def test_lookup(self): assert idx.lookup(3) == 1 assert idx.lookup(4) == -1 - idx = _make_index(4, np.array([], dtype=np.int32), kind=kind) + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind) for i in range(-1, 5): assert idx.lookup(i) == -1 - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) + idx = make_sparse_index( + 4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind + ) assert idx.lookup(-1) == -1 assert idx.lookup(0) == 0 assert idx.lookup(1) == 1 @@ -336,7 +340,7 @@ def test_lookup(self): assert 
idx.lookup(3) == 3 assert idx.lookup(4) == -1 - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) + idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) assert idx.lookup(-1) == -1 assert idx.lookup(0) == 0 assert idx.lookup(1) == -1 @@ -346,7 +350,7 @@ def test_lookup(self): def test_lookup_array(self): for kind in ["integer", "block"]: - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind) + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind) res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) exp = np.array([-1, -1, 0], dtype=np.int32) @@ -356,11 +360,13 @@ def test_lookup_array(self): exp = np.array([-1, 0, -1, 1], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) - idx = _make_index(4, np.array([], dtype=np.int32), kind=kind) + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind) res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32)) exp = np.array([-1, -1, -1, -1], dtype=np.int32) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind) + idx = make_sparse_index( + 4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind + ) res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32)) exp = np.array([-1, 0, 2], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) @@ -369,7 +375,7 @@ def test_lookup_array(self): exp = np.array([-1, 2, 1, 3], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) + idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind) res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32)) exp = np.array([1, -1, 2, 0], dtype=np.int32) tm.assert_numpy_array_equal(res, exp) @@ -402,25 +408,25 @@ def _check(index): class TestBlockIndex: def test_block_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block") + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block") assert 
isinstance(idx, BlockIndex) assert idx.npoints == 2 tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32)) tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), kind="block") + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 0 tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32)) tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") + idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 4 tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32)) tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32)) - idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") + idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block") assert isinstance(idx, BlockIndex) assert idx.npoints == 3 tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32)) @@ -428,7 +434,7 @@ def test_block_internal(self): def test_make_block_boundary(self): for i in [5, 10, 100, 101]: - idx = _make_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block") + idx = make_sparse_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block") exp = np.arange(0, i, 2, dtype=np.int32) tm.assert_numpy_array_equal(idx.blocs, exp) @@ -514,17 +520,19 @@ def test_check_integrity(self): IntIndex(length=5, indices=[1, 3, 3]) def test_int_internal(self): - idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer") + idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 2 tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32)) - idx = _make_index(4, np.array([], dtype=np.int32), 
kind="integer") + idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer") assert isinstance(idx, IntIndex) assert idx.npoints == 0 tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32)) - idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer") + idx = make_sparse_index( + 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer" + ) assert isinstance(idx, IntIndex) assert idx.npoints == 4 tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32)) diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py index ead102f532a20..9fc3ed4800d09 100644 --- a/pandas/tests/computation/test_compat.py +++ b/pandas/tests/computation/test_compat.py @@ -5,7 +5,7 @@ from pandas.compat._optional import VERSIONS import pandas as pd -from pandas.core.computation.engines import _engines +from pandas.core.computation.engines import ENGINES import pandas.core.computation.expr as expr @@ -26,8 +26,8 @@ def test_compat(): pytest.skip("not testing numexpr version compat") -@pytest.mark.parametrize("engine", _engines) -@pytest.mark.parametrize("parser", expr._parsers) +@pytest.mark.parametrize("engine", ENGINES) +@pytest.mark.parametrize("parser", expr.PARSERS) def test_invalid_numexpr_version(engine, parser): def testit(): a, b = 1, 2 # noqa diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 72dc04e68c154..cca64a6bf487c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -19,7 +19,7 @@ import pandas._testing as tm from pandas.core.computation import pytables from pandas.core.computation.check import NUMEXPR_VERSION -from pandas.core.computation.engines import NumExprClobberingError, _engines +from pandas.core.computation.engines import ENGINES, NumExprClobberingError import pandas.core.computation.expr as expr from pandas.core.computation.expr import ( BaseExprVisitor, @@ -46,14 +46,14 @@ 
f"installed->{NUMEXPR_INSTALLED}", ), ) - for engine in _engines + for engine in ENGINES ) ) # noqa def engine(request): return request.param -@pytest.fixture(params=expr._parsers) +@pytest.fixture(params=expr.PARSERS) def parser(request): return request.param @@ -77,7 +77,7 @@ def unary_fns_for_ne(): def engine_has_neg_frac(engine): - return _engines[engine].has_neg_frac + return ENGINES[engine].has_neg_frac def _eval_single_bin(lhs, cmp1, rhs, engine): @@ -168,7 +168,7 @@ def setup_ops(self): def setup_method(self, method): self.setup_ops() self.setup_data() - self.current_engines = (engine for engine in _engines if engine != self.engine) + self.current_engines = (engine for engine in ENGINES if engine != self.engine) def teardown_method(self, method): del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses @@ -1921,7 +1921,7 @@ def test_invalid_parser(): } -@pytest.mark.parametrize("engine", _engines) +@pytest.mark.parametrize("engine", ENGINES) @pytest.mark.parametrize("parser", _parsers) def test_disallowed_nodes(engine, parser): VisitorClass = _parsers[parser] From ac21a2af21e84e10666cfaf132ab574a2d1e89ae Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Wed, 9 Sep 2020 17:33:10 +0700 Subject: [PATCH 0718/1025] REF: simplify CSVFormatter (#36046) * REF: extract properties cols and has_mi_columns * REF: extract property chunksize * REF: extract property quotechar * REF: extract properties data_index and nlevels * REF: refactor _save_chunk * REF: refactor _save * REF: extract method _save_body * REF: reorder _save-like methods * REF: extract compression property * REF: Extract property index_label * REF: extract helper properties * REF: delete local variables in _save_header * REF: extract method _get_header_rows * REF: move check for header into _save function * TYP: add several type annotations * FIX: fix index labels * FIX: fix multiindex * FIX: fix test failures on compression Needed to eliminate 
compression setter due to the interdependencies between ioargs and compression. * REF: eliminate preallocation of self.data * REF: extract method _convert_to_native_types * REF: rename regular -> flat as reviewed * TYP: add type annotations as reviewed * REF: refactor number formatting Replace _convert_to_native_types method in favor of a number formatting dictionary. * FIX: mypy error with index_label * FIX: reorder if-statements in index_label To make sure that the newer mypy (v0.782) passes. * TYP: move IndexLabel to pandas._typing This eliminates repetition of the type annotations for index label in multiple places. * TYP: quotechar, has_mi_columns, _need_to_save... * TYP: chunksize, but ignored assignment check For some reason mypy would not recognize that chunksize turns from Optional[int] to int inside the setter. Even setting an intentional assertion ``assert chunksize is not None`` does not help. * TYP: cols property Limitations: - ignore type[assignment] error. - Created additional method _refine_cols to allow conversion from Optional[Sequence[Label]] to Sequence[Label]. * TYP: nlevels and _has_aliases * CLN: move GH21227 check to pandas/io/common.py * TYP: remove redundant bool from IndexLabel type * TYP: add to _get_index_label... 
methods * TYP: use Iterator instead of Generator * TYP: explicitly use List type * TYP: correct dict typing * TYP: remaining properties --- pandas/_typing.py | 2 + pandas/core/generic.py | 3 +- pandas/io/common.py | 15 ++ pandas/io/formats/csvs.py | 361 +++++++++++++++++++------------------- 4 files changed, 202 insertions(+), 179 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index b237013ac7805..7aef5c02e290f 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -15,6 +15,7 @@ List, Mapping, Optional, + Sequence, Type, TypeVar, Union, @@ -82,6 +83,7 @@ Axis = Union[str, int] Label = Optional[Hashable] +IndexLabel = Optional[Union[Label, Sequence[Label]]] Level = Union[Label, int] Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 40f0c6200e835..fffd2e068ebcf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -40,6 +40,7 @@ CompressionOptions, FilePathOrBuffer, FrameOrSeries, + IndexLabel, JSONSerializable, Label, Level, @@ -3160,7 +3161,7 @@ def to_csv( columns: Optional[Sequence[Label]] = None, header: Union[bool_t, List[str]] = True, index: bool_t = True, - index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, + index_label: IndexLabel = None, mode: str = "w", encoding: Optional[str] = None, compression: CompressionOptions = "infer", diff --git a/pandas/io/common.py b/pandas/io/common.py index 3f130401558dd..f177e08ac0089 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -208,6 +208,21 @@ def get_filepath_or_buffer( # handle compression dict compression_method, compression = get_compression_method(compression) compression_method = infer_compression(filepath_or_buffer, compression_method) + + # GH21227 internal compression is not used for non-binary handles. 
+ if ( + compression_method + and hasattr(filepath_or_buffer, "write") + and mode + and "b" not in mode + ): + warnings.warn( + "compression has no effect when passing a non-binary object as input.", + RuntimeWarning, + stacklevel=2, + ) + compression_method = None + compression = dict(compression, method=compression_method) # bz2 and xz do not write the byte order mark for utf-16 and utf-32 diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 15cd5c026c6b6..90ab6f61f4d74 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,13 +5,18 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Hashable, List, Optional, Sequence, Union -import warnings +from typing import Any, Dict, Hashable, Iterator, List, Optional, Sequence, Union import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import ( + CompressionOptions, + FilePathOrBuffer, + IndexLabel, + Label, + StorageOptions, +) from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -21,6 +26,8 @@ ) from pandas.core.dtypes.missing import notna +from pandas.core.indexes.api import Index + from pandas.io.common import get_filepath_or_buffer, get_handle @@ -32,10 +39,10 @@ def __init__( sep: str = ",", na_rep: str = "", float_format: Optional[str] = None, - cols=None, + cols: Optional[Sequence[Label]] = None, header: Union[bool, Sequence[Hashable]] = True, index: bool = True, - index_label: Optional[Union[bool, Hashable, Sequence[Hashable]]] = None, + index_label: IndexLabel = None, mode: str = "w", encoding: Optional[str] = None, errors: str = "strict", @@ -43,7 +50,7 @@ def __init__( quoting: Optional[int] = None, line_terminator="\n", chunksize: Optional[int] = None, - quotechar='"', + quotechar: Optional[str] = '"', date_format: Optional[str] = None, doublequote: bool = True, escapechar: Optional[str] = None, @@ -52,16 
+59,19 @@ def __init__( ): self.obj = obj + self.encoding = encoding or "utf-8" + if path_or_buf is None: path_or_buf = StringIO() ioargs = get_filepath_or_buffer( path_or_buf, - encoding=encoding, + encoding=self.encoding, compression=compression, mode=mode, storage_options=storage_options, ) + self.compression = ioargs.compression.pop("method") self.compression_args = ioargs.compression self.path_or_buf = ioargs.filepath_or_buffer @@ -72,46 +82,79 @@ def __init__( self.na_rep = na_rep self.float_format = float_format self.decimal = decimal - self.header = header self.index = index self.index_label = index_label - if encoding is None: - encoding = "utf-8" - self.encoding = encoding self.errors = errors + self.quoting = quoting or csvlib.QUOTE_MINIMAL + self.quotechar = quotechar + self.doublequote = doublequote + self.escapechar = escapechar + self.line_terminator = line_terminator or os.linesep + self.date_format = date_format + self.cols = cols # type: ignore[assignment] + self.chunksize = chunksize # type: ignore[assignment] + + @property + def index_label(self) -> IndexLabel: + return self._index_label + + @index_label.setter + def index_label(self, index_label: IndexLabel) -> None: + if index_label is not False: + if index_label is None: + index_label = self._get_index_label_from_obj() + elif not isinstance(index_label, (list, tuple, np.ndarray, ABCIndexClass)): + # given a string for a DF with Index + index_label = [index_label] + self._index_label = index_label + + def _get_index_label_from_obj(self) -> List[str]: + if isinstance(self.obj.index, ABCMultiIndex): + return self._get_index_label_multiindex() + else: + return self._get_index_label_flat() + + def _get_index_label_multiindex(self) -> List[str]: + return [name or "" for name in self.obj.index.names] - if quoting is None: - quoting = csvlib.QUOTE_MINIMAL - self.quoting = quoting + def _get_index_label_flat(self) -> List[str]: + index_label = self.obj.index.name + return [""] if index_label is None 
else [index_label] - if quoting == csvlib.QUOTE_NONE: + @property + def quotechar(self) -> Optional[str]: + if self.quoting != csvlib.QUOTE_NONE: # prevents crash in _csv - quotechar = None - self.quotechar = quotechar + return self._quotechar + return None - self.doublequote = doublequote - self.escapechar = escapechar + @quotechar.setter + def quotechar(self, quotechar: Optional[str]) -> None: + self._quotechar = quotechar - self.line_terminator = line_terminator or os.linesep + @property + def has_mi_columns(self) -> bool: + return bool(isinstance(self.obj.columns, ABCMultiIndex)) - self.date_format = date_format + @property + def cols(self) -> Sequence[Label]: + return self._cols - self.has_mi_columns = isinstance(obj.columns, ABCMultiIndex) + @cols.setter + def cols(self, cols: Optional[Sequence[Label]]) -> None: + self._cols = self._refine_cols(cols) + def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # validate mi options if self.has_mi_columns: if cols is not None: - raise TypeError("cannot specify cols with a MultiIndex on the columns") + msg = "cannot specify cols with a MultiIndex on the columns" + raise TypeError(msg) if cols is not None: if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types( - na_rep=na_rep, - float_format=float_format, - date_format=date_format, - quoting=self.quoting, - ) + cols = cols.to_native_types(**self._number_format) else: cols = list(cols) self.obj = self.obj.loc[:, cols] @@ -120,58 +163,90 @@ def __init__( # and make sure sure cols is just a list of labels cols = self.obj.columns if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types( - na_rep=na_rep, - float_format=float_format, - date_format=date_format, - quoting=self.quoting, - ) + return cols.to_native_types(**self._number_format) else: - cols = list(cols) + assert isinstance(cols, Sequence) + return list(cols) - # save it - self.cols = cols + @property + def _number_format(self) -> Dict[str, Any]: + """Dictionary used 
for storing number formatting settings.""" + return dict( + na_rep=self.na_rep, + float_format=self.float_format, + date_format=self.date_format, + quoting=self.quoting, + decimal=self.decimal, + ) - # preallocate data 2d list - ncols = self.obj.shape[-1] - self.data = [None] * ncols + @property + def chunksize(self) -> int: + return self._chunksize + @chunksize.setter + def chunksize(self, chunksize: Optional[int]) -> None: if chunksize is None: chunksize = (100000 // (len(self.cols) or 1)) or 1 - self.chunksize = int(chunksize) + assert chunksize is not None + self._chunksize = int(chunksize) - self.data_index = obj.index + @property + def data_index(self) -> Index: + data_index = self.obj.index if ( - isinstance(self.data_index, (ABCDatetimeIndex, ABCPeriodIndex)) - and date_format is not None + isinstance(data_index, (ABCDatetimeIndex, ABCPeriodIndex)) + and self.date_format is not None ): - from pandas import Index - - self.data_index = Index( - [x.strftime(date_format) if notna(x) else "" for x in self.data_index] + data_index = Index( + [x.strftime(self.date_format) if notna(x) else "" for x in data_index] ) + return data_index + + @property + def nlevels(self) -> int: + if self.index: + return getattr(self.data_index, "nlevels", 1) + else: + return 0 + + @property + def _has_aliases(self) -> bool: + return isinstance(self.header, (tuple, list, np.ndarray, ABCIndexClass)) + + @property + def _need_to_save_header(self) -> bool: + return bool(self._has_aliases or self.header) + + @property + def write_cols(self) -> Sequence[Label]: + if self._has_aliases: + assert not isinstance(self.header, bool) + if len(self.header) != len(self.cols): + raise ValueError( + f"Writing {len(self.cols)} cols but got {len(self.header)} aliases" + ) + else: + return self.header + else: + return self.cols + + @property + def encoded_labels(self) -> List[Label]: + encoded_labels: List[Label] = [] + + if self.index and self.index_label: + assert isinstance(self.index_label, 
Sequence) + encoded_labels = list(self.index_label) - self.nlevels = getattr(self.data_index, "nlevels", 1) - if not index: - self.nlevels = 0 + if not self.has_mi_columns or self._has_aliases: + encoded_labels += list(self.write_cols) + + return encoded_labels def save(self) -> None: """ Create the writer & save. """ - # GH21227 internal compression is not used for non-binary handles. - if ( - self.compression - and hasattr(self.path_or_buf, "write") - and "b" not in self.mode - ): - warnings.warn( - "compression has no effect when passing a non-binary object as input.", - RuntimeWarning, - stacklevel=2, - ) - self.compression = None - # get a handle or wrap an existing handle to take care of 1) compression and # 2) text -> byte conversion f, handles = get_handle( @@ -215,133 +290,63 @@ def save(self) -> None: for _fh in handles: _fh.close() - def _save_header(self): - writer = self.writer - obj = self.obj - index_label = self.index_label - cols = self.cols - has_mi_columns = self.has_mi_columns - header = self.header - encoded_labels: List[str] = [] - - has_aliases = isinstance(header, (tuple, list, np.ndarray, ABCIndexClass)) - if not (has_aliases or self.header): - return - if has_aliases: - if len(header) != len(cols): - raise ValueError( - f"Writing {len(cols)} cols but got {len(header)} aliases" - ) - else: - write_cols = header - else: - write_cols = cols - - if self.index: - # should write something for index label - if index_label is not False: - if index_label is None: - if isinstance(obj.index, ABCMultiIndex): - index_label = [] - for i, name in enumerate(obj.index.names): - if name is None: - name = "" - index_label.append(name) - else: - index_label = obj.index.name - if index_label is None: - index_label = [""] - else: - index_label = [index_label] - elif not isinstance( - index_label, (list, tuple, np.ndarray, ABCIndexClass) - ): - # given a string for a DF with Index - index_label = [index_label] - - encoded_labels = list(index_label) - else: - 
encoded_labels = [] - - if not has_mi_columns or has_aliases: - encoded_labels += list(write_cols) - writer.writerow(encoded_labels) - else: - # write out the mi - columns = obj.columns - - # write out the names for each level, then ALL of the values for - # each level - for i in range(columns.nlevels): - - # we need at least 1 index column to write our col names - col_line = [] - if self.index: - - # name is the first column - col_line.append(columns.names[i]) - - if isinstance(index_label, list) and len(index_label) > 1: - col_line.extend([""] * (len(index_label) - 1)) - - col_line.extend(columns._get_level_values(i)) - - writer.writerow(col_line) - - # Write out the index line if it's not empty. - # Otherwise, we will print out an extraneous - # blank line between the mi and the data rows. - if encoded_labels and set(encoded_labels) != {""}: - encoded_labels.extend([""] * len(columns)) - writer.writerow(encoded_labels) - def _save(self) -> None: - self._save_header() + if self._need_to_save_header: + self._save_header() + self._save_body() + def _save_header(self) -> None: + if not self.has_mi_columns or self._has_aliases: + self.writer.writerow(self.encoded_labels) + else: + for row in self._generate_multiindex_header_rows(): + self.writer.writerow(row) + + def _generate_multiindex_header_rows(self) -> Iterator[List[Label]]: + columns = self.obj.columns + for i in range(columns.nlevels): + # we need at least 1 index column to write our col names + col_line = [] + if self.index: + # name is the first column + col_line.append(columns.names[i]) + + if isinstance(self.index_label, list) and len(self.index_label) > 1: + col_line.extend([""] * (len(self.index_label) - 1)) + + col_line.extend(columns._get_level_values(i)) + yield col_line + + # Write out the index line if it's not empty. + # Otherwise, we will print out an extraneous + # blank line between the mi and the data rows. 
+ if self.encoded_labels and set(self.encoded_labels) != {""}: + yield self.encoded_labels + [""] * len(columns) + + def _save_body(self) -> None: nrows = len(self.data_index) - - # write in chunksize bites - chunksize = self.chunksize - chunks = int(nrows / chunksize) + 1 - + chunks = int(nrows / self.chunksize) + 1 for i in range(chunks): - start_i = i * chunksize - end_i = min((i + 1) * chunksize, nrows) + start_i = i * self.chunksize + end_i = min(start_i + self.chunksize, nrows) if start_i >= end_i: break - self._save_chunk(start_i, end_i) def _save_chunk(self, start_i: int, end_i: int) -> None: - data_index = self.data_index + ncols = self.obj.shape[-1] + data = [None] * ncols # create the data for a chunk slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] - blocks = df._mgr.blocks - - for i in range(len(blocks)): - b = blocks[i] - d = b.to_native_types( - na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting, - ) - for col_loc, col in zip(b.mgr_locs, d): - # self.data is a preallocated list - self.data[col_loc] = col + for block in df._mgr.blocks: + d = block.to_native_types(**self._number_format) - ix = data_index.to_native_types( - slicer=slicer, - na_rep=self.na_rep, - float_format=self.float_format, - decimal=self.decimal, - date_format=self.date_format, - quoting=self.quoting, - ) + for col_loc, col in zip(block.mgr_locs, d): + data[col_loc] = col - libwriters.write_csv_rows(self.data, ix, self.nlevels, self.cols, self.writer) + ix = self.data_index.to_native_types(slicer=slicer, **self._number_format) + libwriters.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) From 21d38477411403154ebb3e9c6d048b038b2ffdd8 Mon Sep 17 00:00:00 2001 From: Satrio H Wicaksono <44076327+satrio-hw@users.noreply.github.com> Date: Wed, 9 Sep 2020 20:24:14 +0700 Subject: [PATCH 0719/1025] CLN: remove unnecessary trailing commas on issues #35925 (#36193) --- 
pandas/tests/arrays/categorical/test_replace.py | 8 ++------ pandas/tests/arrays/test_array.py | 8 ++++---- pandas/tests/arrays/test_timedeltas.py | 2 +- pandas/tests/base/test_conversion.py | 7 ++----- pandas/tests/dtypes/test_missing.py | 2 +- pandas/tests/extension/base/methods.py | 8 ++------ pandas/tests/extension/test_sparse.py | 4 +--- pandas/tests/frame/indexing/test_setitem.py | 8 ++------ 8 files changed, 15 insertions(+), 32 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py index b9ac3ce9a37ae..8b784fde1d3c5 100644 --- a/pandas/tests/arrays/categorical/test_replace.py +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -43,9 +43,5 @@ def test_replace(to_replace, value, expected, flip_categories): # the replace call loses categorical dtype expected = pd.Series(np.asarray(expected)) - tm.assert_series_equal( - expected, result, check_category_order=False, - ) - tm.assert_series_equal( - expected, s, check_category_order=False, - ) + tm.assert_series_equal(expected, result, check_category_order=False) + tm.assert_series_equal(expected, s, check_category_order=False) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index a0525aa511ee2..304e1c80a3f77 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -35,7 +35,7 @@ np.dtype("float32"), PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))), ), - (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2]),), + (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2])), # String alias passes through to NumPy ([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))), # Period alias @@ -120,10 +120,10 @@ (pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # String (["a", None], "string", StringArray._from_sequence(["a", None])), - (["a", None], pd.StringDtype(), 
StringArray._from_sequence(["a", None]),), + (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])), # Boolean ([True, None], "boolean", BooleanArray._from_sequence([True, None])), - ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None]),), + ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])), # Index (pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))), # Series[EA] returns the EA @@ -174,7 +174,7 @@ def test_array_copy(): period_array(["2000", "2001"], freq="D"), ), # interval - ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2]),), + ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])), # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index c86b4f71ee592..a32529cb58ba3 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -46,7 +46,7 @@ def test_incorrect_dtype_raises(self): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category") with pytest.raises( - ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]", + ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]" ): TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64")) diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index b688a048cbe8e..b5595ba220a15 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -183,7 +183,7 @@ def test_iter_box(self): PeriodArray, pd.core.dtypes.dtypes.PeriodDtype("A-DEC"), ), - (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval",), + (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"), # This test is currently failing for datetime64[ns] and timedelta64[ns]. 
# The NumPy type system is sufficient for representing these types, so # we just use NumPy for Series / DataFrame columns of these types (so @@ -285,10 +285,7 @@ def test_array_multiindex_raises(): pd.core.arrays.period_array(["2000", "2001"], freq="D"), np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]), ), - ( - pd.core.arrays.integer_array([0, np.nan]), - np.array([0, pd.NA], dtype=object), - ), + (pd.core.arrays.integer_array([0, np.nan]), np.array([0, pd.NA], dtype=object)), ( IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object), diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index 04dde08de082d..a642b23379c6f 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -373,7 +373,7 @@ def test_array_equivalent(dtype_equal): ) # The rest are not dtype_equal assert not array_equivalent( - DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern"), + DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern") ) assert not array_equivalent( DatetimeIndex([0, np.nan], tz="CET"), diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 5e1cf30efd534..23e20a2c0903a 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -92,18 +92,14 @@ def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_valu assert data_missing_for_sorting.argmax() == 0 assert data_missing_for_sorting.argmin() == 2 - @pytest.mark.parametrize( - "method", ["argmax", "argmin"], - ) + @pytest.mark.parametrize("method", ["argmax", "argmin"]) def test_argmin_argmax_empty_array(self, method, data): # GH 24382 err_msg = "attempt to get" with pytest.raises(ValueError, match=err_msg): getattr(data[:0], method)() - @pytest.mark.parametrize( - "method", ["argmax", "argmin"], - ) + @pytest.mark.parametrize("method", ["argmax", "argmin"]) 
def test_argmin_argmax_all_na(self, method, data, na_value): # all missing with skipna=True is the same as emtpy err_msg = "attempt to get" diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index b411ca1c482a4..d11cfd219a443 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -316,9 +316,7 @@ def test_shift_0_periods(self, data): data._sparse_values[0] = data._sparse_values[1] assert result._sparse_values[0] != result._sparse_values[1] - @pytest.mark.parametrize( - "method", ["argmax", "argmin"], - ) + @pytest.mark.parametrize("method", ["argmax", "argmin"]) def test_argmin_argmax_all_na(self, method, data, na_value): # overriding because Sparse[int64, 0] cannot handle na_value self._check_unsupported(data) diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index c5945edfd3127..8313ab0b99bac 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -108,7 +108,7 @@ def test_setitem_timestamp_empty_columns(self): df["now"] = Timestamp("20130101", tz="UTC") expected = DataFrame( - [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"], + [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"] ) tm.assert_frame_equal(df, expected) @@ -158,11 +158,7 @@ def test_setitem_dict_preserves_dtypes(self): } ) for idx, b in enumerate([1, 2, 3]): - df.loc[df.shape[0]] = { - "a": int(idx), - "b": float(b), - "c": float(b), - } + df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)} tm.assert_frame_equal(df, expected) @pytest.mark.parametrize( From 363fb5db8551645b616b7582ae47768ed464ab77 Mon Sep 17 00:00:00 2001 From: danchev <12420863+danchev@users.noreply.github.com> Date: Wed, 9 Sep 2020 10:12:05 -0500 Subject: [PATCH 0720/1025] Fixed a broken JSON Table Schema link (#36246) --- doc/source/user_guide/io.rst | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 43030d76d945a..bf6575a8836f5 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2412,7 +2412,7 @@ indicate missing values and the subsequent read cannot distinguish the intent. os.remove('test.json') -.. _Table Schema: https://specs.frictionlessdata.io/json-table-schema/ +.. _Table Schema: https://specs.frictionlessdata.io/table-schema/ HTML ---- From 9d78d169cbaf3e271625081000f14eb174ec0fa4 Mon Sep 17 00:00:00 2001 From: Nikhil Choudhary <49715980+Nikhil1O1@users.noreply.github.com> Date: Thu, 10 Sep 2020 03:55:31 +0530 Subject: [PATCH 0721/1025] DOC: Improve Index docstrings (#36239) --- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/numeric.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8014b16d07b01..67456096e8681 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -195,7 +195,7 @@ def _new_Index(cls, d): class Index(IndexOpsMixin, PandasObject): """ - Immutable ndarray implementing an ordered, sliceable set. The basic object + Immutable sequence used for indexing and alignment. The basic object storing axis labels for all pandas objects. Parameters diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 079f43cb2c66b..125602ef2054a 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -189,7 +189,7 @@ def _union(self, other, sort): _num_index_shared_docs[ "class_descr" ] = """ - Immutable ndarray implementing an ordered, sliceable set. The basic object + Immutable sequence used for indexing and alignment. The basic object storing axis labels for all pandas objects. %(klass)s is a special case of `Index` with purely %(ltype)s labels. %(extra)s. 
From ada684931ac6df21f22b687455518a328b40dca2 Mon Sep 17 00:00:00 2001 From: Matthias Bussonnier Date: Thu, 10 Sep 2020 11:53:40 -0700 Subject: [PATCH 0722/1025] DOC: Rst Formatting, make sure continuation prompt are used. (#35317) --- pandas/core/indexing.py | 2 +- pandas/io/excel/_base.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index fe2fec1c52063..51031d9ab1153 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -59,7 +59,7 @@ class _IndexSlice: >>> midx = pd.MultiIndex.from_product([['A0','A1'], ['B0','B1','B2','B3']]) >>> columns = ['foo', 'bar'] >>> dfmi = pd.DataFrame(np.arange(16).reshape((len(midx), len(columns))), - index=midx, columns=columns) + ... index=midx, columns=columns) Using the default slice command: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 87343c22ad4e9..d597731ed0ac4 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -601,8 +601,8 @@ class ExcelWriter(metaclass=abc.ABCMeta): You can set the date format or datetime format: >>> with ExcelWriter('path_to_file.xlsx', - date_format='YYYY-MM-DD', - datetime_format='YYYY-MM-DD HH:MM:SS') as writer: + ... date_format='YYYY-MM-DD', + ... datetime_format='YYYY-MM-DD HH:MM:SS') as writer: ... 
df.to_excel(writer) You can also append to an existing Excel file: From b34704be75bbdc464784958feeb0213baabf1700 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 10 Sep 2020 18:11:43 -0700 Subject: [PATCH 0723/1025] BUG: DataFrame.any with axis=1 and bool_only=True (#36106) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/frame.py | 14 +++++--------- pandas/tests/reductions/test_reductions.py | 7 +++++++ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2aac2596c18cb..ba556c8dcca54 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -246,6 +246,7 @@ Timezones Numeric ^^^^^^^ - Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) +- Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`) - Conversion diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 59cf4c0e2f81d..3eed10917843b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8639,15 +8639,12 @@ def func(values): else: return op(values, axis=axis, skipna=skipna, **kwds) - def _get_data(axis_matters: bool) -> DataFrame: + def _get_data() -> DataFrame: if filter_type is None: data = self._get_numeric_data() elif filter_type == "bool": - if axis_matters: - # GH#25101, GH#24434 - data = self._get_bool_data() if axis == 0 else self - else: - data = self._get_bool_data() + # GH#25101, GH#24434 + data = self._get_bool_data() else: # pragma: no cover msg = ( f"Generating numeric_only data with filter_type {filter_type} " @@ -8659,7 +8656,7 @@ def _get_data(axis_matters: bool) -> DataFrame: if numeric_only is not None: df = self if numeric_only is True: - df = _get_data(axis_matters=True) + df = _get_data() if axis == 1: df = df.T axis = 0 @@ -8720,8 +8717,7 @@ def blk_func(values): except TypeError: # e.g. 
in nanops trying to convert strs to float - # TODO: why doesnt axis matter here? - data = _get_data(axis_matters=False) + data = _get_data() labels = data._get_agg_axis(axis) values = data.values diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index a112bc80b60b0..bbf2d9f1f0784 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -914,6 +914,13 @@ def test_all_any_boolean(self): tm.assert_series_equal(s.all(level=0), Series([False, True, False])) tm.assert_series_equal(s.any(level=0), Series([False, True, True])) + def test_any_axis1_bool_only(self): + # GH#32432 + df = pd.DataFrame({"A": [True, False], "B": [1, 2]}) + result = df.any(axis=1, bool_only=True) + expected = pd.Series([True, False]) + tm.assert_series_equal(result, expected) + def test_timedelta64_analytics(self): # index min/max From 3159230302ab26e1f42a953848a1612c7b09da1e Mon Sep 17 00:00:00 2001 From: Justin Essert Date: Fri, 11 Sep 2020 09:03:02 -0400 Subject: [PATCH 0724/1025] BUG: instantiation using a dict with a period scalar (#35966) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/construction.py | 2 +- pandas/core/dtypes/cast.py | 1 - pandas/core/indexes/interval.py | 2 ++ pandas/tests/dtypes/cast/test_infer_dtype.py | 4 +-- pandas/tests/frame/test_constructors.py | 18 ++++++++++++ pandas/tests/series/test_constructors.py | 29 +++++++++++++++++++- 7 files changed, 51 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ba556c8dcca54..f2f56ee81b8d4 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -334,7 +334,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- +- Fixed Bug where :class:`DataFrame` column set to scalar extension type via a dict instantion was considered an object type rather than the extension type (:issue:`35965`) - diff --git a/pandas/core/construction.py 
b/pandas/core/construction.py index 3812c306b8eb4..0993328aef8de 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -472,7 +472,7 @@ def sanitize_array( # figure out the dtype from the value (upcast if necessary) if dtype is None: - dtype, value = infer_dtype_from_scalar(value) + dtype, value = infer_dtype_from_scalar(value, pandas_dtype=True) else: # need to possibly convert the value here value = maybe_cast_to_datetime(value, dtype) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8f9c0cf7a01db..ba1b0b075936d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -709,7 +709,6 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, elif pandas_dtype: if lib.is_period(val): dtype = PeriodDtype(freq=val.freq) - val = val.ordinal elif lib.is_interval(val): subtype = infer_dtype_from_scalar(val.left, pandas_dtype=True)[0] dtype = IntervalDtype(subtype=subtype) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 154f41bf07928..ad0a7ea32a1cc 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -589,6 +589,8 @@ def _maybe_convert_i8(self, key): if scalar: # Timestamp/Timedelta key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True) + if lib.is_period(key): + key_i8 = key.ordinal else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py index 70d38aad951cc..157adacbdfdf7 100644 --- a/pandas/tests/dtypes/cast/test_infer_dtype.py +++ b/pandas/tests/dtypes/cast/test_infer_dtype.py @@ -84,13 +84,11 @@ def test_infer_dtype_from_period(freq, pandas_dtype): if pandas_dtype: exp_dtype = f"period[{freq}]" - exp_val = p.ordinal else: exp_dtype = np.object_ - exp_val = p assert dtype == exp_dtype - assert val == exp_val + assert val == p @pytest.mark.parametrize( diff --git 
a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 0d1004809f7f1..eb334e811c5a4 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -717,6 +717,24 @@ def test_constructor_period_dict(self): assert df["a"].dtype == a.dtype assert df["b"].dtype == b.dtype + @pytest.mark.parametrize( + "data,dtype", + [ + (pd.Period("2012-01", freq="M"), "period[M]"), + (pd.Period("2012-02-01", freq="D"), "period[D]"), + (Interval(left=0, right=5), IntervalDtype("int64")), + (Interval(left=0.1, right=0.5), IntervalDtype("float64")), + ], + ) + def test_constructor_period_dict_scalar(self, data, dtype): + # scalar periods + df = DataFrame({"a": data}, index=[0]) + assert df["a"].dtype == dtype + + expected = DataFrame(index=[0], columns=["a"], data=data) + + tm.assert_frame_equal(df, expected) + @pytest.mark.parametrize( "data,dtype", [ diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ce078059479b4..0fb8c5955a2e7 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -8,16 +8,23 @@ from pandas._libs import iNaT, lib from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64tz_dtype -from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, + DatetimeTZDtype, + IntervalDtype, + PeriodDtype, +) import pandas as pd from pandas import ( Categorical, DataFrame, Index, + Interval, IntervalIndex, MultiIndex, NaT, + Period, Series, Timestamp, date_range, @@ -1075,6 +1082,26 @@ def test_constructor_dict_order(self): expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize( + "data,dtype", + [ + (Period("2020-01"), PeriodDtype("M")), + (Interval(left=0, right=5), IntervalDtype("int64")), + ( + Timestamp("2011-01-01", tz="US/Eastern"), + DatetimeTZDtype(tz="US/Eastern"), + ), + 
], + ) + def test_constructor_dict_extension(self, data, dtype): + d = {"a": data} + result = Series(d, index=["a"]) + expected = Series(data, index=["a"], dtype=dtype) + + assert result.dtype == dtype + + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("value", [2, np.nan, None, float("nan")]) def test_constructor_dict_nan_key(self, value): # GH 18480 From 3581c90d3e5507aac53154cb58eec974a237df31 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Sep 2020 06:04:37 -0700 Subject: [PATCH 0725/1025] REF: share more EA methods (#36209) --- pandas/core/arrays/_mixins.py | 11 +++++++-- pandas/core/arrays/categorical.py | 19 +++------------ pandas/core/arrays/datetimelike.py | 38 ++++-------------------------- pandas/core/arrays/numpy_.py | 17 ++++++------- 4 files changed, 26 insertions(+), 59 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 8b79f8ce66756..e9d8671b69c78 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -6,7 +6,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc -from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.algorithms import take, unique from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray @@ -102,6 +102,9 @@ def T(self: _T) -> _T: # ------------------------------------------------------------------------ + def _values_for_argsort(self): + return self._ndarray + def copy(self: _T) -> _T: new_data = self._ndarray.copy() return self._from_backing_data(new_data) @@ -135,7 +138,11 @@ def _concat_same_type(cls, to_concat, axis: int = 0): @doc(ExtensionArray.searchsorted) def searchsorted(self, value, side="left", sorter=None): - return searchsorted(self._ndarray, value, side=side, sorter=sorter) + value = self._validate_searchsorted_value(value) + return self._ndarray.searchsorted(value, side=side, sorter=sorter) + + 
def _validate_searchsorted_value(self, value): + return value @doc(ExtensionArray.shift) def shift(self, periods=1, fill_value=None, axis=0): diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a2b5b54c55490..66d917b07305c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -12,7 +12,7 @@ from pandas._libs import NaT, algos as libalgos, hashtable as htable, lib from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv -from pandas.util._decorators import cache_readonly, deprecate_kwarg, doc +from pandas.util._decorators import cache_readonly, deprecate_kwarg from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core.dtypes.cast import ( @@ -45,12 +45,7 @@ import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.base import ( - ExtensionArray, - NoNewAttributesMixin, - PandasObject, - _shared_docs, -) +from pandas.core.base import ExtensionArray, NoNewAttributesMixin, PandasObject import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing @@ -1315,11 +1310,6 @@ def memory_usage(self, deep=False): """ return self._codes.nbytes + self.dtype.categories.memory_usage(deep=deep) - @doc(_shared_docs["searchsorted"], klass="Categorical") - def searchsorted(self, value, side="left", sorter=None): - value = self._validate_searchsorted_value(value) - return self.codes.searchsorted(value, side=side, sorter=sorter) - def isna(self): """ Detect missing values @@ -1428,9 +1418,6 @@ def check_for_ordered(self, op): "Categorical to an ordered one\n" ) - def _values_for_argsort(self): - return self._codes - def argsort(self, ascending=True, 
kind="quicksort", **kwargs): """ Return the indices that would sort the Categorical. @@ -1879,7 +1866,7 @@ def __getitem__(self, key): if result.ndim > 1: deprecate_ndim_indexing(result) return result - return self._constructor(result, dtype=self.dtype, fastpath=True) + return self._from_backing_data(result) def __setitem__(self, key, value): """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index bb40cf78ea006..6302b48cb1978 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -545,15 +545,18 @@ def __getitem__(self, key): result = self._ndarray[key] if self.ndim == 1: return self._box_func(result) - return self._simple_new(result, dtype=self.dtype) + return self._from_backing_data(result) key = self._validate_getitem_key(key) result = self._ndarray[key] if lib.is_scalar(result): return self._box_func(result) + result = self._from_backing_data(result) + freq = self._get_getitem_freq(key) - return self._simple_new(result, dtype=self.dtype, freq=freq) + result._freq = freq + return result def _validate_getitem_key(self, key): if com.is_bool_indexer(key): @@ -714,9 +717,6 @@ def _values_for_factorize(self): def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) - def _values_for_argsort(self): - return self._ndarray - # ------------------------------------------------------------------ # Validation Methods # TODO: try to de-duplicate these, ensure identical behavior @@ -917,34 +917,6 @@ def _unbox(self, other, setitem: bool = False) -> Union[np.int64, np.ndarray]: # These are not part of the EA API, but we implement them because # pandas assumes they're there. - def searchsorted(self, value, side="left", sorter=None): - """ - Find indices where elements should be inserted to maintain order. 
- - Find the indices into a sorted array `self` such that, if the - corresponding elements in `value` were inserted before the indices, - the order of `self` would be preserved. - - Parameters - ---------- - value : array_like - Values to insert into `self`. - side : {'left', 'right'}, optional - If 'left', the index of the first suitable location found is given. - If 'right', return the last such index. If there is no suitable - index, return either 0 or N (where N is the length of `self`). - sorter : 1-D array_like, optional - Optional array of integer indices that sort `self` into ascending - order. They are typically the result of ``np.argsort``. - - Returns - ------- - indices : array of ints - Array of insertion points with the same shape as `value`. - """ - value = self._validate_searchsorted_value(value) - return self._data.searchsorted(value, side=side, sorter=sorter) - def value_counts(self, dropna=False): """ Return a Series containing counts of unique values. diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 588d68514649a..d3fa87d5ea7ff 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -260,15 +260,19 @@ def __getitem__(self, item): return result def __setitem__(self, key, value) -> None: - value = extract_array(value, extract_numpy=True) + key = self._validate_setitem_key(key) + value = self._validate_setitem_value(value) + self._ndarray[key] = value - key = check_array_indexer(self, key) - scalar_value = lib.is_scalar(value) + def _validate_setitem_value(self, value): + value = extract_array(value, extract_numpy=True) - if not scalar_value: + if not lib.is_scalar(value): value = np.asarray(value, dtype=self._ndarray.dtype) + return value - self._ndarray[key] = value + def _validate_setitem_key(self, key): + return check_array_indexer(self, key) def isna(self) -> np.ndarray: return isna(self._ndarray) @@ -308,9 +312,6 @@ def _validate_fill_value(self, fill_value): fill_value = 
self.dtype.na_value return fill_value - def _values_for_argsort(self) -> np.ndarray: - return self._ndarray - def _values_for_factorize(self) -> Tuple[np.ndarray, int]: return self._ndarray, -1 From 67d6e3b82bb44073a7effe050f900f2121e2343d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Sep 2020 06:12:11 -0700 Subject: [PATCH 0726/1025] CLN: simplify Categorical comparisons (#36237) --- pandas/core/arrays/categorical.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 66d917b07305c..81f9456502bf0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -58,6 +58,7 @@ def _cat_compare_op(op): opname = f"__{op.__name__}__" + fill_value = True if op is operator.ne else False @unpack_zerodim_and_defer(opname) def func(self, other): @@ -92,26 +93,23 @@ def func(self, other): else: other_codes = other._codes - f = getattr(self._codes, opname) - ret = f(other_codes) + ret = op(self._codes, other_codes) mask = (self._codes == -1) | (other_codes == -1) if mask.any(): - # In other series, the leads to False, so do that here too - if opname == "__ne__": - ret[(self._codes == -1) & (other_codes == -1)] = True - else: - ret[mask] = False + ret[mask] = fill_value return ret if is_scalar(other): if other in self.categories: i = self.categories.get_loc(other) - ret = getattr(self._codes, opname)(i) + ret = op(self._codes, i) if opname not in {"__eq__", "__ge__", "__gt__"}: - # check for NaN needed if we are not equal or larger + # GH#29820 performance trick; get_loc will always give i>=0, + # so in the cases (__ne__, __le__, __lt__) the setting + # here is a no-op, so can be skipped. 
mask = self._codes == -1 - ret[mask] = False + ret[mask] = fill_value return ret else: return ops.invalid_comparison(self, other, op) From 14e800ca2f962144079a50bb918ac40c22926d2d Mon Sep 17 00:00:00 2001 From: Nikhil Choudhary <49715980+Nikhil1O1@users.noreply.github.com> Date: Fri, 11 Sep 2020 22:17:13 +0530 Subject: [PATCH 0727/1025] DOC: Update groupby.rst (#36238) --- doc/source/user_guide/groupby.rst | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index ddba3dc452e28..f745dab00bab8 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -87,11 +87,9 @@ The mapping can be specified many different ways: * A Python function, to be called on each of the axis labels. * A list or NumPy array of the same length as the selected axis. * A dict or ``Series``, providing a ``label -> group name`` mapping. -* For ``DataFrame`` objects, a string indicating a column to be used to group. - Of course ``df.groupby('A')`` is just syntactic sugar for - ``df.groupby(df['A'])``, but it makes life simpler. -* For ``DataFrame`` objects, a string indicating an index level to be used to - group. +* For ``DataFrame`` objects, a string indicating either a column name or + an index level name to be used to group. +* ``df.groupby('A')`` is just syntactic sugar for ``df.groupby(df['A'])``. * A list of any of the above things. Collectively we refer to the grouping objects as the **keys**. 
For example, From 483fcd3777f008523df180501397a444f88e6734 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Sat, 12 Sep 2020 03:40:11 +1000 Subject: [PATCH 0728/1025] ENH add na_action to DataFrame.applymap (#35704) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/lib.pyx | 8 ++++++- pandas/core/frame.py | 25 +++++++++++++++++--- pandas/tests/frame/apply/test_frame_apply.py | 16 +++++++++++++ 4 files changed, 46 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f2f56ee81b8d4..bce6a735b7b07 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -100,8 +100,8 @@ For example: Other enhancements ^^^^^^^^^^^^^^^^^^ - - Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`) +- :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index eadfcefaac73d..7464fafee2b94 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2377,7 +2377,7 @@ def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=Tr @cython.boundscheck(False) @cython.wraparound(False) -def map_infer(ndarray arr, object f, bint convert=True): +def map_infer(ndarray arr, object f, bint convert=True, bint ignore_na=False): """ Substitute for np.vectorize with pandas-friendly dtype inference. 
@@ -2385,6 +2385,9 @@ def map_infer(ndarray arr, object f, bint convert=True): ---------- arr : ndarray f : function + convert : bint + ignore_na : bint + If True, NA values will not have f applied Returns ------- @@ -2398,6 +2401,9 @@ def map_infer(ndarray arr, object f, bint convert=True): n = len(arr) result = np.empty(n, dtype=object) for i in range(n): + if ignore_na and checknull(arr[i]): + result[i] = arr[i] + continue val = f(arr[i]) if cnp.PyArray_IsZeroDim(val): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3eed10917843b..b03593ad8afe1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7619,7 +7619,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): ) return op.get_result() - def applymap(self, func) -> DataFrame: + def applymap(self, func, na_action: Optional[str] = None) -> DataFrame: """ Apply a function to a Dataframe elementwise. @@ -7630,6 +7630,10 @@ def applymap(self, func) -> DataFrame: ---------- func : callable Python function, returns a single value from a single value. + na_action : {None, 'ignore'}, default None + If ‘ignore’, propagate NaN values, without passing them to func. + + .. versionadded:: 1.2 Returns ------- @@ -7653,6 +7657,15 @@ def applymap(self, func) -> DataFrame: 0 3 4 1 5 5 + Like Series.map, NA values can be ignored: + + >>> df_copy = df.copy() + >>> df_copy.iloc[0, 0] = pd.NA + >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore') + 0 1 + 0 4 + 1 5 5 + Note that a vectorized version of `func` often exists, which will be much faster. You could square each number elementwise. @@ -7668,11 +7681,17 @@ def applymap(self, func) -> DataFrame: 0 1.000000 4.494400 1 11.262736 20.857489 """ + if na_action not in {"ignore", None}: + raise ValueError( + f"na_action must be 'ignore' or None. 
Got {repr(na_action)}" + ) + ignore_na = na_action == "ignore" + # if we have a dtype == 'M8[ns]', provide boxed values def infer(x): if x.empty: - return lib.map_infer(x, func) - return lib.map_infer(x.astype(object)._values, func) + return lib.map_infer(x, func, ignore_na=ignore_na) + return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) return self.apply(infer) diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index bc09501583e2c..1662f9e2fff56 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -630,6 +630,22 @@ def test_applymap(self, float_frame): result = frame.applymap(func) tm.assert_frame_equal(result, frame) + def test_applymap_na_ignore(self, float_frame): + # GH 23803 + strlen_frame = float_frame.applymap(lambda x: len(str(x))) + float_frame_with_na = float_frame.copy() + mask = np.random.randint(0, 2, size=float_frame.shape, dtype=bool) + float_frame_with_na[mask] = pd.NA + strlen_frame_na_ignore = float_frame_with_na.applymap( + lambda x: len(str(x)), na_action="ignore" + ) + strlen_frame_with_na = strlen_frame.copy() + strlen_frame_with_na[mask] = pd.NA + tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na) + + with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"): + float_frame_with_na.applymap(lambda x: len(str(x)), na_action="abc") + def test_applymap_box_timestamps(self): # GH 2689, GH 2627 ser = pd.Series(date_range("1/1/2000", periods=10)) From 9b72512e551e33d75b98e06ff8013293e0a7ae4e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Sep 2020 12:18:23 -0700 Subject: [PATCH 0729/1025] CI: xfail failing parquet test (#36272) --- pandas/tests/io/test_parquet.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 15f9837176315..35a400cba8671 100644 --- 
a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from pandas.compat import PY38 import pandas.util._test_decorators as td import pandas as pd @@ -564,8 +565,19 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so): write_kwargs=s3so, ) - @td.skip_if_no("s3fs") - @pytest.mark.parametrize("partition_col", [["A"], []]) + @td.skip_if_no("s3fs") # also requires flask + @pytest.mark.parametrize( + "partition_col", + [ + pytest.param( + ["A"], + marks=pytest.mark.xfail( + PY38, reason="Getting back empty DataFrame", raises=AssertionError, + ), + ), + [], + ], + ) def test_s3_roundtrip_for_dir( self, df_compat, s3_resource, pa, partition_col, s3so ): From a067f7ea58bfd20381cb411642cafbe86665185b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Sep 2020 13:51:54 -0700 Subject: [PATCH 0730/1025] de-privatize (#36259) --- pandas/compat/numpy/function.py | 4 ++-- pandas/core/algorithms.py | 6 +++--- pandas/core/arrays/categorical.py | 6 +++--- pandas/core/ops/__init__.py | 20 +++++++++--------- pandas/core/ops/methods.py | 34 +++++++++++++++---------------- pandas/io/excel/__init__.py | 8 ++++---- pandas/io/excel/_base.py | 16 +++++++-------- pandas/io/excel/_odfreader.py | 2 +- pandas/io/excel/_odswriter.py | 2 +- pandas/io/excel/_openpyxl.py | 4 ++-- pandas/io/excel/_pyxlsb.py | 2 +- pandas/io/excel/_xlrd.py | 2 +- pandas/io/excel/_xlsxwriter.py | 2 +- pandas/io/excel/_xlwt.py | 2 +- pandas/util/_test_decorators.py | 5 +++-- 15 files changed, 58 insertions(+), 57 deletions(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index d7a14c28cc9ca..5f627aeade47c 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -21,7 +21,7 @@ from distutils.version import LooseVersion from typing import Any, Dict, Optional, Union -from numpy import __version__ as _np_version, ndarray +from numpy import __version__, ndarray from 
pandas._libs.lib import is_bool, is_integer from pandas.errors import UnsupportedFunctionCall @@ -122,7 +122,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None -if LooseVersion(_np_version) >= LooseVersion("1.17.0"): +if LooseVersion(__version__) >= LooseVersion("1.17.0"): # GH-26361. NumPy added radix sort and changed default to None. ARGSORT_DEFAULTS["kind"] = None diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 57e63daff29e4..872c51c7dfa75 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -262,7 +262,7 @@ def _get_values_for_rank(values): return values -def _get_data_algo(values): +def get_data_algo(values): values = _get_values_for_rank(values) ndtype = _check_object_for_strings(values) @@ -491,7 +491,7 @@ def factorize_array( codes : ndarray uniques : ndarray """ - hash_klass, values = _get_data_algo(values) + hash_klass, values = get_data_algo(values) table = hash_klass(size_hint or len(values)) uniques, codes = table.factorize( @@ -2086,7 +2086,7 @@ def sort_mixed(values): if sorter is None: # mixed types - hash_klass, values = _get_data_algo(values) + hash_klass, values = get_data_algo(values) t = hash_klass(len(values)) t.map_locations(values) sorter = ensure_platform_int(t.lookup(ordered)) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 81f9456502bf0..e73a1404c6434 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -43,7 +43,7 @@ from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms -from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d +from pandas.core.algorithms import factorize, get_data_algo, take_1d, unique1d from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.base import ExtensionArray, 
NoNewAttributesMixin, PandasObject import pandas.core.common as com @@ -2531,8 +2531,8 @@ def _get_codes_for_values(values, categories): # Only hit here when we've already coerced to object dtypee. - hash_klass, vals = _get_data_algo(values) - _, cats = _get_data_algo(categories) + hash_klass, vals = get_data_algo(values) + _, cats = get_data_algo(categories) t = hash_klass(len(cats)) t.map_locations(cats) return coerce_indexer_dtype(t.lookup(vals), cats) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 8fcbee6a20ac3..6763db1e2b138 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -324,12 +324,12 @@ def _align_method_SERIES(left: "Series", right, align_asobject: bool = False): return left, right -def _arith_method_SERIES(cls, op, special): +def arith_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid code duplication. """ - assert special # non-special uses _flex_method_SERIES + assert special # non-special uses flex_method_SERIES op_name = _get_op_name(op, special) @unpack_zerodim_and_defer(op_name) @@ -348,12 +348,12 @@ def wrapper(left, right): return wrapper -def _comp_method_SERIES(cls, op, special): +def comp_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid code duplication. """ - assert special # non-special uses _flex_method_SERIES + assert special # non-special uses flex_method_SERIES op_name = _get_op_name(op, special) @unpack_zerodim_and_defer(op_name) @@ -375,12 +375,12 @@ def wrapper(self, other): return wrapper -def _bool_method_SERIES(cls, op, special): +def bool_method_SERIES(cls, op, special): """ Wrapper function for Series arithmetic operations, to avoid code duplication. 
""" - assert special # non-special uses _flex_method_SERIES + assert special # non-special uses flex_method_SERIES op_name = _get_op_name(op, special) @unpack_zerodim_and_defer(op_name) @@ -398,7 +398,7 @@ def wrapper(self, other): return wrapper -def _flex_method_SERIES(cls, op, special): +def flex_method_SERIES(cls, op, special): assert not special # "special" also means "not flex" name = _get_op_name(op, special) doc = _make_flex_doc(name, "series") @@ -614,7 +614,7 @@ def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int return type(frame)(rvalues, index=frame.index, columns=frame.columns) -def _arith_method_FRAME(cls: Type["DataFrame"], op, special: bool): +def arith_method_FRAME(cls: Type["DataFrame"], op, special: bool): # This is the only function where `special` can be either True or False op_name = _get_op_name(op, special) default_axis = _get_frame_op_default_axis(op_name) @@ -666,7 +666,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): return f -def _flex_comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): +def flex_comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): assert not special # "special" also means "not flex" op_name = _get_op_name(op, special) default_axis = _get_frame_op_default_axis(op_name) @@ -690,7 +690,7 @@ def f(self, other, axis=default_axis, level=None): return f -def _comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): +def comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): assert special # "special" also means "not flex" op_name = _get_op_name(op, special) diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index a4694a6e5134f..e04db92b58c36 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -44,28 +44,28 @@ def _get_method_wrappers(cls): # TODO: make these non-runtime imports once the relevant functions # are no longer in __init__ from pandas.core.ops import ( - _arith_method_FRAME, - 
_arith_method_SERIES, - _bool_method_SERIES, - _comp_method_FRAME, - _comp_method_SERIES, - _flex_comp_method_FRAME, - _flex_method_SERIES, + arith_method_FRAME, + arith_method_SERIES, + bool_method_SERIES, + comp_method_FRAME, + comp_method_SERIES, + flex_comp_method_FRAME, + flex_method_SERIES, ) if issubclass(cls, ABCSeries): # Just Series - arith_flex = _flex_method_SERIES - comp_flex = _flex_method_SERIES - arith_special = _arith_method_SERIES - comp_special = _comp_method_SERIES - bool_special = _bool_method_SERIES + arith_flex = flex_method_SERIES + comp_flex = flex_method_SERIES + arith_special = arith_method_SERIES + comp_special = comp_method_SERIES + bool_special = bool_method_SERIES elif issubclass(cls, ABCDataFrame): - arith_flex = _arith_method_FRAME - comp_flex = _flex_comp_method_FRAME - arith_special = _arith_method_FRAME - comp_special = _comp_method_FRAME - bool_special = _arith_method_FRAME + arith_flex = arith_method_FRAME + comp_flex = flex_comp_method_FRAME + arith_special = arith_method_FRAME + comp_special = comp_method_FRAME + bool_special = arith_method_FRAME return arith_flex, comp_flex, arith_special, comp_special, bool_special diff --git a/pandas/io/excel/__init__.py b/pandas/io/excel/__init__.py index d035223957a76..3bad493dee388 100644 --- a/pandas/io/excel/__init__.py +++ b/pandas/io/excel/__init__.py @@ -1,9 +1,9 @@ from pandas.io.excel._base import ExcelFile, ExcelWriter, read_excel -from pandas.io.excel._odswriter import _ODSWriter -from pandas.io.excel._openpyxl import _OpenpyxlWriter +from pandas.io.excel._odswriter import ODSWriter as _ODSWriter +from pandas.io.excel._openpyxl import OpenpyxlWriter as _OpenpyxlWriter from pandas.io.excel._util import register_writer -from pandas.io.excel._xlsxwriter import _XlsxWriter -from pandas.io.excel._xlwt import _XlwtWriter +from pandas.io.excel._xlsxwriter import XlsxWriter as _XlsxWriter +from pandas.io.excel._xlwt import XlwtWriter as _XlwtWriter __all__ = ["read_excel", 
"ExcelWriter", "ExcelFile"] diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d597731ed0ac4..e9634ff0e9a05 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -844,16 +844,16 @@ class ExcelFile: - ``pyxlsb`` supports Binary Excel files. """ - from pandas.io.excel._odfreader import _ODFReader - from pandas.io.excel._openpyxl import _OpenpyxlReader - from pandas.io.excel._pyxlsb import _PyxlsbReader - from pandas.io.excel._xlrd import _XlrdReader + from pandas.io.excel._odfreader import ODFReader + from pandas.io.excel._openpyxl import OpenpyxlReader + from pandas.io.excel._pyxlsb import PyxlsbReader + from pandas.io.excel._xlrd import XlrdReader _engines: Mapping[str, Any] = { - "xlrd": _XlrdReader, - "openpyxl": _OpenpyxlReader, - "odf": _ODFReader, - "pyxlsb": _PyxlsbReader, + "xlrd": XlrdReader, + "openpyxl": OpenpyxlReader, + "odf": ODFReader, + "pyxlsb": PyxlsbReader, } def __init__( diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 02575ab878f6e..ffb599cdfaaf8 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -10,7 +10,7 @@ from pandas.io.excel._base import BaseExcelReader -class _ODFReader(BaseExcelReader): +class ODFReader(BaseExcelReader): """ Read tables out of OpenDocument formatted files. 
diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index e7684012c1d4c..cbac60dfabaa7 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -9,7 +9,7 @@ from pandas.io.formats.excel import ExcelCell -class _ODSWriter(ExcelWriter): +class ODSWriter(ExcelWriter): engine = "odf" supported_extensions = (".ods",) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index f395127902101..a5cadf4d93389 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -12,7 +12,7 @@ from openpyxl.descriptors.serialisable import Serialisable -class _OpenpyxlWriter(ExcelWriter): +class OpenpyxlWriter(ExcelWriter): engine = "openpyxl" supported_extensions = (".xlsx", ".xlsm") @@ -438,7 +438,7 @@ def write_cells( setattr(xcell, k, v) -class _OpenpyxlReader(BaseExcelReader): +class OpenpyxlReader(BaseExcelReader): def __init__( self, filepath_or_buffer: FilePathOrBuffer, diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 069c3a2eaa643..ac94f4dd3df74 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -6,7 +6,7 @@ from pandas.io.excel._base import BaseExcelReader -class _PyxlsbReader(BaseExcelReader): +class PyxlsbReader(BaseExcelReader): def __init__( self, filepath_or_buffer: FilePathOrBuffer, diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 9057106fb08e5..dfd5dde0329ae 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -8,7 +8,7 @@ from pandas.io.excel._base import BaseExcelReader -class _XlrdReader(BaseExcelReader): +class XlrdReader(BaseExcelReader): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): """ Reader using xlrd engine. 
diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 53f0c94d12e4c..16c4d377d7610 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -158,7 +158,7 @@ def convert(cls, style_dict, num_format_str=None): return props -class _XlsxWriter(ExcelWriter): +class XlsxWriter(ExcelWriter): engine = "xlsxwriter" supported_extensions = (".xlsx",) diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index faebe526d17bd..3592c2684f5a5 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -9,7 +9,7 @@ from xlwt import XFStyle -class _XlwtWriter(ExcelWriter): +class XlwtWriter(ExcelWriter): engine = "xlwt" supported_extensions = (".xls",) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index e9deaf3fe67de..0e8f6b933cd97 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -33,7 +33,6 @@ def test_foo(): from pandas.compat import IS64, is_platform_windows from pandas.compat._optional import import_optional_dependency -from pandas.compat.numpy import _np_version from pandas.core.computation.expressions import NUMEXPR_INSTALLED, USE_NUMEXPR @@ -205,7 +204,9 @@ def skip_if_no(package: str, min_version: Optional[str] = None): def skip_if_np_lt(ver_str: str, *args, reason: Optional[str] = None): if reason is None: reason = f"NumPy {ver_str} or greater required" - return pytest.mark.skipif(_np_version < LooseVersion(ver_str), *args, reason=reason) + return pytest.mark.skipif( + np.__version__ < LooseVersion(ver_str), *args, reason=reason + ) def parametrize_fixture_doc(*args): From 7662f37592ca7f4878412b84b0ee06cde102ff12 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Fri, 11 Sep 2020 23:33:02 +0200 Subject: [PATCH 0731/1025] Update deprecation warnings, which were already removed (#36292) --- doc/source/user_guide/indexing.rst | 19 ++++++++++++------- doc/source/user_guide/io.rst | 13 
+++---------- doc/source/user_guide/timeseries.rst | 8 ++++---- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index cac18f5bf39cd..74abbc9503db0 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -313,8 +313,10 @@ Selection by label .. warning:: - Starting in 0.21.0, pandas will show a ``FutureWarning`` if indexing with a list with missing labels. In the future - this will raise a ``KeyError``. See :ref:`list-like Using loc with missing keys in a list is Deprecated `. + .. versionchanged:: 1.0.0 + + Pandas will raise a ``KeyError`` if indexing with a list with missing labels. See :ref:`list-like Using loc with + missing keys in a list is Deprecated `. pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. Every label asked for must be in the index, or a ``KeyError`` will be raised. @@ -578,8 +580,9 @@ IX indexer is deprecated .. warning:: - Starting in 0.20.0, the ``.ix`` indexer is deprecated, in favor of the more strict ``.iloc`` - and ``.loc`` indexers. + .. versionchanged:: 1.0.0 + + The ``.ix`` indexer was removed, in favor of the more strict ``.iloc`` and ``.loc`` indexers. ``.ix`` offers a lot of magic on the inference of what the user wants to do. To wit, ``.ix`` can decide to index *positionally* OR via *labels* depending on the data type of the index. This has caused quite a @@ -636,11 +639,13 @@ Indexing with list with missing labels is deprecated .. warning:: - Starting in 0.21.0, using ``.loc`` or ``[]`` with a list with one or more missing labels, is deprecated, in favor of ``.reindex``. + .. versionchanged:: 1.0.0 + + Using ``.loc`` or ``[]`` with a list with one or more missing labels will no longer reindex, in favor of ``.reindex``. 
In prior versions, using ``.loc[list-of-labels]`` would work as long as *at least 1* of the keys was found (otherwise it -would raise a ``KeyError``). This behavior is deprecated and will show a warning message pointing to this section. The -recommended alternative is to use ``.reindex()``. +would raise a ``KeyError``). This behavior was changed and will now raise a ``KeyError`` if at least one label is missing. +The recommended alternative is to use ``.reindex()``. For example. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index bf6575a8836f5..a1ce2f847d4b8 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3024,19 +3024,12 @@ It is often the case that users will insert columns to do temporary computations in Excel and you may not want to read in those columns. ``read_excel`` takes a ``usecols`` keyword to allow you to specify a subset of columns to parse. -.. deprecated:: 0.24.0 +.. versionchanged:: 1.0.0 -Passing in an integer for ``usecols`` has been deprecated. Please pass in a list +Passing in an integer for ``usecols`` will no longer work. Please pass in a list of ints from 0 to ``usecols`` inclusive instead. -If ``usecols`` is an integer, then it is assumed to indicate the last column -to be parsed. - -.. code-block:: python - - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=2) - -You can also specify a comma-delimited set of Excel columns and ranges as a string: +You can specify a comma-delimited set of Excel columns and ranges as a string: .. code-block:: python diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 0bfe9d9b68cdb..71eefb9a76562 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -327,11 +327,11 @@ which can be specified. These are computed from the starting point specified by that was discussed :ref:`above`). The available units are listed on the documentation for :func:`pandas.to_datetime`. +.. 
versionchanged:: 1.0.0 + Constructing a :class:`Timestamp` or :class:`DatetimeIndex` with an epoch timestamp -with the ``tz`` argument specified will currently localize the epoch timestamps to UTC -first then convert the result to the specified time zone. However, this behavior -is :ref:`deprecated `, and if you have -epochs in wall time in another timezone, it is recommended to read the epochs +with the ``tz`` argument specified will raise a ValueError. If you have +epochs in wall time in another timezone, you can read the epochs as timezone-naive timestamps and then localize to the appropriate timezone: .. ipython:: python From 3ee51508be5f2f2b003c6e450b3af8fb0667a8dc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 11 Sep 2020 18:00:15 -0700 Subject: [PATCH 0732/1025] CLN: typo cleanups (#36276) * typo cleanups * typo fixup --- pandas/_libs/index.pyx | 2 +- pandas/tests/arrays/categorical/test_indexing.py | 2 +- scripts/validate_unwanted_patterns.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 569562f5b5037..8155e7e6c074a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -260,7 +260,7 @@ cdef class IndexEngine: def get_indexer_non_unique(self, targets): """ Return an indexer suitable for taking from a non unique index - return the labels in the same order ast the target + return the labels in the same order as the target and a missing indexer into the targets (which correspond to the -1 indices in the results """ diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index abfae189bb4d7..ab8606ef9258d 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -183,7 +183,7 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class): # GH 21448 key = key_class(key_values, categories=range(1, 5)) # Test for flat index and 
CategoricalIndex with same/different cats: - for dtype in None, "category", key.dtype: + for dtype in [None, "category", key.dtype]: idx = Index(idx_values, dtype=dtype) expected, exp_miss = idx.get_indexer_non_unique(key_values) result, res_miss = idx.get_indexer_non_unique(key) diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 1a6d8cc8b9914..2add2b8c62a4e 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -357,9 +357,9 @@ def main( output_format : str Output format of the error message. file_extensions_to_check : str - Coma seperated values of what file extensions to check. + Comma separated values of what file extensions to check. excluded_file_paths : str - Coma seperated values of what file paths to exclude during the check. + Comma separated values of what file paths to exclude during the check. Returns ------- @@ -444,7 +444,7 @@ def main( parser.add_argument( "--included-file-extensions", default="py,pyx,pxd,pxi", - help="Coma seperated file extensions to check.", + help="Comma separated file extensions to check.", ) parser.add_argument( "--excluded-file-paths", From 62af29b5f5623584e6c693358605691d6579e774 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Sep 2020 13:31:29 -0700 Subject: [PATCH 0733/1025] REF: de-duplicate _wrap_joined_index (#36282) --- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/category.py | 3 ++- pandas/core/indexes/datetimelike.py | 4 +++- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/numeric.py | 9 --------- 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 67456096e8681..5d15d33e7e215 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3877,7 +3877,7 @@ def _join_monotonic(self, other, how="left", return_indexers=False): def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) - 
return Index(joined, name=name) + return self._constructor(joined, name=name) # -------------------------------------------------------------------- # Uncategorized Methods diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 7509cb35069e8..98ef473a13348 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -767,7 +767,8 @@ def _wrap_joined_index( self, joined: np.ndarray, other: "CategoricalIndex" ) -> "CategoricalIndex": name = get_op_result_name(self, other) - return self._create_from_codes(joined, name=name) + cat = self._data._from_backing_data(joined) + return type(self)._simple_new(cat, name=name) CategoricalIndex._add_logical_methods_disabled() diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 54c8ed60b6097..13236f8488ecb 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -603,7 +603,9 @@ def _wrap_joined_index(self, joined: np.ndarray, other): else: self = cast(DatetimeTimedeltaMixin, self) freq = self.freq if self._can_fast_union(other) else None - new_data = type(self._data)._simple_new(joined, dtype=self.dtype, freq=freq) + + new_data = self._data._from_backing_data(joined) + new_data._freq = freq return type(self)._simple_new(new_data, name=name) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index deeb7ff50b88c..7aceb898f5ccf 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3637,7 +3637,7 @@ def delete(self, loc): def _wrap_joined_index(self, joined, other): names = self.names if self.names == other.names else None - return MultiIndex.from_tuples(joined, names=names) + return self._constructor(joined, names=names) @doc(Index.isin) def isin(self, values, level=None): diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 125602ef2054a..e8b7efeee8852 100644 --- a/pandas/core/indexes/numeric.py +++ 
b/pandas/core/indexes/numeric.py @@ -33,7 +33,6 @@ from pandas.core import algorithms import pandas.core.common as com from pandas.core.indexes.base import Index, maybe_extract_name -from pandas.core.ops import get_op_result_name _num_index_shared_docs = dict() @@ -262,10 +261,6 @@ class Int64Index(IntegerIndex): _engine_type = libindex.Int64Engine _default_dtype = np.dtype(np.int64) - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - return Int64Index(joined, name=name) - @classmethod def _assert_safe_casting(cls, data, subarr): """ @@ -324,10 +319,6 @@ def _convert_index_indexer(self, keyarr): # ---------------------------------------------------------------- - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - return UInt64Index(joined, name=name) - @classmethod def _assert_safe_casting(cls, data, subarr): """ From b831dd02499e6afcaddd10b87922fdb8d34d6dde Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Sep 2020 13:33:52 -0700 Subject: [PATCH 0734/1025] REF: de-duplicate sort_values (#36301) --- pandas/core/indexes/base.py | 8 +++++--- pandas/core/indexes/datetimelike.py | 17 ----------------- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5d15d33e7e215..0aed08d46657e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4444,8 +4444,8 @@ def asof_locs(self, where, mask): def sort_values( self, - return_indexer=False, - ascending=True, + return_indexer: bool = False, + ascending: bool = True, na_position: str_t = "last", key: Optional[Callable] = None, ): @@ -4509,7 +4509,9 @@ def sort_values( # GH 35584. 
Sort missing values according to na_position kwarg # ignore na_position for MutiIndex - if not isinstance(self, ABCMultiIndex): + if not isinstance( + self, (ABCMultiIndex, ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) + ): _as = nargsort( items=idx, ascending=ascending, na_position=na_position, key=key ) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 13236f8488ecb..5ba5732c710f7 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -41,7 +41,6 @@ ) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name -from pandas.core.sorting import ensure_key_mapped from pandas.core.tools.timedeltas import to_timedelta _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -164,22 +163,6 @@ def __contains__(self, key: Any) -> bool: is_scalar(res) or isinstance(res, slice) or (is_list_like(res) and len(res)) ) - def sort_values(self, return_indexer=False, ascending=True, key=None): - """ - Return sorted copy of Index. 
- """ - idx = ensure_key_mapped(self, key) - - _as = idx.argsort() - if not ascending: - _as = _as[::-1] - sorted_index = self.take(_as) - - if return_indexer: - return sorted_index, _as - else: - return sorted_index - @Appender(_index_shared_docs["take"] % _index_doc_kwargs) def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): nv.validate_take(tuple(), kwargs) From 32b6ae41e75864b25f353fd2b2d8b902f4b3ec86 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Sep 2020 13:40:20 -0700 Subject: [PATCH 0735/1025] PERF: get_dtype_kinds (#36309) --- pandas/core/dtypes/concat.py | 51 ++++++++++++------------------ pandas/tests/dtypes/test_concat.py | 4 +-- 2 files changed, 23 insertions(+), 32 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index dd005752a4832..07904339b93df 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -1,7 +1,7 @@ """ Utility functions related to concat. """ -from typing import cast +from typing import Set, cast import numpy as np @@ -9,15 +9,10 @@ from pandas.core.dtypes.cast import find_common_type from pandas.core.dtypes.common import ( - is_bool_dtype, is_categorical_dtype, - is_datetime64_dtype, - is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, - is_object_dtype, is_sparse, - is_timedelta64_dtype, ) from pandas.core.dtypes.generic import ABCCategoricalIndex, ABCRangeIndex, ABCSeries @@ -26,7 +21,7 @@ from pandas.core.construction import array -def get_dtype_kinds(l): +def _get_dtype_kinds(l) -> Set[str]: """ Parameters ---------- @@ -34,34 +29,30 @@ def get_dtype_kinds(l): Returns ------- - a set of kinds that exist in this list of arrays + set[str] + A set of kinds that exist in this list of arrays. 
""" - typs = set() + typs: Set[str] = set() for arr in l: + # Note: we use dtype.kind checks because they are much more performant + # than is_foo_dtype dtype = arr.dtype - if is_categorical_dtype(dtype): - typ = "category" - elif is_sparse(dtype): - typ = "sparse" + if not isinstance(dtype, np.dtype): + # ExtensionDtype so we get + # e.g. "categorical", "datetime64[ns, US/Central]", "Sparse[itn64, 0]" + typ = str(dtype) elif isinstance(arr, ABCRangeIndex): typ = "range" - elif is_datetime64tz_dtype(dtype): - # if to_concat contains different tz, - # the result must be object dtype - typ = str(dtype) - elif is_datetime64_dtype(dtype): + elif dtype.kind == "M": typ = "datetime" - elif is_timedelta64_dtype(dtype): + elif dtype.kind == "m": typ = "timedelta" - elif is_object_dtype(dtype): - typ = "object" - elif is_bool_dtype(dtype): - typ = "bool" - elif is_extension_array_dtype(dtype): - typ = str(dtype) + elif dtype.kind in ["O", "b"]: + typ = str(dtype) # i.e. "object", "bool" else: typ = dtype.kind + typs.add(typ) return typs @@ -140,7 +131,7 @@ def is_nonempty(x) -> bool: if non_empties and axis == 0: to_concat = non_empties - typs = get_dtype_kinds(to_concat) + typs = _get_dtype_kinds(to_concat) _contains_datetime = any(typ.startswith("datetime") for typ in typs) all_empty = not len(non_empties) @@ -161,13 +152,13 @@ def is_nonempty(x) -> bool: return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs: - return concat_datetime(to_concat, axis=axis, typs=typs) + return _concat_datetime(to_concat, axis=axis, typs=typs) elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise # cast this to float) - typs = get_dtype_kinds(to_concat) + typs = _get_dtype_kinds(to_concat) if len(typs) != 1: if not len(typs - {"i", "u", "f"}) or not len(typs - {"bool", "i", "u"}): @@ -361,7 +352,7 @@ def _concatenate_2d(to_concat, axis: int): return 
np.concatenate(to_concat, axis=axis) -def concat_datetime(to_concat, axis=0, typs=None): +def _concat_datetime(to_concat, axis=0, typs=None): """ provide concatenation of an datetimelike array of arrays each of which is a single M8[ns], datetime64[ns, tz] or m8[ns] dtype @@ -377,7 +368,7 @@ def concat_datetime(to_concat, axis=0, typs=None): a single array, preserving the combined dtypes """ if typs is None: - typs = get_dtype_kinds(to_concat) + typs = _get_dtype_kinds(to_concat) to_concat = [_wrap_datetimelike(x) for x in to_concat] single_dtype = len({x.dtype for x in to_concat}) == 1 diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py index 5a9ad732792ea..53d53e35c6eb5 100644 --- a/pandas/tests/dtypes/test_concat.py +++ b/pandas/tests/dtypes/test_concat.py @@ -44,7 +44,7 @@ ) def test_get_dtype_kinds(index_or_series, to_concat, expected): to_concat_klass = [index_or_series(c) for c in to_concat] - result = _concat.get_dtype_kinds(to_concat_klass) + result = _concat._get_dtype_kinds(to_concat_klass) assert result == set(expected) @@ -76,7 +76,7 @@ def test_get_dtype_kinds(index_or_series, to_concat, expected): ], ) def test_get_dtype_kinds_period(to_concat, expected): - result = _concat.get_dtype_kinds(to_concat) + result = _concat._get_dtype_kinds(to_concat) assert result == set(expected) From 46d5e42cf3fee39f218b6aaeeecac1f6080d068f Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Sun, 13 Sep 2020 03:54:22 +0700 Subject: [PATCH 0736/1025] CLN: pandas/io/parsers.py (#36269) --- pandas/io/parsers.py | 53 ++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 34 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4c619a636f057..b963d5be69b5f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -421,10 +421,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): kwds["encoding"] = encoding compression = kwds.get("compression", 
"infer") - # TODO: get_filepath_or_buffer could return - # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] - # though mypy handling of conditional imports is difficult. - # See https://github.com/python/mypy/issues/1297 ioargs = get_filepath_or_buffer( filepath_or_buffer, encoding, compression, storage_options=storage_options ) @@ -914,7 +910,6 @@ def __init__(self, f, engine=None, **kwds): # miscellanea self.engine = engine - self._engine = None self._currow = 0 options = self._get_options_with_defaults(engine) @@ -923,14 +918,13 @@ def __init__(self, f, engine=None, **kwds): self.nrows = options.pop("nrows", None) self.squeeze = options.pop("squeeze", False) - # might mutate self.engine - self.engine = self._check_file_or_buffer(f, engine) + self._check_file_or_buffer(f, engine) self.options, self.engine = self._clean_options(options, engine) if "has_index_names" in kwds: self.options["has_index_names"] = kwds["has_index_names"] - self._make_engine(self.engine) + self._engine = self._make_engine(self.engine) def close(self): self._engine.close() @@ -987,24 +981,21 @@ def _check_file_or_buffer(self, f, engine): msg = "The 'python' engine cannot iterate through this file buffer." 
raise ValueError(msg) - return engine - def _clean_options(self, options, engine): result = options.copy() engine_specified = self._engine_specified fallback_reason = None - sep = options["delimiter"] - delim_whitespace = options["delim_whitespace"] - # C engine not supported yet if engine == "c": if options["skipfooter"] > 0: fallback_reason = "the 'c' engine does not support skipfooter" engine = "python" - encoding = sys.getfilesystemencoding() or "utf-8" + sep = options["delimiter"] + delim_whitespace = options["delim_whitespace"] + if sep is None and not delim_whitespace: if engine == "c": fallback_reason = ( @@ -1029,6 +1020,7 @@ def _clean_options(self, options, engine): result["delimiter"] = r"\s+" elif sep is not None: encodeable = True + encoding = sys.getfilesystemencoding() or "utf-8" try: if len(sep.encode(encoding)) > 1: encodeable = False @@ -1161,29 +1153,26 @@ def __next__(self): raise def _make_engine(self, engine="c"): - if engine == "c": - self._engine = CParserWrapper(self.f, **self.options) + mapping = { + "c": CParserWrapper, + "python": PythonParser, + "python-fwf": FixedWidthFieldParser, + } + try: + klass = mapping[engine] + except KeyError: + raise ValueError( + f"Unknown engine: {engine} (valid options are {mapping.keys()})" + ) else: - if engine == "python": - klass = PythonParser - elif engine == "python-fwf": - klass = FixedWidthFieldParser - else: - raise ValueError( - f"Unknown engine: {engine} (valid options " - 'are "c", "python", or "python-fwf")' - ) - self._engine = klass(self.f, **self.options) + return klass(self.f, **self.options) def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): nrows = validate_integer("nrows", nrows) - ret = self._engine.read(nrows) - - # May alter columns / col_dict - index, columns, col_dict = self._create_index(ret) + index, columns, col_dict = self._engine.read(nrows) if index is None: if col_dict: @@ -1203,10 +1192,6 @@ def read(self, nrows=None): return 
df[df.columns[0]].copy() return df - def _create_index(self, ret): - index, columns, col_dict = ret - return index, columns, col_dict - def get_chunk(self, size=None): if size is None: size = self.chunksize From 79812f74c72bc0df51e1f49419e507bbd42ae16b Mon Sep 17 00:00:00 2001 From: Felix Claessen <30658763+Flix6x@users.noreply.github.com> Date: Sat, 12 Sep 2020 23:07:52 +0200 Subject: [PATCH 0737/1025] Resample fix dst transition (#36264) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/resample.py | 6 ++- pandas/tests/resample/test_datetime_index.py | 47 ++++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bce6a735b7b07..f632d87a32a5a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -308,6 +308,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) +- Bug in :meth:`DataFrame.resample(...)` that would throw a ``ValueError`` when resampling from "D" to "24H" over a transition into daylight savings time (DST) (:issue:`35219`) - Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising an ``TypeError`` (:issue:`35325`) - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) - Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on. 
(:issue:`35792`) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 7b5154756e613..a2bf631959dd1 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1087,7 +1087,11 @@ def _upsample(self, method, limit=None, fill_value=None): res_index = self._adjust_binner_for_upsample(binner) # if we have the same frequency as our axis, then we are equal sampling - if limit is None and to_offset(ax.inferred_freq) == self.freq: + if ( + limit is None + and to_offset(ax.inferred_freq) == self.freq + and len(obj) == len(res_index) + ): result = obj.copy() result.index = res_index else: diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 59a0183304c76..9475dcc6981ff 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -1740,3 +1740,50 @@ def test_resample_apply_product(): columns=["A", "B"], ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "first,last,freq_in,freq_out,exp_last", + [ + ( + "2020-03-28", + "2020-03-31", + "D", + "24H", + "2020-03-30 01:00", + ), # includes transition into DST + ( + "2020-03-28", + "2020-10-27", + "D", + "24H", + "2020-10-27 00:00", + ), # includes transition into and out of DST + ( + "2020-10-25", + "2020-10-27", + "D", + "24H", + "2020-10-26 23:00", + ), # includes transition out of DST + ( + "2020-03-28", + "2020-03-31", + "24H", + "D", + "2020-03-30 00:00", + ), # same as above, but from 24H to D + ("2020-03-28", "2020-10-27", "24H", "D", "2020-10-27 00:00"), + ("2020-10-25", "2020-10-27", "24H", "D", "2020-10-26 00:00"), + ], +) +def test_resample_calendar_day_with_dst( + first: str, last: str, freq_in: str, freq_out: str, exp_last: str +): + # GH 35219 + ts = pd.Series(1.0, pd.date_range(first, last, freq=freq_in, tz="Europe/Amsterdam")) + result = ts.resample(freq_out).pad() + expected = pd.Series( + 1.0, pd.date_range(first, exp_last, freq=freq_out, 
tz="Europe/Amsterdam") + ) + tm.assert_series_equal(result, expected) From 750c50a4829460c50020adb531e02c8cc62fd222 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 12 Sep 2020 17:12:42 -0400 Subject: [PATCH 0738/1025] CLN: _wrap_applied_output (#36260) --- pandas/core/groupby/generic.py | 37 +++++++++++++++--------------- pandas/tests/groupby/test_apply.py | 10 ++++++++ 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e870187fc7952..1552256468ad2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1203,16 +1203,27 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): values = [x if (x is not None) else backup for x in values] - v = values[0] - - if not isinstance(v, (np.ndarray, Index, Series)) and self.as_index: + if isinstance(first_not_none, (np.ndarray, Index)): + # GH#1738: values is list of arrays of unequal lengths + # fall through to the outer else clause + # TODO: sure this is right? 
we used to do this + # after raising AttributeError above + return self.obj._constructor_sliced( + values, index=key_index, name=self._selection_name + ) + elif not isinstance(first_not_none, Series): # values are not series or array-like but scalars # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns - return self.obj._constructor_sliced(values, index=key_index) + if self.as_index: + return self.obj._constructor_sliced(values, index=key_index) + else: + result = DataFrame(values, index=key_index, columns=[self._selection]) + self._insert_inaxis_grouper_inplace(result) + return result - if isinstance(v, Series): + else: all_indexed_same = all_indexes_same((x.index for x in values)) # GH3596 @@ -1253,31 +1264,19 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if self.axis == 0: index = key_index - columns = v.index.copy() + columns = first_not_none.index.copy() if columns.name is None: # GH6124 - propagate name of Series when it's consistent names = {v.name for v in values} if len(names) == 1: columns.name = list(names)[0] else: - index = v.index + index = first_not_none.index columns = key_index stacked_values = stacked_values.T result = self.obj._constructor(stacked_values, index=index, columns=columns) - elif not self.as_index: - # We add grouping column below, so create a frame here - result = DataFrame(values, index=key_index, columns=[self._selection]) - else: - # GH#1738: values is list of arrays of unequal lengths - # fall through to the outer else clause - # TODO: sure this is right? 
we used to do this - # after raising AttributeError above - return self.obj._constructor_sliced( - values, index=key_index, name=self._selection_name - ) - # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here so = self._selected_obj diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 3183305fe2933..db5c4af9c6f53 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -381,6 +381,16 @@ def test_apply_frame_to_series(df): tm.assert_numpy_array_equal(result.values, expected.values) +def test_apply_frame_not_as_index_column_name(df): + # GH 35964 - path within _wrap_applied_output not hit by a test + grouped = df.groupby(["A", "B"], as_index=False) + result = grouped.apply(len) + expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D") + # TODO: Use assert_frame_equal when column name is not np.nan (GH 36306) + tm.assert_index_equal(result.index, expected.index) + tm.assert_numpy_array_equal(result.values, expected.values) + + def test_apply_frame_concat_series(): def trans(group): return group.groupby("B")["C"].sum().sort_values()[:2] From 873731ead68cc228848603ff9c5532c399b9c54c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Sep 2020 14:13:31 -0700 Subject: [PATCH 0739/1025] REF: implement Categorical._validate_listlike (#36274) --- pandas/core/arrays/categorical.py | 44 ++++++++++++++++++++++--------- pandas/core/dtypes/concat.py | 10 ++----- pandas/core/indexes/category.py | 29 +++----------------- pandas/core/reshape/merge.py | 9 ++----- 4 files changed, 39 insertions(+), 53 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e73a1404c6434..803acce29a7e4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1716,6 +1716,35 @@ def _box_func(self, i: int): return np.NaN return self.categories[i] + def 
_validate_listlike(self, target: ArrayLike) -> np.ndarray: + """ + Extract integer codes we can use for comparison. + + Notes + ----- + If a value in target is not present, it gets coded as -1. + """ + + if isinstance(target, Categorical): + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + if self.categories.equals(target.categories): + # We use the same codes, so can go directly to the engine + codes = target.codes + elif self.is_dtype_equal(target): + # We have the same categories up to a reshuffling of codes. + codes = recode_for_categories( + target.codes, target.categories, self.categories + ) + else: + code_indexer = self.categories.get_indexer(target.categories) + codes = take_1d(code_indexer, target.codes, fill_value=-1) + else: + codes = self.categories.get_indexer(target) + + return codes + # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1890,11 +1919,8 @@ def _validate_setitem_value(self, value): "Cannot set a Categorical with another, " "without identical categories" ) - if not self.categories.equals(value.categories): - new_codes = recode_for_categories( - value.codes, value.categories, self.categories - ) - value = Categorical.from_codes(new_codes, dtype=self.dtype) + new_codes = self._validate_listlike(value) + value = Categorical.from_codes(new_codes, dtype=self.dtype) rvalue = value if is_list_like(value) else [value] @@ -2164,13 +2190,7 @@ def equals(self, other: object) -> bool: if not isinstance(other, Categorical): return False elif self.is_dtype_equal(other): - if self.categories.equals(other.categories): - # fastpath to avoid re-coding - other_codes = other._codes - else: - other_codes = recode_for_categories( - other.codes, other.categories, self.categories - ) + other_codes = self._validate_listlike(other) return np.array_equal(self._codes, 
other_codes) return False diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 07904339b93df..60fd959701821 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -301,14 +301,8 @@ def _maybe_unwrap(x): categories = first.categories ordered = first.ordered - if all(first.categories.equals(other.categories) for other in to_union[1:]): - new_codes = np.concatenate([c.codes for c in to_union]) - else: - codes = [first.codes] + [ - recode_for_categories(other.codes, other.categories, first.categories) - for other in to_union[1:] - ] - new_codes = np.concatenate(codes) + all_codes = [first._validate_listlike(x) for x in to_union] + new_codes = np.concatenate(all_codes) if sort_categories and not ignore_order and ordered: raise TypeError("Cannot use sort_categories=True with ordered Categoricals") diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 98ef473a13348..19a0910a7a282 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -23,8 +23,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, notna from pandas.core import accessor -from pandas.core.algorithms import take_1d -from pandas.core.arrays.categorical import Categorical, contains, recode_for_categories +from pandas.core.arrays.categorical import Categorical, contains import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase @@ -558,21 +557,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): "method='nearest' not implemented yet for CategoricalIndex" ) - if isinstance(target, CategoricalIndex) and self._values.is_dtype_equal(target): - if self._values.equals(target._values): - # we have the same codes - codes = target.codes - else: - codes = recode_for_categories( - target.codes, target.categories, self._values.categories - ) - else: - if isinstance(target, CategoricalIndex): - code_indexer = 
self.categories.get_indexer(target.categories) - codes = take_1d(code_indexer, target.codes, fill_value=-1) - else: - codes = self.categories.get_indexer(target) - + codes = self._values._validate_listlike(target._values) indexer, _ = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer) @@ -580,15 +565,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def get_indexer_non_unique(self, target): target = ibase.ensure_index(target) - if isinstance(target, CategoricalIndex): - # Indexing on codes is more efficient if categories are the same: - if target.categories is self.categories: - target = target.codes - indexer, missing = self._engine.get_indexer_non_unique(target) - return ensure_platform_int(indexer), missing - target = target._values - - codes = self.categories.get_indexer(target) + codes = self._values._validate_listlike(target._values) indexer, missing = self._engine.get_indexer_non_unique(codes) return ensure_platform_int(indexer), missing diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 9f19ea9aefe09..d95355589fd0c 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -43,7 +43,6 @@ from pandas import Categorical, Index, MultiIndex from pandas.core import groupby import pandas.core.algorithms as algos -from pandas.core.arrays.categorical import recode_for_categories import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.frame import _merge_doc @@ -1936,12 +1935,8 @@ def _factorize_keys( ): assert isinstance(lk, Categorical) assert isinstance(rk, Categorical) - if lk.categories.equals(rk.categories): - # if we exactly match in categories, allow us to factorize on codes - rk = rk.codes - else: - # Same categories in different orders -> recode - rk = recode_for_categories(rk.codes, rk.categories, lk.categories) + # Cast rk to encoding so we can compare codes with lk + rk = lk._validate_listlike(rk) lk = 
ensure_int64(lk.codes) rk = ensure_int64(rk) From c15713f9cd633098e84c1ce73779388fea7db4e5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Sep 2020 14:14:36 -0700 Subject: [PATCH 0740/1025] CLN: simplify Categorical comparisons (#36250) --- pandas/core/arrays/categorical.py | 10 +--------- pandas/tests/arrays/categorical/test_operators.py | 7 +------ pandas/tests/indexes/categorical/test_category.py | 10 +--------- 3 files changed, 3 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 803acce29a7e4..4fa6b73932aa4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -76,17 +76,9 @@ def func(self, other): # the same (maybe up to ordering, depending on ordered) msg = "Categoricals can only be compared if 'categories' are the same." - if len(self.categories) != len(other.categories): - raise TypeError(msg + " Categories are different lengths") - elif self.ordered and not (self.categories == other.categories).all(): - raise TypeError(msg) - elif not set(self.categories) == set(other.categories): + if not self.is_dtype_equal(other): raise TypeError(msg) - if not (self.ordered == other.ordered): - raise TypeError( - "Categoricals can only be compared if 'ordered' is the same" - ) if not self.ordered and not self.categories.equals(other.categories): # both unordered and different order other_codes = _get_codes_for_values(other, self.categories) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index bc5fb51883b3d..9d118f1ed8753 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -79,10 +79,6 @@ def test_comparisons(self): cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"]) - msg = ( - "Categoricals can only be compared if 'categories' are the same. 
" - "Categories are different lengths" - ) with pytest.raises(TypeError, match=msg): cat_rev > cat_rev_base2 @@ -90,7 +86,6 @@ def test_comparisons(self): cat_unorderd = cat.set_ordered(False) assert not (cat > cat).any() - msg = "Categoricals can only be compared if 'ordered' is the same" with pytest.raises(TypeError, match=msg): cat > cat_unorderd @@ -321,7 +316,7 @@ def test_compare_different_lengths(self): c1 = Categorical([], categories=["a", "b"]) c2 = Categorical([], categories=["a"]) - msg = "Categories are different lengths" + msg = "Categoricals can only be compared if 'categories' are the same." with pytest.raises(TypeError, match=msg): c1 == c2 diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index b325edb321ed4..a3a06338a0277 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -402,15 +402,7 @@ def test_equals_categorical(self): with pytest.raises(ValueError, match="Lengths must match"): ci1 == Index(["a", "b", "c"]) - msg = ( - "categorical index comparisons must have the same categories " - "and ordered attributes" - "|" - "Categoricals can only be compared if 'categories' are the same. 
" - "Categories are different lengths" - "|" - "Categoricals can only be compared if 'ordered' is the same" - ) + msg = "Categoricals can only be compared if 'categories' are the same" with pytest.raises(TypeError, match=msg): ci1 == ci2 with pytest.raises(TypeError, match=msg): From de58f8ebacd6eb24b929dd824dc08b66fbcd17f5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Sep 2020 14:15:13 -0700 Subject: [PATCH 0741/1025] searchsorted numpy compat for Period dtype (#36254) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arrays/period.py | 7 +++++++ pandas/tests/arrays/test_datetimelike.py | 2 +- pandas/tests/indexes/period/test_searchsorted.py | 9 ++++++++- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f632d87a32a5a..c746b83c5f526 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -228,7 +228,7 @@ Datetimelike - Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) - Bug in :meth:`DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`DatetimeIndex` (:issue:`35690`) - Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) -- Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64`` or ``timedelta64`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`) +- Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`,:issue:`36254`) Timedelta ^^^^^^^^^ diff --git 
a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 865b1680c008a..44c0455018a42 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -587,6 +587,13 @@ def astype(self, dtype, copy: bool = True): return self.asfreq(dtype.freq) return super().astype(dtype, copy=copy) + def searchsorted(self, value, side="left", sorter=None): + value = self._validate_searchsorted_value(value).view("M8[ns]") + + # Cast to M8 to get datetime-like NaT placement + m8arr = self._ndarray.view("M8[ns]") + return m8arr.searchsorted(value, side=side, sorter=sorter) + # ------------------------------------------------------------------ # Arithmetic Methods diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 9d316c38082af..624335fd78b0f 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -244,7 +244,7 @@ def test_searchsorted(self): # GH#29884 match numpy convention on whether NaT goes # at the end or the beginning result = arr.searchsorted(pd.NaT) - if np_version_under1p18 or self.array_cls is PeriodArray: + if np_version_under1p18: # Following numpy convention, NaT goes at the beginning # (unlike NaN which goes at the end) assert result == 0 diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py index f5a2583bf2e10..f2950b9f6065c 100644 --- a/pandas/tests/indexes/period/test_searchsorted.py +++ b/pandas/tests/indexes/period/test_searchsorted.py @@ -2,6 +2,7 @@ import pytest from pandas._libs.tslibs import IncompatibleFrequency +from pandas.compat.numpy import np_version_under1p18 from pandas import NaT, Period, PeriodIndex, Series, array import pandas._testing as tm @@ -21,7 +22,13 @@ def test_searchsorted(self, freq): p2 = Period("2014-01-04", freq=freq) assert pidx.searchsorted(p2) == 3 - assert pidx.searchsorted(NaT) == 0 + if np_version_under1p18: + # GH#36254 + # Following numpy 
convention, NaT goes at the beginning + # (unlike NaN which goes at the end) + assert pidx.searchsorted(NaT) == 0 + else: + assert pidx.searchsorted(NaT) == 5 msg = "Input has different freq=H from PeriodArray" with pytest.raises(IncompatibleFrequency, match=msg): From 95304d26178ab6a5e923b02db1c43a1d8cb70a47 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sat, 12 Sep 2020 17:17:48 -0400 Subject: [PATCH 0742/1025] BUG: na parameter for str.startswith and str.endswith not propagating for Series with categorical dtype (#36249) --- doc/source/whatsnew/v1.1.3.rst | 2 +- pandas/core/strings.py | 2 +- pandas/tests/test_strings.py | 40 +++++++++++++++++++++++++++------- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index e3161012da5d1..c06990e3f2051 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -22,7 +22,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 6702bf519c52e..4decd86764ccc 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -2050,7 +2050,7 @@ def wrapper2(self, pat, flags=0, **kwargs): @forbid_nonstring_types(forbidden_types, name=name) def wrapper3(self, pat, na=np.nan): result = f(self._parent, pat, na=na) - return self._wrap_result(result, returns_string=returns_string) + return self._wrap_result(result, returns_string=returns_string, fill_value=na) wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index d9396d70f9112..c792a48d3ef08 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -29,6 +29,8 @@ def assert_series_or_index_equal(left, right): ("decode", ("UTF-8",), {}), ("encode", ("UTF-8",), {}), ("endswith", ("a",), {}), + ("endswith", ("a",), {"na": True}), + ("endswith", ("a",), {"na": False}), ("extract", ("([a-z]*)",), {"expand": False}), ("extract", ("([a-z]*)",), {"expand": True}), ("extractall", ("([a-z]*)",), {}), @@ -58,6 +60,8 @@ def assert_series_or_index_equal(left, right): ("split", (" ",), {"expand": False}), ("split", (" ",), {"expand": True}), ("startswith", ("a",), {}), + ("startswith", ("a",), {"na": True}), + ("startswith", ("a",), {"na": False}), # translating unicode points of "a" to "d" ("translate", ({97: 100},), {}), ("wrap", (2,), {}), @@ -838,15 +842,23 @@ def test_contains_for_object_category(self): expected = Series([True, False, False, True, False]) tm.assert_series_equal(result, expected) - def test_startswith(self): - values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"]) + @pytest.mark.parametrize("dtype", [None, "category"]) + @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) + @pytest.mark.parametrize("na", [True, False]) + def test_startswith(self, dtype, 
null_value, na): + # add category dtype parametrizations for GH-36241 + values = Series( + ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], + dtype=dtype, + ) result = values.str.startswith("foo") exp = Series([False, np.nan, True, False, False, np.nan, True]) tm.assert_series_equal(result, exp) - result = values.str.startswith("foo", na=True) - tm.assert_series_equal(result, exp.fillna(True).astype(bool)) + result = values.str.startswith("foo", na=na) + exp = Series([False, na, True, False, False, na, True]) + tm.assert_series_equal(result, exp) # mixed mixed = np.array( @@ -867,15 +879,23 @@ def test_startswith(self): ) tm.assert_series_equal(rs, xp) - def test_endswith(self): - values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"]) + @pytest.mark.parametrize("dtype", [None, "category"]) + @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA]) + @pytest.mark.parametrize("na", [True, False]) + def test_endswith(self, dtype, null_value, na): + # add category dtype parametrizations for GH-36241 + values = Series( + ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"], + dtype=dtype, + ) result = values.str.endswith("foo") exp = Series([False, np.nan, False, False, True, np.nan, True]) tm.assert_series_equal(result, exp) - result = values.str.endswith("foo", na=False) - tm.assert_series_equal(result, exp.fillna(False).astype(bool)) + result = values.str.endswith("foo", na=na) + exp = Series([False, na, False, False, True, na, True]) + tm.assert_series_equal(result, exp) # mixed mixed = np.array( @@ -3552,6 +3572,10 @@ def test_string_array(any_string_method): assert result.dtype == "boolean" result = result.astype(object) + elif expected.dtype == "bool": + assert result.dtype == "boolean" + result = result.astype("bool") + elif expected.dtype == "float" and expected.isna().any(): assert result.dtype == "Int64" result = result.astype("float") From 93706aaff783d4d29d80a34ef5ff89b6a323c613 Mon Sep 17 00:00:00 
2001 From: jbrockmendel Date: Sat, 12 Sep 2020 14:20:29 -0700 Subject: [PATCH 0743/1025] PERF: JoinUnit.is_na (#36312) --- pandas/core/dtypes/missing.py | 39 ++++++++++++++++++++++++++++++++- pandas/core/internals/concat.py | 13 ++++------- 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 163500525dbd8..d2e4974741b88 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -9,7 +9,7 @@ from pandas._libs import lib import pandas._libs.missing as libmissing -from pandas._libs.tslibs import NaT, iNaT +from pandas._libs.tslibs import NaT, Period, iNaT from pandas._typing import ArrayLike, DtypeObj from pandas.core.dtypes.common import ( @@ -43,6 +43,9 @@ isposinf_scalar = libmissing.isposinf_scalar isneginf_scalar = libmissing.isneginf_scalar +nan_checker = np.isnan +INF_AS_NA = False + def isna(obj): """ @@ -188,6 +191,12 @@ def _use_inf_as_na(key): """ inf_as_na = get_option(key) globals()["_isna"] = partial(_isna, inf_as_na=inf_as_na) + if inf_as_na: + globals()["nan_checker"] = lambda x: ~np.isfinite(x) + globals()["INF_AS_NA"] = True + else: + globals()["nan_checker"] = np.isnan + globals()["INF_AS_NA"] = False def _isna_ndarraylike(obj, inf_as_na: bool = False): @@ -602,3 +611,31 @@ def is_valid_nat_for_dtype(obj, dtype: DtypeObj) -> bool: # must be PeriodDType return not isinstance(obj, (np.datetime64, np.timedelta64)) + + +def isna_all(arr: ArrayLike) -> bool: + """ + Optimized equivalent to isna(arr).all() + """ + total_len = len(arr) + + # Usually it's enough to check but a small fraction of values to see if + # a block is NOT null, chunks should help in such cases. 
+ # parameters 1000 and 40 were chosen arbitrarily + chunk_len = max(total_len // 40, 1000) + + dtype = arr.dtype + if dtype.kind == "f": + checker = nan_checker + + elif dtype.kind in ["m", "M"] or dtype.type is Period: + checker = lambda x: np.asarray(x.view("i8")) == iNaT + + else: + checker = lambda x: _isna_ndarraylike(x, inf_as_na=INF_AS_NA) + + for i in range(0, total_len, chunk_len): + if not checker(arr[i : i + chunk_len]).all(): + return False + + return True diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 513c5fed1ca62..f5d0c921e1006 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -21,7 +21,7 @@ is_timedelta64_dtype, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import isna_all import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray, ExtensionArray @@ -223,13 +223,8 @@ def is_na(self): values_flat = values else: values_flat = values.ravel(order="K") - total_len = values_flat.shape[0] - chunk_len = max(total_len // 40, 1000) - for i in range(0, total_len, chunk_len): - if not isna(values_flat[i : i + chunk_len]).all(): - return False - return True + return isna_all(values_flat) def get_reindexed_values(self, empty_dtype, upcasted_na): if upcasted_na is None: @@ -474,8 +469,8 @@ def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: # cannot necessarily join return ( # all blocks need to have the same type - all(type(ju.block) is type(join_units[0].block) for ju in join_units) - and # noqa + all(type(ju.block) is type(join_units[0].block) for ju in join_units) # noqa + and # no blocks that would get missing values (can lead to type upcasts) # unless we're an extension dtype. 
all(not ju.is_na or ju.block.is_extension for ju in join_units) From bdeb2c56ca2059ad7ea96f12cf8046674f3d242a Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 12 Sep 2020 22:21:15 +0100 Subject: [PATCH 0744/1025] PERF: creating string Series/Arrays from sequence with many strings (#36304) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/lib.pyx | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index c746b83c5f526..00cbb248e4690 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -205,6 +205,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 7464fafee2b94..cc63df90a9a9f 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -655,6 +655,10 @@ cpdef ndarray[object] ensure_string_array( for i in range(n): val = result[i] + + if isinstance(val, str): + continue + if not checknull(val): result[i] = str(val) else: From f9662b8fd1339e8e49ed108531f5a5b9394a8bec Mon Sep 17 00:00:00 2001 From: Yanxian Lin Date: Sat, 12 Sep 2020 14:23:20 -0700 Subject: [PATCH 0745/1025] TST: add test case for sort_index on multiindexed Frame with sparse columns (#36236) --- pandas/tests/frame/methods/test_sort_index.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index dcc33428d18a5..a106702aff807 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -739,3 +739,18 @@ def test_changes_length_raises(self): df = pd.DataFrame({"A": [1, 2, 3]}) with pytest.raises(ValueError, match="change the 
shape"): df.sort_index(key=lambda x: x[:1]) + + def test_sort_index_multiindex_sparse_column(self): + # GH 29735, testing that sort_index on a multiindexed frame with sparse + # columns fills with 0. + expected = pd.DataFrame( + { + i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0)) + for i in range(0, 4) + }, + index=pd.MultiIndex.from_product([[1, 2], [1, 2]]), + ) + + result = expected.sort_index(level=0) + + tm.assert_frame_equal(result, expected) From e6eed5f4eec0aa1381d0226a54a5e090e0c20f0f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Sep 2020 14:28:00 -0700 Subject: [PATCH 0746/1025] REF: use BlockManager.apply in csv code (#36150) --- pandas/core/internals/blocks.py | 16 +++++++++------- pandas/io/formats/csvs.py | 9 ++++----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c8da04fbbf987..eb5b887c8b0cb 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -593,7 +593,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # use native type formatting for datetime/tz/timedelta if self.is_datelike: - values = self.to_native_types() + values = self.to_native_types().values # astype formatting else: @@ -684,7 +684,7 @@ def to_native_types(self, na_rep="nan", quoting=None, **kwargs): values = np.array(values, dtype="object") values[mask] = na_rep - return values + return self.make_block(values) # block actions # def copy(self, deep: bool = True): @@ -1774,7 +1774,7 @@ def to_native_types(self, na_rep="nan", quoting=None, **kwargs): # TODO(EA2D): reshape not needed with 2D EAs # we are expected to return a 2-d ndarray - return values.reshape(1, len(values)) + return self.make_block(values) def take_nd( self, indexer, axis: int = 0, new_mgr_locs=None, fill_value=lib.no_default @@ -2021,7 +2021,7 @@ def to_native_types( values = np.array(values, dtype="object") values[mask] = na_rep - return 
values + return self.make_block(values) from pandas.io.formats.format import FloatArrayFormatter @@ -2033,7 +2033,8 @@ def to_native_types( quoting=quoting, fixed_width=False, ) - return formatter.get_result_as_array() + res = formatter.get_result_as_array() + return self.make_block(res) class ComplexBlock(FloatOrComplexBlock): @@ -2192,7 +2193,7 @@ def to_native_types(self, na_rep="NaT", date_format=None, **kwargs): result = dta._format_native_types( na_rep=na_rep, date_format=date_format, **kwargs ) - return np.atleast_2d(result) + return self.make_block(result) def set(self, locs, values): """ @@ -2408,7 +2409,8 @@ def fillna(self, value, **kwargs): def to_native_types(self, na_rep="NaT", **kwargs): """ convert to our native types format """ tda = self.array_values() - return tda._format_native_types(na_rep, **kwargs) + res = tda._format_native_types(na_rep, **kwargs) + return self.make_block(res) class BoolBlock(NumericBlock): diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 90ab6f61f4d74..1bda16d126905 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -341,12 +341,11 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: slicer = slice(start_i, end_i) df = self.obj.iloc[slicer] + mgr = df._mgr - for block in df._mgr.blocks: - d = block.to_native_types(**self._number_format) - - for col_loc, col in zip(block.mgr_locs, d): - data[col_loc] = col + res = mgr.apply("to_native_types", **self._number_format) + for i in range(len(res.items)): + data[i] = res.iget_values(i) ix = self.data_index.to_native_types(slicer=slicer, **self._number_format) libwriters.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) From fc0f4e63fca0df4038a3f0c436daf0cf2d30a198 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Sep 2020 14:30:32 -0700 Subject: [PATCH 0747/1025] STY/WIP: check for private imports/lookups (#36055) --- Makefile | 6 +++ ci/code_checks.sh | 14 ++++-- pandas/core/arrays/datetimelike.py | 10 ++-- 
pandas/core/arrays/integer.py | 8 ++-- pandas/core/dtypes/cast.py | 6 ++- pandas/core/groupby/groupby.py | 6 +-- pandas/core/indexes/datetimes.py | 4 +- pandas/core/resample.py | 9 +++- pandas/core/window/ewm.py | 4 +- pandas/core/window/expanding.py | 4 +- pandas/core/window/rolling.py | 6 +-- pandas/io/formats/format.py | 10 ++-- scripts/validate_unwanted_patterns.py | 66 ++++++++++++++++++++++++++- 13 files changed, 119 insertions(+), 34 deletions(-) diff --git a/Makefile b/Makefile index 4a9a48992f92f..b915d8840cd8d 100644 --- a/Makefile +++ b/Makefile @@ -32,3 +32,9 @@ check: --included-file-extensions="py" \ --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored \ pandas/ + + python3 scripts/validate_unwanted_patterns.py \ + --validation-type="private_import_across_module" \ + --included-file-extensions="py" \ + --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ + pandas/ diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 875f1dbb83ce3..54aa830379c07 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -116,11 +116,19 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then fi RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check for use of private module attribute access' ; echo $MSG + MSG='Check for import of private attributes across modules' ; echo $MSG if [[ "$GITHUB_ACTIONS" == "true" ]]; then - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/ + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/ else - $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" 
--included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/ + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_import_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/ + fi + RET=$(($RET + $?)) ; echo $MSG "DONE" + + MSG='Check for use of private functions across modules' ; echo $MSG + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ --format="##[error]{source_path}:{line_number}:{msg}" pandas/ + else + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored,doc/ pandas/ fi RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6302b48cb1978..b013246e724de 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -54,7 +54,7 @@ from pandas.core import missing, nanops, ops from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts -from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionOpsMixin import pandas.core.common as com from pandas.core.construction import array, extract_array @@ -472,11 +472,11 @@ class DatetimeLikeArrayMixin( def _ndarray(self) -> np.ndarray: return self._data - def _from_backing_data(self: _T, arr: np.ndarray) -> _T: + def _from_backing_data( + self: DatetimeLikeArrayT, arr: np.ndarray + ) -> DatetimeLikeArrayT: # Note: we do not retain `freq` - return type(self)._simple_new( # type: ignore[attr-defined] - arr, 
dtype=self.dtype - ) + return type(self)._simple_new(arr, dtype=self.dtype) # ------------------------------------------------------------------ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index d83ff91a1315f..dc08e018397bc 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -106,7 +106,7 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], [] ) if np.issubdtype(np_dtype, np.integer): - return _dtypes[str(np_dtype)] + return STR_TO_DTYPE[str(np_dtype)] return None def __from_arrow__( @@ -214,7 +214,7 @@ def coerce_to_array( if not issubclass(type(dtype), _IntegerDtype): try: - dtype = _dtypes[str(np.dtype(dtype))] + dtype = STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err @@ -354,7 +354,7 @@ class IntegerArray(BaseMaskedArray): @cache_readonly def dtype(self) -> _IntegerDtype: - return _dtypes[str(self._data.dtype)] + return STR_TO_DTYPE[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): @@ -735,7 +735,7 @@ class UInt64Dtype(_IntegerDtype): __doc__ = _dtype_docstring.format(dtype="uint64") -_dtypes: Dict[str, _IntegerDtype] = { +STR_TO_DTYPE: Dict[str, _IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ba1b0b075936d..64ccc0be0a25d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1151,9 +1151,11 @@ def convert_dtypes( target_int_dtype = "Int64" if is_integer_dtype(input_array.dtype): - from pandas.core.arrays.integer import _dtypes + from pandas.core.arrays.integer import STR_TO_DTYPE - inferred_dtype = _dtypes.get(input_array.dtype.name, target_int_dtype) + inferred_dtype = 
STR_TO_DTYPE.get( + input_array.dtype.name, target_int_dtype + ) if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( input_array.dtype ): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1e3e56f4ff09f..8a55d438cf8d4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -459,7 +459,7 @@ def f(self): @contextmanager -def group_selection_context(groupby: "_GroupBy"): +def group_selection_context(groupby: "BaseGroupBy"): """ Set / reset the group_selection_context. """ @@ -479,7 +479,7 @@ def group_selection_context(groupby: "_GroupBy"): ] -class _GroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]): +class BaseGroupBy(PandasObject, SelectionMixin, Generic[FrameOrSeries]): _group_selection = None _apply_allowlist: FrozenSet[str] = frozenset() @@ -1212,7 +1212,7 @@ def _apply_filter(self, indices, dropna): OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame) -class GroupBy(_GroupBy[FrameOrSeries]): +class GroupBy(BaseGroupBy[FrameOrSeries]): """ Class for grouping and aggregating relational data. 
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index f0b80c2852bd5..f269495f6011a 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -312,9 +312,9 @@ def _is_dates_only(self) -> bool: ------- bool """ - from pandas.io.formats.format import _is_dates_only + from pandas.io.formats.format import is_dates_only - return self.tz is None and _is_dates_only(self._values) + return self.tz is None and is_dates_only(self._values) def __reduce__(self): diff --git a/pandas/core/resample.py b/pandas/core/resample.py index a2bf631959dd1..4ba253e76128e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -26,7 +26,12 @@ from pandas.core.generic import NDFrame, _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.groupby.generic import SeriesGroupBy -from pandas.core.groupby.groupby import GroupBy, _GroupBy, _pipe_template, get_groupby +from pandas.core.groupby.groupby import ( + BaseGroupBy, + GroupBy, + _pipe_template, + get_groupby, +) from pandas.core.groupby.grouper import Grouper from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.api import Index @@ -40,7 +45,7 @@ _shared_docs_kwargs: Dict[str, str] = dict() -class Resampler(_GroupBy, ShallowMixin): +class Resampler(BaseGroupBy, ShallowMixin): """ Class for resampling datetimelike data, a groupby-like operation. See aggregate, transform, and apply functions on this object. 
diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 2bd36d8bff155..4282cb41c4e91 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -15,7 +15,7 @@ import pandas.core.common as common from pandas.core.window.common import _doc_template, _shared_docs, zsqrt -from pandas.core.window.rolling import _Rolling, flex_binary_moment +from pandas.core.window.rolling import RollingMixin, flex_binary_moment _bias_template = """ Parameters @@ -60,7 +60,7 @@ def get_center_of_mass( return float(comass) -class ExponentialMovingWindow(_Rolling): +class ExponentialMovingWindow(RollingMixin): r""" Provide exponential weighted (EW) functions. diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index ce4ab2f98c23d..46e002324ec75 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -5,10 +5,10 @@ from pandas.util._decorators import Appender, Substitution, doc from pandas.core.window.common import WindowGroupByMixin, _doc_template, _shared_docs -from pandas.core.window.rolling import _Rolling_and_Expanding +from pandas.core.window.rolling import RollingAndExpandingMixin -class Expanding(_Rolling_and_Expanding): +class Expanding(RollingAndExpandingMixin): """ Provide expanding transformations. 
diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 5a7482076903c..648ab4d25be83 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1214,13 +1214,13 @@ def std(self, ddof=1, *args, **kwargs): return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) -class _Rolling(_Window): +class RollingMixin(_Window): @property def _constructor(self): return Rolling -class _Rolling_and_Expanding(_Rolling): +class RollingAndExpandingMixin(RollingMixin): _shared_docs["count"] = dedent( r""" @@ -1917,7 +1917,7 @@ def _get_corr(a, b): ) -class Rolling(_Rolling_and_Expanding): +class Rolling(RollingAndExpandingMixin): @cache_readonly def is_datetimelike(self) -> bool: return isinstance( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 444afcee49a61..4a36dd7bc6de4 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1588,7 +1588,7 @@ def format_percentiles( return [i + "%" for i in out] -def _is_dates_only( +def is_dates_only( values: Union[np.ndarray, DatetimeArray, Index, DatetimeIndex] ) -> bool: # return a boolean if we are only dates (and don't have a timezone) @@ -1660,8 +1660,8 @@ def get_format_datetime64_from_values( # only accepts 1D values values = values.ravel() - is_dates_only = _is_dates_only(values) - if is_dates_only: + ido = is_dates_only(values) + if ido: return date_format or "%Y-%m-%d" return date_format @@ -1670,9 +1670,9 @@ class Datetime64TZFormatter(Datetime64Formatter): def _format_strings(self) -> List[str]: """ we by definition have a TZ """ values = self.values.astype(object) - is_dates_only = _is_dates_only(values) + ido = is_dates_only(values) formatter = self.formatter or get_format_datetime64( - is_dates_only, date_format=self.date_format + ido, date_format=self.date_format ) fmt_values = [formatter(x) for x in values] diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 2add2b8c62a4e..4a0e859535215 
100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -18,6 +18,39 @@ import tokenize from typing import IO, Callable, FrozenSet, Iterable, List, Set, Tuple +PRIVATE_IMPORTS_TO_IGNORE: Set[str] = { + "_extension_array_shared_docs", + "_index_shared_docs", + "_interval_shared_docs", + "_merge_doc", + "_shared_docs", + "_apply_docs", + "_new_Index", + "_new_PeriodIndex", + "_doc_template", + "_agg_template", + "_pipe_template", + "_get_version", + "__main__", + "_transform_template", + "_arith_doc_FRAME", + "_flex_comp_doc_FRAME", + "_make_flex_doc", + "_op_descriptions", + "_IntegerDtype", + "_use_inf_as_na", + "_get_plot_backend", + "_matplotlib", + "_arrow_utils", + "_registry", + "_get_offset", # TODO: remove after get_offset deprecation enforced + "_test_parse_iso8601", + "_json_normalize", # TODO: remove after deprecation is enforced + "_testing", + "_test_decorators", + "__version__", # check np.__version__ in compat.numpy.function +} + def _get_literal_string_prefix_len(token_string: str) -> int: """ @@ -164,6 +197,36 @@ def private_function_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str yield (node.lineno, f"Private function '{module_name}.{function_name}'") +def private_import_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + Checking that a private function is not imported across modules. + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + Yields + ------ + line_number : int + Line number of import statement, that imports the private function. + msg : str + Explenation of the error. 
+ """ + contents = file_obj.read() + tree = ast.parse(contents) + + for node in ast.walk(tree): + if not (isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom)): + continue + + for module in node.names: + module_name = module.name.split(".")[-1] + if module_name in PRIVATE_IMPORTS_TO_IGNORE: + continue + + if module_name.startswith("_"): + yield (node.lineno, f"Import of internal function {repr(module_name)}") + + def strings_to_concatenate(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: """ This test case is necessary after 'Black' (https://github.com/psf/black), @@ -419,6 +482,7 @@ def main( available_validation_types: List[str] = [ "bare_pytest_raises", "private_function_across_module", + "private_import_across_module", "strings_to_concatenate", "strings_with_wrong_placed_whitespace", ] @@ -449,7 +513,7 @@ def main( parser.add_argument( "--excluded-file-paths", default="asv_bench/env", - help="Comma separated file extensions to check.", + help="Comma separated file paths to exclude.", ) args = parser.parse_args() From 5a3287d434f9e80acc54426a483a8610820162f6 Mon Sep 17 00:00:00 2001 From: Sam Ezebunandu Date: Sat, 12 Sep 2020 15:31:58 -0600 Subject: [PATCH 0748/1025] DOC: Fix DataFrame.query contradiction on use of Python keywords as identifiers (#36311) --- pandas/core/frame.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b03593ad8afe1..b8b49156a0967 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3238,11 +3238,12 @@ def query(self, expr, inplace=False, **kwargs): in the environment by prefixing them with an '@' character like ``@a + b``. - You can refer to column names that contain spaces or operators by - surrounding them in backticks. This way you can also escape - names that start with a digit, or those that are a Python keyword. - Basically when it is not valid Python identifier. See notes down - for more details. 
+ You can refer to column names that are not valid Python variable names + by surrounding them in backticks. Thus, column names containing spaces + or punctuations (besides underscores) or starting with digits must be + surrounded by backticks. (For example, a column named "Area (cm^2) would + be referenced as `Area (cm^2)`). Column names which are Python keywords + (like "list", "for", "import", etc) cannot be used. For example, if one of your columns is called ``a a`` and you want to sum it with ``b``, your query should be ```a a` + b``. From 06a349ae19540515ec85202b42c396c7d69b2925 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 12 Sep 2020 17:36:51 -0400 Subject: [PATCH 0749/1025] BUG/CLN: Decouple Series/DataFrame.transform (#35964) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/aggregation.py | 98 +++++++- pandas/core/base.py | 4 +- pandas/core/frame.py | 16 +- pandas/core/generic.py | 74 ------ pandas/core/series.py | 14 +- pandas/core/shared_docs.py | 69 ++++++ .../tests/frame/apply/test_frame_transform.py | 227 ++++++++++++++---- .../tests/series/apply/test_series_apply.py | 5 +- .../series/apply/test_series_transform.py | 168 ++++++++++--- 10 files changed, 507 insertions(+), 169 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 00cbb248e4690..af2960b5038b2 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -344,6 +344,7 @@ Other ^^^^^ - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) +- Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 7ca68d8289bd5..8b74fe01d0dc0 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -18,9 +18,10 @@ Union, ) -from pandas._typing import AggFuncType, FrameOrSeries, Label +from pandas._typing import AggFuncType, Axis, FrameOrSeries, Label from pandas.core.dtypes.common import is_dict_like, is_list_like +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries from pandas.core.base import SpecificationError import pandas.core.common as com @@ -384,3 +385,98 @@ def validate_func_kwargs( if not columns: raise TypeError(no_arg_message) return columns, func + + +def transform( + obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs, +) -> FrameOrSeries: + """ + Transform a DataFrame or Series + + Parameters + ---------- + obj : DataFrame or Series + Object to compute the transform on. + func : string, function, list, or dictionary + Function(s) to compute the transform with. + axis : {0 or 'index', 1 or 'columns'} + Axis along which the function is applied: + + * 0 or 'index': apply function to each column. + * 1 or 'columns': apply function to each row. + + Returns + ------- + DataFrame or Series + Result of applying ``func`` along the given axis of the + Series or DataFrame. + + Raises + ------ + ValueError + If the transform function fails or does not transform. 
+ """ + from pandas.core.reshape.concat import concat + + is_series = obj.ndim == 1 + + if obj._get_axis_number(axis) == 1: + assert not is_series + return transform(obj.T, func, 0, *args, **kwargs).T + + if isinstance(func, list): + if is_series: + func = {com.get_callable_name(v) or v: v for v in func} + else: + func = {col: func for col in obj} + + if isinstance(func, dict): + if not is_series: + cols = sorted(set(func.keys()) - set(obj.columns)) + if len(cols) > 0: + raise SpecificationError(f"Column(s) {cols} do not exist") + + if any(isinstance(v, dict) for v in func.values()): + # GH 15931 - deprecation of renaming keys + raise SpecificationError("nested renamer is not supported") + + results = {} + for name, how in func.items(): + colg = obj._gotitem(name, ndim=1) + try: + results[name] = transform(colg, how, 0, *args, **kwargs) + except Exception as e: + if str(e) == "Function did not transform": + raise e + + # combine results + if len(results) == 0: + raise ValueError("Transform function failed") + return concat(results, axis=1) + + # func is either str or callable + try: + if isinstance(func, str): + result = obj._try_aggregate_string_function(func, *args, **kwargs) + else: + f = obj._get_cython_func(func) + if f and not args and not kwargs: + result = getattr(obj, f)() + else: + try: + result = obj.apply(func, args=args, **kwargs) + except Exception: + result = func(obj, *args, **kwargs) + except Exception: + raise ValueError("Transform function failed") + + # Functions that transform may return empty Series/DataFrame + # when the dtype is not appropriate + if isinstance(result, (ABCSeries, ABCDataFrame)) and result.empty: + raise ValueError("Transform function failed") + if not isinstance(result, (ABCSeries, ABCDataFrame)) or not result.index.equals( + obj.index + ): + raise ValueError("Function did not transform") + + return result diff --git a/pandas/core/base.py b/pandas/core/base.py index 1926803d8f04b..a688302b99724 100644 --- 
a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,7 +4,7 @@ import builtins import textwrap -from typing import Any, Dict, FrozenSet, List, Optional, Union +from typing import Any, Callable, Dict, FrozenSet, List, Optional, Union import numpy as np @@ -560,7 +560,7 @@ def _aggregate_multiple_funcs(self, arg, _axis): ) from err return result - def _get_cython_func(self, arg: str) -> Optional[str]: + def _get_cython_func(self, arg: Callable) -> Optional[str]: """ if we define an internal function for this argument, return it """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b8b49156a0967..e9b4fd237c2c2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -45,6 +45,7 @@ from pandas._libs import algos as libalgos, lib, properties from pandas._libs.lib import no_default from pandas._typing import ( + AggFuncType, ArrayLike, Axes, Axis, @@ -116,7 +117,7 @@ from pandas.core import algorithms, common as com, nanops, ops from pandas.core.accessor import CachedAccessor -from pandas.core.aggregation import reconstruct_func, relabel_result +from pandas.core.aggregation import reconstruct_func, relabel_result, transform from pandas.core.arrays import Categorical, ExtensionArray from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin as DatetimeLikeArray from pandas.core.arrays.sparse import SparseFrameAccessor @@ -7462,15 +7463,16 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): agg = aggregate @doc( - NDFrame.transform, + _shared_docs["transform"], klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) - def transform(self, func, axis=0, *args, **kwargs) -> DataFrame: - axis = self._get_axis_number(axis) - if axis == 1: - return self.T.transform(func, *args, **kwargs).T - return super().transform(func, *args, **kwargs) + def transform( + self, func: AggFuncType, axis: Axis = 0, *args, **kwargs + ) -> DataFrame: + result = transform(self, func, axis, *args, **kwargs) + assert isinstance(result, DataFrame) + return 
result def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fffd2e068ebcf..9ed9db801d0a8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10648,80 +10648,6 @@ def ewm( times=times, ) - @doc(klass=_shared_doc_kwargs["klass"], axis="") - def transform(self, func, *args, **kwargs): - """ - Call ``func`` on self producing a {klass} with transformed values. - - Produced {klass} will have same axis length as self. - - Parameters - ---------- - func : function, str, list or dict - Function to use for transforming the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.exp, 'sqrt']`` - - dict of axis labels -> functions, function names or list of such. - {axis} - *args - Positional arguments to pass to `func`. - **kwargs - Keyword arguments to pass to `func`. - - Returns - ------- - {klass} - A {klass} that must have the same length as self. - - Raises - ------ - ValueError : If the returned {klass} has a different length than self. - - See Also - -------- - {klass}.agg : Only perform aggregating type operations. - {klass}.apply : Invoke function on a {klass}. 
- - Examples - -------- - >>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) - >>> df - A B - 0 0 1 - 1 1 2 - 2 2 3 - >>> df.transform(lambda x: x + 1) - A B - 0 1 2 - 1 2 3 - 2 3 4 - - Even though the resulting {klass} must have the same length as the - input {klass}, it is possible to provide several input functions: - - >>> s = pd.Series(range(3)) - >>> s - 0 0 - 1 1 - 2 2 - dtype: int64 - >>> s.transform([np.sqrt, np.exp]) - sqrt exp - 0 0.000000 1.000000 - 1 1.000000 2.718282 - 2 1.414214 7.389056 - """ - result = self.agg(func, *args, **kwargs) - if is_scalar(result) or len(result) != len(self): - raise ValueError("transforms cannot produce aggregated results") - - return result - # ---------------------------------------------------------------------- # Misc methods diff --git a/pandas/core/series.py b/pandas/core/series.py index 6cbd93135a2ca..632b93cdcf24b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -25,6 +25,7 @@ from pandas._libs import lib, properties, reshape, tslibs from pandas._libs.lib import no_default from pandas._typing import ( + AggFuncType, ArrayLike, Axis, DtypeObj, @@ -89,6 +90,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_bool_indexer from pandas.core.internals import SingleBlockManager +from pandas.core.shared_docs import _shared_docs from pandas.core.sorting import ensure_key_mapped from pandas.core.strings import StringMethods from pandas.core.tools.datetimes import to_datetime @@ -4081,14 +4083,16 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): agg = aggregate @doc( - NDFrame.transform, + _shared_docs["transform"], klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) - def transform(self, func, axis=0, *args, **kwargs): - # Validate the axis parameter - self._get_axis_number(axis) - return super().transform(func, *args, **kwargs) + def transform( + self, func: AggFuncType, axis: Axis = 0, *args, **kwargs + ) -> 
FrameOrSeriesUnion: + from pandas.core.aggregation import transform + + return transform(self, func, axis, *args, **kwargs) def apply(self, func, convert_dtype=True, args=(), **kwds): """ diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 0aaccb47efc44..244ee3aa298db 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -257,3 +257,72 @@ 1 b B E 3 2 c B E 5 """ + +_shared_docs[ + "transform" +] = """\ +Call ``func`` on self producing a {klass} with transformed values. + +Produced {klass} will have same axis length as self. + +Parameters +---------- +func : function, str, list or dict + Function to use for transforming the data. If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.exp, 'sqrt']`` + - dict of axis labels -> functions, function names or list of such. +{axis} +*args + Positional arguments to pass to `func`. +**kwargs + Keyword arguments to pass to `func`. + +Returns +------- +{klass} + A {klass} that must have the same length as self. + +Raises +------ +ValueError : If the returned {klass} has a different length than self. + +See Also +-------- +{klass}.agg : Only perform aggregating type operations. +{klass}.apply : Invoke function on a {klass}. 
+ +Examples +-------- +>>> df = pd.DataFrame({{'A': range(3), 'B': range(1, 4)}}) +>>> df + A B +0 0 1 +1 1 2 +2 2 3 +>>> df.transform(lambda x: x + 1) + A B +0 1 2 +1 2 3 +2 3 4 + +Even though the resulting {klass} must have the same length as the +input {klass}, it is possible to provide several input functions: + +>>> s = pd.Series(range(3)) +>>> s +0 0 +1 1 +2 2 +dtype: int64 +>>> s.transform([np.sqrt, np.exp]) + sqrt exp +0 0.000000 1.000000 +1 1.000000 2.718282 +2 1.414214 7.389056 +""" diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py index 3a345215482ed..346e60954fc13 100644 --- a/pandas/tests/frame/apply/test_frame_transform.py +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -1,72 +1,203 @@ import operator +import re import numpy as np import pytest -import pandas as pd +from pandas import DataFrame, MultiIndex import pandas._testing as tm +from pandas.core.base import SpecificationError +from pandas.core.groupby.base import transformation_kernels from pandas.tests.frame.common import zip_frames -def test_agg_transform(axis, float_frame): - other_axis = 1 if axis in {0, "index"} else 0 +def test_transform_ufunc(axis, float_frame): + # GH 35964 + with np.errstate(all="ignore"): + f_sqrt = np.sqrt(float_frame) + result = float_frame.transform(np.sqrt, axis=axis) + expected = f_sqrt + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("op", transformation_kernels) +def test_transform_groupby_kernel(axis, float_frame, op): + # GH 35964 + if op == "cumcount": + pytest.xfail("DataFrame.cumcount does not exist") + if op == "tshift": + pytest.xfail("Only works on time index and is deprecated") + if axis == 1 or axis == "columns": + pytest.xfail("GH 36308: groupby.transform with axis=1 is broken") + + args = [0.0] if op == "fillna" else [] + if axis == 0 or axis == "index": + ones = np.ones(float_frame.shape[0]) + else: + ones = np.ones(float_frame.shape[1]) + expected = 
float_frame.groupby(ones, axis=axis).transform(op, *args) + result = float_frame.transform(op, axis, *args) + tm.assert_frame_equal(result, expected) + +@pytest.mark.parametrize( + "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])] +) +def test_transform_list(axis, float_frame, ops, names): + # GH 35964 + other_axis = 1 if axis in {0, "index"} else 0 with np.errstate(all="ignore"): + expected = zip_frames([op(float_frame) for op in ops], axis=other_axis) + if axis in {0, "index"}: + expected.columns = MultiIndex.from_product([float_frame.columns, names]) + else: + expected.index = MultiIndex.from_product([float_frame.index, names]) + result = float_frame.transform(ops, axis=axis) + tm.assert_frame_equal(result, expected) - f_abs = np.abs(float_frame) - f_sqrt = np.sqrt(float_frame) - # ufunc - result = float_frame.transform(np.sqrt, axis=axis) - expected = f_sqrt.copy() - tm.assert_frame_equal(result, expected) - - result = float_frame.transform(np.sqrt, axis=axis) - tm.assert_frame_equal(result, expected) - - # list-like - expected = f_sqrt.copy() - if axis in {0, "index"}: - expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ["sqrt"]] - ) - else: - expected.index = pd.MultiIndex.from_product([float_frame.index, ["sqrt"]]) - result = float_frame.transform([np.sqrt], axis=axis) - tm.assert_frame_equal(result, expected) - - # multiple items in list - # these are in the order as if we are applying both - # functions per series and then concatting - expected = zip_frames([f_abs, f_sqrt], axis=other_axis) - if axis in {0, "index"}: - expected.columns = pd.MultiIndex.from_product( - [float_frame.columns, ["absolute", "sqrt"]] - ) - else: - expected.index = pd.MultiIndex.from_product( - [float_frame.index, ["absolute", "sqrt"]] - ) - result = float_frame.transform([np.abs, "sqrt"], axis=axis) - tm.assert_frame_equal(result, expected) +def test_transform_dict(axis, float_frame): + # GH 35964 + if axis == 0 or axis == 
"index": + e = float_frame.columns[0] + expected = float_frame[[e]].transform(np.abs) + else: + e = float_frame.index[0] + expected = float_frame.iloc[[0]].transform(np.abs) + result = float_frame.transform({e: np.abs}, axis=axis) + tm.assert_frame_equal(result, expected) -def test_transform_and_agg_err(axis, float_frame): - # cannot both transform and agg - msg = "transforms cannot produce aggregated results" - with pytest.raises(ValueError, match=msg): - float_frame.transform(["max", "min"], axis=axis) +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_udf(axis, float_frame, use_apply): + # GH 35964 + # transform uses UDF either via apply or passing the entire DataFrame + def func(x): + # transform is using apply iff x is not a DataFrame + if use_apply == isinstance(x, DataFrame): + # Force transform to fallback + raise ValueError + return x + 1 - msg = "cannot combine transform and aggregation operations" - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - float_frame.transform(["max", "sqrt"], axis=axis) + result = float_frame.transform(func, axis=axis) + expected = float_frame + 1 + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) def test_transform_method_name(method): # GH 19760 - df = pd.DataFrame({"A": [-1, 2]}) + df = DataFrame({"A": [-1, 2]}) result = df.transform(method) expected = operator.methodcaller(method)(df) tm.assert_frame_equal(result, expected) + + +def test_transform_and_agg_err(axis, float_frame): + # GH 35964 + # cannot both transform and agg + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + float_frame.transform(["max", "min"], axis=axis) + + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + float_frame.transform(["max", "sqrt"], axis=axis) + + +def test_agg_dict_nested_renaming_depr(): + df = DataFrame({"A": range(5), "B": 5}) + + # nested renaming 
+ msg = r"nested renamer is not supported" + with pytest.raises(SpecificationError, match=msg): + # mypy identifies the argument as an invalid type + df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}}) + + +def test_transform_reducer_raises(all_reductions): + # GH 35964 + op = all_reductions + df = DataFrame({"A": [1, 2, 3]}) + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + df.transform(op) + with pytest.raises(ValueError, match=msg): + df.transform([op]) + with pytest.raises(ValueError, match=msg): + df.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + df.transform({"A": [op]}) + + +# mypy doesn't allow adding lists of different types +# https://github.com/python/mypy/issues/5492 +@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) +def test_transform_bad_dtype(op): + # GH 35964 + df = DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms + if op in ("backfill", "shift", "pad", "bfill", "ffill"): + pytest.xfail("Transform function works on any datatype") + msg = "Transform function failed" + with pytest.raises(ValueError, match=msg): + df.transform(op) + with pytest.raises(ValueError, match=msg): + df.transform([op]) + with pytest.raises(ValueError, match=msg): + df.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + df.transform({"A": [op]}) + + +@pytest.mark.parametrize("op", transformation_kernels) +def test_transform_partial_failure(op): + # GH 35964 + wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"] + if op in wont_fail: + pytest.xfail("Transform kernel is successful on all dtypes") + if op == "cumcount": + pytest.xfail("transform('cumcount') not implemented") + if op == "tshift": + pytest.xfail("Only works on time index; deprecated") + + # Using object makes most transform kernels fail + df = DataFrame({"A": 3 * [object], "B": [1, 2, 3]}) + + expected = df[["B"]].transform([op]) + result = df.transform([op]) + 
tm.assert_equal(result, expected) + + expected = df[["B"]].transform({"B": op}) + result = df.transform({"B": op}) + tm.assert_equal(result, expected) + + expected = df[["B"]].transform({"B": [op]}) + result = df.transform({"B": [op]}) + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_passes_args(use_apply): + # GH 35964 + # transform uses UDF either via apply or passing the entire DataFrame + expected_args = [1, 2] + expected_kwargs = {"c": 3} + + def f(x, a, b, c): + # transform is using apply iff x is not a DataFrame + if use_apply == isinstance(x, DataFrame): + # Force transform to fallback + raise ValueError + assert [a, b] == expected_args + assert c == expected_kwargs["c"] + return x + + DataFrame([1]).transform(f, 0, *expected_args, **expected_kwargs) + + +def test_transform_missing_columns(axis): + # GH 35964 + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + match = re.escape("Column(s) ['C'] do not exist") + with pytest.raises(SpecificationError, match=match): + df.transform({"C": "cumsum"}) diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py index b948317f32062..827f466e23106 100644 --- a/pandas/tests/series/apply/test_series_apply.py +++ b/pandas/tests/series/apply/test_series_apply.py @@ -209,8 +209,8 @@ def test_transform(self, string_series): f_abs = np.abs(string_series) # ufunc - expected = f_sqrt.copy() result = string_series.apply(np.sqrt) + expected = f_sqrt.copy() tm.assert_series_equal(result, expected) # list-like @@ -219,6 +219,9 @@ def test_transform(self, string_series): expected.columns = ["sqrt"] tm.assert_frame_equal(result, expected) + result = string_series.apply(["sqrt"]) + tm.assert_frame_equal(result, expected) + # multiple items in list # these are in the order as if we are applying both functions per # series and then concatting diff --git a/pandas/tests/series/apply/test_series_transform.py 
b/pandas/tests/series/apply/test_series_transform.py index 8bc3d2dc4d0db..0842674da2a7d 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -1,50 +1,90 @@ import numpy as np import pytest -import pandas as pd +from pandas import DataFrame, Series, concat import pandas._testing as tm +from pandas.core.base import SpecificationError +from pandas.core.groupby.base import transformation_kernels -def test_transform(string_series): - # transforming functions - +def test_transform_ufunc(string_series): + # GH 35964 with np.errstate(all="ignore"): f_sqrt = np.sqrt(string_series) - f_abs = np.abs(string_series) - # ufunc - result = string_series.transform(np.sqrt) - expected = f_sqrt.copy() - tm.assert_series_equal(result, expected) + # ufunc + result = string_series.transform(np.sqrt) + expected = f_sqrt.copy() + tm.assert_series_equal(result, expected) - # list-like - result = string_series.transform([np.sqrt]) - expected = f_sqrt.to_frame().copy() - expected.columns = ["sqrt"] - tm.assert_frame_equal(result, expected) - result = string_series.transform([np.sqrt]) - tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("op", transformation_kernels) +def test_transform_groupby_kernel(string_series, op): + # GH 35964 + if op == "cumcount": + pytest.xfail("Series.cumcount does not exist") + if op == "tshift": + pytest.xfail("Only works on time index and is deprecated") + + args = [0.0] if op == "fillna" else [] + ones = np.ones(string_series.shape[0]) + expected = string_series.groupby(ones).transform(op, *args) + result = string_series.transform(op, 0, *args) + tm.assert_series_equal(result, expected) - result = string_series.transform(["sqrt"]) - tm.assert_frame_equal(result, expected) - # multiple items in list - # these are in the order as if we are applying both functions per - # series and then concatting - expected = pd.concat([f_sqrt, f_abs], axis=1) - result = 
string_series.transform(["sqrt", "abs"]) - expected.columns = ["sqrt", "abs"] +@pytest.mark.parametrize( + "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])] +) +def test_transform_list(string_series, ops, names): + # GH 35964 + with np.errstate(all="ignore"): + expected = concat([op(string_series) for op in ops], axis=1) + expected.columns = names + result = string_series.transform(ops) tm.assert_frame_equal(result, expected) -def test_transform_and_agg_error(string_series): +def test_transform_dict(string_series): + # GH 35964 + with np.errstate(all="ignore"): + expected = concat([np.sqrt(string_series), np.abs(string_series)], axis=1) + expected.columns = ["foo", "bar"] + result = string_series.transform({"foo": np.sqrt, "bar": np.abs}) + tm.assert_frame_equal(result, expected) + + +def test_transform_udf(axis, string_series): + # GH 35964 + # via apply + def func(x): + if isinstance(x, Series): + raise ValueError + return x + 1 + + result = string_series.transform(func) + expected = string_series + 1 + tm.assert_series_equal(result, expected) + + # via map Series -> Series + def func(x): + if not isinstance(x, Series): + raise ValueError + return x + 1 + + result = string_series.transform(func) + expected = string_series + 1 + tm.assert_series_equal(result, expected) + + +def test_transform_wont_agg(string_series): + # GH 35964 # we are trying to transform with an aggregator - msg = "transforms cannot produce aggregated results" + msg = "Function did not transform" with pytest.raises(ValueError, match=msg): string_series.transform(["min", "max"]) - msg = "cannot combine transform and aggregation operations" + msg = "Function did not transform" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): string_series.transform(["sqrt", "max"]) @@ -52,8 +92,74 @@ def test_transform_and_agg_error(string_series): def test_transform_none_to_type(): # GH34377 - df = pd.DataFrame({"a": [None]}) - - msg = "DataFrame 
constructor called with incompatible data and dtype" - with pytest.raises(TypeError, match=msg): + df = DataFrame({"a": [None]}) + msg = "Transform function failed" + with pytest.raises(ValueError, match=msg): df.transform({"a": int}) + + +def test_transform_reducer_raises(all_reductions): + # GH 35964 + op = all_reductions + s = Series([1, 2, 3]) + msg = "Function did not transform" + with pytest.raises(ValueError, match=msg): + s.transform(op) + with pytest.raises(ValueError, match=msg): + s.transform([op]) + with pytest.raises(ValueError, match=msg): + s.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + s.transform({"A": [op]}) + + +# mypy doesn't allow adding lists of different types +# https://github.com/python/mypy/issues/5492 +@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1]) +def test_transform_bad_dtype(op): + # GH 35964 + s = Series(3 * [object]) # Series that will fail on most transforms + if op in ("backfill", "shift", "pad", "bfill", "ffill"): + pytest.xfail("Transform function works on any datatype") + msg = "Transform function failed" + with pytest.raises(ValueError, match=msg): + s.transform(op) + with pytest.raises(ValueError, match=msg): + s.transform([op]) + with pytest.raises(ValueError, match=msg): + s.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + s.transform({"A": [op]}) + + +@pytest.mark.parametrize("use_apply", [True, False]) +def test_transform_passes_args(use_apply): + # GH 35964 + # transform uses UDF either via apply or passing the entire Series + expected_args = [1, 2] + expected_kwargs = {"c": 3} + + def f(x, a, b, c): + # transform is using apply iff x is not a Series + if use_apply == isinstance(x, Series): + # Force transform to fallback + raise ValueError + assert [a, b] == expected_args + assert c == expected_kwargs["c"] + return x + + Series([1]).transform(f, 0, *expected_args, **expected_kwargs) + + +def test_transform_axis_1_raises(): + # GH 35964 + msg = "No axis 
named 1 for object type Series" + with pytest.raises(ValueError, match=msg): + Series([1]).transform("sum", axis=1) + + +def test_transform_nested_renamer(): + # GH 35964 + match = "nested renamer is not supported" + with pytest.raises(SpecificationError, match=match): + Series([1]).transform({"A": {"B": ["sum"]}}) From f702d8a45002f90f831812a86e39d37a1501a36e Mon Sep 17 00:00:00 2001 From: Avinash Pancham <44933366+avinashpancham@users.noreply.github.com> Date: Sat, 12 Sep 2020 23:37:57 +0200 Subject: [PATCH 0750/1025] DEPR: Deprecate pandas/io/date_converters.py (#35741) --- doc/source/user_guide/io.rst | 15 +-- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/date_converters.py | 62 ++++++++++ pandas/tests/io/parser/test_parse_dates.py | 130 ++++++++++++++------- pandas/tests/io/test_date_converters.py | 15 ++- 5 files changed, 158 insertions(+), 66 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index a1ce2f847d4b8..4dfabaa99fff6 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -930,7 +930,7 @@ take full advantage of the flexibility of the date parsing API: .. ipython:: python df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - date_parser=pd.io.date_converters.parse_date_time) + date_parser=pd.to_datetime) df Pandas will try to call the ``date_parser`` function in three different ways. If @@ -942,11 +942,6 @@ an exception is raised, the next one is tried: 2. If #1 fails, ``date_parser`` is called with all the columns concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``). -3. If #2 fails, ``date_parser`` is called once for every row with one or more - string arguments from the columns indicated with `parse_dates` - (e.g., ``date_parser('2013', '1')`` for the first row, ``date_parser('2013', '2')`` - for the second, etc.). - Note that performance-wise, you should try these methods of parsing dates in order: 1. 
Try to infer the format using ``infer_datetime_format=True`` (see section below). @@ -958,14 +953,6 @@ Note that performance-wise, you should try these methods of parsing dates in ord For optimal performance, this should be vectorized, i.e., it should accept arrays as arguments. -You can explore the date parsing functionality in -`date_converters.py `__ -and add your own. We would love to turn this module into a community supported -set of date/time parsers. To get you started, ``date_converters.py`` contains -functions to parse dual date and time columns, year/month/day columns, -and year/month/day/hour/minute/second columns. It also contains a -``generic_parser`` function so you can curry it with a function that deals with -a single date rather than the entire array. .. ipython:: python :suppress: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index af2960b5038b2..2b230d2fde645 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -195,7 +195,7 @@ Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - Deprecated parameter ``dtype`` in :~meth:`Index.copy` on method all index classes. Use the :meth:`Index.astype` method instead for changing dtype(:issue:`35853`) -- +- Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/date_converters.py b/pandas/io/date_converters.py index 07919dbda63ae..f079a25f69fec 100644 --- a/pandas/io/date_converters.py +++ b/pandas/io/date_converters.py @@ -1,16 +1,46 @@ """This module is designed for community supported date conversion functions""" +import warnings + import numpy as np from pandas._libs.tslibs import parsing def parse_date_time(date_col, time_col): + """ + Parse columns with dates and times into a single datetime column. + + .. deprecated:: 1.2 + """ + warnings.warn( + """ + Use pd.to_datetime(date_col + " " + time_col) instead to get a Pandas Series. + Use pd.to_datetime(date_col + " " + time_col).to_pydatetime() instead to get a Numpy array. +""", # noqa: E501 + FutureWarning, + stacklevel=2, + ) date_col = _maybe_cast(date_col) time_col = _maybe_cast(time_col) return parsing.try_parse_date_and_time(date_col, time_col) def parse_date_fields(year_col, month_col, day_col): + """ + Parse columns with years, months and days into a single date column. + + .. deprecated:: 1.2 + """ + warnings.warn( + """ + Use pd.to_datetime({"year": year_col, "month": month_col, "day": day_col}) instead to get a Pandas Series. + Use ser = pd.to_datetime({"year": year_col, "month": month_col, "day": day_col}) and + np.array([s.to_pydatetime() for s in ser]) instead to get a Numpy array. +""", # noqa: E501 + FutureWarning, + stacklevel=2, + ) + year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) @@ -18,6 +48,24 @@ def parse_date_fields(year_col, month_col, day_col): def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_col): + """ + Parse columns with datetime information into a single datetime column. + + .. 
deprecated:: 1.2 + """ + + warnings.warn( + """ + Use pd.to_datetime({"year": year_col, "month": month_col, "day": day_col, + "hour": hour_col, "minute": minute_col, "second": second_col}) instead to get a Pandas Series. + Use ser = pd.to_datetime({"year": year_col, "month": month_col, "day": day_col, + "hour": hour_col, "minute": minute_col, "second": second_col}) and + np.array([s.to_pydatetime() for s in ser]) instead to get a Numpy array. +""", # noqa: E501 + FutureWarning, + stacklevel=2, + ) + year_col = _maybe_cast(year_col) month_col = _maybe_cast(month_col) day_col = _maybe_cast(day_col) @@ -30,6 +78,20 @@ def parse_all_fields(year_col, month_col, day_col, hour_col, minute_col, second_ def generic_parser(parse_func, *cols): + """ + Use dateparser to parse columns with date information into a single datetime column. + + .. deprecated:: 1.2 + """ + + warnings.warn( + """ + Use pd.to_datetime instead. +""", + FutureWarning, + stacklevel=2, + ) + N = _check_columns(cols) results = np.empty(N, dtype=object) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 833186b69c63b..662659982c0b3 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -370,7 +370,11 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -def test_multiple_date_cols_int_cast(all_parsers): +@pytest.mark.parametrize( + "date_parser, warning", + ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), +) +def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning): data = ( "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" @@ -382,13 +386,15 @@ def test_multiple_date_cols_int_cast(all_parsers): parse_dates = {"actual": [1, 2], "nominal": [1, 3]} parser = all_parsers - result = parser.read_csv( - StringIO(data), - header=None, - date_parser=conv.parse_date_time, - parse_dates=parse_dates, - 
prefix="X", - ) + with tm.assert_produces_warning(warning, check_stacklevel=False): + result = parser.read_csv( + StringIO(data), + header=None, + date_parser=date_parser, + parse_dates=parse_dates, + prefix="X", + ) + expected = DataFrame( [ [datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81], @@ -808,7 +814,9 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs): tm.assert_frame_equal(df, expected) else: msg = "got an unexpected keyword argument 'day_first'" - with pytest.raises(TypeError, match=msg): + with pytest.raises(TypeError, match=msg), tm.assert_produces_warning( + FutureWarning + ): parser.read_csv( StringIO(data), names=["time", "Q", "NTU"], @@ -1166,7 +1174,11 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_parse_date_time_multi_level_column_name(all_parsers): +@pytest.mark.parametrize( + "date_parser, warning", + ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), +) +def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warning): data = """\ D,T,A,B date, time,a,b @@ -1174,12 +1186,13 @@ def test_parse_date_time_multi_level_column_name(all_parsers): 2001-01-06, 00:00:00, 1.0, 11. 
""" parser = all_parsers - result = parser.read_csv( - StringIO(data), - header=[0, 1], - parse_dates={"date_time": [0, 1]}, - date_parser=conv.parse_date_time, - ) + with tm.assert_produces_warning(warning, check_stacklevel=False): + result = parser.read_csv( + StringIO(data), + header=[0, 1], + parse_dates={"date_time": [0, 1]}, + date_parser=date_parser, + ) expected_data = [ [datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0], @@ -1189,6 +1202,10 @@ def test_parse_date_time_multi_level_column_name(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "date_parser, warning", + ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]), +) @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1261,9 +1278,10 @@ def test_parse_date_time_multi_level_column_name(all_parsers): ), ], ) -def test_parse_date_time(all_parsers, data, kwargs, expected): +def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warning): parser = all_parsers - result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time, **kwargs) + with tm.assert_produces_warning(warning, check_stacklevel=False): + result = parser.read_csv(StringIO(data), date_parser=date_parser, **kwargs) # Python can sometimes be flaky about how # the aggregated columns are entered, so @@ -1272,15 +1290,20 @@ def test_parse_date_time(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -def test_parse_date_fields(all_parsers): +@pytest.mark.parametrize( + "date_parser, warning", + ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]), +) +def test_parse_date_fields(all_parsers, date_parser, warning): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
- result = parser.read_csv( - StringIO(data), - header=0, - parse_dates={"ymd": [0, 1, 2]}, - date_parser=conv.parse_date_fields, - ) + with tm.assert_produces_warning(warning, check_stacklevel=False): + result = parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ymd": [0, 1, 2]}, + date_parser=date_parser, + ) expected = DataFrame( [[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]], @@ -1289,19 +1312,27 @@ def test_parse_date_fields(all_parsers): tm.assert_frame_equal(result, expected) -def test_parse_date_all_fields(all_parsers): +@pytest.mark.parametrize( + "date_parser, warning", + ( + [conv.parse_all_fields, FutureWarning], + [lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), None], + ), +) +def test_parse_date_all_fields(all_parsers, date_parser, warning): parser = all_parsers data = """\ year,month,day,hour,minute,second,a,b 2001,01,05,10,00,0,0.0,10. 2001,01,5,10,0,00,1.,11. """ - result = parser.read_csv( - StringIO(data), - header=0, - date_parser=conv.parse_all_fields, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - ) + with tm.assert_produces_warning(warning, check_stacklevel=False): + result = parser.read_csv( + StringIO(data), + header=0, + date_parser=date_parser, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + ) expected = DataFrame( [ [datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0], @@ -1312,19 +1343,27 @@ def test_parse_date_all_fields(all_parsers): tm.assert_frame_equal(result, expected) -def test_datetime_fractional_seconds(all_parsers): +@pytest.mark.parametrize( + "date_parser, warning", + ( + [conv.parse_all_fields, FutureWarning], + [lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), None], + ), +) +def test_datetime_fractional_seconds(all_parsers, date_parser, warning): parser = all_parsers data = """\ year,month,day,hour,minute,second,a,b 2001,01,05,10,00,0.123456,0.0,10. 2001,01,5,10,0,0.500000,1.,11. 
""" - result = parser.read_csv( - StringIO(data), - header=0, - date_parser=conv.parse_all_fields, - parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, - ) + with tm.assert_produces_warning(warning, check_stacklevel=False): + result = parser.read_csv( + StringIO(data), + header=0, + date_parser=date_parser, + parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]}, + ) expected = DataFrame( [ [datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0], @@ -1339,12 +1378,13 @@ def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." - result = parser.read_csv( - StringIO(data), - header=0, - parse_dates={"ym": [0, 1]}, - date_parser=lambda y, m: date(year=int(y), month=int(m), day=1), - ) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = parser.read_csv( + StringIO(data), + header=0, + parse_dates={"ym": [0, 1]}, + date_parser=lambda y, m: date(year=int(y), month=int(m), day=1), + ) expected = DataFrame( [[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]], columns=["ym", "day", "a"], diff --git a/pandas/tests/io/test_date_converters.py b/pandas/tests/io/test_date_converters.py index cdb8eca02a3e5..a9fa27e091714 100644 --- a/pandas/tests/io/test_date_converters.py +++ b/pandas/tests/io/test_date_converters.py @@ -8,11 +8,12 @@ def test_parse_date_time(): + dates = np.array(["2007/1/3", "2008/2/4"], dtype=object) times = np.array(["05:07:09", "06:08:00"], dtype=object) expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)]) - - result = conv.parse_date_time(dates, times) + with tm.assert_produces_warning(FutureWarning): + result = conv.parse_date_time(dates, times) tm.assert_numpy_array_equal(result, expected) @@ -20,9 +21,10 @@ def test_parse_date_fields(): days = np.array([3, 4]) months = np.array([1, 2]) years = np.array([2007, 2008]) - result = conv.parse_date_fields(years, months, days) - expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) + + 
with tm.assert_produces_warning(FutureWarning): + result = conv.parse_date_fields(years, months, days) tm.assert_numpy_array_equal(result, expected) @@ -34,7 +36,8 @@ def test_parse_all_fields(): days = np.array([3, 4]) years = np.array([2007, 2008]) months = np.array([1, 2]) - - result = conv.parse_all_fields(years, months, days, hours, minutes, seconds) expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)]) + + with tm.assert_produces_warning(FutureWarning): + result = conv.parse_all_fields(years, months, days, hours, minutes, seconds) tm.assert_numpy_array_equal(result, expected) From 6eb6348525e2db49144cffae3446fbbfe3c8be13 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 12 Sep 2020 17:39:39 -0400 Subject: [PATCH 0751/1025] REGR: Series access with Index of tuples/frozenset (#36147) --- doc/source/whatsnew/v1.1.3.rst | 2 ++ pandas/core/series.py | 22 +++++++++---------- pandas/tests/series/indexing/test_indexing.py | 21 +++++++++++++++++- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index c06990e3f2051..25d223418fc92 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -14,6 +14,8 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) +- Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 632b93cdcf24b..ef9ade5c7bb15 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -889,21 +889,19 @@ def __getitem__(self, key): elif key_is_scalar: return self._get_value(key) - if ( - isinstance(key, tuple) - and is_hashable(key) - and isinstance(self.index, MultiIndex) - ): + if is_hashable(key): # Otherwise index.get_value will raise InvalidIndexError try: + # For labels that don't resolve as scalars like tuples and frozensets result = self._get_value(key) return result except KeyError: - # We still have the corner case where this tuple is a key - # in the first level of our MultiIndex - return self._get_values_tuple(key) + if isinstance(key, tuple) and isinstance(self.index, MultiIndex): + # We still have the corner case where a tuple is a key + # in the first level of our MultiIndex + return self._get_values_tuple(key) if is_iterator(key): key = list(key) @@ -963,7 +961,7 @@ def _get_values_tuple(self, key): return result if not isinstance(self.index, MultiIndex): - raise ValueError("Can only tuple-index with a MultiIndex") + raise ValueError("key of type tuple not found and not a MultiIndex") # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) @@ -1017,9 +1015,11 @@ def __setitem__(self, key, value): # GH#12862 adding an new key to the Series self.loc[key] = value - except TypeError as e: + except TypeError as err: if isinstance(key, tuple) and not isinstance(self.index, MultiIndex): - raise ValueError("Can only tuple-index with a MultiIndex") from e + raise ValueError( + "key of type tuple not found and not a MultiIndex" + ) from err if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 3ed25b8bca566..1fafdf00393e1 100644 --- 
a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -383,7 +383,7 @@ def test_2d_to_1d_assignment_raises(): @pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning") def test_basic_getitem_setitem_corner(datetime_series): # invalid tuples, e.g. td.ts[:, None] vs. td.ts[:, 2] - msg = "Can only tuple-index with a MultiIndex" + msg = "key of type tuple not found and not a MultiIndex" with pytest.raises(ValueError, match=msg): datetime_series[:, 2] with pytest.raises(ValueError, match=msg): @@ -942,3 +942,22 @@ def assert_slices_equivalent(l_slc, i_slc): for key2 in [keystr2, box(keystr2)]: assert_slices_equivalent(SLC[key2:key:-1], SLC[13:8:-1]) assert_slices_equivalent(SLC[key:key2:-1], SLC[0:0:-1]) + + +def test_tuple_index(): + # GH 35534 - Selecting values when a Series has an Index of tuples + s = pd.Series([1, 2], index=[("a",), ("b",)]) + assert s[("a",)] == 1 + assert s[("b",)] == 2 + s[("b",)] = 3 + assert s[("b",)] == 3 + + +def test_frozenset_index(): + # GH35747 - Selecting values when a Series has an Index of frozenset + idx0, idx1 = frozenset("a"), frozenset("b") + s = pd.Series([1, 2], index=[idx0, idx1]) + assert s[idx0] == 1 + assert s[idx1] == 2 + s[idx1] = 3 + assert s[idx1] == 3 From 9f9abe12aff69a38dd5f863672a964eba492ee07 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Sep 2020 16:24:33 -0700 Subject: [PATCH 0752/1025] ENH: consistently cast strings for DTA/TDA/PA.__setitem__ (#36261) * ENH: consistently cast strings for DTA/TDA/PA.__setitem__ * whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/core/arrays/datetimelike.py | 3 +-- pandas/tests/arrays/test_datetimelike.py | 31 ++++++++++++++++++++---- pandas/tests/arrays/test_datetimes.py | 23 ++++++++++++++++++ 4 files changed, 52 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2b230d2fde645..e577a8f26bd12 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ 
b/doc/source/whatsnew/v1.2.0.rst @@ -230,6 +230,8 @@ Datetimelike - Bug in :meth:`DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`DatetimeIndex` (:issue:`35690`) - Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) - Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`,:issue:`36254`) +- Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` setitem casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) +- Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index b013246e724de..6f0e2a6a598fc 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -875,8 +875,7 @@ def _validate_setitem_value(self, value): if is_list_like(value): value = self._validate_listlike(value, "setitem", cast_str=True) else: - # TODO: cast_str for consistency? 
- value = self._validate_scalar(value, msg, cast_str=False) + value = self._validate_scalar(value, msg, cast_str=True) return self._unbox(value, setitem=True) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 624335fd78b0f..0ae6b5bde5297 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -2,6 +2,7 @@ import numpy as np import pytest +import pytz from pandas._libs import OutOfBoundsDatetime from pandas.compat.numpy import np_version_under1p18 @@ -282,15 +283,35 @@ def test_setitem(self): expected[:2] = expected[-2:] tm.assert_numpy_array_equal(arr.asi8, expected) - def test_setitem_str_array(self, arr1d): - if isinstance(arr1d, DatetimeArray) and arr1d.tz is not None: - pytest.xfail(reason="timezone comparisons inconsistent") + def test_setitem_strs(self, arr1d): + # Check that we parse strs in both scalar and listlike + if isinstance(arr1d, DatetimeArray): + tz = arr1d.tz + if ( + tz is not None + and tz is not pytz.UTC + and not isinstance(tz, pytz._FixedOffset) + ): + # If we have e.g. tzutc(), when we cast to string and parse + # back we get pytz.UTC, and then consider them different timezones + # so incorrectly raise. 
+ pytest.xfail(reason="timezone comparisons inconsistent") + + # Setting list-like of strs expected = arr1d.copy() expected[[0, 1]] = arr1d[-2:] - arr1d[:2] = [str(x) for x in arr1d[-2:]] + result = arr1d.copy() + result[:2] = [str(x) for x in arr1d[-2:]] + tm.assert_equal(result, expected) - tm.assert_equal(arr1d, expected) + # Same thing but now for just a scalar str + expected = arr1d.copy() + expected[0] = arr1d[-1] + + result = arr1d.copy() + result[0] = str(arr1d[-1]) + tm.assert_equal(result, expected) @pytest.mark.parametrize("as_index", [True, False]) def test_setitem_categorical(self, arr1d, as_index): diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 804654451a6d9..53f26de09f94e 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -197,6 +197,29 @@ def test_tz_setter_raises(self): with pytest.raises(AttributeError, match="tz_localize"): arr.tz = "UTC" + def test_setitem_str_impute_tz(self, tz_naive_fixture): + # Like for getitem, if we are passed a naive-like string, we impute + # our own timezone. 
+ tz = tz_naive_fixture + + data = np.array([1, 2, 3], dtype="M8[ns]") + dtype = data.dtype if tz is None else DatetimeTZDtype(tz=tz) + arr = DatetimeArray(data, dtype=dtype) + expected = arr.copy() + + ts = pd.Timestamp("2020-09-08 16:50").tz_localize(tz) + setter = str(ts.tz_localize(None)) + + # Setting a scalar tznaive string + expected[0] = ts + arr[0] = setter + tm.assert_equal(arr, expected) + + # Setting a listlike of tznaive strings + expected[1] = ts + arr[:2] = [setter, setter] + tm.assert_equal(arr, expected) + def test_setitem_different_tz_raises(self): data = np.array([1, 2, 3], dtype="M8[ns]") arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central")) From 4534404d5b5b1fa2ecf85bb9c022d03afa15a960 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Sun, 13 Sep 2020 06:17:04 -0500 Subject: [PATCH 0753/1025] CI: install numpy from pip #36296 (#36323) --- ci/build39.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/ci/build39.sh b/ci/build39.sh index f85e1c7def206..b9c76635df99b 100755 --- a/ci/build39.sh +++ b/ci/build39.sh @@ -3,16 +3,9 @@ sudo apt-get install build-essential gcc xvfb pip install --no-deps -U pip wheel setuptools -pip install python-dateutil pytz pytest pytest-xdist hypothesis +pip install numpy python-dateutil pytz pytest pytest-xdist hypothesis pip install cython --pre # https://github.com/cython/cython/issues/3395 -git clone https://github.com/numpy/numpy -cd numpy -python setup.py build_ext --inplace -python setup.py install -cd .. -rm -rf numpy - python setup.py build_ext -inplace python -m pip install --no-build-isolation -e . 
From 37213028207047074f60e351b0be9c474577d77d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 13 Sep 2020 05:26:21 -0700 Subject: [PATCH 0754/1025] REF: _convert_for_op -> _validate_fill_value (#36318) --- pandas/core/indexes/base.py | 8 +++++--- pandas/core/indexes/datetimes.py | 6 ++---- pandas/core/indexes/numeric.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0aed08d46657e..b0f9f8ac8b2fd 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4047,9 +4047,10 @@ def _to_safe_for_reshape(self): """ return self - def _convert_for_op(self, value): + def _validate_fill_value(self, value): """ - Convert value to be insertable to ndarray. + Check if the value can be inserted into our array, and convert + it to an appropriate native type if necessary. """ return value @@ -4228,7 +4229,8 @@ def putmask(self, mask, value): """ values = self.values.copy() try: - np.putmask(values, mask, self._convert_for_op(value)) + converted = self._validate_fill_value(value) + np.putmask(values, mask, converted) if is_period_dtype(self.dtype): # .values cast to object, so we need to cast back values = type(self)(values)._data diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index f269495f6011a..2d166773dda2c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -325,13 +325,11 @@ def __reduce__(self): d.update(self._get_attributes_dict()) return _new_DatetimeIndex, (type(self), d), None - def _convert_for_op(self, value): + def _validate_fill_value(self, value): """ Convert value to be insertable to ndarray. 
""" - if self._has_same_tz(value): - return Timestamp(value).asm8 - raise ValueError("Passed item and index have different timezone") + return self._data._validate_setitem_value(value) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index e8b7efeee8852..f6859cbc4c0a2 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -116,7 +116,7 @@ def _shallow_copy(self, values=None, name: Label = lib.no_default): return Float64Index._simple_new(values, name=name) return super()._shallow_copy(values=values, name=name) - def _convert_for_op(self, value): + def _validate_fill_value(self, value): """ Convert value to be insertable to ndarray. """ From aaa1a70e5572d75c68092b1d9d978cde1db90a18 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 13 Sep 2020 05:28:15 -0700 Subject: [PATCH 0755/1025] REF: separate out helpers from iLoc._setitem_with_indexer (#36315) --- pandas/core/indexing.py | 141 ++++++++++++++++++++++------------------ 1 file changed, 77 insertions(+), 64 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 51031d9ab1153..64da27a6574a6 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -34,7 +34,7 @@ from pandas.core.indexes.api import Index if TYPE_CHECKING: - from pandas import DataFrame # noqa:F401 + from pandas import DataFrame, Series # noqa:F401 # "null slice" _NS = slice(None, None) @@ -1543,13 +1543,10 @@ def _setitem_with_indexer(self, indexer, value): since it goes from positional indexers back to labels when calling BlockManager methods, see GH#12991, GH#22046, GH#15686. 
""" - - # also has the side effect of consolidating in-place - from pandas import Series - info_axis = self.obj._info_axis_number # maybe partial set + # _is_mixed_type has the side effect of consolidating in-place take_split_path = self.obj._is_mixed_type # if there is only one block/type, still have to take split path @@ -1642,6 +1639,8 @@ def _setitem_with_indexer(self, indexer, value): # align and set the values if take_split_path: + # We have to operate column-wise + # Above we only set take_split_path to True for 2D cases assert self.ndim == 2 assert info_axis == 1 @@ -1682,29 +1681,6 @@ def _setitem_with_indexer(self, indexer, value): pi = plane_indexer[0] if lplane_indexer == 1 else plane_indexer - def isetter(loc, v): - # positional setting on column loc - ser = self.obj._ixs(loc, axis=1) - - # perform the equivalent of a setitem on the info axis - # as we have a null slice or a slice with full bounds - # which means essentially reassign to the columns of a - # multi-dim object - # GH6149 (null slice), GH10408 (full bounds) - if isinstance(pi, tuple) and all( - com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) - for idx in pi - ): - ser = v - else: - # set the item, possibly having a dtype change - ser = ser.copy() - ser._mgr = ser._mgr.setitem(indexer=pi, value=v) - ser._maybe_update_cacher(clear=True) - - # reset the sliced object if unique - self.obj._iset_item(loc, ser) - # we need an iterable, with a ndim of at least 1 # eg. don't pass through np.array(0) if is_list_like_indexer(value) and getattr(value, "ndim", 1) > 0: @@ -1725,7 +1701,7 @@ def isetter(loc, v): else: v = np.nan - isetter(loc, v) + self._setitem_single_column(loc, v, pi) # we have an equal len ndarray/convertible to our labels # hasattr first, to avoid coercing to ndarray without reason. 
@@ -1744,7 +1720,7 @@ def isetter(loc, v): for i, loc in enumerate(ilocs): # setting with a list, re-coerces - isetter(loc, value[:, i].tolist()) + self._setitem_single_column(loc, value[:, i].tolist(), pi) elif ( len(labels) == 1 @@ -1753,7 +1729,7 @@ def isetter(loc, v): ): # we have an equal len list/ndarray # We only get here with len(labels) == len(ilocs) == 1 - isetter(ilocs[0], value) + self._setitem_single_column(ilocs[0], value, pi) elif lplane_indexer == 0 and len(value) == len(self.obj.index): # We get here in one case via .loc with a all-False mask @@ -1768,50 +1744,87 @@ def isetter(loc, v): ) for loc, v in zip(ilocs, value): - isetter(loc, v) + self._setitem_single_column(loc, v, pi) else: # scalar value for loc in ilocs: - isetter(loc, value) + self._setitem_single_column(loc, value, pi) else: - if isinstance(indexer, tuple): + self._setitem_single_block_inplace(indexer, value) + + def _setitem_single_column(self, loc: int, value, plane_indexer): + # positional setting on column loc + pi = plane_indexer + + ser = self.obj._ixs(loc, axis=1) + + # perform the equivalent of a setitem on the info axis + # as we have a null slice or a slice with full bounds + # which means essentially reassign to the columns of a + # multi-dim object + # GH#6149 (null slice), GH#10408 (full bounds) + if isinstance(pi, tuple) and all( + com.is_null_slice(idx) or com.is_full_slice(idx, len(self.obj)) + for idx in pi + ): + ser = value + else: + # set the item, possibly having a dtype change + ser = ser.copy() + ser._mgr = ser._mgr.setitem(indexer=pi, value=value) + ser._maybe_update_cacher(clear=True) - # if we are setting on the info axis ONLY - # set using those methods to avoid block-splitting - # logic here - if ( - len(indexer) > info_axis - and is_integer(indexer[info_axis]) - and all( - com.is_null_slice(idx) - for i, idx in enumerate(indexer) - if i != info_axis - ) - and item_labels.is_unique - ): - self.obj[item_labels[indexer[info_axis]]] = value - return + # 
reset the sliced object if unique + self.obj._iset_item(loc, ser) - indexer = maybe_convert_ix(*indexer) + def _setitem_single_block_inplace(self, indexer, value): + """ + _setitem_with_indexer for the case when we have a single Block + and the value can be set into it without casting. + """ + from pandas import Series - if isinstance(value, (ABCSeries, dict)): - # TODO(EA): ExtensionBlock.setitem this causes issues with - # setting for extensionarrays that store dicts. Need to decide - # if it's worth supporting that. - value = self._align_series(indexer, Series(value)) + info_axis = self.obj._info_axis_number + item_labels = self.obj._get_axis(info_axis) - elif isinstance(value, ABCDataFrame): - value = self._align_frame(indexer, value) + if isinstance(indexer, tuple): - # check for chained assignment - self.obj._check_is_chained_assignment_possible() + # if we are setting on the info axis ONLY + # set using those methods to avoid block-splitting + # logic here + if ( + len(indexer) > info_axis + and is_integer(indexer[info_axis]) + and all( + com.is_null_slice(idx) + for i, idx in enumerate(indexer) + if i != info_axis + ) + and item_labels.is_unique + ): + self.obj[item_labels[indexer[info_axis]]] = value + return - # actually do the set - self.obj._consolidate_inplace() - self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) - self.obj._maybe_update_cacher(clear=True) + indexer = maybe_convert_ix(*indexer) + + if isinstance(value, (ABCSeries, dict)): + # TODO(EA): ExtensionBlock.setitem this causes issues with + # setting for extensionarrays that store dicts. Need to decide + # if it's worth supporting that. 
+ value = self._align_series(indexer, Series(value)) + + elif isinstance(value, ABCDataFrame): + value = self._align_frame(indexer, value) + + # check for chained assignment + self.obj._check_is_chained_assignment_possible() + + # actually do the set + self.obj._consolidate_inplace() + self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value) + self.obj._maybe_update_cacher(clear=True) def _setitem_with_indexer_missing(self, indexer, value): """ @@ -1873,7 +1886,7 @@ def _setitem_with_indexer_missing(self, indexer, value): self.obj._mgr = self.obj.append(value)._mgr self.obj._maybe_update_cacher(clear=True) - def _align_series(self, indexer, ser: ABCSeries, multiindex_indexer: bool = False): + def _align_series(self, indexer, ser: "Series", multiindex_indexer: bool = False): """ Parameters ---------- From f775e3d17701dd77f45fc7895c6ae8661f23ab95 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 13 Sep 2020 05:29:31 -0700 Subject: [PATCH 0756/1025] PERF: CategoricalDtype.__eq__ (#36280) --- pandas/core/dtypes/dtypes.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e321fdd9b3a9b..2e5dc15131e70 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -375,12 +375,30 @@ def __eq__(self, other: Any) -> bool: # but same order is not necessary. There is no distinction between # ordered=False and ordered=None: CDT(., False) and CDT(., None) # will be equal if they have the same categories. 
- if ( - self.categories.dtype == other.categories.dtype - and self.categories.equals(other.categories) - ): + left = self.categories + right = other.categories + + # GH#36280 the ordering of checks here is for performance + if not left.dtype == right.dtype: + return False + + if len(left) != len(right): + return False + + if self.categories.equals(other.categories): # Check and see if they happen to be identical categories return True + + if left.dtype != object: + # Faster than calculating hash + indexer = left.get_indexer(right) + # Because left and right have the same length and are unique, + # `indexer` not having any -1s implies that there is a + # bijection between `left` and `right`. + return (indexer != -1).all() + + # With object-dtype we need a comparison that identifies + # e.g. int(2) as distinct from float(2) return hash(self) == hash(other) def __repr__(self) -> str_type: From f558928aebe2ab1d39569d2432fb86f7dd6f9619 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 13 Sep 2020 05:31:17 -0700 Subject: [PATCH 0757/1025] REF: de-duplicate _wrap_joined_index in MultiIndex (#36313) --- pandas/core/indexes/base.py | 5 ++++- pandas/core/indexes/multi.py | 25 +++++++++++++++++++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b0f9f8ac8b2fd..04a63beb2ef45 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3876,7 +3876,10 @@ def _join_monotonic(self, other, how="left", return_indexers=False): return join_index def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) + if isinstance(self, ABCMultiIndex): + name = self.names if self.names == other.names else None + else: + name = get_op_result_name(self, other) return self._constructor(joined, name=name) # -------------------------------------------------------------------- diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 
7aceb898f5ccf..197c5f42ed62f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1,3 +1,4 @@ +from functools import wraps from sys import getsizeof from typing import ( TYPE_CHECKING, @@ -152,6 +153,25 @@ def _codes_to_ints(self, codes): return np.bitwise_or.reduce(codes, axis=1) +def names_compat(meth): + """ + A decorator to allow either `name` or `names` keyword but not both. + + This makes it easier to share code with base class. + """ + + @wraps(meth) + def new_meth(self_or_cls, *args, **kwargs): + if "name" in kwargs and "names" in kwargs: + raise TypeError("Can only provide one of `names` and `name`") + elif "name" in kwargs: + kwargs["names"] = kwargs.pop("name") + + return meth(self_or_cls, *args, **kwargs) + + return new_meth + + class MultiIndex(Index): """ A multi-level, or hierarchical, index object for pandas objects. @@ -449,6 +469,7 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> "MultiInde ) @classmethod + @names_compat def from_tuples( cls, tuples, @@ -3635,10 +3656,6 @@ def delete(self, loc): verify_integrity=False, ) - def _wrap_joined_index(self, joined, other): - names = self.names if self.names == other.names else None - return self._constructor(joined, names=names) - @doc(Index.isin) def isin(self, values, level=None): if level is None: From 9815cba2929c4e35fc22e2c66ebb9c256c130312 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sun, 13 Sep 2020 07:34:51 -0500 Subject: [PATCH 0758/1025] BUG: Don't overflow with large int scalar (#36316) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/dtypes/cast.py | 5 +++++ pandas/tests/series/test_constructors.py | 7 +++++++ 3 files changed, 13 insertions(+) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 25d223418fc92..5cbd160f29d66 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -25,6 +25,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ 
- Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) +- Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 64ccc0be0a25d..05759ffb43dde 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -697,6 +697,11 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, else: dtype = np.dtype(np.int64) + try: + np.array(val, dtype=dtype) + except OverflowError: + dtype = np.array(val).dtype + elif is_float(val): if isinstance(val, np.floating): dtype = np.dtype(type(val)) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0fb8c5955a2e7..8ac0a55e63cd1 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1501,3 +1501,10 @@ def test_construction_from_ordered_collection(self): result = Series({"a": 1, "b": 2}.values()) expected = Series([1, 2]) tm.assert_series_equal(result, expected) + + def test_construction_from_large_int_scalar_no_overflow(self): + # https://github.com/pandas-dev/pandas/issues/36291 + n = 1_000_000_000_000_000_000_000 + result = Series(n, index=[0]) + expected = Series(n) + tm.assert_series_equal(result, expected) From 01916a81712aae835e4512e7bf91f1cec1378430 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 13 Sep 2020 13:39:58 +0100 Subject: [PATCH 0759/1025] PERF: constructing string Series (#36317) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/construction.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e577a8f26bd12..a3260f4089e7d 100644 --- 
a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -205,7 +205,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`) +- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 0993328aef8de..3ec5bc90d521d 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -37,6 +37,7 @@ is_list_like, is_object_dtype, is_sparse, + is_string_dtype, is_timedelta64_ns_dtype, ) from pandas.core.dtypes.generic import ( @@ -510,7 +511,8 @@ def sanitize_array( data = np.array(data, dtype=dtype, copy=False) subarr = np.array(data, dtype=object, copy=copy) - if is_object_dtype(subarr.dtype) and not is_object_dtype(dtype): + is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(dtype) + if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype: inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) From a7c95b4b094833859f0d202fc3ec4ff40de80b7e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 13 Sep 2020 05:40:54 -0700 Subject: [PATCH 0760/1025] REF: de-duplicate get_indexer_non_unique (#36322) --- pandas/core/indexes/multi.py | 4 ---- pandas/core/indexes/period.py | 19 +++---------------- 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 197c5f42ed62f..a21a54e4a9be3 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2533,10 +2533,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return 
ensure_platform_int(indexer) - @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - return super().get_indexer_non_unique(target) - def get_slice_bound( self, label: Union[Hashable, Sequence[Hashable]], side: str, kind: str ) -> int: diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 5282b6f0154b4..42dce1bd53f22 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -12,7 +12,6 @@ from pandas.util._decorators import Appender, cache_readonly, doc from pandas.core.dtypes.common import ( - ensure_platform_int, is_bool_dtype, is_datetime64_any_dtype, is_dtype_equal, @@ -473,12 +472,13 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) if isinstance(target, PeriodIndex): - if target.freq != self.freq: + if not self._is_comparable_dtype(target.dtype): + # i.e. target.freq != self.freq # No matches no_matches = -1 * np.ones(self.shape, dtype=np.intp) return no_matches - target = target.asi8 + target = target._get_engine_target() # i.e. target.asi8 self_index = self._int64index else: self_index = self @@ -491,19 +491,6 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): return Index.get_indexer(self_index, target, method, limit, tolerance) - @Appender(_index_shared_docs["get_indexer_non_unique"] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - target = ensure_index(target) - - if not self._is_comparable_dtype(target.dtype): - no_matches = -1 * np.ones(self.shape, dtype=np.intp) - return no_matches, no_matches - - target = target.asi8 - - indexer, missing = self._int64index.get_indexer_non_unique(target) - return ensure_platform_int(indexer), missing - def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label. 
From 0b420655e410e301edbb1165ddda44fbdfe30913 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sun, 13 Sep 2020 07:44:15 -0500 Subject: [PATCH 0761/1025] REGR: Fix IntegerArray unary ops regression (#36303) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/conftest.py | 13 ++++++ pandas/core/arrays/integer.py | 9 ++++ pandas/core/generic.py | 5 ++- .../tests/arrays/integer/test_arithmetic.py | 38 +++++++++++++++++ pandas/tests/frame/test_operators.py | 2 +- pandas/tests/series/test_operators.py | 41 +++++++++++++++++++ 7 files changed, 107 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 5cbd160f29d66..2457d00eb2173 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :class:`IntegerArray` unary plus and minus operations raising a ``TypeError`` (:issue:`36063`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - diff --git a/pandas/conftest.py b/pandas/conftest.py index 5474005a63b8e..e79370e53ead6 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1055,6 +1055,19 @@ def any_nullable_int_dtype(request): return request.param +@pytest.fixture(params=tm.SIGNED_EA_INT_DTYPES) +def any_signed_nullable_int_dtype(request): + """ + Parameterized fixture for any signed nullable integer dtype. 
+ + * 'Int8' + * 'Int16' + * 'Int32' + * 'Int64' + """ + return request.param + + @pytest.fixture(params=tm.ALL_REAL_DTYPES) def any_real_dtype(request): """ diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index dc08e018397bc..94af013d6df2c 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -364,6 +364,15 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): ) super().__init__(values, mask, copy=copy) + def __neg__(self): + return type(self)(-self._data, self._mask) + + def __pos__(self): + return self + + def __abs__(self): + return type(self)(np.abs(self._data), self._mask) + @classmethod def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "IntegerArray": return integer_array(scalars, dtype=dtype, copy=copy) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9ed9db801d0a8..d7b82923e7488 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1417,7 +1417,10 @@ def __pos__(self): ): arr = operator.pos(values) else: - raise TypeError(f"Unary plus expects numeric dtype, not {values.dtype}") + raise TypeError( + "Unary plus expects bool, numeric, timedelta, " + f"or object dtype, not {values.dtype}" + ) return self.__array_wrap__(arr) def __invert__(self): diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index d309f6423e0c1..f549a7caeab1d 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -261,3 +261,41 @@ def test_reduce_to_float(op): index=pd.Index(["a", "b"], name="A"), ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "source, target", + [ + ([1, 2, 3], [-1, -2, -3]), + ([1, 2, None], [-1, -2, None]), + ([-1, 0, 1], [1, 0, -1]), + ], +) +def test_unary_minus_nullable_int(any_signed_nullable_int_dtype, source, target): + dtype = any_signed_nullable_int_dtype + arr = pd.array(source, 
dtype=dtype) + result = -arr + expected = pd.array(target, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]], +) +def test_unary_plus_nullable_int(any_signed_nullable_int_dtype, source): + dtype = any_signed_nullable_int_dtype + expected = pd.array(source, dtype=dtype) + result = +expected + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "source, target", + [([1, 2, 3], [1, 2, 3]), ([1, -2, None], [1, 2, None]), ([-1, 0, 1], [1, 0, 1])], +) +def test_abs_nullable_int(any_signed_nullable_int_dtype, source, target): + dtype = any_signed_nullable_int_dtype + s = pd.array(source, dtype=dtype) + result = abs(s) + expected = pd.array(target, dtype=dtype) + tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index fede1ca23a8ce..8cf66e2737249 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -119,7 +119,7 @@ def test_pos_object(self, df): "df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})] ) def test_pos_raises(self, df): - msg = re.escape("Unary plus expects numeric dtype, not datetime64[ns]") + msg = "Unary plus expects .* dtype, not datetime64\\[ns\\]" with pytest.raises(TypeError, match=msg): (+df) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index e1c9682329271..aee947e738525 100644 --- a/pandas/tests/series/test_operators.py +++ b/pandas/tests/series/test_operators.py @@ -536,3 +536,44 @@ def test_invert(self): ser = tm.makeStringSeries() ser.name = "series" tm.assert_series_equal(-(ser < 0), ~(ser < 0)) + + @pytest.mark.parametrize( + "source, target", + [ + ([1, 2, 3], [-1, -2, -3]), + ([1, 2, None], [-1, -2, None]), + ([-1, 0, 1], [1, 0, -1]), + ], + ) + def test_unary_minus_nullable_int( + 
self, any_signed_nullable_int_dtype, source, target + ): + dtype = any_signed_nullable_int_dtype + s = pd.Series(source, dtype=dtype) + result = -s + expected = pd.Series(target, dtype=dtype) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]], + ) + def test_unary_plus_nullable_int(self, any_signed_nullable_int_dtype, source): + dtype = any_signed_nullable_int_dtype + expected = pd.Series(source, dtype=dtype) + result = +expected + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "source, target", + [ + ([1, 2, 3], [1, 2, 3]), + ([1, -2, None], [1, 2, None]), + ([-1, 0, 1], [1, 0, 1]), + ], + ) + def test_abs_nullable_int(self, any_signed_nullable_int_dtype, source, target): + dtype = any_signed_nullable_int_dtype + s = pd.Series(source, dtype=dtype) + result = abs(s) + expected = pd.Series(target, dtype=dtype) + tm.assert_series_equal(result, expected) From 6fd74c6810374c4323d80722b2eeb113f8f5da27 Mon Sep 17 00:00:00 2001 From: attack68 <24256554+attack68@users.noreply.github.com> Date: Sun, 13 Sep 2020 14:52:42 +0200 Subject: [PATCH 0762/1025] ENH: add set_td_classes method for CSS class addition to data cells (#36159) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/formats/style.py | 69 ++++++++++++++++++++++++++- pandas/tests/io/formats/test_style.py | 21 ++++++++ 3 files changed, 90 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index a3260f4089e7d..f7c94b6b1846f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -104,7 +104,7 @@ Other enhancements - :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) -- +- `Styler` now allows direct CSS class name addition to 
individual data cells (:issue:`36159`) .. _whatsnew_120.api_breaking.python: diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index b27a4e036e137..5c3a309b0e310 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -171,6 +171,8 @@ def __init__( self.cell_ids = cell_ids self.na_rep = na_rep + self.cell_context: Dict[str, Any] = {} + # display_funcs maps (row, col) -> formatting function def default_display_func(x): @@ -262,7 +264,7 @@ def format_attr(pair): idx_lengths = _get_level_lengths(self.index) col_lengths = _get_level_lengths(self.columns, hidden_columns) - cell_context = dict() + cell_context = self.cell_context n_rlvls = self.data.index.nlevels n_clvls = self.data.columns.nlevels @@ -499,6 +501,70 @@ def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Style self._display_funcs[(i, j)] = formatter return self + def set_td_classes(self, classes: DataFrame) -> "Styler": + """ + Add string based CSS class names to data cells that will appear within the + `Styler` HTML result. 
These classes are added within specified `' in s + assert '' in s + assert '' in s + assert '' in s + def test_colspan_w3(self): # GH 36223 df = pd.DataFrame(data=[[1, 2]], columns=[["l0", "l0"], ["l1a", "l1b"]]) From d95b0b151155d08b8ef8f11866bab455bba77be1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 13 Sep 2020 06:13:53 -0700 Subject: [PATCH 0763/1025] PERF: Allow groupby transform with numba engine to be fully parallelizable (#36240) --- asv_bench/benchmarks/groupby.py | 46 ++++++++---- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/groupby/generic.py | 76 +++++++++----------- pandas/core/groupby/groupby.py | 40 ++++++++++- pandas/core/groupby/numba_.py | 69 +++++++++++++++++- pandas/tests/groupby/transform/test_numba.py | 16 +++++ 6 files changed, 187 insertions(+), 62 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 5ffda03fad80f..bda3ab71d1a00 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -627,33 +627,42 @@ def time_first(self): class TransformEngine: - def setup(self): + + param_names = ["parallel"] + params = [[True, False]] + + def setup(self, parallel): N = 10 ** 3 data = DataFrame( {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}, columns=[0, 1], ) + self.parallel = parallel self.grouper = data.groupby(0) - def time_series_numba(self): + def time_series_numba(self, parallel): def function(values, index): return values * 5 - self.grouper[1].transform(function, engine="numba") + self.grouper[1].transform( + function, engine="numba", engine_kwargs={"parallel": self.parallel} + ) - def time_series_cython(self): + def time_series_cython(self, parallel): def function(values): return values * 5 self.grouper[1].transform(function, engine="cython") - def time_dataframe_numba(self): + def time_dataframe_numba(self, parallel): def function(values, index): return values * 5 - self.grouper.transform(function, engine="numba") + 
self.grouper.transform( + function, engine="numba", engine_kwargs={"parallel": self.parallel} + ) - def time_dataframe_cython(self): + def time_dataframe_cython(self, parallel): def function(values): return values * 5 @@ -661,15 +670,20 @@ def function(values): class AggEngine: - def setup(self): + + param_names = ["parallel"] + params = [[True, False]] + + def setup(self, parallel): N = 10 ** 3 data = DataFrame( {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N}, columns=[0, 1], ) + self.parallel = parallel self.grouper = data.groupby(0) - def time_series_numba(self): + def time_series_numba(self, parallel): def function(values, index): total = 0 for i, value in enumerate(values): @@ -679,9 +693,11 @@ def function(values, index): total += value * 2 return total - self.grouper[1].agg(function, engine="numba") + self.grouper[1].agg( + function, engine="numba", engine_kwargs={"parallel": self.parallel} + ) - def time_series_cython(self): + def time_series_cython(self, parallel): def function(values): total = 0 for i, value in enumerate(values): @@ -693,7 +709,7 @@ def function(values): self.grouper[1].agg(function, engine="cython") - def time_dataframe_numba(self): + def time_dataframe_numba(self, parallel): def function(values, index): total = 0 for i, value in enumerate(values): @@ -703,9 +719,11 @@ def function(values, index): total += value * 2 return total - self.grouper.agg(function, engine="numba") + self.grouper.agg( + function, engine="numba", engine_kwargs={"parallel": self.parallel} + ) - def time_dataframe_cython(self): + def time_dataframe_cython(self, parallel): def function(values): total = 0 for i, value in enumerate(values): diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f7c94b6b1846f..ad90d42633355 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -207,7 +207,7 @@ Performance improvements - Performance improvements when creating Series with dtype `str` or 
:class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) -- +- Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1552256468ad2..ffd756bed43b6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -226,10 +226,6 @@ def apply(self, func, *args, **kwargs): def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - if not callable(func): - raise NotImplementedError( - "Numba engine can only be used with a single function." - ) with group_selection_context(self): data = self._selected_obj result, index = self._aggregate_with_numba( @@ -489,12 +485,21 @@ def _aggregate_named(self, func, *args, **kwargs): @Substitution(klass="Series") @Appender(_transform_template) def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + + if maybe_use_numba(engine): + with group_selection_context(self): + data = self._selected_obj + result = self._transform_with_numba( + data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + return self.obj._constructor( + result.ravel(), index=data.index, name=data.name + ) + func = self._get_cython_func(func) or func if not isinstance(func, str): - return self._transform_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._transform_general(func, *args, **kwargs) elif func not in base.transform_kernel_allowlist: msg = f"'{func}' is not a valid function name for transform(name)" @@ -938,10 +943,6 @@ class DataFrameGroupBy(GroupBy[DataFrame]): def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): - if not 
callable(func): - raise NotImplementedError( - "Numba engine can only be used with a single function." - ) with group_selection_context(self): data = self._selected_obj result, index = self._aggregate_with_numba( @@ -1290,42 +1291,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): return self._reindex_output(result) - def _transform_general( - self, func, *args, engine="cython", engine_kwargs=None, **kwargs - ): + def _transform_general(self, func, *args, **kwargs): from pandas.core.reshape.concat import concat applied = [] obj = self._obj_with_exclusions gen = self.grouper.get_iterator(obj, axis=self.axis) - if maybe_use_numba(engine): - numba_func, cache_key = generate_numba_func( - func, engine_kwargs, kwargs, "groupby_transform" - ) - else: - fast_path, slow_path = self._define_paths(func, *args, **kwargs) + fast_path, slow_path = self._define_paths(func, *args, **kwargs) for name, group in gen: object.__setattr__(group, "name", name) - if maybe_use_numba(engine): - values, index = split_for_numba(group) - res = numba_func(values, index, *args) - if cache_key not in NUMBA_FUNC_CACHE: - NUMBA_FUNC_CACHE[cache_key] = numba_func - # Return the result as a DataFrame for concatenation later - res = self.obj._constructor( - res, index=group.index, columns=group.columns - ) - else: - # Try slow path and fast path. - try: - path, res = self._choose_path(fast_path, slow_path, group) - except TypeError: - return self._transform_item_by_item(obj, fast_path) - except ValueError as err: - msg = "transform must return a scalar value for each group" - raise ValueError(msg) from err + # Try slow path and fast path. 
+ try: + path, res = self._choose_path(fast_path, slow_path, group) + except TypeError: + return self._transform_item_by_item(obj, fast_path) + except ValueError as err: + msg = "transform must return a scalar value for each group" + raise ValueError(msg) from err if isinstance(res, Series): @@ -1361,13 +1345,19 @@ def _transform_general( @Appender(_transform_template) def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): + if maybe_use_numba(engine): + with group_selection_context(self): + data = self._selected_obj + result = self._transform_with_numba( + data, func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + return self.obj._constructor(result, index=data.index, columns=data.columns) + # optimized transforms func = self._get_cython_func(func) or func if not isinstance(func, str): - return self._transform_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._transform_general(func, *args, **kwargs) elif func not in base.transform_kernel_allowlist: msg = f"'{func}' is not a valid function name for transform(name)" @@ -1393,9 +1383,7 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): ): return self._transform_fast(result) - return self._transform_general( - func, engine=engine, engine_kwargs=engine_kwargs, *args, **kwargs - ) + return self._transform_general(func, *args, **kwargs) def _transform_fast(self, result: DataFrame) -> DataFrame: """ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8a55d438cf8d4..30bd53a3ddff1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1056,6 +1056,41 @@ def _cython_agg_general( return self._wrap_aggregated_output(output, index=self.grouper.result_index) + def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): + """ + Perform groupby transform routine with the numba engine. 
+ + This routine mimics the data splitting routine of the DataSplitter class + to generate the indices of each group in the sorted data and then passes the + data and indices into a Numba jitted function. + """ + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." + ) + group_keys = self.grouper._get_group_keys() + labels, _, n_groups = self.grouper.group_info + sorted_index = get_group_index_sorter(labels, n_groups) + sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False) + sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() + starts, ends = lib.generate_slices(sorted_labels, n_groups) + cache_key = (func, "groupby_transform") + if cache_key in NUMBA_FUNC_CACHE: + numba_transform_func = NUMBA_FUNC_CACHE[cache_key] + else: + numba_transform_func = numba_.generate_numba_transform_func( + tuple(args), kwargs, func, engine_kwargs + ) + result = numba_transform_func( + sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) + ) + if cache_key not in NUMBA_FUNC_CACHE: + NUMBA_FUNC_CACHE[cache_key] = numba_transform_func + + # result values needs to be resorted to their original positions since we + # evaluated the data sorted by group + return result.take(np.argsort(sorted_index), axis=0) + def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): """ Perform groupby aggregation routine with the numba engine. @@ -1064,6 +1099,10 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) to generate the indices of each group in the sorted data and then passes the data and indices into a Numba jitted function. """ + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." 
+ ) group_keys = self.grouper._get_group_keys() labels, _, n_groups = self.grouper.group_info sorted_index = get_group_index_sorter(labels, n_groups) @@ -1072,7 +1111,6 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) starts, ends = lib.generate_slices(sorted_labels, n_groups) cache_key = (func, "groupby_agg") if cache_key in NUMBA_FUNC_CACHE: - # Return an already compiled version of roll_apply if available numba_agg_func = NUMBA_FUNC_CACHE[cache_key] else: numba_agg_func = numba_.generate_numba_agg_func( diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index aebe60f797fcd..a2dfcd7bddd53 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -153,7 +153,7 @@ def generate_numba_agg_func( loop_range = range @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) - def group_apply( + def group_agg( values: np.ndarray, index: np.ndarray, begin: np.ndarray, @@ -169,4 +169,69 @@ def group_apply( result[i, j] = numba_func(group, group_index, *args) return result - return group_apply + return group_agg + + +def generate_numba_transform_func( + args: Tuple, + kwargs: Dict[str, Any], + func: Callable[..., Scalar], + engine_kwargs: Optional[Dict[str, bool]], +) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: + """ + Generate a numba jitted transform function specified by values from engine_kwargs. + + 1. jit the user's function + 2. Return a groupby agg function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. 
+ + Parameters + ---------- + args : tuple + *args to be passed into the function + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + + Returns + ------- + Numba function + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + + check_kwargs_and_nopython(kwargs, nopython) + + validate_udf(func) + + numba_func = jit_user_function(func, nopython, nogil, parallel) + + numba = import_optional_dependency("numba") + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def group_transform( + values: np.ndarray, + index: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + num_groups: int, + num_columns: int, + ) -> np.ndarray: + result = np.empty((len(values), num_columns)) + for i in loop_range(num_groups): + group_index = index[begin[i] : end[i]] + for j in loop_range(num_columns): + group = values[begin[i] : end[i], j] + result[begin[i] : end[i], j] = numba_func(group, group_index, *args) + return result + + return group_transform diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index 87723cd7c8f50..fcaa5ab13599a 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -127,3 +127,19 @@ def func_1(values, index): with option_context("compute.use_numba", True): result = grouped.transform(func_1, engine=None) tm.assert_frame_equal(expected, result) + + +@td.skip_if_no("numba", "0.46.0") +@pytest.mark.parametrize( + "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}], +) +def test_multifunc_notimplimented(agg_func): + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] + ) + grouped = data.groupby(0) + with pytest.raises(NotImplementedError, 
match="Numba engine can"): + grouped.transform(agg_func, engine="numba") + + with pytest.raises(NotImplementedError, match="Numba engine can"): + grouped[1].transform(agg_func, engine="numba") From 5c354afbe317057669d559d8eb853607cf2bfed7 Mon Sep 17 00:00:00 2001 From: Leonardus Chen Date: Sun, 13 Sep 2020 21:15:07 +0800 Subject: [PATCH 0764/1025] BUG: GH36212 DataFrame agg() raises error when DataFrame column name is `name` (#36224) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/base.py | 7 +++++-- pandas/tests/frame/apply/test_frame_apply.py | 15 +++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ad90d42633355..89d94dc0cabd6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -327,6 +327,7 @@ Reshaping - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) - Bug in :func:`union_indexes` where input index names are not preserved in some cases. 
Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) - Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) +- Bug in :meth:`DataFrame.agg` with ``func={'name':}`` incorrectly raising ``TypeError`` when ``DataFrame.columns==['Name']`` (:issue:`36212`) - Sparse diff --git a/pandas/core/base.py b/pandas/core/base.py index a688302b99724..53378c0e0e252 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -470,9 +470,12 @@ def is_any_frame() -> bool: try: result = DataFrame(result) except ValueError: - # we have a dict of scalars - result = Series(result, name=getattr(self, "name", None)) + + # GH 36212 use name only if self is a series + name = self.name if (self.ndim == 1) else None + + result = Series(result, name=name) return result, True elif is_list_like(arg): diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 1662f9e2fff56..f75d7c13665f9 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1147,6 +1147,21 @@ def test_demo(self): ) tm.assert_frame_equal(result.reindex_like(expected), expected) + def test_agg_with_name_as_column_name(self): + # GH 36212 - Column name is "name" + data = {"name": ["foo", "bar"]} + df = pd.DataFrame(data) + + # result's name should be None + result = df.agg({"name": "count"}) + expected = pd.Series({"name": 2}) + tm.assert_series_equal(result, expected) + + # Check if name is still preserved when aggregating series instead + result = df["name"].agg({"name": "count"}) + expected = pd.Series({"name": 2}, name="name") + tm.assert_series_equal(result, expected) + def test_agg_multiple_mixed_no_warning(self): # GH 20909 mdf = pd.DataFrame( From 936400234724ce3ac4a4f49f0025ed5dec3d843e Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Sun, 13 Sep 2020 17:53:37 +0200 Subject: [PATCH 0765/1025] BUG: Fixe unintentionally added 
suffix in DataFrame.apply/agg and Series.apply/agg (#36231) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/aggregation.py | 3 +-- pandas/core/groupby/generic.py | 1 + pandas/tests/frame/apply/test_frame_apply.py | 11 +++++++++++ pandas/tests/groupby/aggregate/test_aggregate.py | 15 +++++++++++++++ pandas/tests/series/apply/test_series_apply.py | 8 ++++++++ 6 files changed, 37 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 2457d00eb2173..d789518f93f6d 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`DataFrame.agg`, :meth:`DataFrame.apply`, :meth:`Series.agg`, and :meth:`Series.apply` where internal suffix is exposed to the users when no relabelling is applied (:issue:`36189`) - Fixed regression in :class:`IntegerArray` unary plus and minus operations raising a ``TypeError`` (:issue:`36063`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 8b74fe01d0dc0..c123156495924 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -63,7 +63,7 @@ def reconstruct_func( Examples -------- >>> reconstruct_func(None, **{"foo": ("col", "min")}) - (True, defaultdict(None, {'col': ['min']}), ('foo',), array([0])) + (True, defaultdict(, {'col': ['min']}), ('foo',), array([0])) >>> reconstruct_func("min") (False, 'min', None, None) @@ -87,7 +87,6 @@ def reconstruct_func( if relabeling: func, columns, order = normalize_keyword_aggregation(kwargs) - func = maybe_mangle_lambdas(func) return relabeling, func, columns, order diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py 
index ffd756bed43b6..d4e673d2e538c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -951,6 +951,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return self.obj._constructor(result, index=index, columns=data.columns) relabeling, func, columns, order = reconstruct_func(func, **kwargs) + func = maybe_mangle_lambdas(func) result, how = self._aggregate(func, *args, **kwargs) if how is None: diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index f75d7c13665f9..e25b681c8c7c3 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1534,3 +1534,14 @@ def test_apply_empty_list_reduce(): result = df.apply(lambda x: [], result_type="reduce") expected = pd.Series({"a": [], "b": []}, dtype=object) tm.assert_series_equal(result, expected) + + +def test_apply_no_suffix_index(): + # GH36189 + pdf = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"]) + result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) + expected = pd.DataFrame( + {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] + ) + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 8fe450fe6abfc..c96333bc48dd4 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1153,3 +1153,18 @@ def test_nonagg_agg(): expected = g.agg("cumsum") tm.assert_frame_equal(result, expected) + + +def test_agg_no_suffix_index(): + # GH36189 + df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"]) + result = df.agg(["sum", lambda x: x.sum(), lambda x: x.sum()]) + expected = pd.DataFrame( + {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""] + ) + tm.assert_frame_equal(result, expected) + + # test Series case + result = df["A"].agg(["sum", lambda x: x.sum(), lambda x: 
x.sum()]) + expected = pd.Series([12, 12, 12], index=["sum", "", ""], name="A") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py index 827f466e23106..ce8759c4ba76d 100644 --- a/pandas/tests/series/apply/test_series_apply.py +++ b/pandas/tests/series/apply/test_series_apply.py @@ -445,6 +445,14 @@ def test_agg_cython_table_raises(self, series, func, expected): # e.g. Series('a b'.split()).cumprod() will raise series.agg(func) + def test_series_apply_no_suffix_index(self): + # GH36189 + s = pd.Series([4] * 3) + result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()]) + expected = pd.Series([12, 12, 12], index=["sum", "", ""]) + + tm.assert_series_equal(result, expected) + class TestSeriesMap: def test_map(self, datetime_series): From 386cefa44e92f1261af6f43298150874e5d10893 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 13 Sep 2020 13:19:41 -0700 Subject: [PATCH 0766/1025] CLN: remove CategoricalIndex._create_from_codes (#36342) --- pandas/core/arrays/categorical.py | 13 +++++------ pandas/core/indexes/category.py | 39 +++++++------------------------ 2 files changed, 15 insertions(+), 37 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4fa6b73932aa4..27e0a198b62a2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1376,7 +1376,7 @@ def value_counts(self, dropna=True): count = np.bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._constructor(ix, dtype=self.dtype, fastpath=True) + ix = self._from_backing_data(ix) return Series(count, index=CategoricalIndex(ix), dtype="int64") @@ -1546,9 +1546,8 @@ def sort_values( if inplace: self._codes = self._codes[sorted_idx] else: - return self._constructor( - values=self._codes[sorted_idx], dtype=self.dtype, fastpath=True - ) + codes = self._codes[sorted_idx] + return self._from_backing_data(codes) 
def _values_for_rank(self): """ @@ -1583,7 +1582,7 @@ def _values_for_rank(self): def view(self, dtype=None): if dtype is not None: raise NotImplementedError(dtype) - return self._constructor(values=self._codes, dtype=self.dtype, fastpath=True) + return self._from_backing_data(self._ndarray) def to_dense(self): """ @@ -1691,7 +1690,7 @@ def fillna(self, value=None, method=None, limit=None): f"or Series, but you passed a {type(value).__name__}" ) - return self._constructor(codes, dtype=self.dtype, fastpath=True) + return self._from_backing_data(codes) # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat @@ -2098,7 +2097,7 @@ def mode(self, dropna=True): good = self._codes != -1 codes = self._codes[good] codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) - return self._constructor(values=codes, dtype=self.dtype, fastpath=True) + return self._from_backing_data(codes) # ------------------------------------------------------------------ # ExtensionArray Interface diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 19a0910a7a282..85ef3e58576e3 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -211,29 +211,6 @@ def __new__( return cls._simple_new(data, name=name) - def _create_from_codes(self, codes, dtype=None, name=None): - """ - *this is an internal non-public method* - - create the correct categorical from codes - - Parameters - ---------- - codes : new codes - dtype: CategoricalDtype, defaults to existing - name : optional name attribute, defaults to existing - - Returns - ------- - CategoricalIndex - """ - if dtype is None: - dtype = self.dtype - if name is None: - name = self.name - cat = Categorical.from_codes(codes, dtype=dtype) - return CategoricalIndex(cat, name=name) - @classmethod def _simple_new(cls, values: Categorical, name: Label = None): assert isinstance(values, Categorical), type(values) @@ -495,7 +472,8 @@ def 
reindex(self, target, method=None, level=None, limit=None, tolerance=None): codes = new_target.codes.copy() codes[indexer == -1] = cats[missing] - new_target = self._create_from_codes(codes) + cat = self._data._from_backing_data(codes) + new_target = type(self)._simple_new(cat, name=self.name) # we always want to return an Index type here # to be consistent with .reindex for other index types (e.g. they don't @@ -695,7 +673,9 @@ def delete(self, loc): ------- new_index : Index """ - return self._create_from_codes(np.delete(self.codes, loc)) + codes = np.delete(self.codes, loc) + cat = self._data._from_backing_data(codes) + return type(self)._simple_new(cat, name=self.name) def insert(self, loc: int, item): """ @@ -720,15 +700,14 @@ def insert(self, loc: int, item): codes = self.codes codes = np.concatenate((codes[:loc], [code], codes[loc:])) - return self._create_from_codes(codes) + cat = self._data._from_backing_data(codes) + return type(self)._simple_new(cat, name=self.name) def _concat(self, to_concat, name): # if calling index is category, don't check dtype of others codes = np.concatenate([self._is_dtype_compat(c).codes for c in to_concat]) - result = self._create_from_codes(codes, name=name) - # if name is None, _create_from_codes sets self.name - result.name = name - return result + cat = self._data._from_backing_data(codes) + return type(self)._simple_new(cat, name=name) def _delegate_method(self, name: str, *args, **kwargs): """ method delegation to the ._values """ From aff856b6f3499b1ab1b208eeec99f63a1f3b1a0b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 13 Sep 2020 13:20:56 -0700 Subject: [PATCH 0767/1025] REF: move ShallowMixin to groupby.base (#36341) --- pandas/core/base.py | 17 +---------------- pandas/core/groupby/base.py | 17 ++++++++++++++++- pandas/core/resample.py | 6 +++--- pandas/core/window/common.py | 4 ++-- pandas/core/window/rolling.py | 5 +++-- 5 files changed, 25 insertions(+), 24 deletions(-) diff --git a/pandas/core/base.py 
b/pandas/core/base.py index 53378c0e0e252..4d5cddc086b2a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,7 +4,7 @@ import builtins import textwrap -from typing import Any, Callable, Dict, FrozenSet, List, Optional, Union +from typing import Any, Callable, Dict, FrozenSet, Optional, Union import numpy as np @@ -577,21 +577,6 @@ def _is_builtin_func(self, arg): return self._builtin_table.get(arg, arg) -class ShallowMixin: - _attributes: List[str] = [] - - def _shallow_copy(self, obj, **kwargs): - """ - return a new object with the replacement attributes - """ - if isinstance(obj, self._constructor): - obj = obj.obj - for attr in self._attributes: - if attr not in kwargs: - kwargs[attr] = getattr(self, attr) - return self._constructor(obj, **kwargs) - - class IndexOpsMixin: """ Common ops mixin to support a unified interface / docs for Series / Index diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 999873e7b81e4..9cfd13f95ca0e 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -13,7 +13,22 @@ OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) -class GroupByMixin(PandasObject): +class ShallowMixin(PandasObject): + _attributes: List[str] = [] + + def _shallow_copy(self, obj, **kwargs): + """ + return a new object with the replacement attributes + """ + if isinstance(obj, self._constructor): + obj = obj.obj + for attr in self._attributes: + if attr not in kwargs: + kwargs[attr] = getattr(self, attr) + return self._constructor(obj, **kwargs) + + +class GotItemMixin(PandasObject): """ Provide the groupby facilities to the mixed object. 
""" diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 4ba253e76128e..29b7bd7a63faa 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -22,9 +22,9 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries import pandas.core.algorithms as algos -from pandas.core.base import DataError, ShallowMixin +from pandas.core.base import DataError from pandas.core.generic import NDFrame, _shared_docs -from pandas.core.groupby.base import GroupByMixin +from pandas.core.groupby.base import GotItemMixin, ShallowMixin from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.groupby.groupby import ( BaseGroupBy, @@ -954,7 +954,7 @@ def h(self, _method=method): setattr(Resampler, method, h) -class _GroupByMixin(GroupByMixin): +class _GroupByMixin(GotItemMixin): """ Provide the groupby facilities. """ diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index df60d2dcf5e84..6452eb8c6b3a9 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -7,7 +7,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.groupby.base import GroupByMixin +from pandas.core.groupby.base import GotItemMixin from pandas.core.indexes.api import MultiIndex from pandas.core.shared_docs import _shared_docs @@ -43,7 +43,7 @@ def f(x): return outer -class WindowGroupByMixin(GroupByMixin): +class WindowGroupByMixin(GotItemMixin): """ Provide the groupby facilities. 
""" diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 648ab4d25be83..d094cc7d70a21 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -46,9 +46,10 @@ ) from pandas.core.dtypes.missing import notna -from pandas.core.base import DataError, PandasObject, SelectionMixin, ShallowMixin +from pandas.core.base import DataError, SelectionMixin import pandas.core.common as com from pandas.core.construction import extract_array +from pandas.core.groupby.base import ShallowMixin from pandas.core.indexes.api import Index, MultiIndex from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba from pandas.core.window.common import ( @@ -146,7 +147,7 @@ def func(arg, window, min_periods=None): return func -class _Window(PandasObject, ShallowMixin, SelectionMixin): +class _Window(ShallowMixin, SelectionMixin): _attributes: List[str] = [ "window", "min_periods", From 4077230dc9c4ad9b0d885f00e4f58d29cfe00fd9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 13 Sep 2020 21:21:17 +0100 Subject: [PATCH 0768/1025] DOC/CLN: remove versionadded/changed:: 0.23 (#36338) --- doc/source/development/extending.rst | 2 -- doc/source/getting_started/install.rst | 2 -- doc/source/user_guide/advanced.rst | 2 -- doc/source/user_guide/basics.rst | 2 -- doc/source/user_guide/categorical.rst | 2 -- doc/source/user_guide/dsintro.rst | 2 -- doc/source/user_guide/io.rst | 2 -- doc/source/user_guide/merging.rst | 4 --- doc/source/user_guide/missing_data.rst | 8 ++--- doc/source/user_guide/reshaping.rst | 2 -- doc/source/user_guide/text.rst | 6 ---- doc/source/user_guide/timedeltas.rst | 8 +---- doc/source/user_guide/timeseries.rst | 4 --- pandas/_libs/tslibs/nattype.pyx | 8 ++--- pandas/_libs/tslibs/timestamps.pyx | 9 ++--- pandas/core/algorithms.py | 2 -- pandas/core/arrays/base.py | 2 -- pandas/core/arrays/categorical.py | 2 -- pandas/core/arrays/datetimes.py | 4 --- pandas/core/arrays/interval.py | 10 ------ 
pandas/core/dtypes/base.py | 2 -- pandas/core/frame.py | 47 +++----------------------- pandas/core/generic.py | 19 ----------- pandas/core/groupby/groupby.py | 3 -- pandas/core/indexes/base.py | 4 --- pandas/core/resample.py | 2 -- pandas/core/reshape/concat.py | 1 - pandas/core/reshape/melt.py | 15 +++----- pandas/core/reshape/reshape.py | 2 -- pandas/core/reshape/tile.py | 2 -- pandas/core/series.py | 10 ++---- pandas/core/shared_docs.py | 6 ++-- pandas/core/strings.py | 3 -- pandas/core/tools/datetimes.py | 2 -- pandas/core/window/ewm.py | 1 - pandas/core/window/expanding.py | 1 - pandas/core/window/rolling.py | 4 --- pandas/io/excel/_base.py | 3 -- pandas/io/formats/style.py | 4 --- pandas/io/html.py | 4 --- pandas/io/json/_json.py | 3 -- pandas/io/stata.py | 7 ---- pandas/plotting/_core.py | 4 --- 43 files changed, 21 insertions(+), 211 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 1e6b2c646fdfd..46c2cbbe39b34 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -73,8 +73,6 @@ applies only to certain dtypes. Extension types --------------- -.. versionadded:: 0.23.0 - .. warning:: The :class:`pandas.api.extensions.ExtensionDtype` and :class:`pandas.api.extensions.ExtensionArray` APIs are new and diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index c9ac1b0d284a3..fde9f567cc3ec 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -301,8 +301,6 @@ Optional dependencies for parsing HTML One of the following combinations of libraries is needed to use the top-level :func:`~pandas.read_html` function: -.. 
versionchanged:: 0.23.0 - * `BeautifulSoup4`_ and `html5lib`_ * `BeautifulSoup4`_ and `lxml`_ * `BeautifulSoup4`_ and `html5lib`_ and `lxml`_ diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index a0331dd632583..8cd35e94ae743 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -1065,8 +1065,6 @@ are closed on. Intervals are closed on the right side by default. pd.interval_range(start=0, end=4, closed='neither') -.. versionadded:: 0.23.0 - Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements in the resulting ``IntervalIndex``: diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 87359042928eb..6b13319061ea4 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1877,8 +1877,6 @@ different columns. By indexes and values ~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.23.0 - Strings passed as the ``by`` parameter to :meth:`DataFrame.sort_values` may refer to either columns or index level names. diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 7def45ddc13e2..b7475ae7bb132 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -112,8 +112,6 @@ only labels present in a given column are categories: df['B'] -.. versionadded:: 0.23.0 - Analogously, all columns in an existing ``DataFrame`` can be batch converted using :meth:`DataFrame.astype`: .. ipython:: python diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 23bd44c1969a5..0e6767e88edc2 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -597,8 +597,6 @@ to be inserted (for example, a ``Series`` or NumPy array), or a function of one argument to be called on the ``DataFrame``. 
A *copy* of the original DataFrame is returned, with the new values inserted. -.. versionchanged:: 0.23.0 - Starting with Python 3.6 the order of ``**kwargs`` is preserved. This allows for *dependent* assignment, where an expression later in ``**kwargs`` can refer to a column created earlier in the same :meth:`~DataFrame.assign`. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 4dfabaa99fff6..a0b16e5fe5d1c 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2360,8 +2360,6 @@ A few notes on the generated table schema: then ``level_`` is used. -.. versionadded:: 0.23.0 - ``read_json`` also accepts ``orient='table'`` as an argument. This allows for the preservation of metadata such as dtypes and index names in a round-trippable manner. diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index 0639e4a7bb5e4..bc8fc5a7e4f4e 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -175,8 +175,6 @@ behavior: .. warning:: - .. versionchanged:: 0.23.0 - The default behavior with ``join='outer'`` is to sort the other axis (columns in this case). In a future version of pandas, the default will be to not sort. We specified ``sort=False`` to opt in to the new @@ -1198,8 +1196,6 @@ done using the following code. Merging on a combination of columns and index levels ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. versionadded:: 0.23 - Strings passed as the ``on``, ``left_on``, and ``right_on`` parameters may refer to either column names or index level names. This enables merging ``DataFrame`` instances on a combination of index levels and columns without diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 28206192dd161..06a7c6e33768e 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -336,10 +336,6 @@ examined :ref:`in the API `. 
Interpolation ~~~~~~~~~~~~~ -.. versionadded:: 0.23.0 - - The ``limit_area`` keyword argument was added. - Both Series and DataFrame objects have :meth:`~DataFrame.interpolate` that, by default, performs linear interpolation at missing data points. @@ -507,8 +503,8 @@ By default, ``NaN`` values are filled in a ``forward`` direction. Use ser.interpolate(limit_direction='both') By default, ``NaN`` values are filled whether they are inside (surrounded by) -existing valid values, or outside existing valid values. Introduced in v0.23 -the ``limit_area`` parameter restricts filling to either inside or outside values. +existing valid values, or outside existing valid values. The ``limit_area`` +parameter restricts filling to either inside or outside values. .. ipython:: python diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index aa6bf44547040..1b90aeb00cf9c 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -679,8 +679,6 @@ To choose another dtype, use the ``dtype`` argument: pd.get_dummies(df, dtype=bool).dtypes -.. versionadded:: 0.23.0 - .. _reshaping.factorize: diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index 3408b98b3179d..e03ba74f95c90 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -282,8 +282,6 @@ following code will cause trouble because of the regular expression meaning of # We need to escape the special character (for >1 len patterns) dollars.str.replace(r'-\$', '-') -.. versionadded:: 0.23.0 - If you do want literal replacement of a string (equivalent to :meth:`str.replace`), you can set the optional ``regex`` parameter to ``False``, rather than escaping each character. 
In this case both ``pat`` @@ -390,8 +388,6 @@ Missing values on either side will result in missing values in the result as wel Concatenating a Series and something array-like into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. versionadded:: 0.23.0 - The parameter ``others`` can also be two-dimensional. In this case, the number or rows must match the lengths of the calling ``Series`` (or ``Index``). .. ipython:: python @@ -404,8 +400,6 @@ The parameter ``others`` can also be two-dimensional. In this case, the number o Concatenating a Series and an indexed object into a Series, with alignment ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. versionadded:: 0.23.0 - For concatenation with a ``Series`` or ``DataFrame``, it is possible to align the indexes before concatenation by setting the ``join``-keyword. diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 3439a0a4c13c7..3979ad1f3e949 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -18,7 +18,7 @@ parsing, and attributes. Parsing ------- -You can construct a ``Timedelta`` scalar through various arguments: +You can construct a ``Timedelta`` scalar through various arguments, including `ISO 8601 Duration`_ strings. .. ipython:: python @@ -53,10 +53,6 @@ You can construct a ``Timedelta`` scalar through various arguments: pd.Timedelta('P0DT0H1M0S') pd.Timedelta('P0DT0H0M0.000000123S') -.. versionadded:: 0.23.0 - - Added constructor for `ISO 8601 Duration`_ strings - :ref:`DateOffsets` (``Day, Hour, Minute, Second, Milli, Micro, Nano``) can also be used in construction. .. ipython:: python @@ -387,8 +383,6 @@ The ``freq`` parameter can passed a variety of :ref:`frequency aliases DataFram Column labels to use when ``orient='index'``. Raises a ValueError if used with ``orient='columns'``. - .. 
versionadded:: 0.23.0 - Returns ------- DataFrame @@ -2115,7 +2104,6 @@ def to_stata( support Unicode characters, and version 119 supports more than 32,767 variables. - .. versionadded:: 0.23.0 .. versionchanged:: 1.0.0 Added support for formats 118 and 119. @@ -2125,9 +2113,6 @@ def to_stata( format. Only available if version is 117. Storing strings in the StrL format can produce smaller dta files if strings have more than 8 characters and values are repeated. - - .. versionadded:: 0.23.0 - compression : str or dict, default 'infer' For on-the-fly compression of the output dta. If string, specifies compression mode. If dict, value at key 'method' specifies @@ -2466,9 +2451,6 @@ def to_html( table_id : str, optional A css id is included in the opening `
    ') != -1 + @td.skip_if_no_mpl class TestStylerMatplotlibDep: From efbf6ef5ef78945a87a647091fc8f7d688d8e55e Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Fri, 7 Aug 2020 18:22:55 -0400 Subject: [PATCH 0465/1025] BUG: assign consensus name to index union in array case GH13475 (#35338) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/api.py | 5 ++-- pandas/tests/frame/test_constructors.py | 36 ++++++++++++++++++++++++ pandas/tests/reshape/test_concat.py | 37 +++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 74ef5178eb004..33e70daa55e66 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -162,6 +162,7 @@ Reshaping ^^^^^^^^^ - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) +- Bug in :func:`union_indexes` where input index names are not preserved in some cases. 
Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) - Sparse diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 4c5a70f4088ee..30cc8cf480dcf 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -218,9 +218,8 @@ def conv(i): return result elif kind == "array": index = indexes[0] - for other in indexes[1:]: - if not index.equals(other): - return _unique_indices(indexes) + if not all(index.equals(other) for other in indexes[1:]): + index = _unique_indices(indexes) name = get_consensus_names(indexes)[0] if name != index.name: diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index b78bb1c492ef4..d0f774344a33d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1618,6 +1618,42 @@ def test_constructor_Series_differently_indexed(self): tm.assert_index_equal(df2.index, other_index) tm.assert_frame_equal(df2, exp2) + @pytest.mark.parametrize( + "name_in1,name_in2,name_in3,name_out", + [ + ("idx", "idx", "idx", "idx"), + ("idx", "idx", None, "idx"), + ("idx", None, None, "idx"), + ("idx1", "idx2", None, None), + ("idx1", "idx1", "idx2", None), + ("idx1", "idx2", "idx3", None), + (None, None, None, None), + ], + ) + def test_constructor_index_names(self, name_in1, name_in2, name_in3, name_out): + # GH13475 + indices = [ + pd.Index(["a", "b", "c"], name=name_in1), + pd.Index(["b", "c", "d"], name=name_in2), + pd.Index(["c", "d", "e"], name=name_in3), + ] + series = { + c: pd.Series([0, 1, 2], index=i) for i, c in zip(indices, ["x", "y", "z"]) + } + result = pd.DataFrame(series) + + exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out) + expected = pd.DataFrame( + { + "x": [0, 1, 2, np.nan, np.nan], + "y": [np.nan, 0, 1, 2, np.nan], + "z": [np.nan, np.nan, 0, 1, 2], + }, + index=exp_ind, + ) + + tm.assert_frame_equal(result, expected) + def test_constructor_manager_resize(self, float_frame): index = 
list(float_frame.index[:5]) columns = list(float_frame.columns[:3]) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 0159fabd04d59..38cf2cc2402a1 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1279,6 +1279,43 @@ def test_concat_ignore_index(self, sort): tm.assert_frame_equal(v1, expected) + @pytest.mark.parametrize( + "name_in1,name_in2,name_in3,name_out", + [ + ("idx", "idx", "idx", "idx"), + ("idx", "idx", None, "idx"), + ("idx", None, None, "idx"), + ("idx1", "idx2", None, None), + ("idx1", "idx1", "idx2", None), + ("idx1", "idx2", "idx3", None), + (None, None, None, None), + ], + ) + def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out): + # GH13475 + indices = [ + pd.Index(["a", "b", "c"], name=name_in1), + pd.Index(["b", "c", "d"], name=name_in2), + pd.Index(["c", "d", "e"], name=name_in3), + ] + frames = [ + pd.DataFrame({c: [0, 1, 2]}, index=i) + for i, c in zip(indices, ["x", "y", "z"]) + ] + result = pd.concat(frames, axis=1) + + exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out) + expected = pd.DataFrame( + { + "x": [0, 1, 2, np.nan, np.nan], + "y": [np.nan, 0, 1, 2, np.nan], + "z": [np.nan, np.nan, 0, 1, 2], + }, + index=exp_ind, + ) + + tm.assert_frame_equal(result, expected) + def test_concat_multiindex_with_keys(self): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], From abd553d04810cd0ebc27ad6d85db2488d81bd3ca Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Fri, 7 Aug 2020 18:17:14 -0500 Subject: [PATCH 0466/1025] REGR: Fix conversion of mixed dtype DataFrame to numpy str (#35473) * Handle str better * Doc and test * Make an elif * Add back import --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/frame.py | 2 ++ pandas/core/internals/managers.py | 3 +++ pandas/tests/frame/test_api.py | 7 +++++++ 4 files changed, 13 insertions(+) diff --git 
a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index ade88a6127014..f0ad9d1ca3b0f 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aabdac16e9a1a..b66b6b92336f2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1371,6 +1371,8 @@ def to_numpy( result = self._mgr.as_array( transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value ) + if result.dtype is not dtype: + result = np.array(result, dtype=dtype, copy=False) return result diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4b85f92391dce..aa74d173d69b3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import ( DT64NS_DTYPE, is_datetimelike_v_numeric, + is_dtype_equal, is_extension_array_dtype, is_list_like, is_numeric_v_string_like, @@ -865,6 +866,8 @@ def _interleave(self, dtype=None, na_value=lib.no_default) -> np.ndarray: dtype = dtype.subtype elif is_extension_array_dtype(dtype): dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" result = np.empty(self.shape, dtype=dtype) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2b79fc8cd3406..cc57a3970d18b 100644 --- a/pandas/tests/frame/test_api.py +++ 
b/pandas/tests/frame/test_api.py @@ -367,6 +367,13 @@ def test_to_numpy_copy(self): assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr + def test_to_numpy_mixed_dtype_to_str(self): + # https://github.com/pandas-dev/pandas/issues/35455 + df = pd.DataFrame([[pd.Timestamp("2020-01-01 00:00:00"), 100.0]]) + result = df.to_numpy(dtype=str) + expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str) + tm.assert_numpy_array_equal(result, expected) + def test_swapaxes(self): df = DataFrame(np.random.randn(10, 5)) tm.assert_frame_equal(df.T, df.swapaxes(0, 1)) From 12343cd05802bc5822dd230cfe53906f0005bb23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Fri, 7 Aug 2020 21:27:39 -0400 Subject: [PATCH 0467/1025] DOC: corrected statement about compression support for file objects in to_csv (#35615) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 87f25f578c3c6..834cd992f5650 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3088,7 +3088,7 @@ def to_csv( .. versionchanged:: 1.2.0 - Compression is supported for non-binary file objects. + Compression is supported for binary file objects. quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. 
If you have set a `float_format` From 2b9217ab340d1dfa90644ec444e3a5d845f8d381 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 8 Aug 2020 10:47:43 +0100 Subject: [PATCH 0468/1025] CI: Linux py36_locale failures with pytest DeprecationWarning (#35621) --- ci/deps/azure-36-locale.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index a9b9a5a47ccf5..3034ed3dc43af 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1 + - pytest>=5.0.1,<6.0.0 # https://github.com/pandas-dev/pandas/issues/35620 - pytest-xdist>=1.21 - pytest-asyncio - hypothesis>=3.58.0 From f3e8e686af00465bb105bf6300f877a21abac6bf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 8 Aug 2020 07:51:39 -0700 Subject: [PATCH 0469/1025] REF: Avoid post-processing in blockwise op (#35356) --- pandas/core/groupby/generic.py | 97 ++++++++++++++++------------------ 1 file changed, 47 insertions(+), 50 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1fed193dba02c..53242c0332a8c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1029,11 +1029,36 @@ def _cython_agg_blocks( agg_blocks: List[Block] = [] new_items: List[np.ndarray] = [] deleted_items: List[np.ndarray] = [] - # Some object-dtype blocks might be split into List[Block[T], Block[U]] - split_items: List[np.ndarray] = [] - split_frames: List[DataFrame] = [] no_result = object() + + def cast_result_block(result, block: "Block", how: str) -> "Block": + # see if we can cast the block to the desired dtype + # this may not be the original dtype + assert not isinstance(result, DataFrame) + assert result is not no_result + + dtype = maybe_cast_result_dtype(block.dtype, how) + result = maybe_downcast_numeric(result, dtype) + + if block.is_extension and isinstance(result, np.ndarray): + # e.g. 
block.values was an IntegerArray + # (1, N) case can occur if block.values was Categorical + # and result is ndarray[object] + # TODO(EA2D): special casing not needed with 2D EAs + assert result.ndim == 1 or result.shape[0] == 1 + try: + # Cast back if feasible + result = type(block.values)._from_sequence( + result.ravel(), dtype=block.values.dtype + ) + except (ValueError, TypeError): + # reshape to be valid for non-Extension Block + result = result.reshape(1, -1) + + agg_block: Block = block.make_block(result) + return agg_block + for block in data.blocks: # Avoid inheriting result from earlier in the loop result = no_result @@ -1065,9 +1090,9 @@ def _cython_agg_blocks( # not try to add missing categories if grouping over multiple # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 - s = get_groupby(obj, self.grouper, observed=True) + sgb = get_groupby(obj, self.grouper, observed=True) try: - result = s.aggregate(lambda x: alt(x, axis=self.axis)) + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) except TypeError: # we may have an exception in trying to aggregate # continue and exclude the block @@ -1081,54 +1106,26 @@ def _cython_agg_blocks( # about a single block input returning a single block output # is a lie. To keep the code-path for the typical non-split case # clean, we choose to clean up this mess later on. - split_items.append(locs) - split_frames.append(result) - continue - - assert len(result._mgr.blocks) == 1 - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - - assert not isinstance(result, DataFrame) - - if result is not no_result: - # see if we can cast the block to the desired dtype - # this may not be the original dtype - dtype = maybe_cast_result_dtype(block.dtype, how) - result = maybe_downcast_numeric(result, dtype) - - if block.is_extension and isinstance(result, np.ndarray): - # e.g. 
block.values was an IntegerArray - # (1, N) case can occur if block.values was Categorical - # and result is ndarray[object] - # TODO(EA2D): special casing not needed with 2D EAs - assert result.ndim == 1 or result.shape[0] == 1 - try: - # Cast back if feasible - result = type(block.values)._from_sequence( - result.ravel(), dtype=block.values.dtype - ) - except (ValueError, TypeError): - # reshape to be valid for non-Extension Block - result = result.reshape(1, -1) - - agg_block: Block = block.make_block(result) - - new_items.append(locs) - agg_blocks.append(agg_block) + assert len(locs) == result.shape[1] + for i, loc in enumerate(locs): + new_items.append(np.array([loc], dtype=locs.dtype)) + agg_block = result.iloc[:, [i]]._mgr.blocks[0] + agg_blocks.append(agg_block) + else: + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + agg_block = cast_result_block(result, block, how) + new_items.append(locs) + agg_blocks.append(agg_block) + else: + agg_block = cast_result_block(result, block, how) + new_items.append(locs) + agg_blocks.append(agg_block) - if not (agg_blocks or split_frames): + if not agg_blocks: raise DataError("No numeric types to aggregate") - if split_items: - # Clean up the mess left over from split blocks. 
- for locs, result in zip(split_items, split_frames): - assert len(locs) == result.shape[1] - for i, loc in enumerate(locs): - new_items.append(np.array([loc], dtype=locs.dtype)) - agg_blocks.append(result.iloc[:, [i]]._mgr.blocks[0]) - # reset the locs in the blocks to correspond to our # current ordering indexer = np.concatenate(new_items) From 624fab64a0be4768bad10f2fd1ff63b861a51f75 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 8 Aug 2020 15:52:20 +0100 Subject: [PATCH 0470/1025] DOC: docstrings for __array_wrap__ (#35629) --- pandas/core/generic.py | 23 ++++++++++++++++++++++- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/period.py | 9 ++++++--- 4 files changed, 30 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 834cd992f5650..fcb7e2a949205 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1772,7 +1772,28 @@ def empty(self) -> bool_t: def __array__(self, dtype=None) -> np.ndarray: return np.asarray(self._values, dtype=dtype) - def __array_wrap__(self, result, context=None): + def __array_wrap__( + self, + result: np.ndarray, + context: Optional[Tuple[Callable, Tuple[Any, ...], int]] = None, + ): + """ + Gets called after a ufunc and other functions. + + Parameters + ---------- + result: np.ndarray + The result of the ufunc or other function called on the NumPy array + returned by __array__ + context: tuple of (func, tuple, int) + This parameter is returned by ufuncs as a 3-element tuple: (name of the + ufunc, arguments of the ufunc, domain of the ufunc), but is not set by + other numpy functions. + + Notes + ----- + Series implements __array_ufunc__ so this is not called for ufunc on Series. + """ result = lib.item_from_zerodim(result) if is_scalar(result): # e.g.
we get here with np.ptp(series) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bfdfbd35f27ad..bd75a064b483e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -574,7 +574,7 @@ def __array__(self, dtype=None) -> np.ndarray: def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc. + Gets called after a ufunc and other functions. """ result = lib.item_from_zerodim(result) if is_bool_dtype(result) or lib.is_scalar(result) or np.ndim(result) > 1: diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 0ce057d6e764a..8ccdab21339df 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -116,7 +116,7 @@ def values(self): def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc. + Gets called after a ufunc and other functions. """ result = lib.item_from_zerodim(result) if is_bool_dtype(result) or lib.is_scalar(result): diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index c7199e4a28a17..11334803d4583 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -345,10 +345,13 @@ def _int64index(self) -> Int64Index: def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc. Needs additional handling as - PeriodIndex stores internal data as int dtype + Gets called after a ufunc and other functions. 
- Replace this to __numpy_ufunc__ in future version + Needs additional handling as PeriodIndex stores internal data as int + dtype + + Replace this to __numpy_ufunc__ in future version and implement + __array_function__ for Indexes """ if isinstance(context, tuple) and len(context) > 0: func = context[0] From e120b1144471bd8f1fc211af2c963343cc9d0777 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 8 Aug 2020 18:49:24 +0100 Subject: [PATCH 0471/1025] TYP: update setup.cfg (#35628) --- setup.cfg | 6 ------ 1 file changed, 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index 84c281b756395..e4c0b3dcf37ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -175,9 +175,6 @@ check_untyped_defs=False [mypy-pandas.core.groupby.ops] check_untyped_defs=False -[mypy-pandas.core.indexes.base] -check_untyped_defs=False - [mypy-pandas.core.indexes.datetimes] check_untyped_defs=False @@ -214,9 +211,6 @@ check_untyped_defs=False [mypy-pandas.core.window.common] check_untyped_defs=False -[mypy-pandas.core.window.ewm] -check_untyped_defs=False - [mypy-pandas.core.window.expanding] check_untyped_defs=False From b613b9461e51a213372cb986a84f9429ec74ffdb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 10 Aug 2020 06:13:30 -0700 Subject: [PATCH 0472/1025] BUG: RollingGroupby with closed and column selection no longer raises ValueError (#35639) --- doc/source/whatsnew/v1.1.1.rst | 4 +++ pandas/core/window/common.py | 2 +- pandas/core/window/rolling.py | 10 ++---- pandas/tests/window/test_grouper.py | 51 +++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index f0ad9d1ca3b0f..7f5182e3eaa6f 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -51,6 +51,10 @@ Categorical - - +**Groupby/resample/rolling** + +- Bug in :class:`pandas.core.groupby.RollingGroupby` where passing ``closed`` with column selection would raise a ``ValueError`` 
(:issue:`35549`) + **Plotting** - diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 58e7841d4dde5..51a067427e867 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -52,7 +52,7 @@ def __init__(self, obj, *args, **kwargs): kwargs.pop("parent", None) groupby = kwargs.pop("groupby", None) if groupby is None: - groupby, obj = obj, obj.obj + groupby, obj = obj, obj._selected_obj self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index a04d68a6d6745..7347d5686aabc 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2212,7 +2212,7 @@ def _apply( # Cannot use _wrap_outputs because we calculate the result all at once # Compose MultiIndex result from grouping levels then rolling level # Aggregate the MultiIndex data as tuples then the level names - grouped_object_index = self._groupby._selected_obj.index + grouped_object_index = self.obj.index grouped_index_name = [grouped_object_index.name] groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] result_index_names = groupby_keys + grouped_index_name @@ -2236,10 +2236,6 @@ def _apply( def _constructor(self): return Rolling - @cache_readonly - def _selected_obj(self): - return self._groupby._selected_obj - def _create_blocks(self, obj: FrameOrSeries): """ Split data into blocks & return conformed data. 
@@ -2278,7 +2274,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]] if self.is_freq_type: rolling_indexer = VariableWindowIndexer - index_array = self._groupby._selected_obj.index.asi8 + index_array = self.obj.index.asi8 else: rolling_indexer = FixedWindowIndexer index_array = None @@ -2295,7 +2291,7 @@ def _gotitem(self, key, ndim, subset=None): # here so our index is carried thru to the selected obj # when we do the splitting for the groupby if self.on is not None: - self._groupby.obj = self._groupby.obj.set_index(self._on) + self.obj = self.obj.set_index(self._on) self.on = None return super()._gotitem(key, ndim, subset=subset) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 5241b9548a442..e1dcac06c39cc 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -304,3 +304,54 @@ def test_groupby_subselect_rolling(self): name="b", ) tm.assert_series_equal(result, expected) + + def test_groupby_rolling_subset_with_closed(self): + # GH 35549 + df = pd.DataFrame( + { + "column1": range(6), + "column2": range(6), + "group": 3 * ["A", "B"], + "date": [pd.Timestamp("2019-01-01")] * 6, + } + ) + result = ( + df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() + ) + expected = Series( + [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], + index=pd.MultiIndex.from_tuples( + [("A", pd.Timestamp("2019-01-01"))] * 3 + + [("B", pd.Timestamp("2019-01-01"))] * 3, + names=["group", "date"], + ), + name="column1", + ) + tm.assert_series_equal(result, expected) + + def test_groupby_subset_rolling_subset_with_closed(self): + # GH 35549 + df = pd.DataFrame( + { + "column1": range(6), + "column2": range(6), + "group": 3 * ["A", "B"], + "date": [pd.Timestamp("2019-01-01")] * 6, + } + ) + + result = ( + df.groupby("group")[["column1", "date"]] + .rolling("1D", on="date", closed="left")["column1"] 
+ .sum() + ) + expected = Series( + [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], + index=pd.MultiIndex.from_tuples( + [("A", pd.Timestamp("2019-01-01"))] * 3 + + [("B", pd.Timestamp("2019-01-01"))] * 3, + names=["group", "date"], + ), + name="column1", + ) + tm.assert_series_equal(result, expected) From 3ea490ba8fe3f2742b37bad528c5c2b07f4fd6b7 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 10 Aug 2020 14:18:53 +0100 Subject: [PATCH 0473/1025] DEPR: Deprecate inplace param in MultiIndex.set_codes and MultiIndex.set_levels (#35626) --- doc/source/user_guide/indexing.rst | 8 +-- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/conftest.py | 2 +- pandas/core/indexes/multi.py | 26 ++++++++- pandas/tests/frame/methods/test_sort_index.py | 4 +- pandas/tests/indexes/multi/test_compat.py | 6 +- pandas/tests/indexes/multi/test_duplicates.py | 3 +- .../tests/indexes/multi/test_equivalence.py | 6 +- pandas/tests/indexes/multi/test_get_set.py | 55 ++++++++++++++----- pandas/tests/indexes/multi/test_integrity.py | 3 +- .../tests/indexing/multiindex/test_sorted.py | 8 ++- pandas/tests/reshape/test_melt.py | 4 +- pandas/tests/test_multilevel.py | 4 +- 13 files changed, 94 insertions(+), 37 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 6843dd1eadc81..cac18f5bf39cd 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1532,12 +1532,8 @@ Setting metadata ~~~~~~~~~~~~~~~~ Indexes are "mostly immutable", but it is possible to set and change their -metadata, like the index ``name`` (or, for ``MultiIndex``, ``levels`` and -``codes``). - -You can use the ``rename``, ``set_names``, ``set_levels``, and ``set_codes`` -to set these attributes directly. They default to returning a copy; however, -you can specify ``inplace=True`` to have the data change in place. +``name`` attribute. You can use the ``rename``, ``set_names`` to set these attributes +directly, and they default to returning a copy. 
See :ref:`Advanced Indexing ` for usage of MultiIndexes. diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 33e70daa55e66..d3bccada09c29 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -47,7 +47,7 @@ Other enhancements Deprecations ~~~~~~~~~~~~ - +- Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - - diff --git a/pandas/conftest.py b/pandas/conftest.py index e0adb37e7d2f5..c1925b4f5ca3b 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -359,7 +359,7 @@ def multiindex_year_month_day_dataframe_random_data(): tdf = tm.makeTimeDataFrame(100) ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum() # use Int64Index, to make sure things work - ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels], inplace=True) + ymd.index = ymd.index.set_levels([lev.astype("i8") for lev in ymd.index.levels]) ymd.index.set_names(["year", "month", "day"], inplace=True) return ymd diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a6e8ec0707de7..13927dede5542 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -740,7 +740,7 @@ def _set_levels( self._tuples = None self._reset_cache() - def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): + def set_levels(self, levels, level=None, inplace=None, verify_integrity=True): """ Set new levels on MultiIndex. Defaults to returning new index. @@ -752,6 +752,8 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): Level(s) to set (None for all levels). inplace : bool If True, mutates in place. + + .. deprecated:: 1.2.0 verify_integrity : bool, default True If True, checks that levels and codes are compatible. 
@@ -822,6 +824,15 @@ def set_levels(self, levels, level=None, inplace=False, verify_integrity=True): >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]]) """ + if inplace is not None: + warnings.warn( + "inplace is deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + if is_list_like(levels) and not isinstance(levels, Index): levels = list(levels) @@ -898,7 +909,7 @@ def _set_codes( self._tuples = None self._reset_cache() - def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): + def set_codes(self, codes, level=None, inplace=None, verify_integrity=True): """ Set new codes on MultiIndex. Defaults to returning new index. @@ -914,6 +925,8 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): Level(s) to set (None for all levels). inplace : bool If True, mutates in place. + + .. deprecated:: 1.2.0 verify_integrity : bool (default True) If True, checks that levels and codes are compatible. 
@@ -958,6 +971,15 @@ def set_codes(self, codes, level=None, inplace=False, verify_integrity=True): (1, 'two')], names=['foo', 'bar']) """ + if inplace is not None: + warnings.warn( + "inplace is deprecated and will be removed in a future version.", + FutureWarning, + stacklevel=2, + ) + else: + inplace = False + if level is not None and not is_list_like(level): if not is_list_like(codes): raise TypeError("Codes must be list-like") diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py index 5216c3be116e0..dcc33428d18a5 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -555,8 +555,8 @@ def test_sort_index_and_reconstruction(self): ), ) - df.columns.set_levels( - pd.to_datetime(df.columns.levels[1]), level=1, inplace=True + df.columns = df.columns.set_levels( + pd.to_datetime(df.columns.levels[1]), level=1 ) assert not df.columns.is_lexsorted() assert not df.columns.is_monotonic diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index d1f66af4a8e83..b2500efef9e03 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -84,7 +84,8 @@ def test_inplace_mutation_resets_values(): tm.assert_almost_equal(mi1.values, vals) # Inplace should kill _tuples - mi1.set_levels(levels2, inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi1.set_levels(levels2, inplace=True) tm.assert_almost_equal(mi1.values, vals2) # Make sure label setting works too @@ -103,7 +104,8 @@ def test_inplace_mutation_resets_values(): tm.assert_almost_equal(exp_values, new_values) # ...and again setting inplace should kill _tuples, etc - mi2.set_codes(codes2, inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi2.set_codes(codes2, inplace=True) tm.assert_almost_equal(mi2.values, new_values) diff --git a/pandas/tests/indexes/multi/test_duplicates.py 
b/pandas/tests/indexes/multi/test_duplicates.py index e48731b9c8099..9add4b478da47 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -91,7 +91,8 @@ def test_duplicate_multiindex_codes(): mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) msg = r"Level values must be unique: \[[AB', ]+\] on level 0" with pytest.raises(ValueError, match=msg): - mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True) @pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]]) diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index 063ede028add7..b48f09457b96c 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -192,10 +192,12 @@ def test_is_(): mi4 = mi3.view() # GH 17464 - Remove duplicate MultiIndex levels - mi4.set_levels([list(range(10)), list(range(10))], inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi4.set_levels([list(range(10)), list(range(10))], inplace=True) assert not mi4.is_(mi3) mi5 = mi.view() - mi5.set_levels(mi5.levels, inplace=True) + with tm.assert_produces_warning(FutureWarning): + mi5.set_levels(mi5.levels, inplace=True) assert not mi5.is_(mi) diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py index 8a3deca0236e4..b9132f429905d 100644 --- a/pandas/tests/indexes/multi/test_get_set.py +++ b/pandas/tests/indexes/multi/test_get_set.py @@ -93,7 +93,8 @@ def test_set_levels(idx): # level changing [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels, inplace=True) assert inplace_return is None 
assert_matching(ind2.levels, new_levels) @@ -113,20 +114,23 @@ def test_set_levels(idx): # level changing specific level [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True) assert inplace_return is None assert_matching(ind2.levels, [new_levels[0], levels[1]]) assert_matching(idx.levels, levels) ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True) assert inplace_return is None assert_matching(ind2.levels, [levels[0], new_levels[1]]) assert_matching(idx.levels, levels) # level changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True) assert inplace_return is None assert_matching(ind2.levels, new_levels) assert_matching(idx.levels, levels) @@ -136,19 +140,23 @@ def test_set_levels(idx): original_index = idx.copy() for inplace in [True, False]: with pytest.raises(ValueError, match="^On"): - idx.set_levels(["c"], level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_levels(["c"], level=0, inplace=inplace) assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(ValueError, match="^On"): - idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace) assert_matching(idx.codes, original_index.codes, check_dtype=True) with pytest.raises(TypeError, match="^Levels"): - idx.set_levels("c", level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + 
idx.set_levels("c", level=0, inplace=inplace) assert_matching(idx.levels, original_index.levels, check_dtype=True) with pytest.raises(TypeError, match="^Codes"): - idx.set_codes(1, level=0, inplace=inplace) + with tm.assert_produces_warning(FutureWarning): + idx.set_codes(1, level=0, inplace=inplace) assert_matching(idx.codes, original_index.codes, check_dtype=True) @@ -168,7 +176,8 @@ def test_set_codes(idx): # changing label w/ mutation ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes, inplace=True) assert inplace_return is None assert_matching(ind2.codes, new_codes) @@ -188,20 +197,23 @@ def test_set_codes(idx): # label changing specific level w/ mutation ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True) assert inplace_return is None assert_matching(ind2.codes, [new_codes[0], codes[1]]) assert_matching(idx.codes, codes) ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True) assert inplace_return is None assert_matching(ind2.codes, [codes[0], new_codes[1]]) assert_matching(idx.codes, codes) # codes changing multiple levels [w/ mutation] ind2 = idx.copy() - inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True) + with tm.assert_produces_warning(FutureWarning): + inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True) assert inplace_return is None assert_matching(ind2.codes, new_codes) assert_matching(idx.codes, codes) @@ -217,7 +229,8 @@ def test_set_codes(idx): # [w/ mutation] result = ind.copy() - result.set_codes(codes=new_codes, level=1, inplace=True) + with tm.assert_produces_warning(FutureWarning): 
+ result.set_codes(codes=new_codes, level=1, inplace=True) assert result.equals(expected) @@ -329,3 +342,19 @@ def test_set_levels_with_iterable(): [expected_sizes, colors], names=["size", "color"] ) tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("inplace", [True, False]) +def test_set_codes_inplace_deprecated(idx, inplace): + new_codes = idx.codes[1][::-1] + + with tm.assert_produces_warning(FutureWarning): + idx.set_codes(codes=new_codes, level=1, inplace=inplace) + + +@pytest.mark.parametrize("inplace", [True, False]) +def test_set_levels_inplace_deprecated(idx, inplace): + new_level = idx.levels[1].copy() + + with tm.assert_produces_warning(FutureWarning): + idx.set_levels(levels=new_level, level=1, inplace=inplace) diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index fd150bb4d57a2..c776a33717ccd 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -220,7 +220,8 @@ def test_metadata_immutable(idx): def test_level_setting_resets_attributes(): ind = pd.MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]]) assert ind.is_monotonic - ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True) + with tm.assert_produces_warning(FutureWarning): + ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True) # if this fails, probably didn't reset the cache correctly. 
assert not ind.is_monotonic diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py index 572cb9da405d1..bafe5068e1418 100644 --- a/pandas/tests/indexing/multiindex/test_sorted.py +++ b/pandas/tests/indexing/multiindex/test_sorted.py @@ -43,9 +43,13 @@ def test_frame_getitem_not_sorted2(self, key): df2 = df.set_index(["col1", "col2"]) df2_original = df2.copy() - return_value = df2.index.set_levels(["b", "d", "a"], level="col1", inplace=True) + with tm.assert_produces_warning(FutureWarning): + return_value = df2.index.set_levels( + ["b", "d", "a"], level="col1", inplace=True + ) assert return_value is None - return_value = df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True) + with tm.assert_produces_warning(FutureWarning): + return_value = df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True) assert return_value is None assert not df2.index.is_lexsorted() assert not df2.index.is_monotonic diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index 2b75a1ec6ca6e..79879ef346f53 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -799,7 +799,7 @@ def test_invalid_separator(self): expected = expected.set_index(["id", "year"])[ ["X", "A2010", "A2011", "B2010", "A", "B"] ] - expected.index.set_levels([0, 1], level=0, inplace=True) + expected.index = expected.index.set_levels([0, 1], level=0) result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep) tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1)) @@ -861,7 +861,7 @@ def test_invalid_suffixtype(self): expected = pd.DataFrame(exp_data).astype({"year": "int"}) expected = expected.set_index(["id", "year"]) - expected.index.set_levels([0, 1], level=0, inplace=True) + expected.index = expected.index.set_levels([0, 1], level=0) result = wide_to_long(df, ["A", "B"], i="id", j="year") tm.assert_frame_equal(result.sort_index(axis=1), 
expected.sort_index(axis=1)) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 1ba73292dc0b4..724558bd49ea2 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -63,8 +63,8 @@ def setup_method(self, method): ).sum() # use Int64Index, to make sure things work - self.ymd.index.set_levels( - [lev.astype("i8") for lev in self.ymd.index.levels], inplace=True + self.ymd.index = self.ymd.index.set_levels( + [lev.astype("i8") for lev in self.ymd.index.levels] ) self.ymd.index.set_names(["year", "month", "day"], inplace=True) From 70665cb5a02b3addbc48517eb6cd776e0ffb03c7 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 10 Aug 2020 14:24:29 +0100 Subject: [PATCH 0474/1025] REF: Simplify Index.copy (#35592) --- pandas/core/dtypes/common.py | 28 +++++++++++++++++- pandas/core/indexes/base.py | 36 +++++++++++++++--------- pandas/core/indexes/range.py | 5 ++-- pandas/core/series.py | 4 +-- pandas/tests/dtypes/test_common.py | 10 +++++++ pandas/tests/indexes/common.py | 14 +++++++++ pandas/tests/indexes/multi/test_names.py | 7 +++++ 7 files changed, 84 insertions(+), 20 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 73109020b1b54..1e70ff90fcd44 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -9,7 +9,7 @@ from pandas._libs import Interval, Period, algos from pandas._libs.tslibs import conversion -from pandas._typing import ArrayLike, DtypeObj +from pandas._typing import ArrayLike, DtypeObj, Optional from pandas.core.dtypes.base import registry from pandas.core.dtypes.dtypes import ( @@ -1732,6 +1732,32 @@ def _validate_date_like_dtype(dtype) -> None: ) +def validate_all_hashable(*args, error_name: Optional[str] = None) -> None: + """ + Return None if all args are hashable, else raise a TypeError. + + Parameters + ---------- + *args + Arguments to validate. 
+ error_name : str, optional + The name to use in the error message. + + Raises + ------ + TypeError : If an argument is not hashable + + Returns + ------- + None + """ + if not all(is_hashable(arg) for arg in args): + if error_name: + raise TypeError(f"{error_name} must be a hashable type") + else: + raise TypeError("All elements must be hashable") + + def pandas_dtype(dtype) -> DtypeObj: """ Convert input into a pandas only dtype object or a numpy dtype object. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bd75a064b483e..ecd3670e724a1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -58,6 +58,7 @@ is_timedelta64_dtype, is_unsigned_integer_dtype, pandas_dtype, + validate_all_hashable, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ( @@ -812,13 +813,11 @@ def copy(self, name=None, deep=False, dtype=None, names=None): In most cases, there should be no functional difference from using ``deep``, but if ``deep`` is passed it will attempt to deepcopy. """ + name = self._validate_names(name=name, names=names, deep=deep)[0] if deep: - new_index = self._shallow_copy(self._data.copy()) + new_index = self._shallow_copy(self._data.copy(), name=name) else: - new_index = self._shallow_copy() - - names = self._validate_names(name=name, names=names, deep=deep) - new_index = new_index.set_names(names) + new_index = self._shallow_copy(name=name) if dtype: new_index = new_index.astype(dtype) @@ -1186,7 +1185,7 @@ def name(self, value): maybe_extract_name(value, None, type(self)) self._name = value - def _validate_names(self, name=None, names=None, deep: bool = False): + def _validate_names(self, name=None, names=None, deep: bool = False) -> List[Label]: """ Handles the quirks of having a singular 'name' parameter for general Index and plural 'names' parameter for MultiIndex.
@@ -1196,15 +1195,25 @@ def _validate_names(self, name=None, names=None, deep: bool = False): if names is not None and name is not None: raise TypeError("Can only provide one of `names` and `name`") elif names is None and name is None: - return deepcopy(self.names) if deep else self.names + new_names = deepcopy(self.names) if deep else self.names elif names is not None: if not is_list_like(names): raise TypeError("Must pass list-like as `names`.") - return names + new_names = names + elif not is_list_like(name): + new_names = [name] else: - if not is_list_like(name): - return [name] - return name + new_names = name + + if len(new_names) != len(self.names): + raise ValueError( + f"Length of new names must be {len(self.names)}, got {len(new_names)}" + ) + + # All items in 'new_names' need to be hashable + validate_all_hashable(*new_names, error_name=f"{type(self).__name__}.name") + + return new_names def _get_names(self): return FrozenList((self.name,)) @@ -1232,9 +1241,8 @@ def _set_names(self, values, level=None): # GH 20527 # All items in 'name' need to be hashable: - for name in values: - if not is_hashable(name): - raise TypeError(f"{type(self).__name__}.name must be a hashable type") + validate_all_hashable(*values, error_name=f"{type(self).__name__}.name") + self._name = values[0] names = property(fset=_set_names, fget=_get_names) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index e9c4c301f4dca..3577a7aacc008 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -388,9 +388,8 @@ def _shallow_copy(self, values=None, name: Label = no_default): def copy(self, name=None, deep=False, dtype=None, names=None): self._validate_dtype(dtype) - new_index = self._shallow_copy() - names = self._validate_names(name=name, names=names, deep=deep) - new_index = new_index.set_names(names) + name = self._validate_names(name=name, names=names, deep=deep)[0] + new_index = self._shallow_copy(name=name) return new_index def 
_minmax(self, meth: str): diff --git a/pandas/core/series.py b/pandas/core/series.py index 9e70120f67969..93368ea1e515f 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -54,6 +54,7 @@ is_list_like, is_object_dtype, is_scalar, + validate_all_hashable, ) from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.inference import is_hashable @@ -491,8 +492,7 @@ def name(self) -> Label: @name.setter def name(self, value: Label) -> None: - if not is_hashable(value): - raise TypeError("Series.name must be a hashable type") + validate_all_hashable(value, error_name=f"{type(self).__name__}.name") object.__setattr__(self, "_name", value) @property diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index ce12718e48d0d..a6c526fcb008a 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -746,3 +746,13 @@ def test_astype_object_preserves_datetime_na(from_type): result = astype_nansafe(arr, dtype="object") assert isna(result)[0] + + +def test_validate_allhashable(): + assert com.validate_all_hashable(1, "a") is None + + with pytest.raises(TypeError, match="All elements must be hashable"): + com.validate_all_hashable([]) + + with pytest.raises(TypeError, match="list must be a hashable type"): + com.validate_all_hashable([], error_name="list") diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 238ee8d304d05..98f7c0eadb4bb 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -270,6 +270,20 @@ def test_copy_name(self, index): s3 = s1 * s2 assert s3.index.name == "mario" + def test_name2(self, index): + # gh-35592 + if isinstance(index, MultiIndex): + return + + assert index.copy(name="mario").name == "mario" + + with pytest.raises(ValueError, match="Length of new names must be 1, got 2"): + index.copy(name=["mario", "luigi"]) + + msg = f"{type(index).__name__}.name must be a hashable type" + with 
pytest.raises(TypeError, match=msg): + index.copy(name=[["mario"]]) + def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured # GH12309 diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py index 479b5ef0211a0..f38da7ad2ae1c 100644 --- a/pandas/tests/indexes/multi/test_names.py +++ b/pandas/tests/indexes/multi/test_names.py @@ -75,6 +75,13 @@ def test_copy_names(): assert multi_idx.names == ["MyName1", "MyName2"] assert multi_idx3.names == ["NewName1", "NewName2"] + # gh-35592 + with pytest.raises(ValueError, match="Length of new names must be 2, got 1"): + multi_idx.copy(names=["mario"]) + + with pytest.raises(TypeError, match="MultiIndex.name must be a hashable type"): + multi_idx.copy(names=[["mario"], ["luigi"]]) + def test_names(idx, index_names): From b2f1fac2218fe183877fb3f7905ca7163ba4f7b1 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 10 Aug 2020 23:30:12 +1000 Subject: [PATCH 0475/1025] =?UTF-8?q?BUG:=20Fix=20assert=5Fequal=20when=20?= =?UTF-8?q?check=5Fexact=3DTrue=20for=20non-numeric=20dtypes=20#3=E2=80=A6?= =?UTF-8?q?=20(#35522)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/_testing.py | 6 ++---- pandas/tests/util/test_assert_series_equal.py | 15 +++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 7f5182e3eaa6f..e5860644fa371 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -17,6 +17,7 @@ Fixed regressions - Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). 
+- Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) diff --git a/pandas/_testing.py b/pandas/_testing.py index a020fbff3553a..713f29466f097 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1339,10 +1339,8 @@ def assert_series_equal( else: assert_attr_equal("dtype", left, right, obj=f"Attributes of {obj}") - if check_exact: - if not is_numeric_dtype(left.dtype): - raise AssertionError("check_exact may only be used with numeric Series") - + if check_exact and is_numeric_dtype(left.dtype) and is_numeric_dtype(right.dtype): + # Only check exact if dtype is numeric assert_numpy_array_equal( left._values, right._values, diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index 1284cc9d4f49b..a7b5aeac560e4 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -281,3 +281,18 @@ class MySeries(Series): with pytest.raises(AssertionError, match="Series classes are different"): tm.assert_series_equal(s3, s1, check_series_type=True) + + +def test_series_equal_exact_for_nonnumeric(): + # https://github.com/pandas-dev/pandas/issues/35446 + s1 = Series(["a", "b"]) + s2 = Series(["a", "b"]) + s3 = Series(["b", "a"]) + + tm.assert_series_equal(s1, s2, check_exact=True) + tm.assert_series_equal(s2, s1, check_exact=True) + + with pytest.raises(AssertionError): + tm.assert_series_equal(s1, s3, check_exact=True) + with pytest.raises(AssertionError): + tm.assert_series_equal(s3, s1, 
check_exact=True) From 4a71806e85454ff8e0c1e33b0bea9130d9b250de Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 11:19:25 -0400 Subject: [PATCH 0476/1025] Storage options (#35381) --- doc/source/user_guide/io.rst | 61 +++++++++++++--- doc/source/whatsnew/v1.2.0.rst | 14 ++++ pandas/_typing.py | 3 + pandas/conftest.py | 22 ++++++ pandas/core/frame.py | 37 +++++++++- pandas/core/generic.py | 44 +++++++++++- pandas/core/series.py | 20 +++++- pandas/io/common.py | 24 +++++-- pandas/io/feather_format.py | 25 +++++-- pandas/io/formats/csvs.py | 9 ++- pandas/io/json/_json.py | 28 ++++++-- pandas/io/parquet.py | 48 +++++++++++-- pandas/io/parsers.py | 5 +- pandas/io/pickle.py | 34 +++++++-- pandas/io/sas/sas_xport.py | 2 +- pandas/io/stata.py | 53 +++++++++++--- pandas/tests/io/test_fsspec.py | 125 ++++++++++++++++++++++++++++++++- pandas/tests/io/test_s3.py | 2 +- 18 files changed, 502 insertions(+), 54 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index ab233f653061a..35403b5c8b66f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -1649,29 +1649,72 @@ options include: Specifying any of the above options will produce a ``ParserWarning`` unless the python engine is selected explicitly using ``engine='python'``. -Reading remote files -'''''''''''''''''''' +.. _io.remote: + +Reading/writing remote files +'''''''''''''''''''''''''''' -You can pass in a URL to a CSV file: +You can pass in a URL to read or write remote files to many of Pandas' IO +functions - the following example shows reading a CSV file: .. code-block:: python df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item', sep='\t') -S3 URLs are handled as well but require installing the `S3Fs +All URLs which are not local files or HTTP(s) are handled by +`fsspec`_, if installed, and its various filesystem implementations +(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...). 
+Some of these implementations will require additional packages to be +installed, for example +S3 URLs require the `s3fs `_ library: .. code-block:: python - df = pd.read_csv('s3://pandas-test/tips.csv') + df = pd.read_json('s3://pandas-test/adatafile.json') + +When dealing with remote storage systems, you might need +extra configuration with environment variables or config files in +special locations. For example, to access data in your S3 bucket, +you will need to define credentials in one of the several ways listed in +the `S3Fs documentation +`_. The same is true +for several of the storage backends, and you should follow the links +at `fsimpl1`_ for implementations built into ``fsspec`` and `fsimpl2`_ +for those not included in the main ``fsspec`` +distribution. + +You can also pass parameters directly to the backend driver. For example, +if you do *not* have S3 credentials, you can still access public data by +specifying an anonymous connection, such as + +.. versionadded:: 1.2.0 + +.. code-block:: python + + pd.read_csv("s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013" + "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", + storage_options={"anon": True}) + +``fsspec`` also allows complex URLs, for accessing data in compressed +archives, local caching of files, and more. To locally cache the above +example, you would modify the call to + +.. code-block:: python -If your S3 bucket requires credentials you will need to set them as environment -variables or in the ``~/.aws/credentials`` config file, refer to the `S3Fs -documentation on credentials -`_. + pd.read_csv("simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/" + "SaKe2013-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", + storage_options={"s3": {"anon": True}}) +where we specify that the "anon" parameter is meant for the "s3" part of +the implementation, not to the caching implementation. 
Note that this caches to a temporary +directory for the duration of the session only, but you can also specify +a permanent store. +.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/ +.. _fsimpl1: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations +.. _fsimpl2: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations Writing out data '''''''''''''''' diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d3bccada09c29..94bb265c32e4c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,6 +13,20 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +Passing arguments to fsspec backends +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Many read/write functions have acquired the ``storage_options`` optional argument, +to pass a dictionary of parameters to the storage backend. This allows, for +example, for passing credentials to S3 and GCS storage. The details of what +parameters can be passed to which backends can be found in the documentation +of the individual storage backends (detailed from the fsspec docs for +`builtin implementations`_ and linked to `external ones`_). See +Section :ref:`io.remote`. + +.. _builtin implementations: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations +.. _external ones: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations + .. 
_whatsnew_120.binary_handle_to_csv: Support for binary file handles in ``to_csv`` diff --git a/pandas/_typing.py b/pandas/_typing.py index 76ec527e6e258..47a102ddc70e0 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -106,3 +106,6 @@ List[AggFuncTypeBase], Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]], ] + +# for arbitrary kwargs passed during reading/writing files +StorageOptions = Optional[Dict[str, Any]] diff --git a/pandas/conftest.py b/pandas/conftest.py index c1925b4f5ca3b..97cc514e31bb3 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1224,3 +1224,25 @@ def sort_by_key(request): Tests None (no key) and the identity key. """ return request.param + + +@pytest.fixture() +def fsspectest(): + pytest.importorskip("fsspec") + from fsspec import register_implementation + from fsspec.implementations.memory import MemoryFileSystem + from fsspec.registry import _registry as registry + + class TestMemoryFS(MemoryFileSystem): + protocol = "testmem" + test = [None] + + def __init__(self, **kwargs): + self.test[0] = kwargs.pop("test", None) + super().__init__(**kwargs) + + register_implementation("testmem", TestMemoryFS, clobber=True) + yield TestMemoryFS() + registry.pop("testmem", None) + TestMemoryFS.test[0] = None + TestMemoryFS.store.clear() diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b66b6b92336f2..9d0751fcce460 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -55,6 +55,7 @@ Label, Level, Renamer, + StorageOptions, ValueKeyFunc, ) from pandas.compat import PY37 @@ -2058,6 +2059,7 @@ def to_stata( version: Optional[int] = 114, convert_strl: Optional[Sequence[Label]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: StorageOptions = None, ) -> None: """ Export DataFrame object to Stata dta format. @@ -2134,6 +2136,16 @@ def to_stata( .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. 
+ host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. + + .. versionadded:: 1.2.0 + Raises ------ NotImplementedError @@ -2194,6 +2206,7 @@ def to_stata( write_index=write_index, variable_labels=variable_labels, compression=compression, + storage_options=storage_options, **kwargs, ) writer.write_file() @@ -2246,9 +2259,10 @@ def to_feather(self, path, **kwargs) -> None: ) def to_markdown( self, - buf: Optional[IO[str]] = None, - mode: Optional[str] = None, + buf: Optional[Union[IO[str], str]] = None, + mode: str = "wt", index: bool = True, + storage_options: StorageOptions = None, **kwargs, ) -> Optional[str]: if "showindex" in kwargs: @@ -2266,9 +2280,14 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) + buf, _, _, should_close = get_filepath_or_buffer( + buf, mode=mode, storage_options=storage_options + ) assert buf is not None # Help mypy. + assert not isinstance(buf, str) buf.writelines(result) + if should_close: + buf.close() return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") @@ -2279,6 +2298,7 @@ def to_parquet( compression: Optional[str] = "snappy", index: Optional[bool] = None, partition_cols: Optional[List[str]] = None, + storage_options: StorageOptions = None, **kwargs, ) -> None: """ @@ -2327,6 +2347,16 @@ def to_parquet( .. versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. 
See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. @@ -2373,6 +2403,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, + storage_options=storage_options, **kwargs, ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fcb7e2a949205..520023050d49d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -40,6 +40,7 @@ Label, Level, Renamer, + StorageOptions, TimedeltaConvertibleTypes, TimestampConvertibleTypes, ValueKeyFunc, @@ -2058,6 +2059,7 @@ def to_json( compression: Optional[str] = "infer", index: bool_t = True, indent: Optional[int] = None, + storage_options: StorageOptions = None, ) -> Optional[str]: """ Convert the object to a JSON string. @@ -2141,6 +2143,16 @@ def to_json( .. versionadded:: 1.0.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + Returns ------- None or str @@ -2319,6 +2331,7 @@ def to_json( compression=compression, index=index, indent=indent, + storage_options=storage_options, ) def to_hdf( @@ -2633,6 +2646,7 @@ def to_pickle( path, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: StorageOptions = None, ) -> None: """ Pickle (serialize) object to file. @@ -2653,6 +2667,16 @@ def to_pickle( .. [1] https://docs.python.org/3/library/pickle.html. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. 
+ host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + See Also -------- read_pickle : Load pickled pandas object (or any object) from file. @@ -2686,7 +2710,13 @@ def to_pickle( """ from pandas.io.pickle import to_pickle - to_pickle(self, path, compression=compression, protocol=protocol) + to_pickle( + self, + path, + compression=compression, + protocol=protocol, + storage_options=storage_options, + ) def to_clipboard( self, excel: bool_t = True, sep: Optional[str] = None, **kwargs @@ -3031,6 +3061,7 @@ def to_csv( escapechar: Optional[str] = None, decimal: Optional[str] = ".", errors: str = "strict", + storage_options: StorageOptions = None, ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3142,6 +3173,16 @@ def to_csv( .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.2.0 + Returns ------- None or str @@ -3194,6 +3235,7 @@ def to_csv( doublequote=doublequote, escapechar=escapechar, decimal=decimal, + storage_options=storage_options, ) formatter.save() diff --git a/pandas/core/series.py b/pandas/core/series.py index 93368ea1e515f..e8bf87a39b572 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -31,6 +31,7 @@ FrameOrSeriesUnion, IndexKeyFunc, Label, + StorageOptions, ValueKeyFunc, ) from pandas.compat.numpy import function as nv @@ -1422,8 +1423,9 @@ def to_string( def to_markdown( self, buf: Optional[IO[str]] = None, - mode: Optional[str] = None, + mode: str = "wt", index: bool = True, + storage_options: StorageOptions = None, **kwargs, ) -> Optional[str]: """ @@ -1436,12 +1438,22 @@ def to_markdown( buf : str, Path or StringIO-like, optional, default None Buffer to write to. If None, the output is returned as a string. mode : str, optional - Mode in which file is opened. + Mode in which file is opened, "wt" by default. index : bool, optional, default True Add index (row) labels. .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + **kwargs These parameters will be passed to `tabulate \ `_. 
@@ -1477,7 +1489,9 @@ def to_markdown( | 3 | quetzal | +----+----------+ """ - return self.to_frame().to_markdown(buf, mode, index, **kwargs) + return self.to_frame().to_markdown( + buf, mode, index, storage_options=storage_options, **kwargs + ) # ---------------------------------------------------------------------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 34e4425c657f1..9ac642e58b544 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -29,7 +29,7 @@ ) import zipfile -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat import _get_lzma_file, _import_lzma from pandas.compat._optional import import_optional_dependency @@ -162,7 +162,7 @@ def get_filepath_or_buffer( encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, - storage_options: Optional[Dict[str, Any]] = None, + storage_options: StorageOptions = None, ): """ If the filepath_or_buffer is a url, translate and return the buffer. @@ -175,8 +175,16 @@ def get_filepath_or_buffer( compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional - storage_options: dict, optional - passed on to fsspec, if using it; this is not yet accessed by the public API + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.2.0 Returns ------- @@ -188,6 +196,10 @@ def get_filepath_or_buffer( if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged + if storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": @@ -242,6 +254,10 @@ def get_filepath_or_buffer( ).open() return file_obj, encoding, compression, True + elif storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index dfa43942fc8b3..2c664e73b9463 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -4,10 +4,10 @@ from pandas import DataFrame, Int64Index, RangeIndex -from pandas.io.common import get_filepath_or_buffer, stringify_path +from pandas.io.common import get_filepath_or_buffer -def to_feather(df: DataFrame, path, **kwargs): +def to_feather(df: DataFrame, path, storage_options=None, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -15,6 +15,17 @@ def to_feather(df: DataFrame, path, **kwargs): ---------- df : DataFrame path : string file path, or file-like object + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.2.0 + **kwargs : Additional keywords passed to `pyarrow.feather.write_feather`. @@ -23,7 +34,9 @@ def to_feather(df: DataFrame, path, **kwargs): import_optional_dependency("pyarrow") from pyarrow import feather - path = stringify_path(path) + path, _, _, should_close = get_filepath_or_buffer( + path, mode="wb", storage_options=storage_options + ) if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -64,7 +77,7 @@ def to_feather(df: DataFrame, path, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True): +def read_feather(path, columns=None, use_threads: bool = True, storage_options=None): """ Load a feather-format object from the file path. @@ -98,7 +111,9 @@ def read_feather(path, columns=None, use_threads: bool = True): import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer(path) + path, _, _, should_close = get_filepath_or_buffer( + path, storage_options=storage_options + ) df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index b10946a20d041..6eceb94387171 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -11,7 +11,7 @@ import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -53,6 +53,7 @@ def __init__( doublequote: bool = True, escapechar: Optional[str] = None, decimal=".", + storage_options: StorageOptions = None, ): self.obj = obj @@ -63,7 +64,11 @@ def __init__( compression, self.compression_args = get_compression_method(compression) self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, mode=mode + 
path_or_buf, + encoding=encoding, + compression=compression, + mode=mode, + storage_options=storage_options, ) self.sep = sep self.na_rep = na_rep diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 0b06a26d4aa3c..0d2b351926343 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -9,7 +9,7 @@ import pandas._libs.json as json from pandas._libs.tslibs import iNaT -from pandas._typing import JSONSerializable +from pandas._typing import JSONSerializable, StorageOptions from pandas.errors import AbstractMethodError from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments @@ -44,6 +44,7 @@ def to_json( compression: Optional[str] = "infer", index: bool = True, indent: int = 0, + storage_options: StorageOptions = None, ): if not index and orient not in ["split", "table"]: @@ -52,8 +53,11 @@ def to_json( ) if path_or_buf is not None: - path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, compression=compression, mode="w" + path_or_buf, _, _, should_close = get_filepath_or_buffer( + path_or_buf, + compression=compression, + mode="wt", + storage_options=storage_options, ) if lines and orient != "records": @@ -97,6 +101,8 @@ def to_json( return s else: path_or_buf.write(s) + if should_close: + path_or_buf.close() class Writer: @@ -365,6 +371,7 @@ def read_json( chunksize: Optional[int] = None, compression="infer", nrows: Optional[int] = None, + storage_options: StorageOptions = None, ): """ Convert a JSON string to pandas object. @@ -510,6 +517,16 @@ def read_json( .. versionadded:: 1.1 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. 
See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + Returns ------- Series or DataFrame @@ -592,7 +609,10 @@ def read_json( compression = infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression + path_or_buf, + encoding=encoding, + compression=compression, + storage_options=storage_options, ) json_reader = JsonReader( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 8c4b63767ac06..7f0eef039a1e8 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -3,7 +3,7 @@ from typing import Any, AnyStr, Dict, List, Optional from warnings import catch_warnings -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.errors import AbstractMethodError @@ -89,6 +89,7 @@ def write( path: FilePathOrBuffer[AnyStr], compression: Optional[str] = "snappy", index: Optional[bool] = None, + storage_options: StorageOptions = None, partition_cols: Optional[List[str]] = None, **kwargs, ): @@ -105,9 +106,13 @@ def write( import_optional_dependency("fsspec") import fsspec.core - fs, path = fsspec.core.url_to_fs(path) + fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) kwargs["filesystem"] = fs else: + if storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) path = _expand_user(path) if partition_cols is not None: # writes to multiple files under the given path @@ -122,14 +127,20 @@ def write( # write to single output file self.api.parquet.write_table(table, path, compression=compression, **kwargs) - def read(self, path, columns=None, **kwargs): + def read( + self, path, columns=None, storage_options: StorageOptions = None, **kwargs, + ): if is_fsspec_url(path) and "filesystem" not in 
kwargs: import_optional_dependency("fsspec") import fsspec.core - fs, path = fsspec.core.url_to_fs(path) + fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) should_close = False else: + if storage_options: + raise ValueError( + "storage_options passed with buffer or non-fsspec filepath" + ) fs = kwargs.pop("filesystem", None) should_close = False path = _expand_user(path) @@ -163,6 +174,7 @@ def write( compression="snappy", index=None, partition_cols=None, + storage_options: StorageOptions = None, **kwargs, ): self.validate_dataframe(df) @@ -185,8 +197,14 @@ def write( fsspec = import_optional_dependency("fsspec") # if filesystem is provided by fsspec, file must be opened in 'wb' mode. - kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open() + kwargs["open_with"] = lambda path, _: fsspec.open( + path, "wb", **(storage_options or {}) + ).open() else: + if storage_options: + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) path, _, _, _ = get_filepath_or_buffer(path) with catch_warnings(record=True): @@ -199,11 +217,15 @@ def write( **kwargs, ) - def read(self, path, columns=None, **kwargs): + def read( + self, path, columns=None, storage_options: StorageOptions = None, **kwargs, + ): if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") - open_with = lambda path, _: fsspec.open(path, "rb").open() + open_with = lambda path, _: fsspec.open( + path, "rb", **(storage_options or {}) + ).open() parquet_file = self.api.ParquetFile(path, open_with=open_with) else: path, _, _, _ = get_filepath_or_buffer(path) @@ -218,6 +240,7 @@ def to_parquet( engine: str = "auto", compression: Optional[str] = "snappy", index: Optional[bool] = None, + storage_options: StorageOptions = None, partition_cols: Optional[List[str]] = None, **kwargs, ): @@ -261,6 +284,16 @@ def to_parquet( .. 
versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + kwargs Additional keyword arguments passed to the engine """ @@ -273,6 +306,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, + storage_options=storage_options, **kwargs, ) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d4f346f8c1087..9dc0e1f71d13b 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -420,6 +420,7 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" encoding = kwds.get("encoding", None) + storage_options = kwds.get("storage_options", None) if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding @@ -432,7 +433,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # though mypy handling of conditional imports is difficult. 
# See https://github.com/python/mypy/issues/1297 fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression + filepath_or_buffer, encoding, compression, storage_options=storage_options ) kwds["compression"] = compression @@ -595,6 +596,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, + storage_options=None, ): # gh-23761 # @@ -681,6 +683,7 @@ def read_csv( mangle_dupe_cols=mangle_dupe_cols, infer_datetime_format=infer_datetime_format, skip_blank_lines=skip_blank_lines, + storage_options=storage_options, ) return _read(filepath_or_buffer, kwds) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 3b35b54a6dc16..549d55e65546d 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -3,7 +3,7 @@ from typing import Any, Optional import warnings -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat import pickle_compat as pc from pandas.io.common import get_filepath_or_buffer, get_handle @@ -14,6 +14,7 @@ def to_pickle( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: StorageOptions = None, ): """ Pickle (serialize) object to file. @@ -42,6 +43,16 @@ def to_pickle( protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 + .. 
[1] https://docs.python.org/3/library/pickle.html See Also @@ -76,7 +87,10 @@ def to_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression, mode="wb" + filepath_or_buffer, + compression=compression, + mode="wb", + storage_options=storage_options, ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None @@ -97,7 +111,9 @@ def to_pickle( def read_pickle( - filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer" + filepath_or_buffer: FilePathOrBuffer, + compression: Optional[str] = "infer", + storage_options: StorageOptions = None, ): """ Load pickled pandas object (or any object) from file. @@ -121,6 +137,16 @@ def read_pickle( compression) If 'infer' and 'path_or_url' is not path-like, then use None (= no decompression). + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.2.0 + Returns ------- unpickled : same type as object stored in file @@ -162,7 +188,7 @@ def read_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression + filepath_or_buffer, compression=compression, storage_options=storage_options ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 7fc1bc6d3eb6c..6cf248b748107 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -244,7 +244,7 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None, ): self._encoding = encoding diff --git a/pandas/io/stata.py b/pandas/io/stata.py index cb23b781a7ad2..7a25617885839 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -11,7 +11,7 @@ """ from collections import abc import datetime -from io import BytesIO, IOBase +from io import BytesIO import os from pathlib import Path import struct @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import FilePathOrBuffer, Label +from pandas._typing import FilePathOrBuffer, Label, StorageOptions from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -1035,6 +1035,7 @@ def __init__( columns: Optional[Sequence[str]] = None, order_categoricals: bool = True, chunksize: Optional[int] = None, + storage_options: StorageOptions = None, ): super().__init__() self.col_sizes: List[int] = [] @@ -1068,13 +1069,16 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close = 
get_filepath_or_buffer(path_or_buf) + path_or_buf, encoding, _, should_close = get_filepath_or_buffer( + path_or_buf, storage_options=storage_options + ) if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") - elif isinstance(path_or_buf, IOBase): + elif hasattr(path_or_buf, "read"): # Copy to BytesIO, and ensure no encoding - contents = path_or_buf.read() + pb: Any = path_or_buf + contents = pb.read() self.path_or_buf = BytesIO(contents) self._read_header() @@ -1906,6 +1910,7 @@ def read_stata( order_categoricals: bool = True, chunksize: Optional[int] = None, iterator: bool = False, + storage_options: StorageOptions = None, ) -> Union[DataFrame, StataReader]: reader = StataReader( @@ -1918,6 +1923,7 @@ def read_stata( columns=columns, order_categoricals=order_categoricals, chunksize=chunksize, + storage_options=storage_options, ) if iterator or chunksize: @@ -1931,7 +1937,9 @@ def read_stata( def _open_file_binary_write( - fname: FilePathOrBuffer, compression: Union[str, Mapping[str, str], None], + fname: FilePathOrBuffer, + compression: Union[str, Mapping[str, str], None], + storage_options: StorageOptions = None, ) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]: """ Open a binary file or no-op if file-like. @@ -1943,6 +1951,16 @@ def _open_file_binary_write( compression : {str, dict, None} The compression method to use. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.2.0 + Returns ------- file : file-like object @@ -1961,7 +1979,10 @@ def _open_file_binary_write( compression_typ, compression_args = get_compression_method(compression) compression_typ = infer_compression(fname, compression_typ) path_or_buf, _, compression_typ, _ = get_filepath_or_buffer( - fname, compression=compression_typ + fname, + mode="wb", + compression=compression_typ, + storage_options=storage_options, ) if compression_typ is not None: compression = compression_args @@ -2158,6 +2179,16 @@ class StataWriter(StataParser): .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.2.0 + Returns ------- writer : StataWriter instance @@ -2207,6 +2238,7 @@ def __init__( data_label: Optional[str] = None, variable_labels: Optional[Dict[Label, str]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: StorageOptions = None, ): super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates @@ -2219,6 +2251,7 @@ def __init__( self._output_file: Optional[BinaryIO] = None # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) + self.storage_options = storage_options if byteorder is None: byteorder = sys.byteorder @@ -2505,7 +2538,7 @@ def _encode_strings(self) -> None: def write_file(self) -> None: self._file, self._own_file, compression = _open_file_binary_write( - self._fname, self._compression + self._fname, self._compression, storage_options=self.storage_options ) if compression is not None: self._output_file = self._file @@ -3088,6 +3121,7 @@ def __init__( variable_labels: Optional[Dict[Label, str]] = None, convert_strl: Optional[Sequence[Label]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: StorageOptions = None, ): # Copy to new list since convert_strl might be modified later self._convert_strl: List[Label] = [] @@ -3104,6 +3138,7 @@ def __init__( data_label=data_label, variable_labels=variable_labels, compression=compression, + storage_options=storage_options, ) self._map: Dict[str, int] = {} self._strl_blob = b"" @@ -3491,6 +3526,7 @@ def __init__( convert_strl: Optional[Sequence[Label]] = None, version: Optional[int] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: StorageOptions = None, ): if version is None: version = 118 if data.shape[1] <= 32767 else 119 @@ -3513,6 +3549,7 @@ def __init__( variable_labels=variable_labels, convert_strl=convert_strl, compression=compression, + storage_options=storage_options, ) # Override version set in StataWriter117 init 
self._dta_version = version diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index a0723452ccb70..3e89f6ca4ae16 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -1,7 +1,18 @@ +import io + import numpy as np import pytest -from pandas import DataFrame, date_range, read_csv, read_parquet +from pandas import ( + DataFrame, + date_range, + read_csv, + read_feather, + read_json, + read_parquet, + read_pickle, + read_stata, +) import pandas._testing as tm from pandas.util import _test_decorators as td @@ -63,6 +74,16 @@ def test_to_csv(cleared_fs): tm.assert_frame_equal(df1, df2) +def test_csv_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_csv( + "testmem://test/test.csv", storage_options={"test": "csv_write"}, index=False + ) + assert fsspectest.test[0] == "csv_write" + read_csv("testmem://test/test.csv", storage_options={"test": "csv_read"}) + assert fsspectest.test[0] == "csv_read" + + @td.skip_if_no("fastparquet") def test_to_parquet_new_file(monkeypatch, cleared_fs): """Regression test for writing to a not-yet-existent GCS Parquet file.""" @@ -71,6 +92,44 @@ def test_to_parquet_new_file(monkeypatch, cleared_fs): ) +@td.skip_if_no("pyarrow") +def test_arrowparquet_options(fsspectest): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + df = DataFrame({"a": [0]}) + df.to_parquet( + "testmem://test/test.csv", + engine="pyarrow", + compression=None, + storage_options={"test": "parquet_write"}, + ) + assert fsspectest.test[0] == "parquet_write" + read_parquet( + "testmem://test/test.csv", + engine="pyarrow", + storage_options={"test": "parquet_read"}, + ) + assert fsspectest.test[0] == "parquet_read" + + +@td.skip_if_no("fastparquet") +def test_fastparquet_options(fsspectest): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + df = DataFrame({"a": [0]}) + df.to_parquet( + "testmem://test/test.csv", + engine="fastparquet", + compression=None, + 
storage_options={"test": "parquet_write"}, + ) + assert fsspectest.test[0] == "parquet_write" + read_parquet( + "testmem://test/test.csv", + engine="fastparquet", + storage_options={"test": "parquet_read"}, + ) + assert fsspectest.test[0] == "parquet_read" + + @td.skip_if_no("s3fs") def test_from_s3_csv(s3_resource, tips_file): tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) @@ -101,3 +160,67 @@ def test_not_present_exception(): with pytest.raises(ImportError) as e: read_csv("memory://test/test.csv") assert "fsspec library is required" in str(e.value) + + +@td.skip_if_no("pyarrow") +def test_feather_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_feather("testmem://afile", storage_options={"test": "feather_write"}) + assert fsspectest.test[0] == "feather_write" + out = read_feather("testmem://afile", storage_options={"test": "feather_read"}) + assert fsspectest.test[0] == "feather_read" + tm.assert_frame_equal(df, out) + + +def test_pickle_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_pickle("testmem://afile", storage_options={"test": "pickle_write"}) + assert fsspectest.test[0] == "pickle_write" + out = read_pickle("testmem://afile", storage_options={"test": "pickle_read"}) + assert fsspectest.test[0] == "pickle_read" + tm.assert_frame_equal(df, out) + + +def test_json_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_json("testmem://afile", storage_options={"test": "json_write"}) + assert fsspectest.test[0] == "json_write" + out = read_json("testmem://afile", storage_options={"test": "json_read"}) + assert fsspectest.test[0] == "json_read" + tm.assert_frame_equal(df, out) + + +def test_stata_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_stata( + "testmem://afile", storage_options={"test": "stata_write"}, write_index=False + ) + assert fsspectest.test[0] == "stata_write" + out = read_stata("testmem://afile", storage_options={"test": "stata_read"}) + assert fsspectest.test[0] == "stata_read" + 
tm.assert_frame_equal(df, out.astype("int64")) + + +@td.skip_if_no("tabulate") +def test_markdown_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_markdown("testmem://afile", storage_options={"test": "md_write"}) + assert fsspectest.test[0] == "md_write" + assert fsspectest.cat("afile") + + +@td.skip_if_no("pyarrow") +def test_non_fsspec_options(): + with pytest.raises(ValueError, match="storage_options"): + read_csv("localfile", storage_options={"a": True}) + with pytest.raises(ValueError, match="storage_options"): + # separate test for parquet, which has a different code path + read_parquet("localfile", storage_options={"a": True}) + by = io.BytesIO() + + with pytest.raises(ValueError, match="storage_options"): + read_csv(by, storage_options={"a": True}) + + df = DataFrame({"a": [0]}) + with pytest.raises(ValueError, match="storage_options"): + df.to_parquet("nonfsspecpath", storage_options={"a": True}) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 5e0f7edf4d8ae..a137e76b1696b 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -32,7 +32,7 @@ def test_read_without_creds_from_pub_bucket(): @tm.network @td.skip_if_no("s3fs") -def test_read_with_creds_from_pub_bucke(): +def test_read_with_creds_from_pub_bucket(): # Ensure we can read from a public bucket with credentials # GH 34626 # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt From 3b541d8a0af3d9518ef0f2cb9c680ebe9a152904 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 10 Aug 2020 23:52:08 +0100 Subject: [PATCH 0477/1025] REF/PERF: Move MultiIndex._tuples to MultiIndex._cache (#35641) --- pandas/core/indexes/multi.py | 20 ++++++---------- pandas/io/pytables.py | 8 +++---- pandas/tests/indexes/multi/test_compat.py | 29 +++++++++++++++++------ 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 13927dede5542..448c2dfe4a29d 100644 --- 
a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -243,7 +243,6 @@ class MultiIndex(Index): _comparables = ["names"] rename = Index.set_names - _tuples = None sortorder: Optional[int] # -------------------------------------------------------------------- @@ -634,16 +633,9 @@ def from_frame(cls, df, sortorder=None, names=None): # -------------------------------------------------------------------- - @property + @cache_readonly def _values(self): # We override here, since our parent uses _data, which we don't use. - return self.values - - @property - def values(self): - if self._tuples is not None: - return self._tuples - values = [] for i in range(self.nlevels): @@ -657,8 +649,12 @@ def values(self): vals = np.array(vals, copy=False) values.append(vals) - self._tuples = lib.fast_zip(values) - return self._tuples + arr = lib.fast_zip(values) + return arr + + @property + def values(self): + return self._values @property def array(self): @@ -737,7 +733,6 @@ def _set_levels( if any(names): self._set_names(names) - self._tuples = None self._reset_cache() def set_levels(self, levels, level=None, inplace=None, verify_integrity=True): @@ -906,7 +901,6 @@ def _set_codes( self._codes = new_codes - self._tuples = None self._reset_cache() def set_codes(self, codes, level=None, inplace=None, verify_integrity=True): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index aeb7b3e044794..2abc570a04de3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -320,6 +320,10 @@ def read_hdf( mode : {'r', 'r+', 'a'}, default 'r' Mode to use when opening the file. Ignored if path_or_buf is a :class:`pandas.HDFStore`. Default is 'r'. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. where : list, optional A list of Term (or convertible) objects. start : int, optional @@ -332,10 +336,6 @@ def read_hdf( Return an iterator object. 
chunksize : int, optional Number of rows to include in an iteration when using an iterator. - errors : str, default 'strict' - Specifies how encoding and decoding errors are to be handled. - See the errors argument for :func:`open` for a full list - of options. **kwargs Additional keyword arguments passed to HDFStore. diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py index b2500efef9e03..72b5ed0edaa78 100644 --- a/pandas/tests/indexes/multi/test_compat.py +++ b/pandas/tests/indexes/multi/test_compat.py @@ -68,24 +68,33 @@ def test_inplace_mutation_resets_values(): mi1 = MultiIndex(levels=levels, codes=codes) mi2 = MultiIndex(levels=levels2, codes=codes) + + # instantiating MultiIndex should not access/cache _.values + assert "_values" not in mi1._cache + assert "_values" not in mi2._cache + vals = mi1.values.copy() vals2 = mi2.values.copy() - assert mi1._tuples is not None + # accessing .values should cache ._values + assert mi1._values is mi1._cache["_values"] + assert mi1.values is mi1._cache["_values"] + assert isinstance(mi1._cache["_values"], np.ndarray) # Make sure level setting works new_vals = mi1.set_levels(levels2).values tm.assert_almost_equal(vals2, new_vals) - # Non-inplace doesn't kill _tuples [implementation detail] - tm.assert_almost_equal(mi1._tuples, vals) + # Non-inplace doesn't drop _values from _cache [implementation detail] + tm.assert_almost_equal(mi1._cache["_values"], vals) # ...and values is still same too tm.assert_almost_equal(mi1.values, vals) - # Inplace should kill _tuples + # Inplace should drop _values from _cache with tm.assert_produces_warning(FutureWarning): mi1.set_levels(levels2, inplace=True) + assert "_values" not in mi1._cache tm.assert_almost_equal(mi1.values, vals2) # Make sure label setting works too @@ -95,18 +104,24 @@ def test_inplace_mutation_resets_values(): # Must be 1d array of tuples assert exp_values.shape == (6,) - new_values = mi2.set_codes(codes2).values + + 
new_mi = mi2.set_codes(codes2) + assert "_values" not in new_mi._cache + new_values = new_mi.values + assert "_values" in new_mi._cache # Not inplace shouldn't change - tm.assert_almost_equal(mi2._tuples, vals2) + tm.assert_almost_equal(mi2._cache["_values"], vals2) # Should have correct values tm.assert_almost_equal(exp_values, new_values) - # ...and again setting inplace should kill _tuples, etc + # ...and again setting inplace should drop _values from _cache, etc with tm.assert_produces_warning(FutureWarning): mi2.set_codes(codes2, inplace=True) + assert "_values" not in mi2._cache tm.assert_almost_equal(mi2.values, new_values) + assert "_values" in mi2._cache def test_ndarray_compat_properties(idx, compat_props): From 3a6958ae163fc68e7a8bfb88cc5961655639ee52 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 10 Aug 2020 18:02:11 -0500 Subject: [PATCH 0478/1025] Doc notes for core team members (#35608) --- doc/source/development/maintaining.rst | 42 ++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index 9f9e9dc2631f3..cd084ab263477 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -132,17 +132,24 @@ respond or self-close their issue if it's determined that the behavior is not a or the feature is out of scope. Sometimes reporters just go away though, and we'll close the issue after the conversation has died. +.. _maintaining.reviewing: + Reviewing pull requests ----------------------- Anybody can review a pull request: regular contributors, triagers, or core-team -members. Here are some guidelines to check. +members. But only core-team members can merge pull requets when they're ready. + +Here are some things to check when reviewing a pull request. -* Tests should be in a sensible location. +* Tests should be in a sensible location: in the same file as closely related tests. 
* New public APIs should be included somewhere in ``doc/source/reference/``. * New / changed API should use the ``versionadded`` or ``versionchanged`` directives in the docstring. * User-facing changes should have a whatsnew in the appropriate file. * Regression tests should reference the original GitHub issue number like ``# GH-1234``. +* The pull request should be labeled and assigned the appropriate milestone (the next patch release + for regression fixes and small bug fixes, the next minor milestone otherwise) +* Changes should comply with our :ref:`policies.version`. Cleaning up old issues ---------------------- @@ -189,5 +196,34 @@ being helpful on the issue tracker. The current list of core-team members is at https://github.com/pandas-dev/pandas-governance/blob/master/people.md + +.. _maintaining.merging: + +Merging pull requests +--------------------- + +Only core team members can merge pull requests. We have a few guidelines. + +1. You should typically not self-merge your own pull requests. Exceptions include + things like small changes to fix CI (e.g. pinning a package version). +2. You should not merge pull requests that have an active discussion, or pull + requests that has any ``-1`` votes from a core maintainer. Pandas operates + by consensus. +3. For larger changes, it's good to have a +1 from at least two core team members. + +In addition to the items listed in :ref:`maintaining.closing`, you should verify +that the pull request is assigned the correct milestone. + +Pull requests merged with a patch-release milestone will typically be backported +by our bot. Verify that the bot noticed the merge (it will leave a comment within +a minute typically). If a manual backport is needed please do that, and remove +the "Needs backport" label once you've done it manually. If you forget to assign +a milestone before tagging, you can request the bot to backport it with: + +.. code-block:: console + + @Meeseeksdev backport + + .. 
_governance documents: https://github.com/pandas-dev/pandas-governance -.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization \ No newline at end of file +.. _list of permissions: https://help.github.com/en/github/setting-up-and-managing-organizations-and-teams/repository-permission-levels-for-an-organization From 0588ce450875d93b6247fa67ce96c7028a1cf929 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 10 Aug 2020 17:01:57 -0700 Subject: [PATCH 0479/1025] BUG: DataFrame.apply with func altering row in-place (#35633) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/apply.py | 2 ++ pandas/tests/frame/apply/test_frame_apply.py | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index e5860644fa371..415f9e508feb8 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -21,6 +21,7 @@ Fixed regressions - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) +- Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6b8d7dc35fe95..6d44cf917a07a 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -389,6 +389,8 @@ def series_generator(self): blk = mgr.blocks[0] for (arr, name) in zip(values, self.index): + # GH#35462 re-pin mgr in case setitem changed it + ser._mgr = mgr blk.values = arr ser.name = name yield ser diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 3a32278e2a4b1..538978358c8e7 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1522,3 +1522,22 @@ def test_apply_dtype(self, col): expected = df.dtypes tm.assert_series_equal(result, expected) + + +def test_apply_mutating(): + # GH#35462 case where applied func pins a new BlockManager to a row + df = pd.DataFrame({"a": range(100), "b": range(100, 200)}) + + def func(row): + mgr = row._mgr + row.loc["a"] += 1 + assert row._mgr is not mgr + return row + + expected = df.copy() + expected["a"] += 1 + + result = df.apply(func, axis=1) + + tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, result) From 61ac1f7571962a804d859c503220b7a769031cbe Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 10 Aug 2020 17:35:38 -0700 Subject: [PATCH 0480/1025] REF: use consistent pattern in tslibs.vectorized (#35613) --- pandas/_libs/tslibs/vectorized.pyx | 149 +++++++++++++---------------- 1 file changed, 66 insertions(+), 83 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index b23f8255a76ac..c3c78ca54885a 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -275,44 +275,38 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t int64_t[:] deltas str typ Py_ssize_t[:] pos - int64_t delta, local_val - - if tz is None or is_utc(tz): - 
with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] - result[i] = normalize_i8_stamp(local_val) + int64_t local_val, delta = NPY_NAT + bint use_utc = False, use_tzlocal = False, use_fixed = False + + if is_utc(tz) or tz is None: + use_utc = True elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - result[i] = normalize_i8_stamp(local_val) + use_tzlocal = True else: - # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True delta = deltas[0] - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + delta - result[i] = normalize_i8_stamp(local_val) else: pos = trans.searchsorted(stamps, side="right") - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + deltas[pos[i]] - result[i] = normalize_i8_stamp(local_val) + + for i in range(n): + # TODO: reinstate nogil for use_utc case? 
+ if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta + else: + local_val = stamps[i] + deltas[pos[i]] + + result[i] = normalize_i8_stamp(local_val) return result.base # `.base` to access underlying ndarray @@ -339,40 +333,36 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None): ndarray[int64_t] trans int64_t[:] deltas intp_t[:] pos - int64_t local_val, delta + int64_t local_val, delta = NPY_NAT str typ int64_t day_nanos = 24 * 3600 * 1_000_000_000 + bint use_utc = False, use_tzlocal = False, use_fixed = False - if tz is None or is_utc(tz): - for i in range(n): - local_val = stamps[i] - if local_val % day_nanos != 0: - return False - + if is_utc(tz) or tz is None: + use_utc = True elif is_tzlocal(tz): - for i in range(n): - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - if local_val % day_nanos != 0: - return False + use_tzlocal = True else: trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True delta = deltas[0] - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + delta - if local_val % day_nanos != 0: - return False + else: + pos = trans.searchsorted(stamps, side="right") - 1 + for i in range(n): + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta else: - pos = trans.searchsorted(stamps) - 1 - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + deltas[pos[i]] - if local_val % day_nanos != 0: - return False + local_val = stamps[i] + deltas[pos[i]] + + if local_val % day_nanos != 0: + return False return True @@ -390,45 +380,38 @@ def dt64arr_to_periodarr(const 
int64_t[:] stamps, int freq, tzinfo tz): int64_t[:] deltas Py_ssize_t[:] pos npy_datetimestruct dts - int64_t local_val + int64_t local_val, delta = NPY_NAT + bint use_utc = False, use_tzlocal = False, use_fixed = False if is_utc(tz) or tz is None: - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i], &dts) - result[i] = get_period_ordinal(&dts, freq) - + use_utc = True elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - result[i] = get_period_ordinal(&dts, freq) + use_tzlocal = True else: - # Adjust datetime64 timestamp, recompute datetimestruct trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: # static/fixed; in this case we know that len(delta) == 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[0], &dts) - result[i] = get_period_ordinal(&dts, freq) + use_fixed = True + delta = deltas[0] else: pos = trans.searchsorted(stamps, side="right") - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - result[i] = get_period_ordinal(&dts, freq) + for i in range(n): + # TODO: reinstate nogil for use_utc case? 
+ if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + + if use_utc: + local_val = stamps[i] + elif use_tzlocal: + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + elif use_fixed: + local_val = stamps[i] + delta + else: + local_val = stamps[i] + deltas[pos[i]] + + dt64_to_dtstruct(local_val, &dts) + result[i] = get_period_ordinal(&dts, freq) return result.base # .base to get underlying ndarray From 07f43ab8d4b836af682666c6fc8143499c7ca054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ale=C5=A1=20Erjavec?= Date: Tue, 11 Aug 2020 02:36:26 +0200 Subject: [PATCH 0481/1025] [FIX] Handle decimal and thousand separator in 'round_trip' converer (#35377) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/src/parser/tokenizer.c | 57 +++++++++++- pandas/tests/io/parser/test_c_parser_only.py | 98 ++++++++++++++++++++ 3 files changed, 154 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 94bb265c32e4c..ebc8ebf0dc2f7 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -155,6 +155,7 @@ I/O ^^^ - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) +- In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) - Plotting diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index a195c0daf5271..df8ec68986ccb 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1778,20 +1778,73 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, return number; } +/* copy a decimal number string with `decimal`, `tsep` as decimal point + and thousands separator to an equivalent c-locale decimal string (striping + `tsep`, replacing `decimal` with '.'). The returned memory should be free-d + with a call to `free`. 
+*/ + +char* _str_copy_decimal_str_c(const char *s, char **endpos, char decimal, + char tsep) { + const char *p = s; + size_t length = strlen(s); + char *s_copy = malloc(length + 1); + char *dst = s_copy; + // Copy Leading sign + if (*p == '+' || *p == '-') { + *dst++ = *p++; + } + // Copy integer part dropping `tsep` + while (isdigit_ascii(*p)) { + *dst++ = *p++; + p += (tsep != '\0' && *p == tsep); + } + // Replace `decimal` with '.' + if (*p == decimal) { + *dst++ = '.'; + p++; + } + // Copy the remainder of the string as is. + strncpy(dst, p, length + 1 - (p - s)); + if (endpos != NULL) + *endpos = (char *)(s + length); + return s_copy; +} + + double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing, int *error, int *maybe_int) { + // 'normalize' representation to C-locale; replace decimal with '.' and + // remove t(housand)sep. + char *endptr; + char *pc = _str_copy_decimal_str_c(p, &endptr, decimal, tsep); // This is called from a nogil block in parsers.pyx // so need to explicitly get GIL before Python calls PyGILState_STATE gstate; gstate = PyGILState_Ensure(); - - double r = PyOS_string_to_double(p, q, 0); + char *endpc; + double r = PyOS_string_to_double(pc, &endpc, 0); + // PyOS_string_to_double needs to consume the whole string + if (endpc == pc + strlen(pc)) { + if (q != NULL) { + // report endptr from source string (p) + *q = (char *) endptr; + } + } else { + *error = -1; + if (q != NULL) { + // p and pc are different len due to tsep removal. Can't report + // how much it has consumed of p. Just rewind to beginning. 
+ *q = (char *)p; + } + } if (maybe_int != NULL) *maybe_int = 0; if (PyErr_Occurred() != NULL) *error = -1; else if (r == Py_HUGE_VAL) *error = (int)Py_HUGE_VAL; PyErr_Clear(); PyGILState_Release(gstate); + free(pc); return r; } diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index d76d01904731a..50179fc1ec4b8 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -606,3 +606,101 @@ def test_unix_style_breaks(c_parser_only): result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c") expected = DataFrame(columns=["col_1", "col_2", "col_3"]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) +@pytest.mark.parametrize( + "data,thousands,decimal", + [ + ( + """A|B|C +1|2,334.01|5 +10|13|10. +""", + ",", + ".", + ), + ( + """A|B|C +1|2.334,01|5 +10|13|10, +""", + ".", + ",", + ), + ], +) +def test_1000_sep_with_decimal( + c_parser_only, data, thousands, decimal, float_precision +): + parser = c_parser_only + expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]}) + + result = parser.read_csv( + StringIO(data), + sep="|", + thousands=thousands, + decimal=decimal, + float_precision=float_precision, + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "float_precision", [None, "high", "round_trip"], +) +@pytest.mark.parametrize( + "value,expected", + [ + ("-1,0", -1.0), + ("-1,2e0", -1.2), + ("-1e0", -1.0), + ("+1e0", 1.0), + ("+1e+0", 1.0), + ("+1e-1", 0.1), + ("+,1e1", 1.0), + ("+1,e0", 1.0), + ("-,1e1", -1.0), + ("-1,e0", -1.0), + ("0,1", 0.1), + ("1,", 1.0), + (",1", 0.1), + ("-,1", -0.1), + ("1_,", 1.0), + ("1_234,56", 1234.56), + ("1_234,56e0", 1234.56), + # negative cases; must not parse as float + ("_", "_"), + ("-_", "-_"), + ("-_1", "-_1"), + ("-_1e0", "-_1e0"), + ("_1", "_1"), + ("_1,", "_1,"), + ("_1,_", "_1,_"), + ("_1e0", 
"_1e0"), + ("1,2e_1", "1,2e_1"), + ("1,2e1_0", "1,2e1_0"), + ("1,_2", "1,_2"), + (",1__2", ",1__2"), + (",1e", ",1e"), + ("-,1e", "-,1e"), + ("1_000,000_000", "1_000,000_000"), + ("1,e1_2", "1,e1_2"), + ], +) +def test_1000_sep_decimal_float_precision( + c_parser_only, value, expected, float_precision +): + # test decimal and thousand sep handling in across 'float_precision' + # parsers + parser = c_parser_only + df = parser.read_csv( + StringIO(value), + sep="|", + thousands="_", + decimal=",", + header=None, + float_precision=float_precision, + ) + val = df.iloc[0, 0] + assert val == expected From f44b3190a36be9befce16683d789ea6574627425 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 10 Aug 2020 20:17:47 -0500 Subject: [PATCH 0482/1025] Drop Python 3.6 support (#35214) * DEPS: drop 3.6 (#34472) * DEPS: drop 3.6 (#34472) * DEPS: fix file name (#34472) * DEPS: fix import (#34472) * DEPS: fix job name (#34472) * DEPS: resolve min version conflicts (#34472) * DEPS: fix env name (#34472) * DEPS: remove py36 check in test, bump matplotlib (#34472) * DEPS: fix travis 37 locale (#34472) * DEPS: remove PY37 check from tests (#34472) * DEPS: remove import (#34472) * DEPS: remove PY37 in benchmark (#34472) * try to fix timeout * pytable version * update minimum version * remove xfail for test apply * remove import * try to fix timeout * try to fix timeout * try to fix timeout * bump to 3.7.1 to fix timeout * migrate ci * fix env name * remove py37-locale from azure * resolve conflicts * update ci * update ci * sync with master * whatsnew and install doc * whatsnew and install doc * update environment.yml * update environment.yml * uncomment azure p37 locale * move min pyarrow test * bumpy numpy to 1.16.5 * bumpy numpy to 1.16.5 * fix 32bit * comment out 32bit CI * update numpy version in numpy/__init__.py * remove import from numpy/__init__.py * filter DeprecationWarning * filter DeprecationWarning * skip unreliable test for windows * skip unreliable test for windows 
* fix parameter order in docstring * skip test * skip test --- .travis.yml | 4 +- asv_bench/benchmarks/package.py | 24 +- ci/azure/posix.yml | 52 ++--- ci/azure/windows.yml | 12 +- ...zure-36-32bit.yaml => azure-37-32bit.yaml} | 8 +- ci/deps/azure-37-locale.yaml | 8 +- ...le_slow.yaml => azure-37-locale_slow.yaml} | 18 +- ...ns.yaml => azure-37-minimum_versions.yaml} | 17 +- ...{azure-36-slow.yaml => azure-37-slow.yaml} | 2 +- ...re-36-locale.yaml => azure-38-locale.yaml} | 16 +- ...7-numpydev.yaml => azure-38-numpydev.yaml} | 2 +- ...zure-macos-36.yaml => azure-macos-37.yaml} | 6 +- ci/deps/azure-windows-37.yaml | 4 +- ...-windows-36.yaml => azure-windows-38.yaml} | 8 +- ...{travis-36-cov.yaml => travis-37-cov.yaml} | 6 +- ...s-36-locale.yaml => travis-37-locale.yaml} | 10 +- doc/source/getting_started/install.rst | 36 +-- doc/source/whatsnew/v1.2.0.rst | 78 +++++++ environment.yml | 6 +- pandas/__init__.py | 208 +++++------------- pandas/compat/__init__.py | 1 - pandas/compat/numpy/__init__.py | 9 +- pandas/core/frame.py | 5 +- pandas/tests/api/test_api.py | 25 +-- .../arrays/categorical/test_constructors.py | 3 - pandas/tests/extension/arrow/test_bool.py | 4 - pandas/tests/extension/test_numpy.py | 6 - pandas/tests/frame/test_api.py | 13 +- pandas/tests/frame/test_constructors.py | 5 +- pandas/tests/groupby/test_categorical.py | 9 - .../tests/io/json/test_json_table_schema.py | 6 + pandas/tests/io/json/test_pandas.py | 3 + pandas/tests/io/parser/test_common.py | 2 + pandas/tests/scalar/test_nat.py | 4 - pandas/tests/tseries/offsets/test_offsets.py | 11 +- pandas/util/__init__.py | 30 +-- pyproject.toml | 8 +- requirements-dev.txt | 6 +- setup.py | 9 +- 39 files changed, 283 insertions(+), 401 deletions(-) rename ci/deps/{azure-36-32bit.yaml => azure-37-32bit.yaml} (76%) rename ci/deps/{azure-36-locale_slow.yaml => azure-37-locale_slow.yaml} (67%) rename ci/deps/{azure-36-minimum_versions.yaml => azure-37-minimum_versions.yaml} (70%) rename 
ci/deps/{azure-36-slow.yaml => azure-37-slow.yaml} (96%) rename ci/deps/{azure-36-locale.yaml => azure-38-locale.yaml} (69%) rename ci/deps/{azure-37-numpydev.yaml => azure-38-numpydev.yaml} (96%) rename ci/deps/{azure-macos-36.yaml => azure-macos-37.yaml} (89%) rename ci/deps/{azure-windows-36.yaml => azure-windows-38.yaml} (84%) rename ci/deps/{travis-36-cov.yaml => travis-37-cov.yaml} (93%) rename ci/deps/{travis-36-locale.yaml => travis-37-locale.yaml} (85%) diff --git a/.travis.yml b/.travis.yml index b016cf386098e..2e98cf47aea3e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -45,7 +45,7 @@ matrix: - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" - env: - - JOB="3.6, locale" ENV_FILE="ci/deps/travis-36-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" + - JOB="3.7, locale" ENV_FILE="ci/deps/travis-37-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" services: - mysql - postgresql @@ -54,7 +54,7 @@ matrix: # Enabling Deprecations when running tests # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs # See pandas/_testing.py for more details. 
- - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" + - JOB="3.7, coverage" ENV_FILE="ci/deps/travis-37-cov.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" PANDAS_TESTING_MODE="deprecate" COVERAGE=true SQL="1" services: - mysql - postgresql diff --git a/asv_bench/benchmarks/package.py b/asv_bench/benchmarks/package.py index 8ca33db361fa0..34fe4929a752b 100644 --- a/asv_bench/benchmarks/package.py +++ b/asv_bench/benchmarks/package.py @@ -4,22 +4,16 @@ import subprocess import sys -from pandas.compat import PY37 - class TimeImport: def time_import(self): - if PY37: - # on py37+ we the "-X importtime" usage gives us a more precise - # measurement of the import time we actually care about, - # without the subprocess or interpreter overhead - cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"] - p = subprocess.run(cmd, stderr=subprocess.PIPE) - - line = p.stderr.splitlines()[-1] - field = line.split(b"|")[-2].strip() - total = int(field) # microseconds - return total + # on py37+ we the "-X importtime" usage gives us a more precise + # measurement of the import time we actually care about, + # without the subprocess or interpreter overhead + cmd = [sys.executable, "-X", "importtime", "-c", "import pandas as pd"] + p = subprocess.run(cmd, stderr=subprocess.PIPE) - cmd = [sys.executable, "-c", "import pandas as pd"] - subprocess.run(cmd, stderr=subprocess.PIPE) + line = p.stderr.splitlines()[-1] + field = line.split(b"|")[-2].strip() + total = int(field) # microseconds + return total diff --git a/ci/azure/posix.yml b/ci/azure/posix.yml index f716974f6add1..9f8174b4fa678 100644 --- a/ci/azure/posix.yml +++ b/ci/azure/posix.yml @@ -9,20 +9,20 @@ jobs: strategy: matrix: ${{ if eq(parameters.name, 'macOS') }}: - py36_macos: - ENV_FILE: ci/deps/azure-macos-36.yaml - CONDA_PY: "36" + 
py37_macos: + ENV_FILE: ci/deps/azure-macos-37.yaml + CONDA_PY: "37" PATTERN: "not slow and not network" ${{ if eq(parameters.name, 'Linux') }}: - py36_minimum_versions: - ENV_FILE: ci/deps/azure-36-minimum_versions.yaml - CONDA_PY: "36" + py37_minimum_versions: + ENV_FILE: ci/deps/azure-37-minimum_versions.yaml + CONDA_PY: "37" PATTERN: "not slow and not network and not clipboard" - py36_locale_slow_old_np: - ENV_FILE: ci/deps/azure-36-locale_slow.yaml - CONDA_PY: "36" + py37_locale_slow: + ENV_FILE: ci/deps/azure-37-locale_slow.yaml + CONDA_PY: "37" PATTERN: "slow" # pandas does not use the language (zh_CN), but should support different encodings (utf8) # we should test with encodings different than utf8, but doesn't seem like Ubuntu supports any @@ -30,36 +30,36 @@ jobs: LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans" - py36_slow: - ENV_FILE: ci/deps/azure-36-slow.yaml - CONDA_PY: "36" + py37_slow: + ENV_FILE: ci/deps/azure-37-slow.yaml + CONDA_PY: "37" PATTERN: "slow" - py36_locale: - ENV_FILE: ci/deps/azure-36-locale.yaml - CONDA_PY: "36" + py37_locale: + ENV_FILE: ci/deps/azure-37-locale.yaml + CONDA_PY: "37" PATTERN: "not slow and not network" LANG: "it_IT.utf8" LC_ALL: "it_IT.utf8" EXTRA_APT: "language-pack-it xsel" - #py36_32bit: - # ENV_FILE: ci/deps/azure-36-32bit.yaml - # CONDA_PY: "36" - # PATTERN: "not slow and not network and not clipboard" - # BITS32: "yes" +# py37_32bit: +# ENV_FILE: ci/deps/azure-37-32bit.yaml +# CONDA_PY: "37" +# PATTERN: "not slow and not network and not clipboard" +# BITS32: "yes" - py37_locale: - ENV_FILE: ci/deps/azure-37-locale.yaml - CONDA_PY: "37" + py38_locale: + ENV_FILE: ci/deps/azure-38-locale.yaml + CONDA_PY: "38" PATTERN: "not slow and not network" LANG: "zh_CN.utf8" LC_ALL: "zh_CN.utf8" EXTRA_APT: "language-pack-zh-hans xsel" - py37_np_dev: - ENV_FILE: ci/deps/azure-37-numpydev.yaml - CONDA_PY: "37" + py38_np_dev: + ENV_FILE: ci/deps/azure-38-numpydev.yaml + CONDA_PY: "38" PATTERN: "not slow and not network" 
TEST_ARGS: "-W error" PANDAS_TESTING_MODE: "deprecate" diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 87f1bfd2adb79..5938ba1fd69f5 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -8,16 +8,16 @@ jobs: vmImage: ${{ parameters.vmImage }} strategy: matrix: - py36_np15: - ENV_FILE: ci/deps/azure-windows-36.yaml - CONDA_PY: "36" - PATTERN: "not slow and not network" - - py37_np18: + py37_np16: ENV_FILE: ci/deps/azure-windows-37.yaml CONDA_PY: "37" PATTERN: "not slow and not network" + py38_np18: + ENV_FILE: ci/deps/azure-windows-38.yaml + CONDA_PY: "38" + PATTERN: "not slow and not network" + steps: - powershell: | Write-Host "##vso[task.prependpath]$env:CONDA\Scripts" diff --git a/ci/deps/azure-36-32bit.yaml b/ci/deps/azure-37-32bit.yaml similarity index 76% rename from ci/deps/azure-36-32bit.yaml rename to ci/deps/azure-37-32bit.yaml index 15704cf0d5427..8e0cd73a9536d 100644 --- a/ci/deps/azure-36-32bit.yaml +++ b/ci/deps/azure-37-32bit.yaml @@ -3,10 +3,10 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - ### Cython 0.29.13 and pytest 5.0.1 for 32 bits are not available with conda, installing below with pip instead + ### Cython 0.29.16 and pytest 5.0.1 for 32 bits are not available with conda, installing below with pip instead - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines @@ -15,12 +15,12 @@ dependencies: - attrs=19.1.0 - gcc_linux-32 - gxx_linux-32 - - numpy=1.14.* - python-dateutil - - pytz=2017.2 + - pytz=2017.3 # see comment above - pip - pip: - cython>=0.29.16 + - numpy>=1.16.5 - pytest>=5.0.1 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 6f64c81f299d1..a6552aa096a22 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -1,5 +1,6 @@ name: pandas-dev channels: + - defaults - conda-forge dependencies: - python=3.7.* @@ -22,7 +23,7 @@ dependencies: - moto - nomkl - numexpr - - numpy + - numpy=1.16.* - openpyxl - 
pytables - python-dateutil @@ -32,7 +33,4 @@ dependencies: - xlrd - xlsxwriter - xlwt - - pyarrow>=0.15 - - pip - - pip: - - pyxlsb + - moto diff --git a/ci/deps/azure-36-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml similarity index 67% rename from ci/deps/azure-36-locale_slow.yaml rename to ci/deps/azure-37-locale_slow.yaml index c086b3651afc3..3ccb66e09fe7e 100644 --- a/ci/deps/azure-36-locale_slow.yaml +++ b/ci/deps/azure-37-locale_slow.yaml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - cython>=0.29.16 @@ -16,17 +16,15 @@ dependencies: - beautifulsoup4=4.6.0 - bottleneck=1.2.* - lxml - - matplotlib=2.2.2 - - numpy=1.14.* + - matplotlib=3.0.0 + - numpy=1.16.* - openpyxl=2.5.7 - python-dateutil - python-blosc - - pytz=2017.2 + - pytz=2017.3 - scipy - - sqlalchemy=1.1.4 + - sqlalchemy=1.2.8 - xlrd=1.1.0 - - xlsxwriter=0.9.8 - - xlwt=1.2.0 - - pip - - pip: - - html5lib==1.0b2 + - xlsxwriter=1.0.2 + - xlwt=1.3.0 + - html5lib=1.0.1 diff --git a/ci/deps/azure-36-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml similarity index 70% rename from ci/deps/azure-36-minimum_versions.yaml rename to ci/deps/azure-37-minimum_versions.yaml index f5af7bcf36189..94cc5812bcc10 100644 --- a/ci/deps/azure-36-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.6.1 + - python=3.7.1 # tools - cython=0.29.16 @@ -15,16 +15,17 @@ dependencies: # pandas dependencies - beautifulsoup4=4.6.0 - bottleneck=1.2.1 - - jinja2=2.8 + - jinja2=2.10 - numba=0.46.0 - - numexpr=2.6.2 - - numpy=1.15.4 + - numexpr=2.6.8 + - numpy=1.16.5 - openpyxl=2.5.7 - - pytables=3.4.3 + - pytables=3.4.4 - python-dateutil=2.7.3 - - pytz=2017.2 + - pytz=2017.3 + - pyarrow=0.15 - scipy=1.2 - xlrd=1.1.0 - - xlsxwriter=0.9.8 - - xlwt=1.2.0 + - xlsxwriter=1.0.2 + - xlwt=1.3.0 - html5lib=1.0.1 diff --git a/ci/deps/azure-36-slow.yaml 
b/ci/deps/azure-37-slow.yaml similarity index 96% rename from ci/deps/azure-36-slow.yaml rename to ci/deps/azure-37-slow.yaml index 87bad59fa4873..e8ffd3d74ca5e 100644 --- a/ci/deps/azure-36-slow.yaml +++ b/ci/deps/azure-37-slow.yaml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - cython>=0.29.16 diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-38-locale.yaml similarity index 69% rename from ci/deps/azure-36-locale.yaml rename to ci/deps/azure-38-locale.yaml index 3034ed3dc43af..c466a5929ea29 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -1,9 +1,8 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.8.* # tools - cython>=0.29.16 @@ -19,14 +18,12 @@ dependencies: - ipython - jinja2 - lxml - - matplotlib=3.0.* + - matplotlib <3.3.0 + - moto - nomkl - numexpr - - numpy=1.15.* + - numpy - openpyxl - # lowest supported version of pyarrow (putting it here instead of in - # azure-36-minimum_versions because it needs numpy >= 1.14) - - pyarrow=0.13 - pytables - python-dateutil - pytz @@ -35,4 +32,7 @@ dependencies: - xlrd - xlsxwriter - xlwt - - moto + - pyarrow>=0.15 + - pip + - pip: + - pyxlsb diff --git a/ci/deps/azure-37-numpydev.yaml b/ci/deps/azure-38-numpydev.yaml similarity index 96% rename from ci/deps/azure-37-numpydev.yaml rename to ci/deps/azure-38-numpydev.yaml index 5cb58756a6ac1..37592086d49e3 100644 --- a/ci/deps/azure-37-numpydev.yaml +++ b/ci/deps/azure-38-numpydev.yaml @@ -2,7 +2,7 @@ name: pandas-dev channels: - defaults dependencies: - - python=3.7.* + - python=3.8.* # tools - pytest>=5.0.1 diff --git a/ci/deps/azure-macos-36.yaml b/ci/deps/azure-macos-37.yaml similarity index 89% rename from ci/deps/azure-macos-36.yaml rename to ci/deps/azure-macos-37.yaml index eeea249a19ca1..a5a69b9a59576 100644 --- a/ci/deps/azure-macos-36.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -2,7 +2,7 @@ name: pandas-dev 
channels: - defaults dependencies: - - python=3.6.* + - python=3.7.* # tools - pytest>=5.0.1 @@ -19,9 +19,9 @@ dependencies: - matplotlib=2.2.3 - nomkl - numexpr - - numpy=1.15.4 + - numpy=1.16.5 - openpyxl - - pyarrow>=0.13.0 + - pyarrow>=0.15.0 - pytables - python-dateutil==2.7.3 - pytz diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 5bbd0e2795d7e..4d745454afcab 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -23,9 +23,9 @@ dependencies: - matplotlib=2.2.* - moto - numexpr - - numpy=1.18.* + - numpy=1.16.* - openpyxl - - pyarrow=0.14 + - pyarrow=0.15 - pytables - python-dateutil - pytz diff --git a/ci/deps/azure-windows-36.yaml b/ci/deps/azure-windows-38.yaml similarity index 84% rename from ci/deps/azure-windows-36.yaml rename to ci/deps/azure-windows-38.yaml index 548660cabaa67..f428a6dadfaa2 100644 --- a/ci/deps/azure-windows-36.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -3,7 +3,7 @@ channels: - conda-forge - defaults dependencies: - - python=3.6.* + - python=3.8.* # tools - cython>=0.29.16 @@ -16,13 +16,13 @@ dependencies: - blosc - bottleneck - fastparquet>=0.3.2 - - matplotlib=3.0.2 + - matplotlib=3.1.3 - numba - numexpr - - numpy=1.15.* + - numpy=1.18.* - openpyxl - jinja2 - - pyarrow>=0.13.0 + - pyarrow>=0.15.0 - pytables - python-dateutil - pytz diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-37-cov.yaml similarity index 93% rename from ci/deps/travis-36-cov.yaml rename to ci/deps/travis-37-cov.yaml index 177e0d3f4c0af..3a0827a16f97a 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - cython>=0.29.16 @@ -26,12 +26,12 @@ dependencies: - moto - nomkl - numexpr - - numpy=1.15.* + - numpy=1.16.* - odfpy - openpyxl - pandas-gbq - psycopg2 - - pyarrow>=0.13.0 + - pyarrow>=0.15.0 - pymysql - pytables - python-snappy diff --git a/ci/deps/travis-36-locale.yaml 
b/ci/deps/travis-37-locale.yaml similarity index 85% rename from ci/deps/travis-36-locale.yaml rename to ci/deps/travis-37-locale.yaml index 03a1e751b6a86..4427c1d940bf2 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -3,7 +3,7 @@ channels: - defaults - conda-forge dependencies: - - python=3.6.* + - python=3.7.* # tools - cython>=0.29.16 @@ -19,7 +19,7 @@ dependencies: - html5lib - ipython - jinja2 - - lxml=3.8.0 + - lxml=4.3.0 - matplotlib=3.0.* - moto - nomkl @@ -27,14 +27,14 @@ dependencies: - numpy - openpyxl - pandas-gbq=0.12.0 - - psycopg2=2.6.2 + - psycopg2=2.7 - pymysql=0.7.11 - pytables - python-dateutil - pytz - scipy - - sqlalchemy=1.1.4 - - xarray=0.10 + - sqlalchemy=1.3.0 + - xarray=0.12.0 - xlrd - xlsxwriter - xlwt diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b79a9cd872c47..7ab150394bf51 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -18,7 +18,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.6.1 and above, 3.7, and 3.8. +Officially Python 3.7.1 and above, and 3.8. Installing pandas ----------------- @@ -220,9 +220,9 @@ Dependencies Package Minimum supported version ================================================================ ========================== `setuptools `__ 24.2.0 -`NumPy `__ 1.15.4 +`NumPy `__ 1.16.5 `python-dateutil `__ 2.7.3 -`pytz `__ 2017.2 +`pytz `__ 2017.3 ================================================================ ========================== .. _install.recommended_dependencies: @@ -232,7 +232,7 @@ Recommended dependencies * `numexpr `__: for accelerating certain numerical operations. ``numexpr`` uses multiple cores as well as smart chunking and caching to achieve large speedups. - If installed, must be Version 2.6.2 or higher. + If installed, must be Version 2.6.8 or higher. 
* `bottleneck `__: for accelerating certain types of ``nan`` evaluations. ``bottleneck`` uses specialized cython routines to achieve large speedups. If installed, @@ -259,36 +259,36 @@ the method requiring that dependency is called. Dependency Minimum Version Notes ========================= ================== ============================================================= BeautifulSoup4 4.6.0 HTML parser for read_html (see :ref:`note `) -Jinja2 Conditional formatting with DataFrame.style +Jinja2 2.10 Conditional formatting with DataFrame.style PyQt4 Clipboard I/O PyQt5 Clipboard I/O -PyTables 3.4.3 HDF5-based reading / writing -SQLAlchemy 1.1.4 SQL support for databases other than sqlite -SciPy 0.19.0 Miscellaneous statistical functions -XLsxWriter 0.9.8 Excel writing -blosc Compression for HDF5 +PyTables 3.4.4 HDF5-based reading / writing +SQLAlchemy 1.2.8 SQL support for databases other than sqlite +SciPy 1.12.0 Miscellaneous statistical functions +xlsxwriter 1.0.2 Excel writing +blosc 1.14.3 Compression for HDF5 fsspec 0.7.4 Handling files aside from local and HTTP fastparquet 0.3.2 Parquet reading / writing gcsfs 0.6.0 Google Cloud Storage access -html5lib HTML parser for read_html (see :ref:`note `) -lxml 3.8.0 HTML parser for read_html (see :ref:`note `) -matplotlib 2.2.2 Visualization +html5lib 1.0.1 HTML parser for read_html (see :ref:`note `) +lxml 4.3.0 HTML parser for read_html (see :ref:`note `) +matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations openpyxl 2.5.7 Reading / writing for xlsx files pandas-gbq 0.12.0 Google Big Query access -psycopg2 PostgreSQL engine for sqlalchemy -pyarrow 0.12.0 Parquet, ORC (requires 0.13.0), and feather reading / writing +psycopg2 2.7 PostgreSQL engine for sqlalchemy +pyarrow 0.15.0 Parquet, ORC, and feather reading / writing pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading -pytables 3.4.3 HDF5 reading / writing +pytables 3.4.4 HDF5 reading / 
writing pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O s3fs 0.4.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) -xarray 0.8.2 pandas-like API for N-dimensional data +xarray 0.12.0 pandas-like API for N-dimensional data xclip Clipboard I/O on linux xlrd 1.1.0 Excel reading -xlwt 1.2.0 Excel writing +xlwt 1.3.0 Excel writing xsel Clipboard I/O on linux zlib Compression for HDF5 ========================= ================== ============================================================= diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ebc8ebf0dc2f7..86f47a5826214 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -54,6 +54,84 @@ Other enhancements - - +.. _whatsnew_120.api_breaking.python: + +Increased minimum version for Python +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas 1.2.0 supports Python 3.7.1 and higher (:issue:`35214`). + +.. _whatsnew_120.api_breaking.deps: + +Increased minimum versions for dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Some minimum supported versions of dependencies were updated (:issue:`35214`). +If installed, we now require: + ++-----------------+-----------------+----------+---------+ +| Package | Minimum Version | Required | Changed | ++=================+=================+==========+=========+ +| numpy | 1.16.5 | X | X | ++-----------------+-----------------+----------+---------+ +| pytz | 2017.3 | X | X | ++-----------------+-----------------+----------+---------+ +| python-dateutil | 2.7.3 | X | | ++-----------------+-----------------+----------+---------+ +| bottleneck | 1.2.1 | | | ++-----------------+-----------------+----------+---------+ +| numexpr | 2.6.8 | | X | ++-----------------+-----------------+----------+---------+ +| pytest (dev) | 5.0.1 | | X | ++-----------------+-----------------+----------+---------+ + +For `optional libraries `_ the general recommendation is to use the latest version. 
+The following table lists the lowest version per library that is currently being tested throughout the development of pandas. +Optional libraries below the lowest tested version may still work, but are not considered supported. + ++-----------------+-----------------+---------+ +| Package | Minimum Version | Changed | ++=================+=================+=========+ +| beautifulsoup4 | 4.6.0 | | ++-----------------+-----------------+---------+ +| fastparquet | 0.3.2 | | ++-----------------+-----------------+---------+ +| fsspec | 0.7.4 | | ++-----------------+-----------------+---------+ +| gcsfs | 0.6.0 | | ++-----------------+-----------------+---------+ +| lxml | 4.3.0 | X | ++-----------------+-----------------+---------+ +| matplotlib | 2.2.3 | X | ++-----------------+-----------------+---------+ +| numba | 0.46.0 | | ++-----------------+-----------------+---------+ +| openpyxl | 2.5.7 | | ++-----------------+-----------------+---------+ +| pyarrow | 0.15.0 | X | ++-----------------+-----------------+---------+ +| pymysql | 0.7.11 | X | ++-----------------+-----------------+---------+ +| pytables | 3.4.4 | X | ++-----------------+-----------------+---------+ +| s3fs | 0.4.0 | | ++-----------------+-----------------+---------+ +| scipy | 1.2.0 | | ++-----------------+-----------------+---------+ +| sqlalchemy | 1.2.8 | X | ++-----------------+-----------------+---------+ +| xarray | 0.12.0 | X | ++-----------------+-----------------+---------+ +| xlrd | 1.1.0 | | ++-----------------+-----------------+---------+ +| xlsxwriter | 1.0.2 | X | ++-----------------+-----------------+---------+ +| xlwt | 1.3.0 | X | ++-----------------+-----------------+---------+ +| pandas-gbq | 0.12.0 | | ++-----------------+-----------------+---------+ + +See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more. .. 
--------------------------------------------------------------------------- diff --git a/environment.yml b/environment.yml index ed9762e5b8893..1e51470d43d36 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: # required # Pin numpy<1.19 until MPL 3.3.0 is released. - - numpy>=1.15,<1.19.0 + - numpy>=1.16.5,<1.19.0 - python=3 - python-dateutil>=2.7.3 - pytz @@ -93,11 +93,11 @@ dependencies: - odfpy - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - - pyarrow>=0.13.1 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather + - pyarrow>=0.15.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow - pyqt>=5.9.2 # pandas.read_clipboard - - pytables>=3.4.3 # pandas.read_hdf, DataFrame.to_hdf + - pytables>=3.4.4 # pandas.read_hdf, DataFrame.to_hdf - s3fs>=0.4.0 # file IO when using 's3://...' path - fsspec>=0.7.4 # for generic remote file operations - gcsfs>=0.6.0 # file IO when using 'gcs://...' path diff --git a/pandas/__init__.py b/pandas/__init__.py index d6584bf4f1c4f..36576da74c75d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -20,7 +20,6 @@ # numpy compat from pandas.compat.numpy import ( - _np_version_under1p16, _np_version_under1p17, _np_version_under1p18, _is_numpy_dev, @@ -185,181 +184,76 @@ __git_version__ = v.get("full-revisionid") del get_versions, v + # GH 27101 # TODO: remove Panel compat in 1.0 -if pandas.compat.PY37: - - def __getattr__(name): - import warnings - - if name == "Panel": - - warnings.warn( - "The Panel class is removed from pandas. Accessing it " - "from the top-level namespace will also be removed in the next version", - FutureWarning, - stacklevel=2, - ) - - class Panel: - pass - - return Panel - - elif name == "datetime": - warnings.warn( - "The pandas.datetime class is deprecated " - "and will be removed from pandas in a future version. 
" - "Import from datetime module instead.", - FutureWarning, - stacklevel=2, - ) - - from datetime import datetime as dt - - return dt - - elif name == "np": - - warnings.warn( - "The pandas.np module is deprecated " - "and will be removed from pandas in a future version. " - "Import numpy directly instead", - FutureWarning, - stacklevel=2, - ) - import numpy as np - - return np - - elif name in {"SparseSeries", "SparseDataFrame"}: - warnings.warn( - f"The {name} class is removed from pandas. Accessing it from " - "the top-level namespace will also be removed in the next version", - FutureWarning, - stacklevel=2, - ) - - return type(name, (), {}) - - elif name == "SparseArray": - - warnings.warn( - "The pandas.SparseArray class is deprecated " - "and will be removed from pandas in a future version. " - "Use pandas.arrays.SparseArray instead.", - FutureWarning, - stacklevel=2, - ) - from pandas.core.arrays.sparse import SparseArray as _SparseArray +def __getattr__(name): + import warnings - return _SparseArray + if name == "Panel": - raise AttributeError(f"module 'pandas' has no attribute '{name}'") + warnings.warn( + "The Panel class is removed from pandas. Accessing it " + "from the top-level namespace will also be removed in the next version", + FutureWarning, + stacklevel=2, + ) + class Panel: + pass -else: + return Panel - class Panel: - pass - - class SparseDataFrame: - pass - - class SparseSeries: - pass - - class __numpy: - def __init__(self): - import numpy as np - import warnings - - self.np = np - self.warnings = warnings - - def __getattr__(self, item): - self.warnings.warn( - "The pandas.np module is deprecated " - "and will be removed from pandas in a future version. 
" - "Import numpy directly instead", - FutureWarning, - stacklevel=2, - ) - - try: - return getattr(self.np, item) - except AttributeError as err: - raise AttributeError(f"module numpy has no attribute {item}") from err - - np = __numpy() - - class __Datetime(type): + elif name == "datetime": + warnings.warn( + "The pandas.datetime class is deprecated " + "and will be removed from pandas in a future version. " + "Import from datetime module instead.", + FutureWarning, + stacklevel=2, + ) from datetime import datetime as dt - datetime = dt - - def __getattr__(cls, item): - cls.emit_warning() - - try: - return getattr(cls.datetime, item) - except AttributeError as err: - raise AttributeError( - f"module datetime has no attribute {item}" - ) from err - - def __instancecheck__(cls, other): - return isinstance(other, cls.datetime) - - class __DatetimeSub(metaclass=__Datetime): - def emit_warning(dummy=0): - import warnings - - warnings.warn( - "The pandas.datetime class is deprecated " - "and will be removed from pandas in a future version. " - "Import from datetime instead.", - FutureWarning, - stacklevel=3, - ) - - def __new__(cls, *args, **kwargs): - cls.emit_warning() - from datetime import datetime as dt - - return dt(*args, **kwargs) - - datetime = __DatetimeSub + return dt - class __SparseArray(type): + elif name == "np": - from pandas.core.arrays.sparse import SparseArray as sa + warnings.warn( + "The pandas.np module is deprecated " + "and will be removed from pandas in a future version. " + "Import numpy directly instead", + FutureWarning, + stacklevel=2, + ) + import numpy as np - SparseArray = sa + return np - def __instancecheck__(cls, other): - return isinstance(other, cls.SparseArray) + elif name in {"SparseSeries", "SparseDataFrame"}: + warnings.warn( + f"The {name} class is removed from pandas. 
Accessing it from " + "the top-level namespace will also be removed in the next version", + FutureWarning, + stacklevel=2, + ) - class __SparseArraySub(metaclass=__SparseArray): - def emit_warning(dummy=0): - import warnings + return type(name, (), {}) - warnings.warn( - "The pandas.SparseArray class is deprecated " - "and will be removed from pandas in a future version. " - "Use pandas.arrays.SparseArray instead.", - FutureWarning, - stacklevel=3, - ) + elif name == "SparseArray": - def __new__(cls, *args, **kwargs): - cls.emit_warning() - from pandas.core.arrays.sparse import SparseArray as sa + warnings.warn( + "The pandas.SparseArray class is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.arrays.SparseArray instead.", + FutureWarning, + stacklevel=2, + ) + from pandas.core.arrays.sparse import SparseArray as _SparseArray - return sa(*args, **kwargs) + return _SparseArray - SparseArray = __SparseArraySub + raise AttributeError(f"module 'pandas' has no attribute '{name}'") # module level doc-string diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index b5a1dc2b2fb94..ab2835932c95d 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -14,7 +14,6 @@ from pandas._typing import F -PY37 = sys.version_info >= (3, 7) PY38 = sys.version_info >= (3, 8) PY39 = sys.version_info >= (3, 9) PYPY = platform.python_implementation() == "PyPy" diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 789a4668b6fee..08d06da93bb45 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -8,19 +8,19 @@ # numpy versioning _np_version = np.__version__ _nlv = LooseVersion(_np_version) -_np_version_under1p16 = _nlv < LooseVersion("1.16") _np_version_under1p17 = _nlv < LooseVersion("1.17") _np_version_under1p18 = _nlv < LooseVersion("1.18") _np_version_under1p19 = _nlv < LooseVersion("1.19") _np_version_under1p20 = _nlv < LooseVersion("1.20") _is_numpy_dev = 
".dev" in str(_nlv) +_min_numpy_ver = "1.16.5" -if _nlv < "1.15.4": +if _nlv < _min_numpy_ver: raise ImportError( - "this version of pandas is incompatible with numpy < 1.15.4\n" + f"this version of pandas is incompatible with numpy < {_min_numpy_ver}\n" f"your numpy version is {_np_version}.\n" - "Please upgrade numpy to >= 1.15.4 to use this pandas version" + f"Please upgrade numpy to >= {_min_numpy_ver} to use this pandas version" ) @@ -65,7 +65,6 @@ def np_array_datetime64_compat(arr, *args, **kwargs): __all__ = [ "np", "_np_version", - "_np_version_under1p16", "_np_version_under1p17", "_is_numpy_dev", ] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9d0751fcce460..547d86f221b5f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -58,7 +58,6 @@ StorageOptions, ValueKeyFunc, ) -from pandas.compat import PY37 from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import ( @@ -1088,9 +1087,7 @@ def itertuples(self, index=True, name="Pandas"): # use integer indexing because of possible duplicate column names arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) - # Python versions before 3.7 support at most 255 arguments to constructors - can_return_named_tuples = PY37 or len(self.columns) + index < 255 - if name is not None and can_return_named_tuples: + if name is not None: itertuple = collections.namedtuple(name, fields, rename=True) return map(itertuple._make, zip(*arrays)) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index caa348d3a1fb9..1d25336cd3b70 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -5,7 +5,7 @@ import pytest import pandas as pd -from pandas import api, compat +from pandas import api import pandas._testing as tm @@ -100,11 +100,6 @@ class TestPDApi(Base): # these should be deprecated in the future deprecated_classes_in_future: List[str] = ["SparseArray"] - if not 
compat.PY37: - classes.extend(["Panel", "SparseSeries", "SparseDataFrame"]) - # deprecated_modules.extend(["np", "datetime"]) - # deprecated_classes_in_future.extend(["SparseArray"]) - # external modules exposed in pandas namespace modules: List[str] = [] @@ -193,7 +188,6 @@ class TestPDApi(Base): "_hashtable", "_lib", "_libs", - "_np_version_under1p16", "_np_version_under1p17", "_np_version_under1p18", "_is_numpy_dev", @@ -217,14 +211,6 @@ def test_api(self): + self.funcs_to + self.private_modules ) - if not compat.PY37: - checkthese.extend( - self.deprecated_modules - + self.deprecated_classes - + self.deprecated_classes_in_future - + self.deprecated_funcs_in_future - + self.deprecated_funcs - ) self.check(pd, checkthese, self.ignored) def test_depr(self): @@ -237,14 +223,7 @@ def test_depr(self): ) for depr in deprecated_list: with tm.assert_produces_warning(FutureWarning): - deprecated = getattr(pd, depr) - if not compat.PY37: - if depr == "datetime": - deprecated.__getattr__(dir(pd.datetime.datetime)[-1]) - elif depr == "SparseArray": - deprecated([]) - else: - deprecated.__getattr__(dir(deprecated)[-1]) + _ = getattr(pd, depr) def test_datetime(): diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index ca942c9288898..89fbfbd5b8324 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p16 - from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype from pandas.core.dtypes.dtypes import CategoricalDtype @@ -637,7 +635,6 @@ def test_constructor_imaginary(self): tm.assert_index_equal(c1.categories, Index(values)) tm.assert_numpy_array_equal(np.array(c1), np.array(values)) - @pytest.mark.skipif(_np_version_under1p16, reason="Skipping for NumPy <1.16") def test_constructor_string_and_tuples(self): # GH 21416 c = 
pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object)) diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py index 7841360e568ed..12426a0c92c55 100644 --- a/pandas/tests/extension/arrow/test_bool.py +++ b/pandas/tests/extension/arrow/test_bool.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.compat import PY37 - import pandas as pd import pandas._testing as tm from pandas.tests.extension import base @@ -62,13 +60,11 @@ def test_from_dtype(self, data): def test_from_sequence_from_cls(self, data): super().test_from_sequence_from_cls(data) - @pytest.mark.skipif(not PY37, reason="timeout on Linux py36_locale") @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899") def test_series_constructor_no_data_with_index(self, dtype, na_value): # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays super().test_series_constructor_no_data_with_index(dtype, na_value) - @pytest.mark.skipif(not PY37, reason="timeout on Linux py36_locale") @pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899") def test_series_constructor_scalar_na_with_index(self, dtype, na_value): # pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 78000c0252375..b9219f9f833de 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -1,8 +1,6 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p16 - import pandas as pd import pandas._testing as tm from pandas.core.arrays.numpy_ import PandasArray, PandasDtype @@ -46,11 +44,7 @@ def data(allow_in_pandas, dtype): @pytest.fixture def data_missing(allow_in_pandas, dtype): - # For NumPy <1.16, np.array([np.nan, (1,)]) raises - # ValueError: setting an array element with a sequence. 
if dtype.numpy_dtype == "object": - if _np_version_under1p16: - raise pytest.skip("Skipping for NumPy <1.16") return PandasArray(np.array([np.nan, (1,)], dtype=object)) return PandasArray(np.array([np.nan, 1.0])) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index cc57a3970d18b..2fb1f7f911a9c 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -6,7 +6,6 @@ import numpy as np import pytest -from pandas.compat import PY37 from pandas.util._test_decorators import async_mark, skip_if_no import pandas as pd @@ -274,10 +273,7 @@ def test_itertuples(self, float_frame): # will raise SyntaxError if trying to create namedtuple tup3 = next(df3.itertuples()) assert isinstance(tup3, tuple) - if PY37: - assert hasattr(tup3, "_fields") - else: - assert not hasattr(tup3, "_fields") + assert hasattr(tup3, "_fields") # GH 28282 df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}]) @@ -288,12 +284,7 @@ def test_itertuples(self, float_frame): df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}]) result_255_columns = next(df_255_columns.itertuples(index=False)) assert isinstance(result_255_columns, tuple) - - # Dataframes with >=255 columns will fallback to regular tuples on python < 3.7 - if PY37: - assert hasattr(result_255_columns, "_fields") - else: - assert not hasattr(result_255_columns, "_fields") + assert hasattr(result_255_columns, "_fields") def test_sequence_like_with_categorical(self): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index d0f774344a33d..c8f5b2b0f6364 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -10,7 +10,7 @@ import pytest import pytz -from pandas.compat import PY37, is_platform_little_endian +from pandas.compat import is_platform_little_endian from pandas.compat.numpy import _np_version_under1p19 from pandas.core.dtypes.common import is_integer_dtype 
@@ -1418,7 +1418,6 @@ def test_constructor_list_of_namedtuples(self): result = DataFrame(tuples, columns=["y", "z"]) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY37, reason="Requires Python >= 3.7") def test_constructor_list_of_dataclasses(self): # GH21910 from dataclasses import make_dataclass @@ -1430,7 +1429,6 @@ def test_constructor_list_of_dataclasses(self): result = DataFrame(datas) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY37, reason="Requires Python >= 3.7") def test_constructor_list_of_dataclasses_with_varying_types(self): # GH21910 from dataclasses import make_dataclass @@ -1447,7 +1445,6 @@ def test_constructor_list_of_dataclasses_with_varying_types(self): result = DataFrame(datas) tm.assert_frame_equal(result, expected) - @pytest.mark.skipif(not PY37, reason="Requires Python >= 3.7") def test_constructor_list_of_dataclasses_error_thrown(self): # GH21910 from dataclasses import make_dataclass diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c74c1529eb537..13a32e285e70a 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import PY37, is_platform_windows - import pandas as pd from pandas import ( Categorical, @@ -13,7 +11,6 @@ Index, MultiIndex, Series, - _np_version_under1p17, qcut, ) import pandas._testing as tm @@ -244,12 +241,6 @@ def test_level_get_group(observed): tm.assert_frame_equal(result, expected) -# GH#21636 flaky on py37; may be related to older numpy, see discussion -# https://github.com/MacPython/pandas-wheels/pull/64 -@pytest.mark.xfail( - PY37 and _np_version_under1p17 and not is_platform_windows(), - reason="Flaky, GH-27902", -) @pytest.mark.parametrize("ordered", [True, False]) def test_apply(ordered): # GH 10138 diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py 
index 22b4ec189a0f1..8f1ed193b100f 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -256,6 +256,9 @@ def test_read_json_from_to_json_results(self): tm.assert_frame_equal(result1, df) tm.assert_frame_equal(result2, df) + @pytest.mark.filterwarnings( + "ignore:an integer is required (got type float)*:DeprecationWarning" + ) def test_to_json(self): df = self.df.copy() df.index.name = "idx" @@ -432,6 +435,9 @@ def test_to_json_categorical_index(self): assert result == expected + @pytest.mark.filterwarnings( + "ignore:an integer is required (got type float)*:DeprecationWarning" + ) def test_date_format_raises(self): with pytest.raises(ValueError): self.df.to_json(orient="table", date_format="epoch") diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c4db0170ecc90..1280d0fd434d5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -35,6 +35,9 @@ def assert_json_roundtrip_equal(result, expected, orient): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:an integer is required (got type float)*:DeprecationWarning" +) @pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning") class TestPandasContainer: @pytest.fixture(autouse=True) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 5154a9ba6fdf0..c84c0048cc838 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1138,6 +1138,7 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.skip("unreliable test #35214") def test_chunks_have_consistent_numerical_type(all_parsers): parser = all_parsers integers = [str(i) for i in range(499999)] @@ -1151,6 +1152,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers): assert result.a.dtype == float 
+@pytest.mark.skip("unreliable test #35214") def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 03830019affa1..09d5d9c1677d0 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -308,10 +308,6 @@ def test_overlap_public_nat_methods(klass, expected): # In case when Timestamp, Timedelta, and NaT are overlap, the overlap # is considered to be with Timestamp and NaT, not Timedelta. - # "fromisoformat" was introduced in 3.7 - if klass is Timestamp and not compat.PY37: - expected.remove("fromisoformat") - # "fromisocalendar" was introduced in 3.8 if klass is Timestamp and not compat.PY38: expected.remove("fromisocalendar") diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py index 8c51908c547f4..d1ab797056ece 100644 --- a/pandas/tests/tseries/offsets/test_offsets.py +++ b/pandas/tests/tseries/offsets/test_offsets.py @@ -14,7 +14,6 @@ import pandas._libs.tslibs.offsets as liboffsets from pandas._libs.tslibs.offsets import ApplyTypeError, _get_offset, _offset_map from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG -import pandas.compat as compat from pandas.compat.numpy import np_datetime64_compat from pandas.errors import PerformanceWarning @@ -744,10 +743,7 @@ def test_repr(self): assert repr(self.offset) == "" assert repr(self.offset2) == "<2 * BusinessDays>" - if compat.PY37: - expected = "" - else: - expected = "" + expected = "" assert repr(self.offset + timedelta(1)) == expected def test_with_offset(self): @@ -2636,10 +2632,7 @@ def test_repr(self): assert repr(self.offset) == "" assert repr(self.offset2) == "<2 * CustomBusinessDays>" - if compat.PY37: - expected = "" - else: - expected = "" + expected = "" assert repr(self.offset + timedelta(1)) == expected def test_with_offset(self): diff --git a/pandas/util/__init__.py 
b/pandas/util/__init__.py index b5271dbc0443e..9f2bf156b7e37 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -1,30 +1,12 @@ from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa -from pandas import compat from pandas.core.util.hashing import hash_array, hash_pandas_object # noqa -# compatibility for import pandas; pandas.util.testing -if compat.PY37: +def __getattr__(name): + if name == "testing": + import pandas.util.testing - def __getattr__(name): - if name == "testing": - import pandas.util.testing - - return pandas.util.testing - else: - raise AttributeError(f"module 'pandas.util' has no attribute '{name}'") - - -else: - - class _testing: - def __getattr__(self, item): - import pandas.util.testing - - return getattr(pandas.util.testing, item) - - testing = _testing() - - -del compat + return pandas.util.testing + else: + raise AttributeError(f"module 'pandas.util' has no attribute '{name}'") diff --git a/pyproject.toml b/pyproject.toml index f282f2a085000..f6f8081b6c464 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,16 +5,14 @@ requires = [ "setuptools", "wheel", "Cython>=0.29.16,<3", # Note: sync with setup.py - "numpy==1.15.4; python_version=='3.6' and platform_system!='AIX'", - "numpy==1.15.4; python_version=='3.7' and platform_system!='AIX'", + "numpy==1.16.5; python_version=='3.7' and platform_system!='AIX'", "numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'", - "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'", - "numpy==1.16.0; python_version=='3.7' and platform_system=='AIX'", + "numpy==1.16.5; python_version=='3.7' and platform_system=='AIX'", "numpy==1.17.3; python_version>='3.8' and platform_system=='AIX'", ] [tool.black] -target-version = ['py36', 'py37', 'py38'] +target-version = ['py37', 'py38'] exclude = ''' ( asv_bench/env diff --git a/requirements-dev.txt b/requirements-dev.txt index 6a87b0a99a4f8..66e72641cd5bb 100644 --- a/requirements-dev.txt +++ 
b/requirements-dev.txt @@ -1,7 +1,7 @@ # This file is auto-generated from environment.yml, do not modify. # See that file for comments about the need/usage of each dependency. -numpy>=1.15,<1.19.0 +numpy>=1.16.5,<1.19.0 python-dateutil>=2.7.3 pytz asv @@ -60,10 +60,10 @@ xlsxwriter xlwt odfpy fastparquet>=0.3.2 -pyarrow>=0.13.1 +pyarrow>=0.15.0 python-snappy pyqt5>=5.9.2 -tables>=3.4.3 +tables>=3.4.4 s3fs>=0.4.0 fsspec>=0.7.4 gcsfs>=0.6.0 diff --git a/setup.py b/setup.py index aebbdbf4d1e96..43d19d525876b 100755 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ def is_platform_mac(): return sys.platform == "darwin" -min_numpy_ver = "1.15.4" +min_numpy_ver = "1.16.5" min_cython_ver = "0.29.16" # note: sync with pyproject.toml try: @@ -197,7 +197,6 @@ def build_extensions(self): "Intended Audience :: Science/Research", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Cython", @@ -742,7 +741,7 @@ def setup_package(): setuptools_kwargs = { "install_requires": [ "python-dateutil >= 2.7.3", - "pytz >= 2017.2", + "pytz >= 2017.3", f"numpy >= {min_numpy_ver}", ], "setup_requires": [f"numpy >= {min_numpy_ver}"], @@ -766,11 +765,11 @@ def setup_package(): long_description=LONG_DESCRIPTION, classifiers=CLASSIFIERS, platforms="any", - python_requires=">=3.6.1", + python_requires=">=3.7.1", extras_require={ "test": [ # sync with setup.cfg minversion & install.rst - "pytest>=4.0.2", + "pytest>=5.0.1", "pytest-xdist", "hypothesis>=3.58", ] From 1f25056503164d500ce904a8143abfc454a1a533 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 11 Aug 2020 16:57:56 -0500 Subject: [PATCH 0483/1025] CI/TST: change skip to xfail #35660 (#35672) --- pandas/tests/io/parser/test_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py 
b/pandas/tests/io/parser/test_common.py index c84c0048cc838..3d5f6ae3a4af9 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1138,7 +1138,7 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.skip("unreliable test #35214") +@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_chunks_have_consistent_numerical_type(all_parsers): parser = all_parsers integers = [str(i) for i in range(499999)] @@ -1152,7 +1152,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers): assert result.a.dtype == float -@pytest.mark.skip("unreliable test #35214") +@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers From 43c772f692c538e537b08b4516a5d84d5655bc1e Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 12 Aug 2020 13:22:54 +0100 Subject: [PATCH 0484/1025] CLN: consistent signatures for equals methods (#35636) --- pandas/_libs/sparse.pyx | 4 ++-- pandas/core/arrays/base.py | 14 ++++++++++---- pandas/core/arrays/categorical.py | 6 ++++-- pandas/core/generic.py | 4 +++- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimelike.py | 6 +++--- pandas/core/indexes/interval.py | 7 ++++--- pandas/core/indexes/multi.py | 11 +++++------ pandas/core/indexes/range.py | 2 +- pandas/core/internals/managers.py | 5 ++++- 11 files changed, 38 insertions(+), 25 deletions(-) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 321d7c374d8ec..0c3d8915b749b 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -103,7 +103,7 @@ cdef class IntIndex(SparseIndex): if not monotonic: raise ValueError("Indices must be strictly increasing") - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: if not isinstance(other, IntIndex): return False @@ -399,7 
+399,7 @@ cdef class BlockIndex(SparseIndex): if blengths[i] == 0: raise ValueError(f'Zero-length block {i}') - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: if not isinstance(other, BlockIndex): return False diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 921927325a144..d85647edc3b81 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -7,7 +7,7 @@ without warning. """ import operator -from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Optional, Sequence, Tuple, Union, cast import numpy as np @@ -20,7 +20,12 @@ from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.cast import maybe_cast_to_extension_array -from pandas.core.dtypes.common import is_array_like, is_list_like, pandas_dtype +from pandas.core.dtypes.common import ( + is_array_like, + is_dtype_equal, + is_list_like, + pandas_dtype, +) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -742,7 +747,7 @@ def searchsorted(self, value, side="left", sorter=None): arr = self.astype(object) return arr.searchsorted(value, side=side, sorter=sorter) - def equals(self, other: "ExtensionArray") -> bool: + def equals(self, other: object) -> bool: """ Return if another array is equivalent to this array. 
@@ -762,7 +767,8 @@ def equals(self, other: "ExtensionArray") -> bool: """ if not type(self) == type(other): return False - elif not self.dtype == other.dtype: + other = cast(ExtensionArray, other) + if not is_dtype_equal(self.dtype, other.dtype): return False elif not len(self) == len(other): return False diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6e5c7bc699962..a28b341669918 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2242,7 +2242,7 @@ def _from_factorized(cls, uniques, original): original.categories.take(uniques), dtype=original.dtype ) - def equals(self, other): + def equals(self, other: object) -> bool: """ Returns True if categorical arrays are equal. @@ -2254,7 +2254,9 @@ def equals(self, other): ------- bool """ - if self.is_dtype_equal(other): + if not isinstance(other, Categorical): + return False + elif self.is_dtype_equal(other): if self.categories.equals(other.categories): # fastpath to avoid re-coding other_codes = other._codes diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 520023050d49d..11147bffa32c3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -22,6 +22,7 @@ Tuple, Type, Union, + cast, ) import warnings import weakref @@ -1196,7 +1197,7 @@ def _indexed_same(self, other) -> bool: self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS ) - def equals(self, other): + def equals(self, other: object) -> bool: """ Test whether two objects contain the same elements. 
@@ -1276,6 +1277,7 @@ def equals(self, other): """ if not (isinstance(other, type(self)) or isinstance(self, type(other))): return False + other = cast(NDFrame, other) return self._mgr.equals(other._mgr) # ------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ecd3670e724a1..623ce68201492 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4176,7 +4176,7 @@ def putmask(self, mask, value): # coerces to object return self.astype(object).putmask(mask, value) - def equals(self, other: Any) -> bool: + def equals(self, other: object) -> bool: """ Determine if two Index object are equal. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index fb283cbe02954..4990e6a8e20e9 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -290,7 +290,7 @@ def _is_dtype_compat(self, other) -> bool: return other - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determine if two CategoricalIndex objects contain the same elements. diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8ccdab21339df..6d9d75a69e91d 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -24,7 +24,7 @@ is_scalar, ) from pandas.core.dtypes.concat import concat_compat -from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries +from pandas.core.dtypes.generic import ABCIndex, ABCSeries from pandas.core import algorithms from pandas.core.arrays import DatetimeArray, PeriodArray, TimedeltaArray @@ -130,14 +130,14 @@ def __array_wrap__(self, result, context=None): # ------------------------------------------------------------------------ - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two Index objects contain the same elements. 
""" if self.is_(other): return True - if not isinstance(other, ABCIndexClass): + if not isinstance(other, Index): return False elif not isinstance(other, type(self)): try: diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9548ebbd9c3b2..e8d0a44324cc5 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1005,19 +1005,20 @@ def _format_space(self) -> str: def argsort(self, *args, **kwargs) -> np.ndarray: return np.lexsort((self.right, self.left)) - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two IntervalIndex objects contain the same elements. """ if self.is_(other): return True - # if we can coerce to an II - # then we can compare + # if we can coerce to an IntervalIndex then we can compare if not isinstance(other, IntervalIndex): if not is_interval_dtype(other): return False other = Index(other) + if not isinstance(other, IntervalIndex): + return False return ( self.left.equals(other.left) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 448c2dfe4a29d..ffbd03d0c3ba7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3221,7 +3221,7 @@ def truncate(self, before=None, after=None): verify_integrity=False, ) - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two MultiIndex objects have the same labeling information (the levels themselves do not necessarily have to be the same) @@ -3264,11 +3264,10 @@ def equals(self, other) -> bool: np.asarray(other.levels[i]._values), other_codes, allow_fill=False ) - # since we use NaT both datetime64 and timedelta64 - # we can have a situation where a level is typed say - # timedelta64 in self (IOW it has other values than NaT) - # but types datetime64 in other (where its all NaT) - # but these are equivalent + # since we use NaT both datetime64 and timedelta64 we can have a + # situation where a level is typed 
say timedelta64 in self (IOW it + # has other values than NaT) but types datetime64 in other (where + # its all NaT) but these are equivalent if len(self_values) == 0 and len(other_values) == 0: continue diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 3577a7aacc008..6080c32052266 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -433,7 +433,7 @@ def argsort(self, *args, **kwargs) -> np.ndarray: else: return np.arange(len(self) - 1, -1, -1) - def equals(self, other) -> bool: + def equals(self, other: object) -> bool: """ Determines if two Index objects contain the same elements. """ diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index aa74d173d69b3..371b721f08b27 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1437,7 +1437,10 @@ def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True ) - def equals(self, other: "BlockManager") -> bool: + def equals(self, other: object) -> bool: + if not isinstance(other, BlockManager): + return False + self_axes, other_axes = self.axes, other.axes if len(self_axes) != len(other_axes): return False From bb1da4d4567b42062b46aa86ce6357281c1afe4d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 12 Aug 2020 15:14:30 -0700 Subject: [PATCH 0485/1025] BUG: Support custom BaseIndexers in groupby.rolling (#35647) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/window/indexers.py | 14 ++++++++++---- pandas/core/window/rolling.py | 15 +++++++++++---- pandas/tests/window/test_grouper.py | 23 +++++++++++++++++++++++ 4 files changed, 45 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 415f9e508feb8..cdc244ca193b4 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed 
regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) +- Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index bc36bdca982e8..7cbe34cdebf9f 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -1,6 +1,6 @@ """Indexer objects for computing start/end window bounds for rolling operations""" from datetime import timedelta -from typing import Dict, Optional, Tuple, Type, Union +from typing import Dict, Optional, Tuple, Type import numpy as np @@ -265,7 +265,8 @@ def __init__( index_array: Optional[np.ndarray], window_size: int, groupby_indicies: Dict, - rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]], + rolling_indexer: Type[BaseIndexer], + indexer_kwargs: Optional[Dict], **kwargs, ): """ @@ -276,7 +277,10 @@ def __init__( """ self.groupby_indicies = groupby_indicies self.rolling_indexer = rolling_indexer - super().__init__(index_array, window_size, **kwargs) + self.indexer_kwargs = indexer_kwargs or {} + super().__init__( + index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs + ) @Appender(get_window_bounds_doc) def get_window_bounds( @@ -298,7 +302,9 @@ def get_window_bounds( else: index_array = self.index_array indexer = self.rolling_indexer( - index_array=index_array, window_size=self.window_size, + index_array=index_array, + window_size=self.window_size, + **self.indexer_kwargs, ) start, end = indexer.get_window_bounds( len(indicies), 
min_periods, center, closed diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 7347d5686aabc..0306d4de2fc73 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -145,7 +145,7 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): def __init__( self, - obj, + obj: FrameOrSeries, window=None, min_periods: Optional[int] = None, center: bool = False, @@ -2271,10 +2271,16 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: ------- GroupbyRollingIndexer """ - rolling_indexer: Union[Type[FixedWindowIndexer], Type[VariableWindowIndexer]] - if self.is_freq_type: + rolling_indexer: Type[BaseIndexer] + indexer_kwargs: Optional[Dict] = None + index_array = self.obj.index.asi8 + if isinstance(self.window, BaseIndexer): + rolling_indexer = type(self.window) + indexer_kwargs = self.window.__dict__ + # We'll be using the index of each group later + indexer_kwargs.pop("index_array", None) + elif self.is_freq_type: rolling_indexer = VariableWindowIndexer - index_array = self.obj.index.asi8 else: rolling_indexer = FixedWindowIndexer index_array = None @@ -2283,6 +2289,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: window_size=window, groupby_indicies=self._groupby.indices, rolling_indexer=rolling_indexer, + indexer_kwargs=indexer_kwargs, ) return window_indexer diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index e1dcac06c39cc..a9590c7e1233a 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -305,6 +305,29 @@ def test_groupby_subselect_rolling(self): ) tm.assert_series_equal(result, expected) + def test_groupby_rolling_custom_indexer(self): + # GH 35557 + class SimpleIndexer(pd.api.indexers.BaseIndexer): + def get_window_bounds( + self, num_values=0, min_periods=None, center=None, closed=None + ): + min_periods = self.window_size if min_periods is None else 0 + end = np.arange(num_values, 
dtype=np.int64) + 1 + start = end.copy() - self.window_size + start[start < 0] = min_periods + return start, end + + df = pd.DataFrame( + {"a": [1.0, 2.0, 3.0, 4.0, 5.0] * 3}, index=[0] * 5 + [1] * 5 + [2] * 5 + ) + result = ( + df.groupby(df.index) + .rolling(SimpleIndexer(window_size=3), min_periods=1) + .sum() + ) + expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum() + tm.assert_frame_equal(result, expected) + def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = pd.DataFrame( From 60791ce7a35f8b319f8b718056950fe24e2d691f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 12 Aug 2020 15:15:35 -0700 Subject: [PATCH 0486/1025] REF: _cython_agg_blocks follow patterns similar to _apply_blockwise (#35632) --- pandas/core/groupby/generic.py | 41 +++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 53242c0332a8c..b7280a9f7db3c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1026,8 +1026,7 @@ def _cython_agg_blocks( if numeric_only: data = data.get_numeric_data(copy=False) - agg_blocks: List[Block] = [] - new_items: List[np.ndarray] = [] + agg_blocks: List["Block"] = [] deleted_items: List[np.ndarray] = [] no_result = object() @@ -1056,11 +1055,12 @@ def cast_result_block(result, block: "Block", how: str) -> "Block": # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - agg_block: Block = block.make_block(result) + agg_block: "Block" = block.make_block(result) return agg_block - for block in data.blocks: - # Avoid inheriting result from earlier in the loop + def blk_func(block: "Block") -> List["Block"]: + new_blocks: List["Block"] = [] + result = no_result locs = block.mgr_locs.as_array try: @@ -1076,8 +1076,7 @@ def cast_result_block(result, block: "Block", how: str) -> "Block": # we cannot perform the operation # in an alternate way, exclude the block assert 
how == "ohlc" - deleted_items.append(locs) - continue + raise # call our grouper again with only this block obj = self.obj[data.items[locs]] @@ -1096,8 +1095,7 @@ def cast_result_block(result, block: "Block", how: str) -> "Block": except TypeError: # we may have an exception in trying to aggregate # continue and exclude the block - deleted_items.append(locs) - continue + raise else: result = cast(DataFrame, result) # unwrap DataFrame to get array @@ -1108,20 +1106,33 @@ def cast_result_block(result, block: "Block", how: str) -> "Block": # clean, we choose to clean up this mess later on. assert len(locs) == result.shape[1] for i, loc in enumerate(locs): - new_items.append(np.array([loc], dtype=locs.dtype)) agg_block = result.iloc[:, [i]]._mgr.blocks[0] - agg_blocks.append(agg_block) + new_blocks.append(agg_block) else: result = result._mgr.blocks[0].values if isinstance(result, np.ndarray) and result.ndim == 1: result = result.reshape(1, -1) agg_block = cast_result_block(result, block, how) - new_items.append(locs) - agg_blocks.append(agg_block) + new_blocks = [agg_block] else: agg_block = cast_result_block(result, block, how) - new_items.append(locs) - agg_blocks.append(agg_block) + new_blocks = [agg_block] + return new_blocks + + skipped: List[int] = [] + new_items: List[np.ndarray] = [] + for i, block in enumerate(data.blocks): + try: + nbs = blk_func(block) + except (NotImplementedError, TypeError): + # TypeError -> we may have an exception in trying to aggregate + # continue and exclude the block + # NotImplementedError -> "ohlc" with wrong dtype + skipped.append(i) + deleted_items.append(block.mgr_locs.as_array) + else: + agg_blocks.extend(nbs) + new_items.append(block.mgr_locs.as_array) if not agg_blocks: raise DataError("No numeric types to aggregate") From 7da437ba6c7551b1605c815242baae5b2d3bad86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 12 Aug 2020 18:23:25 -0400 Subject: [PATCH 0487/1025] BUG: to_pickle/read_pickle do 
not close user-provided file objects (#35686) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/pickle.py | 8 ++++++-- pandas/tests/io/test_pickle.py | 9 +++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 86f47a5826214..deb5697053ea8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -234,7 +234,7 @@ I/O - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) - In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) -- +- :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) Plotting ^^^^^^^^ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 549d55e65546d..eee6ec7c9feca 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -100,7 +100,9 @@ def to_pickle( try: f.write(pickle.dumps(obj, protocol=protocol)) finally: - f.close() + if f != filepath_or_buffer: + # do not close user-provided file objects GH 35679 + f.close() for _f in fh: _f.close() if should_close: @@ -215,7 +217,9 @@ def read_pickle( # e.g. 
can occur for files written in py27; see GH#28645 and GH#31988 return pc.load(f, encoding="latin-1") finally: - f.close() + if f != filepath_or_buffer: + # do not close user-provided file objects GH 35679 + f.close() for _f in fh: _f.close() if should_close: diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index e4d43db7834e3..6331113ab8945 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -183,6 +183,15 @@ def python_unpickler(path): result = python_unpickler(path) compare_element(result, expected, typ) + # and the same for file objects (GH 35679) + with open(path, mode="wb") as handle: + writer(expected, path) + handle.seek(0) # shouldn't close file handle + with open(path, mode="rb") as handle: + result = pd.read_pickle(handle) + handle.seek(0) # shouldn't close file handle + compare_element(result, expected, typ) + def test_pickle_path_pathlib(): df = tm.makeDataFrame() From 24919e61174e566a84dbf0f92ebbc50c8fa73982 Mon Sep 17 00:00:00 2001 From: Yutaro Ikeda Date: Thu, 13 Aug 2020 19:15:51 +0900 Subject: [PATCH 0488/1025] BUG: GH-35558 merge_asof tolerance error (#35654) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/merge/test_merge_asof.py | 22 +++++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index cdc244ca193b4..b37103910afab 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) +- Fixed regression where 
:meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 27b331babe692..2349cb1dcc0c7 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1667,7 +1667,7 @@ def _get_merge_keys(self): msg = ( f"incompatible tolerance {self.tolerance}, must be compat " - f"with type {repr(lk.dtype)}" + f"with type {repr(lt.dtype)}" ) if needs_i8_conversion(lt): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 9b09f0033715d..895de2b748c34 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1339,3 +1339,25 @@ def test_merge_index_column_tz(self): index=pd.Index([0, 1, 2, 3, 4]), ) tm.assert_frame_equal(result, expected) + + def test_left_index_right_index_tolerance(self): + # https://github.com/pandas-dev/pandas/issues/35558 + dr1 = pd.date_range( + start="1/1/2020", end="1/20/2020", freq="2D" + ) + pd.Timedelta(seconds=0.4) + dr2 = pd.date_range(start="1/1/2020", end="2/1/2020") + + df1 = pd.DataFrame({"val1": "foo"}, index=pd.DatetimeIndex(dr1)) + df2 = pd.DataFrame({"val2": "bar"}, index=pd.DatetimeIndex(dr2)) + + expected = pd.DataFrame( + {"val1": "foo", "val2": "bar"}, index=pd.DatetimeIndex(dr1) + ) + result = pd.merge_asof( + df1, + df2, + left_index=True, + right_index=True, + tolerance=pd.Timedelta(seconds=0.5), + ) + tm.assert_frame_equal(result, expected) From 4639375c2e011bc9ef88162a45e16f3894c1b289 Mon Sep 17 00:00:00 2001 From: Elliot Rampono Date: Thu, 13 Aug 2020 12:21:46 -0400 Subject: [PATCH 0489/1025] Reorganize imports to be compliant with isort (and 
conventional) (#35708) --- web/pandas_web.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/web/pandas_web.py b/web/pandas_web.py index e62deaa8cdc7f..7dd63175e69ac 100755 --- a/web/pandas_web.py +++ b/web/pandas_web.py @@ -34,13 +34,12 @@ import time import typing +import feedparser import jinja2 +import markdown import requests import yaml -import feedparser -import markdown - class Preprocessors: """ From e6921424ad1e98b26def1a67b58bd59c79d903fa Mon Sep 17 00:00:00 2001 From: Elliot Rampono Date: Thu, 13 Aug 2020 13:52:32 -0400 Subject: [PATCH 0490/1025] add web/ directory to isort checks (#35709) Co-authored-by: elliot rampono --- ci/code_checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 816bb23865c04..852f66763683b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -121,7 +121,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # Imports - Check formatting using isort see setup.cfg for settings MSG='Check import format using isort' ; echo $MSG - ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts" + ISORT_CMD="isort --quiet --check-only pandas asv_bench scripts web" if [[ "$GITHUB_ACTIONS" == "true" ]]; then eval $ISORT_CMD | awk '{print "##[error]" $0}'; RET=$(($RET + ${PIPESTATUS[0]})) else From d44e2f820df3abd2b791f2837f5bd24115b1a0b0 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 13 Aug 2020 19:06:35 +0100 Subject: [PATCH 0491/1025] PERF: make RangeIndex iterate over ._range (#35676) --- asv_bench/benchmarks/index_object.py | 24 +++++++++++++++-------- pandas/core/indexes/range.py | 4 ++++ pandas/tests/indexes/ranges/test_range.py | 4 ++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/asv_bench/benchmarks/index_object.py b/asv_bench/benchmarks/index_object.py index b242de6a17208..9c05019c70396 100644 --- a/asv_bench/benchmarks/index_object.py +++ b/asv_bench/benchmarks/index_object.py @@ -57,8 +57,8 @@ def 
time_datetime_difference_disjoint(self): class Range: def setup(self): - self.idx_inc = RangeIndex(start=0, stop=10 ** 7, step=3) - self.idx_dec = RangeIndex(start=10 ** 7, stop=-1, step=-3) + self.idx_inc = RangeIndex(start=0, stop=10 ** 6, step=3) + self.idx_dec = RangeIndex(start=10 ** 6, stop=-1, step=-3) def time_max(self): self.idx_inc.max() @@ -73,15 +73,23 @@ def time_min_trivial(self): self.idx_inc.min() def time_get_loc_inc(self): - self.idx_inc.get_loc(900000) + self.idx_inc.get_loc(900_000) def time_get_loc_dec(self): - self.idx_dec.get_loc(100000) + self.idx_dec.get_loc(100_000) + + def time_iter_inc(self): + for _ in self.idx_inc: + pass + + def time_iter_dec(self): + for _ in self.idx_dec: + pass class IndexEquals: def setup(self): - idx_large_fast = RangeIndex(100000) + idx_large_fast = RangeIndex(100_000) idx_small_slow = date_range(start="1/1/2012", periods=1) self.mi_large_slow = MultiIndex.from_product([idx_large_fast, idx_small_slow]) @@ -94,7 +102,7 @@ def time_non_object_equals_multiindex(self): class IndexAppend: def setup(self): - N = 10000 + N = 10_000 self.range_idx = RangeIndex(0, 100) self.int_idx = self.range_idx.astype(int) self.obj_idx = self.int_idx.astype(str) @@ -168,7 +176,7 @@ def time_get_loc_non_unique_sorted(self, dtype): class Float64IndexMethod: # GH 13166 def setup(self): - N = 100000 + N = 100_000 a = np.arange(N) self.ind = Float64Index(a * 4.8000000418824129e-08) @@ -212,7 +220,7 @@ class GC: params = [1, 2, 5] def create_use_drop(self): - idx = Index(list(range(1000 * 1000))) + idx = Index(list(range(1_000_000))) idx._engine def peakmem_gc_instances(self, N): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 6080c32052266..c65c3d5ff3d9c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -373,6 +373,10 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): def tolist(self): return list(self._range) + @doc(Int64Index.__iter__) + def 
__iter__(self): + yield from self._range + @doc(Int64Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = no_default): name = self.name if name is no_default else name diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index ef4bb9a0869b0..c4c242746e92c 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -167,6 +167,10 @@ def test_cache(self): idx.any() assert idx._cache == {} + for _ in idx: + pass + assert idx._cache == {} + df = pd.DataFrame({"a": range(10)}, index=idx) df.loc[50] From be343bb14892b3146d742bc56d6283d484787902 Mon Sep 17 00:00:00 2001 From: SylvainLan Date: Thu, 13 Aug 2020 20:17:39 +0200 Subject: [PATCH 0492/1025] Refactor tables latex (#35649) --- pandas/io/formats/latex.py | 103 +++++++---------------- pandas/tests/io/formats/test_to_latex.py | 3 +- 2 files changed, 33 insertions(+), 73 deletions(-) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 5d6f0a08ef2b5..715b8bbdf5672 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -121,10 +121,7 @@ def pad_empties(x): else: column_format = self.column_format - if self.longtable: - self._write_longtable_begin(buf, column_format) - else: - self._write_tabular_begin(buf, column_format) + self._write_tabular_begin(buf, column_format) buf.write("\\toprule\n") @@ -190,10 +187,7 @@ def pad_empties(x): if self.multirow and i < len(strrows) - 1: self._print_cline(buf, i, len(strcols)) - if self.longtable: - self._write_longtable_end(buf) - else: - self._write_tabular_end(buf) + self._write_tabular_end(buf) def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: r""" @@ -288,7 +282,7 @@ def _write_tabular_begin(self, buf, column_format: str): for 3 columns """ if self._table_float: - # then write output in a nested table/tabular environment + # then write output in a nested table/tabular or longtable environment if 
self.caption is None: caption_ = "" else: @@ -304,12 +298,27 @@ def _write_tabular_begin(self, buf, column_format: str): else: position_ = f"[{self.position}]" - buf.write(f"\\begin{{table}}{position_}\n\\centering{caption_}{label_}\n") + if self.longtable: + table_ = f"\\begin{{longtable}}{position_}{{{column_format}}}" + tabular_ = "\n" + else: + table_ = f"\\begin{{table}}{position_}\n\\centering" + tabular_ = f"\n\\begin{{tabular}}{{{column_format}}}\n" + + if self.longtable and (self.caption is not None or self.label is not None): + # a double-backslash is required at the end of the line + # as discussed here: + # https://tex.stackexchange.com/questions/219138 + backlash_ = "\\\\" + else: + backlash_ = "" + buf.write(f"{table_}{caption_}{label_}{backlash_}{tabular_}") else: - # then write output only in a tabular environment - pass - - buf.write(f"\\begin{{tabular}}{{{column_format}}}\n") + if self.longtable: + tabletype_ = "longtable" + else: + tabletype_ = "tabular" + buf.write(f"\\begin{{{tabletype_}}}{{{column_format}}}\n") def _write_tabular_end(self, buf): """ @@ -323,62 +332,12 @@ def _write_tabular_end(self, buf): a string. """ - buf.write("\\bottomrule\n") - buf.write("\\end{tabular}\n") - if self._table_float: - buf.write("\\end{table}\n") - else: - pass - - def _write_longtable_begin(self, buf, column_format: str): - """ - Write the beginning of a longtable environment including caption and - label if provided by user. - - Parameters - ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. 
- column_format : str - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' - for 3 columns - """ - if self.caption is None: - caption_ = "" - else: - caption_ = f"\\caption{{{self.caption}}}" - - if self.label is None: - label_ = "" - else: - label_ = f"\\label{{{self.label}}}" - - if self.position is None: - position_ = "" + if self.longtable: + buf.write("\\end{longtable}\n") else: - position_ = f"[{self.position}]" - - buf.write( - f"\\begin{{longtable}}{position_}{{{column_format}}}\n{caption_}{label_}" - ) - if self.caption is not None or self.label is not None: - # a double-backslash is required at the end of the line - # as discussed here: - # https://tex.stackexchange.com/questions/219138 - buf.write("\\\\\n") - - @staticmethod - def _write_longtable_end(buf): - """ - Write the end of a longtable environment. - - Parameters - ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. - - """ - buf.write("\\end{longtable}\n") + buf.write("\\bottomrule\n") + buf.write("\\end{tabular}\n") + if self._table_float: + buf.write("\\end{table}\n") + else: + pass diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 93ad3739e59c7..96a9ed2b86cf4 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -555,7 +555,8 @@ def test_to_latex_longtable_caption_label(self): result_cl = df.to_latex(longtable=True, caption=the_caption, label=the_label) expected_cl = r"""\begin{longtable}{lrl} -\caption{a table in a \texttt{longtable} environment}\label{tab:longtable}\\ +\caption{a table in a \texttt{longtable} environment} +\label{tab:longtable}\\ \toprule {} & a & b \\ \midrule From 9ba7a68269646919ab438cd2bb491e2ae589678a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 13 Aug 2020 12:18:47 -0700 Subject: [PATCH 0493/1025] CI: avoid file leaks in sas_xport tests (#35693) --- 
pandas/io/sas/sasreader.py | 10 ++++++++-- pandas/tests/io/sas/test_xport.py | 13 ++++++++++-- pandas/util/_test_decorators.py | 33 +++++++++++++++++++++---------- 3 files changed, 42 insertions(+), 14 deletions(-) diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 291c9d1ee7f0c..fffdebda8c87a 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -6,7 +6,7 @@ from pandas._typing import FilePathOrBuffer, Label -from pandas.io.common import stringify_path +from pandas.io.common import get_filepath_or_buffer, stringify_path if TYPE_CHECKING: from pandas import DataFrame # noqa: F401 @@ -109,6 +109,10 @@ def read_sas( else: raise ValueError("unable to infer format of SAS file") + filepath_or_buffer, _, _, should_close = get_filepath_or_buffer( + filepath_or_buffer, encoding + ) + reader: ReaderBase if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader @@ -129,5 +133,7 @@ def read_sas( return reader data = reader.read() - reader.close() + + if should_close: + reader.close() return data diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py index 2682bafedb8f1..939edb3d8e0b4 100644 --- a/pandas/tests/io/sas/test_xport.py +++ b/pandas/tests/io/sas/test_xport.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -26,10 +28,12 @@ def setup_method(self, datapath): self.dirpath = datapath("io", "sas", "data") self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt") self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt") - self.file02b = open(os.path.join(self.dirpath, "SSHSV1_A.xpt"), "rb") self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt") self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt") + with td.file_leak_context(): + yield + def test1_basic(self): # Tests with DEMO_G.xpt (all numeric file) @@ -127,7 +131,12 @@ def test2_binary(self): data_csv = 
pd.read_csv(self.file02.replace(".xpt", ".csv")) numeric_as_float(data_csv) - data = read_sas(self.file02b, format="xport") + with open(self.file02, "rb") as fd: + with td.file_leak_context(): + # GH#35693 ensure that if we pass an open file, we + # dont incorrectly close it in read_sas + data = read_sas(fd, format="xport") + tm.assert_frame_equal(data, data_csv) def test_multiple_types(self): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index bdf633839b2cd..0dad8c7397e37 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -23,8 +23,8 @@ def test_foo(): For more information, refer to the ``pytest`` documentation on ``skipif``. """ +from contextlib import contextmanager from distutils.version import LooseVersion -from functools import wraps import locale from typing import Callable, Optional @@ -237,23 +237,36 @@ def documented_fixture(fixture): def check_file_leaks(func) -> Callable: """ - Decorate a test function tot check that we are not leaking file descriptors. + Decorate a test function to check that we are not leaking file descriptors. """ - psutil = safe_import("psutil") - if not psutil: + with file_leak_context(): return func - @wraps(func) - def new_func(*args, **kwargs): + +@contextmanager +def file_leak_context(): + """ + ContextManager analogue to check_file_leaks. 
+ """ + psutil = safe_import("psutil") + if not psutil: + yield + else: proc = psutil.Process() flist = proc.open_files() + conns = proc.connections() - func(*args, **kwargs) + yield flist2 = proc.open_files() - assert flist2 == flist - - return new_func + # on some builds open_files includes file position, which we _dont_ + # expect to remain unchanged, so we need to compare excluding that + flist_ex = [(x.path, x.fd) for x in flist] + flist2_ex = [(x.path, x.fd) for x in flist2] + assert flist2_ex == flist_ex, (flist2, flist) + + conns2 = proc.connections() + assert conns2 == conns, (conns2, conns) def async_mark(): From 3d63f768f4a6111629c5ad04dadaa928167be115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Thu, 13 Aug 2020 18:04:49 -0400 Subject: [PATCH 0494/1025] BUG/ENH: consistent gzip compression arguments (#35645) --- doc/source/user_guide/io.rst | 11 +++++--- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_typing.py | 5 ++++ pandas/core/generic.py | 13 +++++++-- pandas/io/common.py | 31 +++++++++++---------- pandas/io/formats/csvs.py | 6 ++-- pandas/io/json/_json.py | 31 +++++++++++++++------ pandas/io/pickle.py | 8 +++--- pandas/io/stata.py | 19 ++++--------- pandas/tests/io/test_compression.py | 43 +++++++++++++++++++++++++++++ 10 files changed, 118 insertions(+), 50 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 35403b5c8b66f..43030d76d945a 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -287,16 +287,19 @@ Quoting, compression, and file format compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``, ``dict``}, default ``'infer'`` For on-the-fly decompression of on-disk data. If 'infer', then use gzip, - bz2, zip, or xz if filepath_or_buffer is a string ending in '.gz', '.bz2', + bz2, zip, or xz if ``filepath_or_buffer`` is path-like ending in '.gz', '.bz2', '.zip', or '.xz', respectively, and no decompression otherwise. 
If using 'zip', the ZIP file must contain only one data file to be read in. Set to ``None`` for no decompression. Can also be a dict with key ``'method'`` - set to one of {``'zip'``, ``'gzip'``, ``'bz2'``}, and other keys set to - compression settings. As an example, the following could be passed for - faster compression: ``compression={'method': 'gzip', 'compresslevel': 1}``. + set to one of {``'zip'``, ``'gzip'``, ``'bz2'``} and other key-value pairs are + forwarded to ``zipfile.ZipFile``, ``gzip.GzipFile``, or ``bz2.BZ2File``. + As an example, the following could be passed for faster compression and to + create a reproducible gzip archive: + ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``. .. versionchanged:: 0.24.0 'infer' option added and set to default. .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. + .. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to `gzip.open`. thousands : str, default ``None`` Thousands separator. 
decimal : str, default ``'.'`` diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index deb5697053ea8..6612f741d925d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -235,6 +235,7 @@ I/O - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) - In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) - :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) +- :meth:`to_csv` passes compression arguments for `'gzip'` always to `gzip.GzipFile` (:issue:`28103`) Plotting ^^^^^^^^ diff --git a/pandas/_typing.py b/pandas/_typing.py index 47a102ddc70e0..1b972030ef5a5 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -109,3 +109,8 @@ # for arbitrary kwargs passed during reading/writing files StorageOptions = Optional[Dict[str, Any]] + + +# compression keywords and compression +CompressionDict = Mapping[str, Optional[Union[str, int, bool]]] +CompressionOptions = Optional[Union[str, CompressionDict]] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 11147bffa32c3..2219d54477d9e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -35,6 +35,7 @@ from pandas._libs.tslibs import Tick, Timestamp, to_offset from pandas._typing import ( Axis, + CompressionOptions, FilePathOrBuffer, FrameOrSeries, JSONSerializable, @@ -2058,7 +2059,7 @@ def to_json( date_unit: str = "ms", default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool_t = False, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", index: bool_t = True, indent: Optional[int] = None, storage_options: StorageOptions = None, @@ -2646,7 +2647,7 @@ def to_sql( def to_pickle( self, path, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", protocol: 
int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ) -> None: @@ -3053,7 +3054,7 @@ def to_csv( index_label: Optional[Union[bool_t, str, Sequence[Label]]] = None, mode: str = "w", encoding: Optional[str] = None, - compression: Optional[Union[str, Mapping[str, str]]] = "infer", + compression: CompressionOptions = "infer", quoting: Optional[int] = None, quotechar: str = '"', line_terminator: Optional[str] = None, @@ -3144,6 +3145,12 @@ def to_csv( Compression is supported for binary file objects. + .. versionchanged:: 1.2.0 + + Previous versions forwarded dict entries for 'gzip' to + `gzip.open` instead of `gzip.GzipFile` which prevented + setting `mtime`. + quoting : optional constant from csv module Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` then floats are converted to strings and thus csv.QUOTE_NONNUMERIC diff --git a/pandas/io/common.py b/pandas/io/common.py index 9ac642e58b544..54f35e689aac8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,7 +18,6 @@ Optional, Tuple, Type, - Union, ) from urllib.parse import ( urljoin, @@ -29,7 +28,12 @@ ) import zipfile -from pandas._typing import FilePathOrBuffer, StorageOptions +from pandas._typing import ( + CompressionDict, + CompressionOptions, + FilePathOrBuffer, + StorageOptions, +) from pandas.compat import _get_lzma_file, _import_lzma from pandas.compat._optional import import_optional_dependency @@ -160,7 +164,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: Optional[str] = None, - compression: Optional[str] = None, + compression: CompressionOptions = None, mode: Optional[str] = None, storage_options: StorageOptions = None, ): @@ -188,7 +192,7 @@ def get_filepath_or_buffer( Returns ------- - Tuple[FilePathOrBuffer, str, str, bool] + Tuple[FilePathOrBuffer, str, CompressionOptions, bool] Tuple containing the filepath or buffer, the encoding, the compression and should_close. 
""" @@ -291,8 +295,8 @@ def file_path_to_url(path: str) -> str: def get_compression_method( - compression: Optional[Union[str, Mapping[str, Any]]] -) -> Tuple[Optional[str], Dict[str, Any]]: + compression: CompressionOptions, +) -> Tuple[Optional[str], CompressionDict]: """ Simplifies a compression argument to a compression method string and a mapping containing additional arguments. @@ -316,7 +320,7 @@ def get_compression_method( if isinstance(compression, Mapping): compression_args = dict(compression) try: - compression_method = compression_args.pop("method") + compression_method = compression_args.pop("method") # type: ignore except KeyError as err: raise ValueError("If mapping, compression must have key 'method'") from err else: @@ -383,7 +387,7 @@ def get_handle( path_or_buf, mode: str, encoding=None, - compression: Optional[Union[str, Mapping[str, Any]]] = None, + compression: CompressionOptions = None, memory_map: bool = False, is_text: bool = True, errors=None, @@ -464,16 +468,13 @@ def get_handle( # GZ Compression if compression == "gzip": if is_path: - f = gzip.open(path_or_buf, mode, **compression_args) + f = gzip.GzipFile(filename=path_or_buf, mode=mode, **compression_args) else: f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args) # BZ Compression elif compression == "bz2": - if is_path: - f = bz2.BZ2File(path_or_buf, mode, **compression_args) - else: - f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) + f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args) # ZIP Compression elif compression == "zip": @@ -577,7 +578,9 @@ def __init__( if mode in ["wb", "rb"]: mode = mode.replace("b", "") self.archive_name = archive_name - super().__init__(file, mode, zipfile.ZIP_DEFLATED, **kwargs) + kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} + kwargs_zip.update(kwargs) + super().__init__(file, mode, **kwargs_zip) def write(self, data): archive_name = self.filename diff --git a/pandas/io/formats/csvs.py 
b/pandas/io/formats/csvs.py index 6eceb94387171..c462a96da7133 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,13 +5,13 @@ import csv as csvlib from io import StringIO, TextIOWrapper import os -from typing import Hashable, List, Mapping, Optional, Sequence, Union +from typing import Hashable, List, Optional, Sequence, Union import warnings import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -44,7 +44,7 @@ def __init__( mode: str = "w", encoding: Optional[str] = None, errors: str = "strict", - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", quoting: Optional[int] = None, line_terminator="\n", chunksize: Optional[int] = None, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 0d2b351926343..c2bd6302940bb 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,13 +3,13 @@ from io import BytesIO, StringIO from itertools import islice import os -from typing import Any, Callable, Optional, Type +from typing import IO, Any, Callable, List, Optional, Type import numpy as np import pandas._libs.json as json from pandas._libs.tslibs import iNaT -from pandas._typing import JSONSerializable, StorageOptions +from pandas._typing import CompressionOptions, JSONSerializable, StorageOptions from pandas.errors import AbstractMethodError from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments @@ -19,7 +19,12 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.reshape.concat import concat -from pandas.io.common import get_filepath_or_buffer, get_handle, infer_compression +from pandas.io.common import ( + get_compression_method, + get_filepath_or_buffer, + get_handle, + 
infer_compression, +) from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import _validate_integer @@ -41,7 +46,7 @@ def to_json( date_unit: str = "ms", default_handler: Optional[Callable[[Any], JSONSerializable]] = None, lines: bool = False, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", index: bool = True, indent: int = 0, storage_options: StorageOptions = None, @@ -369,7 +374,7 @@ def read_json( encoding=None, lines: bool = False, chunksize: Optional[int] = None, - compression="infer", + compression: CompressionOptions = "infer", nrows: Optional[int] = None, storage_options: StorageOptions = None, ): @@ -607,7 +612,9 @@ def read_json( if encoding is None: encoding = "utf-8" - compression = infer_compression(path_or_buf, compression) + compression_method, compression = get_compression_method(compression) + compression_method = infer_compression(path_or_buf, compression_method) + compression = dict(compression, method=compression_method) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( path_or_buf, encoding=encoding, @@ -667,10 +674,13 @@ def __init__( encoding, lines: bool, chunksize: Optional[int], - compression, + compression: CompressionOptions, nrows: Optional[int], ): + compression_method, compression = get_compression_method(compression) + compression = dict(compression, method=compression_method) + self.orient = orient self.typ = typ self.dtype = dtype @@ -687,6 +697,7 @@ def __init__( self.nrows_seen = 0 self.should_close = False self.nrows = nrows + self.file_handles: List[IO] = [] if self.chunksize is not None: self.chunksize = _validate_integer("chunksize", self.chunksize, 1) @@ -735,8 +746,8 @@ def _get_data_from_filepath(self, filepath_or_buffer): except (TypeError, ValueError): pass - if exists or self.compression is not None: - data, _ = get_handle( + if exists or 
self.compression["method"] is not None: + data, self.file_handles = get_handle( filepath_or_buffer, "r", encoding=self.encoding, @@ -816,6 +827,8 @@ def close(self): self.open_stream.close() except (IOError, AttributeError): pass + for file_handle in self.file_handles: + file_handle.close() def __next__(self): if self.nrows: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index eee6ec7c9feca..fc1d2e385cf72 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,9 +1,9 @@ """ pickle compat """ import pickle -from typing import Any, Optional +from typing import Any import warnings -from pandas._typing import FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions from pandas.compat import pickle_compat as pc from pandas.io.common import get_filepath_or_buffer, get_handle @@ -12,7 +12,7 @@ def to_pickle( obj: Any, filepath_or_buffer: FilePathOrBuffer, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, storage_options: StorageOptions = None, ): @@ -114,7 +114,7 @@ def to_pickle( def read_pickle( filepath_or_buffer: FilePathOrBuffer, - compression: Optional[str] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): """ diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7a25617885839..ec3819f1673a8 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import FilePathOrBuffer, Label, StorageOptions +from pandas._typing import CompressionOptions, FilePathOrBuffer, Label, StorageOptions from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -1938,9 +1938,9 @@ def read_stata( def _open_file_binary_write( fname: FilePathOrBuffer, - compression: Union[str, Mapping[str, str], None], + compression: 
CompressionOptions, storage_options: StorageOptions = None, -) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]: +) -> Tuple[BinaryIO, bool, CompressionOptions]: """ Open a binary file or no-op if file-like. @@ -1978,17 +1978,10 @@ def _open_file_binary_write( # Extract compression mode as given, if dict compression_typ, compression_args = get_compression_method(compression) compression_typ = infer_compression(fname, compression_typ) - path_or_buf, _, compression_typ, _ = get_filepath_or_buffer( - fname, - mode="wb", - compression=compression_typ, - storage_options=storage_options, + compression = dict(compression_args, method=compression_typ) + path_or_buf, _, compression, _ = get_filepath_or_buffer( + fname, mode="wb", compression=compression, storage_options=storage_options, ) - if compression_typ is not None: - compression = compression_args - compression["method"] = compression_typ - else: - compression = None f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False) return f, True, compression else: diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 902a3d5d2a397..bc14b485f75e5 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -1,7 +1,10 @@ +import io import os +from pathlib import Path import subprocess import sys import textwrap +import time import pytest @@ -130,6 +133,46 @@ def test_compression_binary(compression_only): ) +def test_gzip_reproducibility_file_name(): + """ + Gzip should create reproducible archives with mtime. + + Note: Archives created with different filenames will still be different! 
+ + GH 28103 + """ + df = tm.makeDataFrame() + compression_options = {"method": "gzip", "mtime": 1} + + # test for filename + with tm.ensure_clean() as path: + path = Path(path) + df.to_csv(path, compression=compression_options) + time.sleep(2) + output = path.read_bytes() + df.to_csv(path, compression=compression_options) + assert output == path.read_bytes() + + +def test_gzip_reproducibility_file_object(): + """ + Gzip should create reproducible archives with mtime. + + GH 28103 + """ + df = tm.makeDataFrame() + compression_options = {"method": "gzip", "mtime": 1} + + # test for file object + buffer = io.BytesIO() + df.to_csv(buffer, compression=compression_options, mode="wb") + output = buffer.getvalue() + time.sleep(2) + buffer = io.BytesIO() + df.to_csv(buffer, compression=compression_options, mode="wb") + assert output == buffer.getvalue() + + def test_with_missing_lzma(): """Tests if import pandas works when lzma is not present.""" # https://github.com/pandas-dev/pandas/issues/27575 From 07ab8340223ca366cb88a7427e4f0cd54cda6f96 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 14 Aug 2020 02:50:51 +0100 Subject: [PATCH 0495/1025] REGR: Dataframe.reset_index() on empty DataFrame with MI and datatime level (#35673) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/frame.py | 2 +- .../tests/frame/methods/test_reset_index.py | 30 +++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index b37103910afab..98d67e930ccc0 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -22,6 +22,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input 
in-place only operated on a single row (:issue:`35462`) +- Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 547d86f221b5f..1587dd8798ec3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4816,7 +4816,7 @@ def _maybe_casted_values(index, labels=None): # we can have situations where the whole mask is -1, # meaning there is nothing found in labels, so make all nan's - if mask.all(): + if mask.size > 0 and mask.all(): dtype = index.dtype fill_value = na_value_for_dtype(dtype) values = construct_1d_arraylike_from_scalar( diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index da4bfa9be4881..b88ef0e6691cb 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -318,3 +318,33 @@ def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype): result = DataFrame(index=idx)[:0].reset_index().dtypes expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype}) tm.assert_series_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex(): + # https://github.com/pandas-dev/pandas/issues/35606 + idx = MultiIndex( + levels=[[pd.Timestamp("2020-07-20 00:00:00")], [3, 4]], + codes=[[], []], + names=["a", "b"], + ) + df = DataFrame(index=idx, columns=["c", "d"]) + result = df.reset_index() + expected = DataFrame( + columns=list("abcd"), index=RangeIndex(start=0, stop=0, step=1) + ) + expected["a"] = 
expected["a"].astype("datetime64[ns]") + expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(result, expected) + + +def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): + # https://github.com/pandas-dev/pandas/issues/35657 + df = DataFrame(dict(c1=[10.0], c2=["a"], c3=pd.to_datetime("2020-01-01"))) + df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum() + result = df.reset_index() + expected = DataFrame( + columns=["c2", "c3", "c1"], index=RangeIndex(start=0, stop=0, step=1) + ) + expected["c3"] = expected["c3"].astype("datetime64[ns]") + expected["c1"] = expected["c1"].astype("float64") + tm.assert_frame_equal(result, expected) From c70f72e48b48deb79ade102c1b2fc4e32cc168b3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 14 Aug 2020 13:32:00 +0100 Subject: [PATCH 0496/1025] CLN: remove extant uses of built-in filter function (#35717) --- pandas/_config/localization.py | 14 ++++++++------ pandas/core/computation/expr.py | 7 +++---- pandas/core/reshape/merge.py | 7 +++++-- pandas/io/json/_json.py | 5 +++-- pandas/io/parsers.py | 4 +--- pandas/io/pytables.py | 20 +++++++++----------- pandas/tests/computation/test_eval.py | 26 +++++++++++++++----------- 7 files changed, 44 insertions(+), 39 deletions(-) diff --git a/pandas/_config/localization.py b/pandas/_config/localization.py index 66865e1afb952..3933c8f3d519c 100644 --- a/pandas/_config/localization.py +++ b/pandas/_config/localization.py @@ -88,12 +88,14 @@ def _valid_locales(locales, normalize): valid_locales : list A list of valid locales. 
""" - if normalize: - normalizer = lambda x: locale.normalize(x.strip()) - else: - normalizer = lambda x: x.strip() - - return list(filter(can_set_locale, map(normalizer, locales))) + return [ + loc + for loc in ( + locale.normalize(loc.strip()) if normalize else loc.strip() + for loc in locales + ) + if can_set_locale(loc) + ] def _default_locale_getter(): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index fcccc24ed7615..125ecb0d88036 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -167,10 +167,9 @@ def _is_type(t): # partition all AST nodes _all_nodes = frozenset( - filter( - lambda x: isinstance(x, type) and issubclass(x, ast.AST), - (getattr(ast, node) for node in dir(ast)), - ) + node + for node in (getattr(ast, name) for name in dir(ast)) + if isinstance(node, type) and issubclass(node, ast.AST) ) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 2349cb1dcc0c7..01e20f49917ac 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -2012,8 +2012,11 @@ def _sort_labels(uniques: np.ndarray, left, right): def _get_join_keys(llab, rlab, shape, sort: bool): # how many levels can be done without overflow - pred = lambda i: not is_int64_overflow_possible(shape[:i]) - nlev = next(filter(pred, range(len(shape), 0, -1))) + nlev = next( + lev + for lev in range(len(shape), 0, -1) + if not is_int64_overflow_possible(shape[:lev]) + ) # get keys for the first `nlev` levels stride = np.prod(shape[1:nlev], dtype="i8") diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index c2bd6302940bb..fe5e172655ae1 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -765,8 +765,9 @@ def _combine_lines(self, lines) -> str: """ Combines a list of JSON objects into one JSON object. 
""" - lines = filter(None, map(lambda x: x.strip(), lines)) - return "[" + ",".join(lines) + "]" + return ( + f'[{",".join((line for line in (line.strip() for line in lines) if line))}]' + ) def read(self): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9dc0e1f71d13b..5d49757ce7d58 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2161,9 +2161,7 @@ def read(self, nrows=None): if self.usecols is not None: columns = self._filter_usecols(columns) - col_dict = dict( - filter(lambda item: item[0] in columns, col_dict.items()) - ) + col_dict = {k: v for k, v in col_dict.items() if k in columns} return index, columns, col_dict diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2abc570a04de3..f08e0514a68e1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -99,22 +99,20 @@ def _ensure_str(name): def _ensure_term(where, scope_level: int): """ - ensure that the where is a Term or a list of Term - this makes sure that we are capturing the scope of variables - that are passed - create the terms here with a frame_level=2 (we are 2 levels down) + Ensure that the where is a Term or a list of Term. 
+ + This makes sure that we are capturing the scope of variables that are + passed create the terms here with a frame_level=2 (we are 2 levels down) """ # only consider list/tuple here as an ndarray is automatically a coordinate # list level = scope_level + 1 if isinstance(where, (list, tuple)): - wlist = [] - for w in filter(lambda x: x is not None, where): - if not maybe_expression(w): - wlist.append(w) - else: - wlist.append(Term(w, scope_level=level)) - where = wlist + where = [ + Term(term, scope_level=level + 1) if maybe_expression(term) else term + for term in where + if term is not None + ] elif maybe_expression(where): where = Term(where, scope_level=level) return where if where is None or len(where) else None diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 08d8d5ca342b7..853ab00853d1b 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -168,7 +168,7 @@ def setup_ops(self): def setup_method(self, method): self.setup_ops() self.setup_data() - self.current_engines = filter(lambda x: x != self.engine, _engines) + self.current_engines = (engine for engine in _engines if engine != self.engine) def teardown_method(self, method): del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses @@ -774,11 +774,9 @@ def setup_class(cls): cls.parser = "python" def setup_ops(self): - self.cmp_ops = list( - filter(lambda x: x not in ("in", "not in"), expr._cmp_ops_syms) - ) + self.cmp_ops = [op for op in expr._cmp_ops_syms if op not in ("in", "not in")] self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = [s for s in expr._bool_ops_syms if s not in ("and", "or")] + self.bin_ops = [op for op in expr._bool_ops_syms if op not in ("and", "or")] self.special_case_ops = _special_case_arith_ops_syms self.arith_ops = _good_arith_ops self.unary_ops = "+", "-", "~" @@ -1150,9 +1148,9 @@ def eval(self, *args, **kwargs): return pd.eval(*args, **kwargs) def test_simple_arith_ops(self): - 
ops = self.arith_ops + ops = (op for op in self.arith_ops if op != "//") - for op in filter(lambda x: x != "//", ops): + for op in ops: ex = f"1 {op} 1" ex2 = f"x {op} 1" ex3 = f"1 {op} (x + 1)" @@ -1637,8 +1635,11 @@ def setup_class(cls): super().setup_class() cls.engine = "numexpr" cls.parser = "python" - cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms - cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops) + cls.arith_ops = [ + op + for op in expr._arith_ops_syms + expr._cmp_ops_syms + if op not in ("in", "not in") + ] def test_check_many_exprs(self): a = 1 # noqa @@ -1726,8 +1727,11 @@ class TestOperationsPythonPython(TestOperationsNumExprPython): def setup_class(cls): super().setup_class() cls.engine = cls.parser = "python" - cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms - cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops) + cls.arith_ops = [ + op + for op in expr._arith_ops_syms + expr._cmp_ops_syms + if op not in ("in", "not in") + ] class TestOperationsPythonPandas(TestOperationsNumExprPandas): From 87f652befb50305ea07c7b354faa818711fa95f8 Mon Sep 17 00:00:00 2001 From: attack68 <24256554+attack68@users.noreply.github.com> Date: Fri, 14 Aug 2020 14:36:09 +0200 Subject: [PATCH 0497/1025] BUG: Styler cell_ids fails on multiple renders (#35664) --- doc/source/whatsnew/v1.1.1.rst | 2 +- pandas/io/formats/style.py | 14 +++++++------- pandas/tests/io/formats/test_style.py | 5 ++++- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 98d67e930ccc0..3f177b29d52b8 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -33,7 +33,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`). 
+- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`). Categorical ^^^^^^^^^^^ diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 584f42a6cab12..3bbb5271bce61 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -390,16 +390,16 @@ def format_attr(pair): "is_visible": (c not in hidden_columns), } # only add an id if the cell has a style + props = [] if self.cell_ids or (r, c) in ctx: row_dict["id"] = "_".join(cs[1:]) + for x in ctx[r, c]: + # have to handle empty styles like [''] + if x.count(":"): + props.append(tuple(x.split(":"))) + else: + props.append(("", "")) row_es.append(row_dict) - props = [] - for x in ctx[r, c]: - # have to handle empty styles like [''] - if x.count(":"): - props.append(tuple(x.split(":"))) - else: - props.append(("", "")) cellstyle_map[tuple(props)].append(f"row{r}_col{c}") body.append(row_es) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 3ef5157655e78..6025649e9dbec 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1684,8 +1684,11 @@ def f(a, b, styler): def test_no_cell_ids(self): # GH 35588 + # GH 35663 df = pd.DataFrame(data=[[0]]) - s = Styler(df, uuid="_", cell_ids=False).render() + styler = Styler(df, uuid="_", cell_ids=False) + styler.render() + s = styler.render() # render twice to ensure ctx is not updated assert s.find('') != -1 From 39daad3df5c26165b20c736a0bd1f9d427f3983d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 14 Aug 2020 16:35:21 +0200 Subject: [PATCH 0498/1025] REGR: fix DataFrame.diff with read-only data (#35707) Co-authored-by: Jeff Reback --- doc/source/whatsnew/v1.1.1.rst | 3 ++- pandas/_libs/algos.pyx | 7 ++++--- pandas/tests/frame/methods/test_diff.py | 9 +++++++++ setup.py | 3 +++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst 
b/doc/source/whatsnew/v1.1.1.rst index 3f177b29d52b8..85e2a335c55c6 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -16,10 +16,11 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) -- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`). +- Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) - Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) +- Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 7e90a8cc681ef..0a70afda893cf 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1200,14 +1200,15 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) def diff_2d( - diff_t[:, :] arr, - out_t[:, :] out, + ndarray[diff_t, ndim=2] arr, # TODO(cython 3) update to "const 
diff_t[:, :] arr" + ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, ): cdef: Py_ssize_t i, j, sx, sy, start, stop - bint f_contig = arr.is_f_contig() + bint f_contig = arr.flags.f_contiguous + # bint f_contig = arr.is_f_contig() # TODO(cython 3) # Disable for unsupported dtype combinations, # see https://github.com/cython/cython/issues/2646 diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 45f134a93a23a..0486fb2d588b6 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -214,3 +214,12 @@ def test_diff_integer_na(self, axis, expected): # Test case for default behaviour of diff result = df.diff(axis=axis) tm.assert_frame_equal(result, expected) + + def test_diff_readonly(self): + # https://github.com/pandas-dev/pandas/issues/35559 + arr = np.random.randn(5, 2) + arr.flags.writeable = False + df = pd.DataFrame(arr) + result = df.diff() + expected = pd.DataFrame(np.array(df)).diff() + tm.assert_frame_equal(result, expected) diff --git a/setup.py b/setup.py index 43d19d525876b..f6f0cd9aabc0e 100755 --- a/setup.py +++ b/setup.py @@ -456,6 +456,9 @@ def run(self): if sys.version_info[:2] == (3, 8): # GH 33239 extra_compile_args.append("-Wno-error=deprecated-declarations") + # https://github.com/pandas-dev/pandas/issues/35559 + extra_compile_args.append("-Wno-error=unreachable-code") + # enable coverage by building cython files by setting the environment variable # "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext # with `--with-cython-coverage`enabled From a1ce997fca3eaaa1f525c7eeefe238d435c4c863 Mon Sep 17 00:00:00 2001 From: Jiaxiang Date: Sat, 15 Aug 2020 00:17:37 +0800 Subject: [PATCH 0499/1025] [BUG] fixed DateOffset pickle bug when months >= 12 (#35258) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/tslibs/offsets.pyx | 7 - .../1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle | Bin 0 -> 127216 bytes 
.../tests/io/generate_legacy_storage_files.py | 8 +- .../offsets/data/dateoffset_0_15_2.pickle | 183 ------------------ pandas/tests/tseries/offsets/test_offsets.py | 25 +-- 6 files changed, 14 insertions(+), 210 deletions(-) create mode 100644 pandas/tests/io/data/legacy_pickle/1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle delete mode 100644 pandas/tests/tseries/offsets/data/dateoffset_0_15_2.pickle diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6612f741d925d..a3bb6dfd86bd2 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -172,6 +172,7 @@ Datetimelike ^^^^^^^^^^^^ - Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) +- Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) - Timedelta diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index ac2725fc58aee..7f0314d737619 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -989,13 +989,6 @@ cdef class RelativeDeltaOffset(BaseOffset): state["_offset"] = state.pop("offset") state["kwds"]["offset"] = state["_offset"] - if "_offset" in state and not isinstance(state["_offset"], timedelta): - # relativedelta, we need to populate using its kwds - offset = state["_offset"] - odict = offset.__dict__ - kwds = {key: odict[key] for key in odict if odict[key]} - state.update(kwds) - self.n = state.pop("n") self.normalize = state.pop("normalize") self._cache = state.pop("_cache", {}) diff --git a/pandas/tests/io/data/legacy_pickle/1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle b/pandas/tests/io/data/legacy_pickle/1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle new file mode 100644 index 
0000000000000000000000000000000000000000..f8df9afff65658d0f58a5b2afb30997cea53e4c8 GIT binary patch literal 127216 zcmdqq1z6n7qp0y@0u-oHcNc2Z-AkQ9-G!oEwpfw6JMGq~ySux)ySux)+nwybTd4Q@ z&i|Zy&z(NqAK7F>GM>qV?O`+On2%x(U89{%cHw?u0e<1p`p>rhfgxTI(Wb;V{aS~9N`xh>=hX9666)^)!HvCT<)!2 zKyd3ozlwn&K5g7B??#(<;@l$Il=9n8`}%e83wQDM3ipdPIeG^BwfC$ZEZ>vcn^OI3 z%P_Ct*7~M(b*-D~r(zTA733G~VX_U6@Cu8Fc2i6?;SnLB(QdZ-4}S6oHL_oH-Dod) z49S8cgF-u6&Ne7AFe1P!EX=E8v?-ovn4eEbaCk&mq)&u?2)5=8qfK#S?3gzhDR1YoRj~ZWOFu%Ad5z*dJ zuFjU^u)aPm$4})g+T+{q(Q-K_GD4#b2~2k0di;k+w=lnL*Ul?2QeTnXWG_?EQ?8RKW$zssAWuJ7{%9K>7~rFy zft%uH>!#`_5a|^eE`Ptt##5eLv_UmFcn0}Jc=>uoc*(sQl6#qKy&@uHig%R@9+BQA zP3DGZgPkS27#s}oV@GLDlL&|vk%qj#jYgIDv-17&Uqz## zs2;_|4eo{#<~TEyHqilM6Y#BGBDI{5ki=T5y=tW!6sUB1gh z<@7w%BHFpC^>p3TuZ7YxFu*(9B|;{qPC$^KEDk}TGJPHDTNh*+542a5A%_o+@8jg* zp~t#O^A8D$u1dC3{Qf4_S)(g3Cc47^GP*>B%!of^6H~gM6?mE3{LJSe>lJhTP>DCJ zh2JpJ6#x78%oTz;pPJIf-l@3|hr7hu=hu%`pnOx;*?O8k{$u8JlsQ#qPWPA;viTu% z%3tp)WTBGPgZvi}-A~VdQLeQWW%u3lcBc|6O0rHN4Rn3u+RW5lF>&m3UEz|q!f9!p zE!&3bPOaVWvhI^>_w$^QmJRRTS$D8a(*0)=>$^4Cdisa?wT(8V;Y7d50U`eW;eIl< zOKay0)CS34NgCA>dY;ZZ0g+g7Rs92^_*=N=-19KP_NmY zep&8pr86o_d|4IO^g z2FM(ys;YVKKii2>ktY@9YAaENzLlTJAv`23A|%Y$FHG+)=-md(F+4E5^80np*AFPq zU$@+e%Im+&(sEzYW9*7Kv8=Io$B^9c;JX-oCf|Eye$UVF_E&N8^>qH>NgEwyGrit@j! 
z{9h^BSV|V(%3gZHns}YsuP$ zH>~`&sxdbLvOUy0ud=z(x0k$JkguJh4LxEf zrzwuP_{knqlK7$%S3 zl+fCTtKamytY;G$93Zbz^$wL`mB}WwqdWnV@C=pR8E2DYgL?TYd&vvDu;Az#Q`dj_ zA}{>3n(p;wbjk}D@1N(FSmzcKr>nn-(+z(gr*@&TsSWw(y?;~cpY_<-nJKlWXK2R` zo{_PlgzNh6aLNE!|EZFCT-SykN9m zF&jV0hz^hJYi%t5BJ*GWCZQhxbwWkT;f^qUkU)?Pu<3$g73e;n!mXmaTrZK2dqg z`dDKL|AmA0iYYHCWuHH$!ONyESsFa?UHxiu&@ZJdF>I(T&sknJ%d3v+#>Db7bn_jI z$t??VxZuASt+js>tv|bTu|~4YN8beNvOF$x_|@{)iu*qkt>yKIv<^TR-SwrPN2{@< zzOJ&dl#V4z4gWGqZGOzDF`b@o^6an2p=7N8q#G~C$1J(|AB}kZ*b}hqN3W!;X=F`~ zUoD#}sUNNgEc>yv%MYf$8<49ov8L zF}+h9`(u7FHG@A{*jcY=OB>t83>-q{#0>08Dd@1{avZ=CS%LkUA|aKlWd6s!sOU0mw><9$Jg$( z;ep9E!Vu=?7cH+-&ACG#%9q2{zFbfL$F3TMBFvp_zi+P^jhAHX8}G^&%VqL^ro>yW zl>bSImj_@D-;XrtCG~&4l*P*T`}N!@t8ViBXMNF9cYmAjZRC-gSH$G|U*}?DF+J=2 zyQcDM-ihgf{BQ~To56V*nLnnBW~tE!xLSLOq{g56A4$twHcW|WF(YQite730F&E~E z8Cm(OV)L6*kJXvUsVB+TR=0#Tft0V+jOA9y&#D=derfC}hsbjyWDzi$^q0Z<%Mkr#sCZ}n=f{y+U~{=fZWHn-&eht?I} z7mVlD72kKQ-&$7)H#;|B?xp{x#Ke>1h?YVV(JB1j7MhsT{=3@syHfI3W0Kaf$lq(e z$Xo0o2eo#~xVN;Z#(jE~zhAx>RvC{;^wGq>8VKDw$TQU&Es636|M*J}$zSSh(p)Ro zk9JR#Kp)uDf3yy4{?F(ASoyH1yull*t6NE5{8?Am!@43SAO0p+#kc0Ff^v}hUuUbB zHt=heH0f&ahHTg_y`{B?#_K5d=#NoMFj_{R)=KxSw;o2Ws3S2;4A(=tB) zKc9{NvgZHI07tiPbHB-&$zwVjmRfkUbwy0>r+Z>u@i*fba@^W{Ti-lxU0vqfU)_S& zr>6XDgi|)6<{)c$bCAE>9RA7V4C|d(lZ_nAl|vB!&rZ}ZK9Z+r9#=75*6VJkueVvg znIQ4qy^?S4R2%Qf4;mlJmp|S*u|~NKacqlr7zgdq0Ua?e#>4n%E=ntnZ}c-*VzPG+ z363y)y`iGt5!Y|0Si9zm{E$(|)HL5mkttEQ$;lv*F>|i8ALd-i>(E%oE#u7` zh)(4{lJy<*d1$_VfpYqz%vRsrfj1XWIq&A{7V>U-0)57vdGl{_*SFK$xOMmQ&sp!_ zaf$zvTX)1vP)yAHli6rQQtjp`XXp_&b#4ACdPF@I z&JoA;30wL5&5w*L?Qwm3%J|ZwWiM<`+PnI0(flX-J%oEt04 z-)4&c{oAr0HrWLF`O91V|CcLN+pvJvhQB(D|5T=Slxe1i*oD$szek$lgv^DwsgqraMLW&d@Q zOP;l=3ZZG{`1)ZuEcS+Sl9~Qb-|!EE=izc1TBv8tRD6T>7jrTaek~SqW|=8N>>JqB zBjKSk<@{XCOL7iSJ@ewX)rcv+r)4Woa}kO*CHZC=f@PhYIB34*Z7oG^34b~cXr7{1 zIMx)spH9=(>kcKJHN*W7fiiiF3uPgVa<$i|hw4?f9)ae0$qB5-YMs9R!>1iAEmk%8 zsaWBkjz-6fGMf{5nunc=S|#$#Qv>@VlGqv1g*2;&_vxWKBEdC;91bI&BGMe9HTya&-Sl`h68% 
z$@`$`m$n$D8&u)>W!;U(`{jKtjq97Hi|*SL=UTs!R-G5LT6c6wVf$8k>%i!Gx`kWh zp01n8>E7UX($MK$FY0b7lsNZ$X@Xs|JLa`A_(_rE9ojaT)AQKO8vq(4d21hQXZNz|A! zb4qR+Q)V9MOMg*j9K0jKzm}RvlfA5!t+Rv-+P@qPj%m5R9f2@UBK>9<;%`z!@8ii5 zDf{IA*%E24=@$PfrVCkPTE9^Lq5RB{v23!JovW`6h2xJoKF-$)KCRxp)Vr4QV~XCj z)GK_wdC$WZ6idqEbo>2T>BpmY&h)11 zfB9L-+KGxUtM!cj3u&g;b#g?a+JCC&`m;1_Y_99v%%fR;E;wVt+ZB_Wcc!SYycOe))g@~JAOU-9B%ZL*}!~VD6c7t|5t9V>s6IyZkXyNA87EF|K+1Ns;~Sn zyDqA~{NI{u7|Zhwf4JkWSD<Gs}epeqV zZyZ&ze1@1D{GTeS){9Jipg<3L7fKKMfPo(5fPpzA#6&1Ng61F}ax(`x>TC{4F&QSu z6lgBT^5;}6r^Ymx7SmyRw4QoKmNQ{y%z{}l8)nBG=!`ir7v{!1m=|3zA6hGt0xTEA zLRc7!U{NfF)^@Q3%O$ZCmc}wz7QcJ+SgFAK710$dVP&j>*4wkz3$W_EUjyB+Cc2{s z)7wchtY=8~15jMsq*c6*#b8LYv(Gy#t7kZ-)`l28DV{0^E02;9k24WBf zV+e*~TMWZ+jKD~2hwZTgcEnED8M|Ot?1tU32lm8X*c*d2I0nb!I2?}?a3W5^$v6e4;xwF&GjJx(!r3?n=i)q^j|*@iF2cpQ z1efA6T#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN!rizB_u@X>j|cD|9>T+T z1drk|JdP*uB%Z?4cm~hnIXsUS@FHHq%XkH^;x)XEH}EFj!rOQU@8UhYj}P!6KElWN z1fSwFe2y>hCBDMf_y*tNJA98H@FRZ0&-ew+7clZ?Rkge!=k}R{hBj!6b{Gfk(E;@b zPW7Naa;k@T7#|a0LbN`7?8I_nOoB-<879XRm=aTAYD|M^F&(DI444r!VP?#N=F1^@ z9@$vVjycd7b7C&cjd?IHx?n!cj|H$G7Q(_<1dF0RJy;LLu>_Vx{kc>!}C*0aZP;Mg5^j zJ%pn^yH^j9*bduc2keNQurqeSuBhKr(S!a_f*yKcPwa)gu@Cmee%K!eU=$9-L1@Ck zI0T2{FdU8}a3n_KC>)Jra4e3)@i+k|;v}4mQ*bIy!|6B!XW}fJjdO4=&cpe*02ksS zT#QR_DK5k1xB^$=DqM|ga4oLG^|%2y;wIdTTW~9G!|k{Ocj7MGjeBq}?!*0f01x6J zJd8*1C?3P(cmhx2DLjp5@GPFg^LPO-;w8L{SMVxc!|QkhZ{jVyjd$=a-oyL&03YHb ze2h=_xJ%n;wSu!U(h^wDeD1Mn5zRtpnUS)9RAZYToSO~ zgqR4OP=De@4@od7CPV#sQ9YzU{c%)1q{7sg2Ge3XOph5bBWA+Pm<6+<{-CQKvSSW( z#+;Z7b7LONi!PWC^J4)lh=s5)7Qv!e42xq4)SnL3gZ_A^9!g^wEQ{r^JXXMp=!%uF zGFHK=SPiRV4Rphrs6WxF2M?@;wXqJ?#d=sD8(>3hgpIKYHpOPx99v*Z^u$)^h2H3c zzUYVgLymfAjRp)rBeuan48mXx!BA|AVHl1P7>W8*m3nB89k3&I!p^8aY^jH?*bTd5 z5A2D(us8O>zSs}@;{c4pfj9_FI2ecEP#lKCaRiRUXdH#3aSV>daX20);6$8+lW_`8 z#c4PlXW&eng|l%E&c%5+9~a<4T!f2p2`Lkg}ZSN?!|q$9}nO`JcNhw2p+{_cpOjQNj!z8@eH2Db9f#v;6=QIm+=Z- z#cOySZ{SV5g}3nz-o<-(A0OaDe1wnj2|mSV_#9v0OMHc|@eRJkclaJZ;79y~pYaP? 
zr%0-rrJfL|q5jaS9&FJLzU{Xwm$uR|5|L|BUmQ!OI zG~arb$DfYn^q2uNVkXRtSuiVR!|a#?ol$?1R1di@H|D{-=z{q$KNi4(SO^Pa5iE+u zusG_^uIiyAmcr6l2FqeOERPkiBD!KFtc>QFqVo4tWw{zw#~SE{HPIbCuol+FI#?I= zhd}jE9~)ppY=n)m2{y%M*c@A6OZ3E6s6S?^2XFL2U-UzNY>fsCKqI!nKn%iQ48c%r zi(wd!5vV_hsfTvh9y?%1?1Y`M3wFhB*d2RdPwa)gu@Cmee%K!eU=$9-L1@CkI0T2{ zFdU8}a3n_KC>)Jra4e3)@i+k|;v}4mQ*bIy!|6B!XW}fJjdO4=&cpe*02ksST#QR_ zDK5k1xB^$=DqM|ga4oLG^|%2y;wIdTTW~9G!|k{Ocj7MGjeBq}?!*0f01x6JJd8*1 zC?3P(cmhx2DLjp5@GPFg^LPO-;w8L{SMVxc!|QkhZ{jVyjd##IuY)HXet$dPa@$@1 zgFmtxz(KvchR~4aXa@v%woLm$~PM2~}9F@58hI)K?6Fs4lNO4jUD@l~3N-`z6 zl0r!-Z-J*)(kN+_bV_<9gOX9nq-0jIC|Q+kN_HiO;;iIUaw)l$JW5{0Maie+R|+Tv zl|o8krHE2gDW()xN+>0jQc7v1j8ax9r<7MJC>0e~rIJ!vsUn}ot)^60YRKu9H5GTo zL#d_IR_Z8qm3m5jrGe5=X{0n(nkY?`W=eCVh0;>-R9Y!sinrpU_$q#iztUPUC;^I5 zX`=)xK}xU^qJ%1Kl`th-iBKYyc1nAtgVIsyq;yugC|#9qN_VA)(o^ZB^j7*PeU*Mn ze`SCYr3_RCDJEsGGDI1w3{!?HBb1R!v@%K=t&CB|D&v&#$^>PiGD(@NOi`vP)0FAT z3}vP=OPQ_AQRXW1l=;d6WudZ2S*$EkmMY7X<;n_WrLsy{t*lYjD(jT>$_8blvPs#j zY*Dr<+m!9f4rQmZOWCdLQT8hPl>N#9<)Cs%IjkH}jw;8L5p9<)QLOd8|B9o+{6j=gJG^rSeL7 zt-Mj*D({r{$_M46@=5uu$lEjW&zlNWQ*BgR)lQA0+N%z#qZ(I@r^Z(ks0r0Xs*{>n zO`;}Mlc~wo6lzK}m6}>jqo!5Usp-`WYDP7anpw@FW>vGP+0`7Xvzk-QrRG-isCiWv zHJ_SaEua=u3#o6h)b?rzwWHce?W}fDyQHsxL9jFdcP3mBEh&ogqrVdv}s3X;Ab(A_<9ixs_$EoAh3F<_3k~&$P zqE1z(sngXN>P&T(I$NEi&Q<5B^VJ3FLUob4SY4tnRhOyD)fMVWb(OkWU8Am5*Qx8( z4eCa9le$^mqHa~UsoT{Z>P~f+x?A0&?p61x`_%*LLG_S&SUsX1RgbC1)f4JT^^|&A zJ)@pg&#C9t3+hGnl6qOaqFz<6sn^vT>P_{QdRx7t-c|3Z_tgjLL-mpRSbd^CRiCNP z)fehZ^_BWseWSir->L7_59&wtllobe!*fDDd>}MoBW#78h$HNUgK!jaMLZE-BoGNj zBH<(wizFhcNG6ht6e6WaB~pttBCSX#(u)itqsSyOi!36m$R@Ij9Ku=T6uCrhkw@ee zE+U`EFA9i)qL3&oiio13m?$nvh?1g|C@so}vZ9a%Ra6tzMGfI5 zY6^GZA!>=*qK>F5>WTWIfoLciiN>OdXeyeC=AwmYDLh3h;U&CVPd!#Ax4U5F-nXUW5if7PK*~5#6&SkOcqnbR549V7c<06F-y!A zbHrRRPs|q!#6q!1EEY?|Qn5@d7c0a{u}Z8KYs6ZyPOKLj#7416Y!+L@R5#6fXL92Q5!QE^Nh7bnC?aY~#PXT(`?PMjAP#6@vQTozZvRdG#R z7dOOBaZB75cf?(BPuv#|#6$5&JQh#HQ}IkZ7caz1@k+cFZ^T>iPP`W%#7FT-d=~QZ 
zOj9-a@QG%l*=lxL9L-*H&>Xe6T0AYjmOx9WCDNR<#99(9sg_JjuBFgYYN@o;S{f~_ zmQG8rWzaHenY7GW7A>omP0Oz3(44iLS}rZOmPgC0xoG*c{8|C6pjJpLtQFCUYQ?nT zS_!SBR!S?amC?#-<+Soz1+Ajys#Ve|YgM$WS~ab@Rzq{sYHIG9hgM6gt<}-$YW1}G zS_7@2)<|otHPM=C&9vrP3$3N*skPF)G;hsE^VR$`f33A<&;m50)n<2d$&lN$ae2(Yk8gwC-9Dt*6#Y>#gS7{Mrb3oXl;}>S{tK{)y8S#wF%lpZIU)wo1#tCrfJi)8QM&3mNr|Pqs`Uk zY4f!O+CpuSwpd%DE!CE3%e58SN^O<4T3e&7)z)e2wGG-vZIiZH+oEmNwrSh79okN9 zm$qBmqwUr9Y5TPU+ClA*c33;29o3F$$F&pMN$r$&T05hi)y`?>wF}xs?UHs`yP{py zu4&h`8`@3nmUdgaV}6Q=se6SA{_ClGiE>z`?)4~c9luv(Kfw~dyP`KwxYpO$vfd8O zoeTN1JWJh5EG#jCdMR~6q8|cOo1se6{f~Cm=@Dvddz?sF%xFSESMFu zVRp=c&X^N(VQ$QWdC>*)VSX%t1+fqo#v)i0i(zprfhDmNmc}wz7RzCItbi5K6)Rz7 ztb$ds8dk>|=!P}X9X+rX*2X$m7wchtY=8~15jMsq*c6*#b8LYv(Gy#t7kZ-)`l28D zV{0^E02;9k24WBfV+e*~TMWZ+jKD~2hwZTgcEnED8M|Ot?1tU32lm8X*c*d2I0nb!I2?}?a3W5^$v6e4;xwF&GjJx(!r3?n z=i)q^j|*@iF2cpQ1efA6T#hSnC9cBNxCYnaI$Vz%a3gNQ&A0`(;x^olJ8&oN!rizB z_u@X>j|cD|9>T+T1drk|JdP*uB%Z?4cm~hnIXsUS@FHHq%XkH^;x)XEH}EFj!rOQU z@8UhYj}P!6KElWN1fSwFe2y>hCBDMf_y*tNJA98H@FRZ0&-ewc4Zo^d-Vmsv4cej| z#zA{@Ku3&=@i0Cnz=W6xoiH&b!K9cBlVb`@iK#F(roptB4%1@>%!rvVGiJf8m<_XI z4s^zxm0#-yj ztPAsC8nF$}{o0wb{9Zm2oK{CJc`HgIG(_hcnVMB89a;U@H}3?i+Bky;}yJ$*YG;tz?*mrZ{r=j zi}&z8KEQ|g2p{7Ue2UNTIljP`_zGX+8+?oJ@I8LOkN62c;}?{-2h2f5fg0MNE!trm zv_}VY#JCs_<6{C$h>6e%6JrugipelJrofb#3R7bmOpEC-J!Zg+m;O(V-YNh#jrS*z>-)BOJf-7)R z4Xa}fbiLgWIkv!-=!vb+3%$_?ebEp7u{9bn z0FBrN12G7LF$6=gErww@Mqnhi!}iz#J7Op7j9suRcEj%21AAgG?2Ub}FZRR!H~^z? 
zAPzzk4#puk6o=t(9DyS-8b{%19D`$V9FE5cI1wk|WSoLiaT-p?88{PX;cT3Pb8#Nd z#|5|$7vW-Df=h83F2@zP5?A4BT!U+I9j?a>xDhwuX54~XaT{*O9k>&B;cnc6dvPD` z#{+l}58+`vf=BTf9>)`S5>Mf2JcDQP9G=Guco8q*WxRq{@fu#o8+a3M;cdKwckv$H z#|QWjAK_zsf=}@oKF1gM5?|qKe1mWC9lpm8_z^$hXZ(V4a)71%M}ZpJpe@>A9JEIV zbi}wA594D3Oo)ll2@_)yOp3`cIi|prm{ z5Fg=Ve1cE$89v7s_!3{?YkY%m@g2U$5BL#3;b;7U)*Jk)jinzTP(vHEMLUdx_UM3) z7#HJVd`y4|F%ddpVoZWbF&QSu6qpiIVQNf+X)zt9#|)SeGht@Tf>|*eX2%@pj5#qE z=Egjj7hNzP=Enk95DQ^pEQ0!%C+MLV7RM4;5=&ueEQ4jS9G1rlSP@-O|8@pFRK_Y; z6{}%&tbuM=6Wvk&QU^WM!rE8|>ta2uj}5RPHp0f(1e>D%9R+%5jxDeydSWZ|LT~gz zU-UzNY>fsCKqI!nKn%iQ48c%ri(wd!5g3W>uswFbj@Su1V;Ag--LO0Mz@FF(dt)E$ zi~X=a4!|fJh=b6CgK-EB#bG!cN8m_|#!)yL$KY5ThvRVqPQ*z#8K>Y>oQBhJ2F}D; zI2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8+=kn62kyjO zxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8NZ-)VxMYjsW0`f zg3yD8HfW1>7zgdq0Ua?e#>4oS025*&bi%}#1e0PiOpYlqC8omEm$c`z@!U_Q)`1yKL413eVN!dL{0Vlga^C9oux!qTXJZ-O4m zVmU026|f?@VkNAMRj?{n!|GTA-LNLQqX*W)+E@qcVm+*n4X`0L!p7JHn_@F;jxDey zdSWZ|LT~gzU-UzNY>fsCKqI!nKn%iQ48c%ri(wd!5g3W>uswFbj@Su1V;Ag--LO0M zz@FF(dt)E$i~X=a4!|fJh=b6CgK-EB#bG!cN8m_|#!)yL$KY5ThvRVqPQ*z#8K>Y> zoQBhJ2F}D;I2-5ST%3pVaRDyGMYtH3;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8 z+=kn62kyjOxEuH2UfhTK@cNB9_@;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8NZ+I13F?{jEC`2|Kt05NQjBh2@_)yOp3`cIi|prm3lsD#v)i0^}lqj zhvHZQ^}jf;hf-J?%V1e7hvl&XRzz2f*q9c!Te2exDhwuX54~XaT{*O9k>&B;cnc6dvPD`#{+l}58+`vf=BTf z9>)`S5>Mf2JcDQP9G=Guco8q*WxRq{@fu#o8+a3M;cdKwckv$H#|QWjAK_zsf=}@o zKF1gM5?|qKe1mWC9lpm8_z^$hXZ(WZZ&8u;kQ&Ea9Vr4ev_V_6!#HUE+AaA}2bLW% zF2=+7m;e)EB6PyUm;{qzGE9ysFeRqK)R+d-VmeHZ889Pe!pxWjvtl;Pjycd7b7C&c zjd?IHx?n!cj|H$G7Q(_<1dC!ZERH3xB$mR`SO&{tIV_JAup-KDi!g^uSQ)EeRjh{9 zu?D(fO>{>OtcA6)4%WqbSRWf;Lu`bNu?aTCX4o8CU`zDGR_KM^=!3rKhyK_a4H$q% zY=eOqguxhsq1YC~FdQQ=65C;W?0_Ay6L!Wf*cH2BckF>Zu^0BnKG+xgVSgNeQ8*9> zp$P}$5FCoba5#>@kr<7ma5Rp=u{aLL;{=?DlW;Ol!KpY6r{fHqiL-Dv&cV4j59i|o zT!@QsF)qQSxD1!$3S5b+a5b*MwYUz~;|AP_n{YF3!L7Irx8n}niMwz&?!mpd5BK8% 
zJcx(zFdo69cnpu@2|S6X@HC#mvv>~A;|08km+&%P!K-);uj388iMQ}J-od+g5AWjx ze29B2 z+?WURq6?b86j?@jewGVhK`exYu?QB$VptqYU`Z^6rLhc_#d264D_})*#Y$Kit6){E zhSjkKx?xRpM-QxpwXqJ?#d=sD8(>3hgpIKYHpOPx99v*Z^u$)^h2H3czUYVk*cuHO zfJSVCff$6r7=oeL7Q-+cBQO%%VSDU=9kCO3#xB?uyJ2_ifjzMo_QpQg7yDs<9Dq?c z5C@?N2jdVNioNOhq7M-6Sz7VR(& z+M@$HVqA=e@i74=#6;+Xi7^Q##blTqQ(#I=g{d(Orp0ua9y4G@%!HXS3ueV^m>qMV zGv>rxm>ct8UUb2Hm>&yZK`exYu?QB$VptqYU`Z^6rLhc_#d264D_})*#Y$Kit6){E zhSjkKx?xRpM-QxpwXqJ?#d=sD8(>3hgpIKYHpOPx99v*Z^u$)^h2H3czUYVk*cuHO zfJSVCff$6r7=oeL7Q-+cBQO%%VSDU=9kCO3#xB?uyJ2_ifjzMo_QpQg7yDs<9Dq?c z5C@?N2jdVNioA9JEIV zbi}wA594D3Oo)ll2@_)yOp3`cIi|prm{ z5Fg=Ve1cE$89v7s_!3{?YkY%m@g2U$5BL#3;b;7U^2-1$?LP|C&<1VM4&$IbI-n!Q z#dsJW6JSD2gie?klVDOzhRHDnro>d38q;7}Oo!<)17^fbm>IKRR?LRkF$X$hPRxb5 zF%RZN7tDwGu>cmtLRc7!U{NfF#jymI#8Oxq%V1e7hvl&XRzz2f*q9c!Q) z)tJ21hxM@mHpE8Q7@J^IY=+IT1-3*_Y=vIvjXvm$e&~;_(SQMH#5Nd+ zK^Tl77>aE%48t)3Be5N}#}3#LJ7H(+f?cs2cE=vr6MJEA?1O!=ANI!q7=;6I5Snl> z4#A-~42Rcz=gO7 z7vmCKipy|0uE3SJ3RmMAT#M^)J#N5_xCuAo7Tk*4a69h6owy5k;~w0L`*1%Vz=L=Q z591L$ipTIcp1_lM3Qyx1Jd5Y>JYK+ycnL4#6}*bq@H*bWn|KRv;~l(<_wYVGz=!w< zALA2ziqG&lzQC9G3SZ+Je2ee!J$}HC_z6Gb7nENGWNH6VpoTVRi*^_X?a=`pF)qf# z_?Q3_Vj^_H#Fzw=VlqsQDKI6b!qk`s(_%VIj~Or{X2Q&v1+!u{%#JzG8FOMT%#C?4 zFS=kp%#Q`IAQr;HSOkk=F)WTHuq2kk(pUz|VmU026|f?@VkNAMRj?{n!|GTA-LNLQ zqX*W)+E@qcVm+*n4X`0L!p7JHn_@F;jxDeydSWZ|LT~gzU-UzNY>fsCKqI!nKn%iQ z48c%ri(wd!5g3W>uswFbj@Su1V;Ag--LO0Mz@FF(dt)E$i~X=a4!|fJh=b6CgK-EB z#bG!cN8m_|#!)yL$KY5ThvRVqPQ*z#8K>Y>oQBhJ2F}D;I2-5ST%3pVaRDyGMYtH3 z;8I+M%W(y+#8tQ&*Wg-QhwE_zZp2Nv8Mok8+=kn62kyjOxEuH2UfhTK@cNB9_@ z;8T2t&+!Gm#8>zl-{4z(hwt$Ne#B4s8NZ&K6 zm=F`86DGzam=u#?a!i3KF%_o9G?*6CVS3Df88H)P#w?f>vtf43fzFr{b75}GgL%;f z^I?80fCaG-7RDl26pLYTEP*Al6qd#^SQg7+d8~jH(G@FUWvqf#u^Lv#8t8^K(H%Xo z7S_f(SQqPIeQbaYu@N@LCfF34VRLMOEzuKOp%;3i5Bj1X`eSP}U;rAi4F+Nm24e_@ zVp|NuaE!o6Y=`Z!19rqt*crQESL}w}u?P0VUf3J^U|;Nq{c!+B;XoXOCLD}Ia3~JL z;Wz?EVlZzFARfZQcm$8) zF+7eZ@FbqX(|88Y;yFBz7w{rp!pnFCui`bljyLco-oo2>2k+uNypIp?AwI&#_ynKg 
zGklIO@Fl*&*Z2nC;yZkgAMhi7!q4~x<<|gP+J6+Np$*!i9mYX>bU;Upi}5f%CcuQ4 z2%Ru7Cc&hb43lFDOo^#5HKxI|m=4op2F!?=Ff(Sste6e6V-9r2oR|x9V;;17pF*d=b*bJLv3v7v=*b2SS8-36h{m>s88#yz+f_u+m#fCupq9>ybh6p!I? zJb@?i6rRR2coxs$dAxuZ@e*FfD|i*J;dQ)$H}MwU#yfZy@8NxXfDiEzKE@~b6rbU9 ze1R|V6~4wd_!i&cd;EYO@e_W=FQ_D>{YQZs+Mq4kVH~ta2Xw@^7!TuP0!)aB&Js)Gh-IairFwb=0Io6iMcR0=E1z^g848%7Qlj7 z2n%BoEQ-ajIF`VYSPDyH87zzCusl}4is*`!urgM`s#p!HV-0k}n&^%mSPN@o9juG> zus$}xhS&%jV-swO&9FJPz?SHVtUuCPRAKI6KCOUoP%?59?r)FxDXfNVqAhtaTzYh6}S>t z;c8riYjGW}#|^j-exUdJ1F6K~;dyn}b~9^S_X_z)lAV|;>7@fkkH7x)ri z;cI+@Z}A%!rvVGiJf8m<_XI4s^zxm)<8F`iSFouwXinU!Ma!v>th3K zh>fr@Ho>OY44Y#MY>A%O3cb)9eb5*E&>vf)0Rzy8Z7>jnFc?EH6x(7LhGPUqVmoY) z9k3&I!p_(QyJ9!&jya)K z7RTXuoPZN?5>Cb`I2EVibew@RaTdx4=M$CknF$-qJY?vK$pfl#gT$mg4U|w{=e3%~#U_mT|g|P@0#bQ_- zOJGSXg{83!mc?>d9xGr)bj3gMk=?!5D&}*cQVu93wCi+hKd`fE}?D zcE&E)6}w?~?14S87xu%QFG>*ZsI1b0- z1e}PIa57H8sW=U%;|!dMvv4-f!MQjO=i>rgh>LJBF2SX^442~yT#2i2HLk(6xDMCj z2Hc37a5HYft+)-h;||=3yKpz|!M(T-_u~OPh==en9>Jq{43FapJc+09G@ik;cn;6w z1-yut@G@S(t9T8s;|;utx9~RJ!Mk`5@8bh}h>!3wKEbE>44>l*e2K5{HNL^O_zvIW z2mFYi@H2ivB{A(k3e?aBZP582_QcQ-)F$Jc?RG1pm zU|LLv=`jOl#7vkOvtU-thS@O(I%7`Eg}E^g=0z9GhxxGp7Q{kW7>i(0EQZCg1eU~7 zSQ^Vw{^SFD7Uu?kkjYFHg>pc~dicl5woSR3nLU95-ou>m&3M%WmeU{h>{ z&9McxL{DsmUg(WJ=!<^nkFC*w0cgZF7>Gd_j3F3`Z7~ePF#;p89k#~~*bzHnXY7Jq zu^V>B9@rCmVQ=h%eX$?*#{n3H191?Va4-(Rp*ReO;|Lsy(Krf6;}{%^<8VAqz==2s zC*u^Hiqmj9&cK;C3uogToQv~tJ}$t8xCj^H5?qSQa5=8PmADF5;~HFx>u^18z>T;G zH{%xEira8I?!cY63wPrl+>85gKOVq?cnA;U5j={=@Hn2plXwbG;~6}Q=kPpUz>9bZ zFXI)wir4Tu-oTr93vc5cyo>knK0d&Q_y`~46MTx#@HxJ~m-q@_;~RX7@9;f-z>oL| zKjRlvlFd38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{ zn_yFHhRv}Bw!{!@g`wCQ+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cd38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{n_yFH zhRv}Bw!{!@g`wCQ+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cd38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{n_yFHhRv}B zw!{!@g`wCQ+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cd3 
z8q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{n_yFHhRv}Bw!{!@ zg`wCQ+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cd38q;7} zOo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{n_yFHhRv}Bw!{!@g`wCQ z+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cd38q;7}Oo!<) z17^fbm>IKRR?LRkF$d z0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{n_yFHhRv}Bw!{!@g`wCQ+hAL4 zhwZTgcEnED8M|Ot?1tU32lm8X*cd38q;7}Oo!<)17^fb zm>IKRR?LRkF$d0#?LI zSQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{n_yFHhRv}Bw!{!@g`wCQ+hAL4hwZTg zcEnED8M|Ot?1tU32lm8X*cd38q;7}Oo!<)17^fbm>IKR zR?LRkF$d0#?LISQ)Ee zRjh{9u?E(}T38$FU|p<-^|1jq#75W{n_yFHhRv}Bw!{!@g`wCQ+hAL4hwZTgcEnED z8M|Ot?1tU32lm8X*cd38q;7}Oo!<)17^fbm>IKRR?LRk zF$d0#?LISQ)EeRjh{9 zu?E(}T38$FU|p<-^|1jq#75W{n_yFHhRv}Bw!{!@g`wCQ+hAL4hwZTgcEnED8M|Ot z?1tU32lm8X*cd38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(} zT38$FU|p<-^|1jq#75W{n_yFHhRv}Bw!{!@g`wCQ+hAL4hwZTgcEnED8M|Ot?1tU3 z2lm8X*cd38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$F zU|p<-^|1jq#75W{n_yFHhRv}Bw!{!@g`wCQ+hAL4hwZTgcEnED8M|Ot?1tU32lm8X z*cd38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<- z^|1jq#75W{n_yFHhRv}Bw!{!@g`wCQ+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cd38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq z#75W{n_yFHhRv}Bw!{!@g`wCQ+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*cd38q;7}Oo!<)17^fbm>IKRR?LRkF$d0#?LISQ)EeRjh{9u?E(}T38$FU|p<-^|1jq#75W{ zn_yFHhRv}Bw!{!@g`wCQ+hAL4hwZTgcEnED8M|Ot?1tU32lm8X*c7)R4Xa}ftckU-HrBzqSP$!C18j(murW5l zrq~RdV+(AFA=nB-u{E~Aw%88aV+ZVrov<@@!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK( zFb=_?I1Gp52pox{a5Rp=u{aLL;{=?DlW;PI;S`*T({MV@z?nD;XX6~4i}P?kF2IGj z2p8iLT#CzZIj+E!xC&R}8eEI(a6N9ojkpOn;}+bC+i*MXz@4}YcjF%1i~Ddt9>9Zm z2oK{CJc`HgIG(_hcnVMB89a;U@H}3?i+Bky;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g z2p{7Ue2UNTIljP`_zGX+8+?oJ@I8LOkN62c;}`sj-|##Bz@PXFf8!tgi~rC{LjTc0 z7d;dxQK63-4F(tv!(#-Dh>7)R4Xa}ftckU-HrBzqSP$!C18j(murW5lrq~Rd zV+(AFA=nB-u{E~Aw%88aV+ZVrov<@@!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_? 
zI1Gp52pox{a5Rp=u{aLL;{=?DlW;PI;S`*T({MV@z?nD;XX6~4i}P?kF2IGj2p8iL zT#CzZIj+E!xC&R}8eEI(a6N9ojkpOn;}+bC+i*MXz@4}YcjF%1i~Ddt9>9Zm2oK{C zJc`HgIG(_hcnVMB89a;U@H}3?i+Bky;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7U ze2UNTIljP`_zGX+8+?oJ@I8LOkN62c;}`sj-|##Bz@PXFf8!tgi~rC{O8?P87d;dx zQK63-4F(tv!(#-Dh>7)R4Xa}ftckU-HrBzqSP$!C18j(murW5lrq~RdV+(AF zA=nB-u{E~Aw%88aV+ZVrov<@@!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1Gp5 z2pox{a5Rp=u{aLL;{=?DlW;PI;S`*T({MV@z?nD;XX6~4i}P?kF2IGj2p8iLT#CzZ zIj+E!xC&R}8eEI(a6N9ojkpOn;}+bC+i*MXz@4}YcjF%1i~Ddt9>9Zm2oK{CJc`Hg zIG(_hcnVMB89a;U@H}3?i+Bky;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7Ue2UNT zIljP`_zGX+8+?oJ@I8LOkN62c;}`sj-|##Bz@PXFf8!tgi~rC{M*q=47d;dxQK63- z4F(tv!(#-Dh>7)R4Xa}ftckU-HrBzqSP$!C18j(murW5lrq~RdV+(AFA=nB- zu{E~Aw%88aV+ZVrov<@@!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1Gp52pox{ za5Rp=u{aLL;{=?DlW;PI;S`*T({MV@z?nD;XX6~4i}P?kF2IGj2p8iLT#CzZIj+E! zxC&R}8eEI(a6N9ojkpOn;}+bC+i*MXz@4}YcjF%1i~Ddt9>9Zm2oK{CJc`HgIG(_h zcnVMB89a;U@H}3?i+Bky;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7Ue2UNTIljP` z_zGX+8+?oJ@I8LOkN62c;}`sj-|##Bz@PXFf8!tgi~rC{PXEzC7d;dxQK63-4F(tv z!(#-Dh>7)R4Xa}ftckU-HrBzqSP$!C18j(murW5lrq~RdV+(AFA=nB-u{E~A zw%88aV+ZVrov<@@!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1Gp52pox{a5Rp= zu{aLL;{=?DlW;PI;S`*T({MV@z?nD;XX6~4i}P?kF2IGj2p8iLT#CzZIj+E!xC&R} z8eEI(a6N9ojkpOn;}+bC+i*MXz@4}YcjF%1i~Ddt9>9Zm2oK{CJc`HgIG(_hcnVMB z89a;U@H}3?i+Bky;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7Ue2UNTIljP`_zGX+ z8+?oJ@I8LOkN62c;}`sj-|##Bz@PXFf8!tgi~rC{LI2S~7d;dxQK63-4F(tv!(#-D zh>7)R4Xa}ftckU-HrBzqSP$!C18j(murW5lrq~RdV+(AFA=nB-u{E~Aw%88a zV+ZVrov<@@!LHa1yJHXRiM_Bl_QAf`5BuW)9EgK(Fb=_?I1Gp52pox{a5Rp=u{aLL z;{=?DlW;PI;S`*T({MV@z?nD;XX6~4i}P?kF2IGj2p8iLT#CzZIj+E!xC&R}8eEI( za6N9ojkpOn;}+bC+i*MXz@4}YcjF%1i~Ddt9>9Zm2oK{CJc`HgIG(_hcnVMB89a;U z@H}3?i+Bky;}yJ$*YG;tz?*mrZ{r=ji}&z8KEQ|g2p{7Ue2UNTIljP`_zGX+8+?oJ z@I8LOkN62c;}`sj-|##Bz@PXFf8!tgi~rC{N&nG77d;dxQK63-4F(tv!(#-Dh*O4D z`~L-mbqQ+{5~D)bVPRFmLI>NHHL`1y&>{aBk}xDmS!-xmNV4Jo{Yq8VE*~l7(6G>F z^1mTP5>@!$*|z0auH{+6l9sZ3OIyYYtZ-I%D}oi#ieyE$qF7O_XjXJ9h85F_WyQAQ 
zSaGd*R(vagmC#CLCAN}SNv&j7ax2Au`;p2@ZKbi&TIsCxRt77hmC4F%WwEka*{tkV z4lAdX%gSx#vGQ8^to&91tDsfLDr^<8idx02;#LW(q*cl)ZI!XgTIHTeCO23mux!PXFKs5Q(QZjG=; zTBEGd));H7HO?AuO|T|fldQ>Bm^H|8_jn*b>v$e(AYHhQ&TRW_s)-G$ewa40P?X&h< z2dsnEA?vVp#5!snvyNLQtdrI$>$G*oI%}P?&RZ9(i`FIUvUSC}YF)FgTQ{tm)-CI{ zb;r7E-Lvjn53GmQBkQsC#CmEyvz}Wote4g+>$Ua9dTYJ2-di86kJcyav-QRLYJIc5 zTR*Iy)-UU~^~d^a{rm6uZQHS3+p~o&ZDsqmwv8Rw;q35s1UsS~$&PGCv7_42?C5q3 zJEk4Wj%~-WFo4&20Npj$su5H(`>)Q3~`gQ}mq20)CY&Wr++Rg0db_=_u9b&h# zL+#df8@sLD&TemaushnF?9O%=h)K58GckJ~5gllCe5w0*`tYoD{v+ZXJM_9gqWeZ{_NU$d{< zH|(4CE&H~8$G&Udv+vsv?1%Ov`?3AReri9npW83&m-Z|Bwf)9^YrnJK+aK(Y_9y$Z z{l)%jf3v^aKkT3OFZ;Ls$Np>o`|tAGj^ntF=LknS%JCiT7$cbkodixoCy|rbN#Z1Rk~ztp6i!Mfm6O^@+sWhPb@Dm+odQllr;t82{lvCO% znmNs# z7EVhi#A)S(I<1{HPFts))86UebaXm7ot-XDSErlP-Ra@GHaHudP0nU#i?h|)=4^L% zI6IwP&TeOqv)9?@>~{`02c1LCVdsc*)H&uHcTPAbom0+f=ZtgKIp>^rE;tvROU`BI zigVSu=3IAfI5(YJ&TZ$8bJw}&+;<*051mKOW9NzU)OqGScV0Lzomb9l=Z*8$dFQ-$ zJ~$tpPtIrOi}Tg_=6rX4I6s|V&Tr?B^Vj+J-_viqj_bOfD_rR+*LSsR+`tXzhIb>l z5#2~`WH*W%)s5yxcVoCQ-B@mHH;x@Nv8@LVKMs8!b ziQCj|<~Da*xGmifx0M^}wszaNZQXWmd$)tz(e31RcDuM;-EMAow};!)?dA4%`?!7G zer|tvfIH9~ox9%M;BIs`xtrZB z?pAl3yWQR4?sRv#yWKtRUU#3n-#y?SbPu_Q-6QT%_n3R!J>i~oPr0YvGwxaUoO|BA z;9hhuxtHB5?p61id)>X^-gIxdx7|DLUH6`Q-+kadbRW5o-6!r-_nG_Lec`@zU%9W{ zH||^ao%`PX;C^&Jxu4xH?pODl``!KF{&au2zuiCXU-#dC;kP}wEHAbf$BXO5^Wu96yo6pNFR_=zOX?-_l6xt< zlwK+?wU@?A>!tJ3dl|fpUM4TIm&MEKW%IIoIlP=+E-$y2$II*G^YVKIynVihCuzl3ppVv{%L}>y`7$dlkHjUL~)xSH-L9Rr9KQHN2W$Ew8p$$E)kr^XhvI zyoO#Qud&y}Yw9)gntLt0mR^Y0$_w>cdu_b7UOTV7*TL)Pb@DoUUA(SdH?O5cM6dt?#=LKdb7OQ-W+ePH_w~zE$|k4i@e3&5^t%u%v_h^Tkmb~ zHhP=9&E6JotGCVD?(Oh)db_;c-X3qSx6j+}9q`GML{HI6^cHq#Dz=I3Vu#o% zc8T3$kJu~riT&b$I4BN@!{UfIDvpWc;)FOUPKndvj5sUKiSy!uxF{}(%i@Z-Dz1s^ z;)b{>Zi(CCj<_rCiTmP#cqkr;$Kr{2DxQhw;)QrAUWwP@jd&~GiTC1z_$WS!&*F>t zD!z&D;)nPteu>}WkN7M82}|12k*@TlkWwn?OD&BIWH=dKMvxI@BpF#okx^wd8C}MZ zF=Z?nTgH)bWjq;QCXfkbBAHkwkx6ATnOvrjDP<~|TBea{WjdK&W{??WCYf1gky&Ln znO)|PIb|-HTjr5@Wj>i-7LWyHAz4@!kws-OSzMNoC1ojDT9%PzWjR@1R*)5CC0SWk 
zkyT|iSzXqUHDxVXTh@_vWj$G6HjoWvBiUFskxgYY*<7}eEoF#oB|~Lv*+#aN?PPn| zL3Wg#WM|n$c9q>^ciBVsl)Yqc*+=%3{bYYRKn|3Ho7^sU$enVR+%5OWy>g%2FAvCr@{l|%kI19)m^>~| z$dmGvJT1@2v+|rgFE7Z8@{+tPugI(Nn!GM=$eZ$(ye;p@yYimAFCWN<@{xQjpU9{3 znS3r^$d~e!d@bL|xAL8QFF(kS@{{~5zsRrhoBS?+$e;3;{4M{;zw)27l&u`)Do+U| zl~TUa%BVnvQ{hzv6;VY}kyR8GRYg1u|Wsb;C! zYL1$#=BfE=fm)~*sl{rETB??*Z})~WSsgW9Mzsm*GO+N!px?P`bG zsdlN|YLD8h_No2qfI6rSsl)1sI;xJTWn(8&Z+b2g1V?Ksmtn$x~i_J z>*|KOscxy;>W;dr?y39gfqJMOsmJPxda9nO=jw%esa~nq>WzA<-l_NMgZijGsn6<* z`l`OE@9Ky8seY;7>W})X{wd41eaClw&lkS*mGAr7H-6xU^TYcQ{D^)eKe8XikLpMB zqx&)Zn0_ojwjalj>&Nrs`w9Gnej-1ypTtk&xoPI7px1Yz)>*w?H`vv@hej&fGU&Jr!7xRnzCH#_pDZjK|#xLua z^UM1c{EB`hzp`J&uj*IxtNS(lntm<6wqM7u>(}$^`wjetej~rJ-^6d~H}jkOE&P^# zh~LT&^;`RG{I-5OzrEkV@91~(JNsSyu6{SayWhj_>G$$``+fYrem}pzKfoX85Ap~5 zL;RusFn_o|!XN36@<;n){IUKxf4o1zpXg8WC;MUk6o0Be&7bbi@Mrq7{Mr5-f382z zpYJd57y66*#r_h1slUu$?yvAy`m6la{u+O+zs_IpZ}2zzoBYlG7JsY1&EM|t@OS#V z{N4T@f3LsK-|rvr5Bi7v!~PNfsDI2q?w{~a`ltNU{u%$Qf6hPeU+^#bm;B5A75}P# z&A;y7@NfFJ{M-H=|E_<}zwbZrANr5{$Nm%lssGG>?!WL~`mg-g{u}?T|IUB!fABy0 zpZw4M7yqmO&HwKI@PGQh{NMf`|F8egx3sMt?P^a8Ew$3V*4pSmhtuJ81RYUF(vfu( z9aTru(RBw(`Q|HpTbsn8p=hOLh0bNiR(uH*qT~rs-#dQf?QkT-Dbs1e&m(%5S z1zk~B(v@`;T~$}p)pZSBQ`ge9bsb$-*VFZN1Km(J(v5W!-BdTz&2FIigo~dW)*?NwitLN$YdVyZ37wN@%iC(Ig z>E(KbUa42<)q0IytJmrEdV}7mH|foKi{7fY>Fs)l-l=!#-FlDStM}>s`hY&D59!1D zh(4;1>Err@KB-UX)B21)tIz54`hvcwFX_wrioU9^>FfH2zNv5N+xm{atMBRi`hk9^ zAL+;XiGHe|>F4@|eyLyS*ZPfqtKaGO`h)(cKk3iF@f7{;7ZI-};aKtN&@s z*v2ug@r*FiDB~M#j0sFQ6W&BH5lti$*+el>O*9kT#4s^UEEC(rF>y^i6W=5-2~8rC z*d#GYO)`_*q%bK>DwEozF=+UO*WI=9)G#$oEmPanF?CHn zQ{OZ&4NW7{*fcRsO*7Nnv@k7Ah-qa)O>5J}v^DKad(**mG@VRm)5UZ(-As4W!}K)0 zOmEZ2^fmoVe>1=gG=t1wGsFxv!_06q!i+Sd%xE*lj5XuTcr(FFG?UC^6K1BEsb-p) zZf2O7W|o<4=9sx=o|$hJn1yDMS!|Y=rDmB~ZdRC;W|diO)|jY&X}|2oH=hUn2Y9; zxoobOtLB=yZf=;H=9al_?wGsgp1E%xn1|+(d2F7Tr{==2XTV9LA)S-kRV7HBnlD-NrI$7vLJboB1jpe3Q`AYg0w-p zAbpS_$QWb_G6z|LtU(0SGzuCAO@gLD zv!HpK|8gvV~2R(wGL9d{9&?o2{^b7h2 
z1A>9UpkQz?Bp4bD3x)?Hf|0?fV017h7#oZW#s?FEiNT~`au60w38n_qg6YAGU}i8Y zm>tXs<_7bE`N4uw@*chG1i`DcBrr3AP5? zg6+YMU}vx^*d6Q%_6GZc{lS6YU~nin92^Ob2FHTq!HM8xa4I+*oC(eb=YsRWh2Uav zDYzV539bg$g6qMJ;AU_uxEFmtT z%B;H-cXzkq?xjGXEzkmOad#Bp?F?s6YccFn|dxU;_uZ zzym%A0hvH%kOgD~*+6!X1LOp`KyHu+VUeS9;go*fQFzEXbhTwrl1*U4#GeS z5Dr>`R-iSA0Bt~95DB6{G-wCfgAO1D#Db2X6X*=$Ks-nQT|igR4Ri-RKu^#M^ag!E zU(gTq2Lr%BFbE6=L%>il3=9V&z(|k?MuE{_3>XW>f$?AhNCFeVBrq9F0aL*=FdfVQ zGr=q{8_WT7!8|Y@EC36^BCr@N0ZYL$upF!aE5RzT8ms|p!8))WYycamilQKeD4N0)L$MS` z@f4v1N~9!8rW8u0G)kuo%A_pHrX0$pJj$m+s7zF5DhrjB%0^|Ua!@&`TvTo<50#h7 zN9CsqPz9+%RAH(JRg@}56{kv2C8<(WX{roWmMTY;rz%htsY+C3stOfKRi&y?)u|d( zO{x}Eo2oPPjb22ca3LDXPs2sM-% zMh&M%P$Q{CY7{k^8bghx#!=&`2~-j_k(xwJrlwF+scF=7Y6dlvnnlf~=1_B~dDMJr z0kx1?L@lP4P)n(0)N*PCwUSyzt)|vcYpHeAdTIl;k=jIUrnXR9scqDDY6rEG+C}ZA z_E3ANebj#H0CkW$L>;D%P)Dg_)N$$rb&@(oouIQX_x<%cl?ofBBd(?gE0ril2L_MaSP*166)N|?u^^$r;y{6t!Z>e|Gd+G!Ak@`e^ zroK>Lsc+PG>Ie0c`bGVw0$?hb8m57PFf9y%>0o-80cM235I_n-h#(CyWFQMU$U_1J zC_)L!P=PAcpbiaaLJQi^fiCo*4?|!km>Fh)Sz$Jq9p->JVJ?^(=7D)(KA0aCfCXV8 zSQr+8MPV^m9F~A3VJTP|mVsqqIanT6fE8gSSQ%DRaf!S=8NjDfMRBkTk_!#EfZ z6JQtE6?TK&VGr07_JX}(AJ`Z6gZ<$EI1mnkgW(W36b^&K;RrYqCc;s0G#mrR!f|js zoB)&HL^ugfhEw2FI1NsRGvG`(3(kgf;9NKl&W8)&LbwPnhD+d5xC}0bE8t4F3a*B0 z;99s2u7?}oMz{%XhFjoPxD9THJK#>Z3+{${;9j^7?uQ59L3jurhDYF0cnltgC*VnV z3Z8~%;8}PMo`)CUMR*BbhF9QKcnw~MH{eZp3*Lrz;9YnR-iHt1L-+_jhEL#A_zXUW zFW^h~3ciMK;9K|(zK0*+NB9YThF{=U_zixCKj2UJ3;u=yC>2VL(x5<;76qYnC_T!6 zGNNDv5QQK@h(;JOh(#RY5kUeHk%VNVAQfpyM+P#Hg>2*?7kS7>At)2djIyAtC>zR- za-f_j7s`$Dpu8v_%8v@5f~XKGjEbP5s2D1aN}!Uc6e^9%pt7hODvv6lil`E*jH;ke zR25Z2)lm&p6V*bsQ5{ql)kF1B1Jn>TLXA-q)D$&C%~2R?fx=Ns)C#pm5vUDniy~1J zibm~Fd(;8NpjgxqbwZs{9EwK?s0-?fx}ol<2kMD>q28zu>WliJ{%8Oihz6m-Xb2jL zhN0nT1R9AF(I_+;jX`74I5ZwjKuKsKnuI2!DQGI1hNhz#XeOG4W}`W1E}DntqXlRo zT7(v(C1@#HhL)ohXeC;OR--j&En0`xqYY>y+JrWvEodv+hPI;}XeZi*cB4ILFWQIp zqXXz5I)o0RBj_kPhK{2X=p;IYPNOsEEINnIqYLOFx`ZyHE9fe^hOVO<=q9>_ZlgQs zF1m;AqX+0AdW0UMC+I19hMuDr=p}lEUZXeYEqaIEqYvmK`h-5CFX$`!hQ6a8=qLJx 
zexm?76`h(+LkH4n=^#2Cou1A>XQYE^KvOiN5lz#WW@wh?Xr3mtK#R0Q%d|qPv_|W+ zL7TKi+q6Twv`71N2%U+}OlP69(%I8f-!x;kBhu1VLTYtwb;x^z9dKHY$B zNH?Mz(@p56bThg+9Y(jH!|9fEE4nouLARmX(vfr&9Zk2R+tVHB7&?~jNOz(;({Xe> zoj`Y?yVBk0?sN~jC*6zgP4}Vu(*5ZE^Zuf z5&9^7j6P1Epik1L=+pEW`Ye5pK2KkuFVdIj%k&lcDt(QAKs zJJ`h@_HhW#gfrtTI4jPEv*R2%C(ea)<2*Po&WH2k0=OV9gbU*$xF{}$i{lcwBrb(Z z<1)A`E{DtG3b-P!ge&7JI22dK)o^uO1J}g0aBW-%*Twa4ecS*y#Eo!c+ypnp&2V!Z zhFjop+!D9Kt#JfygWKXr9EGECJKP?3z%e)$cf_4=XB>y)aRTmwyW(!RJMMvd;$FBn z?t}Z{ez-p#fCu71crYGY)wn8r*KrYX~mY0iW(EtqhoCDV#&%|tM5n6^wL6U9U` z?U?pV2PTGzWjZpQn9fWb6VD_tU6`&+H>Nw&gXziiVtO-un7&Lurav=)8ORJ`1~WsL zq0BI5I5UD7$s{tPn9EhW;Qd2naj*$ z<}(YJh0G#mF|&kO$}D4+Gb@;t%qnIzvxZsAtYg+Q8<>sECT26Uh1tq%W41Fpn4Qcn zW;e5k*~{!>_A>{VgUli3Fmr@C${b^kGbfmn%qiwHbA~y~oMX;27nqC8CFU}7g}KUH zW3Dqdn48Qk<~DPOxy#&R?lTXVhs-19G4q6Z$~SW4<##n4ioq<~I|-reagGY1lwEEgQt9W7D%4*o*&*yu zb{IRH9l?%d6WLMhXm$)cmL12AXD6^p>_m1FJDHurPGzUD)7cs9Om-GKo1MeXW#_T; z*#+!Eb`iUnUBWJ9m$A#)73@lO6}y^U!>(o5vFq6l>_&DIyP4g>Ze_Qz+u0rLPIec& zo880iW%sfB*#qoB_7HoRJ;EMkkFm$u6YNR$6nmOI!=7c&vFF(f>_zqxdzrn$US+Sb z*V!BFP4*Uho4v!{W$&@~*$3=H_7VG-eZoFvpRv!`7wk*+75kcf!@gzTvG3Ur>__$! 
z`A4JCMlP5G9K}HnaWscHhGRL7 z<2k|!oXAO>%qg78X`Id(oXJ_7%{iRQd7RIMaGAKwTox`XmyOHL<=}F1xwzb19xgAJ zkIT;$;0khuxWZf!t|(WGE6$bRN^+&R(p(v?ELV;z&sE?ma+SEsToo>qtIAd5s&h5C znp`ceHdlwM%hluRa}BtLTqCYA*Mw`zHRGCdVO$F?oNLLo;#zYNTpO+}7s*9&(Of&O zJ=cMY;bOUtTqmwG7sthO30xPhE7y(d&h_AWa=p0TTpzA4*N^MZ4d4cHgSf%m5N;?p zj2q64;6`$Z+$e4|H-;O_jpN316SyR9A~%Vf%uV5@a?`l!+zf6eH;bFi&Ee*9^SJrk z0&XF z%zfd$a^JY`+z;+2_lx_@1@Ni()O;E~kWb48@#*;Vd<*V`4`5Jsp zz7}7bufx~n>+$vZ27E)l5#N|^!Z+oc@y+=#z6Br7x8z&#t@#MP4d0fJ@KALftnNBLv?asC8`P7Goai`c{=F7b#@LP#c(nPeeZNj8$5r0A!SK9Ql3;G6-gyhnN%U6q$;UKs*@U|CaFbglRBg>sYmLQ z2BaZrL>iMOq$z1env*cnf`pTnq!npRB1jw3mPC>$5>48X_M`)eA+e+*=|nn{I1*11 zNEgzTbR*qK57LwLBE3l;(wFoj{mB3_kPIS&$q+J>3?swI2r`l+l2K$d8AHaBab!H1 zK$6HrGKowkQ^-^@jZ7yq$V@Ve%qDZlTr!W$Ckx0zvWP4uOUP2Pj4UTB$V#$`tR`#7 zTC$F;CmYB{vWaXaTgX?V82Ub2tuCkMzua)=xzN61lfj2tH?$VqaF zoF-?;S#pk?Cl|;?a*13fSIAXzja(-;$W3yK+$ML(U2>1yClAO&@`yYpPsmg9j65eV z$V>8yye4nRTk?*)Cm+a1@`-#VU&vSTjeI9R$WQW%{3ZcHDj~IyMhFzr3PD0TA-#}6 z$S4F0K%fLDAb}RJzzD3s3A{iAK@bH=kOf6h1x?TeLofwPumwkO1yAsW5FwM0S;!(} z6|xE0g&aaoA(xO_$Rp$x@(KBc0zyHdkWg4CA`}&h3B`pHLP?>NP+BM>loiSe<%J4D zMWK>VS*Rj}3RQ(_LUo~rP*bQS)E4Rpb%lCDeW8KSP-rAH7Mci6g=Ru?Axvl?gbOW& zRzhnbLTDqj6(WTwAzElBv==%EF+!}+QRpOe7UG0>AwlRObQQV@-Gv@PPobC4Tj(S7 z75WMNg#p4qVURFb7$OW6h6%%k5yD6zQ5YqR7RCr;g>k}oVS)v6lMvtg*n1pVV*EwSRgDE772@mCBjl+nXp_~A*>Wu39E%Q!dhXSuwK|8Y!o&L zn}sdHR$-g4UDzS)6m|)_g+0PvVV|&HI3OGp4he^aBf?SPm~dP;A)FLW38#fK!dc;* za9+3|Tof(|mxU|BRpFX&UAQ6K6mAK(g*(Ds;hu0`cpy9!9tn?yC&E+VnebeAA-oh` z39p4W!du~;@Lu>Jd=x$jpM@{NSK*uRUHBpV6n+W6g#a;?m|9FD28wCLATgboUd$k7 z6oW+|QX&+QNQ+oxL{{WPUL>L*ilQXSq9UrIChDRgnxZAzq9eMZC;DQDm`ThmW)ZWB z*~IK(4l$>gOUy0i5%Y@q#Qb6bv7lH;EG!lgi;Bg>;$jK0q*zKUEtV0>isi)eVg<3H zSV^obRuMzRs$w;!T3Db^Bei*>}hVm-0G*g$M3HWC|)O~j^RGqJfCCbkg6#g<|# zv9%Z>wh`Nkkz$k>Ew&Teiyg!mF;?s-b`m>_abmofAa)VEirvKSVh^#W*h}m!_7VGv z{lxy_0CAu=NE|E<5r>My#Npxyaio|ijuJd?`fABxRPeNLi(9 zQg$halvBzj<(BeDd8K?(eyM;|P%0!9mWoJ4rD9TXsf1KgDkYVc%1C9Ua#DGzf>cqe zBvqEGNTE_yshU(>sv*^sYDu-FI#OM!o>X6IAT^X4NsXl@Qd6m!)LaUaT1eqiOR1IA 
zT8fa`NNuG^DN2f#+DYxD4pNL1D|M7QNu8xQDPBsDx=3B6Zc=xthtyN*CH0p2NPVS# zQh#ZHG*B8O4VH#TL#1KTaA|}zQc9FYNu#AP(pYJnG+vq@B}o&dNz!C#iZoT4CQX-S zNHe8b(rjstG*_A@&6gHP3#CQUVrhxAR9Yr2msUtCrB%{uX^pg2S|_cSHb@(#P10s* zi?mhRCT*8?NIRun(r#&wv{%|E?UxQn2c<*OVd;o;R5~Udmrh70rBl*r>5Oz%Iwzf% zE=U)pOVVZOigZ=FCS8|qNH?Wh(rxLEbXU43-IpFn52Z)aW9fomtIIOrB~8x z>5cSOdMCY?K1d&>Pts@Ui}Y3cCViKFNI#`t(r+n1P9>+7)5w8xS~*BgC#RP)$Qk8e z8OW3jWhB!wmKm9qIhmJ&oANFBwtPpvE8mmv%Mav-@+0}N{6u~#Ka-!!FXWf5li$l9K(G*=V6jQMjTX7Uu@f2SPQ8Fo+l`KkDC7Y66 z$)V&_aw)l$JW5_ApORlGpcGUJDTS3HN>QblQd}valvGM7rIj*DS*4s(Ua6o|R4OTz zl`2Z8QdOy@R99*!HI-UQZKaMUCqr@s5l}<`$B~FP~5|l1VSEZZMUFo6pRC+1Bl|D*erJvGY z8K4YQ1}TG;A<9r?m@-@$p^Q`#l~KxQWsEXb8K;a_CMZeDL}ijPS(&0tRi-J^l^M!R zWtK8qnWM~A<|*@)1qAXRGDa(}=%1ULGvRYZAtX0-2>y-`4MrD(-S=pj& zRkkVHl^x1XWtXyB*`w@L_9^?71Ij_=kaAc#q8wF@DaVx)%1Pyva#}f~oK?;#=amb} zMdgxmS-GNIRjw)5l^e=U<(6_=xue`w?kV?`2g*a`k@8r1qC8ceDbJM`%1h;y@>+SL zyj9*Q@0AbAN9B|9S^1)TRlX_Tl^@Da<(KkX2~bn1sns-Upqf?G@N4b+BeBek*GL~W`zQ=6+{Y6~@7ZK<|WTdNUj z8?~((sYa>MYCE;P+Chy`W7UpoC$+O0r^c%ZY8SPu+D+}Q_E3AOz0}@nAGNRAPwlS` zPzS1m)WPZyb*MT_9j=a0N2-bHD0Q?tMjfk;Q^%_l)FgGHI!T?ZPEn_-)70te40WbD zOP#IGQRk}j)cNWHb)mXQU92uqm#WLuIQYAx=G!vZc(?Y z+tlsq4t1xxOWm#RQTM9*)cxuK^`Lr4J**y4kE+MiIL

    Ie0s`bqt)eo?=w-_-Bw5A~<|OZ}||XsNW+S{f}-OREKG>9q7(1}&o&tO1SE zpoTPB!y2Qp8mI9Z(F9G@Bu&;7P1Q6_*9^_nEX~#&&DA{3*Fv;RT4pVamQ~B9W!G|O zIkj9`ZY__NSIej6*9vF_wL)59t%z1sE2b6KN@yjuQd()Pj8;}FrvzzHPxDF&9yMCg%+;0)LLn+wFs?^ z)>ezuqO@qOoz`CKpv7phT1Tyu)>(_w;rGN_16Yy z1GPcgU~PytR2!xZ*G6a~wM1={Hd-5_jn&3!y z+IDS+wo}`s?bh~ad$oPqe(ivEP&=d@){baLwPV_G?SytxJEfi0&S+<~bJ}_Bf_726 zq+QmoXjips+I8)Qc2m2h-PZ1CceQ)keeHqvP33hflldA zM>?%zozYpH(|Miff-dTkF6)Y}>YA?WhHmPXZtITj>YncFA$le~vz|rIs%O))>pAqC zdM-V;o=4BC=hO4+1@wY?A-%9(L@%lr(~IjR^pbihy|i9NFRPc+%j*^Nih3ozvR*|G z)vM~&^y+#Iy{2AEudUb7>+1FN`g#Msq25SutT)k{>do}#dYIlq57%4jt@PGjU(G z`XGIi>f`kB`UE{mpQumLC+k!6srod1x;{gnsn619 z>vQzE`aFHUzCd57FVYw5OZ27sGJUzeLSLz`(pT$i^tJjreZ9Ux->7fWH|tyUt@<{7 zyS_u;sqfNv>wEOQ`aXTXen3B{AJPx&NA#omG5xrHLO-dW(ogGW^t1Xo{k(obzo=i* zFY8zItNJzlx_(2yso&CX>v#0K`aS)={y=}IKhhuTPxPnyGyS>#LVu~h(qHRu^tbvu z{k{G{|EPb`KkHxgulhIryZ%G}ssGY{>j6e8Bejvn2sF|fK}I?wy^+DlXapO;pbTgr zgEp|i7_7k=yg>}X5Dm$Y4aHCm&Cm_QFb&JF4aaZ|&+v^9Ba@NY$YNwQvKiTp97awf zmyz4ZW8^jR8TpL@MnR*HQP?PA6g7$&#f=h1Nu!ie+9+d`HOd*~jS5CZqmohCsA7Z~ zRgG#!b)$w+)2LJlG&UKVjV;DjW1F$v z*kSB6b{V^kJ;q*RpRwOKU>r0K8HbG{#!=&#aojjzoHR}ur;RhlS>v2>-nd{~G%gvJ zjVs1gZW0ure)fuW4fki`eulk$;@nKF|(T4%qz-(wXG8>ys%%)~Dv$+{&wlKrZmS!unwHaZyG25Dv zW|SFiwlmwC9n2Ur*6e6@GCP}bX1tkTb}_q}-OTQ053{G)%j|9TG5ebR%>L#8bD%lM z9Bd9ThnmC8;pPZ)q?u@rGDn+Z%(3P;bG$jhOfn~$lg!EH6mzOM&75w|FlU;x%-QA~ zbFMkhoNq2L7n+OA#pV)oskzKtZmuv_nybv!<{ERYxz1c~ZZJ2Ro6ODT7IUk)&D?J8 zFn5}}%-!Z5bFaD2+;1K*51NO}!{!n5sCmphZk{ktny1Xu<{9&>dCoj8$it1}mc#Yype1poJ{j z!WLt(7H9Dmu>?!BBulmwOSLphw+zd)EX%eW%e6eqw?eE;R%R=UmDS2-Ww&xzIjvk) zZYz(K*UD$*w+dJVtwL5|tB6(9DrOb8N?0YWQdViJj8)buXO*`qSQV{GR%NS-6>3$r zs#(>o8dgoKmQ~xTW7W0lS@o?3Rzs_i)!1rcHMN>q&8;x2g%xhKv|3rMtq7}))z*r% zqO54Eoz>pzV8vLmR!6Io)!B-(;;jU$i`CWYW_7oESUs&?R&T41)z|80^|uCC1Fb>U zU~7mq)EZ_Dw?%P zbFF#Sd~1QV&{||Iww72+t!377YlXGaT4k-a)>vz;b=G=ogSFAxWNo&#SX-@a)^=-$ zwbR;V?Y8zx6aEI%S=<&RA!ybJlt5f_2flWL>td zSXZrU)^+QKbxK2wdS$(~-dJy~ch-CB zgZ0t+WPP^2SYNGg)_3cN_0#%gCI86GPGzUI)7XJ_T06*2XQ#I_*ct6$8`zW$ZDi9n 
zwi%nXIh(hME!d(h*|M$Js;$|&ZP=!5*|zQ2uI<^r9b#v)Guv70tadg#yPd<%Y3H(Y z+j;D~c0N15UBE7A7qSc6MeL$>F}t{3!Y*l-vP;`#?6P(_yS!b&u4q@XE8A7~OoK-O6rlN7!xbwsxc) zWk=iX?DlpCJI0Q+JKCM>&UTy~ZztGY?5=hFSD21E9{l_DtooP#$Ic$v)9`j?2Yy&d$Ya8-fC~Nx7$1H zo%Sw!x4p;SYwxr7+Xw7}_96SQeZ)R$AG43!C+w5;<-z@Z%IAcuCa z!#J$NIlMz0!4VzFksZZR9nH}l!!aGpu^q>89nbNd5GRw9*~#K$b+S3xog7Y1Czq4k z$>Zd8@;UjP0!~4vkW<(x;uLj?ImMk4PD!VfQ`#xxly%BE<(&#nMW>Qe*{R}$I#r!& zPIae-Q`4#C)OPAPb)9-neW!ub&}rl}cA7X%on}sRC(LQ#ggY&rR!(at!fE5Qbt0W8 zC)#P}w0AlP;m7CDQZCC*Z3nX}wk;jDC4IjfyD&RS=kv)j*Ip7?04mpRNBhFFhm~-4Y;hc0%Ij5a7&ROT2bKbe&Ty!oumz^ul zRp**>-MQi1bZ$AfojcB5=bm%ldEh*B9yyPlC(cvnne*Iv;kKQeGrxvAYWZlIgi4RX`D>D>%&MmN|6F6BZOxwMO2 z#${d3?*G6YOd}YuIXB??K-aOdamz=xS8C{ZWcGIo6XJc=5TYmx!l}t z9yhO>&&}@^a0|MH+`?`Vx2RjpE$)_ZOS+}p(ry{ItXs}4?^bXtx|Q6@ZWTAwt?E{D ztGhManr(+DYyA9liZX>s`+r(|^HglW1VQvdI+->Q$a$CC*ZX36)8|g;5 z(QZ4pz1zW!abw+%ZYQ_18|TKm32qm+tJ}@(?)Gqdy1m@qZXdU=+t2Oq4sZv$gWSRH z5O=6M%pLBIa7Vg{?kIP(JH{RBj&sMm6Wk`rl~y3^e0?hJRPJIkHz&T;3u z^W6FF0(YUi$X)C%ahJNw+~w{Hccr_^UG1)M*ShQ6_3j3Dqr1u7>~3+ly4&3C?hbdS zyUX3}?s50J``rER0r#ML$UW>HagVyk+~e*E_oREuJ?)-x&${Q_^X>)rqI=1`>|SxN zy4T$6?hW^*d&|A;-f{1`_uTvL1NWi($bIZSai6-++~@8K_oe&FeeJ$+-@5PI_wEPx zqx;GI?0#{-y5HRI?hp5;`^)|926(Bw)Lt4d&`awDdFj0LUIs6t7wiF#@}P%2+QS~> zu^#8~9`OWE^dwLA6i@XuPxlPZ^eoTz9MAPU&-X&SOkQR$iDtHyWN?v8JiWllt^{RQ* zy&7Ikua;NatK-%6>Us6O23|w2k=NL3;x+Y}dCk2ruZ0)xwe(tft-T1Zjn~$T^rF0I zubtQ4>)^$Bv0g{7lh@gc^Wwb(uZ!2!>*jU$dU!p(US4mnkJs1h=k@mncmusb-e7Nt zH`E*E4fjTPBfUg#lsDQNnaUXnM_o8(RQrg&4mY2I{ihBwoj<<0iycyqmZ z-h6L?x6oVUE%ugpOTA^@a&LvV(p%-N_SSf7y>;GtZ-ckd+vIKbws>2;ZQgcohqu$) zs4q?}B&HyX0N=u6S3y zYuPrYZ}bMJ-s(tG8-_TG4Jy?5Sw?}PWz z`{aH0zIb1~Z{BzBhxgO_<^A>o{8WBwKaC&gr}cyUbbfk2gP+k4_JL3N&__P)W1sO^ zpYwU2_<}F`k}vy;ulky=`-X4&mT&ux@A{ta`yqZNKeM03&+2FMv->&xoPI7px1Yz) z>*w?H`vv@hej&fGU&Jr!7xRnzCH#_pDZjK|#xLua^UM1c{EB`hzp`J&5B011)%@yy z4Zo&e%dhR%@$35a{Q7!{GNUJRgW`y>33exg6hAMKCv$NJ;^@%{uq$)D&?@+bRK{Hgvlf4V=zpXtx?XZv&fx&AzV zzQ4d<=r8gY`%C<#{xW~LzrtVXuku&>Yy7qTI)A;t!Qbd_@;Cci{H^{rf4jfK-|6r2 
zcl&$%z5YIbzkk3#=pXVA`$znv{xSc!f5JcMpYl)pXZ*ANIsd$W!N2HV@-O>W{Hy*o z|GIy}zv8I=$p)jmh(i1y*}QC%Wh zN3@R*PfDczd;OoSX}d>+caBR6O_dnbAvPvHGFc5s3~U|V{g0BqeMET2e->#YV-r%K z0g354M8zb;NBq^}B3i}9wEm-m4pFT-$NtL@n8Kb^Ju&Sc?qnrRGFh?+Orc7aL4S#o zsz*Ob9S~4GG5B8u$?kMrBO==U-H{tvoz$@CkmRZ) zr;;irC7}#|uSryoh@|9Ego)^X3m6s|7892gttX@ZRxsL1R-@hjUH>5ph|ZM47Z9EK zugLOOWc@3${T12&iX4AM&SX(NI@h0P?my2wDUP7%ynhydvW(96N6-J~Ss=xoDtS_3 z!DMAv^j~yha7=j2f9hCm4~Z`H2Nqp8S)}9=TqYqdDkdT>u3U0Cqwgg5Me@~(zMmok zLNkO0q+HNH{Q&*}R!S*&Z#K1qrosd*ED8rwDj!7v^kvyld za&pYFK?tT8|FQi~q~vHRQBqnW5tmJfi;wN_PrQ_aC5QX}SW{xA8lIT0T5OD2p>tGH z=IZ|({NHr2PDF>OWJr8unTWPgF)3XCw`oH1OCBDY;cuS*ftQbI{U^`=GW>;#f1nxv zoAuwzKd_W8PE7v?l*0Az#!UpZ62d#jM|A$rnf}}GH%ywpVg3jDhvRGVS0v@Ie_d(* z6z88WMq-9W$xZSf#(%B;(xv)Cmo5eLKe_!!{|f+r0D%pXdnG0Fzseui-;Ys-6i-;) zgyepW9-4BGQV$RPOB$TwsnO<7t|?ppYyI~B*#2FBpbCxh z=gFTph4&vdg)%vKP-wcu!1l>>e{W7u`D8WX&+Yt2{loL;MFjqHasMj+>j Date: Fri, 14 Aug 2020 12:50:20 -0700 Subject: [PATCH 0500/1025] CLN: remove unused variable (#35726) --- pandas/core/window/rolling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 0306d4de2fc73..966773b7c6982 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -318,7 +318,7 @@ def __repr__(self) -> str: def __iter__(self): window = self._get_window(win_type=None) - blocks, obj = self._create_blocks(self._selected_obj) + _, obj = self._create_blocks(self._selected_obj) index = self._get_window_indexer(window=window) start, end = index.get_window_bounds( From 51e66b2209789f4e48a7aa61b69ba1957201944a Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 14 Aug 2020 16:59:09 -0400 Subject: [PATCH 0501/1025] agg with list of non-aggregating functions (#35723) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/groupby/generic.py | 25 +++++++++++-------- pandas/core/groupby/groupby.py | 10 +++++--- 
.../tests/groupby/aggregate/test_aggregate.py | 13 ++++++++++ 4 files changed, 35 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 85e2a335c55c6..565b4a014bd0c 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) +- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. (:issue:`35490`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b7280a9f7db3c..b806d9856d20f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -322,11 +322,14 @@ def _aggregate_multiple_funcs(self, arg): # let higher level handle return results - output = self._wrap_aggregated_output(results) + output = self._wrap_aggregated_output(results, index=None) return self.obj._constructor_expanddim(output, columns=columns) + # TODO: index should not be Optional - see GH 35490 def _wrap_series_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy operation into the expected result. 
@@ -335,7 +338,7 @@ def _wrap_series_output( ---------- output : Mapping[base.OutputKey, Union[Series, np.ndarray]] Data to wrap. - index : pd.Index + index : pd.Index or None Index to apply to the output. Returns @@ -363,8 +366,11 @@ def _wrap_series_output( return result + # TODO: Remove index argument, use self.grouper.result_index, see GH 35490 def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> Union[Series, DataFrame]: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. @@ -383,9 +389,7 @@ def _wrap_aggregated_output( In the vast majority of cases output will only contain one element. The exception is operations that expand dimensions, like ohlc. """ - result = self._wrap_series_output( - output=output, index=self.grouper.result_index - ) + result = self._wrap_series_output(output=output, index=index) return self._reindex_output(result) def _wrap_transformed_output( @@ -1720,7 +1724,9 @@ def _insert_inaxis_grouper_inplace(self, result): result.insert(0, name, lev) def _wrap_aggregated_output( - self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]] + self, + output: Mapping[base.OutputKey, Union[Series, np.ndarray]], + index: Optional[Index], ) -> DataFrame: """ Wraps the output of DataFrameGroupBy aggregations into the expected result. 
@@ -1745,8 +1751,7 @@ def _wrap_aggregated_output( self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: - index = self.grouper.result_index - result.index = index + result.index = self.grouper.result_index if self.axis == 1: result = result.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4597afeeaddbf..0047877ef78ee 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -974,7 +974,9 @@ def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): return self._wrap_transformed_output(output) - def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, np.ndarray]): + def _wrap_aggregated_output( + self, output: Mapping[base.OutputKey, np.ndarray], index: Optional[Index] + ): raise AbstractMethodError(self) def _wrap_transformed_output(self, output: Mapping[base.OutputKey, np.ndarray]): @@ -1049,7 +1051,7 @@ def _cython_agg_general( if len(output) == 0: raise DataError("No numeric types to aggregate") - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) def _python_agg_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs @@ -1102,7 +1104,7 @@ def _python_agg_general( output[key] = maybe_cast_result(values[mask], result) - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) def _concat_objects(self, keys, values, not_indexed_same: bool = False): from pandas.core.reshape.concat import concat @@ -2534,7 +2536,7 @@ def _get_cythonized_result( raise TypeError(error_msg) if aggregate: - return self._wrap_aggregated_output(output) + return self._wrap_aggregated_output(output, index=self.grouper.result_index) else: return self._wrap_transformed_output(output) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 
40a20c8210052..ce9d4b892d775 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1061,3 +1061,16 @@ def test_groupby_get_by_index(): res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])}) expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A") pd.testing.assert_frame_equal(res, expected) + + +def test_nonagg_agg(): + # GH 35490 - Single/Multiple agg of non-agg function give same results + # TODO: agg should raise for functions that don't aggregate + df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}) + g = df.groupby("a") + + result = g.agg(["cumsum"]) + result.columns = result.columns.droplevel(-1) + expected = g.agg("cumsum") + + tm.assert_frame_equal(result, expected) From 7f3869e8eaf31309f0ec86977f9489ddface7c84 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 14 Aug 2020 14:01:02 -0700 Subject: [PATCH 0502/1025] BLD: bump xlrd min version to 1.2.0 (#35728) --- ci/deps/azure-37-locale_slow.yaml | 2 +- ci/deps/azure-37-minimum_versions.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/compat/_optional.py | 2 +- pandas/tests/io/excel/test_readers.py | 43 +++++++------------------- 6 files changed, 16 insertions(+), 37 deletions(-) diff --git a/ci/deps/azure-37-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml index 3ccb66e09fe7e..8000f3e6b9a9c 100644 --- a/ci/deps/azure-37-locale_slow.yaml +++ b/ci/deps/azure-37-locale_slow.yaml @@ -24,7 +24,7 @@ dependencies: - pytz=2017.3 - scipy - sqlalchemy=1.2.8 - - xlrd=1.1.0 + - xlrd=1.2.0 - xlsxwriter=1.0.2 - xlwt=1.3.0 - html5lib=1.0.1 diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml index 94cc5812bcc10..05b1957198bc4 100644 --- a/ci/deps/azure-37-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -25,7 +25,7 @@ dependencies: - pytz=2017.3 - pyarrow=0.15 - scipy=1.2 - - xlrd=1.1.0 + - xlrd=1.2.0 - 
xlsxwriter=1.0.2 - xlwt=1.3.0 - html5lib=1.0.1 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 7ab150394bf51..4c270117e079e 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -287,7 +287,7 @@ s3fs 0.4.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) xarray 0.12.0 pandas-like API for N-dimensional data xclip Clipboard I/O on linux -xlrd 1.1.0 Excel reading +xlrd 1.2.0 Excel reading xlwt 1.3.0 Excel writing xsel Clipboard I/O on linux zlib Compression for HDF5 diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index a3bb6dfd86bd2..42f95d88d74ac 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -122,7 +122,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | xarray | 0.12.0 | X | +-----------------+-----------------+---------+ -| xlrd | 1.1.0 | | +| xlrd | 1.2.0 | X | +-----------------+-----------------+---------+ | xlsxwriter | 1.0.2 | X | +-----------------+-----------------+---------+ diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6423064732def..81eac490fe5b9 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -27,7 +27,7 @@ "tables": "3.4.3", "tabulate": "0.8.3", "xarray": "0.8.2", - "xlrd": "1.1.0", + "xlrd": "1.2.0", "xlwt": "1.2.0", "xlsxwriter": "0.9.8", "numba": "0.46.0", diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b610c5ec3a838..51fbbf836a03f 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1,9 +1,7 @@ -import contextlib from datetime import datetime, time from functools import partial import os from urllib.error import URLError -import warnings import numpy as np import pytest @@ -14,22 +12,6 @@ from pandas import DataFrame, 
Index, MultiIndex, Series import pandas._testing as tm - -@contextlib.contextmanager -def ignore_xlrd_time_clock_warning(): - """ - Context manager to ignore warnings raised by the xlrd library, - regarding the deprecation of `time.clock` in Python 3.7. - """ - with warnings.catch_warnings(): - warnings.filterwarnings( - action="ignore", - message="time.clock has been deprecated", - category=DeprecationWarning, - ) - yield - - read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"] engine_params = [ # Add any engines to test here @@ -134,21 +116,19 @@ def test_usecols_int(self, read_ext, df_ref): # usecols as int msg = "Passing an integer for `usecols`" with pytest.raises(ValueError, match=msg): - with ignore_xlrd_time_clock_warning(): - pd.read_excel( - "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3 - ) + pd.read_excel( + "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3 + ) # usecols as int with pytest.raises(ValueError, match=msg): - with ignore_xlrd_time_clock_warning(): - pd.read_excel( - "test1" + read_ext, - sheet_name="Sheet2", - skiprows=[1], - index_col=0, - usecols=3, - ) + pd.read_excel( + "test1" + read_ext, + sheet_name="Sheet2", + skiprows=[1], + index_col=0, + usecols=3, + ) def test_usecols_list(self, read_ext, df_ref): if pd.read_excel.keywords["engine"] == "pyxlsb": @@ -597,8 +577,7 @@ def test_sheet_name(self, read_ext, df_ref): df1 = pd.read_excel( filename + read_ext, sheet_name=sheet_name, index_col=0 ) # doc - with ignore_xlrd_time_clock_warning(): - df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) + df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name) tm.assert_frame_equal(df1, df_ref, check_names=False) tm.assert_frame_equal(df2, df_ref, check_names=False) From 1c6f503b88925ba6b97b9895ba0dbfaf1c5b1c22 Mon Sep 17 00:00:00 2001 From: estasney Date: Fri, 14 Aug 2020 17:01:49 -0400 Subject: [PATCH 0503/1025] Fix broken link in cookbook.rst (#35729) --- 
doc/source/user_guide/cookbook.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 49487ac327e73..7542e1dc7df6f 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -765,7 +765,7 @@ Timeseries `__ `Aggregation and plotting time series -`__ +`__ Turn a matrix with hours in columns and days in rows into a continuous row sequence in the form of a time series. `How to rearrange a Python pandas DataFrame? From 66a0e304957e5b0cd1bec4278f5be4a3f8e1fded Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Mon, 17 Aug 2020 09:50:00 +0100 Subject: [PATCH 0504/1025] CI: Min Pytest Cov Version/Restrict xdist version (#35754) --- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/azure-windows-38.yaml | 2 +- ci/deps/travis-37-cov.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 4d745454afcab..f4c238ab8b173 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21 + - pytest-xdist>=1.21,<2.0.0 # GH 35737 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index f428a6dadfaa2..1f383164b5328 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21 + - pytest-xdist>=1.21,<2.0.0 # GH 35737 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml index 3a0827a16f97a..edc11bdf4ab35 100644 --- a/ci/deps/travis-37-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -10,7 +10,7 @@ dependencies: - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - - pytest-cov # this is only needed in the coverage build + - 
pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 # pandas dependencies - beautifulsoup4 From a8bf160d1ab0ac48220756f8789899c8e96d4f2d Mon Sep 17 00:00:00 2001 From: sanderland <48946947+sanderland@users.noreply.github.com> Date: Mon, 17 Aug 2020 12:22:34 +0200 Subject: [PATCH 0505/1025] REGR: Fix interpolation on empty dataframe (#35543) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/generic.py | 3 +++ pandas/tests/frame/methods/test_interpolate.py | 7 +++++++ 3 files changed, 11 insertions(+) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 565b4a014bd0c..b1fd76157b9f1 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -19,6 +19,7 @@ Fixed regressions - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) - Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) - Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) +- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`). 
- Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2219d54477d9e..9cbe2f714fd57 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6892,6 +6892,9 @@ def interpolate( obj = self.T if should_transpose else self + if obj.empty: + return self + if method not in fillna_methods: axis = self._info_axis_number diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index ddb5723e7bd3e..3c9d79397e4bd 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -34,6 +34,13 @@ def test_interp_basic(self): expected.loc[5, "B"] = 9 tm.assert_frame_equal(result, expected) + def test_interp_empty(self): + # https://github.com/pandas-dev/pandas/issues/35598 + df = DataFrame() + result = df.interpolate() + expected = df + tm.assert_frame_equal(result, expected) + def test_interp_bad_method(self): df = DataFrame( { From df277f362e5b2150b488417b48db09c795d388cf Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 17 Aug 2020 05:59:08 -0500 Subject: [PATCH 0506/1025] REGR: Don't ignore compiled patterns in replace (#35697) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/core/internals/managers.py | 23 ++++++++++++++++----- pandas/tests/frame/methods/test_replace.py | 8 +++++++ pandas/tests/series/methods/test_replace.py | 10 +++++++++ 4 files changed, 37 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index b1fd76157b9f1..4fb98bc7c1217 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst 
@@ -27,6 +27,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) - Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) +- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) - Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. (:issue:`35490`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 371b721f08b27..5a215c4cd5fa3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -2,7 +2,17 @@ import itertools import operator import re -from typing import DefaultDict, Dict, List, Optional, Sequence, Tuple, TypeVar, Union +from typing import ( + DefaultDict, + Dict, + List, + Optional, + Pattern, + Sequence, + Tuple, + TypeVar, + Union, +) import warnings import numpy as np @@ -1907,7 +1917,10 @@ def _merge_blocks( def _compare_or_regex_search( - a: ArrayLike, b: Scalar, regex: bool = False, mask: Optional[ArrayLike] = None + a: ArrayLike, + b: Union[Scalar, Pattern], + regex: bool = False, + mask: Optional[ArrayLike] = None, ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values @@ -1918,7 +1931,7 @@ def _compare_or_regex_search( Parameters ---------- a : array_like - b : scalar + b : scalar or regex pattern regex : bool, default False mask 
: array_like or None (default) @@ -1928,7 +1941,7 @@ def _compare_or_regex_search( """ def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Scalar, + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern], ): """ Raises an error if the two arrays (a,b) cannot be compared. @@ -1949,7 +1962,7 @@ def _check_comparison_types( else: op = np.vectorize( lambda x: bool(re.search(b, x)) - if isinstance(x, str) and isinstance(b, str) + if isinstance(x, str) and isinstance(b, (str, Pattern)) else False ) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index a3f056dbf9648..8603bff0587b6 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1573,3 +1573,11 @@ def test_replace_dict_category_type(self, input_category_df, expected_category_d result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"}) tm.assert_frame_equal(result, expected) + + def test_replace_with_compiled_regex(self): + # https://github.com/pandas-dev/pandas/issues/35680 + df = pd.DataFrame(["a", "b", "c"]) + regex = re.compile("^a$") + result = df.replace({regex: "z"}, regex=True) + expected = pd.DataFrame(["z", "b", "c"]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index 11802c59a29da..f78a28c66e946 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -1,3 +1,5 @@ +import re + import numpy as np import pytest @@ -415,3 +417,11 @@ def test_replace_extension_other(self): # https://github.com/pandas-dev/pandas/issues/34530 ser = pd.Series(pd.array([1, 2, 3], dtype="Int64")) ser.replace("", "") # no exception + + def test_replace_with_compiled_regex(self): + # https://github.com/pandas-dev/pandas/issues/35680 + s = pd.Series(["a", "b", "c"]) + regex = re.compile("^a$") + result = s.replace({regex: 
"z"}, regex=True) + expected = pd.Series(["z", "b", "c"]) + tm.assert_series_equal(result, expected) From e7d7c67fbbe24424aa0551ddad39d7ad6b1886ad Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 17 Aug 2020 08:11:53 -0500 Subject: [PATCH 0507/1025] BLD: update min versions #35732 (#35733) --- pandas/compat/_optional.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 81eac490fe5b9..689c7c889ef66 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -11,25 +11,25 @@ "fsspec": "0.7.4", "fastparquet": "0.3.2", "gcsfs": "0.6.0", - "lxml.etree": "3.8.0", - "matplotlib": "2.2.2", - "numexpr": "2.6.2", + "lxml.etree": "4.3.0", + "matplotlib": "2.2.3", + "numexpr": "2.6.8", "odfpy": "1.3.0", "openpyxl": "2.5.7", "pandas_gbq": "0.12.0", - "pyarrow": "0.13.0", - "pytables": "3.4.3", + "pyarrow": "0.15.0", + "pytables": "3.4.4", "pytest": "5.0.1", "pyxlsb": "1.0.6", "s3fs": "0.4.0", "scipy": "1.2.0", - "sqlalchemy": "1.1.4", - "tables": "3.4.3", + "sqlalchemy": "1.2.8", + "tables": "3.4.4", "tabulate": "0.8.3", - "xarray": "0.8.2", + "xarray": "0.12.0", "xlrd": "1.2.0", - "xlwt": "1.2.0", - "xlsxwriter": "0.9.8", + "xlwt": "1.3.0", + "xlsxwriter": "1.0.2", "numba": "0.46.0", } From d1e8d5461e235cfe169ff97ca6dce8e85a98212a Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 17 Aug 2020 15:34:59 +0100 Subject: [PATCH 0508/1025] REF: StringArray._from_sequence, use less memory (#35519) --- asv_bench/benchmarks/strings.py | 15 +++++++ doc/source/whatsnew/v1.1.1.rst | 5 +++ pandas/_libs/lib.pyx | 51 ++++++++++++++-------- pandas/core/arrays/string_.py | 25 +++-------- pandas/core/dtypes/cast.py | 16 ++----- pandas/tests/arrays/string_/test_string.py | 14 +++--- 6 files changed, 73 insertions(+), 53 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d7fb2775376c0..2023858181baa 100644 --- 
a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -7,6 +7,21 @@ from .pandas_vb_common import tm +class Construction: + + params = ["str", "string"] + param_names = ["dtype"] + + def setup(self, dtype): + self.data = tm.rands_array(nchars=10 ** 5, size=10) + + def time_construction(self, dtype): + Series(self.data, dtype=dtype) + + def peakmem_construction(self, dtype): + Series(self.data, dtype=dtype) + + class Methods: def setup(self): self.s = Series(tm.makeStringIndex(10 ** 5)) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 4fb98bc7c1217..c028fe6bea719 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -76,6 +76,11 @@ Categorical - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) - +**Strings** + +- fix memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) + + .. --------------------------------------------------------------------------- .. _whatsnew_111.contributors: diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 5fa91ffee8ea8..eadfcefaac73d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype): @cython.wraparound(False) @cython.boundscheck(False) -def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]: - """ - Convert all elements in an array to string. +cpdef ndarray[object] ensure_string_array( + arr, + object na_value=np.nan, + bint convert_na_value=True, + bint copy=True, + bint skipna=True, +): + """Returns a new numpy array with object dtype and only strings and na values. Parameters ---------- - arr : ndarray - The array whose elements we are casting. - skipna : bool, default False + arr : array-like + The values to be converted to str, if needed. + na_value : Any + The value to use for na. 
For example, np.nan or pd.NA. + convert_na_value : bool, default True + If False, existing na values will be used unchanged in the new array. + copy : bool, default True + Whether to ensure that a new array is returned. + skipna : bool, default True Whether or not to coerce nulls to their stringified form - (e.g. NaN becomes 'nan'). + (e.g. if False, NaN becomes 'nan'). Returns ------- ndarray - A new array with the input array's elements casted. + An array with the input array's elements casted to str or nan-like. """ cdef: - object arr_i - Py_ssize_t i, n = arr.size - ndarray[object] result = np.empty(n, dtype=object) - - for i in range(n): - arr_i = arr[i] + Py_ssize_t i = 0, n = len(arr) - if not (skipna and checknull(arr_i)): - arr_i = str(arr_i) + result = np.asarray(arr, dtype="object") + if copy and result is arr: + result = result.copy() - result[i] = arr_i + for i in range(n): + val = result[i] + if not checknull(val): + result[i] = str(val) + else: + if convert_na_value: + val = na_value + if skipna: + result[i] = val + else: + result[i] = str(val) return result diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index bb55c3cdea45c..381968f9724b6 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -177,11 +177,10 @@ class StringArray(PandasArray): def __init__(self, values, copy=False): values = extract_array(values) - skip_validation = isinstance(values, type(self)) super().__init__(values, copy=copy) self._dtype = StringDtype() - if not skip_validation: + if not isinstance(values, type(self)): self._validate() def _validate(self): @@ -200,23 +199,11 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): assert dtype == "string" result = np.asarray(scalars, dtype="object") - if copy and result is scalars: - result = result.copy() - - # Standardize all missing-like values to NA - # TODO: it would be nice to do this in _validate / lib.is_string_array - # We are already doing a scan over the 
values there. - na_values = isna(result) - has_nans = na_values.any() - if has_nans and result is scalars: - # force a copy now, if we haven't already - result = result.copy() - - # convert to str, then to object to avoid dtype like '>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str')) array(['1.0', '2.0', None], dtype=object) """ - subarr = np.array(values, dtype=dtype, copy=copy) if dtype is not None and dtype.kind == "U": - # GH-21083 - # We can't just return np.array(subarr, dtype='str') since - # NumPy will convert the non-string objects into strings - # Including NA values. Se we have to go - # string -> object -> update NA, which requires an - # additional pass over the data. - na_values = isna(values) - subarr2 = subarr.astype(object) - subarr2[na_values] = np.asarray(values, dtype=object)[na_values] - subarr = subarr2 + subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy) + else: + subarr = np.array(values, dtype=dtype, copy=copy) return subarr diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 6f9a1a5be4c43..efd5d29ae0717 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -206,12 +206,16 @@ def test_constructor_raises(): @pytest.mark.parametrize("copy", [True, False]) def test_from_sequence_no_mutate(copy): - a = np.array(["a", np.nan], dtype=object) - original = a.copy() - result = pd.arrays.StringArray._from_sequence(a, copy=copy) - expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object)) + nan_arr = np.array(["a", np.nan], dtype=object) + na_arr = np.array(["a", pd.NA], dtype=object) + + result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy) + expected = pd.arrays.StringArray(na_arr) + tm.assert_extension_array_equal(result, expected) - tm.assert_numpy_array_equal(a, original) + + expected = nan_arr if copy else na_arr + tm.assert_numpy_array_equal(nan_arr, expected) 
def test_astype_int(): From e6cd1aaf96b9e9b2e4867f6637d7375d97573980 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 17 Aug 2020 13:20:19 -0500 Subject: [PATCH 0509/1025] Pass check_dtype to assert_extension_array_equal (#35750) --- doc/source/whatsnew/v1.1.1.rst | 1 + pandas/_testing.py | 10 ++++++++-- pandas/tests/util/test_assert_extension_array_equal.py | 9 +++++++++ pandas/tests/util/test_assert_frame_equal.py | 8 ++++++++ pandas/tests/util/test_assert_series_equal.py | 8 ++++++++ 5 files changed, 34 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index c028fe6bea719..ff5bbccf63ffe 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -38,6 +38,7 @@ Bug fixes ~~~~~~~~~ - Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`). +- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`). 
Categorical ^^^^^^^^^^^ diff --git a/pandas/_testing.py b/pandas/_testing.py index 713f29466f097..ef6232fa6d575 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1377,12 +1377,18 @@ def assert_series_equal( ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype): assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index d9fdf1491c328..f9259beab5d13 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas import array import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -102,3 +103,11 @@ def test_assert_extension_array_equal_non_extension_array(side): with pytest.raises(AssertionError, match=msg): tm.assert_extension_array_equal(*args) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_extension_array_equal_ignore_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = array([1, 2, 3], dtype="Int64") + right = array([1, 2, 3], dtype=right_dtype) + tm.assert_extension_array_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index fe3e1ff906919..3aa3c64923b14 100644 --- 
a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -260,3 +260,11 @@ def test_assert_frame_equal_interval_dtype_mismatch(): with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64") + right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) + tm.assert_frame_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index a7b5aeac560e4..f3c66052b1904 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -296,3 +296,11 @@ def test_series_equal_exact_for_nonnumeric(): tm.assert_series_equal(s1, s3, check_exact=True) with pytest.raises(AssertionError): tm.assert_series_equal(s3, s1, check_exact=True) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = pd.Series([1, 2, 3], dtype="Int64") + right = pd.Series([1, 2, 3], dtype=right_dtype) + tm.assert_series_equal(left, right, check_dtype=False) From cdfa0d39da2afc4c3daf0044812faac37a3c240d Mon Sep 17 00:00:00 2001 From: Kevin Sheppard Date: Mon, 17 Aug 2020 19:27:45 +0100 Subject: [PATCH 0510/1025] MAINT: Initialize year to silence warning (#35763) Initialize year to silence warning due to subtracting from value that compiler cannot reason must be either initialized or never reached closes #35622 --- pandas/_libs/tslibs/parsing.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 
8429aebbd85b8..7478179df3b75 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -381,7 +381,8 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default, object freq): cdef: object ret - int year, quarter = -1, month, mnum, date_len + # year initialized to prevent compiler warnings + int year = -1, quarter = -1, month, mnum, date_len # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 assert isinstance(date_string, str) From 04fc1e01de5c3d960bcf61f0258ca03f803c0f2c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 17 Aug 2020 11:28:27 -0700 Subject: [PATCH 0511/1025] REF: implement reset_dropped_locs (#35696) --- pandas/core/groupby/generic.py | 24 ++--------------------- pandas/core/internals/managers.py | 32 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b806d9856d20f..1f0cdbd07560f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1111,6 +1111,7 @@ def blk_func(block: "Block") -> List["Block"]: assert len(locs) == result.shape[1] for i, loc in enumerate(locs): agg_block = result.iloc[:, [i]]._mgr.blocks[0] + agg_block.mgr_locs = [loc] new_blocks.append(agg_block) else: result = result._mgr.blocks[0].values @@ -1124,7 +1125,6 @@ def blk_func(block: "Block") -> List["Block"]: return new_blocks skipped: List[int] = [] - new_items: List[np.ndarray] = [] for i, block in enumerate(data.blocks): try: nbs = blk_func(block) @@ -1136,33 +1136,13 @@ def blk_func(block: "Block") -> List["Block"]: deleted_items.append(block.mgr_locs.as_array) else: agg_blocks.extend(nbs) - new_items.append(block.mgr_locs.as_array) if not agg_blocks: raise DataError("No numeric types to aggregate") # reset the locs in the blocks to correspond to our # current ordering - indexer = np.concatenate(new_items) - agg_items = data.items.take(np.sort(indexer)) - - if 
deleted_items: - - # we need to adjust the indexer to account for the - # items we have removed - # really should be done in internals :< - - deleted = np.concatenate(deleted_items) - ai = np.arange(len(data)) - mask = np.zeros(len(data)) - mask[deleted] = 1 - indexer = (ai - mask.cumsum())[indexer] - - offset = 0 - for blk in agg_blocks: - loc = len(blk.mgr_locs) - blk.mgr_locs = indexer[offset : (offset + loc)] - offset += loc + agg_items = data.reset_dropped_locs(agg_blocks, skipped) return agg_blocks, agg_items diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5a215c4cd5fa3..f05d4cf1c4be6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1504,6 +1504,38 @@ def unstack(self, unstacker, fill_value) -> "BlockManager": bm = BlockManager(new_blocks, [new_columns, new_index]) return bm + def reset_dropped_locs(self, blocks: List[Block], skipped: List[int]) -> Index: + """ + Decrement the mgr_locs of the given blocks with `skipped` removed. + + Notes + ----- + Alters each block's mgr_locs inplace. 
+ """ + ncols = len(self) + + new_locs = [blk.mgr_locs.as_array for blk in blocks] + indexer = np.concatenate(new_locs) + + new_items = self.items.take(np.sort(indexer)) + + if skipped: + # we need to adjust the indexer to account for the + # items we have removed + deleted_items = [self.blocks[i].mgr_locs.as_array for i in skipped] + deleted = np.concatenate(deleted_items) + ai = np.arange(ncols) + mask = np.zeros(ncols) + mask[deleted] = 1 + indexer = (ai - mask.cumsum())[indexer] + + offset = 0 + for blk in blocks: + loc = len(blk.mgr_locs) + blk.mgr_locs = indexer[offset : (offset + loc)] + offset += loc + return new_items + class SingleBlockManager(BlockManager): """ manage a single block with """ From 1eecd5c8efb802b376cc3ed12e26c198bb9fbdcb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 17 Aug 2020 11:58:31 -0700 Subject: [PATCH 0512/1025] BUG: close file handles in mmap (#35748) --- pandas/io/common.py | 5 ++++- pandas/tests/io/parser/test_common.py | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 54f35e689aac8..d1305c9cabe0e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,6 +18,7 @@ Optional, Tuple, Type, + Union, ) from urllib.parse import ( urljoin, @@ -452,7 +453,7 @@ def get_handle( except ImportError: need_text_wrapping = (BufferedIOBase, RawIOBase) - handles: List[IO] = list() + handles: List[Union[IO, _MMapWrapper]] = list() f = path_or_buf # Convert pathlib.Path/py.path.local or string @@ -535,6 +536,8 @@ def get_handle( try: wrapped = _MMapWrapper(f) f.close() + handles.remove(f) + handles.append(wrapped) f = wrapped except Exception: # we catch any errors that may have occurred diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 3d5f6ae3a4af9..1d8d5a29686a4 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1836,6 +1836,7 @@ def 
test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) +@td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") parser = all_parsers From 9528025567768d9de784b1c28e3923d064b9d303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Mon, 17 Aug 2020 14:59:42 -0400 Subject: [PATCH 0513/1025] TST: encoding for URLs in read_csv (#35742) --- pandas/tests/io/parser/test_network.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 509ae89909699..b30a7b1ef34de 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -46,6 +46,21 @@ def check_compressed_urls(salaries_table, compression, extension, mode, engine): tm.assert_frame_equal(url_table, salaries_table) +@tm.network("https://raw.githubusercontent.com/", check_before_test=True) +def test_url_encoding_csv(): + """ + read_csv should honor the requested encoding for URLs. 
+ + GH 10424 + """ + path = ( + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" + + "pandas/tests/io/parser/data/unicode_series.csv" + ) + df = read_csv(path, encoding="latin-1", header=None) + assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)" + + @pytest.fixture def tips_df(datapath): """DataFrame with the tips dataset.""" From 3621f0d2398dd8bc2cf25187a4528719ada12cec Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 17 Aug 2020 13:50:13 -0700 Subject: [PATCH 0514/1025] CI: close sockets in SQL tests (#35772) --- pandas/tests/io/test_sql.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 29b787d39c09d..a7e3162ed7b73 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -263,7 +263,8 @@ def _get_all_tables(self): return table_list def _close_conn(self): - pass + # https://docs.sqlalchemy.org/en/13/core/connections.html#engine-disposal + self.conn.dispose() class PandasSQLTest: @@ -1242,7 +1243,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): def setup_class(cls): cls.setup_import() cls.setup_driver() - conn = cls.connect() + conn = cls.conn = cls.connect() conn.connect() def load_test_data_and_sql(self): From 4d4466169bfeb1f1437b1adaca2fb685f01d9f27 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 18 Aug 2020 13:54:55 +0100 Subject: [PATCH 0515/1025] REGR: follow-up to return copy with df.interpolate on empty DataFrame (#35774) --- pandas/core/generic.py | 2 +- pandas/tests/frame/methods/test_interpolate.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9cbe2f714fd57..fe412bc0ce937 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6893,7 +6893,7 @@ def interpolate( obj = self.T if should_transpose else self if obj.empty: - return self + return self.copy() if method not in fillna_methods: axis = self._info_axis_number 
diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 3c9d79397e4bd..6b86a13fcf1b9 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -38,6 +38,7 @@ def test_interp_empty(self): # https://github.com/pandas-dev/pandas/issues/35598 df = DataFrame() result = df.interpolate() + assert result is not df expected = df tm.assert_frame_equal(result, expected) From 7c7a6e64490f1f46ec015a33609e2dce74ad7596 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 18 Aug 2020 06:05:30 -0700 Subject: [PATCH 0516/1025] CLN: remove unused variable (#35783) --- pandas/core/groupby/generic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1f0cdbd07560f..166631e69f523 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1031,7 +1031,6 @@ def _cython_agg_blocks( data = data.get_numeric_data(copy=False) agg_blocks: List["Block"] = [] - deleted_items: List[np.ndarray] = [] no_result = object() @@ -1133,7 +1132,6 @@ def blk_func(block: "Block") -> List["Block"]: # continue and exclude the block # NotImplementedError -> "ohlc" with wrong dtype skipped.append(i) - deleted_items.append(block.mgr_locs.as_array) else: agg_blocks.extend(nbs) From 400bcfaeb769d21a591b6fa294cb1d14c0a97322 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 19 Aug 2020 10:48:41 +0100 Subject: [PATCH 0517/1025] DOC: clean v1.1.1 release notes (#35787) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.1.1.rst | 60 +++++++--------------------------- 1 file changed, 12 insertions(+), 48 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index ff5bbccf63ffe..43ffed273adbc 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -1,7 +1,7 @@ .. _whatsnew_111: -What's new in 1.1.1 (?) 
------------------------ +What's new in 1.1.1 (August XX, 2020) +------------------------------------- These are the changes in pandas 1.1.1. See :ref:`release` for a full changelog including other versions of pandas. @@ -15,20 +15,23 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) +- Fixed regression in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) - Fixed regression where :meth:`DataFrame.to_numpy` would raise a ``RuntimeError`` for mixed dtypes when converting to ``str`` (:issue:`35455`) - Fixed regression where :func:`read_csv` would raise a ``ValueError`` when ``pandas.options.mode.use_inf_as_na`` was set to ``True`` (:issue:`35493`) - Fixed regression where :func:`pandas.testing.assert_series_equal` would raise an error when non-numeric dtypes were passed with ``check_exact=True`` (:issue:`35446`) -- Fixed regression in :class:`pandas.core.groupby.RollingGroupby` where column selection was ignored (:issue:`35486`) -- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`). 
+- Fixed regression in ``.groupby(..).rolling(..)`` where column selection was ignored (:issue:`35486`) +- Fixed regression where :meth:`DataFrame.interpolate` would raise a ``TypeError`` when the :class:`DataFrame` was empty (:issue:`35598`) - Fixed regression in :meth:`DataFrame.shift` with ``axis=1`` and heterogeneous dtypes (:issue:`35488`) - Fixed regression in :meth:`DataFrame.diff` with read-only data (:issue:`35559`) - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) -- Fixed regression where :meth:`DataFrame.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) +- Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) - Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) -- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` where a list of functions would produce the wrong results if at least one of the functions did not aggregate. 
(:issue:`35490`) +- Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`) +- Fixed memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) .. --------------------------------------------------------------------------- @@ -37,50 +40,11 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in ``Styler`` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`). -- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`). - -Categorical -^^^^^^^^^^^ - -- Bug in :meth:`CategoricalIndex.format` where, when stringified scalars had different lengths, the shorter string would be right-filled with spaces, so it had the same length as the longest string (:issue:`35439`) - - -**Datetimelike** - -- -- - -**Timedelta** - +- Bug in :class:`~pandas.io.formats.style.Styler` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`) +- Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`) - Bug in :meth:`to_timedelta` fails when arg is a :class:`Series` with `Int64` dtype containing null values (:issue:`35574`) - - -**Numeric** - -- -- - -**Groupby/resample/rolling** - -- Bug in :class:`pandas.core.groupby.RollingGroupby` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) - -**Plotting** - -- - -**Indexing** - -- Bug in :meth:`Series.truncate` when trying to truncate a single-element series (:issue:`35544`) - -**DataFrame** +- Bug in ``.groupby(..).rolling(..)`` where passing ``closed`` with 
column selection would raise a ``ValueError`` (:issue:`35549`) - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) -- - -**Strings** - -- fix memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`) - .. --------------------------------------------------------------------------- From ce971660a4e6d4e666f57d175894f786f8ab323c Mon Sep 17 00:00:00 2001 From: tpanza <19810086+tpanza@users.noreply.github.com> Date: Wed, 19 Aug 2020 06:05:40 -0700 Subject: [PATCH 0518/1025] lreshape and wide_to_long documentation (Closes #33417) (#33418) --- pandas/core/frame.py | 6 +++++ pandas/core/reshape/melt.py | 51 ++++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1587dd8798ec3..a4408d1f5d23d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -6609,6 +6609,8 @@ def groupby( duplicate values for one index/column pair. DataFrame.unstack : Pivot based on the index values instead of a column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. Notes ----- @@ -6763,6 +6765,10 @@ def pivot(self, index=None, columns=None, values=None) -> "DataFrame": -------- DataFrame.pivot : Pivot without aggregation that can handle non-numeric data. + DataFrame.melt: Unpivot a DataFrame from wide to long format, + optionally leaving identifiers set. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. 
Examples -------- diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 1ba6854a79265..8724f7674f0c8 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -144,14 +144,43 @@ def melt( @deprecate_kwarg(old_arg_name="label", new_arg_name=None) def lreshape(data: "DataFrame", groups, dropna: bool = True, label=None) -> "DataFrame": """ - Reshape long-format data to wide. Generalized inverse of DataFrame.pivot + Reshape wide-format data to long. Generalized inverse of DataFrame.pivot. + + Accepts a dictionary, ``groups``, in which each key is a new column name + and each value is a list of old column names that will be "melted" under + the new column name as part of the reshape. Parameters ---------- data : DataFrame + The wide-format DataFrame. groups : dict - {new_name : list_of_columns} - dropna : boolean, default True + {new_name : list_of_columns}. + dropna : bool, default True + Do not include columns whose entries are all NaN. + label : None + Not used. + + .. deprecated:: 1.0.0 + + Returns + ------- + DataFrame + Reshaped DataFrame. + + See Also + -------- + melt : Unpivot a DataFrame from wide to long format, optionally leaving + identifiers set. + pivot : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + wide_to_long : Wide panel to long format. Less flexible but more + user-friendly than melt. 
Examples -------- @@ -169,10 +198,6 @@ def lreshape(data: "DataFrame", groups, dropna: bool = True, label=None) -> "Dat 1 Yankees 2007 573 2 Red Sox 2008 545 3 Yankees 2008 526 - - Returns - ------- - reshaped : DataFrame """ if isinstance(groups, dict): keys = list(groups.keys()) @@ -262,6 +287,18 @@ def wide_to_long( A DataFrame that contains each stub name as a variable, with new index (i, j). + See Also + -------- + melt : Unpivot a DataFrame from wide to long format, optionally leaving + identifiers set. + pivot : Create a spreadsheet-style pivot table as a DataFrame. + DataFrame.pivot : Pivot without aggregation that can handle + non-numeric data. + DataFrame.pivot_table : Generalization of pivot that can handle + duplicate values for one index/column pair. + DataFrame.unstack : Pivot based on the index values instead of a + column. + Notes ----- All extra variables are left untouched. This simply uses From 7280837baf443a1047a2579c8f45ba8e4dbcde72 Mon Sep 17 00:00:00 2001 From: edwardkong <33737404+edwardkong@users.noreply.github.com> Date: Wed, 19 Aug 2020 09:16:41 -0400 Subject: [PATCH 0519/1025] Changed 'int' type to 'integer' in to_numeric docstring (#35776) --- pandas/core/arrays/sparse/array.py | 2 +- pandas/core/tools/numeric.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index d8db196e4b92f..1531f7b292365 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -234,7 +234,7 @@ class SparseArray(PandasObject, ExtensionArray, ExtensionOpsMixin): 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype` is not a ``SparseDtype`` and `data` is a ``SparseArray``. - kind : {'int', 'block'}, default 'int' + kind : {'integer', 'block'}, default 'integer' The type of storage for sparse locations. 
* 'block': Stores a `block` and `block_length` for each diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 41548931f17f8..cff4695603d06 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -40,13 +40,13 @@ def to_numeric(arg, errors="raise", downcast=None): - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaN. - If 'ignore', then invalid parsing will return the input. - downcast : {'int', 'signed', 'unsigned', 'float'}, default None + downcast : {'integer', 'signed', 'unsigned', 'float'}, default None If not None, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules: - - 'int' or 'signed': smallest signed int dtype (min.: np.int8) + - 'integer' or 'signed': smallest signed int dtype (min.: np.int8) - 'unsigned': smallest unsigned int dtype (min.: np.uint8) - 'float': smallest float dtype (min.: np.float32) From 8e339305c3b171026e861675310c945cba52c9d3 Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Wed, 19 Aug 2020 18:42:47 +0100 Subject: [PATCH 0520/1025] CI: Unpin Pytest + Pytest Asyncio Min Version (#35757) --- ci/deps/azure-38-locale.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index c466a5929ea29..c7090d3a46a77 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -6,9 +6,9 @@ dependencies: # tools - cython>=0.29.16 - - pytest>=5.0.1,<6.0.0 # https://github.com/pandas-dev/pandas/issues/35620 + - pytest>=5.0.1 - pytest-xdist>=1.21 - - pytest-asyncio + - pytest-asyncio>=0.12.0 - hypothesis>=3.58.0 - pytest-azurepipelines From b1383d41cbf8db7a1d01f359e54bae09ff5163c5 Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Wed, 19 Aug 
2020 19:59:03 +0200 Subject: [PATCH 0521/1025] BUG: pd.crosstab fails when passed multiple columns, margins True and normalize True (#35150) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/reshape/pivot.py | 7 ++--- pandas/tests/reshape/test_crosstab.py | 45 +++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 42f95d88d74ac..f27c83fafef55 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -258,6 +258,7 @@ Reshaping - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) - Bug in :func:`union_indexes` where input index names are not preserved in some cases. Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) +- Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) - Sparse diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ea5916eff3afa..64a9e2dbf6d99 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -670,12 +670,11 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): # keep index and column of pivoted table table_index = table.index table_columns = table.columns + last_ind_or_col = table.iloc[-1, :].name - # check if margin name is in (for MI cases) or equal to last + # check if margin name is not in (for MI cases) and not equal to last # index/column and save the column and index margin - if (margins_name not in table.iloc[-1, :].name) | ( - margins_name != table.iloc[:, -1].name - ): + if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col): raise ValueError(f"{margins_name} not in pivoted DataFrame") column_margin = table.iloc[:-1, -1] index_margin = table.iloc[-1, :-1] diff --git 
a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 8795af2e11122..6f5550a6f8209 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -698,3 +698,48 @@ def test_margin_normalize(self): names=["A", "B"], ) tm.assert_frame_equal(result, expected) + + def test_margin_normalize_multiple_columns(self): + # GH 35144 + # use multiple columns with margins and normalization + df = DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + result = crosstab( + index=df.C, + columns=[df.A, df.B], + margins=True, + margins_name="margin", + normalize=True, + ) + expected = DataFrame( + [ + [0.111111, 0.111111, 0.222222, 0.000000, 0.444444], + [0.111111, 0.111111, 0.111111, 0.222222, 0.555556], + [0.222222, 0.222222, 0.333333, 0.222222, 1.0], + ], + index=["large", "small", "margin"], + ) + expected.columns = MultiIndex( + levels=[["bar", "foo", "margin"], ["", "one", "two"]], + codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.index.name = "C" + tm.assert_frame_equal(result, expected) From 31bb99de7d24b6dc118aa7b29edce8dbb0419c7d Mon Sep 17 00:00:00 2001 From: Yutaro Ikeda Date: Thu, 20 Aug 2020 03:02:55 +0900 Subject: [PATCH 0522/1025] ENH: GH-35611 Tests for top-level Pandas functions serializable (#35692) --- pandas/tests/test_common.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index bcfed2d0d3a10..3d45a1f7389b7 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -10,6 +10,7 @@ import pandas as pd from pandas import Series, Timestamp +import pandas._testing as tm from pandas.core 
import ops import pandas.core.common as com @@ -157,3 +158,12 @@ def test_version_tag(): raise ValueError( "No git tags exist, please sync tags between upstream and your repo" ) + + +@pytest.mark.parametrize( + "obj", [(obj,) for obj in pd.__dict__.values() if callable(obj)] +) +def test_serializable(obj): + # GH 35611 + unpickled = tm.round_trip_pickle(obj) + assert type(obj) == type(unpickled) From 11f6d766f714a4a3a7512ac1e14c6ce285dd38ab Mon Sep 17 00:00:00 2001 From: Tobias Pitters <31857876+CloseChoice@users.noreply.github.com> Date: Wed, 19 Aug 2020 22:22:59 +0200 Subject: [PATCH 0523/1025] =?UTF-8?q?fix=20bug=20when=20combining=20groupb?= =?UTF-8?q?y=20with=20resample=20and=20interpolate=20with=20dat=E2=80=A6?= =?UTF-8?q?=20(#35360)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.2.0.rst | 3 +- pandas/core/resample.py | 2 +- pandas/tests/resample/test_time_grouper.py | 62 ++++++++++++++++++++++ 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f27c83fafef55..8ec75b4846ae2 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -249,8 +249,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. 
Now returning ``0`` (:issue:`35028`) - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) -- -- +- Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising a ``TypeError`` (:issue:`35325`) - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) Reshaping diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e82a1d4d2cda8..fc54128ae5aa6 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -795,7 +795,7 @@ def interpolate( """ Interpolate values according to different methods. """ - result = self._upsample(None) + result = self._upsample("asfreq") return result.interpolate( method=method, axis=axis, diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 26e429c47b494..f638706207679 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -287,3 +287,65 @@ def test_upsample_sum(method, method_args, expected_values): result = methodcaller(method, **method_args)(resampled) expected = pd.Series(expected_values, index=index) tm.assert_series_equal(result, expected) + + +def test_groupby_resample_interpolate(): + # GH 35325 + d = {"price": [10, 11, 9], "volume": [50, 60, 50]} + + df = pd.DataFrame(d) + + df["week_starting"] = pd.date_range("01/01/2018", periods=3, freq="W") + + result = ( + df.set_index("week_starting") + .groupby("volume") + .resample("1D") + .interpolate(method="linear") + ) + expected_ind = pd.MultiIndex.from_tuples( + [ + (50, "2018-01-07"), + (50, pd.Timestamp("2018-01-08")), + (50, pd.Timestamp("2018-01-09")), + (50, pd.Timestamp("2018-01-10")), + (50, pd.Timestamp("2018-01-11")), + (50, 
pd.Timestamp("2018-01-12")), + (50, pd.Timestamp("2018-01-13")), + (50, pd.Timestamp("2018-01-14")), + (50, pd.Timestamp("2018-01-15")), + (50, pd.Timestamp("2018-01-16")), + (50, pd.Timestamp("2018-01-17")), + (50, pd.Timestamp("2018-01-18")), + (50, pd.Timestamp("2018-01-19")), + (50, pd.Timestamp("2018-01-20")), + (50, pd.Timestamp("2018-01-21")), + (60, pd.Timestamp("2018-01-14")), + ], + names=["volume", "week_starting"], + ) + expected = pd.DataFrame( + data={ + "price": [ + 10.0, + 9.928571428571429, + 9.857142857142858, + 9.785714285714286, + 9.714285714285714, + 9.642857142857142, + 9.571428571428571, + 9.5, + 9.428571428571429, + 9.357142857142858, + 9.285714285714286, + 9.214285714285714, + 9.142857142857142, + 9.071428571428571, + 9.0, + 11.0, + ], + "volume": [50.0] * 15 + [60], + }, + index=expected_ind, + ) + tm.assert_frame_equal(result, expected) From 705ba1871b313f33490e6aae0883bae8ac2ae0af Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 19 Aug 2020 15:40:04 -0500 Subject: [PATCH 0524/1025] CI: unpin numpy and matplotlib #35779 (#35780) --- doc/source/user_guide/visualization.rst | 2 ++ environment.yml | 5 ++--- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- pandas/plotting/_matplotlib/boxplot.py | 2 +- requirements-dev.txt | 4 ++-- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 5bc87bca87211..8ce4b30c717a4 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -668,6 +668,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. plt.figure() .. ipython:: python + :okwarning: series = pd.Series(3 * np.random.rand(4), index=['a', 'b', 'c', 'd'], name='series') @@ -742,6 +743,7 @@ If you pass values whose sum total is less than 1.0, matplotlib draws a semicirc plt.figure() .. 
ipython:: python + :okwarning: series = pd.Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') diff --git a/environment.yml b/environment.yml index 1e51470d43d36..aaabf09b8f190 100644 --- a/environment.yml +++ b/environment.yml @@ -3,8 +3,7 @@ channels: - conda-forge dependencies: # required - # Pin numpy<1.19 until MPL 3.3.0 is released. - - numpy>=1.16.5,<1.19.0 + - numpy>=1.16.5 - python=3 - python-dateutil>=2.7.3 - pytz @@ -73,7 +72,7 @@ dependencies: - ipykernel - ipython>=7.11.1 - jinja2 # pandas.Styler - - matplotlib>=2.2.2,<3.3.0 # pandas.plotting, Series.plot, DataFrame.plot + - matplotlib>=2.2.2 # pandas.plotting, Series.plot, DataFrame.plot - numexpr>=2.6.8 - scipy>=1.2 - numba>=0.46.0 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a4408d1f5d23d..837bd35414773 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2600,7 +2600,7 @@ def to_html( 1 column_2 1000000 non-null object 2 column_3 1000000 non-null object dtypes: object(3) - memory usage: 188.8 MB""" + memory usage: 165.9 MB""" ), see_also_sub=( """ diff --git a/pandas/core/series.py b/pandas/core/series.py index e8bf87a39b572..cd2db659fbd0e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -4637,7 +4637,7 @@ def memory_usage(self, index=True, deep=False): >>> s.memory_usage() 144 >>> s.memory_usage(deep=True) - 260 + 244 """ v = super().memory_usage(deep=deep) if index: diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 53ef97bbe9a72..b33daf39de37c 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -294,7 +294,7 @@ def maybe_color_bp(bp, **kwds): def plot_group(keys, values, ax): keys = [pprint_thing(x) for x in keys] - values = [np.asarray(remove_na_arraylike(v)) for v in values] + values = [np.asarray(remove_na_arraylike(v), dtype=object) for v in values] bp = ax.boxplot(values, **kwds) if fontsize is not None: ax.tick_params(axis="both", 
labelsize=fontsize) diff --git a/requirements-dev.txt b/requirements-dev.txt index 66e72641cd5bb..3d0778b74ccbd 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # This file is auto-generated from environment.yml, do not modify. # See that file for comments about the need/usage of each dependency. -numpy>=1.16.5,<1.19.0 +numpy>=1.16.5 python-dateutil>=2.7.3 pytz asv @@ -47,7 +47,7 @@ bottleneck>=1.2.1 ipykernel ipython>=7.11.1 jinja2 -matplotlib>=2.2.2,<3.3.0 +matplotlib>=2.2.2 numexpr>=2.6.8 scipy>=1.2 numba>=0.46.0 From 226216eca261265b6768315b551469861a2dd699 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 20 Aug 2020 02:24:26 -0500 Subject: [PATCH 0525/1025] CI/DOC: unpin sphinx, fix autodoc usage (#35815) --- doc/source/reference/frame.rst | 1 + doc/source/reference/series.rst | 1 + environment.yml | 2 +- requirements-dev.txt | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index e3dfb552651a0..4d9d18e3d204e 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -343,6 +343,7 @@ Sparse-dtype specific methods and attributes are provided under the .. autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst DataFrame.sparse.from_spmatrix DataFrame.sparse.to_coo diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 3b595ba5ab206..ae3e121ca8212 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -522,6 +522,7 @@ Sparse-dtype specific methods and attributes are provided under the .. 
autosummary:: :toctree: api/ + :template: autosummary/accessor_method.rst Series.sparse.from_coo Series.sparse.to_coo diff --git a/environment.yml b/environment.yml index aaabf09b8f190..806119631d5ee 100644 --- a/environment.yml +++ b/environment.yml @@ -27,7 +27,7 @@ dependencies: # documentation - gitpython # obtain contributors from git for whatsnew - gitdb2=2.0.6 # GH-32060 - - sphinx<=3.1.1 + - sphinx # documentation (jupyter notebooks) - nbconvert>=5.4.1 diff --git a/requirements-dev.txt b/requirements-dev.txt index 3d0778b74ccbd..deaed8ab9d5f1 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -16,7 +16,7 @@ mypy==0.730 pycodestyle gitpython gitdb2==2.0.6 -sphinx<=3.1.1 +sphinx nbconvert>=5.4.1 nbsphinx pandoc From 637150eef4fdc5ac5de907697d94677f88315bc2 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 20 Aug 2020 13:03:30 +0100 Subject: [PATCH 0526/1025] CI: more xfail failing 32-bit tests (#35809) --- pandas/tests/window/test_grouper.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index a9590c7e1233a..d0a62374d0888 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -215,6 +215,7 @@ def foo(x): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling_center_center(self): # GH 35552 series = Series(range(1, 6)) @@ -280,6 +281,7 @@ def test_groupby_rolling_center_center(self): ) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_subselect_rolling(self): # GH 35486 df = DataFrame( @@ -305,6 +307,7 @@ def test_groupby_subselect_rolling(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling_custom_indexer(self): # GH 35557 class SimpleIndexer(pd.api.indexers.BaseIndexer): @@ -328,6 +331,7 @@ def 
get_window_bounds( expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum() tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = pd.DataFrame( @@ -352,6 +356,7 @@ def test_groupby_rolling_subset_with_closed(self): ) tm.assert_series_equal(result, expected) + @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = pd.DataFrame( From b5741dadcd82b4c9f5a7558bbb49accc217e5950 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 20 Aug 2020 16:52:52 +0200 Subject: [PATCH 0527/1025] ROADMAP: add consistent missing values for all dtypes to the roadmap (#35208) --- doc/source/development/roadmap.rst | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index d331491d02883..efee21b5889ed 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -53,6 +53,32 @@ need to implement certain operations expected by pandas users (for example the algorithm used in, ``Series.str.upper``). That work may be done outside of pandas. +Consistent missing value handling +--------------------------------- + +Currently, pandas handles missing data differently for different data types. We +use different types to indicate that a value is missing (``np.nan`` for +floating-point data, ``np.nan`` or ``None`` for object-dtype data -- typically +strings or booleans -- with missing values, and ``pd.NaT`` for datetimelike +data). Integer data cannot store missing data or are cast to float. In addition, +pandas 1.0 introduced a new missing value sentinel, ``pd.NA``, which is being +used for the experimental nullable integer, boolean, and string data types. + +These different missing values have different behaviors in user-facing +operations. 
Specifically, we introduced different semantics for the nullable +data types for certain operations (e.g. propagating in comparison operations +instead of comparing as False). + +Long term, we want to introduce consistent missing data handling for all data +types. This includes consistent behavior in all operations (indexing, arithmetic +operations, comparisons, etc.). We want to eventually make the new semantics the +default. + +This has been discussed at +`github #28095 <https://github.com/pandas-dev/pandas/issues/28095>`__ (and +linked issues), and described in more detail in this +`design doc <https://hackmd.io/@jorisvandenbossche/Sk0wMeAmB>`__. + Apache Arrow interoperability ----------------------------- From e96addbfb03df3d0422ed0f682aa310a79b5da70 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 20 Aug 2020 16:05:46 +0100 Subject: [PATCH 0528/1025] DOC: another pass of v1.1.1 release notes (#35801) --- doc/source/whatsnew/v1.1.1.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 43ffed273adbc..721f07c865409 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -1,6 +1,6 @@ .. _whatsnew_111: -What's new in 1.1.1 (August XX, 2020) +What's new in 1.1.1 (August 20, 2020) ------------------------------------- These are the changes in pandas 1.1.1. 
See :ref:`release` for a full changelog @@ -27,7 +27,7 @@ Fixed regressions - Fixed regression in ``.groupby(..).rolling(..)`` where a segfault would occur with ``center=True`` and an odd number of values (:issue:`35552`) - Fixed regression in :meth:`DataFrame.apply` where functions that altered the input in-place only operated on a single row (:issue:`35462`) - Fixed regression in :meth:`DataFrame.reset_index` would raise a ``ValueError`` on empty :class:`DataFrame` with a :class:`MultiIndex` with a ``datetime64`` dtype level (:issue:`35606`, :issue:`35657`) -- Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index`` , ``right_index`` and ``tolerance`` were set (:issue:`35558`) +- Fixed regression where :func:`pandas.merge_asof` would raise a ``UnboundLocalError`` when ``left_index``, ``right_index`` and ``tolerance`` were set (:issue:`35558`) - Fixed regression in ``.groupby(..).rolling(..)`` where a custom ``BaseIndexer`` would be ignored (:issue:`35557`) - Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` where compiled regular expressions would be ignored during replacement (:issue:`35680`) - Fixed regression in :meth:`~pandas.core.groupby.DataFrameGroupBy.aggregate` where a list of functions would produce the wrong results if at least one of the functions did not aggregate (:issue:`35490`) @@ -40,11 +40,11 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- Bug in :class:`~pandas.io.formats.style.Styler` whereby `cell_ids` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`) +- Bug in :class:`~pandas.io.formats.style.Styler` whereby ``cell_ids`` argument had no effect due to other recent changes (:issue:`35588`) (:issue:`35663`) - Bug in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` where extension dtypes were not ignored when ``check_dtypes`` was set to ``False`` (:issue:`35715`) -- Bug in :meth:`to_timedelta` fails when 
arg is a :class:`Series` with `Int64` dtype containing null values (:issue:`35574`) +- Bug in :meth:`to_timedelta` fails when ``arg`` is a :class:`Series` with ``Int64`` dtype containing null values (:issue:`35574`) - Bug in ``.groupby(..).rolling(..)`` where passing ``closed`` with column selection would raise a ``ValueError`` (:issue:`35549`) -- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`) +- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when ``data`` and ``index`` have mismatched lengths (:issue:`33437`) .. --------------------------------------------------------------------------- From 0fdf1e236e2e7246bb4fb249109c7a5df5f36f89 Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Thu, 20 Aug 2020 17:26:30 +0100 Subject: [PATCH 0529/1025] Add new core members (#35812) --- web/pandas/config.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/web/pandas/config.yml b/web/pandas/config.yml index 23575cc123050..9a178d26659c3 100644 --- a/web/pandas/config.yml +++ b/web/pandas/config.yml @@ -79,6 +79,13 @@ maintainers: - datapythonista - simonjayhawkins - topper-123 + - alimcmaster1 + - bashtage + - charlesdong1991 + - Dr-Irv + - dsaxton + - MarcoGorelli + - rhshadrach emeritus: - Wouter Overmeire - Skipper Seabold From f73f3263838b6a1b5d86537a22c5ca656ad277f0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 20 Aug 2020 09:59:23 -0700 Subject: [PATCH 0530/1025] REF: insert self.on column _after_ concat (#35746) --- pandas/core/window/rolling.py | 50 +++++++++++++++-------------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 966773b7c6982..ac96258cbc3c9 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -38,7 +38,7 @@ from pandas.core.base import DataError, PandasObject, SelectionMixin, ShallowMixin 
import pandas.core.common as com from pandas.core.construction import extract_array -from pandas.core.indexes.api import Index, MultiIndex, ensure_index +from pandas.core.indexes.api import Index, MultiIndex from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba from pandas.core.window.common import ( WindowGroupByMixin, @@ -402,36 +402,27 @@ def _wrap_results(self, results, blocks, obj, exclude=None) -> FrameOrSeries: return result final.append(result) - # if we have an 'on' column - # we want to put it back into the results - # in the same location - columns = self._selected_obj.columns - if self.on is not None and not self._on.equals(obj.index): - - name = self._on.name - final.append(Series(self._on, index=obj.index, name=name)) - - if self._selection is not None: - - selection = ensure_index(self._selection) - - # need to reorder to include original location of - # the on column (if its not already there) - if name not in selection: - columns = self.obj.columns - indexer = columns.get_indexer(selection.tolist() + [name]) - columns = columns.take(sorted(indexer)) - - # exclude nuisance columns so that they are not reindexed - if exclude is not None and exclude: - columns = [c for c in columns if c not in exclude] + exclude = exclude or [] + columns = [c for c in self._selected_obj.columns if c not in exclude] + if not columns and not len(final) and exclude: + raise DataError("No numeric types to aggregate") + elif not len(final): + return obj.astype("float64") - if not columns: - raise DataError("No numeric types to aggregate") + df = concat(final, axis=1).reindex(columns=columns, copy=False) - if not len(final): - return obj.astype("float64") - return concat(final, axis=1).reindex(columns=columns, copy=False) + # if we have an 'on' column we want to put it back into + # the results in the same location + if self.on is not None and not self._on.equals(obj.index): + name = self._on.name + extra_col = Series(self._on, index=obj.index, name=name) + 
if name not in df.columns and name not in df.index.names: + new_loc = len(df.columns) + df.insert(new_loc, name, extra_col) + elif name in df.columns: + # TODO: sure we want to overwrite results? + df[name] = extra_col + return df def _center_window(self, result, window) -> np.ndarray: """ @@ -2277,6 +2268,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: if isinstance(self.window, BaseIndexer): rolling_indexer = type(self.window) indexer_kwargs = self.window.__dict__ + assert isinstance(indexer_kwargs, dict) # We'll be using the index of each group later indexer_kwargs.pop("index_array", None) elif self.is_freq_type: From 3bc41fbf946b781d5d24db8b9d318d4a07ceea96 Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Thu, 20 Aug 2020 22:26:04 +0100 Subject: [PATCH 0531/1025] Pyarrow Min Versoin (#35828) --- ci/deps/travis-37-locale.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index 4427c1d940bf2..6dc1c2f89cc6f 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -28,6 +28,7 @@ dependencies: - openpyxl - pandas-gbq=0.12.0 - psycopg2=2.7 + - pyarrow>=0.15.0 # GH #35813 - pymysql=0.7.11 - pytables - python-dateutil From 4318d76113cf3baf748b6eeff2f6bf9801e6cee8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 20 Aug 2020 23:19:00 +0100 Subject: [PATCH 0532/1025] DOC: Start 1.1.2 (#35825) --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.1.1.rst | 2 +- doc/source/whatsnew/v1.1.2.rst | 38 ++++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v1.1.2.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index a280a981c789b..1827d151579a1 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 1.1 .. 
toctree:: :maxdepth: 2 + v1.1.2 v1.1.1 v1.1.0 diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst index 721f07c865409..77ea67f76f655 100644 --- a/doc/source/whatsnew/v1.1.1.rst +++ b/doc/source/whatsnew/v1.1.1.rst @@ -53,4 +53,4 @@ Bug fixes Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.0..v1.1.1|HEAD +.. contributors:: v1.1.0..v1.1.1 diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst new file mode 100644 index 0000000000000..81acd567027e5 --- /dev/null +++ b/doc/source/whatsnew/v1.1.2.rst @@ -0,0 +1,38 @@ +.. _whatsnew_112: + +What's new in 1.1.2 (??) +------------------------ + +These are the changes in pandas 1.1.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_112.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_112.bug_fixes: + +Bug fixes +~~~~~~~~~ + +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_112.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.1.1..v1.1.2|HEAD From fc46bfc07b82c96ab08c0396214b315cee535489 Mon Sep 17 00:00:00 2001 From: joooeey Date: Fri, 21 Aug 2020 19:16:29 +0200 Subject: [PATCH 0533/1025] DOC: timezone warning for dates beyond TODAY (#34100) * DOC: timezone warning for dates beyond TODAY introducing a suggestion discussed in PR #33863 : Added a warning in the user guide that timezone conversion on future dates is inherently unreliable. 
* shorter warning text Co-authored-by: Marco Gorelli --- doc/source/user_guide/timeseries.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index a03ba6c775e68..0bfe9d9b68cdb 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2319,13 +2319,18 @@ you can use the ``tz_convert`` method. Instead, the datetime needs to be localized using the ``localize`` method on the ``pytz`` time zone object. +.. warning:: + + Be aware that for times in the future, correct conversion between time zones + (and UTC) cannot be guaranteed by any time zone library because a timezone's + offset from UTC may be changed by the respective government. + .. warning:: If you are using dates beyond 2038-01-18, due to current deficiencies in the underlying libraries caused by the year 2038 problem, daylight saving time (DST) adjustments to timezone aware dates will not be applied. If and when the underlying libraries are fixed, - the DST transitions will be applied. It should be noted though, that time zone data for far future time zones - are likely to be inaccurate, as they are simple extrapolations of the current set of (regularly revised) rules. + the DST transitions will be applied. 
For example, for two dates that are in British Summer Time (and so would normally be GMT+1), both the following asserts evaluate as true: From 14488ecc917f8d57f592f427e16763704ee9afdc Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 21 Aug 2020 16:14:03 -0400 Subject: [PATCH 0534/1025] Moto server (#35655) --- ci/deps/azure-37-locale.yaml | 1 + ci/deps/azure-37-slow.yaml | 2 + ci/deps/azure-38-locale.yaml | 2 + ci/deps/azure-windows-37.yaml | 7 +- ci/deps/azure-windows-38.yaml | 4 + ci/deps/travis-37-arm64.yaml | 1 + ci/deps/travis-37-cov.yaml | 3 +- ci/deps/travis-37-locale.yaml | 2 +- ci/deps/travis-37.yaml | 4 +- doc/source/whatsnew/v1.2.0.rst | 3 + environment.yml | 1 + pandas/io/excel/_base.py | 30 ++++- pandas/io/excel/_odfreader.py | 14 +- pandas/io/excel/_openpyxl.py | 12 +- pandas/io/excel/_pyxlsb.py | 14 +- pandas/io/excel/_xlrd.py | 7 +- pandas/io/feather_format.py | 7 +- pandas/io/parsers.py | 4 +- pandas/tests/io/conftest.py | 155 ++++++++++++++++------- pandas/tests/io/excel/test_readers.py | 5 +- pandas/tests/io/json/test_compression.py | 6 +- pandas/tests/io/json/test_pandas.py | 13 +- pandas/tests/io/parser/test_network.py | 85 ++++++++----- pandas/tests/io/test_fsspec.py | 29 +++-- pandas/tests/io/test_parquet.py | 15 ++- requirements-dev.txt | 1 + 26 files changed, 307 insertions(+), 120 deletions(-) diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index a6552aa096a22..cc996f4077cd9 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -21,6 +21,7 @@ dependencies: - lxml - matplotlib>=3.3.0 - moto + - flask - nomkl - numexpr - numpy=1.16.* diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml index e8ffd3d74ca5e..d17a8a2b0ed9b 100644 --- a/ci/deps/azure-37-slow.yaml +++ b/ci/deps/azure-37-slow.yaml @@ -27,9 +27,11 @@ dependencies: - python-dateutil - pytz - s3fs>=0.4.0 + - moto>=1.3.14 - scipy - sqlalchemy - xlrd - xlsxwriter - xlwt - moto + - flask diff --git 
a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index c7090d3a46a77..bb40127b672d3 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -14,6 +14,7 @@ dependencies: # pandas dependencies - beautifulsoup4 + - flask - html5lib - ipython - jinja2 @@ -32,6 +33,7 @@ dependencies: - xlrd - xlsxwriter - xlwt + - moto - pyarrow>=0.15 - pip - pip: diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index f4c238ab8b173..4894129915722 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -15,13 +15,14 @@ dependencies: # pandas dependencies - beautifulsoup4 - bottleneck - - fsspec>=0.7.4 + - fsspec>=0.8.0 - gcsfs>=0.6.0 - html5lib - jinja2 - lxml - matplotlib=2.2.* - - moto + - moto>=1.3.14 + - flask - numexpr - numpy=1.16.* - openpyxl @@ -29,7 +30,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs>=0.4.0 + - s3fs>=0.4.2 - scipy - sqlalchemy - xlrd diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 1f383164b5328..2853e12b28e35 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -16,7 +16,10 @@ dependencies: - blosc - bottleneck - fastparquet>=0.3.2 + - flask + - fsspec>=0.8.0 - matplotlib=3.1.3 + - moto>=1.3.14 - numba - numexpr - numpy=1.18.* @@ -26,6 +29,7 @@ dependencies: - pytables - python-dateutil - pytz + - s3fs>=0.4.0 - scipy - xlrd - xlsxwriter diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index 5cb53489be225..ea29cbef1272b 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -17,5 +17,6 @@ dependencies: - python-dateutil - pytz - pip + - flask - pip: - moto diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml index edc11bdf4ab35..33ee6dfffb1a3 100644 --- a/ci/deps/travis-37-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -23,7 +23,8 @@ dependencies: - geopandas - html5lib - matplotlib - - moto + - moto>=1.3.14 + - flask - nomkl - numexpr - 
numpy=1.16.* diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index 6dc1c2f89cc6f..306f74a0101e3 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -21,12 +21,12 @@ dependencies: - jinja2 - lxml=4.3.0 - matplotlib=3.0.* - - moto - nomkl - numexpr - numpy - openpyxl - pandas-gbq=0.12.0 + - pyarrow>=0.17 - psycopg2=2.7 - pyarrow>=0.15.0 # GH #35813 - pymysql=0.7.11 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index e896233aac63c..26d6c2910a7cc 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -20,8 +20,8 @@ dependencies: - pyarrow - pytz - s3fs>=0.4.0 + - moto>=1.3.14 + - flask - tabulate - pyreadstat - pip - - pip: - - moto diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8ec75b4846ae2..3e6ed1cdf8f7e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -24,6 +24,9 @@ of the individual storage backends (detailed from the fsspec docs for `builtin implementations`_ and linked to `external ones`_). See Section :ref:`io.remote`. +:issue:`35655` added fsspec support (including ``storage_options``) +for reading excel files. + .. _builtin implementations: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations .. 
_external ones: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations diff --git a/environment.yml b/environment.yml index 806119631d5ee..6afc19c227512 100644 --- a/environment.yml +++ b/environment.yml @@ -51,6 +51,7 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 + - flask - pytest>=5.0.1 - pytest-cov - pytest-xdist>=1.21 diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index b1bbda4a4b7e0..aaef71910c9ab 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,11 +3,12 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Union +from typing import Any, Mapping, Union from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES +from pandas._typing import StorageOptions from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -199,6 +200,15 @@ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. +storage_options : StorageOptions + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.2.0 Returns ------- @@ -298,10 +308,11 @@ def read_excel( skipfooter=0, convert_float=True, mangle_dupe_cols=True, + storage_options: StorageOptions = None, ): if not isinstance(io, ExcelFile): - io = ExcelFile(io, engine=engine) + io = ExcelFile(io, storage_options=storage_options, engine=engine) elif engine and engine != io.engine: raise ValueError( "Engine should not be specified when passing " @@ -336,12 +347,14 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer, storage_options=storage_options + ) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer @@ -837,14 +850,16 @@ class ExcelFile: from pandas.io.excel._pyxlsb import _PyxlsbReader from pandas.io.excel._xlrd import _XlrdReader - _engines = { + _engines: Mapping[str, Any] = { "xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader, "pyxlsb": _PyxlsbReader, } - def __init__(self, path_or_buffer, engine=None): + def __init__( + self, path_or_buffer, engine=None, storage_options: StorageOptions = None + ): if engine is None: engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): @@ -858,13 +873,14 @@ def __init__(self, path_or_buffer, engine=None): raise ValueError(f"Unknown engine: {engine}") self.engine = engine + self.storage_options = storage_options # Could be a str, ExcelFile, Book, etc. 
self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io) + self._reader = self._engines[engine](self._io, storage_options=storage_options) def __fspath__(self): return self._io diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 44abaf5d3b3c9..a6cd8f524503b 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -16,13 +16,19 @@ class _ODFReader(_BaseExcelReader): Parameters ---------- - filepath_or_buffer: string, path to be parsed or + filepath_or_buffer : string, path to be parsed or an open readable stream. + storage_options : StorageOptions + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ - def __init__(self, filepath_or_buffer: FilePathOrBuffer): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ): import_optional_dependency("odf") - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 03a30cbd62f9a..73239190604db 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ExcelWriter, _BaseExcelReader @@ -467,7 +467,11 @@ def write_cells( class _OpenpyxlReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: + def __init__( + self, + 
filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ) -> None: """ Reader using openpyxl engine. @@ -475,9 +479,11 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. + storage_options : StorageOptions + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 0d96c8c4acdb8..c0e281ff6c2da 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,25 +1,31 @@ from typing import List -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _PyxlsbReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ): """ Reader using pyxlsb engine. Parameters ---------- - filepath_or_buffer: str, path object, or Workbook + filepath_or_buffer : str, path object, or Workbook Object to be parsed. 
+ storage_options : StorageOptions + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer # And set the result to the book-attribute - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index af82c15fd6b66..ff1b3c8bdb964 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -2,13 +2,14 @@ import numpy as np +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _XlrdReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): """ Reader using xlrd engine. @@ -16,10 +17,12 @@ def __init__(self, filepath_or_buffer): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. 
+ storage_options : StorageOptions + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" import_optional_dependency("xlrd", extra=err_msg) - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 2c664e73b9463..2d86fa44f22a4 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,5 +1,6 @@ """ feather-format compat """ +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas import DataFrame, Int64Index, RangeIndex @@ -7,7 +8,7 @@ from pandas.io.common import get_filepath_or_buffer -def to_feather(df: DataFrame, path, storage_options=None, **kwargs): +def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -77,7 +78,9 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True, storage_options=None): +def read_feather( + path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None +): """ Load a feather-format object from the file path. 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5d49757ce7d58..983aa56324083 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,7 +20,7 @@ import pandas._libs.parsers as parsers from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing -from pandas._typing import FilePathOrBuffer, Union +from pandas._typing import FilePathOrBuffer, StorageOptions, Union from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -596,7 +596,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, - storage_options=None, + storage_options: StorageOptions = None, ): # gh-23761 # diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index fcee25c258efa..518f31d73efa9 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,4 +1,7 @@ import os +import shlex +import subprocess +import time import pytest @@ -31,10 +34,62 @@ def feather_file(datapath): @pytest.fixture -def s3_resource(tips_file, jsonl_file, feather_file): +def s3so(): + return dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + + +@pytest.fixture(scope="module") +def s3_base(): """ Fixture for mocking S3 interaction. 
+ Sets up moto server in separate process + """ + pytest.importorskip("s3fs") + pytest.importorskip("boto3") + requests = pytest.importorskip("requests") + + with tm.ensure_safe_environment_variables(): + # temporary workaround as moto fails for botocore >= 1.11 otherwise, + # see https://github.com/spulec/moto/issues/1924 & 1952 + os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") + os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") + + pytest.importorskip("moto", minversion="1.3.14") + pytest.importorskip("flask") # server mode needs flask too + + # Launching moto in server mode, i.e., as a separate process + # with an S3 endpoint on localhost + + endpoint_uri = "http://127.0.0.1:5555/" + + # pipe to null to avoid logging in terminal + proc = subprocess.Popen( + shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL + ) + + timeout = 5 + while timeout > 0: + try: + # OK to go once server is accepting connections + r = requests.get(endpoint_uri) + if r.ok: + break + except Exception: + pass + timeout -= 0.1 + time.sleep(0.1) + yield + + proc.terminate() + proc.wait() + + +@pytest.fixture() +def s3_resource(s3_base, tips_file, jsonl_file, feather_file): + """ + Sets up S3 bucket with contents + The primary bucket name is "pandas-test". The following datasets are loaded. @@ -46,45 +101,59 @@ def s3_resource(tips_file, jsonl_file, feather_file): A private bucket "cant_get_it" is also created. The boto3 s3 resource is yielded by the fixture. 
""" - s3fs = pytest.importorskip("s3fs") - boto3 = pytest.importorskip("boto3") - - with tm.ensure_safe_environment_variables(): - # temporary workaround as moto fails for botocore >= 1.11 otherwise, - # see https://github.com/spulec/moto/issues/1924 & 1952 - os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") - os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - - moto = pytest.importorskip("moto") - - test_s3_files = [ - ("tips#1.csv", tips_file), - ("tips.csv", tips_file), - ("tips.csv.gz", tips_file + ".gz"), - ("tips.csv.bz2", tips_file + ".bz2"), - ("items.jsonl", jsonl_file), - ("simple_dataset.feather", feather_file), - ] - - def add_tips_files(bucket_name): - for s3_key, file_name in test_s3_files: - with open(file_name, "rb") as f: - conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) - - try: - s3 = moto.mock_s3() - s3.start() - - # see gh-16135 - bucket = "pandas-test" - conn = boto3.resource("s3", region_name="us-east-1") - - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) - - conn.create_bucket(Bucket="cant_get_it", ACL="private") - add_tips_files("cant_get_it") - s3fs.S3FileSystem.clear_instance_cache() - yield conn - finally: - s3.stop() + import boto3 + import s3fs + + test_s3_files = [ + ("tips#1.csv", tips_file), + ("tips.csv", tips_file), + ("tips.csv.gz", tips_file + ".gz"), + ("tips.csv.bz2", tips_file + ".bz2"), + ("items.jsonl", jsonl_file), + ("simple_dataset.feather", feather_file), + ] + + def add_tips_files(bucket_name): + for s3_key, file_name in test_s3_files: + with open(file_name, "rb") as f: + cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f) + + bucket = "pandas-test" + endpoint_uri = "http://127.0.0.1:5555/" + conn = boto3.resource("s3", endpoint_url=endpoint_uri) + cli = boto3.client("s3", endpoint_url=endpoint_uri) + + try: + cli.create_bucket(Bucket=bucket) + except: # noqa + # OK is bucket already exists + pass + try: + cli.create_bucket(Bucket="cant_get_it", ACL="private") + except: # 
noqa + # OK is bucket already exists + pass + timeout = 2 + while not cli.list_buckets()["Buckets"] and timeout > 0: + time.sleep(0.1) + timeout -= 0.1 + + add_tips_files(bucket) + add_tips_files("cant_get_it") + s3fs.S3FileSystem.clear_instance_cache() + yield conn + + s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + + try: + s3.rm(bucket, recursive=True) + except: # noqa + pass + try: + s3.rm("cant_get_it", recursive=True) + except: # noqa + pass + timeout = 2 + while cli.list_buckets()["Buckets"] and timeout > 0: + time.sleep(0.1) + timeout -= 0.1 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 51fbbf836a03f..431a50477fccc 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -606,13 +606,14 @@ def test_read_from_http_url(self, read_ext): tm.assert_frame_equal(url_table, local_table) @td.skip_if_not_us_locale - def test_read_from_s3_url(self, read_ext, s3_resource): + def test_read_from_s3_url(self, read_ext, s3_resource, s3so): # Bucket "pandas-test" created in tests/io/conftest.py with open("test1" + read_ext, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) url = "s3://pandas-test/test1" + read_ext - url_table = pd.read_excel(url) + + url_table = pd.read_excel(url, storage_options=s3so) local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 182c21ed1d416..5bb205842269e 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -44,7 +44,11 @@ def test_with_s3_url(compression, s3_resource): with open(path, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) - roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression) + roundtripped_df = pd.read_json( + 
"s3://pandas-test/test-1", + compression=compression, + storage_options=dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}), + ) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1280d0fd434d5..64a666079876f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1213,10 +1213,12 @@ def test_read_inline_jsonl(self): tm.assert_frame_equal(result, expected) @td.skip_if_not_us_locale - def test_read_s3_jsonl(self, s3_resource): + def test_read_s3_jsonl(self, s3_resource, s3so): # GH17200 - result = read_json("s3n://pandas-test/items.jsonl", lines=True) + result = read_json( + "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so + ) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -1706,7 +1708,12 @@ def test_to_s3(self, s3_resource): # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) - df.to_json(f"s3://{mock_bucket_name}/{target_file}") + df.to_json( + f"s3://{mock_bucket_name}/{target_file}", + storage_options=dict( + client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"} + ), + ) timeout = 5 while True: if target_file in ( diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index b30a7b1ef34de..b8b03cbd14a1d 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -71,50 +71,62 @@ def tips_df(datapath): @td.skip_if_not_us_locale() class TestS3: @td.skip_if_no("s3fs") - def test_parse_public_s3_bucket(self, tips_df): + def test_parse_public_s3_bucket(self, tips_df, s3so): # more of an integration test due to the not-public contents portion # can probably mock this though. 
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp) + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + compression=comp, + storage_options=s3so, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents - df = read_csv("s3://cant_get_it/tips.csv") + df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3n_bucket(self, tips_df): + def test_parse_public_s3n_bucket(self, tips_df, s3so): # Read from AWS s3 as "s3n" URL - df = read_csv("s3n://pandas-test/tips.csv", nrows=10) + df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3a_bucket(self, tips_df): + def test_parse_public_s3a_bucket(self, tips_df, s3so): # Read from AWS s3 as "s3a" URL - df = read_csv("s3a://pandas-test/tips.csv", nrows=10) + df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self, tips_df): + def test_parse_public_s3_bucket_nrows(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp) + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + nrows=10, + compression=comp, + storage_options=s3so, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self, tips_df): + def test_parse_public_s3_bucket_chunked(self, tips_df, s3so): # Read with a chunksize chunksize = 5 for ext, comp in [("", None), (".gz", 
"gzip"), (".bz2", "bz2")]: df_reader = read_csv( - "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp + "s3://pandas-test/tips.csv" + ext, + chunksize=chunksize, + compression=comp, + storage_options=s3so, ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -126,7 +138,7 @@ def test_parse_public_s3_bucket_chunked(self, tips_df): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self, tips_df): + def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): # Read with a chunksize using the Python parser chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: @@ -135,6 +147,7 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): chunksize=chunksize, compression=comp, engine="python", + storage_options=s3so, ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -145,46 +158,53 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self, tips_df): + def test_parse_public_s3_bucket_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression=comp, + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self, tips_df): + def test_infer_s3_compression(self, tips_df, s3so): for ext in ["", ".gz", ".bz2"]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer" + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression="infer", + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not 
df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self, tips_df): + def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( "s3://pandas-test/tips.csv" + ext, engine="python", nrows=10, compression=comp, + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_read_s3_fails(self): + def test_read_s3_fails(self, s3so): with pytest.raises(IOError): - read_csv("s3://nyqpug/asdf.csv") + read_csv("s3://nyqpug/asdf.csv", storage_options=s3so) # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. with pytest.raises(IOError): read_csv("s3://cant_get_it/file.csv") - def test_write_s3_csv_fails(self, tips_df): + def test_write_s3_csv_fails(self, tips_df, s3so): # GH 32486 # Attempting to write to an invalid S3 path should raise import botocore @@ -195,10 +215,12 @@ def test_write_s3_csv_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_csv("s3://an_s3_bucket_data_doesnt_exit/not_real.csv") + tips_df.to_csv( + "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so + ) @td.skip_if_no("pyarrow") - def test_write_s3_parquet_fails(self, tips_df): + def test_write_s3_parquet_fails(self, tips_df, s3so): # GH 27679 # Attempting to write to an invalid S3 path should raise import botocore @@ -209,7 +231,10 @@ def test_write_s3_parquet_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_parquet("s3://an_s3_bucket_data_doesnt_exit/not_real.parquet") + tips_df.to_parquet( + "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", + storage_options=s3so, + ) def 
test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -225,7 +250,7 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): expected = read_csv(tips_file) tm.assert_frame_equal(result, expected) - def test_read_csv_chunked_download(self, s3_resource, caplog): + def test_read_csv_chunked_download(self, s3_resource, caplog, s3so): # 8 MB, S3FS usees 5MB chunks import s3fs @@ -245,18 +270,20 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): s3fs.S3FileSystem.clear_instance_cache() with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv("s3://pandas-test/large-file.csv", nrows=5) + read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so) # log of fetch_range (start, stop) assert (0, 5505024) in (x.args[-2:] for x in caplog.records) - def test_read_s3_with_hash_in_key(self, tips_df): + def test_read_s3_with_hash_in_key(self, tips_df, s3so): # GH 25945 - result = read_csv("s3://pandas-test/tips#1.csv") + result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so) tm.assert_frame_equal(tips_df, result) @td.skip_if_no("pyarrow") - def test_read_feather_s3_file_path(self, feather_file): + def test_read_feather_s3_file_path(self, feather_file, s3so): # GH 29055 expected = read_feather(feather_file) - res = read_feather("s3://pandas-test/simple_dataset.feather") + res = read_feather( + "s3://pandas-test/simple_dataset.feather", storage_options=s3so + ) tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 3e89f6ca4ae16..666da677d702e 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -131,27 +131,38 @@ def test_fastparquet_options(fsspectest): @td.skip_if_no("s3fs") -def test_from_s3_csv(s3_resource, tips_file): - tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) +def test_from_s3_csv(s3_resource, tips_file, s3so): + tm.assert_equal( + 
read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file) + ) # the following are decompressed by pandas, not fsspec - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file)) - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file)) + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv.gz", storage_options=s3so), + read_csv(tips_file), + ) + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv.bz2", storage_options=s3so), + read_csv(tips_file), + ) @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"]) @td.skip_if_no("s3fs") -def test_s3_protocols(s3_resource, tips_file, protocol): +def test_s3_protocols(s3_resource, tips_file, protocol, s3so): tm.assert_equal( - read_csv("%s://pandas-test/tips.csv" % protocol), read_csv(tips_file) + read_csv("%s://pandas-test/tips.csv" % protocol, storage_options=s3so), + read_csv(tips_file), ) @td.skip_if_no("s3fs") @td.skip_if_no("fastparquet") -def test_s3_parquet(s3_resource): +def test_s3_parquet(s3_resource, s3so): fn = "s3://pandas-test/test.parquet" - df1.to_parquet(fn, index=False, engine="fastparquet", compression=None) - df2 = read_parquet(fn, engine="fastparquet") + df1.to_parquet( + fn, index=False, engine="fastparquet", compression=None, storage_options=s3so + ) + df2 = read_parquet(fn, engine="fastparquet", storage_options=s3so) tm.assert_equal(df1, df2) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 82157f3d722a9..3a3ba99484a3a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -158,6 +158,10 @@ def check_round_trip( """ write_kwargs = write_kwargs or {"compression": None} read_kwargs = read_kwargs or {} + if isinstance(path, str) and "s3://" in path: + s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + read_kwargs["storage_options"] = s3so + write_kwargs["storage_options"] = s3so if expected is None: expected = df @@ -537,9 +541,11 @@ def 
test_categorical(self, pa): expected = df.astype(object) check_round_trip(df, pa, expected=expected) - def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): + def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): s3fs = pytest.importorskip("s3fs") - s3 = s3fs.S3FileSystem() + if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"): + pytest.skip() + s3 = s3fs.S3FileSystem(**s3so) kw = dict(filesystem=s3) check_round_trip( df_compat, @@ -550,6 +556,8 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): ) def test_s3_roundtrip(self, df_compat, s3_resource, pa): + if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"): + pytest.skip() # GH #19134 check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") @@ -560,10 +568,13 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716 # As per pyarrow partitioned columns become 'categorical' dtypes # and are added to back of dataframe on read + if partition_col and pd.compat.is_platform_windows(): + pytest.skip("pyarrow/win incompatibility #35791") expected_df = df_compat.copy() if partition_col: expected_df[partition_col] = expected_df[partition_col].astype("category") + check_round_trip( df_compat, pa, diff --git a/requirements-dev.txt b/requirements-dev.txt index deaed8ab9d5f1..2fbb20ddfd3bf 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -32,6 +32,7 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto +flask pytest>=5.0.1 pytest-cov pytest-xdist>=1.21 From ed0ff06387d96371755188c31b29f54783945fe7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 21 Aug 2020 14:07:28 -0700 Subject: [PATCH 0535/1025] REF: simplify _cython_agg_blocks (#35841) --- pandas/core/groupby/generic.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/generic.py 
b/pandas/core/groupby/generic.py index 166631e69f523..60e23b14eaf09 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -24,7 +24,6 @@ Tuple, Type, Union, - cast, ) import warnings @@ -1100,24 +1099,19 @@ def blk_func(block: "Block") -> List["Block"]: # continue and exclude the block raise else: - result = cast(DataFrame, result) + assert isinstance(result, (Series, DataFrame)) # for mypy + # In the case of object dtype block, it may have been split + # in the operation. We un-split here. + result = result._consolidate() + assert isinstance(result, (Series, DataFrame)) # for mypy + assert len(result._mgr.blocks) == 1 + # unwrap DataFrame to get array - if len(result._mgr.blocks) != 1: - # We've split an object block! Everything we've assumed - # about a single block input returning a single block output - # is a lie. To keep the code-path for the typical non-split case - # clean, we choose to clean up this mess later on. - assert len(locs) == result.shape[1] - for i, loc in enumerate(locs): - agg_block = result.iloc[:, [i]]._mgr.blocks[0] - agg_block.mgr_locs = [loc] - new_blocks.append(agg_block) - else: - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - agg_block = cast_result_block(result, block, how) - new_blocks = [agg_block] + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + agg_block = cast_result_block(result, block, how) + new_blocks = [agg_block] else: agg_block = cast_result_block(result, block, how) new_blocks = [agg_block] From 6e3bf44dc4f289a53e1128ae021a5e72d2440501 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 21 Aug 2020 14:11:33 -0700 Subject: [PATCH 0536/1025] CI: avoid file leak from ipython tests (#35836) --- pandas/conftest.py | 8 +++++++- pandas/tests/frame/test_api.py | 2 ++ pandas/tests/io/formats/test_format.py | 2 ++ 
pandas/tests/resample/test_resampler_grouper.py | 2 ++ pandas/tests/series/test_api.py | 2 ++ 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 97cc514e31bb3..0878380d00837 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1181,7 +1181,13 @@ def ip(): pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.interactiveshell import InteractiveShell - return InteractiveShell() + # GH#35711 make sure sqlite history file handle is not leaked + from traitlets.config import Config # noqa: F401 isort:skip + + c = Config() + c.HistoryManager.hist_file = ":memory:" + + return InteractiveShell(config=c) @pytest.fixture(params=["bsr", "coo", "csc", "csr", "dia", "dok", "lil"]) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 2fb1f7f911a9c..0716cf5e27119 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -6,6 +6,7 @@ import numpy as np import pytest +import pandas.util._test_decorators as td from pandas.util._test_decorators import async_mark, skip_if_no import pandas as pd @@ -521,6 +522,7 @@ def _check_f(base, f): _check_f(d.copy(), f) @async_mark() + @td.check_file_leaks async def test_tab_complete_warning(self, ip): # GH 16409 pytest.importorskip("IPython", minversion="6.0.0") diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index d3031a9e1695a..a0f475acc4cbb 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -19,6 +19,7 @@ import pytz from pandas.compat import is_platform_32bit, is_platform_windows +import pandas.util._test_decorators as td import pandas as pd from pandas import ( @@ -3346,6 +3347,7 @@ def test_format_percentiles_integer_idx(): assert result == expected +@td.check_file_leaks def test_repr_html_ipython_config(ip): code = textwrap.dedent( """\ diff --git a/pandas/tests/resample/test_resampler_grouper.py 
b/pandas/tests/resample/test_resampler_grouper.py index b36b11582c1ec..f18aaa5e86829 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -3,6 +3,7 @@ import numpy as np import pytest +import pandas.util._test_decorators as td from pandas.util._test_decorators import async_mark import pandas as pd @@ -17,6 +18,7 @@ @async_mark() +@td.check_file_leaks async def test_tab_complete_ipython6_warning(ip): from IPython.core.completer import provisionalcompleter diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index b174eb0e42776..d81e8a4f82ffb 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -5,6 +5,7 @@ import numpy as np import pytest +import pandas.util._test_decorators as td from pandas.util._test_decorators import async_mark import pandas as pd @@ -486,6 +487,7 @@ def test_empty_method(self): assert not full_series.empty @async_mark() + @td.check_file_leaks async def test_tab_complete_warning(self, ip): # https://github.com/pandas-dev/pandas/issues/16409 pytest.importorskip("IPython", minversion="6.0.0") From d14a0103f271dbf41776c639656a1bc987625e6e Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 21 Aug 2020 16:12:58 -0500 Subject: [PATCH 0537/1025] CI/DOC: unpin gitdb #35823 (#35824) --- environment.yml | 2 +- requirements-dev.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index 6afc19c227512..96f2c8d2086c7 100644 --- a/environment.yml +++ b/environment.yml @@ -26,7 +26,7 @@ dependencies: # documentation - gitpython # obtain contributors from git for whatsnew - - gitdb2=2.0.6 # GH-32060 + - gitdb - sphinx # documentation (jupyter notebooks) diff --git a/requirements-dev.txt b/requirements-dev.txt index 2fbb20ddfd3bf..1fca25c9fecd9 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -15,7 +15,7 @@ isort>=5.2.1 mypy==0.730 pycodestyle gitpython -gitdb2==2.0.6 
+gitdb sphinx nbconvert>=5.4.1 nbsphinx From df3298e7e87eeaa8777b8b4ffe17c9a2bd0b994a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 21 Aug 2020 14:53:51 -0700 Subject: [PATCH 0538/1025] BUG: issubclass check with dtype instead of type, closes GH#24883 (#35794) --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/core/computation/ops.py | 14 +++++++++++--- pandas/tests/frame/test_query_eval.py | 7 +++++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 81acd567027e5..7bd547bf03a87 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -24,7 +24,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - +- Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) - - diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index bc9ff7c44b689..e55df1e1d8155 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -481,13 +481,21 @@ def stringify(value): self.lhs.update(v) def _disallow_scalar_only_bool_ops(self): + rhs = self.rhs + lhs = self.lhs + + # GH#24883 unwrap dtype if necessary to ensure we have a type object + rhs_rt = rhs.return_type + rhs_rt = getattr(rhs_rt, "type", rhs_rt) + lhs_rt = lhs.return_type + lhs_rt = getattr(lhs_rt, "type", lhs_rt) if ( - (self.lhs.is_scalar or self.rhs.is_scalar) + (lhs.is_scalar or rhs.is_scalar) and self.op in _bool_ops_dict and ( not ( - issubclass(self.rhs.return_type, (bool, np.bool_)) - and issubclass(self.lhs.return_type, (bool, np.bool_)) + issubclass(rhs_rt, (bool, np.bool_)) + and issubclass(lhs_rt, (bool, np.bool_)) ) ) ): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 628b955a1de92..56d178daee7fd 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -160,6 +160,13 @@ def test_eval_resolvers_as_list(self): assert df.eval("a + b", 
resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"] + def test_eval_object_dtype_binop(self): + # GH#24883 + df = pd.DataFrame({"a1": ["Y", "N"]}) + res = df.eval("c = ((a1 == 'Y') & True)") + expected = pd.DataFrame({"a1": ["Y", "N"], "c": [True, False]}) + tm.assert_frame_equal(res, expected) + class TestDataFrameQueryWithMultiIndex: def test_query_with_named_multiindex(self, parser, engine): From 01fa19fdf5be177033e61fc6225e6bbb8a020509 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 21 Aug 2020 14:56:50 -0700 Subject: [PATCH 0539/1025] REF: _apply_blockwise define exclude in terms of skipped (#35740) --- pandas/core/window/rolling.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ac96258cbc3c9..f516871f789d0 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -12,7 +12,7 @@ from pandas._libs.tslibs import BaseOffset, to_offset import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import ArrayLike, Axis, FrameOrSeries, Scalar +from pandas._typing import ArrayLike, Axis, FrameOrSeries, Label from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -381,21 +381,31 @@ def _wrap_result(self, result, block=None, obj=None): return type(obj)(result, index=index, columns=block.columns) return result - def _wrap_results(self, results, blocks, obj, exclude=None) -> FrameOrSeries: + def _wrap_results(self, results, obj, skipped: List[int]) -> FrameOrSeries: """ Wrap the results. 
Parameters ---------- results : list of ndarrays - blocks : list of blocks obj : conformed data (may be resampled) - exclude: list of columns to exclude, default to None + skipped: List[int] + Indices of blocks that are skipped. """ from pandas import Series, concat + exclude: List[Label] = [] + if obj.ndim == 2: + orig_blocks = list(obj._to_dict_of_blocks(copy=False).values()) + for i in skipped: + exclude.extend(orig_blocks[i].columns) + else: + orig_blocks = [obj] + + kept_blocks = [blk for i, blk in enumerate(orig_blocks) if i not in skipped] + final = [] - for result, block in zip(results, blocks): + for result, block in zip(results, kept_blocks): result = self._wrap_result(result, block=block, obj=obj) if result.ndim == 1: @@ -491,7 +501,6 @@ def _apply_blockwise( skipped: List[int] = [] results: List[ArrayLike] = [] - exclude: List[Scalar] = [] for i, b in enumerate(blocks): try: values = self._prep_values(b.values) @@ -499,7 +508,6 @@ def _apply_blockwise( except (TypeError, NotImplementedError) as err: if isinstance(obj, ABCDataFrame): skipped.append(i) - exclude.extend(b.columns) continue else: raise DataError("No numeric types to aggregate") from err @@ -507,8 +515,7 @@ def _apply_blockwise( result = homogeneous_func(values) results.append(result) - block_list = [blk for i, blk in enumerate(blocks) if i not in skipped] - return self._wrap_results(results, block_list, obj, exclude) + return self._wrap_results(results, obj, skipped) def _apply( self, @@ -1283,7 +1290,7 @@ def count(self): ).sum() results.append(result) - return self._wrap_results(results, blocks, obj) + return self._wrap_results(results, obj, skipped=[]) _shared_docs["apply"] = dedent( r""" From b55eb0c458b12fd9ada8364a21843e18115e5865 Mon Sep 17 00:00:00 2001 From: Karthik Mathur <22126205+mathurk1@users.noreply.github.com> Date: Fri, 21 Aug 2020 17:34:51 -0500 Subject: [PATCH 0540/1025] TST: add test for agg on ordered categorical cols (#35630) --- 
.../tests/groupby/aggregate/test_aggregate.py | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index ce9d4b892d775..8fe450fe6abfc 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1063,6 +1063,85 @@ def test_groupby_get_by_index(): pd.testing.assert_frame_equal(res, expected) +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}), + ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}), + ({"nr": "min"}, {"nr": [1, 5]}), + ], +) +def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data): + # test single aggregations on ordered categorical cols GHGH27800 + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + "cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + expected_df = pd.DataFrame(data=exp_data, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + +@pytest.mark.parametrize( + "grp_col_dict, exp_data", + [ + ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]), + ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]), + ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]), + ], +) +def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data): + # test combined aggregations on ordered categorical cols GH27800 + + # create the result dataframe + input_df = pd.DataFrame( + { + "nr": [1, 2, 3, 4, 5, 6, 7, 8], + "cat_ord": list("aabbccdd"), + 
"cat": list("aaaabbbb"), + } + ) + + input_df = input_df.astype({"cat": "category", "cat_ord": "category"}) + input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered() + result_df = input_df.groupby("cat").agg(grp_col_dict) + + # create expected dataframe + cat_index = pd.CategoricalIndex( + ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category" + ) + + # unpack the grp_col_dict to create the multi-index tuple + # this tuple will be used to create the expected dataframe index + multi_index_list = [] + for k, v in grp_col_dict.items(): + if isinstance(v, list): + for value in v: + multi_index_list.append([k, value]) + else: + multi_index_list.append([k, v]) + multi_index = pd.MultiIndex.from_tuples(tuple(multi_index_list)) + + expected_df = pd.DataFrame(data=exp_data, columns=multi_index, index=cat_index) + + tm.assert_frame_equal(result_df, expected_df) + + def test_nonagg_agg(): # GH 35490 - Single/Multiple agg of non-agg function give same results # TODO: agg should raise for functions that don't aggregate From 7fd758c128686dde187fcfc336985f9fba647eb6 Mon Sep 17 00:00:00 2001 From: tkmz-n <60312218+tkmz-n@users.noreply.github.com> Date: Sat, 22 Aug 2020 07:42:50 +0900 Subject: [PATCH 0541/1025] TST: resample does not yield empty groups (#10603) (#35799) --- pandas/tests/resample/test_timedelta.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py index 0fbb60c176b30..3fa85e62d028c 100644 --- a/pandas/tests/resample/test_timedelta.py +++ b/pandas/tests/resample/test_timedelta.py @@ -150,3 +150,18 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq): tm.assert_index_equal(result.index, expected_index) assert result.index.freq == expected_index.freq assert not np.isnan(result[-1]) + + +def test_resample_with_timedelta_yields_no_empty_groups(): + # GH 10603 + df = pd.DataFrame( + np.random.normal(size=(10000, 4)), + 
index=pd.timedelta_range(start="0s", periods=10000, freq="3906250n"), + ) + result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x)) + + expected = pd.DataFrame( + [[768.0] * 4] * 12 + [[528.0] * 4], + index=pd.timedelta_range(start="1s", periods=13, freq="3s"), + ) + tm.assert_frame_equal(result, expected) From da364e111208ce68189f1f8670f9e5c51e8d83ef Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 21 Aug 2020 19:02:01 -0700 Subject: [PATCH 0542/1025] REF: remove unnecesary try/except (#35839) --- pandas/core/groupby/generic.py | 61 ++++++++++++++++------------------ 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 60e23b14eaf09..4b1f6cfe0a662 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -30,7 +30,7 @@ import numpy as np from pandas._libs import lib -from pandas._typing import FrameOrSeries, FrameOrSeriesUnion +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc from pandas.core.dtypes.cast import ( @@ -59,6 +59,7 @@ validate_func_kwargs, ) import pandas.core.algorithms as algorithms +from pandas.core.arrays import ExtensionArray from pandas.core.base import DataError, SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype @@ -1033,32 +1034,31 @@ def _cython_agg_blocks( no_result = object() - def cast_result_block(result, block: "Block", how: str) -> "Block": - # see if we can cast the block to the desired dtype + def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: + # see if we can cast the values to the desired dtype # this may not be the original dtype assert not isinstance(result, DataFrame) assert result is not no_result - dtype = maybe_cast_result_dtype(block.dtype, how) + dtype = maybe_cast_result_dtype(values.dtype, how) result = 
maybe_downcast_numeric(result, dtype) - if block.is_extension and isinstance(result, np.ndarray): - # e.g. block.values was an IntegerArray - # (1, N) case can occur if block.values was Categorical + if isinstance(values, ExtensionArray) and isinstance(result, np.ndarray): + # e.g. values was an IntegerArray + # (1, N) case can occur if values was Categorical # and result is ndarray[object] # TODO(EA2D): special casing not needed with 2D EAs assert result.ndim == 1 or result.shape[0] == 1 try: # Cast back if feasible - result = type(block.values)._from_sequence( - result.ravel(), dtype=block.values.dtype + result = type(values)._from_sequence( + result.ravel(), dtype=values.dtype ) except (ValueError, TypeError): # reshape to be valid for non-Extension Block result = result.reshape(1, -1) - agg_block: "Block" = block.make_block(result) - return agg_block + return result def blk_func(block: "Block") -> List["Block"]: new_blocks: List["Block"] = [] @@ -1092,28 +1092,25 @@ def blk_func(block: "Block") -> List["Block"]: # Categoricals. This will done by later self._reindex_output() # Doing it here creates an error. See GH#34951 sgb = get_groupby(obj, self.grouper, observed=True) - try: - result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) - except TypeError: - # we may have an exception in trying to aggregate - # continue and exclude the block - raise - else: - assert isinstance(result, (Series, DataFrame)) # for mypy - # In the case of object dtype block, it may have been split - # in the operation. We un-split here. 
- result = result._consolidate() - assert isinstance(result, (Series, DataFrame)) # for mypy - assert len(result._mgr.blocks) == 1 - - # unwrap DataFrame to get array - result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - agg_block = cast_result_block(result, block, how) - new_blocks = [agg_block] + result = sgb.aggregate(lambda x: alt(x, axis=self.axis)) + + assert isinstance(result, (Series, DataFrame)) # for mypy + # In the case of object dtype block, it may have been split + # in the operation. We un-split here. + result = result._consolidate() + assert isinstance(result, (Series, DataFrame)) # for mypy + assert len(result._mgr.blocks) == 1 + + # unwrap DataFrame to get array + result = result._mgr.blocks[0].values + if isinstance(result, np.ndarray) and result.ndim == 1: + result = result.reshape(1, -1) + res_values = cast_agg_result(result, block.values, how) + agg_block = block.make_block(res_values) + new_blocks = [agg_block] else: - agg_block = cast_result_block(result, block, how) + res_values = cast_agg_result(result, block.values, how) + agg_block = block.make_block(res_values) new_blocks = [agg_block] return new_blocks From c1625297319c98f30d6bf5c69955d07f1aa7253e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 21 Aug 2020 19:04:20 -0700 Subject: [PATCH 0543/1025] BUG: DataFrame.apply with result_type=reduce incorrect index (#35777) * BUG: DataFrame.apply with result_type=reduce incorrect index * move whatsnew to 1.1.1 * move whatsnew --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/apply.py | 5 ++++- pandas/tests/frame/apply/test_frame_apply.py | 9 +++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 7bd547bf03a87..c1b73c60be92b 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -25,6 +25,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in 
:meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) +- Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - - diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 6d44cf917a07a..99a9e1377563c 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -340,7 +340,10 @@ def wrap_results_for_axis( if self.result_type == "reduce": # e.g. test_apply_dict GH#8735 - return self.obj._constructor_sliced(results) + res = self.obj._constructor_sliced(results) + res.index = res_index + return res + elif self.result_type is None and all( isinstance(x, dict) for x in results.values() ): diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 538978358c8e7..5a1e448beb40f 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1541,3 +1541,12 @@ def func(row): tm.assert_frame_equal(result, expected) tm.assert_frame_equal(df, result) + + +def test_apply_empty_list_reduce(): + # GH#35683 get columns correct + df = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], columns=["a", "b"]) + + result = df.apply(lambda x: [], result_type="reduce") + expected = pd.Series({"a": [], "b": []}, dtype=object) + tm.assert_series_equal(result, expected) From fbaf278d19384785b3534df4b3a2b5fc15b1dd08 Mon Sep 17 00:00:00 2001 From: Chankey Pathak Date: Sat, 22 Aug 2020 08:38:19 +0530 Subject: [PATCH 0544/1025] Avoid redirect (#35674) --- doc/source/getting_started/tutorials.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/tutorials.rst b/doc/source/getting_started/tutorials.rst index 4c2d0621c6103..b8940d2efed2f 100644 --- a/doc/source/getting_started/tutorials.rst +++ b/doc/source/getting_started/tutorials.rst @@ -94,4 +94,4 @@ Various tutorials * `Intro to pandas data structures, by Greg Reda `_ * `Pandas and Python: Top 10, by 
Manish Amde `_ * `Pandas DataFrames Tutorial, by Karlijn Willems `_ -* `A concise tutorial with real life examples `_ +* `A concise tutorial with real life examples `_ From 49ad8ba9ab00f450587efe503fad46e912d2722b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 21 Aug 2020 20:30:05 -0700 Subject: [PATCH 0545/1025] PERF: Allow jitting of groupby agg loop (#35759) * Roll back groupby agg changes * Add aggragate_with_numba * Fix cases where operation on Series inputs * Simplify case, handle Series correctly * Ensure function is being cached, validate the udf signature for groupby agg * Move some functionality to groupby/numba_.py * Change ValueError to NotImplementedError * Comment that it's only 1 function that is supported * Add whatsnew * Add issue number and correct typing * Add docstring for _aggregate_with_numba * Lint Co-authored-by: Matt Roeschke --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/groupby/generic.py | 45 +++-- pandas/core/groupby/groupby.py | 70 +++++--- pandas/core/groupby/numba_.py | 172 +++++++++++++++++++ pandas/core/groupby/ops.py | 48 +----- pandas/core/util/numba_.py | 93 ---------- pandas/tests/groupby/aggregate/test_numba.py | 24 ++- 7 files changed, 274 insertions(+), 180 deletions(-) create mode 100644 pandas/core/groupby/numba_.py diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3e6ed1cdf8f7e..09a5bcb0917c2 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -154,7 +154,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- +- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4b1f6cfe0a662..0edbfe3d67ca5 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -70,19 +70,16 @@ GroupBy, _agg_template, _apply_docs, + _group_selection_context, _transform_template, get_groupby, ) +from pandas.core.groupby.numba_ import generate_numba_func, split_for_numba from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager, make_block from pandas.core.series import Series -from pandas.core.util.numba_ import ( - NUMBA_FUNC_CACHE, - generate_numba_func, - maybe_use_numba, - split_for_numba, -) +from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba from pandas.plotting import boxplot_frame_groupby @@ -230,6 +227,18 @@ def apply(self, func, *args, **kwargs): ) def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + if maybe_use_numba(engine): + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." 
+ ) + with _group_selection_context(self): + data = self._selected_obj + result, index = self._aggregate_with_numba( + data.to_frame(), func, *args, engine_kwargs=engine_kwargs, **kwargs + ) + return self.obj._constructor(result.ravel(), index=index, name=data.name) + relabeling = func is None columns = None if relabeling: @@ -252,16 +261,11 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) return getattr(self, cyfunc)() if self.grouper.nkeys > 1: - return self._python_agg_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._python_agg_general(func, *args, **kwargs) try: - return self._python_agg_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) + return self._python_agg_general(func, *args, **kwargs) except (ValueError, KeyError): - # Do not catch Numba errors here, we want to raise and not fall back. # TODO: KeyError is raised in _python_agg_general, # see see test_groupby.test_basic result = self._aggregate_named(func, *args, **kwargs) @@ -937,12 +941,19 @@ class DataFrameGroupBy(GroupBy[DataFrame]): ) def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): - relabeling, func, columns, order = reconstruct_func(func, **kwargs) - if maybe_use_numba(engine): - return self._python_agg_general( - func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs + if not callable(func): + raise NotImplementedError( + "Numba engine can only be used with a single function." 
+ ) + with _group_selection_context(self): + data = self._selected_obj + result, index = self._aggregate_with_numba( + data, func, *args, engine_kwargs=engine_kwargs, **kwargs ) + return self.obj._constructor(result, index=index, columns=data.columns) + + relabeling, func, columns, order = reconstruct_func(func, **kwargs) result, how = self._aggregate(func, *args, **kwargs) if how is None: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0047877ef78ee..f96b488fb8d0d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -34,7 +34,7 @@ class providing the base-class of operations. from pandas._config.config import option_context -from pandas._libs import Timestamp +from pandas._libs import Timestamp, lib import pandas._libs.groupby as libgroupby from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Scalar from pandas.compat.numpy import function as nv @@ -61,11 +61,11 @@ class providing the base-class of operations. import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.groupby import base, ops +from pandas.core.groupby import base, numba_, ops from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex from pandas.core.series import Series from pandas.core.sorting import get_group_index_sorter -from pandas.core.util.numba_ import maybe_use_numba +from pandas.core.util.numba_ import NUMBA_FUNC_CACHE _common_see_also = """ See Also @@ -384,7 +384,8 @@ class providing the base-class of operations. - dict of axis labels -> functions, function names or list of such. Can also accept a Numba JIT function with - ``engine='numba'`` specified. + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. 
If the ``'numba'`` engine is chosen, the function must be a user defined function with ``values`` and ``index`` as the @@ -1053,12 +1054,43 @@ def _cython_agg_general( return self._wrap_aggregated_output(output, index=self.grouper.result_index) - def _python_agg_general( - self, func, *args, engine="cython", engine_kwargs=None, **kwargs - ): + def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): + """ + Perform groupby aggregation routine with the numba engine. + + This routine mimics the data splitting routine of the DataSplitter class + to generate the indices of each group in the sorted data and then passes the + data and indices into a Numba jitted function. + """ + group_keys = self.grouper._get_group_keys() + labels, _, n_groups = self.grouper.group_info + sorted_index = get_group_index_sorter(labels, n_groups) + sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False) + sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() + starts, ends = lib.generate_slices(sorted_labels, n_groups) + cache_key = (func, "groupby_agg") + if cache_key in NUMBA_FUNC_CACHE: + # Return an already compiled version of roll_apply if available + numba_agg_func = NUMBA_FUNC_CACHE[cache_key] + else: + numba_agg_func = numba_.generate_numba_agg_func( + tuple(args), kwargs, func, engine_kwargs + ) + result = numba_agg_func( + sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns), + ) + if cache_key not in NUMBA_FUNC_CACHE: + NUMBA_FUNC_CACHE[cache_key] = numba_agg_func + + if self.grouper.nkeys > 1: + index = MultiIndex.from_tuples(group_keys, names=self.grouper.names) + else: + index = Index(group_keys, name=self.grouper.names[0]) + return result, index + + def _python_agg_general(self, func, *args, **kwargs): func = self._is_builtin_func(func) - if engine != "numba": - f = lambda x: func(x, *args, **kwargs) + f = lambda x: func(x, *args, **kwargs) # iterate through "columns" ex exclusions to populate 
output dict output: Dict[base.OutputKey, np.ndarray] = {} @@ -1069,21 +1101,11 @@ def _python_agg_general( # agg_series below assumes ngroups > 0 continue - if maybe_use_numba(engine): - result, counts = self.grouper.agg_series( - obj, - func, - *args, - engine=engine, - engine_kwargs=engine_kwargs, - **kwargs, - ) - else: - try: - # if this function is invalid for this dtype, we will ignore it. - result, counts = self.grouper.agg_series(obj, f) - except TypeError: - continue + try: + # if this function is invalid for this dtype, we will ignore it. + result, counts = self.grouper.agg_series(obj, f) + except TypeError: + continue assert result is not None key = base.OutputKey(label=name, position=idx) diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py new file mode 100644 index 0000000000000..aebe60f797fcd --- /dev/null +++ b/pandas/core/groupby/numba_.py @@ -0,0 +1,172 @@ +"""Common utilities for Numba operations with groupby ops""" +import inspect +from typing import Any, Callable, Dict, Optional, Tuple + +import numpy as np + +from pandas._typing import FrameOrSeries, Scalar +from pandas.compat._optional import import_optional_dependency + +from pandas.core.util.numba_ import ( + NUMBA_FUNC_CACHE, + NumbaUtilError, + check_kwargs_and_nopython, + get_jit_arguments, + jit_user_function, +) + + +def split_for_numba(arg: FrameOrSeries) -> Tuple[np.ndarray, np.ndarray]: + """ + Split pandas object into its components as numpy arrays for numba functions. + + Parameters + ---------- + arg : Series or DataFrame + + Returns + ------- + (ndarray, ndarray) + values, index + """ + return arg.to_numpy(), arg.index.to_numpy() + + +def validate_udf(func: Callable) -> None: + """ + Validate user defined function for ops when using Numba with groupby ops. + + The first signature arguments should include: + + def f(values, index, ...): + ... 
+ + Parameters + ---------- + func : function, default False + user defined function + + Returns + ------- + None + + Raises + ------ + NumbaUtilError + """ + udf_signature = list(inspect.signature(func).parameters.keys()) + expected_args = ["values", "index"] + min_number_args = len(expected_args) + if ( + len(udf_signature) < min_number_args + or udf_signature[:min_number_args] != expected_args + ): + raise NumbaUtilError( + f"The first {min_number_args} arguments to {func.__name__} must be " + f"{expected_args}" + ) + + +def generate_numba_func( + func: Callable, + engine_kwargs: Optional[Dict[str, bool]], + kwargs: dict, + cache_key_str: str, +) -> Tuple[Callable, Tuple[Callable, str]]: + """ + Return a JITed function and cache key for the NUMBA_FUNC_CACHE + + This _may_ be specific to groupby (as it's only used there currently). + + Parameters + ---------- + func : function + user defined function + engine_kwargs : dict or None + numba.jit arguments + kwargs : dict + kwargs for func + cache_key_str : str + string representing the second part of the cache key tuple + + Returns + ------- + (JITed function, cache key) + + Raises + ------ + NumbaUtilError + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + check_kwargs_and_nopython(kwargs, nopython) + validate_udf(func) + cache_key = (func, cache_key_str) + numba_func = NUMBA_FUNC_CACHE.get( + cache_key, jit_user_function(func, nopython, nogil, parallel) + ) + return numba_func, cache_key + + +def generate_numba_agg_func( + args: Tuple, + kwargs: Dict[str, Any], + func: Callable[..., Scalar], + engine_kwargs: Optional[Dict[str, bool]], +) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: + """ + Generate a numba jitted agg function specified by values from engine_kwargs. + + 1. jit the user's function + 2. 
Return a groupby agg function with the jitted function inline + + Configurations specified in engine_kwargs apply to both the user's + function _AND_ the rolling apply function. + + Parameters + ---------- + args : tuple + *args to be passed into the function + kwargs : dict + **kwargs to be passed into the function + func : function + function to be applied to each window and will be JITed + engine_kwargs : dict + dictionary of arguments to be passed into numba.jit + + Returns + ------- + Numba function + """ + nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + + check_kwargs_and_nopython(kwargs, nopython) + + validate_udf(func) + + numba_func = jit_user_function(func, nopython, nogil, parallel) + + numba = import_optional_dependency("numba") + + if parallel: + loop_range = numba.prange + else: + loop_range = range + + @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) + def group_apply( + values: np.ndarray, + index: np.ndarray, + begin: np.ndarray, + end: np.ndarray, + num_groups: int, + num_columns: int, + ) -> np.ndarray: + result = np.empty((num_groups, num_columns)) + for i in loop_range(num_groups): + group_index = index[begin[i] : end[i]] + for j in loop_range(num_columns): + group = values[begin[i] : end[i], j] + result[i, j] = numba_func(group, group_index, *args) + return result + + return group_apply diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 64eb413fe78fa..c6171a55359fe 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -55,12 +55,6 @@ get_group_index_sorter, get_indexer_dict, ) -from pandas.core.util.numba_ import ( - NUMBA_FUNC_CACHE, - generate_numba_func, - maybe_use_numba, - split_for_numba, -) class BaseGrouper: @@ -610,21 +604,11 @@ def _transform( return result def agg_series( - self, - obj: Series, - func: F, - *args, - engine: str = "cython", - engine_kwargs=None, - **kwargs, + self, obj: Series, func: F, *args, **kwargs, ): # Caller is responsible for checking 
ngroups != 0 assert self.ngroups != 0 - if maybe_use_numba(engine): - return self._aggregate_series_pure_python( - obj, func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs - ) if len(obj) == 0: # SeriesGrouper would raise if we were to call _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) @@ -670,20 +654,8 @@ def _aggregate_series_fast(self, obj: Series, func: F): return result, counts def _aggregate_series_pure_python( - self, - obj: Series, - func: F, - *args, - engine: str = "cython", - engine_kwargs=None, - **kwargs, + self, obj: Series, func: F, *args, **kwargs, ): - - if maybe_use_numba(engine): - numba_func, cache_key = generate_numba_func( - func, engine_kwargs, kwargs, "groupby_agg" - ) - group_index, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) @@ -692,13 +664,7 @@ def _aggregate_series_pure_python( splitter = get_splitter(obj, group_index, ngroups, axis=0) for label, group in splitter: - if maybe_use_numba(engine): - values, index = split_for_numba(group) - res = numba_func(values, index, *args) - if cache_key not in NUMBA_FUNC_CACHE: - NUMBA_FUNC_CACHE[cache_key] = numba_func - else: - res = func(group, *args, **kwargs) + res = func(group, *args, **kwargs) if result is None: if isinstance(res, (Series, Index, np.ndarray)): @@ -876,13 +842,7 @@ def groupings(self) -> "List[grouper.Grouping]": ] def agg_series( - self, - obj: Series, - func: F, - *args, - engine: str = "cython", - engine_kwargs=None, - **kwargs, + self, obj: Series, func: F, *args, **kwargs, ): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index c9b7943478cdd..b951cd4f0cc2a 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -1,12 +1,10 @@ """Common utilities for Numba operations""" from distutils.version import LooseVersion -import inspect import types from typing import Callable, Dict, Optional, Tuple 
import numpy as np -from pandas._typing import FrameOrSeries from pandas.compat._optional import import_optional_dependency from pandas.errors import NumbaUtilError @@ -129,94 +127,3 @@ def impl(data, *_args): return impl return numba_func - - -def split_for_numba(arg: FrameOrSeries) -> Tuple[np.ndarray, np.ndarray]: - """ - Split pandas object into its components as numpy arrays for numba functions. - - Parameters - ---------- - arg : Series or DataFrame - - Returns - ------- - (ndarray, ndarray) - values, index - """ - return arg.to_numpy(), arg.index.to_numpy() - - -def validate_udf(func: Callable) -> None: - """ - Validate user defined function for ops when using Numba. - - The first signature arguments should include: - - def f(values, index, ...): - ... - - Parameters - ---------- - func : function, default False - user defined function - - Returns - ------- - None - - Raises - ------ - NumbaUtilError - """ - udf_signature = list(inspect.signature(func).parameters.keys()) - expected_args = ["values", "index"] - min_number_args = len(expected_args) - if ( - len(udf_signature) < min_number_args - or udf_signature[:min_number_args] != expected_args - ): - raise NumbaUtilError( - f"The first {min_number_args} arguments to {func.__name__} must be " - f"{expected_args}" - ) - - -def generate_numba_func( - func: Callable, - engine_kwargs: Optional[Dict[str, bool]], - kwargs: dict, - cache_key_str: str, -) -> Tuple[Callable, Tuple[Callable, str]]: - """ - Return a JITed function and cache key for the NUMBA_FUNC_CACHE - - This _may_ be specific to groupby (as it's only used there currently). 
- - Parameters - ---------- - func : function - user defined function - engine_kwargs : dict or None - numba.jit arguments - kwargs : dict - kwargs for func - cache_key_str : str - string representing the second part of the cache key tuple - - Returns - ------- - (JITed function, cache key) - - Raises - ------ - NumbaUtilError - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - check_kwargs_and_nopython(kwargs, nopython) - validate_udf(func) - cache_key = (func, cache_key_str) - numba_func = NUMBA_FUNC_CACHE.get( - cache_key, jit_user_function(func, nopython, nogil, parallel) - ) - return numba_func, cache_key diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 690694b0e66f5..29e65e938f6f9 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -4,7 +4,7 @@ from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td -from pandas import DataFrame, option_context +from pandas import DataFrame, NamedAgg, option_context import pandas._testing as tm from pandas.core.util.numba_ import NUMBA_FUNC_CACHE @@ -128,3 +128,25 @@ def func_1(values, index): with option_context("compute.use_numba", True): result = grouped.agg(func_1, engine=None) tm.assert_frame_equal(expected, result) + + +@td.skip_if_no("numba", "0.46.0") +@pytest.mark.parametrize( + "agg_func", + [ + ["min", "max"], + "min", + {"B": ["min", "max"], "C": "sum"}, + NamedAgg(column="B", aggfunc="min"), + ], +) +def test_multifunc_notimplimented(agg_func): + data = DataFrame( + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + ) + grouped = data.groupby(0) + with pytest.raises(NotImplementedError, match="Numba engine can"): + grouped.agg(agg_func, engine="numba") + + with pytest.raises(NotImplementedError, match="Numba engine can"): + grouped[1].agg(agg_func, engine="numba") From 1bc09bd0fa162bb3b0f3ace4c4368351c6271415 Mon Sep 17 
00:00:00 2001 From: Marian Denes Date: Sat, 22 Aug 2020 18:03:20 +0200 Subject: [PATCH 0546/1025] DOC: Fixed docstring for mode() (#35624) * DOC: Fixed docstring for mode() * DOC: Changed docstring for mode() - Explanation of mode() function * Update pandas/core/series.py Co-authored-by: Marco Gorelli * DOC: Changes in docstring for mode() - Fixed 1st line of docstring - Added explanation for mode() function - Explanation for mode(): joined 2 lines into 1 * Update pandas/core/series.py Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Co-authored-by: Marco Gorelli Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- pandas/core/series.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index cd2db659fbd0e..555024ad75f5e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1800,7 +1800,9 @@ def count(self, level=None): def mode(self, dropna=True) -> "Series": """ - Return the mode(s) of the dataset. + Return the mode(s) of the Series. + + The mode is the value that appears most often. There can be multiple modes. Always returns Series even if only one value is returned. 
From 1ccd735f6fe7c6a9f00be0bf6fbb58208e734a00 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 22 Aug 2020 17:06:16 -0700 Subject: [PATCH 0547/1025] REF: move towards making _apply_blockwise actually block-wise (#35730) * REF: move towards making _apply_blockwise actually block-wise * mypy fixup * mypy fixup * Series->_constructor * dummy commit to force CI --- pandas/core/window/rolling.py | 75 ++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f516871f789d0..f7e81f41b8675 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -6,13 +6,23 @@ from functools import partial import inspect from textwrap import dedent -from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union +from typing import ( + TYPE_CHECKING, + Callable, + Dict, + List, + Optional, + Set, + Tuple, + Type, + Union, +) import numpy as np from pandas._libs.tslibs import BaseOffset, to_offset import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import ArrayLike, Axis, FrameOrSeries, Label +from pandas._typing import ArrayLike, Axis, FrameOrSeriesUnion, Label from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -55,6 +65,9 @@ ) from pandas.core.window.numba_ import generate_numba_apply_func +if TYPE_CHECKING: + from pandas import Series + def calculate_center_offset(window) -> int: """ @@ -145,7 +158,7 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): def __init__( self, - obj: FrameOrSeries, + obj: FrameOrSeriesUnion, window=None, min_periods: Optional[int] = None, center: bool = False, @@ -219,7 +232,7 @@ def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: f"get_window_bounds" ) - def _create_blocks(self, obj: FrameOrSeries): + def 
_create_blocks(self, obj: FrameOrSeriesUnion): """ Split data into blocks & return conformed data. """ @@ -381,7 +394,7 @@ def _wrap_result(self, result, block=None, obj=None): return type(obj)(result, index=index, columns=block.columns) return result - def _wrap_results(self, results, obj, skipped: List[int]) -> FrameOrSeries: + def _wrap_results(self, results, obj, skipped: List[int]) -> FrameOrSeriesUnion: """ Wrap the results. @@ -394,22 +407,23 @@ def _wrap_results(self, results, obj, skipped: List[int]) -> FrameOrSeries: """ from pandas import Series, concat + if obj.ndim == 1: + if not results: + raise DataError("No numeric types to aggregate") + assert len(results) == 1 + return Series(results[0], index=obj.index, name=obj.name) + exclude: List[Label] = [] - if obj.ndim == 2: - orig_blocks = list(obj._to_dict_of_blocks(copy=False).values()) - for i in skipped: - exclude.extend(orig_blocks[i].columns) - else: - orig_blocks = [obj] + orig_blocks = list(obj._to_dict_of_blocks(copy=False).values()) + for i in skipped: + exclude.extend(orig_blocks[i].columns) kept_blocks = [blk for i, blk in enumerate(orig_blocks) if i not in skipped] final = [] for result, block in zip(results, kept_blocks): - result = self._wrap_result(result, block=block, obj=obj) - if result.ndim == 1: - return result + result = type(obj)(result, index=obj.index, columns=block.columns) final.append(result) exclude = exclude or [] @@ -488,13 +502,31 @@ def _get_window_indexer(self, window: int) -> BaseIndexer: return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) return FixedWindowIndexer(window_size=window) + def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series": + """ + Series version of _apply_blockwise + """ + _, obj = self._create_blocks(self._selected_obj) + values = obj.values + + try: + values = self._prep_values(obj.values) + except (TypeError, NotImplementedError) as err: + raise DataError("No numeric types to aggregate") from err + + 
result = homogeneous_func(values) + return obj._constructor(result, index=obj.index, name=obj.name) + def _apply_blockwise( self, homogeneous_func: Callable[..., ArrayLike] - ) -> FrameOrSeries: + ) -> FrameOrSeriesUnion: """ Apply the given function to the DataFrame broken down into homogeneous sub-frames. """ + if self._selected_obj.ndim == 1: + return self._apply_series(homogeneous_func) + # This isn't quite blockwise, since `blocks` is actually a collection # of homogenenous DataFrames. blocks, obj = self._create_blocks(self._selected_obj) @@ -505,12 +537,9 @@ def _apply_blockwise( try: values = self._prep_values(b.values) - except (TypeError, NotImplementedError) as err: - if isinstance(obj, ABCDataFrame): - skipped.append(i) - continue - else: - raise DataError("No numeric types to aggregate") from err + except (TypeError, NotImplementedError): + skipped.append(i) + continue result = homogeneous_func(values) results.append(result) @@ -2234,7 +2263,7 @@ def _apply( def _constructor(self): return Rolling - def _create_blocks(self, obj: FrameOrSeries): + def _create_blocks(self, obj: FrameOrSeriesUnion): """ Split data into blocks & return conformed data. 
""" @@ -2275,7 +2304,7 @@ def _get_window_indexer(self, window: int) -> GroupbyRollingIndexer: if isinstance(self.window, BaseIndexer): rolling_indexer = type(self.window) indexer_kwargs = self.window.__dict__ - assert isinstance(indexer_kwargs, dict) + assert isinstance(indexer_kwargs, dict) # for mypy # We'll be using the index of each group later indexer_kwargs.pop("index_array", None) elif self.is_freq_type: From 3f86b680c371d4fa0ec29115692ecafc8231f383 Mon Sep 17 00:00:00 2001 From: 21CSM <69096687+21CSM@users.noreply.github.com> Date: Sun, 23 Aug 2020 03:40:29 -0400 Subject: [PATCH 0548/1025] DOC: Fix code of conduct link (#35857) Fixes #35855 --- web/pandas/about/team.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/pandas/about/team.md b/web/pandas/about/team.md index 8eb2edebec817..39f63202e1986 100644 --- a/web/pandas/about/team.md +++ b/web/pandas/about/team.md @@ -2,7 +2,7 @@ ## Contributors -_pandas_ is made with love by more than [1,500 volunteer contributors](https://github.com/pandas-dev/pandas/graphs/contributors). +_pandas_ is made with love by more than [2,000 volunteer contributors](https://github.com/pandas-dev/pandas/graphs/contributors). If you want to support pandas development, you can find information in the [donations page](../donate.html). @@ -42,7 +42,7 @@ If you want to support pandas development, you can find information in the [dona > or anyone willing to increase the diversity of our team. > We have identified visible gaps and obstacles in sustaining diversity and inclusion in the open-source communities and we are proactive in increasing > the diversity of our team. -> We have a [code of conduct]({base_url}/community/coc.html) to ensure a friendly and welcoming environment. +> We have a [code of conduct](../community/coc.html) to ensure a friendly and welcoming environment. 
> Please send an email to [pandas-code-of-conduct-committee](mailto:pandas-coc@googlegroups.com), if you think we can do a > better job at achieving this goal. From 1c8b2fe7dbf062c7bb4516c0d605aea36c43657c Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 24 Aug 2020 01:50:11 -0500 Subject: [PATCH 0549/1025] DOC: Mention NA for missing data in README (#35638) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a72e8402e68a0..a2f2f1c04442a 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ its way towards this goal. Here are just a few of the things that pandas does well: - Easy handling of [**missing data**][missing-data] (represented as - `NaN`) in floating point as well as non-floating point data + `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data - Size mutability: columns can be [**inserted and deleted**][insertion-deletion] from DataFrame and higher dimensional objects From 4b5b8aecf1ceda270e2babeebb1ef009be8a6bf2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 24 Aug 2020 07:24:04 -0700 Subject: [PATCH 0550/1025] CLN/PERF: delay evaluation of get_day_of_month (#35866) --- pandas/_libs/tslibs/offsets.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 7f0314d737619..161e5f4e54f51 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3705,7 +3705,7 @@ cdef inline void _shift_months(const int64_t[:] dtindex, """See shift_months.__doc__""" cdef: Py_ssize_t i - int months_to_roll, compare_day + int months_to_roll npy_datetimestruct dts for i in range(count): @@ -3715,10 +3715,8 @@ cdef inline void _shift_months(const int64_t[:] dtindex, dt64_to_dtstruct(dtindex[i], &dts) months_to_roll = months - compare_day = get_day_of_month(&dts, day_opt) - months_to_roll = roll_convention(dts.day, 
months_to_roll, - compare_day) + months_to_roll = _roll_qtrday(&dts, months_to_roll, 0, day_opt) dts.year = year_add_months(dts, months_to_roll) dts.month = month_add_months(dts, months_to_roll) From 64b42b703d924db71a2f24ee185c44c148006920 Mon Sep 17 00:00:00 2001 From: guru kiran <47276342+gurukiran07@users.noreply.github.com> Date: Mon, 24 Aug 2020 21:06:17 +0530 Subject: [PATCH 0551/1025] DOC: Updated aggregate docstring (#35042) * DOC: Updated aggregate docstring * Doc: updated aggregate docstring * Update pandas/core/generic.py Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> * Update generic.py * Update generic.py * Revert "Update generic.py" This reverts commit 15ecaf724e98c5bcb2d459e720ca60e22e758346. * Revert "Revert "Update generic.py"" This reverts commit cc231c80a7a0bf9a18f3d4342e156d9d95c5ac8b. * Updated docstring of agg * Trailing whitespace removed * DOC: Updated docstring of agg * Update generic.py * Updated Docstring Co-authored-by: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> --- pandas/core/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fe412bc0ce937..9f36405bf6428 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5169,6 +5169,9 @@ def pipe(self, func, *args, **kwargs): ----- `agg` is an alias for `aggregate`. Use the alias. + Like most operations in pandas, `agg` ignores missing values + and computes its result using only the values that are present. + A passed user-defined-function will be passed a Series for evaluation. 
{examples}""" ) From 3c0223631001615335ccde993f59bb4f43688865 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 24 Aug 2020 23:55:43 +0100 Subject: [PATCH 0552/1025] DEPR: deprecate dtype param in Index.copy (#35853) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/indexes/base.py | 9 +++++++++ pandas/core/indexes/multi.py | 19 ++++++++++++++----- pandas/core/indexes/range.py | 11 +++++++++-- pandas/tests/indexes/common.py | 7 ++++++- pandas/tests/indexes/test_base.py | 5 ----- pandas/tests/indexes/test_common.py | 8 ++------ pandas/tests/indexes/test_numeric.py | 4 ++-- 8 files changed, 43 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 09a5bcb0917c2..adc1806523d6e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -143,7 +143,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) -- +- Deprecated parameter ``dtype`` of method :meth:`~Index.copy` on all index classes. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 623ce68201492..ceb109fdf6d7a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -800,6 +800,9 @@ def copy(self, name=None, deep=False, dtype=None, names=None): deep : bool, default False dtype : numpy dtype or pandas type, optional Set dtype for new object. + + .. deprecated:: 1.2.0 + use ``astype`` method instead. names : list-like, optional Kept for compatibility with MultiIndex. Should not be used. 
@@ -820,6 +823,12 @@ def copy(self, name=None, deep=False, dtype=None, names=None): new_index = self._shallow_copy(name=name) if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. Use the astype method instead.", + FutureWarning, + stacklevel=2, + ) new_index = new_index.astype(dtype) return new_index diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ffbd03d0c3ba7..b29c27982f087 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1030,7 +1030,6 @@ def _shallow_copy( name=lib.no_default, levels=None, codes=None, - dtype=None, sortorder=None, names=lib.no_default, _set_identity: bool = True, @@ -1041,7 +1040,7 @@ def _shallow_copy( names = name if name is not lib.no_default else self.names if values is not None: - assert levels is None and codes is None and dtype is None + assert levels is None and codes is None return MultiIndex.from_tuples(values, sortorder=sortorder, names=names) levels = levels if levels is not None else self.levels @@ -1050,7 +1049,6 @@ def _shallow_copy( result = MultiIndex( levels=levels, codes=codes, - dtype=dtype, sortorder=sortorder, names=names, verify_integrity=False, @@ -1092,6 +1090,8 @@ def copy( ---------- names : sequence, optional dtype : numpy dtype or pandas type, optional + + .. deprecated:: 1.2.0 levels : sequence, optional codes : sequence, optional deep : bool, default False @@ -1117,15 +1117,24 @@ def copy( if codes is None: codes = deepcopy(self.codes) - return self._shallow_copy( + new_index = self._shallow_copy( levels=levels, codes=codes, names=names, - dtype=dtype, sortorder=self.sortorder, _set_identity=_set_identity, ) + if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. 
Use the astype method instead.", + FutureWarning, + stacklevel=2, + ) + new_index = new_index.astype(dtype) + return new_index + def __array__(self, dtype=None) -> np.ndarray: """ the array interface, return my values """ return self.values diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index c65c3d5ff3d9c..c5572a9de7fa5 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -390,10 +390,17 @@ def _shallow_copy(self, values=None, name: Label = no_default): @doc(Int64Index.copy) def copy(self, name=None, deep=False, dtype=None, names=None): - self._validate_dtype(dtype) - name = self._validate_names(name=name, names=names, deep=deep)[0] new_index = self._shallow_copy(name=name) + + if dtype: + warnings.warn( + "parameter dtype is deprecated and will be removed in a future " + "version. Use the astype method instead.", + FutureWarning, + stacklevel=2, + ) + new_index = new_index.astype(dtype) return new_index def _minmax(self, meth: str): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 98f7c0eadb4bb..e4d0b46f7c716 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -270,7 +270,7 @@ def test_copy_name(self, index): s3 = s1 * s2 assert s3.index.name == "mario" - def test_name2(self, index): + def test_copy_name2(self, index): # gh-35592 if isinstance(index, MultiIndex): return @@ -284,6 +284,11 @@ def test_name2(self, index): with pytest.raises(TypeError, match=msg): index.copy(name=[["mario"]]) + def test_copy_dtype_deprecated(self, index): + # GH35853 + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + index.copy(dtype=object) + def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured # GH12309 diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 70eb9e502f78a..aee4b16621b4d 100644 --- a/pandas/tests/indexes/test_base.py +++ 
b/pandas/tests/indexes/test_base.py @@ -62,11 +62,6 @@ def test_new_axis(self, index): assert new_index.ndim == 2 assert isinstance(new_index, np.ndarray) - @pytest.mark.parametrize("index", ["int", "uint", "float"], indirect=True) - def test_copy_and_deepcopy(self, index): - new_copy2 = index.copy(dtype=int) - assert new_copy2.dtype.kind == "i" - def test_constructor_regular(self, index): tm.assert_contains_all(index, index) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 02a173eb4958d..db260b71e7186 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -374,8 +374,7 @@ def test_has_duplicates(self, index): "dtype", ["int64", "uint64", "float64", "category", "datetime64[ns]", "timedelta64[ns]"], ) - @pytest.mark.parametrize("copy", [True, False]) - def test_astype_preserves_name(self, index, dtype, copy): + def test_astype_preserves_name(self, index, dtype): # https://github.com/pandas-dev/pandas/issues/32013 if isinstance(index, MultiIndex): index.names = ["idx" + str(i) for i in range(index.nlevels)] @@ -384,10 +383,7 @@ def test_astype_preserves_name(self, index, dtype, copy): try: # Some of these conversions cannot succeed so we use a try / except - if copy: - result = index.copy(dtype=dtype) - else: - result = index.astype(dtype) + result = index.astype(dtype) except (ValueError, TypeError, NotImplementedError, SystemError): return diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index bfcac5d433d2c..e6f455e60eee3 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -394,7 +394,7 @@ def test_identical(self): same_values_different_type = Index(i, dtype=object) assert not i.identical(same_values_different_type) - i = index.copy(dtype=object) + i = index.astype(dtype=object) i = i.rename("foo") same_values = Index(i, dtype=object) assert same_values.identical(i) @@ -402,7 +402,7 @@ def 
test_identical(self): assert not i.identical(index) assert Index(same_values, name="foo", dtype=object).identical(i) - assert not index.copy(dtype=object).identical(index.copy(dtype=self._dtype)) + assert not index.astype(dtype=object).identical(index.astype(dtype=self._dtype)) def test_union_noncomparable(self): # corner case, non-Int64Index From 9ac7c15ef0aa2f7af7f5ebfa03d7e295f4da1025 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 24 Aug 2020 16:32:53 -0700 Subject: [PATCH 0553/1025] REF: use Block.apply in cython_agg_blocks (#35854) --- pandas/core/groupby/generic.py | 41 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0edbfe3d67ca5..1198baab12ac1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1069,16 +1069,17 @@ def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: # reshape to be valid for non-Extension Block result = result.reshape(1, -1) + elif isinstance(result, np.ndarray) and result.ndim == 1: + # We went through a SeriesGroupByPath and need to reshape + result = result.reshape(1, -1) + return result - def blk_func(block: "Block") -> List["Block"]: - new_blocks: List["Block"] = [] + def blk_func(bvalues: ArrayLike) -> ArrayLike: - result = no_result - locs = block.mgr_locs.as_array try: result, _ = self.grouper.aggregate( - block.values, how, axis=1, min_count=min_count + bvalues, how, axis=1, min_count=min_count ) except NotImplementedError: # generally if we have numeric_only=False @@ -1091,12 +1092,17 @@ def blk_func(block: "Block") -> List["Block"]: assert how == "ohlc" raise + obj: Union[Series, DataFrame] # call our grouper again with only this block - obj = self.obj[data.items[locs]] - if obj.shape[1] == 1: - # Avoid call to self.values that can occur in DataFrame - # reductions; see GH#28949 - obj = obj.iloc[:, 0] + if isinstance(bvalues, ExtensionArray): + # 
TODO(EA2D): special case not needed with 2D EAs + obj = Series(bvalues) + else: + obj = DataFrame(bvalues.T) + if obj.shape[1] == 1: + # Avoid call to self.values that can occur in DataFrame + # reductions; see GH#28949 + obj = obj.iloc[:, 0] # Create SeriesGroupBy with observed=True so that it does # not try to add missing categories if grouping over multiple @@ -1114,21 +1120,14 @@ def blk_func(block: "Block") -> List["Block"]: # unwrap DataFrame to get array result = result._mgr.blocks[0].values - if isinstance(result, np.ndarray) and result.ndim == 1: - result = result.reshape(1, -1) - res_values = cast_agg_result(result, block.values, how) - agg_block = block.make_block(res_values) - new_blocks = [agg_block] - else: - res_values = cast_agg_result(result, block.values, how) - agg_block = block.make_block(res_values) - new_blocks = [agg_block] - return new_blocks + + res_values = cast_agg_result(result, bvalues, how) + return res_values skipped: List[int] = [] for i, block in enumerate(data.blocks): try: - nbs = blk_func(block) + nbs = block.apply(blk_func) except (NotImplementedError, TypeError): # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block From 35d253bdf3bb63a1be0404a4c8c001ea7ba97cbb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 24 Aug 2020 16:35:40 -0700 Subject: [PATCH 0554/1025] REF: implement Block.reduce for DataFrame._reduce (#35867) --- pandas/core/frame.py | 10 ++++------ pandas/core/internals/blocks.py | 15 +++++++++++++++ pandas/core/internals/managers.py | 29 ++++++++--------------------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 837bd35414773..606bd4cc3b52d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8647,13 +8647,11 @@ def blk_func(values): return op(values, axis=1, skipna=skipna, **kwds) # After possibly _get_data and transposing, we are now in the - # simple case where we can use 
BlockManager._reduce + # simple case where we can use BlockManager.reduce res = df._mgr.reduce(blk_func) - assert isinstance(res, dict) - if len(res): - assert len(res) == max(list(res.keys())) + 1, res.keys() - out = df._constructor_sliced(res, index=range(len(res)), dtype=out_dtype) - out.index = df.columns + out = df._constructor(res,).iloc[0].rename(None) + if out_dtype is not None: + out = out.astype(out_dtype) if axis == 0 and is_object_dtype(out.dtype): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f3286b3c20965..c62be4f767f00 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -346,6 +346,21 @@ def apply(self, func, **kwargs) -> List["Block"]: return self._split_op_result(result) + def reduce(self, func) -> List["Block"]: + # We will apply the function and reshape the result into a single-row + # Block with the same mgr_locs; squeezing will be done at a higher level + assert self.ndim == 2 + + result = func(self.values) + if np.ndim(result) == 0: + # TODO(EA2D): special case not needed with 2D EAs + res_values = np.array([[result]]) + else: + res_values = result.reshape(-1, 1) + + nb = self.make_block(res_values) + return [nb] + def _split_op_result(self, result) -> List["Block"]: # See also: split_and_operate if is_extension_array_dtype(result) and result.ndim > 1: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f05d4cf1c4be6..297ad3077ef1d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -330,31 +330,18 @@ def _verify_integrity(self) -> None: f"tot_items: {tot_items}" ) - def reduce(self, func): + def reduce(self: T, func) -> T: # If 2D, we assume that we're operating column-wise - if self.ndim == 1: - # we'll be returning a scalar - blk = self.blocks[0] - return func(blk.values) + assert self.ndim == 2 - res = {} + res_blocks = [] for blk in 
self.blocks: - bres = func(blk.values) - - if np.ndim(bres) == 0: - # EA - assert blk.shape[0] == 1 - new_res = zip(blk.mgr_locs.as_array, [bres]) - else: - assert bres.ndim == 1, bres.shape - assert blk.shape[0] == len(bres), (blk.shape, bres.shape) - new_res = zip(blk.mgr_locs.as_array, bres) - - nr = dict(new_res) - assert not any(key in res for key in nr) - res.update(nr) + nbs = blk.reduce(func) + res_blocks.extend(nbs) - return res + index = Index([0]) # placeholder + new_mgr = BlockManager.from_blocks(res_blocks, [self.items, index]) + return new_mgr def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager": """ From 530ddc220fa36d5f58cedaab26c85e3b9dc61eb5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 24 Aug 2020 16:36:49 -0700 Subject: [PATCH 0555/1025] REF: make window _apply_blockwise actually blockwise (#35861) --- pandas/core/window/rolling.py | 71 +++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f7e81f41b8675..a70247d9f7f9c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -66,7 +66,8 @@ from pandas.core.window.numba_ import generate_numba_apply_func if TYPE_CHECKING: - from pandas import Series + from pandas import DataFrame, Series + from pandas.core.internals import Block # noqa:F401 def calculate_center_offset(window) -> int: @@ -418,35 +419,40 @@ def _wrap_results(self, results, obj, skipped: List[int]) -> FrameOrSeriesUnion: for i in skipped: exclude.extend(orig_blocks[i].columns) - kept_blocks = [blk for i, blk in enumerate(orig_blocks) if i not in skipped] - - final = [] - for result, block in zip(results, kept_blocks): - - result = type(obj)(result, index=obj.index, columns=block.columns) - final.append(result) - - exclude = exclude or [] columns = [c for c in self._selected_obj.columns if c not in exclude] - if not columns and not len(final) and exclude: + if not 
columns and not len(results) and exclude: raise DataError("No numeric types to aggregate") - elif not len(final): + elif not len(results): return obj.astype("float64") - df = concat(final, axis=1).reindex(columns=columns, copy=False) + df = concat(results, axis=1).reindex(columns=columns, copy=False) + self._insert_on_column(df, obj) + return df + def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"): # if we have an 'on' column we want to put it back into # the results in the same location + from pandas import Series + if self.on is not None and not self._on.equals(obj.index): name = self._on.name extra_col = Series(self._on, index=obj.index, name=name) - if name not in df.columns and name not in df.index.names: - new_loc = len(df.columns) - df.insert(new_loc, name, extra_col) - elif name in df.columns: + if name in result.columns: # TODO: sure we want to overwrite results? - df[name] = extra_col - return df + result[name] = extra_col + elif name in result.index.names: + pass + elif name in self._selected_obj.columns: + # insert in the same location as we had in _selected_obj + old_cols = self._selected_obj.columns + new_cols = result.columns + old_loc = old_cols.get_loc(name) + overlap = new_cols.intersection(old_cols[:old_loc]) + new_loc = len(overlap) + result.insert(new_loc, name, extra_col) + else: + # insert at the end + result[name] = extra_col def _center_window(self, result, window) -> np.ndarray: """ @@ -530,21 +536,36 @@ def _apply_blockwise( # This isn't quite blockwise, since `blocks` is actually a collection # of homogenenous DataFrames. 
blocks, obj = self._create_blocks(self._selected_obj) + mgr = obj._mgr + + def hfunc(bvalues: ArrayLike) -> ArrayLike: + # TODO(EA2D): getattr unnecessary with 2D EAs + values = self._prep_values(getattr(bvalues, "T", bvalues)) + res_values = homogeneous_func(values) + return getattr(res_values, "T", res_values) skipped: List[int] = [] - results: List[ArrayLike] = [] - for i, b in enumerate(blocks): + res_blocks: List["Block"] = [] + for i, blk in enumerate(mgr.blocks): try: - values = self._prep_values(b.values) + nbs = blk.apply(hfunc) except (TypeError, NotImplementedError): skipped.append(i) continue - result = homogeneous_func(values) - results.append(result) + res_blocks.extend(nbs) + + if not len(res_blocks) and skipped: + raise DataError("No numeric types to aggregate") + elif not len(res_blocks): + return obj.astype("float64") - return self._wrap_results(results, obj, skipped) + new_cols = mgr.reset_dropped_locs(res_blocks, skipped) + new_mgr = type(mgr).from_blocks(res_blocks, [new_cols, obj.index]) + out = obj._constructor(new_mgr) + self._insert_on_column(out, obj) + return out def _apply( self, From d60e8bd8d238aace0d6d9ea560ffc6324dc9461f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 24 Aug 2020 16:42:01 -0700 Subject: [PATCH 0556/1025] COMPAT: Ensure rolling indexers return intp during take operations (#35875) --- pandas/core/window/indexers.py | 16 +++++++++------- pandas/tests/resample/test_resampler_grouper.py | 3 +-- pandas/tests/window/test_api.py | 5 ++--- pandas/tests/window/test_apply.py | 3 +-- pandas/tests/window/test_grouper.py | 13 +------------ pandas/tests/window/test_rolling.py | 4 +--- pandas/tests/window/test_timeseries_window.py | 3 --- 7 files changed, 15 insertions(+), 32 deletions(-) diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 7cbe34cdebf9f..7c76a8e2a0b22 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -7,6 +7,8 @@ from 
pandas._libs.window.indexers import calculate_variable_window_bounds from pandas.util._decorators import Appender +from pandas.core.dtypes.common import ensure_platform_int + from pandas.tseries.offsets import Nano get_window_bounds_doc = """ @@ -296,9 +298,9 @@ def get_window_bounds( start_arrays = [] end_arrays = [] window_indicies_start = 0 - for key, indicies in self.groupby_indicies.items(): + for key, indices in self.groupby_indicies.items(): if self.index_array is not None: - index_array = self.index_array.take(indicies) + index_array = self.index_array.take(ensure_platform_int(indices)) else: index_array = self.index_array indexer = self.rolling_indexer( @@ -307,22 +309,22 @@ def get_window_bounds( **self.indexer_kwargs, ) start, end = indexer.get_window_bounds( - len(indicies), min_periods, center, closed + len(indices), min_periods, center, closed ) start = start.astype(np.int64) end = end.astype(np.int64) # Cannot use groupby_indicies as they might not be monotonic with the object # we're rolling over window_indicies = np.arange( - window_indicies_start, window_indicies_start + len(indicies), + window_indicies_start, window_indicies_start + len(indices), ) - window_indicies_start += len(indicies) + window_indicies_start += len(indices) # Extend as we'll be slicing window like [start, end) window_indicies = np.append( window_indicies, [window_indicies[-1] + 1] ).astype(np.int64) - start_arrays.append(window_indicies.take(start)) - end_arrays.append(window_indicies.take(end)) + start_arrays.append(window_indicies.take(ensure_platform_int(start))) + end_arrays.append(window_indicies.take(ensure_platform_int(end))) start = np.concatenate(start_arrays) end = np.concatenate(end_arrays) # GH 35552: Need to adjust start and end based on the nans appended to values diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index f18aaa5e86829..73bf7dafac254 100644 --- 
a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -7,7 +7,7 @@ from pandas.util._test_decorators import async_mark import pandas as pd -from pandas import DataFrame, Series, Timestamp, compat +from pandas import DataFrame, Series, Timestamp import pandas._testing as tm from pandas.core.indexes.datetimes import date_range @@ -319,7 +319,6 @@ def test_resample_groupby_with_label(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(not compat.IS64, reason="GH-35148") def test_consistency_with_window(): # consistent return values with window diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 28e27791cad35..2c3d8b4608806 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -6,7 +6,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Index, Series, Timestamp, compat, concat +from pandas import DataFrame, Index, Series, Timestamp, concat import pandas._testing as tm from pandas.core.base import SpecificationError @@ -277,7 +277,7 @@ def test_preserve_metadata(): @pytest.mark.parametrize( "func,window_size,expected_vals", [ - pytest.param( + ( "rolling", 2, [ @@ -289,7 +289,6 @@ def test_preserve_metadata(): [35.0, 40.0, 60.0, 40.0], [60.0, 80.0, 85.0, 80], ], - marks=pytest.mark.xfail(not compat.IS64, reason="GH-35294"), ), ( "expanding", diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index 2aaf6af103e98..bc38634da8941 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -4,7 +4,7 @@ from pandas.errors import NumbaUtilError import pandas.util._test_decorators as td -from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, compat, date_range +from pandas import DataFrame, Index, MultiIndex, Series, Timestamp, date_range import pandas._testing as tm @@ -142,7 +142,6 @@ def test_invalid_kwargs_nopython(): 
@pytest.mark.parametrize("args_kwargs", [[None, {"par": 10}], [(10,), None]]) -@pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_apply_args_kwargs(args_kwargs): # GH 33433 def foo(x, par): diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index d0a62374d0888..170bf100b3891 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -2,7 +2,7 @@ import pytest import pandas as pd -from pandas import DataFrame, Series, compat +from pandas import DataFrame, Series import pandas._testing as tm from pandas.core.groupby.groupby import get_groupby @@ -23,7 +23,6 @@ def test_mutated(self): g = get_groupby(self.frame, by="A", mutated=True) assert g.mutated - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_getitem(self): g = self.frame.groupby("A") g_mutated = get_groupby(self.frame, by="A", mutated=True) @@ -56,7 +55,6 @@ def test_getitem_multiple(self): result = r.B.count() tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling(self): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -74,7 +72,6 @@ def test_rolling(self): @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] ) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_quantile(self, interpolation): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -105,7 +102,6 @@ def func(x): expected = g.apply(func) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_apply(self, raw): g = self.frame.groupby("A") r = g.rolling(window=4) @@ -115,7 +111,6 @@ def test_rolling_apply(self, raw): expected = g.apply(lambda x: x.rolling(4).apply(lambda y: y.sum(), raw=raw)) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_apply_mutability(self): # GH 14013 df = 
pd.DataFrame({"A": ["foo"] * 3 + ["bar"] * 3, "B": [1] * 6}) @@ -197,7 +192,6 @@ def test_expanding_apply(self, raw): tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("expected_value,raw_value", [[1.0, True], [0.0, False]]) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling(self, expected_value, raw_value): # GH 31754 @@ -215,7 +209,6 @@ def foo(x): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling_center_center(self): # GH 35552 series = Series(range(1, 6)) @@ -281,7 +274,6 @@ def test_groupby_rolling_center_center(self): ) tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_subselect_rolling(self): # GH 35486 df = DataFrame( @@ -307,7 +299,6 @@ def test_groupby_subselect_rolling(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling_custom_indexer(self): # GH 35557 class SimpleIndexer(pd.api.indexers.BaseIndexer): @@ -331,7 +322,6 @@ def get_window_bounds( expected = df.groupby(df.index).rolling(window=3, min_periods=1).sum() tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = pd.DataFrame( @@ -356,7 +346,6 @@ def test_groupby_rolling_subset_with_closed(self): ) tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = pd.DataFrame( diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index bea239a245a4f..8d72e2cb92ca9 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -7,7 +7,7 @@ import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Series, compat, date_range +from 
pandas import DataFrame, Series, date_range import pandas._testing as tm from pandas.core.window import Rolling @@ -150,7 +150,6 @@ def test_closed_one_entry(func): @pytest.mark.parametrize("func", ["min", "max"]) -@pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_closed_one_entry_groupby(func): # GH24718 ser = pd.DataFrame( @@ -683,7 +682,6 @@ def test_iter_rolling_datetime(expected, expected_index, window): ), ], ) -@pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_rolling_positional_argument(grouping, _index, raw): # GH 34605 diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 90f919d5565b0..8aa4d7103e48a 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -7,7 +7,6 @@ MultiIndex, Series, Timestamp, - compat, date_range, to_datetime, ) @@ -657,7 +656,6 @@ def agg_by_day(x): tm.assert_frame_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_groupby_monotonic(self): # GH 15130 @@ -687,7 +685,6 @@ def test_groupby_monotonic(self): result = df.groupby("name").rolling("180D", on="date")["amount"].sum() tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not compat.IS64, reason="GH-35294") def test_non_monotonic(self): # GH 13966 (similar to #15130, closed by #15175) From 94e84427df86b419abeed82692f192c101ab7c53 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 24 Aug 2020 16:45:45 -0700 Subject: [PATCH 0557/1025] REGR: DatetimeIndex.intersection incorrectly raising AssertionError (#35877) --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/core/indexes/datetimelike.py | 4 ++-- pandas/tests/indexes/datetimes/test_setops.py | 7 +++++++ 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index c1b73c60be92b..af61354470a71 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ 
b/doc/source/whatsnew/v1.1.2.rst @@ -14,7 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - +- Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - - diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 6d9d75a69e91d..9d00f50a65a06 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -704,16 +704,16 @@ def intersection(self, other, sort=False): if result.freq is None: # TODO: no tests rely on this; needed? result = result._with_freq("infer") - assert result.name == res_name + result.name = res_name return result elif not self._can_fast_intersect(other): result = Index.intersection(self, other, sort=sort) - assert result.name == res_name # We need to invalidate the freq because Index.intersection # uses _shallow_copy on a view of self._data, which will preserve # self.freq if we're not careful. 
result = result._with_freq(None)._with_freq("infer") + result.name = res_name return result # to make our life easier, "sort" the two ranges diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index 6670b079ddd29..f19e78323ab23 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -470,6 +470,13 @@ def test_intersection_bug(self): tm.assert_index_equal(result, b) assert result.freq == b.freq + def test_intersection_list(self): + # GH#35876 + values = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")] + idx = pd.DatetimeIndex(values, name="a") + res = idx.intersection(values) + tm.assert_index_equal(res, idx) + def test_month_range_union_tz_pytz(self, sort): from pytz import timezone From 1920456b8da5e021d53f85c5f0c8d76b4d5d2d77 Mon Sep 17 00:00:00 2001 From: Honfung Wong <21543236+onshek@users.noreply.github.com> Date: Tue, 25 Aug 2020 13:37:03 +0800 Subject: [PATCH 0558/1025] DOC: fix documentation for pandas.Series.transform (#35885) --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9f36405bf6428..286da6e1de9d5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10703,7 +10703,7 @@ def transform(self, func, *args, **kwargs): - function - string function name - - list of functions and/or function names, e.g. ``[np.exp. 'sqrt']`` + - list of functions and/or function names, e.g. ``[np.exp, 'sqrt']`` - dict of axis labels -> functions, function names or list of such. 
{axis} *args From 7c1cd534988f179e2858d3d10488ec701b10f132 Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Tue, 25 Aug 2020 07:41:48 +0100 Subject: [PATCH 0559/1025] TST: Fix test_parquet failures for pyarrow 1.0 (#35814) --- pandas/tests/io/test_parquet.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 3a3ba99484a3a..4e0c16c71a6a8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -565,15 +565,22 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa): @pytest.mark.parametrize("partition_col", [["A"], []]) def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): # GH #26388 - # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716 - # As per pyarrow partitioned columns become 'categorical' dtypes - # and are added to back of dataframe on read - if partition_col and pd.compat.is_platform_windows(): - pytest.skip("pyarrow/win incompatibility #35791") - expected_df = df_compat.copy() - if partition_col: - expected_df[partition_col] = expected_df[partition_col].astype("category") + + # GH #35791 + # read_table uses the new Arrow Datasets API since pyarrow 1.0.0 + # Previous behaviour was pyarrow partitioned columns become 'category' dtypes + # These are added to back of dataframe on read. In new API category dtype is + # only used if partition field is string. 
+ legacy_read_table = LooseVersion(pyarrow.__version__) < LooseVersion("1.0.0") + if partition_col and legacy_read_table: + partition_col_type = "category" + else: + partition_col_type = "int32" + + expected_df[partition_col] = expected_df[partition_col].astype( + partition_col_type + ) check_round_trip( df_compat, From 492e6ef3100313739acf9b8de4c8241714f17267 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Aug 2020 05:58:43 -0700 Subject: [PATCH 0560/1025] BUG: to_dict_of_blocks failing to invalidate item_cache (#35874) --- pandas/core/internals/managers.py | 5 ----- pandas/tests/frame/test_block_internals.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 297ad3077ef1d..da1f10f924ae7 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -896,12 +896,7 @@ def to_dict(self, copy: bool = True): Returns ------- values : a dict of dtype -> BlockManager - - Notes - ----- - This consolidates based on str(dtype) """ - self._consolidate_inplace() bd: Dict[str, List[Block]] = {} for b in self.blocks: diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index c9fec3215d57f..8ecd9066ceff0 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -626,3 +626,21 @@ def test_add_column_with_pandas_array(self): assert type(df["c"]._mgr.blocks[0]) == ObjectBlock assert type(df2["c"]._mgr.blocks[0]) == ObjectBlock tm.assert_frame_equal(df, df2) + + +def test_to_dict_of_blocks_item_cache(): + # Calling to_dict_of_blocks should not poison item_cache + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]}) + df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object)) + mgr = df._mgr + assert len(mgr.blocks) == 3 # i.e. 
not consolidated + + ser = df["b"] # populations item_cache["b"] + + df._to_dict_of_blocks() + + # Check that the to_dict_of_blocks didnt break link between ser and df + ser.values[0] = "foo" + assert df.loc[0, "b"] == "foo" + + assert df["b"] is ser From 61e8e02a2f0b26ca73b91e6e8f248d5bc17d94e8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 25 Aug 2020 05:59:41 -0700 Subject: [PATCH 0561/1025] REF: reuse _combine instead of reset_dropped_locs (#35884) --- pandas/core/groupby/generic.py | 17 ++++++---------- pandas/core/internals/managers.py | 32 ------------------------------- pandas/core/window/rolling.py | 3 +-- 3 files changed, 7 insertions(+), 45 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1198baab12ac1..70a8379de64e9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -21,7 +21,6 @@ Mapping, Optional, Sequence, - Tuple, Type, Union, ) @@ -1025,16 +1024,14 @@ def _iterate_slices(self) -> Iterable[Series]: def _cython_agg_general( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 ) -> DataFrame: - agg_blocks, agg_items = self._cython_agg_blocks( + agg_mgr = self._cython_agg_blocks( how, alt=alt, numeric_only=numeric_only, min_count=min_count ) - return self._wrap_agged_blocks(agg_blocks, items=agg_items) + return self._wrap_agged_blocks(agg_mgr.blocks, items=agg_mgr.items) def _cython_agg_blocks( self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 - ) -> "Tuple[List[Block], Index]": - # TODO: the actual managing of mgr_locs is a PITA - # here, it should happen via BlockManager.combine + ) -> BlockManager: data: BlockManager = self._get_data_to_aggregate() @@ -1124,7 +1121,6 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: res_values = cast_agg_result(result, bvalues, how) return res_values - skipped: List[int] = [] for i, block in enumerate(data.blocks): try: nbs = block.apply(blk_func) @@ -1132,7 +1128,7 @@ def 
blk_func(bvalues: ArrayLike) -> ArrayLike: # TypeError -> we may have an exception in trying to aggregate # continue and exclude the block # NotImplementedError -> "ohlc" with wrong dtype - skipped.append(i) + pass else: agg_blocks.extend(nbs) @@ -1141,9 +1137,8 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: # reset the locs in the blocks to correspond to our # current ordering - agg_items = data.reset_dropped_locs(agg_blocks, skipped) - - return agg_blocks, agg_items + new_mgr = data._combine(agg_blocks) + return new_mgr def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if self.grouper.nkeys != 1: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index da1f10f924ae7..a5372b14d210f 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1486,38 +1486,6 @@ def unstack(self, unstacker, fill_value) -> "BlockManager": bm = BlockManager(new_blocks, [new_columns, new_index]) return bm - def reset_dropped_locs(self, blocks: List[Block], skipped: List[int]) -> Index: - """ - Decrement the mgr_locs of the given blocks with `skipped` removed. - - Notes - ----- - Alters each block's mgr_locs inplace. 
- """ - ncols = len(self) - - new_locs = [blk.mgr_locs.as_array for blk in blocks] - indexer = np.concatenate(new_locs) - - new_items = self.items.take(np.sort(indexer)) - - if skipped: - # we need to adjust the indexer to account for the - # items we have removed - deleted_items = [self.blocks[i].mgr_locs.as_array for i in skipped] - deleted = np.concatenate(deleted_items) - ai = np.arange(ncols) - mask = np.zeros(ncols) - mask[deleted] = 1 - indexer = (ai - mask.cumsum())[indexer] - - offset = 0 - for blk in blocks: - loc = len(blk.mgr_locs) - blk.mgr_locs = indexer[offset : (offset + loc)] - offset += loc - return new_items - class SingleBlockManager(BlockManager): """ manage a single block with """ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index a70247d9f7f9c..baabdf0fca29a 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -561,8 +561,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: elif not len(res_blocks): return obj.astype("float64") - new_cols = mgr.reset_dropped_locs(res_blocks, skipped) - new_mgr = type(mgr).from_blocks(res_blocks, [new_cols, obj.index]) + new_mgr = mgr._combine(res_blocks) out = obj._constructor(new_mgr) self._insert_on_column(out, obj) return out From b972e68cb1ff4ff4c9825930b67786acbdf1779b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 25 Aug 2020 22:12:31 +0200 Subject: [PATCH 0562/1025] DOC: avoid StorageOptions type alias in docstrings (#35894) --- pandas/io/excel/_base.py | 4 ++-- pandas/io/excel/_odfreader.py | 2 +- pandas/io/excel/_openpyxl.py | 2 +- pandas/io/excel/_pyxlsb.py | 2 +- pandas/io/excel/_xlrd.py | 2 +- pandas/io/feather_format.py | 12 ++++++++++-- 6 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index aaef71910c9ab..3cd0d721bbdc6 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -200,13 +200,13 @@ Duplicate columns will be specified as 'X', 'X.1', 
...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. -storage_options : StorageOptions +storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc., if using a URL that will be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index a6cd8f524503b..6cbca59aed97e 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -18,7 +18,7 @@ class _ODFReader(_BaseExcelReader): ---------- filepath_or_buffer : string, path to be parsed or an open readable stream. - storage_options : StorageOptions + storage_options : dict, optional passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 73239190604db..c2730536af8a3 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -479,7 +479,7 @@ def __init__( ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. - storage_options : StorageOptions + storage_options : dict, optional passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index c0e281ff6c2da..c15a52abe4d53 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -19,7 +19,7 @@ def __init__( ---------- filepath_or_buffer : str, path object, or Workbook Object to be parsed. 
- storage_options : StorageOptions + storage_options : dict, optional passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index ff1b3c8bdb964..a7fb519af61c6 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -17,7 +17,7 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. - storage_options : StorageOptions + storage_options : dict, optional passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 2d86fa44f22a4..fb606b5ec8aef 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -16,14 +16,13 @@ def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kw ---------- df : DataFrame path : string file path, or file-like object - storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc., if using a URL that will be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -106,6 +105,15 @@ def read_feather( Whether to parallelize reading using multiple threads. .. versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". 
An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. + + .. versionadded:: 1.2.0 Returns ------- From c9b4cf1902cf28d02f0b2bbcea264584383f855f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 25 Aug 2020 21:21:40 -0500 Subject: [PATCH 0563/1025] CI: Mark s3 tests parallel safe (#35895) Closes https://github.com/pandas-dev/pandas/issues/35856 --- pandas/tests/io/conftest.py | 24 +++++++++-------- pandas/tests/io/json/test_compression.py | 6 ++--- pandas/tests/io/json/test_pandas.py | 7 ++--- pandas/tests/io/test_parquet.py | 34 +++++++++++++++++------- 4 files changed, 41 insertions(+), 30 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 518f31d73efa9..193baa8c3ed74 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -34,12 +34,13 @@ def feather_file(datapath): @pytest.fixture -def s3so(): - return dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) +def s3so(worker_id): + worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw") + return dict(client_kwargs={"endpoint_url": f"http://127.0.0.1:555{worker_id}/"}) -@pytest.fixture(scope="module") -def s3_base(): +@pytest.fixture(scope="session") +def s3_base(worker_id): """ Fixture for mocking S3 interaction. 
@@ -61,11 +62,13 @@ def s3_base(): # Launching moto in server mode, i.e., as a separate process # with an S3 endpoint on localhost - endpoint_uri = "http://127.0.0.1:5555/" + worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw") + endpoint_port = f"555{worker_id}" + endpoint_uri = f"http://127.0.0.1:{endpoint_port}/" # pipe to null to avoid logging in terminal proc = subprocess.Popen( - shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL + shlex.split(f"moto_server s3 -p {endpoint_port}"), stdout=subprocess.DEVNULL ) timeout = 5 @@ -79,7 +82,7 @@ def s3_base(): pass timeout -= 0.1 time.sleep(0.1) - yield + yield endpoint_uri proc.terminate() proc.wait() @@ -119,9 +122,8 @@ def add_tips_files(bucket_name): cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f) bucket = "pandas-test" - endpoint_uri = "http://127.0.0.1:5555/" - conn = boto3.resource("s3", endpoint_url=endpoint_uri) - cli = boto3.client("s3", endpoint_url=endpoint_uri) + conn = boto3.resource("s3", endpoint_url=s3_base) + cli = boto3.client("s3", endpoint_url=s3_base) try: cli.create_bucket(Bucket=bucket) @@ -143,7 +145,7 @@ def add_tips_files(bucket_name): s3fs.S3FileSystem.clear_instance_cache() yield conn - s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base}) try: s3.rm(bucket, recursive=True) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 5bb205842269e..c0e3220454bf1 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -34,7 +34,7 @@ def test_read_zipped_json(datapath): @td.skip_if_not_us_locale -def test_with_s3_url(compression, s3_resource): +def test_with_s3_url(compression, s3_resource, s3so): # Bucket "pandas-test" created in tests/io/conftest.py df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}') @@ -45,9 +45,7 @@ def test_with_s3_url(compression, s3_resource): 
s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) roundtripped_df = pd.read_json( - "s3://pandas-test/test-1", - compression=compression, - storage_options=dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}), + "s3://pandas-test/test-1", compression=compression, storage_options=s3so, ) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 64a666079876f..2022abbaee323 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1702,17 +1702,14 @@ def test_json_multiindex(self, dataframe, expected): result = series.to_json(orient="index") assert result == expected - def test_to_s3(self, s3_resource): + def test_to_s3(self, s3_resource, s3so): import time # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) df.to_json( - f"s3://{mock_bucket_name}/{target_file}", - storage_options=dict( - client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"} - ), + f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so, ) timeout = 5 while True: diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4e0c16c71a6a8..15f9837176315 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -158,10 +158,6 @@ def check_round_trip( """ write_kwargs = write_kwargs or {"compression": None} read_kwargs = read_kwargs or {} - if isinstance(path, str) and "s3://" in path: - s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) - read_kwargs["storage_options"] = s3so - write_kwargs["storage_options"] = s3so if expected is None: expected = df @@ -555,15 +551,24 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): write_kwargs=kw, ) - def test_s3_roundtrip(self, df_compat, s3_resource, pa): + def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so): if 
LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"): pytest.skip() # GH #19134 - check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") + s3so = dict(storage_options=s3so) + check_round_trip( + df_compat, + pa, + path="s3://pandas-test/pyarrow.parquet", + read_kwargs=s3so, + write_kwargs=s3so, + ) @td.skip_if_no("s3fs") @pytest.mark.parametrize("partition_col", [["A"], []]) - def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): + def test_s3_roundtrip_for_dir( + self, df_compat, s3_resource, pa, partition_col, s3so + ): # GH #26388 expected_df = df_compat.copy() @@ -587,7 +592,10 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): pa, expected=expected_df, path="s3://pandas-test/parquet_dir", - write_kwargs={"partition_cols": partition_col, "compression": None}, + read_kwargs=dict(storage_options=s3so), + write_kwargs=dict( + partition_cols=partition_col, compression=None, storage_options=s3so + ), check_like=True, repeat=1, ) @@ -761,9 +769,15 @@ def test_filter_row_groups(self, fp): result = read_parquet(path, fp, filters=[("a", "==", 0)]) assert len(result) == 1 - def test_s3_roundtrip(self, df_compat, s3_resource, fp): + def test_s3_roundtrip(self, df_compat, s3_resource, fp, s3so): # GH #19134 - check_round_trip(df_compat, fp, path="s3://pandas-test/fastparquet.parquet") + check_round_trip( + df_compat, + fp, + path="s3://pandas-test/fastparquet.parquet", + read_kwargs=dict(storage_options=s3so), + write_kwargs=dict(compression=None, storage_options=s3so), + ) def test_partition_cols_supported(self, fp, df_full): # GH #23283 From c44d6dfee74883630cb57648c8d8f813394cd8fe Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 25 Aug 2020 23:21:44 -0400 Subject: [PATCH 0564/1025] CLN/BUG: Clean/Simplify _wrap_applied_output (#35792) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/groupby/generic.py | 90 
+++++++++--------------------- pandas/core/indexes/api.py | 7 ++- pandas/tests/groupby/test_apply.py | 7 ++- 4 files changed, 34 insertions(+), 71 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index adc1806523d6e..55570341cf4e8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -254,6 +254,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) - Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising an ``TypeError`` (:issue:`35325`) - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) +- Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on. (:issue:`35792`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 70a8379de64e9..2afa56b50c3c7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1197,57 +1197,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return self.obj._constructor(index=keys) - key_names = self.grouper.names - # GH12824 first_not_none = next(com.not_none(*values), None) if first_not_none is None: - # GH9684. If all values are None, then this will throw an error. - # We'd prefer it return an empty dataframe. + # GH9684 - All values are None, return an empty frame. 
return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - if len(self.grouper.groupings) > 1: - key_index = self.grouper.result_index - - else: - ping = self.grouper.groupings[0] - if len(keys) == ping.ngroups: - key_index = ping.group_index - key_index.name = key_names[0] - - key_lookup = Index(keys) - indexer = key_lookup.get_indexer(key_index) - - # reorder the values - values = [values[i] for i in indexer] - - # update due to the potential reorder - first_not_none = next(com.not_none(*values), None) - else: - - key_index = Index(keys, name=key_names[0]) - - # don't use the key indexer - if not self.as_index: - key_index = None + key_index = self.grouper.result_index if self.as_index else None - # make Nones an empty object - if first_not_none is None: - return self.obj._constructor() - elif isinstance(first_not_none, NDFrame): + if isinstance(first_not_none, Series): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() - if isinstance(first_not_none, Series): - backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object - ) - else: - backup = first_not_none._constructor(**kwargs) + backup = create_series_with_explicit_dtype( + **kwargs, dtype_if_empty=object + ) values = [x if (x is not None) else backup for x in values] @@ -1256,7 +1224,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = all_indexes_same([x.index for x in values]) + all_indexed_same = all_indexes_same((x.index for x in values)) singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 @@ -1288,7 +1256,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): 
# GH 8467 return self._concat_objects(keys, values, not_indexed_same=True) - if self.axis == 0 and isinstance(v, ABCSeries): # GH6124 if the list of Series have a consistent name, # then propagate that name to the result. index = v.index.copy() @@ -1301,34 +1268,27 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(names) == 1: index.name = list(names)[0] - # normally use vstack as its faster than concat - # and if we have mi-columns - if ( - isinstance(v.index, MultiIndex) - or key_index is None - or isinstance(key_index, MultiIndex) - ): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = self.obj._constructor( - stacked_values, index=key_index, columns=index - ) - else: - # GH5788 instead of stacking; concat gets the - # dtypes correct - from pandas.core.reshape.concat import concat - - result = concat( - values, - keys=key_index, - names=key_index.names, - axis=self.axis, - ).unstack() - result.columns = index - elif isinstance(v, ABCSeries): + # Combine values + # vstack+constructor is faster than concat and handles MI-columns stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = v.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] + else: + index = v.index + columns = key_index + stacked_values = stacked_values.T + result = self.obj._constructor( - stacked_values.T, index=v.index, columns=key_index + stacked_values, index=index, columns=columns ) + elif not self.as_index: # We add grouping column below, so create a frame here result = DataFrame( diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 30cc8cf480dcf..d352b001f5d2a 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -297,15 +297,16 @@ def all_indexes_same(indexes): Parameters ---------- - indexes : list of 
Index objects + indexes : iterable of Index objects Returns ------- bool True if all indexes contain the same elements, False otherwise. """ - first = indexes[0] - for index in indexes[1:]: + itr = iter(indexes) + first = next(itr) + for index in itr: if not first.equals(index): return False return True diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ee38722ffb8ce..a1dcb28a32c6c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -861,13 +861,14 @@ def test_apply_multi_level_name(category): b = [1, 2] * 5 if category: b = pd.Categorical(b, categories=[1, 2, 3]) + expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B") + else: + expected_index = pd.Index([1, 2], name="B") df = pd.DataFrame( {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) result = df.groupby("B").apply(lambda x: x.sum()) - expected = pd.DataFrame( - {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B") - ) + expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"] From 78d4101422abfc6bd326e1c7324ff1c385780c75 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Wed, 26 Aug 2020 12:12:55 +0100 Subject: [PATCH 0565/1025] PERF: RangeIndex.format performance (#35712) --- doc/source/whatsnew/v0.25.0.rst | 2 +- doc/source/whatsnew/v1.1.2.rst | 5 +++-- pandas/core/indexes/base.py | 4 +++- pandas/core/indexes/category.py | 2 +- pandas/core/indexes/datetimelike.py | 11 ++++++++--- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/range.py | 11 ++++++++++- pandas/tests/indexes/common.py | 10 ++++++++-- pandas/tests/indexes/period/test_period.py | 6 ++++++ pandas/tests/indexes/ranges/test_range.py | 12 ++++++++++++ 10 files changed, 53 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst 
index 3cd920158f774..0f0f009307c75 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -540,7 +540,7 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 .. ipython:: python - df.describe() + df.describe() ``__str__`` methods now call ``__repr__`` rather than vice versa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index af61354470a71..7739a483e3d38 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -15,8 +15,9 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) +- Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - -- + .. --------------------------------------------------------------------------- @@ -26,7 +27,7 @@ Bug fixes ~~~~~~~~~ - Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) -- +- Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) - .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ceb109fdf6d7a..b1e5d5627e3f6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -933,7 +933,9 @@ def format( return self._format_with_header(header, na_rep=na_rep) - def _format_with_header(self, header, na_rep="NaN") -> List[str_t]: + def _format_with_header( + self, header: List[str_t], na_rep: str_t = "NaN" + ) -> List[str_t]: from pandas.io.formats.format import format_array values = self._values diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 4990e6a8e20e9..cbb30763797d1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -347,7 +347,7 @@ def _format_attrs(self): attrs.append(("length", len(self))) return attrs - def _format_with_header(self, header, na_rep="NaN") -> List[str]: + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: from pandas.io.formats.printing import pprint_thing result = [ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 9d00f50a65a06..0e8d7c1b866b8 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -354,15 +354,20 @@ def format( """ header = [] if name: - fmt_name = ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) - header.append(fmt_name) + header.append( + ibase.pprint_thing(self.name, escape_chars=("\t", "\r", "\n")) + if self.name is not None + else "" + ) if formatter is not None: return header + list(self.map(formatter)) return self._format_with_header(header, na_rep=na_rep, date_format=date_format) - def _format_with_header(self, header, na_rep="NaT", date_format=None) -> List[str]: + def _format_with_header( + self, header: List[str], na_rep: str = "NaT", date_format: Optional[str] = None + ) -> List[str]: return header + list( 
self._format_native_types(na_rep=na_rep, date_format=date_format) ) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index e8d0a44324cc5..9281f8017761d 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -948,7 +948,7 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): # Rendering Methods # __repr__ associated methods are based on MultiIndex - def _format_with_header(self, header, na_rep="NaN") -> List[str]: + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: return header + list(self._format_native_types(na_rep=na_rep)) def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index c5572a9de7fa5..b85e2d3947cb1 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -1,7 +1,7 @@ from datetime import timedelta import operator from sys import getsizeof -from typing import Any +from typing import Any, List import warnings import numpy as np @@ -187,6 +187,15 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_with_header(self, header: List[str], na_rep: str = "NaN") -> List[str]: + if not len(self._range): + return header + first_val_str = str(self._range[0]) + last_val_str = str(self._range[-1]) + max_length = max(len(first_val_str), len(last_val_str)) + + return header + [f"{x:<{max_length}}" for x in self._range] + # -------------------------------------------------------------------- _deprecation_message = ( "RangeIndex.{} is deprecated and will be " diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index e4d0b46f7c716..e95e7267f17ec 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1,5 +1,5 @@ import gc -from typing import Optional, Type +from typing import Type import numpy as np import pytest @@ -33,7 +33,7 @@ class 
Base: """ base class for index sub-class tests """ - _holder: Optional[Type[Index]] = None + _holder: Type[Index] _compat_props = ["shape", "ndim", "size", "nbytes"] def create_index(self) -> Index: @@ -686,6 +686,12 @@ def test_format(self): expected = [str(x) for x in idx] assert idx.format() == expected + def test_format_empty(self): + # GH35712 + empty_idx = self._holder([]) + assert empty_idx.format() == [] + assert empty_idx.format(name=True) == [""] + def test_hasnans_isnans(self, index): # GH 11343, added tests for hasnans / isnans if isinstance(index, MultiIndex): diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 15a88ab3819ce..085d41aaa5b76 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -536,6 +536,12 @@ def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key): with pytest.raises(KeyError, match=msg): df.loc[key] + def test_format_empty(self): + # GH35712 + empty_idx = self._holder([], freq="A") + assert empty_idx.format() == [] + assert empty_idx.format(name=True) == [""] + def test_maybe_convert_timedelta(): pi = PeriodIndex(["2000", "2001"], freq="D") diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index c4c242746e92c..172cd4a106ac1 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -171,8 +171,14 @@ def test_cache(self): pass assert idx._cache == {} + idx.format() + assert idx._cache == {} + df = pd.DataFrame({"a": range(10)}, index=idx) + str(df) + assert idx._cache == {} + df.loc[50] assert idx._cache == {} @@ -515,3 +521,9 @@ def test_engineless_lookup(self): idx.get_loc("a") assert "_engine" not in idx._cache + + def test_format_empty(self): + # GH35712 + empty_idx = self._holder(0) + assert empty_idx.format() == [] + assert empty_idx.format(name=True) == [""] From 9515944cb81daacb24fabe404289d59d77fc9afd 
Mon Sep 17 00:00:00 2001 From: Avinash Pancham <44933366+avinashpancham@users.noreply.github.com> Date: Wed, 26 Aug 2020 14:44:25 +0200 Subject: [PATCH 0566/1025] TST: Verify whether read-only datetime64 array can be factorized (35650) (#35775) * TST: Verify whether read-only datetime64 array can be factorized (35650) * CLN: Make test more inline with similar factorize tests --- pandas/tests/test_algos.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6c6bdb6b1b2bd..67a2dc2303550 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -252,6 +252,19 @@ def test_object_factorize(self, writable): tm.assert_numpy_array_equal(codes, expected_codes) tm.assert_numpy_array_equal(uniques, expected_uniques) + def test_datetime64_factorize(self, writable): + # GH35650 Verify whether read-only datetime64 array can be factorized + data = np.array([np.datetime64("2020-01-01T00:00:00.000")]) + data.setflags(write=writable) + expected_codes = np.array([0], dtype=np.int64) + expected_uniques = np.array( + ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]" + ) + + codes, uniques = pd.factorize(data) + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_numpy_array_equal(uniques, expected_uniques) + def test_deprecate_order(self): # gh 19727 - check warning is raised for deprecated keyword, order. # Test not valid once order keyword is removed. 
From 9f6f4c418bf365ddd5d935c2647cdc4d87feaf0b Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Wed, 26 Aug 2020 08:57:40 -0500 Subject: [PATCH 0567/1025] Bump asv Python version (#35798) * Bump asv Python version * 3.7.1 -> 3.8 --- asv_bench/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 4583fac85b776..1863a17e3d5f7 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -26,7 +26,7 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. // "pythons": ["2.7", "3.4"], - "pythons": ["3.6"], + "pythons": ["3.8"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty From 4940165c4666e1ce2b4daca783bdc290001cfffb Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Wed, 26 Aug 2020 21:22:02 -0500 Subject: [PATCH 0568/1025] Fix Series construction from Sparse["datetime64[ns]"] (#35838) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/construction.py | 6 ++++-- pandas/core/dtypes/cast.py | 7 +++++-- pandas/tests/series/test_constructors.py | 15 +++++++++++++++ 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 7739a483e3d38..9747a8ef3e71f 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -26,6 +26,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ - Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) +- Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and 
:meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) - diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 47f10f1f65f4a..e8c9f28e50084 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -35,6 +35,7 @@ is_iterator, is_list_like, is_object_dtype, + is_sparse, is_timedelta64_ns_dtype, ) from pandas.core.dtypes.generic import ( @@ -535,9 +536,10 @@ def _try_cast( if maybe_castable(arr) and not copy and dtype is None: return arr - if isinstance(dtype, ExtensionDtype) and dtype.kind != "M": + if isinstance(dtype, ExtensionDtype) and (dtype.kind != "M" or is_sparse(dtype)): # create an extension array from its dtype - # DatetimeTZ case needs to go through maybe_cast_to_datetime + # DatetimeTZ case needs to go through maybe_cast_to_datetime but + # SparseDtype does not array_type = dtype.construct_array_type()._from_sequence subarr = array_type(arr, dtype=dtype, copy=copy) return subarr diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 2697f42eb05a4..e6b4cb598989b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -50,6 +50,7 @@ is_numeric_dtype, is_object_dtype, is_scalar, + is_sparse, is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, @@ -1323,7 +1324,9 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): f"Please pass in '{dtype.name}[ns]' instead." ) - if is_datetime64 and not is_dtype_equal(dtype, DT64NS_DTYPE): + if is_datetime64 and not is_dtype_equal( + getattr(dtype, "subtype", dtype), DT64NS_DTYPE + ): # pandas supports dtype whose granularity is less than [ns] # e.g., [ps], [fs], [as] @@ -1355,7 +1358,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): if is_scalar(value): if value == iNaT or isna(value): value = iNaT - else: + elif not is_sparse(value): value = np.array(value, copy=False) # have a scalar array-like (e.g. 
NaT) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1dd410ad02ee0..bcf7039ec9039 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1449,3 +1449,18 @@ def test_constructor_datetimelike_scalar_to_string_dtype(self): result = Series("M", index=[1, 2, 3], dtype="string") expected = pd.Series(["M", "M", "M"], index=[1, 2, 3], dtype="string") tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + [np.datetime64("2012-01-01"), np.datetime64("2013-01-01")], + ["2012-01-01", "2013-01-01"], + ], + ) + def test_constructor_sparse_datetime64(self, values): + # https://github.com/pandas-dev/pandas/issues/35762 + dtype = pd.SparseDtype("datetime64[ns]") + result = pd.Series(values, dtype=dtype) + arr = pd.arrays.SparseArray(values, dtype=dtype) + expected = pd.Series(arr) + tm.assert_series_equal(result, expected) From c62a02ce815b203c33be39d08a8ff9d6a31dbd5f Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Thu, 27 Aug 2020 17:06:25 +0100 Subject: [PATCH 0569/1025] Attempt to unpin pytest-xdist (#35910) --- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/azure-windows-38.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 4894129915722..1d15ca41c0f8e 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21,<2.0.0 # GH 35737 + - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 2853e12b28e35..23bede5eb26f1 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -8,7 +8,7 @@ dependencies: # tools - cython>=0.29.16 - pytest>=5.0.1 - - pytest-xdist>=1.21,<2.0.0 # GH 35737 + - pytest-xdist>=1.21 - hypothesis>=3.58.0 - 
pytest-azurepipelines From c4814993b97ceb848a87ecaf16eac061ad39e30a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 27 Aug 2020 11:25:02 -0700 Subject: [PATCH 0570/1025] TYP: annotate tseries.holiday (#35913) --- pandas/tseries/holiday.py | 28 +++++++++++++++------------- setup.cfg | 3 --- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py index 8ab37f787bd10..d8a3040919e7b 100644 --- a/pandas/tseries/holiday.py +++ b/pandas/tseries/holiday.py @@ -12,7 +12,7 @@ from pandas.tseries.offsets import Day, Easter -def next_monday(dt): +def next_monday(dt: datetime) -> datetime: """ If holiday falls on Saturday, use following Monday instead; if holiday falls on Sunday, use Monday instead @@ -24,7 +24,7 @@ def next_monday(dt): return dt -def next_monday_or_tuesday(dt): +def next_monday_or_tuesday(dt: datetime) -> datetime: """ For second holiday of two adjacent ones! If holiday falls on Saturday, use following Monday instead; @@ -39,7 +39,7 @@ def next_monday_or_tuesday(dt): return dt -def previous_friday(dt): +def previous_friday(dt: datetime) -> datetime: """ If holiday falls on Saturday or Sunday, use previous Friday instead. """ @@ -50,7 +50,7 @@ def previous_friday(dt): return dt -def sunday_to_monday(dt): +def sunday_to_monday(dt: datetime) -> datetime: """ If holiday falls on Sunday, use day thereafter (Monday) instead. """ @@ -59,7 +59,7 @@ def sunday_to_monday(dt): return dt -def weekend_to_monday(dt): +def weekend_to_monday(dt: datetime) -> datetime: """ If holiday falls on Sunday or Saturday, use day thereafter (Monday) instead. @@ -72,7 +72,7 @@ def weekend_to_monday(dt): return dt -def nearest_workday(dt): +def nearest_workday(dt: datetime) -> datetime: """ If holiday falls on Saturday, use day before (Friday) instead; if holiday falls on Sunday, use day thereafter (Monday) instead. 
@@ -84,7 +84,7 @@ def nearest_workday(dt): return dt -def next_workday(dt): +def next_workday(dt: datetime) -> datetime: """ returns next weekday used for observances """ @@ -95,7 +95,7 @@ def next_workday(dt): return dt -def previous_workday(dt): +def previous_workday(dt: datetime) -> datetime: """ returns previous weekday used for observances """ @@ -106,14 +106,14 @@ def previous_workday(dt): return dt -def before_nearest_workday(dt): +def before_nearest_workday(dt: datetime) -> datetime: """ returns previous workday after nearest workday """ return previous_workday(nearest_workday(dt)) -def after_nearest_workday(dt): +def after_nearest_workday(dt: datetime) -> datetime: """ returns next workday after nearest workday needed for Boxing day or multiple holidays in a series @@ -428,9 +428,11 @@ def holidays(self, start=None, end=None, return_name=False): # If we don't have a cache or the dates are outside the prior cache, we # get them again if self._cache is None or start < self._cache[0] or end > self._cache[1]: - holidays = [rule.dates(start, end, return_name=True) for rule in self.rules] - if holidays: - holidays = concat(holidays) + pre_holidays = [ + rule.dates(start, end, return_name=True) for rule in self.rules + ] + if pre_holidays: + holidays = concat(pre_holidays) else: holidays = Series(index=DatetimeIndex([]), dtype=object) diff --git a/setup.cfg b/setup.cfg index e4c0b3dcf37ef..aa1535a171f0a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -276,6 +276,3 @@ check_untyped_defs=False [mypy-pandas.plotting._matplotlib.misc] check_untyped_defs=False - -[mypy-pandas.tseries.holiday] -check_untyped_defs=False From 09ec9b6898f18ca6527c4f9cfee3cc8d98e9c8c6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 28 Aug 2020 02:07:05 -0700 Subject: [PATCH 0571/1025] TYP: annotations in pandas.plotting (#35935) --- pandas/plotting/_matplotlib/converter.py | 9 +++++---- pandas/plotting/_matplotlib/timeseries.py | 4 ++-- pandas/plotting/_matplotlib/tools.py | 22 
++++++++++++++-------- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 8f2080658e63e..214a67690d695 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -1,7 +1,8 @@ import contextlib import datetime as pydt -from datetime import datetime, timedelta +from datetime import datetime, timedelta, tzinfo import functools +from typing import Optional, Tuple from dateutil.relativedelta import relativedelta import matplotlib.dates as dates @@ -152,7 +153,7 @@ def axisinfo(unit, axis): return units.AxisInfo(majloc=majloc, majfmt=majfmt, label="time") @staticmethod - def default_units(x, axis): + def default_units(x, axis) -> str: return "time" @@ -421,7 +422,7 @@ def autoscale(self): return self.nonsingular(vmin, vmax) -def _from_ordinal(x, tz=None): +def _from_ordinal(x, tz: Optional[tzinfo] = None) -> datetime: ix = int(x) dt = datetime.fromordinal(ix) remainder = float(x) - ix @@ -450,7 +451,7 @@ def _from_ordinal(x, tz=None): # ------------------------------------------------------------------------- -def _get_default_annual_spacing(nyears): +def _get_default_annual_spacing(nyears) -> Tuple[int, int]: """ Returns a default spacing between consecutive ticks for annual data. 
""" diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index eef4276f0ed09..193602e1baf4a 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -62,13 +62,13 @@ def _maybe_resample(series: "Series", ax, kwargs): return freq, series -def _is_sub(f1, f2): +def _is_sub(f1: str, f2: str) -> bool: return (f1.startswith("W") and is_subperiod("D", f2)) or ( f2.startswith("W") and is_subperiod(f1, "D") ) -def _is_sup(f1, f2): +def _is_sup(f1: str, f2: str) -> bool: return (f1.startswith("W") and is_superperiod("D", f2)) or ( f2.startswith("W") and is_superperiod(f1, "D") ) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index caf2f27de9276..26b25597ce1a6 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -1,16 +1,22 @@ # being a bit too dynamic from math import ceil +from typing import TYPE_CHECKING, Tuple import warnings import matplotlib.table import matplotlib.ticker as ticker import numpy as np +from pandas._typing import FrameOrSeries + from pandas.core.dtypes.common import is_list_like from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.plotting._matplotlib import compat +if TYPE_CHECKING: + from matplotlib.table import Table + def format_date_labels(ax, rot): # mini version of autofmt_xdate @@ -21,7 +27,7 @@ def format_date_labels(ax, rot): fig.subplots_adjust(bottom=0.2) -def table(ax, data, rowLabels=None, colLabels=None, **kwargs): +def table(ax, data: FrameOrSeries, rowLabels=None, colLabels=None, **kwargs) -> "Table": if isinstance(data, ABCSeries): data = data.to_frame() elif isinstance(data, ABCDataFrame): @@ -43,7 +49,7 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): return table -def _get_layout(nplots, layout=None, layout_type="box"): +def _get_layout(nplots: int, layout=None, layout_type: str = "box") -> Tuple[int, 
int]: if layout is not None: if not isinstance(layout, (tuple, list)) or len(layout) != 2: raise ValueError("Layout must be a tuple of (rows, columns)") @@ -92,14 +98,14 @@ def _get_layout(nplots, layout=None, layout_type="box"): def _subplots( - naxes=None, - sharex=False, - sharey=False, - squeeze=True, + naxes: int, + sharex: bool = False, + sharey: bool = False, + squeeze: bool = True, subplot_kw=None, ax=None, layout=None, - layout_type="box", + layout_type: str = "box", **fig_kw, ): """ @@ -369,7 +375,7 @@ def _get_all_lines(ax): return lines -def _get_xlim(lines): +def _get_xlim(lines) -> Tuple[float, float]: left, right = np.inf, -np.inf for l in lines: x = l.get_xdata(orig=False) From 04c5e8301cb30537203b3447c645160a145dace4 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 28 Aug 2020 10:21:58 +0100 Subject: [PATCH 0572/1025] remove unnecessary trailing commas (#35930) --- pandas/core/aggregation.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 891048ae82dfd..e2374b81ca13b 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -28,10 +28,8 @@ def reconstruct_func( - func: Optional[AggFuncType], **kwargs, -) -> Tuple[ - bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]], -]: + func: Optional[AggFuncType], **kwargs +) -> Tuple[bool, Optional[AggFuncType], Optional[List[str]], Optional[List[int]]]: """ This is the internal function to reconstruct func given if there is relabeling or not and also normalize the keyword to get new order of columns. 
From 2ae173eccac7fe6cacef57b401596d1b43df4719 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 28 Aug 2020 10:26:43 +0100 Subject: [PATCH 0573/1025] CLN remove unnecessary trailing commas to get ready for new version of black: _testing -> generic (#35949) * pandas/_testing.py * pandas/core/algorithms.py * pandas/core/arrays/_mixins.py * pandas/core/arrays/categorical.py * pandas/core/arrays/integer.py * pandas/core/arrays/masked.py * pandas/core/arrays/numpy_.py * pandas/core/arrays/period.py * pandas/core/construction.py * pandas/core/generic.py --- pandas/_testing.py | 12 +++++------- pandas/core/algorithms.py | 2 +- pandas/core/arrays/_mixins.py | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/arrays/integer.py | 4 ++-- pandas/core/arrays/masked.py | 2 +- pandas/core/arrays/numpy_.py | 2 +- pandas/core/arrays/period.py | 2 +- pandas/core/construction.py | 4 +--- pandas/core/generic.py | 18 +++++++----------- 10 files changed, 21 insertions(+), 29 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index ef6232fa6d575..b402b040d9268 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -939,7 +939,7 @@ def assert_categorical_equal( if check_category_order: assert_index_equal(left.categories, right.categories, obj=f"{obj}.categories") assert_numpy_array_equal( - left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes", + left.codes, right.codes, check_dtype=check_dtype, obj=f"{obj}.codes" ) else: try: @@ -948,9 +948,7 @@ def assert_categorical_equal( except TypeError: # e.g. 
'<' not supported between instances of 'int' and 'str' lc, rc = left.categories, right.categories - assert_index_equal( - lc, rc, obj=f"{obj}.categories", - ) + assert_index_equal(lc, rc, obj=f"{obj}.categories") assert_index_equal( left.categories.take(left.codes), right.categories.take(right.codes), @@ -1092,7 +1090,7 @@ def _raise(left, right, err_msg): if err_msg is None: if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shapes are different", left.shape, right.shape, + obj, f"{obj} shapes are different", left.shape, right.shape ) diff = 0 @@ -1559,7 +1557,7 @@ def assert_frame_equal( # shape comparison if left.shape != right.shape: raise_assert_detail( - obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}", + obj, f"{obj} shape mismatch", f"{repr(left.shape)}", f"{repr(right.shape)}" ) if check_like: @@ -2884,7 +2882,7 @@ def convert_rows_list_to_csv_str(rows_list: List[str]): return expected -def external_error_raised(expected_exception: Type[Exception],) -> ContextManager: +def external_error_raised(expected_exception: Type[Exception]) -> ContextManager: """ Helper function to mark pytest.raises that have an external error message. diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index befde7c355818..2a6e983eff3ee 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -462,7 +462,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: def _factorize_array( - values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None, + values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None ) -> Tuple[np.ndarray, np.ndarray]: """ Factorize an array-like to codes and uniques. 
diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 832d09b062265..2976747d66dfa 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -40,7 +40,7 @@ def take( fill_value = self._validate_fill_value(fill_value) new_data = take( - self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value, + self._ndarray, indices, allow_fill=allow_fill, fill_value=fill_value ) return self._from_backing_data(new_data) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a28b341669918..27b1afdb438cb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1505,7 +1505,7 @@ def argsort(self, ascending=True, kind="quicksort", **kwargs): return super().argsort(ascending=ascending, kind=kind, **kwargs) def sort_values( - self, inplace: bool = False, ascending: bool = True, na_position: str = "last", + self, inplace: bool = False, ascending: bool = True, na_position: str = "last" ): """ Sort the Categorical by category value returning a new diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 57df067c7b16e..d83ff91a1315f 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -138,7 +138,7 @@ def __from_arrow__( return IntegerArray._concat_same_type(results) -def integer_array(values, dtype=None, copy: bool = False,) -> "IntegerArray": +def integer_array(values, dtype=None, copy: bool = False) -> "IntegerArray": """ Infer and return an integer array of the values. 
@@ -182,7 +182,7 @@ def safe_cast(values, dtype, copy: bool): def coerce_to_array( - values, dtype, mask=None, copy: bool = False, + values, dtype, mask=None, copy: bool = False ) -> Tuple[np.ndarray, np.ndarray]: """ Coerce the input values array to numpy arrays with a mask diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 235840d6d201e..1237dea5c1a64 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -126,7 +126,7 @@ def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT: return type(self)(~self._data, self._mask) def to_numpy( - self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default, + self, dtype=None, copy: bool = False, na_value: Scalar = lib.no_default ) -> np.ndarray: """ Convert to a NumPy Array. diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 05f901518d82f..23a4a70734c81 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -280,7 +280,7 @@ def isna(self) -> np.ndarray: return isna(self._ndarray) def fillna( - self, value=None, method: Optional[str] = None, limit: Optional[int] = None, + self, value=None, method: Optional[str] = None, limit: Optional[int] = None ) -> "PandasArray": # TODO(_values_for_fillna): remove this value, method = validate_fillna_kwargs(value, method) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index ddaf6d39f1837..cc39ffb5d1203 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -634,7 +634,7 @@ def _sub_period_array(self, other): return new_values def _addsub_int_array( - self, other: np.ndarray, op: Callable[[Any, Any], Any], + self, other: np.ndarray, op: Callable[[Any, Any], Any] ) -> "PeriodArray": """ Add or subtract array of integers; equivalent to applying diff --git a/pandas/core/construction.py b/pandas/core/construction.py index e8c9f28e50084..f145e76046bee 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py 
@@ -514,9 +514,7 @@ def sanitize_array( return subarr -def _try_cast( - arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool, -): +def _try_cast(arr, dtype: Optional[DtypeObj], copy: bool, raise_cast_failure: bool): """ Convert input to numpy ndarray and optionally cast to a given dtype. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 286da6e1de9d5..fea3efedb6abb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -315,17 +315,13 @@ def _data(self): @property def _AXIS_NUMBERS(self) -> Dict[str, int]: """.. deprecated:: 1.1.0""" - warnings.warn( - "_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=3, - ) + warnings.warn("_AXIS_NUMBERS has been deprecated.", FutureWarning, stacklevel=3) return {"index": 0} @property def _AXIS_NAMES(self) -> Dict[int, str]: """.. deprecated:: 1.1.0""" - warnings.warn( - "_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=3, - ) + warnings.warn("_AXIS_NAMES has been deprecated.", FutureWarning, stacklevel=3) return {0: "index"} def _construct_axes_dict(self, axes=None, **kwargs): @@ -5128,7 +5124,7 @@ def pipe(self, func, *args, **kwargs): ... .pipe(g, arg1=a) ... .pipe((func, 'arg2'), arg1=a, arg3=c) ... 
) # doctest: +SKIP - """ + """ return com.pipe(self, func, *args, **kwargs) _shared_docs["aggregate"] = dedent( @@ -5630,7 +5626,7 @@ def astype( else: # else, only a single dtype is given - new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors,) + new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) return self._constructor(new_data).__finalize__(self, method="astype") # GH 33113: handle empty frame or series @@ -6520,7 +6516,7 @@ def replace( 3 b 4 b dtype: object - """ + """ if not ( is_scalar(to_replace) or is_re_compilable(to_replace) @@ -7772,7 +7768,7 @@ def between_time( raise TypeError("Index must be DatetimeIndex") indexer = index.indexer_between_time( - start_time, end_time, include_start=include_start, include_end=include_end, + start_time, end_time, include_start=include_start, include_end=include_end ) return self._take_with_is_copy(indexer, axis=axis) @@ -8939,7 +8935,7 @@ def _where( self._check_inplace_setting(other) new_data = self._mgr.putmask( - mask=cond, new=other, align=align, axis=block_axis, + mask=cond, new=other, align=align, axis=block_axis ) result = self._constructor(new_data) return self._update_inplace(result) From 9857b435c085fd3e1225450f732de72714b332aa Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 28 Aug 2020 10:27:16 +0100 Subject: [PATCH 0574/1025] CLN remove unnecessary trailing commas to get ready for new version of black: generic -> blocks (#35950) * pandas/core/groupby/generic.py * pandas/core/groupby/groupby.py * pandas/core/groupby/ops.py * pandas/core/indexes/datetimelike.py * pandas/core/indexes/interval.py * pandas/core/indexes/numeric.py * pandas/core/indexes/range.py * pandas/core/internals/blocks.py --- pandas/core/groupby/generic.py | 8 ++------ pandas/core/groupby/groupby.py | 8 +++----- pandas/core/groupby/ops.py | 14 ++++---------- pandas/core/indexes/datetimelike.py | 4 +--- pandas/core/indexes/interval.py | 4 ++-- pandas/core/indexes/numeric.py | 2 +- 
pandas/core/indexes/range.py | 2 +- pandas/core/internals/blocks.py | 30 ++++++++++++++--------------- 8 files changed, 29 insertions(+), 43 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 2afa56b50c3c7..82e629d184b19 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -221,9 +221,7 @@ def _selection_name(self): def apply(self, func, *args, **kwargs): return super().apply(func, *args, **kwargs) - @doc( - _agg_template, examples=_agg_examples_doc, klass="Series", - ) + @doc(_agg_template, examples=_agg_examples_doc, klass="Series") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): @@ -935,9 +933,7 @@ class DataFrameGroupBy(GroupBy[DataFrame]): See :ref:`groupby.aggregate.named` for more.""" ) - @doc( - _agg_template, examples=_agg_examples_doc, klass="DataFrame", - ) + @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): if maybe_use_numba(engine): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f96b488fb8d0d..a91366af61d0d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1077,7 +1077,7 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) tuple(args), kwargs, func, engine_kwargs ) result = numba_agg_func( - sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns), + sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) ) if cache_key not in NUMBA_FUNC_CACHE: NUMBA_FUNC_CACHE[cache_key] = numba_agg_func @@ -1595,8 +1595,7 @@ def max(self, numeric_only: bool = False, min_count: int = -1): def first(self, numeric_only: bool = False, min_count: int = -1): def first_compat(obj: FrameOrSeries, axis: int = 0): def first(x: Series): - """Helper function for first item that isn't NA. 
- """ + """Helper function for first item that isn't NA.""" x = x.array[notna(x.array)] if len(x) == 0: return np.nan @@ -1620,8 +1619,7 @@ def first(x: Series): def last(self, numeric_only: bool = False, min_count: int = -1): def last_compat(obj: FrameOrSeries, axis: int = 0): def last(x: Series): - """Helper function for last item that isn't NA. - """ + """Helper function for last item that isn't NA.""" x = x.array[notna(x.array)] if len(x) == 0: return np.nan diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c6171a55359fe..290680f380f5f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -583,7 +583,7 @@ def transform(self, values, how: str, axis: int = 0, **kwargs): return self._cython_operation("transform", values, how, axis, **kwargs) def _aggregate( - self, result, counts, values, comp_ids, agg_func, min_count: int = -1, + self, result, counts, values, comp_ids, agg_func, min_count: int = -1 ): if agg_func is libgroupby.group_nth: # different signature from the others @@ -603,9 +603,7 @@ def _transform( return result - def agg_series( - self, obj: Series, func: F, *args, **kwargs, - ): + def agg_series(self, obj: Series, func: F, *args, **kwargs): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 @@ -653,9 +651,7 @@ def _aggregate_series_fast(self, obj: Series, func: F): result, counts = grouper.get_result() return result, counts - def _aggregate_series_pure_python( - self, obj: Series, func: F, *args, **kwargs, - ): + def _aggregate_series_pure_python(self, obj: Series, func: F, *args, **kwargs): group_index, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) @@ -841,9 +837,7 @@ def groupings(self) -> "List[grouper.Grouping]": for lvl, name in zip(self.levels, self.names) ] - def agg_series( - self, obj: Series, func: F, *args, **kwargs, - ): + def agg_series(self, obj: Series, func: F, *args, **kwargs): # Caller is responsible for checking ngroups != 0 assert 
self.ngroups != 0 assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 0e8d7c1b866b8..efe1a853a9a76 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -81,9 +81,7 @@ def wrapper(left, right): DatetimeLikeArrayMixin, cache=True, ) -@inherit_names( - ["mean", "asi8", "freq", "freqstr", "_box_func"], DatetimeLikeArrayMixin, -) +@inherit_names(["mean", "asi8", "freq", "freqstr", "_box_func"], DatetimeLikeArrayMixin) class DatetimeIndexOpsMixin(ExtensionIndex): """ Common ops mixin to support a unified interface datetimelike Index. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9281f8017761d..5d309ef7cd515 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -182,10 +182,10 @@ def func(intvidx_self, other, sort=False): ) @inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True) @inherit_names( - ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray, + ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray ) @inherit_names( - ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True, + ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True ) class IntervalIndex(IntervalMixin, ExtensionIndex): _typ = "intervalindex" diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 731907993d08f..80bb9f10fadd9 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -436,7 +436,7 @@ def isin(self, values, level=None): def _is_compatible_with_other(self, other) -> bool: return super()._is_compatible_with_other(other) or all( isinstance( - obj, (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex), + obj, (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex) ) for obj in [self, other] ) 
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b85e2d3947cb1..f1457a9aac62b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -82,7 +82,7 @@ class RangeIndex(Int64Index): # Constructors def __new__( - cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, + cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None ): cls._validate_dtype(dtype) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c62be4f767f00..a38b47a4c2a25 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -724,7 +724,7 @@ def replace( # _can_hold_element checks have reduced this back to the # scalar case and we can avoid a costly object cast return self.replace( - to_replace[0], value, inplace=inplace, regex=regex, convert=convert, + to_replace[0], value, inplace=inplace, regex=regex, convert=convert ) # GH 22083, TypeError or ValueError occurred within error handling @@ -905,7 +905,7 @@ def setitem(self, indexer, value): return block def putmask( - self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, + self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False ) -> List["Block"]: """ putmask the data to the block; it is possible that we may create a @@ -1292,7 +1292,7 @@ def shift(self, periods: int, axis: int = 0, fill_value=None): return [self.make_block(new_values)] def where( - self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 ) -> List["Block"]: """ evaluate the block; return result block(s) from the result @@ -1366,7 +1366,7 @@ def where_func(cond, values, other): # we are explicitly ignoring errors block = self.coerce_to_target_dtype(other) blocks = block.where( - orig_other, cond, errors=errors, try_cast=try_cast, axis=axis, + orig_other, cond, errors=errors, try_cast=try_cast, 
axis=axis ) return self._maybe_downcast(blocks, "infer") @@ -1605,7 +1605,7 @@ def set(self, locs, values): self.values = values def putmask( - self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False, + self, mask, new, inplace: bool = False, axis: int = 0, transpose: bool = False ) -> List["Block"]: """ See Block.putmask.__doc__ @@ -1816,7 +1816,7 @@ def diff(self, n: int, axis: int = 1) -> List["Block"]: return super().diff(n, axis) def shift( - self, periods: int, axis: int = 0, fill_value: Any = None, + self, periods: int, axis: int = 0, fill_value: Any = None ) -> List["ExtensionBlock"]: """ Shift the block by `periods`. @@ -1833,7 +1833,7 @@ def shift( ] def where( - self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0, + self, other, cond, errors="raise", try_cast: bool = False, axis: int = 0 ) -> List["Block"]: cond = _extract_bool_array(cond) @@ -1945,7 +1945,7 @@ def _can_hold_element(self, element: Any) -> bool: ) def to_native_types( - self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs, + self, na_rep="", float_format=None, decimal=".", quoting=None, **kwargs ): """ convert to our native types format """ values = self.values @@ -2369,7 +2369,7 @@ def replace(self, to_replace, value, inplace=False, regex=False, convert=True): if not np.can_cast(to_replace_values, bool): return self return super().replace( - to_replace, value, inplace=inplace, regex=regex, convert=convert, + to_replace, value, inplace=inplace, regex=regex, convert=convert ) @@ -2453,18 +2453,18 @@ def replace(self, to_replace, value, inplace=False, regex=False, convert=True): if not either_list and is_re(to_replace): return self._replace_single( - to_replace, value, inplace=inplace, regex=True, convert=convert, + to_replace, value, inplace=inplace, regex=True, convert=convert ) elif not (either_list or regex): return super().replace( - to_replace, value, inplace=inplace, regex=regex, convert=convert, + to_replace, value, 
inplace=inplace, regex=regex, convert=convert ) elif both_lists: for to_rep, v in zip(to_replace, value): result_blocks = [] for b in blocks: result = b._replace_single( - to_rep, v, inplace=inplace, regex=regex, convert=convert, + to_rep, v, inplace=inplace, regex=regex, convert=convert ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks @@ -2475,18 +2475,18 @@ def replace(self, to_replace, value, inplace=False, regex=False, convert=True): result_blocks = [] for b in blocks: result = b._replace_single( - to_rep, value, inplace=inplace, regex=regex, convert=convert, + to_rep, value, inplace=inplace, regex=regex, convert=convert ) result_blocks = _extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks return self._replace_single( - to_replace, value, inplace=inplace, convert=convert, regex=regex, + to_replace, value, inplace=inplace, convert=convert, regex=regex ) def _replace_single( - self, to_replace, value, inplace=False, regex=False, convert=True, mask=None, + self, to_replace, value, inplace=False, regex=False, convert=True, mask=None ): """ Replace elements by the given value. 
From 3b7bf1fc3adb60b8d16d44d9ac3fd2b0f8e42ee8 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 28 Aug 2020 18:00:57 +0100 Subject: [PATCH 0575/1025] TYP: misc cleanup in core\groupby\generic.py (#35955) --- pandas/core/groupby/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 82e629d184b19..3172fb4e0e853 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -9,7 +9,6 @@ import copy from functools import partial from textwrap import dedent -import typing from typing import ( TYPE_CHECKING, Any, @@ -22,6 +21,7 @@ Optional, Sequence, Type, + TypeVar, Union, ) import warnings @@ -92,7 +92,7 @@ # TODO: validate types on ScalarResult and move to _typing # Blocked from using by https://github.com/python/mypy/issues/1484 # See note at _mangle_lambda_list -ScalarResult = typing.TypeVar("ScalarResult") +ScalarResult = TypeVar("ScalarResult") def generate_property(name: str, klass: Type[FrameOrSeries]): @@ -606,8 +606,8 @@ def filter(self, func, dropna=True, *args, **kwargs): wrapper = lambda x: func(x, *args, **kwargs) # Interpret np.nan as False. 
- def true_and_notna(x, *args, **kwargs) -> bool: - b = wrapper(x, *args, **kwargs) + def true_and_notna(x) -> bool: + b = wrapper(x) return b and notna(b) try: @@ -1210,7 +1210,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object + dtype_if_empty=object, **kwargs ) values = [x if (x is not None) else backup for x in values] From 68dde1931612cc39f427430e31b6dd58b3a167b4 Mon Sep 17 00:00:00 2001 From: Johnny Pribyl Date: Fri, 28 Aug 2020 12:06:08 -0600 Subject: [PATCH 0576/1025] Issue35925 remove trailing commas (#35956) --- pandas/core/internals/concat.py | 6 +++--- pandas/core/internals/managers.py | 8 +++----- pandas/core/internals/ops.py | 2 +- pandas/core/nanops.py | 2 +- pandas/core/ops/docstrings.py | 2 +- pandas/core/reshape/concat.py | 2 +- pandas/core/reshape/pivot.py | 4 ++-- pandas/core/reshape/reshape.py | 8 +++----- 8 files changed, 15 insertions(+), 19 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 2c0d4931a7bf2..99a586f056b12 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -29,7 +29,7 @@ def concatenate_block_managers( - mgrs_indexers, axes, concat_axis: int, copy: bool, + mgrs_indexers, axes, concat_axis: int, copy: bool ) -> BlockManager: """ Concatenate block managers into one. 
@@ -76,7 +76,7 @@ def concatenate_block_managers( b = make_block(values, placement=placement, ndim=blk.ndim) else: b = make_block( - _concatenate_join_units(join_units, concat_axis, copy=copy,), + _concatenate_join_units(join_units, concat_axis, copy=copy), placement=placement, ) blocks.append(b) @@ -339,7 +339,7 @@ def _concatenate_join_units(join_units, concat_axis, copy): # 2D to put it a non-EA Block concat_values = np.atleast_2d(concat_values) else: - concat_values = concat_compat(to_concat, axis=concat_axis,) + concat_values = concat_compat(to_concat, axis=concat_axis) return concat_values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a5372b14d210f..67ff3b9456ccf 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -491,7 +491,7 @@ def get_axe(block, qs, axes): values = values.take(indexer) return SingleBlockManager( - make_block(values, ndim=1, placement=np.arange(len(values))), axes[0], + make_block(values, ndim=1, placement=np.arange(len(values))), axes[0] ) def isna(self, func) -> "BlockManager": @@ -519,9 +519,7 @@ def where( def setitem(self, indexer, value) -> "BlockManager": return self.apply("setitem", indexer=indexer, value=value) - def putmask( - self, mask, new, align: bool = True, axis: int = 0, - ): + def putmask(self, mask, new, align: bool = True, axis: int = 0): transpose = self.ndim == 2 if align: @@ -1923,7 +1921,7 @@ def _compare_or_regex_search( """ def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern], + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] ): """ Raises an error if the two arrays (a,b) cannot be compared. 
diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index ae4892c720d5b..05f5f9a00ae1b 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -11,7 +11,7 @@ BlockPairInfo = namedtuple( - "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"], + "BlockPairInfo", ["lvals", "rvals", "locs", "left_ea", "right_ea", "rblk"] ) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e7e28798d84a2..e3f16a3ef4f90 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1329,7 +1329,7 @@ def _zero_out_fperr(arg): @disallow("M8", "m8") def nancorr( - a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None, + a: np.ndarray, b: np.ndarray, method="pearson", min_periods: Optional[int] = None ): """ a, b: ndarrays diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 4ace873f029ae..99c2fefc97ae7 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -31,7 +31,7 @@ def _make_flex_doc(op_name, typ): base_doc = _flex_doc_SERIES if op_desc["reverse"]: base_doc += _see_also_reverse_SERIES.format( - reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"], + reverse=op_desc["reverse"], see_also_desc=op_desc["see_also_desc"] ) doc_no_examples = base_doc.format( desc=op_desc["desc"], diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 9e8fb643791f2..299b68c6e71e0 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -500,7 +500,7 @@ def get_result(self): mgrs_indexers.append((obj._mgr, indexers)) new_data = concatenate_block_managers( - mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy, + mgrs_indexers, self.new_axes, concat_axis=self.bm_axis, copy=self.copy ) if not self.copy: new_data._consolidate_inplace() diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 64a9e2dbf6d99..969ac56e41860 100644 --- 
a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -239,7 +239,7 @@ def _add_margins( elif values: marginal_result_set = _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name, + table, data, values, rows, cols, aggfunc, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set @@ -308,7 +308,7 @@ def _compute_grand_margin(data, values, aggfunc, margins_name: str = "All"): def _generate_marginal_results( - table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All", + table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All" ): if len(cols) > 0: # need to "interleave" the margins diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 391313fbb5283..e81dd8f0c735c 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -81,9 +81,7 @@ class _Unstacker: unstacked : DataFrame """ - def __init__( - self, index: MultiIndex, level=-1, constructor=None, - ): + def __init__(self, index: MultiIndex, level=-1, constructor=None): if constructor is None: constructor = DataFrame @@ -422,7 +420,7 @@ def unstack(obj, level, fill_value=None): if is_extension_array_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker( - obj.index, level=level, constructor=obj._constructor_expanddim, + obj.index, level=level, constructor=obj._constructor_expanddim ) return unstacker.get_result( obj.values, value_columns=None, fill_value=fill_value @@ -436,7 +434,7 @@ def _unstack_frame(obj, level, fill_value=None): return obj._constructor(mgr) else: return _Unstacker( - obj.index, level=level, constructor=obj._constructor, + obj.index, level=level, constructor=obj._constructor ).get_result(obj._values, value_columns=obj.columns, fill_value=fill_value) From 454eb8d820856167f7695c7d4f3952e3a5a01c44 Mon Sep 17 00:00:00 2001 From: Johnny Pribyl Date: Fri, 28 Aug 2020 
12:39:48 -0600 Subject: [PATCH 0577/1025] Issue35925 remove more trailing commas (#35959) * pandas/core/series.py * pandas/core/window/ewm.py * pandas/core/window/indexers.py * pandas/core/window/numba_.py * pandas/core/window/rolling.py * pandas/io/formats/css.py * pandas/io/formats/format.py * pandas/io/orc.py --- pandas/core/series.py | 4 ++-- pandas/core/window/ewm.py | 4 ++-- pandas/core/window/indexers.py | 6 +++--- pandas/core/window/numba_.py | 2 +- pandas/core/window/rolling.py | 2 +- pandas/io/formats/css.py | 9 ++------- pandas/io/formats/format.py | 6 +++--- pandas/io/orc.py | 2 +- 8 files changed, 15 insertions(+), 20 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 555024ad75f5e..dbc105be3c62b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -962,12 +962,12 @@ def _get_values_tuple(self, key): # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) return self._constructor(self._values[indexer], index=new_index).__finalize__( - self, + self ) def _get_values(self, indexer): try: - return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self,) + return self._constructor(self._mgr.get_slice(indexer)).__finalize__(self) except ValueError: # mpl compat if we look up e.g. 
ser[:, np.newaxis]; # see tests.series.timeseries.test_mpl_compat_hack diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index c57c434dd3040..1913b51a68c15 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -362,7 +362,7 @@ def var(self, bias: bool = False, *args, **kwargs): def f(arg): return window_aggregations.ewmcov( - arg, arg, self.com, self.adjust, self.ignore_na, self.min_periods, bias, + arg, arg, self.com, self.adjust, self.ignore_na, self.min_periods, bias ) return self._apply(f) @@ -458,7 +458,7 @@ def _get_corr(X, Y): def _cov(x, y): return window_aggregations.ewmcov( - x, y, self.com, self.adjust, self.ignore_na, self.min_periods, 1, + x, y, self.com, self.adjust, self.ignore_na, self.min_periods, 1 ) x_values = X._prep_values() diff --git a/pandas/core/window/indexers.py b/pandas/core/window/indexers.py index 7c76a8e2a0b22..a21521f4ce8bb 100644 --- a/pandas/core/window/indexers.py +++ b/pandas/core/window/indexers.py @@ -40,7 +40,7 @@ class BaseIndexer: """Base class for window bounds calculations.""" def __init__( - self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs, + self, index_array: Optional[np.ndarray] = None, window_size: int = 0, **kwargs ): """ Parameters @@ -105,7 +105,7 @@ def get_window_bounds( ) -> Tuple[np.ndarray, np.ndarray]: return calculate_variable_window_bounds( - num_values, self.window_size, min_periods, center, closed, self.index_array, + num_values, self.window_size, min_periods, center, closed, self.index_array ) @@ -316,7 +316,7 @@ def get_window_bounds( # Cannot use groupby_indicies as they might not be monotonic with the object # we're rolling over window_indicies = np.arange( - window_indicies_start, window_indicies_start + len(indices), + window_indicies_start, window_indicies_start + len(indices) ) window_indicies_start += len(indices) # Extend as we'll be slicing window like [start, end) diff --git a/pandas/core/window/numba_.py 
b/pandas/core/window/numba_.py index 5d35ec7457ab0..aec294c3c84c2 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -57,7 +57,7 @@ def generate_numba_apply_func( @numba.jit(nopython=nopython, nogil=nogil, parallel=parallel) def roll_apply( - values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int, + values: np.ndarray, begin: np.ndarray, end: np.ndarray, minimum_periods: int ) -> np.ndarray: result = np.empty(len(begin)) for i in loop_range(len(result)): diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index baabdf0fca29a..39fcfcbe2bff6 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2117,7 +2117,7 @@ def count(self): @Substitution(name="rolling") @Appender(_shared_docs["apply"]) def apply( - self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None, + self, func, raw=False, engine=None, engine_kwargs=None, args=None, kwargs=None ): return super().apply( func, diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index b40d2a57b8106..4d6f03489725f 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -20,9 +20,7 @@ def expand(self, prop, value: str): try: mapping = self.SIDE_SHORTHANDS[len(tokens)] except KeyError: - warnings.warn( - f'Could not expand "{prop}: {value}"', CSSWarning, - ) + warnings.warn(f'Could not expand "{prop}: {value}"', CSSWarning) return for key, idx in zip(self.SIDES, mapping): yield prop_fmt.format(key), tokens[idx] @@ -117,10 +115,7 @@ def __call__(self, declarations_str, inherited=None): props[prop] = self.size_to_pt( props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS ) - for prop in [ - f"margin-{side}", - f"padding-{side}", - ]: + for prop in [f"margin-{side}", f"padding-{side}"]: if prop in props: # TODO: support % props[prop] = self.size_to_pt( diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 9546f674aa124..1616c5345a899 100644 --- 
a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -80,7 +80,7 @@ FloatFormatType = Union[str, Callable, "EngFormatter"] ColspaceType = Mapping[Label, Union[str, int]] ColspaceArgType = Union[ - str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]], + str, int, Sequence[Union[str, int]], Mapping[Label, Union[str, int]] ] common_docstring = """ @@ -741,7 +741,7 @@ def _to_str_columns(self) -> List[List[str]]: for i, c in enumerate(frame): fmt_values = self._format_col(i) fmt_values = _make_fixed_width( - fmt_values, self.justify, minimum=col_space.get(c, 0), adj=self.adj, + fmt_values, self.justify, minimum=col_space.get(c, 0), adj=self.adj ) stringified.append(fmt_values) else: @@ -1069,7 +1069,7 @@ def _get_formatted_index(self, frame: "DataFrame") -> List[str]: fmt_index = [ tuple( _make_fixed_width( - list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj, + list(x), justify="left", minimum=col_space.get("", 0), adj=self.adj ) ) for x in fmt_index diff --git a/pandas/io/orc.py b/pandas/io/orc.py index ea79efd0579e5..b556732e4d116 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -12,7 +12,7 @@ def read_orc( - path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs, + path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs ) -> "DataFrame": """ Load an ORC object from the file path, returning a DataFrame. 
From 27da03c10b59e6d99f4819228e82bbe48e542096 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 29 Aug 2020 12:39:35 +0100 Subject: [PATCH 0578/1025] TYP: misc typing cleanups for #32911 (#35954) --- pandas/io/excel/_odswriter.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 0131240f99cf6..72f3d81b1c662 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -42,7 +42,7 @@ def write_cells( sheet_name: Optional[str] = None, startrow: int = 0, startcol: int = 0, - freeze_panes: Optional[List] = None, + freeze_panes: Optional[Tuple[int, int]] = None, ) -> None: """ Write the frame cells using odf @@ -215,14 +215,17 @@ def _process_style(self, style: Dict[str, Any]) -> str: self.book.styles.addElement(odf_style) return name - def _create_freeze_panes(self, sheet_name: str, freeze_panes: List[int]) -> None: - """Create freeze panes in the sheet + def _create_freeze_panes( + self, sheet_name: str, freeze_panes: Tuple[int, int] + ) -> None: + """ + Create freeze panes in the sheet. 
Parameters ---------- sheet_name : str Name of the spreadsheet - freeze_panes : list + freeze_panes : tuple of (int, int) Freeze pane location x and y """ from odf.config import ( From c3fc406815d823eefb69836cb06fa2673d1f4d94 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 30 Aug 2020 00:57:01 +0100 Subject: [PATCH 0579/1025] TYP: misc cleanup in core\generic.py (#35963) --- pandas/core/generic.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fea3efedb6abb..dd7b02d98ad42 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -387,7 +387,7 @@ def _get_block_manager_axis(cls, axis: Axis) -> int: return m - axis return axis - def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: + def _get_axis_resolvers(self, axis: str) -> Dict[str, Union["Series", MultiIndex]]: # index or columns axis_index = getattr(self, axis) d = dict() @@ -417,10 +417,10 @@ def _get_axis_resolvers(self, axis: str) -> Dict[str, ABCSeries]: d[axis] = dindex return d - def _get_index_resolvers(self) -> Dict[str, ABCSeries]: + def _get_index_resolvers(self) -> Dict[str, Union["Series", MultiIndex]]: from pandas.core.computation.parsing import clean_column_name - d: Dict[str, ABCSeries] = {} + d: Dict[str, Union["Series", MultiIndex]] = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) @@ -4703,14 +4703,15 @@ def filter( return self.reindex(**{name: [r for r in items if r in labels]}) elif like: - def f(x): + def f(x) -> bool: + assert like is not None # needed for mypy return like in ensure_str(x) values = labels.map(f) return self.loc(axis=axis)[values] elif regex: - def f(x): + def f(x) -> bool: return matcher.search(ensure_str(x)) is not None matcher = re.compile(regex) @@ -6556,7 +6557,10 @@ def replace( regex = True items = list(to_replace.items()) - keys, values = zip(*items) if items else ([], []) + if items: + keys, values = 
zip(*items) + else: + keys, values = ([], []) are_mappings = [is_dict_like(v) for v in values] From ce825b4102fab0e47c73ea9f2cad044cea853458 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 30 Aug 2020 04:59:00 -0700 Subject: [PATCH 0580/1025] TYP: annotate plotting based on _get_axe_freq (#35960) --- pandas/plotting/_matplotlib/core.py | 7 +++++-- pandas/plotting/_matplotlib/timeseries.py | 21 +++++++++++---------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index b490e07e43753..4d23a5e5fc249 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,5 +1,5 @@ import re -from typing import List, Optional +from typing import TYPE_CHECKING, List, Optional import warnings from matplotlib.artist import Artist @@ -43,6 +43,9 @@ table, ) +if TYPE_CHECKING: + from matplotlib.axes import Axes + class MPLPlot: """ @@ -1147,7 +1150,7 @@ def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): return lines @classmethod - def _ts_plot(cls, ax, x, data, style=None, **kwds): + def _ts_plot(cls, ax: "Axes", x, data, style=None, **kwds): from pandas.plotting._matplotlib.timeseries import ( _decorate_axes, _maybe_resample, diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index 193602e1baf4a..fd89a093d25a4 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -24,14 +24,15 @@ from pandas.tseries.frequencies import get_period_alias, is_subperiod, is_superperiod if TYPE_CHECKING: - from pandas import Index, Series # noqa:F401 + from matplotlib.axes import Axes + from pandas import Index, Series # noqa:F401 # --------------------------------------------------------------------- # Plotting functions and monkey patches -def _maybe_resample(series: "Series", ax, kwargs): +def _maybe_resample(series: "Series", ax: "Axes", 
kwargs): # resample against axes freq if necessary freq, ax_freq = _get_freq(ax, series) @@ -74,7 +75,7 @@ def _is_sup(f1: str, f2: str) -> bool: ) -def _upsample_others(ax, freq, kwargs): +def _upsample_others(ax: "Axes", freq, kwargs): legend = ax.get_legend() lines, labels = _replot_ax(ax, freq, kwargs) _replot_ax(ax, freq, kwargs) @@ -97,7 +98,7 @@ def _upsample_others(ax, freq, kwargs): ax.legend(lines, labels, loc="best", title=title) -def _replot_ax(ax, freq, kwargs): +def _replot_ax(ax: "Axes", freq, kwargs): data = getattr(ax, "_plot_data", None) # clear current axes and data @@ -127,7 +128,7 @@ def _replot_ax(ax, freq, kwargs): return lines, labels -def _decorate_axes(ax, freq, kwargs): +def _decorate_axes(ax: "Axes", freq, kwargs): """Initialize axes for time-series plotting""" if not hasattr(ax, "_plot_data"): ax._plot_data = [] @@ -143,7 +144,7 @@ def _decorate_axes(ax, freq, kwargs): ax.date_axis_info = None -def _get_ax_freq(ax): +def _get_ax_freq(ax: "Axes"): """ Get the freq attribute of the ax object if set. 
Also checks shared axes (eg when using secondary yaxis, sharex=True @@ -174,7 +175,7 @@ def _get_period_alias(freq) -> Optional[str]: return freq -def _get_freq(ax, series: "Series"): +def _get_freq(ax: "Axes", series: "Series"): # get frequency from data freq = getattr(series.index, "freq", None) if freq is None: @@ -192,7 +193,7 @@ def _get_freq(ax, series: "Series"): return freq, ax_freq -def _use_dynamic_x(ax, data: "FrameOrSeriesUnion") -> bool: +def _use_dynamic_x(ax: "Axes", data: FrameOrSeriesUnion) -> bool: freq = _get_index_freq(data.index) ax_freq = _get_ax_freq(ax) @@ -234,7 +235,7 @@ def _get_index_freq(index: "Index") -> Optional[BaseOffset]: return freq -def _maybe_convert_index(ax, data): +def _maybe_convert_index(ax: "Axes", data): # tsplot converts automatically, but don't want to convert index # over and over for DataFrames if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)): @@ -264,7 +265,7 @@ def _maybe_convert_index(ax, data): # Do we need the rest for convenience? 
-def _format_coord(freq, t, y): +def _format_coord(freq, t, y) -> str: time_period = Period(ordinal=int(t), freq=freq) return f"t = {time_period} y = {y:8f}" From bb7691ed5a68d70528e86ffd42ab98df03786803 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 30 Aug 2020 13:00:24 +0100 Subject: [PATCH 0581/1025] TYP: Remove NDFrame._add_series_or_dataframe_operations (#35957) --- pandas/core/frame.py | 1 - pandas/core/generic.py | 295 +++++++-------------------- pandas/core/series.py | 1 - pandas/core/shared_docs.py | 363 +++++++++++++++++++++++----------- pandas/core/window/common.py | 2 +- pandas/core/window/rolling.py | 4 +- 6 files changed, 323 insertions(+), 343 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 606bd4cc3b52d..95bd757f1994e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9306,7 +9306,6 @@ def _AXIS_NAMES(self) -> Dict[int, str]: DataFrame._add_numeric_operations() -DataFrame._add_series_or_dataframe_operations() ops.add_flex_arithmetic_methods(DataFrame) ops.add_special_arithmetic_methods(DataFrame) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index dd7b02d98ad42..3bad2d6dd18b9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6,7 +6,6 @@ import operator import pickle import re -from textwrap import dedent from typing import ( TYPE_CHECKING, Any, @@ -101,17 +100,22 @@ from pandas.core.missing import find_valid_index from pandas.core.ops import _align_method_FRAME from pandas.core.shared_docs import _shared_docs +from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt from pandas.io.formats.format import DataFrameFormatter, format_percentiles from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: + from pandas._libs.tslibs import BaseOffset + from pandas.core.resample import Resampler from pandas.core.series import Series # noqa: F401 + from pandas.core.window.indexers import 
BaseIndexer # goal is to be able to define the docs close to function, while still being # able to share +_shared_docs = {**_shared_docs} _shared_doc_kwargs = dict( axes="keywords for axes", klass="Series/DataFrame", @@ -5128,51 +5132,6 @@ def pipe(self, func, *args, **kwargs): """ return com.pipe(self, func, *args, **kwargs) - _shared_docs["aggregate"] = dedent( - """ - Aggregate using one or more operations over the specified axis. - {versionadded} - Parameters - ---------- - func : function, str, list or dict - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of axis labels -> functions, function names or list of such. - {axis} - *args - Positional arguments to pass to `func`. - **kwargs - Keyword arguments to pass to `func`. - - Returns - ------- - scalar, Series or DataFrame - - The return can be: - - * scalar : when Series.agg is called with single function - * Series : when DataFrame.agg is called with a single function - * DataFrame : when DataFrame.agg is called with several functions - - Return scalar, Series or DataFrame. - {see_also} - Notes - ----- - `agg` is an alias for `aggregate`. Use the alias. - - In pandas, agg, as most operations just ignores the missing values, - and returns the operation only considering the values that are present. - - A passed user-defined-function will be passed a Series for evaluation. - {examples}""" - ) - # ---------------------------------------------------------------------- # Attribute access @@ -7452,77 +7411,6 @@ def clip( return result - _shared_docs[ - "groupby" - ] = """ - Group %(klass)s using a mapper or by a Series of columns. - - A groupby operation involves some combination of splitting the - object, applying a function, and combining the results. 
This can be - used to group large amounts of data and compute operations on these - groups. - - Parameters - ---------- - by : mapping, function, label, or list of labels - Used to determine the groups for the groupby. - If ``by`` is a function, it's called on each value of the object's - index. If a dict or Series is passed, the Series or dict VALUES - will be used to determine the groups (the Series' values are first - aligned; see ``.align()`` method). If an ndarray is passed, the - values are used as-is determine the groups. A label or list of - labels may be passed to group by the columns in ``self``. Notice - that a tuple is interpreted as a (single) key. - axis : {0 or 'index', 1 or 'columns'}, default 0 - Split along rows (0) or columns (1). - level : int, level name, or sequence of such, default None - If the axis is a MultiIndex (hierarchical), group by a particular - level or levels. - as_index : bool, default True - For aggregated output, return object with group labels as the - index. Only relevant for DataFrame input. as_index=False is - effectively "SQL-style" grouped output. - sort : bool, default True - Sort group keys. Get better performance by turning this off. - Note this does not influence the order of observations within each - group. Groupby preserves the order of rows within each group. - group_keys : bool, default True - When calling apply, add group keys to index to identify pieces. - squeeze : bool, default False - Reduce the dimensionality of the return type if possible, - otherwise return a consistent type. - - .. deprecated:: 1.1.0 - - observed : bool, default False - This only applies if any of the groupers are Categoricals. - If True: only show observed values for categorical groupers. - If False: show all values for categorical groupers. - - .. versionadded:: 0.23.0 - dropna : bool, default True - If True, and if group keys contain NA values, NA values together - with row/column will be dropped. 
- If False, NA values will also be treated as the key in groups - - .. versionadded:: 1.1.0 - - Returns - ------- - %(klass)sGroupBy - Returns a groupby object that contains information about the groups. - - See Also - -------- - resample : Convenience method for frequency conversion and resampling - of time series. - - Notes - ----- - See the `user guide - `_ for more. - """ - def asfreq( self: FrameOrSeries, freq, @@ -8431,35 +8319,6 @@ def ranker(data): return ranker(data) - _shared_docs[ - "compare" - ] = """ - Compare to another %(klass)s and show the differences. - - .. versionadded:: 1.1.0 - - Parameters - ---------- - other : %(klass)s - Object to compare with. - - align_axis : {0 or 'index', 1 or 'columns'}, default 1 - Determine which axis to align the comparison on. - - * 0, or 'index' : Resulting differences are stacked vertically - with rows drawn alternately from self and other. - * 1, or 'columns' : Resulting differences are aligned horizontally - with columns drawn alternately from self and other. - - keep_shape : bool, default False - If true, all rows and columns are kept. - Otherwise, only the ones with different values are kept. - - keep_equal : bool, default False - If true, the result keeps values that are equal. - Otherwise, equal values are shown as NaNs. - """ - @Appender(_shared_docs["compare"] % _shared_doc_kwargs) def compare( self, @@ -10589,45 +10448,21 @@ def mad(self, axis=None, skipna=None, level=None): examples=_min_examples, ) - @classmethod - def _add_series_or_dataframe_operations(cls): - """ - Add the series or dataframe only operations to the cls; evaluate - the doc strings again. 
- """ - from pandas.core.window import ( - Expanding, - ExponentialMovingWindow, - Rolling, - Window, - ) - - @doc(Rolling) - def rolling( - self, - window, - min_periods=None, - center=False, - win_type=None, - on=None, - axis=0, - closed=None, - ): - axis = self._get_axis_number(axis) - - if win_type is not None: - return Window( - self, - window=window, - min_periods=min_periods, - center=center, - win_type=win_type, - on=on, - axis=axis, - closed=closed, - ) + @doc(Rolling) + def rolling( + self, + window: "Union[int, timedelta, BaseOffset, BaseIndexer]", + min_periods: Optional[int] = None, + center: bool_t = False, + win_type: Optional[str] = None, + on: Optional[str] = None, + axis: Axis = 0, + closed: Optional[str] = None, + ): + axis = self._get_axis_number(axis) - return Rolling( + if win_type is not None: + return Window( self, window=window, min_periods=min_periods, @@ -10638,53 +10473,59 @@ def rolling( closed=closed, ) - cls.rolling = rolling - - @doc(Expanding) - def expanding(self, min_periods=1, center=None, axis=0): - axis = self._get_axis_number(axis) - if center is not None: - warnings.warn( - "The `center` argument on `expanding` " - "will be removed in the future", - FutureWarning, - stacklevel=2, - ) - else: - center = False + return Rolling( + self, + window=window, + min_periods=min_periods, + center=center, + win_type=win_type, + on=on, + axis=axis, + closed=closed, + ) - return Expanding(self, min_periods=min_periods, center=center, axis=axis) + @doc(Expanding) + def expanding( + self, min_periods: int = 1, center: Optional[bool_t] = None, axis: Axis = 0 + ) -> Expanding: + axis = self._get_axis_number(axis) + if center is not None: + warnings.warn( + "The `center` argument on `expanding` will be removed in the future", + FutureWarning, + stacklevel=2, + ) + else: + center = False - cls.expanding = expanding + return Expanding(self, min_periods=min_periods, center=center, axis=axis) - @doc(ExponentialMovingWindow) - def ewm( + 
@doc(ExponentialMovingWindow) + def ewm( + self, + com: Optional[float] = None, + span: Optional[float] = None, + halflife: Optional[Union[float, TimedeltaConvertibleTypes]] = None, + alpha: Optional[float] = None, + min_periods: int = 0, + adjust: bool_t = True, + ignore_na: bool_t = False, + axis: Axis = 0, + times: Optional[Union[str, np.ndarray, FrameOrSeries]] = None, + ) -> ExponentialMovingWindow: + axis = self._get_axis_number(axis) + return ExponentialMovingWindow( self, - com=None, - span=None, - halflife=None, - alpha=None, - min_periods=0, - adjust=True, - ignore_na=False, - axis=0, - times=None, - ): - axis = self._get_axis_number(axis) - return ExponentialMovingWindow( - self, - com=com, - span=span, - halflife=halflife, - alpha=alpha, - min_periods=min_periods, - adjust=adjust, - ignore_na=ignore_na, - axis=axis, - times=times, - ) - - cls.ewm = ewm + com=com, + span=span, + halflife=halflife, + alpha=alpha, + min_periods=min_periods, + adjust=adjust, + ignore_na=ignore_na, + axis=axis, + times=times, + ) @doc(klass=_shared_doc_kwargs["klass"], axis="") def transform(self, func, *args, **kwargs): diff --git a/pandas/core/series.py b/pandas/core/series.py index dbc105be3c62b..a8a2d300fa168 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -5000,7 +5000,6 @@ def to_period(self, freq=None, copy=True) -> "Series": Series._add_numeric_operations() -Series._add_series_or_dataframe_operations() # Add arithmetic! ops.add_flex_arithmetic_methods(Series) diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index b81942f062b19..0aaccb47efc44 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -2,117 +2,258 @@ _shared_docs: Dict[str, str] = dict() +_shared_docs[ + "aggregate" +] = """\ +Aggregate using one or more operations over the specified axis. +{versionadded} +Parameters +---------- +func : function, str, list or dict + Function to use for aggregating the data. 
If a function, must either + work when passed a {klass} or when passed to {klass}.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of axis labels -> functions, function names or list of such. +{axis} +*args + Positional arguments to pass to `func`. +**kwargs + Keyword arguments to pass to `func`. + +Returns +------- +scalar, Series or DataFrame + + The return can be: + + * scalar : when Series.agg is called with single function + * Series : when DataFrame.agg is called with a single function + * DataFrame : when DataFrame.agg is called with several functions + + Return scalar, Series or DataFrame. +{see_also} +Notes +----- +`agg` is an alias for `aggregate`. Use the alias. + +A passed user-defined-function will be passed a Series for evaluation. +{examples}""" + +_shared_docs[ + "compare" +] = """\ +Compare to another %(klass)s and show the differences. + +.. versionadded:: 1.1.0 + +Parameters +---------- +other : %(klass)s + Object to compare with. + +align_axis : {0 or 'index', 1 or 'columns'}, default 1 + Determine which axis to align the comparison on. + + * 0, or 'index' : Resulting differences are stacked vertically + with rows drawn alternately from self and other. + * 1, or 'columns' : Resulting differences are aligned horizontally + with columns drawn alternately from self and other. + +keep_shape : bool, default False + If true, all rows and columns are kept. + Otherwise, only the ones with different values are kept. + +keep_equal : bool, default False + If true, the result keeps values that are equal. + Otherwise, equal values are shown as NaNs. +""" + +_shared_docs[ + "groupby" +] = """\ +Group %(klass)s using a mapper or by a Series of columns. + +A groupby operation involves some combination of splitting the +object, applying a function, and combining the results. 
This can be +used to group large amounts of data and compute operations on these +groups. + +Parameters +---------- +by : mapping, function, label, or list of labels + Used to determine the groups for the groupby. + If ``by`` is a function, it's called on each value of the object's + index. If a dict or Series is passed, the Series or dict VALUES + will be used to determine the groups (the Series' values are first + aligned; see ``.align()`` method). If an ndarray is passed, the + values are used as-is determine the groups. A label or list of + labels may be passed to group by the columns in ``self``. Notice + that a tuple is interpreted as a (single) key. +axis : {0 or 'index', 1 or 'columns'}, default 0 + Split along rows (0) or columns (1). +level : int, level name, or sequence of such, default None + If the axis is a MultiIndex (hierarchical), group by a particular + level or levels. +as_index : bool, default True + For aggregated output, return object with group labels as the + index. Only relevant for DataFrame input. as_index=False is + effectively "SQL-style" grouped output. +sort : bool, default True + Sort group keys. Get better performance by turning this off. + Note this does not influence the order of observations within each + group. Groupby preserves the order of rows within each group. +group_keys : bool, default True + When calling apply, add group keys to index to identify pieces. +squeeze : bool, default False + Reduce the dimensionality of the return type if possible, + otherwise return a consistent type. + + .. deprecated:: 1.1.0 + +observed : bool, default False + This only applies if any of the groupers are Categoricals. + If True: only show observed values for categorical groupers. + If False: show all values for categorical groupers. + + .. versionadded:: 0.23.0 +dropna : bool, default True + If True, and if group keys contain NA values, NA values together + with row/column will be dropped. 
+ If False, NA values will also be treated as the key in groups + + .. versionadded:: 1.1.0 + +Returns +------- +%(klass)sGroupBy + Returns a groupby object that contains information about the groups. + +See Also +-------- +resample : Convenience method for frequency conversion and resampling + of time series. + +Notes +----- +See the `user guide +`_ for more. +""" _shared_docs[ "melt" -] = """ - Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. - - This function is useful to massage a DataFrame into a format where one - or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to - the row axis, leaving just two non-identifier columns, 'variable' and - 'value'. - %(versionadded)s - Parameters - ---------- - id_vars : tuple, list, or ndarray, optional - Column(s) to use as identifier variables. - value_vars : tuple, list, or ndarray, optional - Column(s) to unpivot. If not specified, uses all columns that - are not set as `id_vars`. - var_name : scalar - Name to use for the 'variable' column. If None it uses - ``frame.columns.name`` or 'variable'. - value_name : scalar, default 'value' - Name to use for the 'value' column. - col_level : int or str, optional - If columns are a MultiIndex then use this level to melt. - ignore_index : bool, default True - If True, original index is ignored. If False, the original index is retained. - Index labels will be repeated as necessary. - - .. versionadded:: 1.1.0 - - Returns - ------- - DataFrame - Unpivoted DataFrame. - - See Also - -------- - %(other)s : Identical method. - pivot_table : Create a spreadsheet-style pivot table as a DataFrame. - DataFrame.pivot : Return reshaped DataFrame organized - by given index / column values. - DataFrame.explode : Explode a DataFrame from list-like - columns to long format. - - Examples - -------- - >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, - ... 
'B': {0: 1, 1: 3, 2: 5}, - ... 'C': {0: 2, 1: 4, 2: 6}}) - >>> df - A B C - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> %(caller)sid_vars=['A'], value_vars=['B']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 3 a C 2 - 4 b C 4 - 5 c C 6 - - The names of 'variable' and 'value' columns can be customized: - - >>> %(caller)sid_vars=['A'], value_vars=['B'], - ... var_name='myVarname', value_name='myValname') - A myVarname myValname - 0 a B 1 - 1 b B 3 - 2 c B 5 - - Original index values can be kept around: - - >>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - 0 a C 2 - 1 b C 4 - 2 c C 6 - - If you have multi-index columns: - - >>> df.columns = [list('ABC'), list('DEF')] - >>> df - A B C - D E F - 0 a 1 2 - 1 b 3 4 - 2 c 5 6 - - >>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) - A variable value - 0 a B 1 - 1 b B 3 - 2 c B 5 - - >>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) - (A, D) variable_0 variable_1 value - 0 a B E 1 - 1 b B E 3 - 2 c B E 5 - """ +] = """\ +Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. + +This function is useful to massage a DataFrame into a format where one +or more columns are identifier variables (`id_vars`), while all other +columns, considered measured variables (`value_vars`), are "unpivoted" to +the row axis, leaving just two non-identifier columns, 'variable' and +'value'. +%(versionadded)s +Parameters +---------- +id_vars : tuple, list, or ndarray, optional + Column(s) to use as identifier variables. +value_vars : tuple, list, or ndarray, optional + Column(s) to unpivot. If not specified, uses all columns that + are not set as `id_vars`. +var_name : scalar + Name to use for the 'variable' column. If None it uses + ``frame.columns.name`` or 'variable'. 
+value_name : scalar, default 'value' + Name to use for the 'value' column. +col_level : int or str, optional + If columns are a MultiIndex then use this level to melt. +ignore_index : bool, default True + If True, original index is ignored. If False, the original index is retained. + Index labels will be repeated as necessary. + + .. versionadded:: 1.1.0 + +Returns +------- +DataFrame + Unpivoted DataFrame. + +See Also +-------- +%(other)s : Identical method. +pivot_table : Create a spreadsheet-style pivot table as a DataFrame. +DataFrame.pivot : Return reshaped DataFrame organized + by given index / column values. +DataFrame.explode : Explode a DataFrame from list-like + columns to long format. + +Examples +-------- +>>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, +... 'B': {0: 1, 1: 3, 2: 5}, +... 'C': {0: 2, 1: 4, 2: 6}}) +>>> df + A B C +0 a 1 2 +1 b 3 4 +2 c 5 6 + +>>> %(caller)sid_vars=['A'], value_vars=['B']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 + +>>> %(caller)sid_vars=['A'], value_vars=['B', 'C']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 +3 a C 2 +4 b C 4 +5 c C 6 + +The names of 'variable' and 'value' columns can be customized: + +>>> %(caller)sid_vars=['A'], value_vars=['B'], +... 
var_name='myVarname', value_name='myValname') + A myVarname myValname +0 a B 1 +1 b B 3 +2 c B 5 + +Original index values can be kept around: + +>>> %(caller)sid_vars=['A'], value_vars=['B', 'C'], ignore_index=False) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 +0 a C 2 +1 b C 4 +2 c C 6 + +If you have multi-index columns: + +>>> df.columns = [list('ABC'), list('DEF')] +>>> df + A B C + D E F +0 a 1 2 +1 b 3 4 +2 c 5 6 + +>>> %(caller)scol_level=0, id_vars=['A'], value_vars=['B']) + A variable value +0 a B 1 +1 b B 3 +2 c B 5 + +>>> %(caller)sid_vars=[('A', 'D')], value_vars=[('B', 'E')]) + (A, D) variable_0 variable_1 value +0 a B E 1 +1 b B E 3 +2 c B E 5 +""" diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 51a067427e867..2f3058db4493b 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -7,9 +7,9 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries -from pandas.core.generic import _shared_docs from pandas.core.groupby.base import GroupByMixin from pandas.core.indexes.api import MultiIndex +from pandas.core.shared_docs import _shared_docs _shared_docs = dict(**_shared_docs) _doc_template = """ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 39fcfcbe2bff6..04509a40b98df 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -22,7 +22,7 @@ from pandas._libs.tslibs import BaseOffset, to_offset import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import ArrayLike, Axis, FrameOrSeriesUnion, Label +from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion, Label from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -159,7 +159,7 @@ class _Window(PandasObject, ShallowMixin, SelectionMixin): def __init__( self, - obj: FrameOrSeriesUnion, + obj: 
FrameOrSeries, window=None, min_periods: Optional[int] = None, center: bool = False, From cf55654f885f87e03d6f05bcec8818c44dcda2ff Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 30 Aug 2020 05:01:51 -0700 Subject: [PATCH 0582/1025] TYP: Annotations (#35933) --- pandas/core/algorithms.py | 11 +++++++---- pandas/core/arrays/base.py | 12 ++++++++++++ pandas/core/groupby/base.py | 9 +++++++-- pandas/core/indexes/base.py | 7 ++----- pandas/core/indexes/datetimelike.py | 4 +++- pandas/core/indexes/numeric.py | 2 ++ pandas/core/internals/blocks.py | 2 +- pandas/core/internals/managers.py | 4 ++-- 8 files changed, 36 insertions(+), 15 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2a6e983eff3ee..6d6bb21165814 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -10,7 +10,7 @@ import numpy as np from pandas._libs import Timestamp, algos, hashtable as htable, iNaT, lib -from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj +from pandas._typing import AnyArrayLike, ArrayLike, DtypeObj, FrameOrSeriesUnion from pandas.util._decorators import doc from pandas.core.dtypes.cast import ( @@ -58,7 +58,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import Series + from pandas import DataFrame, Series _shared_docs: Dict[str, str] = {} @@ -1101,6 +1101,9 @@ def __init__(self, obj, n: int, keep: str): if self.keep not in ("first", "last", "all"): raise ValueError('keep must be either "first", "last" or "all"') + def compute(self, method: str) -> FrameOrSeriesUnion: + raise NotImplementedError + def nlargest(self): return self.compute("nlargest") @@ -1133,7 +1136,7 @@ class SelectNSeries(SelectN): nordered : Series """ - def compute(self, method): + def compute(self, method: str) -> "Series": n = self.n dtype = self.obj.dtype @@ -1207,7 +1210,7 @@ def __init__(self, obj, n: int, keep: str, columns): columns = list(columns) self.columns = columns - def compute(self, method): + 
def compute(self, method: str) -> "DataFrame": from pandas import Int64Index diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index d85647edc3b81..8193d65b3b30c 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1167,6 +1167,10 @@ class ExtensionOpsMixin: with NumPy arrays. """ + @classmethod + def _create_arithmetic_method(cls, op): + raise AbstractMethodError(cls) + @classmethod def _add_arithmetic_ops(cls): cls.__add__ = cls._create_arithmetic_method(operator.add) @@ -1186,6 +1190,10 @@ def _add_arithmetic_ops(cls): cls.__divmod__ = cls._create_arithmetic_method(divmod) cls.__rdivmod__ = cls._create_arithmetic_method(ops.rdivmod) + @classmethod + def _create_comparison_method(cls, op): + raise AbstractMethodError(cls) + @classmethod def _add_comparison_ops(cls): cls.__eq__ = cls._create_comparison_method(operator.eq) @@ -1195,6 +1203,10 @@ def _add_comparison_ops(cls): cls.__le__ = cls._create_comparison_method(operator.le) cls.__ge__ = cls._create_comparison_method(operator.ge) + @classmethod + def _create_logical_method(cls, op): + raise AbstractMethodError(cls) + @classmethod def _add_logical_ops(cls): cls.__and__ = cls._create_logical_method(operator.and_) diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index e71b2f94c8014..999873e7b81e4 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -4,17 +4,22 @@ SeriesGroupBy and the DataFrameGroupBy objects. """ import collections +from typing import List from pandas.core.dtypes.common import is_list_like, is_scalar +from pandas.core.base import PandasObject + OutputKey = collections.namedtuple("OutputKey", ["label", "position"]) -class GroupByMixin: +class GroupByMixin(PandasObject): """ Provide the groupby facilities to the mixed object. """ + _attributes: List[str] + def _gotitem(self, key, ndim, subset=None): """ Sub-classes to define. Return a sliced object. 
@@ -22,7 +27,7 @@ def _gotitem(self, key, ndim, subset=None): Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b1e5d5627e3f6..a07c3328def54 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3541,10 +3541,7 @@ def _join_multi(self, other, how, return_indexers=True): if not overlap: raise ValueError("cannot join with no overlapping index names") - self_is_mi = isinstance(self, ABCMultiIndex) - other_is_mi = isinstance(other, ABCMultiIndex) - - if self_is_mi and other_is_mi: + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): # Drop the non-matching levels from left and right respectively ldrop_names = list(self_names - overlap) @@ -3590,7 +3587,7 @@ def _join_multi(self, other, how, return_indexers=True): # Case where only one index is multi # make the indices into mi's that match flip_order = False - if self_is_mi: + if isinstance(self, MultiIndex): self, other = other, self flip_order = True # flip if join method is right or left diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index efe1a853a9a76..e7e93068d9175 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -1,7 +1,7 @@ """ Base and utility classes for tseries type pandas objects. 
""" -from datetime import datetime +from datetime import datetime, tzinfo from typing import Any, List, Optional, TypeVar, Union, cast import numpy as np @@ -630,6 +630,8 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): but not PeriodIndex """ + tz: Optional[tzinfo] + # Compat for frequency inference, see GH#23789 _is_monotonic_increasing = Index.is_monotonic_increasing _is_monotonic_decreasing = Index.is_monotonic_decreasing diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 80bb9f10fadd9..cd3f1f51a86d2 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -45,6 +45,8 @@ class NumericIndex(Index): This is an abstract class. """ + _default_dtype: np.dtype + _is_numeric_dtype = True def __new__(cls, data=None, dtype=None, copy=False, name=None): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a38b47a4c2a25..1b42df1b0147c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1382,7 +1382,7 @@ def where_func(cond, values, other): cond = cond.swapaxes(axis, 0) mask = np.array([cond[i].all() for i in range(cond.shape[0])], dtype=bool) - result_blocks = [] + result_blocks: List["Block"] = [] for m in [mask, ~mask]: if m.any(): taken = result.take(m.nonzero()[0], axis=axis) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 67ff3b9456ccf..bade891939c84 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -334,7 +334,7 @@ def reduce(self: T, func) -> T: # If 2D, we assume that we're operating column-wise assert self.ndim == 2 - res_blocks = [] + res_blocks: List[Block] = [] for blk in self.blocks: nbs = blk.reduce(func) res_blocks.extend(nbs) @@ -728,7 +728,7 @@ def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": indexer = np.sort(np.concatenate([b.mgr_locs.as_array for b in blocks])) inv_indexer = 
lib.get_reverse_indexer(indexer, self.shape[0]) - new_blocks = [] + new_blocks: List[Block] = [] for b in blocks: b = b.copy(deep=copy) b.mgr_locs = inv_indexer[b.mgr_locs.indexer] From 19f847097f6845ca835030b593005f08c765406a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 30 Aug 2020 06:23:44 -0700 Subject: [PATCH 0583/1025] TYP: annotate plotting._matplotlib.converter (#35978) --- pandas/plotting/_matplotlib/converter.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py index 214a67690d695..3db7c38eced65 100644 --- a/pandas/plotting/_matplotlib/converter.py +++ b/pandas/plotting/_matplotlib/converter.py @@ -2,7 +2,7 @@ import datetime as pydt from datetime import datetime, timedelta, tzinfo import functools -from typing import Optional, Tuple +from typing import Any, List, Optional, Tuple from dateutil.relativedelta import relativedelta import matplotlib.dates as dates @@ -144,7 +144,7 @@ def convert(value, unit, axis): return value @staticmethod - def axisinfo(unit, axis): + def axisinfo(unit, axis) -> Optional[units.AxisInfo]: if unit != "time": return None @@ -294,7 +294,7 @@ def try_parse(values): return values @staticmethod - def axisinfo(unit, axis): + def axisinfo(unit: Optional[tzinfo], axis) -> units.AxisInfo: """ Return the :class:`~matplotlib.units.AxisInfo` for *unit*. @@ -473,7 +473,7 @@ def _get_default_annual_spacing(nyears) -> Tuple[int, int]: return (min_spacing, maj_spacing) -def period_break(dates, period): +def period_break(dates: PeriodIndex, period: str) -> np.ndarray: """ Returns the indices where the given period changes. 
@@ -489,7 +489,7 @@ def period_break(dates, period): return np.nonzero(current - previous)[0] -def has_level_label(label_flags, vmin): +def has_level_label(label_flags: np.ndarray, vmin: float) -> bool: """ Returns true if the ``label_flags`` indicate there is at least one label for this level. @@ -984,18 +984,24 @@ class TimeSeries_DateFormatter(Formatter): ---------- freq : {int, string} Valid frequency specifier. - minor_locator : {False, True} + minor_locator : bool, default False Whether the current formatter should apply to minor ticks (True) or major ticks (False). - dynamic_mode : {True, False} + dynamic_mode : bool, default True Whether the formatter works in dynamic mode or not. """ - def __init__(self, freq, minor_locator=False, dynamic_mode=True, plot_obj=None): + def __init__( + self, + freq, + minor_locator: bool = False, + dynamic_mode: bool = True, + plot_obj=None, + ): freq = to_offset(freq) self.format = None self.freq = freq - self.locs = [] + self.locs: List[Any] = [] # unused, for matplotlib compat self.formatdict = None self.isminor = minor_locator self.isdynamic = dynamic_mode From 67c3cdb3d0b2ccc12c0c3883eee707aea501d433 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 30 Aug 2020 18:50:37 +0100 Subject: [PATCH 0584/1025] TYP: misc typing cleanups for #29116 (#35953) --- pandas/core/aggregation.py | 14 ++++++++------ pandas/core/frame.py | 6 ++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index e2374b81ca13b..7ca68d8289bd5 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -10,6 +10,7 @@ Callable, DefaultDict, Dict, + Iterable, List, Optional, Sequence, @@ -17,14 +18,14 @@ Union, ) -from pandas._typing import AggFuncType, Label +from pandas._typing import AggFuncType, FrameOrSeries, Label from pandas.core.dtypes.common import is_dict_like, is_list_like from pandas.core.base import SpecificationError import pandas.core.common as com 
from pandas.core.indexes.api import Index -from pandas.core.series import FrameOrSeriesUnion, Series +from pandas.core.series import Series def reconstruct_func( @@ -276,12 +277,13 @@ def maybe_mangle_lambdas(agg_spec: Any) -> Any: def relabel_result( - result: FrameOrSeriesUnion, + result: FrameOrSeries, func: Dict[str, List[Union[Callable, str]]], - columns: Tuple, - order: List[int], + columns: Iterable[Label], + order: Iterable[int], ) -> Dict[Label, Series]: - """Internal function to reorder result if relabelling is True for + """ + Internal function to reorder result if relabelling is True for dataframe.agg, and return the reordered result in dict. Parameters: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 95bd757f1994e..4668f264000e7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7415,6 +7415,12 @@ def aggregate(self, func=None, axis=0, *args, **kwargs): if relabeling: # This is to keep the order to columns occurrence unchanged, and also # keep the order of new columns occurrence unchanged + + # For the return values of reconstruct_func, if relabeling is + # False, columns and order will be None. 
+ assert columns is not None + assert order is not None + result_in_dict = relabel_result(result, func, columns, order) result = DataFrame(result_in_dict, index=columns) From d1c82b7ae65d79b53242ca591c79ec9d69ef2a45 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 31 Aug 2020 02:31:22 +0100 Subject: [PATCH 0585/1025] TYP: check_untyped_defs core.dtypes.cast (#35992) --- pandas/core/indexes/datetimes.py | 7 ++++++- pandas/core/tools/datetimes.py | 4 +--- setup.cfg | 3 --- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index f71fd0d406c54..e66f513e347a9 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -75,7 +75,7 @@ def _new_DatetimeIndex(cls, d): + [ method for method in DatetimeArray._datetimelike_methods - if method not in ("tz_localize",) + if method not in ("tz_localize", "tz_convert") ], DatetimeArray, wrap=True, @@ -228,6 +228,11 @@ class DatetimeIndex(DatetimeTimedeltaMixin): # -------------------------------------------------------------------- # methods that dispatch to array and wrap result in DatetimeIndex + @doc(DatetimeArray.tz_convert) + def tz_convert(self, tz) -> "DatetimeIndex": + arr = self._data.tz_convert(tz) + return type(self)._simple_new(arr, name=self.name) + @doc(DatetimeArray.tz_localize) def tz_localize( self, tz, ambiguous="raise", nonexistent="raise" diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 3c1fe6bacefcf..8fcc5f74ea897 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -307,9 +307,7 @@ def _convert_listlike_datetimes( if not isinstance(arg, (DatetimeArray, DatetimeIndex)): return DatetimeIndex(arg, tz=tz, name=name) if tz == "utc": - # error: Item "DatetimeIndex" of "Union[DatetimeArray, DatetimeIndex]" has - # no attribute "tz_convert" - arg = arg.tz_convert(None).tz_localize(tz) # type: ignore[union-attr] + arg = 
arg.tz_convert(None).tz_localize(tz) return arg elif is_datetime64_ns_dtype(arg_dtype): diff --git a/setup.cfg b/setup.cfg index aa1535a171f0a..2ba22e5aad3c7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -157,9 +157,6 @@ check_untyped_defs=False [mypy-pandas.core.computation.scope] check_untyped_defs=False -[mypy-pandas.core.dtypes.cast] -check_untyped_defs=False - [mypy-pandas.core.frame] check_untyped_defs=False From bdbf7e29fd4cbcd495857a9e3ec6d605387d58d4 Mon Sep 17 00:00:00 2001 From: Metehan Kutlu Date: Mon, 31 Aug 2020 12:59:17 +0300 Subject: [PATCH 0586/1025] Issue35925 Remove trailing commas (#35996) * pandas/tests/test_multilevel.py * pandas/tests/test_nanops.py * pandas/.../test_moments_consistency_rolling.py * pandas/tests/window/moments/test_moments_ewm.py * pandas/.../test_moments_rolling.py * pandas/tests/window/test_base_indexer.py * pandas/tests/window/test_pairwise.py * pandas/tests/window/test_rolling.py * pandas/tseries/frequencies.py * pandas/util/_test_decorators.py --- pandas/tests/test_multilevel.py | 2 +- pandas/tests/test_nanops.py | 2 +- .../window/moments/test_moments_consistency_rolling.py | 8 ++++---- pandas/tests/window/moments/test_moments_ewm.py | 4 ++-- pandas/tests/window/moments/test_moments_rolling.py | 8 ++++---- pandas/tests/window/test_base_indexer.py | 6 +++--- pandas/tests/window/test_pairwise.py | 2 +- pandas/tests/window/test_rolling.py | 8 ++++---- pandas/tseries/frequencies.py | 2 +- pandas/util/_test_decorators.py | 4 ++-- 10 files changed, 23 insertions(+), 23 deletions(-) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 724558bd49ea2..274860b3fdb5c 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1846,7 +1846,7 @@ def test_multilevel_index_loc_order(self, dim, keys, expected): # GH 22797 # Try to respect order of keys given for MultiIndex.loc kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]} - df = pd.DataFrame(np.arange(25).reshape(5, 
5), **kwargs,) + df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs) exp_index = MultiIndex.from_arrays(expected) if dim == "index": res = df.loc[keys, :] diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index 0d60e6e8a978f..c45e4508c6153 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -285,7 +285,7 @@ def test_nansum(self, skipna): def test_nanmean(self, skipna): self.check_funs( - nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False, + nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False ) def test_nanmean_overflow(self): diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index a3de8aa69f840..158b994cf03ae 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -95,7 +95,7 @@ def test_rolling_apply_consistency( with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning, + "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning ) # test consistency between rolling_xyz() and either (a) # rolling_apply of Series.xyz(), or (b) rolling_apply of @@ -107,7 +107,7 @@ def test_rolling_apply_consistency( functions = no_nan_functions + base_functions for (f, require_min_periods, name) in functions: rolling_f = getattr( - x.rolling(window=window, center=center, min_periods=min_periods), name, + x.rolling(window=window, center=center, min_periods=min_periods), name ) if ( @@ -492,7 +492,7 @@ def test_moment_functions_zero_length_pairwise(): df2["a"] = df2["a"].astype("float64") df1_expected = DataFrame( - index=pd.MultiIndex.from_product([df1.index, df1.columns]), columns=Index([]), + index=pd.MultiIndex.from_product([df1.index, df1.columns]), columns=Index([]) ) df2_expected = DataFrame( 
index=pd.MultiIndex.from_product( @@ -635,7 +635,7 @@ def test_rolling_consistency(consistency_data, window, min_periods, center): # with empty/0-length Series/DataFrames with warnings.catch_warnings(): warnings.filterwarnings( - "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning, + "ignore", message=".*(empty slice|0 for slice).*", category=RuntimeWarning ) # test consistency between different rolling_* moments diff --git a/pandas/tests/window/moments/test_moments_ewm.py b/pandas/tests/window/moments/test_moments_ewm.py index 89d46a8bb6cb5..a83bfabc4a048 100644 --- a/pandas/tests/window/moments/test_moments_ewm.py +++ b/pandas/tests/window/moments/test_moments_ewm.py @@ -73,7 +73,7 @@ def simple_wma(s, w): (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), (s1, False, True, [(1.0 - alpha), np.nan, alpha]), - (s2, True, False, [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan],), + (s2, True, False, [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan]), (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), ( s2, @@ -95,7 +95,7 @@ def simple_wma(s, w): alpha * ((1.0 - alpha) ** 2 + alpha), ], ), - (s3, False, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha],), + (s3, False, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha]), ]: expected = simple_wma(s, Series(w)) result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() diff --git a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index 81f020fe7de23..da256e80dff7e 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -150,14 +150,14 @@ def get_result(obj, window, min_periods=None, center=False): series_xp = ( get_result( - series.reindex(list(series.index) + s), window=25, min_periods=minp, + series.reindex(list(series.index) + s), window=25, 
min_periods=minp ) .shift(-12) .reindex(series.index) ) frame_xp = ( get_result( - frame.reindex(list(frame.index) + s), window=25, min_periods=minp, + frame.reindex(list(frame.index) + s), window=25, min_periods=minp ) .shift(-12) .reindex(frame.index) @@ -169,14 +169,14 @@ def get_result(obj, window, min_periods=None, center=False): else: series_xp = ( get_result( - series.reindex(list(series.index) + s), window=25, min_periods=0, + series.reindex(list(series.index) + s), window=25, min_periods=0 ) .shift(-12) .reindex(series.index) ) frame_xp = ( get_result( - frame.reindex(list(frame.index) + s), window=25, min_periods=0, + frame.reindex(list(frame.index) + s), window=25, min_periods=0 ) .shift(-12) .reindex(frame.index) diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index 2300d8dd5529b..ab73e075eed04 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -88,8 +88,8 @@ def get_window_bounds(self, num_values, min_periods, center, closed): @pytest.mark.parametrize( "func,np_func,expected,np_kwargs", [ - ("count", len, [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, np.nan], {},), - ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {},), + ("count", len, [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, np.nan], {}), + ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {}), ( "max", np.max, @@ -204,7 +204,7 @@ def test_rolling_forward_skewness(constructor): @pytest.mark.parametrize( "func,expected", [ - ("cov", [2.0, 2.0, 2.0, 97.0, 2.0, -93.0, 2.0, 2.0, np.nan, np.nan],), + ("cov", [2.0, 2.0, 2.0, 97.0, 2.0, -93.0, 2.0, 2.0, np.nan, np.nan]), ( "corr", [ diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index e82d4b8cbf770..7425cc5df4c2f 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -195,7 +195,7 @@ def test_cov_mulittindex(self): columns = 
MultiIndex.from_product([list("ab"), list("xy"), list("AB")]) index = range(3) - df = DataFrame(np.arange(24).reshape(3, 8), index=index, columns=columns,) + df = DataFrame(np.arange(24).reshape(3, 8), index=index, columns=columns) result = df.ewm(alpha=0.1).cov() diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 8d72e2cb92ca9..67b20fd2d6daa 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -73,7 +73,7 @@ def test_constructor_with_timedelta_window(window): # GH 15440 n = 10 df = DataFrame( - {"value": np.arange(n)}, index=pd.date_range("2015-12-24", periods=n, freq="D"), + {"value": np.arange(n)}, index=pd.date_range("2015-12-24", periods=n, freq="D") ) expected_data = np.append([0.0, 1.0], np.arange(3.0, 27.0, 3)) @@ -92,7 +92,7 @@ def test_constructor_timedelta_window_and_minperiods(window, raw): # GH 15305 n = 10 df = DataFrame( - {"value": np.arange(n)}, index=pd.date_range("2017-08-08", periods=n, freq="D"), + {"value": np.arange(n)}, index=pd.date_range("2017-08-08", periods=n, freq="D") ) expected = DataFrame( {"value": np.append([np.NaN, 1.0], np.arange(3.0, 27.0, 3))}, @@ -153,7 +153,7 @@ def test_closed_one_entry(func): def test_closed_one_entry_groupby(func): # GH24718 ser = pd.DataFrame( - data={"A": [1, 1, 2], "B": [3, 2, 1]}, index=pd.date_range("2000", periods=3), + data={"A": [1, 1, 2], "B": [3, 2, 1]}, index=pd.date_range("2000", periods=3) ) result = getattr( ser.groupby("A", sort=False)["B"].rolling("10D", closed="left"), func @@ -182,7 +182,7 @@ def test_closed_one_entry_groupby(func): def test_closed_min_max_datetime(input_dtype, func, closed, expected): # see gh-21704 ser = pd.Series( - data=np.arange(10).astype(input_dtype), index=pd.date_range("2000", periods=10), + data=np.arange(10).astype(input_dtype), index=pd.date_range("2000", periods=10) ) result = getattr(ser.rolling("3D", closed=closed), func)() diff --git a/pandas/tseries/frequencies.py 
b/pandas/tseries/frequencies.py index f80ff1a53cd69..8ef6dac2862db 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -548,7 +548,7 @@ def is_superperiod(source, target) -> bool: def _maybe_coerce_freq(code) -> str: - """ we might need to coerce a code to a rule_code + """we might need to coerce a code to a rule_code and uppercase it Parameters diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 0dad8c7397e37..ca7b99492bbf7 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -186,10 +186,10 @@ def skip_if_no(package: str, min_version: Optional[str] = None): is_platform_windows(), reason="not used on win32" ) skip_if_has_locale = pytest.mark.skipif( - _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", + _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}" ) skip_if_not_us_locale = pytest.mark.skipif( - _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}", + _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}" ) skip_if_no_scipy = pytest.mark.skipif( _skip_if_no_scipy(), reason="Missing SciPy requirement" From 2246ba0a432b7603bcce7fbef81af41495dce4d4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 31 Aug 2020 03:15:04 -0700 Subject: [PATCH 0587/1025] TYP: annotate plotting._matplotlib.tools (#35968) --- pandas/plotting/_matplotlib/tools.py | 33 +++++++++++++++++++++------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 26b25597ce1a6..4d643ffb734e4 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -1,6 +1,6 @@ # being a bit too dynamic from math import ceil -from typing import TYPE_CHECKING, Tuple +from typing import TYPE_CHECKING, Iterable, List, Sequence, Tuple, Union import warnings import matplotlib.table 
@@ -15,10 +15,13 @@ from pandas.plotting._matplotlib import compat if TYPE_CHECKING: + from matplotlib.axes import Axes + from matplotlib.axis import Axis + from matplotlib.lines import Line2D # noqa:F401 from matplotlib.table import Table -def format_date_labels(ax, rot): +def format_date_labels(ax: "Axes", rot): # mini version of autofmt_xdate for label in ax.get_xticklabels(): label.set_ha("right") @@ -278,7 +281,7 @@ def _subplots( return fig, axes -def _remove_labels_from_axis(axis): +def _remove_labels_from_axis(axis: "Axis"): for t in axis.get_majorticklabels(): t.set_visible(False) @@ -294,7 +297,15 @@ def _remove_labels_from_axis(axis): axis.get_label().set_visible(False) -def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): +def _handle_shared_axes( + axarr: Iterable["Axes"], + nplots: int, + naxes: int, + nrows: int, + ncols: int, + sharex: bool, + sharey: bool, +): if nplots > 1: if compat._mpl_ge_3_2_0(): row_num = lambda x: x.get_subplotspec().rowspan.start @@ -340,7 +351,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey): _remove_labels_from_axis(ax.yaxis) -def _flatten(axes): +def _flatten(axes: Union["Axes", Sequence["Axes"]]) -> Sequence["Axes"]: if not is_list_like(axes): return np.array([axes]) elif isinstance(axes, (np.ndarray, ABCIndexClass)): @@ -348,7 +359,13 @@ def _flatten(axes): return np.array(axes) -def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None): +def _set_ticks_props( + axes: Union["Axes", Sequence["Axes"]], + xlabelsize=None, + xrot=None, + ylabelsize=None, + yrot=None, +): import matplotlib.pyplot as plt for ax in _flatten(axes): @@ -363,7 +380,7 @@ def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=Non return axes -def _get_all_lines(ax): +def _get_all_lines(ax: "Axes") -> List["Line2D"]: lines = ax.get_lines() if hasattr(ax, "right_ax"): @@ -375,7 +392,7 @@ def _get_all_lines(ax): return lines -def _get_xlim(lines) -> 
Tuple[float, float]: +def _get_xlim(lines: Iterable["Line2D"]) -> Tuple[float, float]: left, right = np.inf, -np.inf for l in lines: x = l.get_xdata(orig=False) From d872607e7a488c4c3df0ed351a36f66636c01c3a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 31 Aug 2020 03:16:15 -0700 Subject: [PATCH 0588/1025] TYP: annotations in core.groupby (#35939) --- pandas/core/groupby/categorical.py | 16 ++++++++++---- pandas/core/groupby/generic.py | 34 +++++++++++++----------------- pandas/core/groupby/groupby.py | 6 +++--- pandas/core/groupby/grouper.py | 6 ++++-- pandas/core/groupby/ops.py | 2 +- 5 files changed, 35 insertions(+), 29 deletions(-) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index db734bb2f0c07..4d5acf527a867 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -1,3 +1,5 @@ +from typing import Optional, Tuple + import numpy as np from pandas.core.algorithms import unique1d @@ -6,9 +8,12 @@ CategoricalDtype, recode_for_categories, ) +from pandas.core.indexes.api import CategoricalIndex -def recode_for_groupby(c: Categorical, sort: bool, observed: bool): +def recode_for_groupby( + c: Categorical, sort: bool, observed: bool +) -> Tuple[Categorical, Optional[Categorical]]: """ Code the categories to ensure we can groupby for categoricals. @@ -73,7 +78,9 @@ def recode_for_groupby(c: Categorical, sort: bool, observed: bool): return c.reorder_categories(cat.categories), None -def recode_from_groupby(c: Categorical, sort: bool, ci): +def recode_from_groupby( + c: Categorical, sort: bool, ci: CategoricalIndex +) -> CategoricalIndex: """ Reverse the codes_to_groupby to account for sort / observed. 
@@ -91,7 +98,8 @@ def recode_from_groupby(c: Categorical, sort: bool, ci): """ # we re-order to the original category orderings if sort: - return ci.set_categories(c.categories) + return ci.set_categories(c.categories) # type: ignore [attr-defined] # we are not sorting, so add unobserved to the end - return ci.add_categories(c.categories[~c.categories.isin(ci.categories)]) + new_cats = c.categories[~c.categories.isin(ci.categories)] + return ci.add_categories(new_cats) # type: ignore [attr-defined] diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 3172fb4e0e853..e39464628ccaa 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -23,6 +23,7 @@ Type, TypeVar, Union, + cast, ) import warnings @@ -83,7 +84,7 @@ from pandas.plotting import boxplot_frame_groupby if TYPE_CHECKING: - from pandas.core.internals import Block + from pandas.core.internals import Block # noqa:F401 NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) @@ -1591,7 +1592,7 @@ def _gotitem(self, key, ndim: int, subset=None): Parameters ---------- key : string / list of selections - ndim : 1,2 + ndim : {1, 2} requested ndim of result subset : object, default None subset to act on @@ -1617,7 +1618,7 @@ def _gotitem(self, key, ndim: int, subset=None): raise AssertionError("invalid ndim for _gotitem") - def _wrap_frame_output(self, result, obj) -> DataFrame: + def _wrap_frame_output(self, result, obj: DataFrame) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: @@ -1634,20 +1635,14 @@ def _get_data_to_aggregate(self) -> BlockManager: else: return obj._mgr - def _insert_inaxis_grouper_inplace(self, result): + def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: # zip in reverse so we can always insert at loc 0 - izip = zip( - *map( - reversed, - ( - self.grouper.names, - self.grouper.get_group_levels(), - [grp.in_axis for grp in self.grouper.groupings], - ), - ) - ) columns = result.columns - for name, 
lev, in_axis in izip: + for name, lev, in_axis in zip( + reversed(self.grouper.names), + reversed(self.grouper.get_group_levels()), + reversed([grp.in_axis for grp in self.grouper.groupings]), + ): # GH #28549 # When using .apply(-), name will be in columns already if in_axis and name not in columns: @@ -1712,7 +1707,7 @@ def _wrap_transformed_output( return result - def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFrame: + def _wrap_agged_blocks(self, blocks: Sequence["Block"], items: Index) -> DataFrame: if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, axes=[items, index]) @@ -1739,7 +1734,7 @@ def _iterate_column_groupbys(self): exclusions=self.exclusions, ) - def _apply_to_column_groupbys(self, func): + def _apply_to_column_groupbys(self, func) -> DataFrame: from pandas.core.reshape.concat import concat return concat( @@ -1748,7 +1743,7 @@ def _apply_to_column_groupbys(self, func): axis=1, ) - def count(self): + def count(self) -> DataFrame: """ Compute count of group, excluding missing values. @@ -1778,7 +1773,7 @@ def count(self): return self._reindex_output(result, fill_value=0) - def nunique(self, dropna: bool = True): + def nunique(self, dropna: bool = True) -> DataFrame: """ Return DataFrame with counts of unique elements in each position. @@ -1844,6 +1839,7 @@ def nunique(self, dropna: bool = True): ], axis=1, ) + results = cast(DataFrame, results) if axis_number == 1: results = results.T diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a91366af61d0d..651af2d314251 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -459,7 +459,7 @@ def f(self): @contextmanager -def _group_selection_context(groupby): +def _group_selection_context(groupby: "_GroupBy"): """ Set / reset the _group_selection_context. 
""" @@ -489,7 +489,7 @@ def __init__( keys: Optional[_KeysArgType] = None, axis: int = 0, level=None, - grouper: "Optional[ops.BaseGrouper]" = None, + grouper: Optional["ops.BaseGrouper"] = None, exclusions=None, selection=None, as_index: bool = True, @@ -734,7 +734,7 @@ def pipe(self, func, *args, **kwargs): plot = property(GroupByPlot) - def _make_wrapper(self, name): + def _make_wrapper(self, name: str) -> Callable: assert name in self._apply_allowlist with _group_selection_context(self): diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8239a792c65dd..18970ea0544e4 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -568,7 +568,9 @@ def codes(self) -> np.ndarray: @cache_readonly def result_index(self) -> Index: if self.all_grouper is not None: - return recode_from_groupby(self.all_grouper, self.sort, self.group_index) + group_idx = self.group_index + assert isinstance(group_idx, CategoricalIndex) # set in __init__ + return recode_from_groupby(self.all_grouper, self.sort, group_idx) return self.group_index @property @@ -607,7 +609,7 @@ def get_grouper( mutated: bool = False, validate: bool = True, dropna: bool = True, -) -> "Tuple[ops.BaseGrouper, List[Hashable], FrameOrSeries]": +) -> Tuple["ops.BaseGrouper", List[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 290680f380f5f..4dd5b7f30e7f0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -82,7 +82,7 @@ class BaseGrouper: def __init__( self, axis: Index, - groupings: "Sequence[grouper.Grouping]", + groupings: Sequence["grouper.Grouping"], sort: bool = True, group_keys: bool = True, mutated: bool = False, From 87f2eafb3bbb7a119266b386014bebb5a43e2789 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 31 Aug 2020 14:36:12 +0200 Subject: [PATCH 0589/1025] REGR: Fix inplace updates on column to set correct values (#35936) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/internals/managers.py | 1 + pandas/tests/extension/test_numpy.py | 6 ++++++ pandas/tests/frame/test_block_internals.py | 14 ++++++++++++++ 4 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 9747a8ef3e71f..b4c196f548147 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -15,6 +15,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) +- Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index bade891939c84..00321b76cb6bf 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1025,6 +1025,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): Set new item in-place. Does not consolidate. 
Adds new Block if not contained in the current set of items """ + value = extract_array(value, extract_numpy=True) # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical if self._blklocs is None and self.ndim > 1: diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index b9219f9f833de..bbfaacae1b444 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -348,6 +348,12 @@ def test_fillna_frame(self, data_missing): # Non-scalar "scalar" values. super().test_fillna_frame(data_missing) + @pytest.mark.skip("Invalid test") + def test_fillna_fill_other(self, data): + # inplace update doesn't work correctly with patched extension arrays + # extract_array returns PandasArray, while dtype is a numpy dtype + super().test_fillna_fill_other(data_missing) + class TestReshaping(BaseNumPyTests, base.BaseReshapingTests): @pytest.mark.skip("Incorrect parent test") diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 8ecd9066ceff0..00cfa6265934f 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -644,3 +644,17 @@ def test_to_dict_of_blocks_item_cache(): assert df.loc[0, "b"] == "foo" assert df["b"] is ser + + +def test_update_inplace_sets_valid_block_values(): + # https://github.com/pandas-dev/pandas/issues/33457 + df = pd.DataFrame({"a": pd.Series([1, 2, None], dtype="category")}) + + # inplace update of a single column + df["a"].fillna(1, inplace=True) + + # check we havent put a Series into any block.values + assert isinstance(df._mgr.blocks[0].values, pd.Categorical) + + # smoketest for OP bug from GH#35731 + assert df.isnull().sum().sum() == 0 From 53c8800c11f9b516cb5bc1c194592d4ef94e9def Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 31 Aug 2020 16:06:35 +0100 Subject: [PATCH 0590/1025] TYP: misc typing 
fixes for pandas\core\frame.py (#35990) * TYP: misc typing fixes for pandas\core\frame.py * correct issue number --- pandas/core/frame.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4668f264000e7..fde83a8393241 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1014,7 +1014,7 @@ def iterrows(self) -> Iterable[Tuple[Label, Series]]: s = klass(v, index=columns, name=k) yield k, s - def itertuples(self, index=True, name="Pandas"): + def itertuples(self, index: bool = True, name: Optional[str] = "Pandas"): """ Iterate over DataFrame rows as namedtuples. @@ -1088,7 +1088,11 @@ def itertuples(self, index=True, name="Pandas"): arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) if name is not None: - itertuple = collections.namedtuple(name, fields, rename=True) + # https://github.com/python/mypy/issues/9046 + # error: namedtuple() expects a string literal as the first argument + itertuple = collections.namedtuple( # type: ignore[misc] + name, fields, rename=True + ) return map(itertuple._make, zip(*arrays)) # fallback to regular tuples @@ -4591,7 +4595,7 @@ def set_index( frame = self.copy() arrays = [] - names = [] + names: List[Label] = [] if append: names = list(self.index.names) if isinstance(self.index, MultiIndex): From bf0c2af5128670544371c6798e77946f8fa6e1e1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 31 Aug 2020 09:27:27 -0700 Subject: [PATCH 0591/1025] CI: suppress another setuptools warning (#36011) --- pandas/tests/util/test_show_versions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 04e841c05e44a..fe5fc3e21d960 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -25,6 +25,7 @@ # https://github.com/pandas-dev/pandas/issues/35252 "ignore:Distutils:UserWarning" ) 
+@pytest.mark.filterwarnings("ignore:Setuptools is replacing distutils:UserWarning") def test_show_versions(capsys): # gh-32041 pd.show_versions() From cd814f325ca7638bbc586cedd0ed7a0c409d121c Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 31 Aug 2020 13:22:34 -0500 Subject: [PATCH 0592/1025] TYP: typing errors in _xlsxwriter.py #35994 (#35995) * TYP: typing errors in _xlsxwriter.py #35994 * TYP: add param type * TYP: remove book=None in base class --- pandas/io/excel/_base.py | 1 - pandas/io/excel/_odswriter.py | 2 +- pandas/io/excel/_xlsxwriter.py | 8 +++++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 3cd0d721bbdc6..ead36c95556b1 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -653,7 +653,6 @@ def __new__(cls, path, engine=None, **kwargs): return object.__new__(cls) # declare external properties you can count on - book = None curr_sheet = None path = None diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index 72f3d81b1c662..f39391ae1fe7f 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -25,7 +25,7 @@ def __init__( super().__init__(path, mode=mode, **engine_kwargs) - self.book: OpenDocumentSpreadsheet = OpenDocumentSpreadsheet() + self.book = OpenDocumentSpreadsheet() self._style_dict: Dict[str, str] = {} def save(self) -> None: diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 85a1bb031f457..bdbb006ae93dc 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -1,3 +1,5 @@ +from typing import Dict, List, Tuple + import pandas._libs.json as json from pandas.io.excel._base import ExcelWriter @@ -8,7 +10,7 @@ class _XlsxStyler: # Map from openpyxl-oriented styles to flatter xlsxwriter representation # Ordering necessary for both determinism and because some are keyed by # prefixes of others. 
- STYLE_MAPPING = { + STYLE_MAPPING: Dict[str, List[Tuple[Tuple[str, ...], str]]] = { "font": [ (("name",), "font_name"), (("sz",), "font_size"), @@ -170,7 +172,7 @@ def __init__( **engine_kwargs, ): # Use the xlsxwriter module as the Excel writer. - import xlsxwriter + from xlsxwriter import Workbook if mode == "a": raise ValueError("Append mode is not supported with xlsxwriter!") @@ -184,7 +186,7 @@ def __init__( **engine_kwargs, ) - self.book = xlsxwriter.Workbook(path, **engine_kwargs) + self.book = Workbook(path, **engine_kwargs) def save(self): """ From 0e57a5d156b9912887ee3e697f0a48911b4ff234 Mon Sep 17 00:00:00 2001 From: Pranjal Bhardwaj <50989807+Bhard27@users.noreply.github.com> Date: Mon, 31 Aug 2020 23:54:04 +0530 Subject: [PATCH 0593/1025] DOC clean up doc/source/getting_started/overview.rst (#35981) * improved the documentation * Update doc/source/getting_started/overview.rst Co-authored-by: Marco Gorelli * new commit * content changed * new commit Co-authored-by: Marco Gorelli --- doc/source/getting_started/overview.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index d8a40c5406dee..032ba73a7293d 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -9,9 +9,9 @@ Package overview **pandas** is a `Python `__ package providing fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the -fundamental high-level building block for doing practical, **real world** data +fundamental high-level building block for doing practical, **real-world** data analysis in Python. Additionally, it has the broader goal of becoming **the -most powerful and flexible open source data analysis / manipulation tool +most powerful and flexible open source data analysis/manipulation tool available in any language**. 
It is already well on its way toward this goal. pandas is well suited for many different kinds of data: @@ -21,7 +21,7 @@ pandas is well suited for many different kinds of data: - Ordered and unordered (not necessarily fixed-frequency) time series data. - Arbitrary matrix data (homogeneously typed or heterogeneous) with row and column labels - - Any other form of observational / statistical data sets. The data actually + - Any other form of observational / statistical data sets. The data need not be labeled at all to be placed into a pandas data structure The two primary data structures of pandas, :class:`Series` (1-dimensional) @@ -57,7 +57,7 @@ Here are just a few of the things that pandas does well: Excel files, databases, and saving / loading data from the ultrafast **HDF5 format** - **Time series**-specific functionality: date range generation and frequency - conversion, moving window statistics, date shifting and lagging. + conversion, moving window statistics, date shifting, and lagging. Many of these principles are here to address the shortcomings frequently experienced using other languages / scientific research environments. For data @@ -101,12 +101,12 @@ fashion. Also, we would like sensible default behaviors for the common API functions which take into account the typical orientation of time series and -cross-sectional data sets. When using ndarrays to store 2- and 3-dimensional +cross-sectional data sets. When using the N-dimensional array (ndarrays) to store 2- and 3-dimensional data, a burden is placed on the user to consider the orientation of the data set when writing functions; axes are considered more or less equivalent (except when C- or Fortran-contiguousness matters for performance). In pandas, the axes are intended to lend more semantic meaning to the data; i.e., for a particular -data set there is likely to be a "right" way to orient the data. The goal, +data set, there is likely to be a "right" way to orient the data. 
The goal, then, is to reduce the amount of mental effort required to code up data transformations in downstream functions. @@ -148,8 +148,8 @@ pandas possible. Thanks to `all of our contributors `. pandas is a `NumFOCUS `__ sponsored project. -This will help ensure the success of development of pandas as a world-class open-source -project, and makes it possible to `donate `__ to the project. +This will help ensure the success of the development of pandas as a world-class open-source +project and makes it possible to `donate `__ to the project. Project governance ------------------ From 6f4e59377b51ebf9bd80999b39eabd32465a937f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 31 Aug 2020 11:28:24 -0700 Subject: [PATCH 0594/1025] REF: use BlockManager.apply for Rolling.count (#35883) * REF: remove unnecesary try/except * TST: add test for agg on ordered categorical cols (#35630) * TST: resample does not yield empty groups (#10603) (#35799) * revert accidental rebase * REF: use BlockManager.apply for Rolling.count Co-authored-by: Karthik Mathur <22126205+mathurk1@users.noreply.github.com> Co-authored-by: tkmz-n <60312218+tkmz-n@users.noreply.github.com> --- pandas/core/window/rolling.py | 59 ++++++++++------------------------- 1 file changed, 17 insertions(+), 42 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 04509a40b98df..246bf8e6f71b7 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -22,7 +22,7 @@ from pandas._libs.tslibs import BaseOffset, to_offset import pandas._libs.window.aggregations as window_aggregations -from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion, Label +from pandas._typing import ArrayLike, Axis, FrameOrSeries, FrameOrSeriesUnion from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -44,6 +44,7 @@ 
ABCSeries, ABCTimedeltaIndex, ) +from pandas.core.dtypes.missing import notna from pandas.core.base import DataError, PandasObject, SelectionMixin, ShallowMixin import pandas.core.common as com @@ -395,40 +396,6 @@ def _wrap_result(self, result, block=None, obj=None): return type(obj)(result, index=index, columns=block.columns) return result - def _wrap_results(self, results, obj, skipped: List[int]) -> FrameOrSeriesUnion: - """ - Wrap the results. - - Parameters - ---------- - results : list of ndarrays - obj : conformed data (may be resampled) - skipped: List[int] - Indices of blocks that are skipped. - """ - from pandas import Series, concat - - if obj.ndim == 1: - if not results: - raise DataError("No numeric types to aggregate") - assert len(results) == 1 - return Series(results[0], index=obj.index, name=obj.name) - - exclude: List[Label] = [] - orig_blocks = list(obj._to_dict_of_blocks(copy=False).values()) - for i in skipped: - exclude.extend(orig_blocks[i].columns) - - columns = [c for c in self._selected_obj.columns if c not in exclude] - if not columns and not len(results) and exclude: - raise DataError("No numeric types to aggregate") - elif not len(results): - return obj.astype("float64") - - df = concat(results, axis=1).reindex(columns=columns, copy=False) - self._insert_on_column(df, obj) - return df - def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"): # if we have an 'on' column we want to put it back into # the results in the same location @@ -1325,21 +1292,29 @@ def count(self): # implementations shouldn't end up here assert not isinstance(self.window, BaseIndexer) - blocks, obj = self._create_blocks(self._selected_obj) - results = [] - for b in blocks: - result = b.notna().astype(int) + _, obj = self._create_blocks(self._selected_obj) + + def hfunc(values: np.ndarray) -> np.ndarray: + result = notna(values) + result = result.astype(int) + frame = type(obj)(result.T) result = self._constructor( - result, + frame, 
window=self._get_window(), min_periods=self.min_periods or 0, center=self.center, axis=self.axis, closed=self.closed, ).sum() - results.append(result) + return result.values.T - return self._wrap_results(results, obj, skipped=[]) + new_mgr = obj._mgr.apply(hfunc) + out = obj._constructor(new_mgr) + if obj.ndim == 1: + out.name = obj.name + else: + self._insert_on_column(out, obj) + return out _shared_docs["apply"] = dedent( r""" From 07946bf6ac45299540f908b30c6e6202cd0f4cc1 Mon Sep 17 00:00:00 2001 From: Honfung Wong Date: Tue, 1 Sep 2020 02:40:11 +0800 Subject: [PATCH 0595/1025] DOC: complement the documentation for pandas.DataFrame.agg #35912 (#35941) * DOC: complement the documentation for pandas.DataFrame.agg * DOC: complement the documentation for pandas.DataFrame.agg reformat the documentation according to PEP 8 * DOC: complement the documentation for pandas.DataFrame.agg reformat the documentation according to PEP 8 --- pandas/core/frame.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fde83a8393241..312d449e36022 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7380,6 +7380,15 @@ def _gotitem( min 1.0 2.0 sum 12.0 NaN + Aggregate different functions over the columns and rename the index of the resulting + DataFrame. + + >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) + A B C + x 7.0 NaN NaN + y NaN 2.0 NaN + z NaN NaN 6.0 + Aggregate over the columns. 
>>> df.agg("mean", axis="columns") From 3f19fc9cdce32c1d9dea9ad898b7aac17235288b Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 31 Aug 2020 13:45:11 -0500 Subject: [PATCH 0596/1025] Update SparseDtype user guide doc (#35837) * Update SparseDtype user guide doc * Reword --- doc/source/user_guide/sparse.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index ca8e9a2f313f6..35e0e0fb86472 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -87,14 +87,15 @@ The :attr:`SparseArray.dtype` property stores two pieces of information sparr.dtype -A :class:`SparseDtype` may be constructed by passing each of these +A :class:`SparseDtype` may be constructed by passing only a dtype .. ipython:: python pd.SparseDtype(np.dtype('datetime64[ns]')) -The default fill value for a given NumPy dtype is the "missing" value for that dtype, -though it may be overridden. +in which case a default fill value will be used (for NumPy dtypes this is often the +"missing" value for that dtype). To override this default an explicit fill value may be +passed instead .. ipython:: python From c76dfbdef9f9aee039d858a288b6883cf5e67dfb Mon Sep 17 00:00:00 2001 From: Ben Forbes Date: Tue, 1 Sep 2020 04:47:25 +1000 Subject: [PATCH 0597/1025] clarify VS installer instructions (#35640) --- doc/source/development/contributing.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 4ffd1d586a99a..e5c6f77eea3ef 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -204,6 +204,7 @@ You will need `Build Tools for Visual Studio 2017 You DO NOT need to install Visual Studio 2019. You only need "Build Tools for Visual Studio 2019" found by scrolling down to "All downloads" -> "Tools for Visual Studio 2019". 
+ In the installer, select the "C++ build tools" workload. **Mac OS** From 5896b5cf24a1b63bea59ee1baaec7d6bcfe200b9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 31 Aug 2020 21:26:34 +0100 Subject: [PATCH 0598/1025] TYP: check_untyped_defs core.internals.concat (#36008) --- pandas/core/internals/concat.py | 19 ++++++++++--------- setup.cfg | 3 --- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 99a586f056b12..88839d2211f81 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,10 +1,11 @@ from collections import defaultdict import copy -from typing import List +from typing import Dict, List import numpy as np from pandas._libs import NaT, internals as libinternals +from pandas._typing import DtypeObj from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote @@ -100,10 +101,10 @@ def _get_mgr_concatenation_plan(mgr, indexers): """ # Calculate post-reindex shape , save for item axis which will be separate # for each block anyway. 
- mgr_shape = list(mgr.shape) + mgr_shape_list = list(mgr.shape) for ax, indexer in indexers.items(): - mgr_shape[ax] = len(indexer) - mgr_shape = tuple(mgr_shape) + mgr_shape_list[ax] = len(indexer) + mgr_shape = tuple(mgr_shape_list) if 0 in indexers: ax0_indexer = indexers.pop(0) @@ -126,9 +127,9 @@ def _get_mgr_concatenation_plan(mgr, indexers): join_unit_indexers = indexers.copy() - shape = list(mgr_shape) - shape[0] = len(placements) - shape = tuple(shape) + shape_list = list(mgr_shape) + shape_list[0] = len(placements) + shape = tuple(shape_list) if blkno == -1: unit = JoinUnit(None, shape) @@ -374,8 +375,8 @@ def _get_empty_dtype_and_na(join_units): else: dtypes[i] = unit.dtype - upcast_classes = defaultdict(list) - null_upcast_classes = defaultdict(list) + upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) for dtype, unit in zip(dtypes, join_units): if dtype is None: continue diff --git a/setup.cfg b/setup.cfg index 2ba22e5aad3c7..c10624d60aaff 100644 --- a/setup.cfg +++ b/setup.cfg @@ -184,9 +184,6 @@ check_untyped_defs=False [mypy-pandas.core.internals.blocks] check_untyped_defs=False -[mypy-pandas.core.internals.concat] -check_untyped_defs=False - [mypy-pandas.core.internals.construction] check_untyped_defs=False From 684938babbe4bbc5ddcdee0c7859dec296d4f129 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 31 Aug 2020 13:42:05 -0700 Subject: [PATCH 0599/1025] CLN: window/rolling.py (#35982) * CLN: rolling.py * Use obj._constructor instead Co-authored-by: Matt Roeschke --- pandas/core/window/rolling.py | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 246bf8e6f71b7..a3f60c0bc5098 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -378,23 +378,13 @@ def _prep_values(self, values: Optional[np.ndarray] = None) -> np.ndarray: return 
values - def _wrap_result(self, result, block=None, obj=None): + def _wrap_result(self, result: np.ndarray) -> "Series": """ - Wrap a single result. + Wrap a single 1D result. """ - if obj is None: - obj = self._selected_obj - index = obj.index + obj = self._selected_obj - if isinstance(result, np.ndarray): - - if result.ndim == 1: - from pandas import Series - - return Series(result, index, name=obj.name) - - return type(obj)(result, index=index, columns=block.columns) - return result + return obj._constructor(result, obj.index, name=obj.name) def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"): # if we have an 'on' column we want to put it back into @@ -421,7 +411,7 @@ def _insert_on_column(self, result: "DataFrame", obj: "DataFrame"): # insert at the end result[name] = extra_col - def _center_window(self, result, window) -> np.ndarray: + def _center_window(self, result: np.ndarray, window) -> np.ndarray: """ Center the result in the window. """ @@ -480,7 +470,6 @@ def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series": Series version of _apply_blockwise """ _, obj = self._create_blocks(self._selected_obj) - values = obj.values try: values = self._prep_values(obj.values) @@ -502,7 +491,7 @@ def _apply_blockwise( # This isn't quite blockwise, since `blocks` is actually a collection # of homogenenous DataFrames. 
- blocks, obj = self._create_blocks(self._selected_obj) + _, obj = self._create_blocks(self._selected_obj) mgr = obj._mgr def hfunc(bvalues: ArrayLike) -> ArrayLike: From 6b0ec74bab480d5cd0d58429127ac28ebd51b510 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 31 Aug 2020 21:45:17 +0100 Subject: [PATCH 0600/1025] TYP: misc typing cleanup for core/computation/expressions.py (#36005) --- pandas/core/computation/expressions.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 05a5538a88772..a9c0cb0571446 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -6,6 +6,7 @@ """ import operator +from typing import List, Set import warnings import numpy as np @@ -21,7 +22,7 @@ import numexpr as ne _TEST_MODE = None -_TEST_RESULT = None +_TEST_RESULT: List[bool] = list() _USE_NUMEXPR = _NUMEXPR_INSTALLED _evaluate = None _where = None @@ -75,7 +76,7 @@ def _can_use_numexpr(op, op_str, a, b, dtype_check): # required min elements (otherwise we are adding overhead) if np.prod(a.shape) > _MIN_ELEMENTS: # check for dtype compatibility - dtypes = set() + dtypes: Set[str] = set() for o in [a, b]: # Series implements dtypes, check for dimension count as well if hasattr(o, "dtypes") and o.ndim > 1: @@ -247,25 +248,28 @@ def where(cond, a, b, use_numexpr=True): return _where(cond, a, b) if use_numexpr else _where_standard(cond, a, b) -def set_test_mode(v=True): +def set_test_mode(v: bool = True) -> None: """ - Keeps track of whether numexpr was used. Stores an additional ``True`` - for every successful use of evaluate with numexpr since the last - ``get_test_result`` + Keeps track of whether numexpr was used. + + Stores an additional ``True`` for every successful use of evaluate with + numexpr since the last ``get_test_result``. 
""" global _TEST_MODE, _TEST_RESULT _TEST_MODE = v _TEST_RESULT = [] -def _store_test_result(used_numexpr): +def _store_test_result(used_numexpr: bool) -> None: global _TEST_RESULT if used_numexpr: _TEST_RESULT.append(used_numexpr) -def get_test_result(): - """get test result and reset test_results""" +def get_test_result() -> List[bool]: + """ + Get test result and reset test_results. + """ global _TEST_RESULT res = _TEST_RESULT _TEST_RESULT = [] From e3f96b1bbca63a7d52364de213972dd75d76b17b Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 31 Aug 2020 18:31:07 -0400 Subject: [PATCH 0601/1025] BUG: Attributes are lost when subsetting columns in groupby (#35444) --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/core/groupby/generic.py | 19 +++++++++++-- pandas/tests/groupby/test_groupby.py | 42 ++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 55570341cf4e8..1617bf66c4f04 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -255,6 +255,8 @@ Groupby/resample/rolling - Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising an ``TypeError`` (:issue:`35325`) - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) - Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on. (:issue:`35792`) +- Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. 
(:issue:`9959`) +- Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e39464628ccaa..7b45a114e548b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1603,17 +1603,32 @@ def _gotitem(self, key, ndim: int, subset=None): return DataFrameGroupBy( subset, self.grouper, - selection=key, + axis=self.axis, + level=self.level, grouper=self.grouper, exclusions=self.exclusions, + selection=key, as_index=self.as_index, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, observed=self.observed, + mutated=self.mutated, + dropna=self.dropna, ) elif ndim == 1: if subset is None: subset = self.obj[key] return SeriesGroupBy( - subset, selection=key, grouper=self.grouper, observed=self.observed + subset, + level=self.level, + grouper=self.grouper, + selection=key, + sort=self.sort, + group_keys=self.group_keys, + squeeze=self.squeeze, + observed=self.observed, + dropna=self.dropna, ) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8c51ebf89f5c0..c743058c988b4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2069,3 +2069,45 @@ def test_group_on_two_row_multiindex_returns_one_tuple_key(): assert len(result) == 1 key = (1, 2) assert (result[key] == expected[key]).all() + + +@pytest.mark.parametrize( + "klass, attr, value", + [ + (DataFrame, "axis", 1), + (DataFrame, "level", "a"), + (DataFrame, "as_index", False), + (DataFrame, "sort", False), + (DataFrame, "group_keys", False), + (DataFrame, "squeeze", True), + (DataFrame, "observed", True), + (DataFrame, "dropna", False), + pytest.param( + Series, + "axis", + 1, + marks=pytest.mark.xfail( + reason="GH 35443: Attribute currently not passed on to series" + ), + ), + (Series, "level", "a"), + (Series, "as_index", False), + (Series, "sort", False), + (Series, "group_keys", False), + (Series, 
"squeeze", True), + (Series, "observed", True), + (Series, "dropna", False), + ], +) +@pytest.mark.filterwarnings( + "ignore:The `squeeze` parameter is deprecated:FutureWarning" +) +def test_subsetting_columns_keeps_attrs(klass, attr, value): + # GH 9959 - When subsetting columns, don't drop attributes + df = pd.DataFrame({"a": [1], "b": [2], "c": [3]}) + if attr != "axis": + df = df.set_index("a") + + expected = df.groupby("a", **{attr: value}) + result = expected[["b"]] if klass is DataFrame else expected["b"] + assert getattr(result, attr) == getattr(expected, attr) From 1500da9161ecba6f1e63ea928862c4556c561e82 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 31 Aug 2020 17:32:33 -0500 Subject: [PATCH 0602/1025] REGR: Fix comparison broadcasting over array of Intervals (#35938) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/_libs/interval.pyx | 5 +++++ pandas/tests/frame/methods/test_replace.py | 7 +++++++ pandas/tests/scalar/interval/test_arithmetic.py | 12 ++++++++++++ pandas/tests/scalar/interval/test_interval.py | 9 +++++++++ 5 files changed, 34 insertions(+) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index b4c196f548147..c6917d1b50619 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. 
using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) +- Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 6867e8aba7411..40bd5ad8f5a1f 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -358,6 +358,11 @@ cdef class Interval(IntervalMixin): self_tuple = (self.left, self.right, self.closed) other_tuple = (other.left, other.right, other.closed) return PyObject_RichCompare(self_tuple, other_tuple, op) + elif util.is_array(other): + return np.array( + [PyObject_RichCompare(self, x, op) for x in other], + dtype=bool, + ) return NotImplemented diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 8603bff0587b6..83dfd42ae2a6e 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1581,3 +1581,10 @@ def test_replace_with_compiled_regex(self): result = df.replace({regex: "z"}, regex=True) expected = pd.DataFrame(["z", "b", "c"]) tm.assert_frame_equal(result, expected) + + def test_replace_intervals(self): + # https://github.com/pandas-dev/pandas/issues/35931 + df = pd.DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]}) + result = df.replace({"a": {pd.Interval(0, 1): "x"}}) + expected = pd.DataFrame({"a": ["x", "x"]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/scalar/interval/test_arithmetic.py b/pandas/tests/scalar/interval/test_arithmetic.py index 5252f1a4d5a24..b4c2b448e252a 100644 --- a/pandas/tests/scalar/interval/test_arithmetic.py +++ b/pandas/tests/scalar/interval/test_arithmetic.py @@ -45,3 +45,15 @@ def test_numeric_interval_add_timedelta_raises(interval, delta): with pytest.raises((TypeError, ValueError), match=msg): delta + interval + + 
+@pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta]) +def test_timdelta_add_timestamp_interval(klass): + delta = klass(0) + expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01")) + + result = delta + expected + assert result == expected + + result = expected + delta + assert result == expected diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py index a0151bb9ac7bf..8ad9a2c7a9c70 100644 --- a/pandas/tests/scalar/interval/test_interval.py +++ b/pandas/tests/scalar/interval/test_interval.py @@ -2,6 +2,7 @@ import pytest from pandas import Interval, Period, Timedelta, Timestamp +import pandas._testing as tm import pandas.core.common as com @@ -267,3 +268,11 @@ def test_constructor_errors_tz(self, tz_left, tz_right): msg = "left and right must have the same time zone" with pytest.raises(error, match=msg): Interval(left, right) + + def test_equality_comparison_broadcasts_over_array(self): + # https://github.com/pandas-dev/pandas/issues/35931 + interval = Interval(0, 1) + arr = np.array([interval, interval]) + result = interval == arr + expected = np.array([True, True]) + tm.assert_numpy_array_equal(result, expected) From 19fc21552883e33180d25239cac5ae4bce2cce3c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 31 Aug 2020 18:20:40 -0700 Subject: [PATCH 0603/1025] BUG: None in Float64Index raising TypeError, should return False (#35999) --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/_libs/index.pyx | 6 +++++- pandas/tests/indexes/numeric/test_indexing.py | 6 ++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index c6917d1b50619..9b1ad658d4666 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -31,7 +31,7 @@ Bug fixes - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in 
:meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) -- +- Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index d6659cc1895b1..569562f5b5037 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -80,7 +80,11 @@ cdef class IndexEngine: values = self._get_index_values() self._check_type(val) - loc = _bin_search(values, val) # .searchsorted(val, side='left') + try: + loc = _bin_search(values, val) # .searchsorted(val, side='left') + except TypeError: + # GH#35788 e.g. val=None with float64 values + raise KeyError(val) if loc >= len(values): raise KeyError(val) if values[loc] != val: diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 473e370c76f8b..508bd2f566507 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -228,6 +228,12 @@ def test_take_fill_value_ints(self, klass): class TestContains: + @pytest.mark.parametrize("klass", [Float64Index, Int64Index, UInt64Index]) + def test_contains_none(self, klass): + # GH#35788 should return False, not raise TypeError + index = klass([0, 1, 2, 3, 4]) + assert None not in index + def test_contains_float64_nans(self): index = Float64Index([1.0, 2.0, np.nan]) assert np.nan in index From ca6955bf2c123d920be23f2b66b7df5cdc54d579 Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Tue, 1 Sep 2020 13:42:03 +0200 Subject: [PATCH 0604/1025] Removed mypy from pre commit (#35066) --- .pre-commit-config.yaml | 12 ------------ 1 
file changed, 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b7fd797fb7230..fcd0ecdc9fcd2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,15 +30,3 @@ repos: - id: isort language: python_venv exclude: ^pandas/__init__\.py$|^pandas/core/api\.py$ -- repo: https://github.com/pre-commit/mirrors-mypy - rev: v0.730 - hooks: - - id: mypy - args: - # As long as a some files are excluded from check-untyped-defs - # we have to exclude it from the pre-commit hook as the configuration - # is based on modules but the hook runs on files. - - --no-check-untyped-defs - - --follow-imports - - skip - files: pandas/ From d26ae0744a596ab73841c9ada6dff3648855ea7a Mon Sep 17 00:00:00 2001 From: Jonathan Shreckengost Date: Tue, 1 Sep 2020 08:38:27 -0400 Subject: [PATCH 0605/1025] Comma cleanup for #35925 (#36023) --- pandas/tests/frame/test_analytics.py | 8 ++------ pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/frame/test_reshape.py | 4 ++-- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 52a1e3aae9058..b0ba0d991c9b0 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -86,11 +86,7 @@ def wrapper(x): result0 = f(axis=0, skipna=False) result1 = f(axis=1, skipna=False) tm.assert_series_equal( - result0, - frame.apply(wrapper), - check_dtype=check_dtype, - rtol=rtol, - atol=atol, + result0, frame.apply(wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol ) # HACK: win32 tm.assert_series_equal( @@ -116,7 +112,7 @@ def wrapper(x): if opname in ["sum", "prod"]: expected = frame.apply(skipna_wrapper, axis=1) tm.assert_series_equal( - result1, expected, check_dtype=False, rtol=rtol, atol=atol, + result1, expected, check_dtype=False, rtol=rtol, atol=atol ) # check dtypes diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 
c8f5b2b0f6364..0d1004809f7f1 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -932,7 +932,7 @@ def test_constructor_mrecarray(self): # from GH3479 assert_fr_equal = functools.partial( - tm.assert_frame_equal, check_index_type=True, check_column_type=True, + tm.assert_frame_equal, check_index_type=True, check_column_type=True ) arrays = [ ("float", np.array([1.5, 2.0])), diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index 6a8f1e7c1aca2..d80ebaa09b6a8 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -417,7 +417,7 @@ def test_unstack_mixed_type_name_in_multiindex( result = df.unstack(unstack_idx) expected = pd.DataFrame( - expected_values, columns=expected_columns, index=expected_index, + expected_values, columns=expected_columns, index=expected_index ) tm.assert_frame_equal(result, expected) @@ -807,7 +807,7 @@ def test_unstack_multi_level_cols(self): [["B", "C"], ["B", "D"]], names=["c1", "c2"] ), index=pd.MultiIndex.from_tuples( - [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"], + [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"] ), ) assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"] From f613ce3a07c4591f551108e2e60be4e75056ec6f Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 1 Sep 2020 11:06:10 -0500 Subject: [PATCH 0606/1025] TYP: add type annotation to `_xlwt.py` #36024 (#36025) --- pandas/io/excel/_xlwt.py | 14 +++++++++----- setup.cfg | 3 --- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 78efe77e9fe2d..e1f72eb533c51 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -1,8 +1,13 @@ +from typing import TYPE_CHECKING, Dict + import pandas._libs.json as json from pandas.io.excel._base import ExcelWriter from pandas.io.excel._util import _validate_freeze_panes +if TYPE_CHECKING: + from xlwt import XFStyle 
+ class _XlwtWriter(ExcelWriter): engine = "xlwt" @@ -29,12 +34,11 @@ def save(self): """ Save workbook to disk. """ - return self.book.save(self.path) + self.book.save(self.path) def write_cells( self, cells, sheet_name=None, startrow=0, startcol=0, freeze_panes=None ): - # Write the frame cells using xlwt. sheet_name = self._get_sheet_name(sheet_name) @@ -49,7 +53,7 @@ def write_cells( wks.set_horz_split_pos(freeze_panes[0]) wks.set_vert_split_pos(freeze_panes[1]) - style_dict = {} + style_dict: Dict[str, XFStyle] = {} for cell in cells: val, fmt = self._value_with_fmt(cell.val) @@ -101,14 +105,14 @@ def _style_to_xlwt( f"{key}: {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = f"{(line_sep).join(it)} " + out = f"{line_sep.join(it)} " return out else: it = [ f"{key} {cls._style_to_xlwt(value, False)}" for key, value in item.items() ] - out = f"{(field_sep).join(it)} " + out = f"{field_sep.join(it)} " return out else: item = f"{item}" diff --git a/setup.cfg b/setup.cfg index c10624d60aaff..2447a91f88f4e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -223,9 +223,6 @@ check_untyped_defs=False [mypy-pandas.io.excel._util] check_untyped_defs=False -[mypy-pandas.io.excel._xlwt] -check_untyped_defs=False - [mypy-pandas.io.formats.console] check_untyped_defs=False From 25eed27dd0f7240fb46a5c9c217e7afd3b08b2e9 Mon Sep 17 00:00:00 2001 From: Souris Ash Date: Tue, 1 Sep 2020 21:45:25 +0530 Subject: [PATCH 0607/1025] Removed outdated examples for pd.Interval (pandas-dev#36002) (#36026) --- pandas/_libs/interval.pyx | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 40bd5ad8f5a1f..931ad8326c371 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -291,12 +291,6 @@ cdef class Interval(IntervalMixin): True >>> year_2017.length Timedelta('365 days 00:00:00') - - And also you can create string intervals - - >>> volume_1 = pd.Interval('Ant', 'Dog', closed='both') - >>> 'Bee' 
in volume_1 - True """ _typ = "interval" __array_priority__ = 1000 From 9de2b059e25b7aa20e5b0ba0f739bee51fb4b998 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 1 Sep 2020 17:26:45 +0100 Subject: [PATCH 0608/1025] TYP: misc typing cleanup in core/indexes/multi.py (#36007) * TYP: misc typing cleanup in core/indexes/multi.py * update per comments --- pandas/core/indexes/multi.py | 46 +++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index b29c27982f087..f66b009e6d505 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -19,7 +19,7 @@ from pandas._libs import algos as libalgos, index as libindex, lib from pandas._libs.hashtable import duplicated_int64 -from pandas._typing import AnyArrayLike, Scalar +from pandas._typing import AnyArrayLike, Label, Scalar from pandas.compat.numpy import function as nv from pandas.errors import InvalidIndexError, PerformanceWarning, UnsortedIndexError from pandas.util._decorators import Appender, cache_readonly, doc @@ -449,7 +449,12 @@ def from_arrays(cls, arrays, sortorder=None, names=lib.no_default) -> "MultiInde ) @classmethod - def from_tuples(cls, tuples, sortorder=None, names=None): + def from_tuples( + cls, + tuples, + sortorder: Optional[int] = None, + names: Optional[Sequence[Label]] = None, + ): """ Convert list of tuples to MultiIndex. 
@@ -490,6 +495,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): elif is_iterator(tuples): tuples = list(tuples) + arrays: List[Sequence[Label]] if len(tuples) == 0: if names is None: raise TypeError("Cannot infer number of levels from empty list") @@ -700,8 +706,13 @@ def levels(self): return FrozenList(result) def _set_levels( - self, levels, level=None, copy=False, validate=True, verify_integrity=False - ): + self, + levels, + level=None, + copy: bool = False, + validate: bool = True, + verify_integrity: bool = False, + ) -> None: # This is NOT part of the levels property because it should be # externally not allowed to set levels. User beware if you change # _levels directly @@ -719,10 +730,10 @@ def _set_levels( ) else: level_numbers = [self._get_level_number(lev) for lev in level] - new_levels = list(self._levels) + new_levels_list = list(self._levels) for lev_num, lev in zip(level_numbers, levels): - new_levels[lev_num] = ensure_index(lev, copy=copy)._shallow_copy() - new_levels = FrozenList(new_levels) + new_levels_list[lev_num] = ensure_index(lev, copy=copy)._shallow_copy() + new_levels = FrozenList(new_levels_list) if verify_integrity: new_codes = self._verify_integrity(levels=new_levels) @@ -875,8 +886,13 @@ def codes(self): return self._codes def _set_codes( - self, codes, level=None, copy=False, validate=True, verify_integrity=False - ): + self, + codes, + level=None, + copy: bool = False, + validate: bool = True, + verify_integrity: bool = False, + ) -> None: if validate: if level is None and len(codes) != self.nlevels: raise ValueError("Length of codes must match number of levels") @@ -890,11 +906,13 @@ def _set_codes( ) else: level_numbers = [self._get_level_number(lev) for lev in level] - new_codes = list(self._codes) + new_codes_list = list(self._codes) for lev_num, level_codes in zip(level_numbers, codes): lev = self.levels[lev_num] - new_codes[lev_num] = _coerce_indexer_frozen(level_codes, lev, copy=copy) - new_codes = 
FrozenList(new_codes) + new_codes_list[lev_num] = _coerce_indexer_frozen( + level_codes, lev, copy=copy + ) + new_codes = FrozenList(new_codes_list) if verify_integrity: new_codes = self._verify_integrity(codes=new_codes) @@ -2435,7 +2453,7 @@ def _get_partial_string_timestamp_match_key(self, key): if isinstance(key, str) and self.levels[0]._supports_partial_string_indexing: # Convert key '2016-01-01' to # ('2016-01-01'[, slice(None, None, None)]+) - key = tuple([key] + [slice(None)] * (len(self.levels) - 1)) + key = (key,) + (slice(None),) * (len(self.levels) - 1) if isinstance(key, tuple): # Convert (..., '2016-01-01', ...) in tuple to @@ -3086,7 +3104,7 @@ def _update_indexer(idxr, indexer=indexer): elif is_list_like(k): # a collection of labels to include from this level (these # are or'd) - indexers = None + indexers: Optional[Int64Index] = None for x in k: try: idxrs = _convert_to_indexer( From 2e61736358c97fe1e28113e8434256f4bd1c0e82 Mon Sep 17 00:00:00 2001 From: Anshoo Rajput <57529264+rajanshoo25@users.noreply.github.com> Date: Tue, 1 Sep 2020 22:06:50 +0530 Subject: [PATCH 0609/1025] remove trailing commas for #35925 (#36029) --- pandas/io/parquet.py | 4 ++-- pandas/io/parsers.py | 4 ---- pandas/io/pytables.py | 12 ++++++------ 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7f0eef039a1e8..f2ce2f056ce82 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -128,7 +128,7 @@ def write( self.api.parquet.write_table(table, path, compression=compression, **kwargs) def read( - self, path, columns=None, storage_options: StorageOptions = None, **kwargs, + self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): if is_fsspec_url(path) and "filesystem" not in kwargs: import_optional_dependency("fsspec") @@ -218,7 +218,7 @@ def write( ) def read( - self, path, columns=None, storage_options: StorageOptions = None, **kwargs, + self, path, columns=None, storage_options: 
StorageOptions = None, **kwargs ): if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 983aa56324083..9ad527684120e 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1967,10 +1967,6 @@ def _do_date_conversions(self, names, data): class CParserWrapper(ParserBase): - """ - - """ - def __init__(self, src, **kwds): self.kwds = kwds kwds = kwds.copy() diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index f08e0514a68e1..0913627324c48 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2931,7 +2931,7 @@ def read_index_node( # If the index was an empty array write_array_empty() will # have written a sentinel. Here we replace it with the original. if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0: - data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,) + data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type) kind = _ensure_decoded(node._v_attrs.kind) name = None @@ -4103,7 +4103,7 @@ def create_description( return d def read_coordinates( - self, where=None, start: Optional[int] = None, stop: Optional[int] = None, + self, where=None, start: Optional[int] = None, stop: Optional[int] = None ): """ select coordinates (row numbers) from a table; return the @@ -4374,7 +4374,7 @@ def write_data_chunk( self.table.flush() def delete( - self, where=None, start: Optional[int] = None, stop: Optional[int] = None, + self, where=None, start: Optional[int] = None, stop: Optional[int] = None ): # delete all rows (and return the nrows) @@ -4805,7 +4805,7 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index if inferred_type == "date": converted = np.asarray([v.toordinal() for v in values], dtype=np.int32) return IndexCol( - name, converted, "date", _tables().Time32Col(), index_name=index_name, + name, converted, "date", _tables().Time32Col(), index_name=index_name ) elif inferred_type == "string": 
@@ -4821,13 +4821,13 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index elif inferred_type in ["integer", "floating"]: return IndexCol( - name, values=converted, kind=kind, typ=atom, index_name=index_name, + name, values=converted, kind=kind, typ=atom, index_name=index_name ) else: assert isinstance(converted, np.ndarray) and converted.dtype == object assert kind == "object", kind atom = _tables().ObjectAtom() - return IndexCol(name, converted, kind, atom, index_name=index_name,) + return IndexCol(name, converted, kind, atom, index_name=index_name) def _unconvert_index( From 83df3ff03a2973f309566c5005bb09f579d738bc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Sep 2020 09:39:54 -0700 Subject: [PATCH 0610/1025] TYP: annotate plotting._matplotlib.misc (#36017) --- pandas/plotting/_matplotlib/misc.py | 62 +++++++++++++++++++++------- pandas/plotting/_matplotlib/style.py | 2 +- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index bb6530b0f6412..c5e7c55970c3e 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -1,18 +1,27 @@ import random +from typing import TYPE_CHECKING, Dict, List, Optional, Set import matplotlib.lines as mlines import matplotlib.patches as patches import numpy as np +from pandas._typing import Label + from pandas.core.dtypes.missing import notna from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import _set_ticks_props, _subplots +if TYPE_CHECKING: + from matplotlib.axes import Axes + from matplotlib.figure import Figure + + from pandas import DataFrame, Series + def scatter_matrix( - frame, + frame: "DataFrame", alpha=0.5, figsize=None, ax=None, @@ -114,7 +123,14 @@ def _get_marker_compat(marker): return marker -def radviz(frame, class_column, ax=None, color=None, 
colormap=None, **kwds): +def radviz( + frame: "DataFrame", + class_column, + ax: Optional["Axes"] = None, + color=None, + colormap=None, + **kwds, +) -> "Axes": import matplotlib.pyplot as plt def normalize(series): @@ -130,7 +146,7 @@ def normalize(series): if ax is None: ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1]) - to_plot = {} + to_plot: Dict[Label, List[List]] = {} colors = _get_standard_colors( num_colors=len(classes), colormap=colormap, color_type="random", color=color ) @@ -197,8 +213,14 @@ def normalize(series): def andrews_curves( - frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds -): + frame: "DataFrame", + class_column, + ax: Optional["Axes"] = None, + samples: int = 200, + color=None, + colormap=None, + **kwds, +) -> "Axes": import matplotlib.pyplot as plt def function(amplitudes): @@ -231,7 +253,7 @@ def f(t): classes = frame[class_column].drop_duplicates() df = frame.drop(class_column, axis=1) t = np.linspace(-np.pi, np.pi, samples) - used_legends = set() + used_legends: Set[str] = set() color_values = _get_standard_colors( num_colors=len(classes), colormap=colormap, color_type="random", color=color @@ -256,7 +278,13 @@ def f(t): return ax -def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): +def bootstrap_plot( + series: "Series", + fig: Optional["Figure"] = None, + size: int = 50, + samples: int = 500, + **kwds, +) -> "Figure": import matplotlib.pyplot as plt @@ -306,19 +334,19 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): def parallel_coordinates( - frame, + frame: "DataFrame", class_column, cols=None, - ax=None, + ax: Optional["Axes"] = None, color=None, use_columns=False, xticks=None, colormap=None, - axvlines=True, + axvlines: bool = True, axvlines_kwds=None, - sort_labels=False, + sort_labels: bool = False, **kwds, -): +) -> "Axes": import matplotlib.pyplot as plt if axvlines_kwds is None: @@ -333,7 +361,7 @@ def parallel_coordinates( else: df = frame[cols] - used_legends = 
set() + used_legends: Set[str] = set() ncols = len(df.columns) @@ -385,7 +413,9 @@ def parallel_coordinates( return ax -def lag_plot(series, lag=1, ax=None, **kwds): +def lag_plot( + series: "Series", lag: int = 1, ax: Optional["Axes"] = None, **kwds +) -> "Axes": # workaround because `c='b'` is hardcoded in matplotlib's scatter method import matplotlib.pyplot as plt @@ -402,7 +432,9 @@ def lag_plot(series, lag=1, ax=None, **kwds): return ax -def autocorrelation_plot(series, ax=None, **kwds): +def autocorrelation_plot( + series: "Series", ax: Optional["Axes"] = None, **kwds +) -> "Axes": import matplotlib.pyplot as plt n = len(series) diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 7990bff4f517c..5f1105f0e4233 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -11,7 +11,7 @@ def _get_standard_colors( - num_colors=None, colormap=None, color_type="default", color=None + num_colors=None, colormap=None, color_type: str = "default", color=None ): import matplotlib.pyplot as plt From b416a6519b8a2fb208e28a5ca184b6f39373d955 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Sep 2020 10:28:02 -0700 Subject: [PATCH 0611/1025] TYP: Annotate plotting stacker (#36016) --- pandas/plotting/_matplotlib/boxplot.py | 8 +++- pandas/plotting/_matplotlib/core.py | 66 +++++++++++++------------- pandas/plotting/_matplotlib/hist.py | 7 ++- 3 files changed, 45 insertions(+), 36 deletions(-) diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index b33daf39de37c..01fe98a6f5403 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -1,4 +1,5 @@ from collections import namedtuple +from typing import TYPE_CHECKING import warnings from matplotlib.artist import setp @@ -14,6 +15,9 @@ from pandas.plotting._matplotlib.style import _get_standard_colors from pandas.plotting._matplotlib.tools import _flatten, 
_subplots +if TYPE_CHECKING: + from matplotlib.axes import Axes + class BoxPlot(LinePlot): _kind = "box" @@ -150,7 +154,7 @@ def _make_plot(self): labels = [pprint_thing(key) for key in range(len(labels))] self._set_ticklabels(ax, labels) - def _set_ticklabels(self, ax, labels): + def _set_ticklabels(self, ax: "Axes", labels): if self.orientation == "vertical": ax.set_xticklabels(labels) else: @@ -292,7 +296,7 @@ def maybe_color_bp(bp, **kwds): if not kwds.get("capprops"): setp(bp["caps"], color=colors[3], alpha=1) - def plot_group(keys, values, ax): + def plot_group(keys, values, ax: "Axes"): keys = [pprint_thing(x) for x in keys] values = [np.asarray(remove_na_arraylike(v), dtype=object) for v in values] bp = ax.boxplot(values, **kwds) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 4d23a5e5fc249..93ba9bd26630b 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,5 +1,5 @@ import re -from typing import TYPE_CHECKING, List, Optional +from typing import TYPE_CHECKING, List, Optional, Tuple import warnings from matplotlib.artist import Artist @@ -45,6 +45,7 @@ if TYPE_CHECKING: from matplotlib.axes import Axes + from matplotlib.axis import Axis class MPLPlot: @@ -68,16 +69,10 @@ def _kind(self): _pop_attributes = [ "label", "style", - "logy", - "logx", - "loglog", "mark_right", "stacked", ] _attr_defaults = { - "logy": False, - "logx": False, - "loglog": False, "mark_right": True, "stacked": False, } @@ -167,6 +162,9 @@ def __init__( self.legend_handles: List[Artist] = [] self.legend_labels: List[Label] = [] + self.logx = kwds.pop("logx", False) + self.logy = kwds.pop("logy", False) + self.loglog = kwds.pop("loglog", False) for attr in self._pop_attributes: value = kwds.pop(attr, self._attr_defaults.get(attr, None)) setattr(self, attr, value) @@ -283,11 +281,11 @@ def generate(self): def _args_adjust(self): pass - def _has_plotted_object(self, ax): + def 
_has_plotted_object(self, ax: "Axes") -> bool: """check whether ax has data""" return len(ax.lines) != 0 or len(ax.artists) != 0 or len(ax.containers) != 0 - def _maybe_right_yaxis(self, ax, axes_num): + def _maybe_right_yaxis(self, ax: "Axes", axes_num): if not self.on_right(axes_num): # secondary axes may be passed via ax kw return self._get_ax_layer(ax) @@ -523,7 +521,7 @@ def _adorn_subplots(self): raise ValueError(msg) self.axes[0].set_title(self.title) - def _apply_axis_properties(self, axis, rot=None, fontsize=None): + def _apply_axis_properties(self, axis: "Axis", rot=None, fontsize=None): """ Tick creation within matplotlib is reasonably expensive and is internally deferred until accessed as Ticks are created/destroyed @@ -540,7 +538,7 @@ def _apply_axis_properties(self, axis, rot=None, fontsize=None): label.set_fontsize(fontsize) @property - def legend_title(self): + def legend_title(self) -> Optional[str]: if not isinstance(self.data.columns, ABCMultiIndex): name = self.data.columns.name if name is not None: @@ -591,7 +589,7 @@ def _make_legend(self): if ax.get_visible(): ax.legend(loc="best") - def _get_ax_legend_handle(self, ax): + def _get_ax_legend_handle(self, ax: "Axes"): """ Take in axes and return ax, legend and handle under different scenarios """ @@ -616,7 +614,7 @@ def plt(self): _need_to_set_index = False - def _get_xticks(self, convert_period=False): + def _get_xticks(self, convert_period: bool = False): index = self.data.index is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time") @@ -646,7 +644,7 @@ def _get_xticks(self, convert_period=False): @classmethod @register_pandas_matplotlib_converters - def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): + def _plot(cls, ax: "Axes", x, y, style=None, is_errorbar: bool = False, **kwds): mask = isna(y) if mask.any(): y = np.ma.array(y) @@ -667,10 +665,10 @@ def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds): if style is not None: args = (x, y, 
style) else: - args = (x, y) + args = (x, y) # type:ignore[assignment] return ax.plot(*args, **kwds) - def _get_index_name(self): + def _get_index_name(self) -> Optional[str]: if isinstance(self.data.index, ABCMultiIndex): name = self.data.index.names if com.any_not_none(*name): @@ -877,7 +875,7 @@ def _get_subplots(self): ax for ax in self.axes[0].get_figure().get_axes() if isinstance(ax, Subplot) ] - def _get_axes_layout(self): + def _get_axes_layout(self) -> Tuple[int, int]: axes = self._get_subplots() x_set = set() y_set = set() @@ -916,15 +914,15 @@ def __init__(self, data, x, y, **kwargs): self.y = y @property - def nseries(self): + def nseries(self) -> int: return 1 - def _post_plot_logic(self, ax, data): + def _post_plot_logic(self, ax: "Axes", data): x, y = self.x, self.y ax.set_ylabel(pprint_thing(y)) ax.set_xlabel(pprint_thing(x)) - def _plot_colorbar(self, ax, **kwds): + def _plot_colorbar(self, ax: "Axes", **kwds): # Addresses issues #10611 and #10678: # When plotting scatterplots and hexbinplots in IPython # inline backend the colorbar axis height tends not to @@ -1080,7 +1078,7 @@ def __init__(self, data, **kwargs): if "x_compat" in self.kwds: self.x_compat = bool(self.kwds.pop("x_compat")) - def _is_ts_plot(self): + def _is_ts_plot(self) -> bool: # this is slightly deceptive return not self.x_compat and self.use_index and self._use_dynamic_x() @@ -1139,7 +1137,9 @@ def _make_plot(self): ax.set_xlim(left, right) @classmethod - def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds): + def _plot( + cls, ax: "Axes", x, y, style=None, column_num=None, stacking_id=None, **kwds + ): # column_num is used to get the target column from plotf in line and # area plots if column_num == 0: @@ -1183,7 +1183,7 @@ def _get_stacking_id(self): return None @classmethod - def _initialize_stacker(cls, ax, stacking_id, n): + def _initialize_stacker(cls, ax: "Axes", stacking_id, n: int): if stacking_id is None: return if not hasattr(ax, 
"_stacker_pos_prior"): @@ -1194,7 +1194,7 @@ def _initialize_stacker(cls, ax, stacking_id, n): ax._stacker_neg_prior[stacking_id] = np.zeros(n) @classmethod - def _get_stacked_values(cls, ax, stacking_id, values, label): + def _get_stacked_values(cls, ax: "Axes", stacking_id, values, label): if stacking_id is None: return values if not hasattr(ax, "_stacker_pos_prior"): @@ -1213,7 +1213,7 @@ def _get_stacked_values(cls, ax, stacking_id, values, label): ) @classmethod - def _update_stacker(cls, ax, stacking_id, values): + def _update_stacker(cls, ax: "Axes", stacking_id, values): if stacking_id is None: return if (values >= 0).all(): @@ -1221,7 +1221,7 @@ def _update_stacker(cls, ax, stacking_id, values): elif (values <= 0).all(): ax._stacker_neg_prior[stacking_id] += values - def _post_plot_logic(self, ax, data): + def _post_plot_logic(self, ax: "Axes", data): from matplotlib.ticker import FixedLocator def get_label(i): @@ -1276,7 +1276,7 @@ def __init__(self, data, **kwargs): @classmethod def _plot( cls, - ax, + ax: "Axes", x, y, style=None, @@ -1318,7 +1318,7 @@ def _plot( res = [rect] return res - def _post_plot_logic(self, ax, data): + def _post_plot_logic(self, ax: "Axes", data): LinePlot._post_plot_logic(self, ax, data) if self.ylim is None: @@ -1372,7 +1372,7 @@ def _args_adjust(self): self.left = np.array(self.left) @classmethod - def _plot(cls, ax, x, y, w, start=0, log=False, **kwds): + def _plot(cls, ax: "Axes", x, y, w, start=0, log=False, **kwds): return ax.bar(x, y, w, bottom=start, log=log, **kwds) @property @@ -1454,7 +1454,7 @@ def _make_plot(self): ) self._add_legend_handle(rect, label, index=i) - def _post_plot_logic(self, ax, data): + def _post_plot_logic(self, ax: "Axes", data): if self.use_index: str_index = [pprint_thing(key) for key in data.index] else: @@ -1466,7 +1466,7 @@ def _post_plot_logic(self, ax, data): self._decorate_ticks(ax, name, str_index, s_edge, e_edge) - def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge): 
+ def _decorate_ticks(self, ax: "Axes", name, ticklabels, start_edge, end_edge): ax.set_xlim((start_edge, end_edge)) if self.xticks is not None: @@ -1489,10 +1489,10 @@ def _start_base(self): return self.left @classmethod - def _plot(cls, ax, x, y, w, start=0, log=False, **kwds): + def _plot(cls, ax: "Axes", x, y, w, start=0, log=False, **kwds): return ax.barh(x, y, w, left=start, log=log, **kwds) - def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge): + def _decorate_ticks(self, ax: "Axes", name, ticklabels, start_edge, end_edge): # horizontal bars ax.set_ylim((start_edge, end_edge)) ax.set_yticks(self.tick_pos) diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index ee41479b3c7c9..ffd46d1b191db 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -1,3 +1,5 @@ +from typing import TYPE_CHECKING + import numpy as np from pandas.core.dtypes.common import is_integer, is_list_like @@ -8,6 +10,9 @@ from pandas.plotting._matplotlib.core import LinePlot, MPLPlot from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots +if TYPE_CHECKING: + from matplotlib.axes import Axes + class HistPlot(LinePlot): _kind = "hist" @@ -90,7 +95,7 @@ def _make_plot_keywords(self, kwds, y): kwds["bins"] = self.bins return kwds - def _post_plot_logic(self, ax, data): + def _post_plot_logic(self, ax: "Axes", data): if self.orientation == "horizontal": ax.set_xlabel("Frequency") else: From 643d43e5eb99a100ebf73b6aacf72f510fc114f7 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 1 Sep 2020 14:36:20 -0500 Subject: [PATCH 0612/1025] TYP/CLN: cleanup `_openpyxl.py`, add type annotation #36021 (#36022) --- ci/deps/azure-37-locale_slow.yaml | 2 +- ci/deps/azure-37-minimum_versions.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/io/excel/_openpyxl.py | 59 +++++++------------------- setup.cfg | 3 -- 6 files changed, 20 
insertions(+), 50 deletions(-) diff --git a/ci/deps/azure-37-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml index 8000f3e6b9a9c..fbb1ea671d696 100644 --- a/ci/deps/azure-37-locale_slow.yaml +++ b/ci/deps/azure-37-locale_slow.yaml @@ -18,7 +18,7 @@ dependencies: - lxml - matplotlib=3.0.0 - numpy=1.16.* - - openpyxl=2.5.7 + - openpyxl=2.6.0 - python-dateutil - python-blosc - pytz=2017.3 diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml index 05b1957198bc4..31f82f3304db3 100644 --- a/ci/deps/azure-37-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -19,7 +19,7 @@ dependencies: - numba=0.46.0 - numexpr=2.6.8 - numpy=1.16.5 - - openpyxl=2.5.7 + - openpyxl=2.6.0 - pytables=3.4.4 - python-dateutil=2.7.3 - pytz=2017.3 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 4c270117e079e..c9ac1b0d284a3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -274,7 +274,7 @@ html5lib 1.0.1 HTML parser for read_html (see :ref lxml 4.3.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.3 Visualization numba 0.46.0 Alternative execution engine for rolling operations -openpyxl 2.5.7 Reading / writing for xlsx files +openpyxl 2.6.0 Reading / writing for xlsx files pandas-gbq 0.12.0 Google Big Query access psycopg2 2.7 PostgreSQL engine for sqlalchemy pyarrow 0.15.0 Parquet, ORC, and feather reading / writing diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 1617bf66c4f04..76bebd4a9a1cb 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -109,7 +109,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | numba | 0.46.0 | | +-----------------+-----------------+---------+ -| openpyxl | 2.5.7 | | +| openpyxl | 2.6.0 | X | +-----------------+-----------------+---------+ | 
pyarrow | 0.15.0 | X | +-----------------+-----------------+---------+ diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index c2730536af8a3..3c67902d41baa 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -1,4 +1,4 @@ -from typing import List +from typing import TYPE_CHECKING, Dict, List, Optional import numpy as np @@ -8,6 +8,9 @@ from pandas.io.excel._base import ExcelWriter, _BaseExcelReader from pandas.io.excel._util import _validate_freeze_panes +if TYPE_CHECKING: + from openpyxl.descriptors.serialisable import Serialisable + class _OpenpyxlWriter(ExcelWriter): engine = "openpyxl" @@ -22,53 +25,22 @@ def __init__(self, path, engine=None, mode="w", **engine_kwargs): if self.mode == "a": # Load from existing workbook from openpyxl import load_workbook - book = load_workbook(self.path) - self.book = book + self.book = load_workbook(self.path) else: # Create workbook object with default optimized_write=True. self.book = Workbook() if self.book.worksheets: - try: - self.book.remove(self.book.worksheets[0]) - except AttributeError: - - # compat - for openpyxl <= 2.4 - self.book.remove_sheet(self.book.worksheets[0]) + self.book.remove(self.book.worksheets[0]) def save(self): """ Save workbook to disk. """ - return self.book.save(self.path) - - @classmethod - def _convert_to_style(cls, style_dict): - """ - Converts a style_dict to an openpyxl style object. 
- - Parameters - ---------- - style_dict : style dictionary to convert - """ - from openpyxl.style import Style - - xls_style = Style() - for key, value in style_dict.items(): - for nk, nv in value.items(): - if key == "borders": - ( - xls_style.borders.__getattribute__(nk).__setattr__( - "border_style", nv - ) - ) - else: - xls_style.__getattribute__(key).__setattr__(nk, nv) - - return xls_style + self.book.save(self.path) @classmethod - def _convert_to_style_kwargs(cls, style_dict): + def _convert_to_style_kwargs(cls, style_dict: dict) -> Dict[str, "Serialisable"]: """ Convert a style_dict to a set of kwargs suitable for initializing or updating-on-copy an openpyxl v2 style object. @@ -93,7 +65,7 @@ def _convert_to_style_kwargs(cls, style_dict): """ _style_key_map = {"borders": "border"} - style_kwargs = {} + style_kwargs: Dict[str, Serialisable] = {} for k, v in style_dict.items(): if k in _style_key_map: k = _style_key_map[k] @@ -404,7 +376,7 @@ def write_cells( # Write the frame cells using openpyxl. 
sheet_name = self._get_sheet_name(sheet_name) - _style_cache = {} + _style_cache: Dict[str, Dict[str, Serialisable]] = {} if sheet_name in self.sheets: wks = self.sheets[sheet_name] @@ -426,7 +398,7 @@ def write_cells( if fmt: xcell.number_format = fmt - style_kwargs = {} + style_kwargs: Optional[Dict[str, Serialisable]] = {} if cell.style: key = str(cell.style) style_kwargs = _style_cache.get(key) @@ -515,16 +487,17 @@ def get_sheet_by_index(self, index: int): def _convert_cell(self, cell, convert_float: bool) -> Scalar: - # TODO: replace with openpyxl constants + from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC + if cell.is_date: return cell.value - elif cell.data_type == "e": + elif cell.data_type == TYPE_ERROR: return np.nan - elif cell.data_type == "b": + elif cell.data_type == TYPE_BOOL: return bool(cell.value) elif cell.value is None: return "" # compat with xlrd - elif cell.data_type == "n": + elif cell.data_type == TYPE_NUMERIC: # GH5394 if convert_float: val = int(cell.value) diff --git a/setup.cfg b/setup.cfg index 2447a91f88f4e..c952e01c14bea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -217,9 +217,6 @@ check_untyped_defs=False [mypy-pandas.io.excel._base] check_untyped_defs=False -[mypy-pandas.io.excel._openpyxl] -check_untyped_defs=False - [mypy-pandas.io.excel._util] check_untyped_defs=False From 6d610a43a8b459a25d36fcef732e949c9512212a Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 1 Sep 2020 19:17:23 -0400 Subject: [PATCH 0613/1025] CLN: _wrap_applied_output (#36053) --- pandas/core/groupby/generic.py | 36 ++++++++++++---------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 7b45a114e548b..a92e3af0764a7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1206,7 +1206,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): 
key_index = self.grouper.result_index if self.as_index else None if isinstance(first_not_none, Series): - # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() @@ -1218,16 +1217,26 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): v = values[0] - if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: + if not isinstance(v, (np.ndarray, Index, Series)) and self.as_index: + # values are not series or array-like but scalars + # self._selection_name not passed through to Series as the + # result should not take the name of original selection + # of columns + return self.obj._constructor_sliced(values, index=key_index) + + else: if isinstance(v, Series): - applied_index = self._selected_obj._get_axis(self.axis) all_indexed_same = all_indexes_same((x.index for x in values)) - singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 # provide a reduction (Frame -> Series) if groups are # unique if self.squeeze: + applied_index = self._selected_obj._get_axis(self.axis) + singular_series = ( + len(values) == 1 and applied_index.nlevels == 1 + ) + # assign the name to this series if singular_series: values[0].name = keys[0] @@ -1253,18 +1262,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # GH 8467 return self._concat_objects(keys, values, not_indexed_same=True) - # GH6124 if the list of Series have a consistent name, - # then propagate that name to the result. - index = v.index.copy() - if index.name is None: - # Only propagate the series name to the result - # if all series have a consistent name. If the - # series do not have a consistent name, do - # nothing. 
- names = {v.name for v in values} - if len(names) == 1: - index.name = list(names)[0] - # Combine values # vstack+constructor is faster than concat and handles MI-columns stacked_values = np.vstack([np.asarray(v) for v in values]) @@ -1313,13 +1310,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): return self._reindex_output(result) - # values are not series or array-like but scalars - else: - # self._selection_name not passed through to Series as the - # result should not take the name of original selection - # of columns - return self.obj._constructor_sliced(values, index=key_index) - def _transform_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs ): From bf00c6a02caa294a18b83973322981bea6745d41 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Sep 2020 16:31:09 -0700 Subject: [PATCH 0614/1025] REF: implement Block._replace_list (#36020) --- pandas/core/internals/blocks.py | 37 +++++++++++++++++++++++++ pandas/core/internals/managers.py | 45 +++++++++++-------------------- 2 files changed, 52 insertions(+), 30 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1b42df1b0147c..ad388ef3f53b0 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -788,6 +788,43 @@ def _replace_single(self, *args, **kwargs): """ no-op on a non-ObjectBlock """ return self if kwargs["inplace"] else self.copy() + def _replace_list( + self, + src_list: List[Any], + dest_list: List[Any], + masks: List[np.ndarray], + inplace: bool = False, + regex: bool = False, + ) -> List["Block"]: + """ + See BlockManager._replace_list docstring. 
+ """ + src_len = len(src_list) - 1 + + rb = [self if inplace else self.copy()] + for i, (src, dest) in enumerate(zip(src_list, dest_list)): + new_rb: List["Block"] = [] + for blk in rb: + m = masks[i][blk.mgr_locs.indexer] + convert = i == src_len # only convert once at the end + result = blk._replace_coerce( + mask=m, + to_replace=src, + value=dest, + inplace=inplace, + convert=convert, + regex=regex, + ) + if m.any() or convert: + if isinstance(result, list): + new_rb.extend(result) + else: + new_rb.append(result) + else: + new_rb.append(blk) + rb = new_rb + return rb + def setitem(self, indexer, value): """ Attempt self.values[indexer] = value, possibly creating a new array. diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 00321b76cb6bf..389252e7ef0f2 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -3,6 +3,7 @@ import operator import re from typing import ( + Any, DefaultDict, Dict, List, @@ -600,8 +601,12 @@ def replace(self, value, **kwargs) -> "BlockManager": return self.apply("replace", value=value, **kwargs) def replace_list( - self, src_list, dest_list, inplace: bool = False, regex: bool = False - ) -> "BlockManager": + self: T, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> T: """ do a list replace """ inplace = validate_bool_kwarg(inplace, "inplace") @@ -625,34 +630,14 @@ def comp(s: Scalar, mask: np.ndarray, regex: bool = False): masks = [comp(s, mask, regex) for s in src_list] - result_blocks = [] - src_len = len(src_list) - 1 - for blk in self.blocks: - - # its possible to get multiple result blocks here - # replace ALWAYS will return a list - rb = [blk if inplace else blk.copy()] - for i, (s, d) in enumerate(zip(src_list, dest_list)): - new_rb: List[Block] = [] - for b in rb: - m = masks[i][b.mgr_locs.indexer] - convert = i == src_len # only convert once at the end - result = b._replace_coerce( - mask=m, - 
to_replace=s, - value=d, - inplace=inplace, - convert=convert, - regex=regex, - ) - if m.any() or convert: - new_rb = _extend_blocks(result, new_rb) - else: - new_rb.append(b) - rb = new_rb - result_blocks.extend(rb) - - bm = type(self).from_blocks(result_blocks, self.axes) + bm = self.apply( + "_replace_list", + src_list=src_list, + dest_list=dest_list, + masks=masks, + inplace=inplace, + regex=regex, + ) bm._consolidate_inplace() return bm From 1be41f25fd1355bda85911482a1efa5a5bbd0a81 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Sep 2020 16:32:00 -0700 Subject: [PATCH 0615/1025] BUG: PeriodIndex.get_loc incorrectly raising ValueError instead of KeyError (#36015) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/groupby/grouper.py | 6 +++--- pandas/core/indexes/period.py | 2 +- pandas/tests/indexes/period/test_indexing.py | 16 ++++++++++++++++ 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 76bebd4a9a1cb..407e8ba029ada 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -217,7 +217,7 @@ Interval Indexing ^^^^^^^^ - +- Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) - - diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 18970ea0544e4..3017521c6a065 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -755,9 +755,9 @@ def is_in_obj(gpr) -> bool: return False try: return gpr is obj[gpr.name] - except (KeyError, IndexError, ValueError): - # TODO: ValueError: Given date string not likely a datetime. - # should be KeyError? + except (KeyError, IndexError): + # IndexError reached in e.g. 
test_skip_group_keys when we pass + # lambda here return False for i, (gpr, level) in enumerate(zip(keys, levels)): diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 11334803d4583..cdb502199c6f1 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -504,7 +504,7 @@ def get_loc(self, key, method=None, tolerance=None): try: asdt, reso = parse_time_string(key, self.freq) - except DateParseError as err: + except (ValueError, DateParseError) as err: # A string with invalid format raise KeyError(f"Cannot interpret '{key}' as period") from err diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index b61d1d903f89a..d2499b85ad181 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -359,6 +359,22 @@ def test_get_loc2(self): ], ) + def test_get_loc_invalid_string_raises_keyerror(self): + # GH#34240 + pi = pd.period_range("2000", periods=3, name="A") + with pytest.raises(KeyError, match="A"): + pi.get_loc("A") + + ser = pd.Series([1, 2, 3], index=pi) + with pytest.raises(KeyError, match="A"): + ser.loc["A"] + + with pytest.raises(KeyError, match="A"): + ser["A"] + + assert "A" not in ser + assert "A" not in pi + class TestGetIndexer: def test_get_indexer(self): From fe93136b2415745c8c0994f656dd26cb7249ca69 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Sep 2020 00:34:15 +0100 Subject: [PATCH 0616/1025] CI: Unpin MyPy (#36012) --- environment.yml | 2 +- pandas/_config/config.py | 4 +- pandas/core/common.py | 7 +++- pandas/core/computation/expr.py | 2 +- pandas/core/indexes/frozen.py | 6 ++- pandas/core/resample.py | 3 +- requirements-dev.txt | 2 +- setup.cfg | 71 ++++++++++++++++++++++++++++++--- 8 files changed, 81 insertions(+), 16 deletions(-) diff --git a/environment.yml b/environment.yml index 96f2c8d2086c7..4622aac1dc6f8 100644 --- a/environment.yml +++ b/environment.yml @@ -21,7 +21,7 @@ 
dependencies: - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - isort>=5.2.1 # check that imports are in the right order - - mypy=0.730 + - mypy=0.782 - pycodestyle # used by flake8 # documentation diff --git a/pandas/_config/config.py b/pandas/_config/config.py index fb41b37980b2e..0b802f2cc9e69 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -460,9 +460,7 @@ def register_option( path = key.split(".") for k in path: - # NOTE: tokenize.Name is not a public constant - # error: Module has no attribute "Name" [attr-defined] - if not re.match("^" + tokenize.Name + "$", k): # type: ignore[attr-defined] + if not re.match("^" + tokenize.Name + "$", k): raise ValueError(f"{k} is not a valid identifier") if keyword.iskeyword(k): raise ValueError(f"{k} is a python keyword") diff --git a/pandas/core/common.py b/pandas/core/common.py index e7260a9923ee0..6fd4700ab7f3f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta from functools import partial import inspect -from typing import Any, Collection, Iterable, Iterator, List, Union +from typing import Any, Collection, Iterable, Iterator, List, Union, cast import warnings import numpy as np @@ -277,6 +277,11 @@ def maybe_iterable_to_list(obj: Union[Iterable[T], T]) -> Union[Collection[T], T """ if isinstance(obj, abc.Iterable) and not isinstance(obj, abc.Sized): return list(obj) + # error: Incompatible return value type (got + # "Union[pandas.core.common., + # pandas.core.common.1, T]", expected + # "Union[Collection[T], T]") [return-value] + obj = cast(Collection, obj) return obj diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 125ecb0d88036..df71b4fe415f8 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -364,7 +364,7 @@ class BaseExprVisitor(ast.NodeVisitor): 
unary_ops = _unary_ops_syms unary_op_nodes = "UAdd", "USub", "Invert", "Not" - unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes)) + unary_op_nodes_map = {k: v for k, v in zip(unary_ops, unary_op_nodes)} rewrite_map = { ast.Eq: ast.In, diff --git a/pandas/core/indexes/frozen.py b/pandas/core/indexes/frozen.py index 909643d50e9d7..8c4437f2cdeb9 100644 --- a/pandas/core/indexes/frozen.py +++ b/pandas/core/indexes/frozen.py @@ -103,5 +103,7 @@ def __str__(self) -> str: def __repr__(self) -> str: return f"{type(self).__name__}({str(self)})" - __setitem__ = __setslice__ = __delitem__ = __delslice__ = _disabled - pop = append = extend = remove = sort = insert = _disabled + __setitem__ = __setslice__ = _disabled # type: ignore[assignment] + __delitem__ = __delslice__ = _disabled # type: ignore[assignment] + pop = append = extend = _disabled # type: ignore[assignment] + remove = sort = insert = _disabled # type: ignore[assignment] diff --git a/pandas/core/resample.py b/pandas/core/resample.py index fc54128ae5aa6..7b5154756e613 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -966,8 +966,7 @@ def __init__(self, obj, *args, **kwargs): for attr in self._attributes: setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) - # error: Too many arguments for "__init__" of "object" - super().__init__(None) # type: ignore[call-arg] + super().__init__(None) self._groupby = groupby self._groupby.mutated = True self._groupby.grouper.mutated = True diff --git a/requirements-dev.txt b/requirements-dev.txt index 1fca25c9fecd9..cc3775de3a4ba 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -12,7 +12,7 @@ flake8<3.8.0 flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 isort>=5.2.1 -mypy==0.730 +mypy==0.782 pycodestyle gitpython gitdb diff --git a/setup.cfg b/setup.cfg index c952e01c14bea..a47bc88d282ab 100644 --- a/setup.cfg +++ b/setup.cfg @@ -127,10 +127,7 @@ show_error_codes = True [mypy-pandas.tests.*] check_untyped_defs=False 
-[mypy-pandas.conftest] -ignore_errors=True - -[mypy-pandas.tests.tools.test_to_datetime] +[mypy-pandas.conftest,pandas.tests.window.conftest] ignore_errors=True [mypy-pandas._testing] @@ -139,7 +136,22 @@ check_untyped_defs=False [mypy-pandas._version] check_untyped_defs=False -[mypy-pandas.core.arrays.interval] +[mypy-pandas.compat.pickle_compat] +check_untyped_defs=False + +[mypy-pandas.core.apply] +check_untyped_defs=False + +[mypy-pandas.core.arrays.base] +check_untyped_defs=False + +[mypy-pandas.core.arrays.datetimelike] +check_untyped_defs=False + +[mypy-pandas.core.arrays.sparse.array] +check_untyped_defs=False + +[mypy-pandas.core.arrays.string_] check_untyped_defs=False [mypy-pandas.core.base] @@ -151,6 +163,9 @@ check_untyped_defs=False [mypy-pandas.core.computation.expressions] check_untyped_defs=False +[mypy-pandas.core.computation.ops] +check_untyped_defs=False + [mypy-pandas.core.computation.pytables] check_untyped_defs=False @@ -163,6 +178,9 @@ check_untyped_defs=False [mypy-pandas.core.generic] check_untyped_defs=False +[mypy-pandas.core.groupby.base] +check_untyped_defs=False + [mypy-pandas.core.groupby.generic] check_untyped_defs=False @@ -172,15 +190,33 @@ check_untyped_defs=False [mypy-pandas.core.groupby.ops] check_untyped_defs=False +[mypy-pandas.core.indexes.base] +check_untyped_defs=False + +[mypy-pandas.core.indexes.category] +check_untyped_defs=False + +[mypy-pandas.core.indexes.datetimelike] +check_untyped_defs=False + [mypy-pandas.core.indexes.datetimes] check_untyped_defs=False +[mypy-pandas.core.indexes.extension] +check_untyped_defs=False + [mypy-pandas.core.indexes.interval] check_untyped_defs=False [mypy-pandas.core.indexes.multi] check_untyped_defs=False +[mypy-pandas.core.indexes.period] +check_untyped_defs=False + +[mypy-pandas.core.indexes.range] +check_untyped_defs=False + [mypy-pandas.core.internals.blocks] check_untyped_defs=False @@ -190,15 +226,27 @@ check_untyped_defs=False [mypy-pandas.core.internals.managers] 
check_untyped_defs=False +[mypy-pandas.core.internals.ops] +check_untyped_defs=False + [mypy-pandas.core.missing] check_untyped_defs=False [mypy-pandas.core.ops.docstrings] check_untyped_defs=False +[mypy-pandas.core.resample] +check_untyped_defs=False + +[mypy-pandas.core.reshape.concat] +check_untyped_defs=False + [mypy-pandas.core.reshape.merge] check_untyped_defs=False +[mypy-pandas.core.series] +check_untyped_defs=False + [mypy-pandas.core.strings] check_untyped_defs=False @@ -214,6 +262,9 @@ check_untyped_defs=False [mypy-pandas.io.clipboard] check_untyped_defs=False +[mypy-pandas.io.common] +check_untyped_defs=False + [mypy-pandas.io.excel._base] check_untyped_defs=False @@ -226,6 +277,9 @@ check_untyped_defs=False [mypy-pandas.io.formats.css] check_untyped_defs=False +[mypy-pandas.io.formats.csvs] +check_untyped_defs=False + [mypy-pandas.io.formats.excel] check_untyped_defs=False @@ -264,3 +318,10 @@ check_untyped_defs=False [mypy-pandas.plotting._matplotlib.misc] check_untyped_defs=False + +[mypy-pandas.plotting._misc] +check_untyped_defs=False + +[mypy-pandas.util._decorators] +check_untyped_defs=False + From 428d82dc94763e355efec766c0ce51a55293bf8b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Sep 2020 16:37:08 -0700 Subject: [PATCH 0617/1025] ENH: vendor typing_extensions (#36000) --- pandas/_vendored/__init__.py | 0 pandas/_vendored/typing_extensions.py | 2466 +++++++++++++++++++++++++ setup.cfg | 9 +- 3 files changed, 2473 insertions(+), 2 deletions(-) create mode 100644 pandas/_vendored/__init__.py create mode 100644 pandas/_vendored/typing_extensions.py diff --git a/pandas/_vendored/__init__.py b/pandas/_vendored/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/_vendored/typing_extensions.py b/pandas/_vendored/typing_extensions.py new file mode 100644 index 0000000000000..53df8da175a56 --- /dev/null +++ b/pandas/_vendored/typing_extensions.py @@ -0,0 +1,2466 @@ +""" +vendored copy of 
typing_extensions, copied from +https://raw.githubusercontent.com/python/typing/master/typing_extensions/src_py3/typing_extensions.py + +on 2020-08-30. + +typing_extensions is distributed under the Python Software Foundation License. + +This is not a direct copy/paste of the original file. Changes are: + - this docstring + - ran `black` + - ran `isort` + - edited strings split by black to adhere to pandas style conventions + - AsyncContextManager is defined without `exec` + - python2-style super usages are updated + - replace foo[dot]__class__ with type(foo) + - Change a comment-syntax annotation in a docstring to newer syntax +""" + +# These are used by Protocol implementation +# We use internal typing helpers here, but this significantly reduces +# code duplication. (Also this is only until Protocol is in typing.) +import abc +import collections +import collections.abc as collections_abc +import contextlib +import operator +import sys +import typing +from typing import Callable, Generic, Tuple, TypeVar + +# After PEP 560, internal typing API was substantially reworked. +# This is especially important for Protocol class which uses internal APIs +# quite extensivelly. +PEP_560 = sys.version_info[:3] >= (3, 7, 0) + +if PEP_560: + GenericMeta = TypingMeta = type +else: + from typing import GenericMeta, TypingMeta +OLD_GENERICS = False +try: + from typing import _next_in_mro, _type_check, _type_vars +except ImportError: + OLD_GENERICS = True +try: + from typing import _subs_tree # noqa + + SUBS_TREE = True +except ImportError: + SUBS_TREE = False +try: + from typing import _tp_cache +except ImportError: + + def _tp_cache(x): + return x + + +try: + from typing import _TypingEllipsis, _TypingEmpty +except ImportError: + + class _TypingEllipsis: + pass + + class _TypingEmpty: + pass + + +# The two functions below are copies of typing internal helpers. 
+# They are needed by _ProtocolMeta + + +def _no_slots_copy(dct): + dict_copy = dict(dct) + if "__slots__" in dict_copy: + for slot in dict_copy["__slots__"]: + dict_copy.pop(slot, None) + return dict_copy + + +def _check_generic(cls, parameters): + if not cls.__parameters__: + raise TypeError("%s is not a generic class" % repr(cls)) + alen = len(parameters) + elen = len(cls.__parameters__) + if alen != elen: + raise TypeError( + "Too %s parameters for %s; actual %s, expected %s" + % ("many" if alen > elen else "few", repr(cls), alen, elen) + ) + + +if hasattr(typing, "_generic_new"): + _generic_new = typing._generic_new +else: + # Note: The '_generic_new(...)' function is used as a part of the + # process of creating a generic type and was added to the typing module + # as of Python 3.5.3. + # + # We've defined '_generic_new(...)' below to exactly match the behavior + # implemented in older versions of 'typing' bundled with Python 3.5.0 to + # 3.5.2. This helps eliminate redundancy when defining collection types + # like 'Deque' later. + # + # See https://github.com/python/typing/pull/308 for more details -- in + # particular, compare and contrast the definition of types like + # 'typing.List' before and after the merge. + + def _generic_new(base_cls, cls, *args, **kwargs): + return base_cls.__new__(cls, *args, **kwargs) + + +# See https://github.com/python/typing/pull/439 +if hasattr(typing, "_geqv"): + from typing import _geqv + + _geqv_defined = True +else: + _geqv = None + _geqv_defined = False + +if sys.version_info[:2] >= (3, 6): + import _collections_abc + + _check_methods_in_mro = _collections_abc._check_methods +else: + + def _check_methods_in_mro(C, *methods): + mro = C.__mro__ + for method in methods: + for B in mro: + if method in B.__dict__: + if B.__dict__[method] is None: + return NotImplemented + break + else: + return NotImplemented + return True + + +# Please keep __all__ alphabetized within each category. 
+__all__ = [ + # Super-special typing primitives. + "ClassVar", + "Final", + "Type", + # ABCs (from collections.abc). + # The following are added depending on presence + # of their non-generic counterparts in stdlib: + # 'Awaitable', + # 'AsyncIterator', + # 'AsyncIterable', + # 'Coroutine', + # 'AsyncGenerator', + # 'AsyncContextManager', + # 'ChainMap', + # Concrete collection types. + "ContextManager", + "Counter", + "Deque", + "DefaultDict", + "TypedDict", + # Structural checks, a.k.a. protocols. + "SupportsIndex", + # One-off things. + "final", + "IntVar", + "Literal", + "NewType", + "overload", + "Text", + "TYPE_CHECKING", +] + +# Annotated relies on substitution trees of pep 560. It will not work for +# versions of typing older than 3.5.3 +HAVE_ANNOTATED = PEP_560 or SUBS_TREE + +if PEP_560: + __all__.extend(["get_args", "get_origin", "get_type_hints"]) + +if HAVE_ANNOTATED: + __all__.append("Annotated") + +# Protocols are hard to backport to the original version of typing 3.5.0 +HAVE_PROTOCOLS = sys.version_info[:3] != (3, 5, 0) + +if HAVE_PROTOCOLS: + __all__.extend(["Protocol", "runtime", "runtime_checkable"]) + + +# TODO +if hasattr(typing, "NoReturn"): + NoReturn = typing.NoReturn +elif hasattr(typing, "_FinalTypingBase"): + + class _NoReturn(typing._FinalTypingBase, _root=True): + """Special type indicating functions that never return. + Example:: + + from typing import NoReturn + + def stop() -> NoReturn: + raise Exception('no way') + + This type is invalid in other positions, e.g., ``List[NoReturn]`` + will fail in static type checkers. 
+ """ + + __slots__ = () + + def __instancecheck__(self, obj): + raise TypeError("NoReturn cannot be used with isinstance().") + + def __subclasscheck__(self, cls): + raise TypeError("NoReturn cannot be used with issubclass().") + + NoReturn = _NoReturn(_root=True) +else: + + class _NoReturnMeta(typing.TypingMeta): + """Metaclass for NoReturn""" + + def __new__(cls, name, bases, namespace, _root=False): + return super().__new__(cls, name, bases, namespace, _root=_root) + + def __instancecheck__(self, obj): + raise TypeError("NoReturn cannot be used with isinstance().") + + def __subclasscheck__(self, cls): + raise TypeError("NoReturn cannot be used with issubclass().") + + class NoReturn(typing.Final, metaclass=_NoReturnMeta, _root=True): + """Special type indicating functions that never return. + Example:: + + from typing import NoReturn + + def stop() -> NoReturn: + raise Exception('no way') + + This type is invalid in other positions, e.g., ``List[NoReturn]`` + will fail in static type checkers. + """ + + __slots__ = () + + +# Some unconstrained type variables. These are used by the container types. +# (These are not for export.) +T = typing.TypeVar("T") # Any type. +KT = typing.TypeVar("KT") # Key type. +VT = typing.TypeVar("VT") # Value type. +T_co = typing.TypeVar("T_co", covariant=True) # Any type covariant containers. +V_co = typing.TypeVar("V_co", covariant=True) # Any type covariant containers. +VT_co = typing.TypeVar("VT_co", covariant=True) # Value type covariant containers. +T_contra = typing.TypeVar("T_contra", contravariant=True) # Ditto contravariant. + + +if hasattr(typing, "ClassVar"): + ClassVar = typing.ClassVar +elif hasattr(typing, "_FinalTypingBase"): + + class _ClassVar(typing._FinalTypingBase, _root=True): + """Special type construct to mark class variables. + + An annotation wrapped in ClassVar indicates that a given + attribute is intended to be used as a class variable and + should not be set on instances of that class. 
Usage:: + + class Starship: + stats: ClassVar[Dict[str, int]] = {} # class variable + damage: int = 10 # instance variable + + ClassVar accepts only types and cannot be further subscribed. + + Note that ClassVar is not a class itself, and should not + be used with isinstance() or issubclass(). + """ + + __slots__ = ("__type__",) + + def __init__(self, tp=None, **kwds): + self.__type__ = tp + + def __getitem__(self, item): + cls = type(self) + if self.__type__ is None: + return cls( + typing._type_check( + item, "{} accepts only single type.".format(cls.__name__[1:]) + ), + _root=True, + ) + raise TypeError("{} cannot be further subscripted".format(cls.__name__[1:])) + + def _eval_type(self, globalns, localns): + new_tp = typing._eval_type(self.__type__, globalns, localns) + if new_tp == self.__type__: + return self + return type(self)(new_tp, _root=True) + + def __repr__(self): + r = super().__repr__() + if self.__type__ is not None: + r += "[{}]".format(typing._type_repr(self.__type__)) + return r + + def __hash__(self): + return hash((type(self).__name__, self.__type__)) + + def __eq__(self, other): + if not isinstance(other, _ClassVar): + return NotImplemented + if self.__type__ is not None: + return self.__type__ == other.__type__ + return self is other + + ClassVar = _ClassVar(_root=True) +else: + + class _ClassVarMeta(typing.TypingMeta): + """Metaclass for ClassVar""" + + def __new__(cls, name, bases, namespace, tp=None, _root=False): + self = super().__new__(cls, name, bases, namespace, _root=_root) + if tp is not None: + self.__type__ = tp + return self + + def __instancecheck__(self, obj): + raise TypeError("ClassVar cannot be used with isinstance().") + + def __subclasscheck__(self, cls): + raise TypeError("ClassVar cannot be used with issubclass().") + + def __getitem__(self, item): + cls = type(self) + if self.__type__ is not None: + raise TypeError( + "{} cannot be further subscripted".format(cls.__name__[1:]) + ) + + param = typing._type_check( + 
item, "{} accepts only single type.".format(cls.__name__[1:]) + ) + return cls( + self.__name__, self.__bases__, dict(self.__dict__), tp=param, _root=True + ) + + def _eval_type(self, globalns, localns): + new_tp = typing._eval_type(self.__type__, globalns, localns) + if new_tp == self.__type__: + return self + return type(self)( + self.__name__, + self.__bases__, + dict(self.__dict__), + tp=self.__type__, + _root=True, + ) + + def __repr__(self): + r = super().__repr__() + if self.__type__ is not None: + r += "[{}]".format(typing._type_repr(self.__type__)) + return r + + def __hash__(self): + return hash((type(self).__name__, self.__type__)) + + def __eq__(self, other): + if not isinstance(other, ClassVar): + return NotImplemented + if self.__type__ is not None: + return self.__type__ == other.__type__ + return self is other + + class ClassVar(typing.Final, metaclass=_ClassVarMeta, _root=True): + """Special type construct to mark class variables. + + An annotation wrapped in ClassVar indicates that a given + attribute is intended to be used as a class variable and + should not be set on instances of that class. Usage:: + + class Starship: + stats: ClassVar[Dict[str, int]] = {} # class variable + damage: int = 10 # instance variable + + ClassVar accepts only types and cannot be further subscribed. + + Note that ClassVar is not a class itself, and should not + be used with isinstance() or issubclass(). + """ + + __type__ = None + + +# On older versions of typing there is an internal class named "Final". +if hasattr(typing, "Final") and sys.version_info[:2] >= (3, 7): + Final = typing.Final +elif sys.version_info[:2] >= (3, 7): + + class _FinalForm(typing._SpecialForm, _root=True): + def __repr__(self): + return "typing_extensions." 
+ self._name + + def __getitem__(self, parameters): + item = typing._type_check( + parameters, "{} accepts only single type".format(self._name) + ) + return _GenericAlias(self, (item,)) + + Final = _FinalForm( + "Final", + doc="""A special typing construct to indicate that a name + cannot be re-assigned or overridden in a subclass. + For example: + + MAX_SIZE: Final = 9000 + MAX_SIZE += 1 # Error reported by type checker + + class Connection: + TIMEOUT: Final[int] = 10 + class FastConnector(Connection): + TIMEOUT = 1 # Error reported by type checker + + There is no runtime checking of these properties.""", + ) +elif hasattr(typing, "_FinalTypingBase"): + + class _Final(typing._FinalTypingBase, _root=True): + """A special typing construct to indicate that a name + cannot be re-assigned or overridden in a subclass. + For example: + + MAX_SIZE: Final = 9000 + MAX_SIZE += 1 # Error reported by type checker + + class Connection: + TIMEOUT: Final[int] = 10 + class FastConnector(Connection): + TIMEOUT = 1 # Error reported by type checker + + There is no runtime checking of these properties. 
+ """ + + __slots__ = ("__type__",) + + def __init__(self, tp=None, **kwds): + self.__type__ = tp + + def __getitem__(self, item): + cls = type(self) + if self.__type__ is None: + return cls( + typing._type_check( + item, "{} accepts only single type.".format(cls.__name__[1:]) + ), + _root=True, + ) + raise TypeError("{} cannot be further subscripted".format(cls.__name__[1:])) + + def _eval_type(self, globalns, localns): + new_tp = typing._eval_type(self.__type__, globalns, localns) + if new_tp == self.__type__: + return self + return type(self)(new_tp, _root=True) + + def __repr__(self): + r = super().__repr__() + if self.__type__ is not None: + r += "[{}]".format(typing._type_repr(self.__type__)) + return r + + def __hash__(self): + return hash((type(self).__name__, self.__type__)) + + def __eq__(self, other): + if not isinstance(other, _Final): + return NotImplemented + if self.__type__ is not None: + return self.__type__ == other.__type__ + return self is other + + Final = _Final(_root=True) +else: + + class _FinalMeta(typing.TypingMeta): + """Metaclass for Final""" + + def __new__(cls, name, bases, namespace, tp=None, _root=False): + self = super().__new__(cls, name, bases, namespace, _root=_root) + if tp is not None: + self.__type__ = tp + return self + + def __instancecheck__(self, obj): + raise TypeError("Final cannot be used with isinstance().") + + def __subclasscheck__(self, cls): + raise TypeError("Final cannot be used with issubclass().") + + def __getitem__(self, item): + cls = type(self) + if self.__type__ is not None: + raise TypeError( + "{} cannot be further subscripted".format(cls.__name__[1:]) + ) + + param = typing._type_check( + item, "{} accepts only single type.".format(cls.__name__[1:]) + ) + return cls( + self.__name__, self.__bases__, dict(self.__dict__), tp=param, _root=True + ) + + def _eval_type(self, globalns, localns): + new_tp = typing._eval_type(self.__type__, globalns, localns) + if new_tp == self.__type__: + return self + return 
type(self)( + self.__name__, + self.__bases__, + dict(self.__dict__), + tp=self.__type__, + _root=True, + ) + + def __repr__(self): + r = super().__repr__() + if self.__type__ is not None: + r += "[{}]".format(typing._type_repr(self.__type__)) + return r + + def __hash__(self): + return hash((type(self).__name__, self.__type__)) + + def __eq__(self, other): + if not isinstance(other, Final): + return NotImplemented + if self.__type__ is not None: + return self.__type__ == other.__type__ + return self is other + + class Final(typing.Final, metaclass=_FinalMeta, _root=True): + """A special typing construct to indicate that a name + cannot be re-assigned or overridden in a subclass. + For example: + + MAX_SIZE: Final = 9000 + MAX_SIZE += 1 # Error reported by type checker + + class Connection: + TIMEOUT: Final[int] = 10 + class FastConnector(Connection): + TIMEOUT = 1 # Error reported by type checker + + There is no runtime checking of these properties. + """ + + __type__ = None + + +if hasattr(typing, "final"): + final = typing.final +else: + + def final(f): + """This decorator can be used to indicate to type checkers that + the decorated method cannot be overridden, and decorated class + cannot be subclassed. For example: + + class Base: + @final + def done(self) -> None: + ... + class Sub(Base): + def done(self) -> None: # Error reported by type checker + ... + @final + class Leaf: + ... + class Other(Leaf): # Error reported by type checker + ... + + There is no runtime checking of these properties. + """ + return f + + +def IntVar(name): + return TypeVar(name) + + +if hasattr(typing, "Literal"): + Literal = typing.Literal +elif sys.version_info[:2] >= (3, 7): + + class _LiteralForm(typing._SpecialForm, _root=True): + def __repr__(self): + return "typing_extensions." 
+ self._name + + def __getitem__(self, parameters): + return _GenericAlias(self, parameters) + + Literal = _LiteralForm( + "Literal", + doc="""A type that can be used to indicate to type checkers + that the corresponding value has a value literally equivalent + to the provided parameter. For example: + + var: Literal[4] = 4 + + The type checker understands that 'var' is literally equal to + the value 4 and no other value. + + Literal[...] cannot be subclassed. There is no runtime + checking verifying that the parameter is actually a value + instead of a type.""", + ) +elif hasattr(typing, "_FinalTypingBase"): + + class _Literal(typing._FinalTypingBase, _root=True): + """A type that can be used to indicate to type checkers that the + corresponding value has a value literally equivalent to the + provided parameter. For example: + + var: Literal[4] = 4 + + The type checker understands that 'var' is literally equal to the + value 4 and no other value. + + Literal[...] cannot be subclassed. There is no runtime checking + verifying that the parameter is actually a value instead of a type. 
+ """ + + __slots__ = ("__values__",) + + def __init__(self, values=None, **kwds): + self.__values__ = values + + def __getitem__(self, values): + cls = type(self) + if self.__values__ is None: + if not isinstance(values, tuple): + values = (values,) + return cls(values, _root=True) + raise TypeError("{} cannot be further subscripted".format(cls.__name__[1:])) + + def _eval_type(self, globalns, localns): + return self + + def __repr__(self): + r = super().__repr__() + if self.__values__ is not None: + r += "[{}]".format(", ".join(map(typing._type_repr, self.__values__))) + return r + + def __hash__(self): + return hash((type(self).__name__, self.__values__)) + + def __eq__(self, other): + if not isinstance(other, _Literal): + return NotImplemented + if self.__values__ is not None: + return self.__values__ == other.__values__ + return self is other + + Literal = _Literal(_root=True) +else: + + class _LiteralMeta(typing.TypingMeta): + """Metaclass for Literal""" + + def __new__(cls, name, bases, namespace, values=None, _root=False): + self = super().__new__(cls, name, bases, namespace, _root=_root) + if values is not None: + self.__values__ = values + return self + + def __instancecheck__(self, obj): + raise TypeError("Literal cannot be used with isinstance().") + + def __subclasscheck__(self, cls): + raise TypeError("Literal cannot be used with issubclass().") + + def __getitem__(self, item): + cls = type(self) + if self.__values__ is not None: + raise TypeError( + "{} cannot be further subscripted".format(cls.__name__[1:]) + ) + + if not isinstance(item, tuple): + item = (item,) + return cls( + self.__name__, + self.__bases__, + dict(self.__dict__), + values=item, + _root=True, + ) + + def _eval_type(self, globalns, localns): + return self + + def __repr__(self): + r = super().__repr__() + if self.__values__ is not None: + r += "[{}]".format(", ".join(map(typing._type_repr, self.__values__))) + return r + + def __hash__(self): + return hash((type(self).__name__, 
self.__values__)) + + def __eq__(self, other): + if not isinstance(other, Literal): + return NotImplemented + if self.__values__ is not None: + return self.__values__ == other.__values__ + return self is other + + class Literal(typing.Final, metaclass=_LiteralMeta, _root=True): + """A type that can be used to indicate to type checkers that the + corresponding value has a value literally equivalent to the + provided parameter. For example: + + var: Literal[4] = 4 + + The type checker understands that 'var' is literally equal to the + value 4 and no other value. + + Literal[...] cannot be subclassed. There is no runtime checking + verifying that the parameter is actually a value instead of a type. + """ + + __values__ = None + + +def _overload_dummy(*args, **kwds): + """Helper for @overload to raise when called.""" + raise NotImplementedError( + "You should not call an overloaded function. " + "A series of @overload-decorated functions " + "outside a stub module should always be followed " + "by an implementation that is not @overload-ed." + ) + + +def overload(func): + """Decorator for overloaded functions/methods. + + In a stub file, place two or more stub definitions for the same + function in a row, each decorated with @overload. For example: + + @overload + def utf8(value: None) -> None: ... + @overload + def utf8(value: bytes) -> bytes: ... + @overload + def utf8(value: str) -> bytes: ... + + In a non-stub file (i.e. a regular .py file), do the same but + follow it with an implementation. The implementation should *not* + be decorated with @overload. For example: + + @overload + def utf8(value: None) -> None: ... + @overload + def utf8(value: bytes) -> bytes: ... + @overload + def utf8(value: str) -> bytes: ... + def utf8(value): + # implementation goes here + """ + return _overload_dummy + + +# This is not a real generic class. Don't use outside annotations. +if hasattr(typing, "Type"): + Type = typing.Type +else: + # Internal type variable used for Type[]. 
+ CT_co = typing.TypeVar("CT_co", covariant=True, bound=type) + + class Type(typing.Generic[CT_co], extra=type): + """A special construct usable to annotate class objects. + + For example, suppose we have the following classes:: + + class User: ... # Abstract base for User classes + class BasicUser(User): ... + class ProUser(User): ... + class TeamUser(User): ... + + And a function that takes a class argument that's a subclass of + User and returns an instance of the corresponding class:: + + U = TypeVar('U', bound=User) + def new_user(user_class: Type[U]) -> U: + user = user_class() + # (Here we could write the user object to a database) + return user + joe = new_user(BasicUser) + + At this point the type checker knows that joe has type BasicUser. + """ + + __slots__ = () + + +# Various ABCs mimicking those in collections.abc. +# A few are simply re-exported for completeness. + + +def _define_guard(type_name): + """ + Returns True if the given type isn't defined in typing but + is defined in collections_abc. + + Adds the type to __all__ if the collection is found in either + typing or collection_abc. + """ + if hasattr(typing, type_name): + __all__.append(type_name) + globals()[type_name] = getattr(typing, type_name) + return False + elif hasattr(collections_abc, type_name): + __all__.append(type_name) + return True + else: + return False + + +class _ExtensionsGenericMeta(GenericMeta): + def __subclasscheck__(self, subclass): + """This mimics a more modern GenericMeta.__subclasscheck__() logic + (that does not have problems with recursion) to work around interactions + between collections, typing, and typing_extensions on older + versions of Python, see https://github.com/python/typing/issues/501. 
+ """ + if sys.version_info[:3] >= (3, 5, 3) or sys.version_info[:3] < (3, 5, 0): + if self.__origin__ is not None: + if sys._getframe(1).f_globals["__name__"] not in ["abc", "functools"]: + raise TypeError( + "Parameterized generics cannot be used with class " + "or instance checks" + ) + return False + if not self.__extra__: + return super().__subclasscheck__(subclass) + res = self.__extra__.__subclasshook__(subclass) + if res is not NotImplemented: + return res + if self.__extra__ in subclass.__mro__: + return True + for scls in self.__extra__.__subclasses__(): + if isinstance(scls, GenericMeta): + continue + if issubclass(subclass, scls): + return True + return False + + +if _define_guard("Awaitable"): + + class Awaitable( + typing.Generic[T_co], + metaclass=_ExtensionsGenericMeta, + extra=collections_abc.Awaitable, + ): + __slots__ = () + + +if _define_guard("Coroutine"): + + class Coroutine( + Awaitable[V_co], + typing.Generic[T_co, T_contra, V_co], + metaclass=_ExtensionsGenericMeta, + extra=collections_abc.Coroutine, + ): + __slots__ = () + + +if _define_guard("AsyncIterable"): + + class AsyncIterable( + typing.Generic[T_co], + metaclass=_ExtensionsGenericMeta, + extra=collections_abc.AsyncIterable, + ): + __slots__ = () + + +if _define_guard("AsyncIterator"): + + class AsyncIterator( + AsyncIterable[T_co], + metaclass=_ExtensionsGenericMeta, + extra=collections_abc.AsyncIterator, + ): + __slots__ = () + + +if hasattr(typing, "Deque"): + Deque = typing.Deque +elif _geqv_defined: + + class Deque( + collections.deque, + typing.MutableSequence[T], + metaclass=_ExtensionsGenericMeta, + extra=collections.deque, + ): + __slots__ = () + + def __new__(cls, *args, **kwds): + if _geqv(cls, Deque): + return collections.deque(*args, **kwds) + return _generic_new(collections.deque, cls, *args, **kwds) + + +else: + + class Deque( + collections.deque, + typing.MutableSequence[T], + metaclass=_ExtensionsGenericMeta, + extra=collections.deque, + ): + __slots__ = () + + def 
__new__(cls, *args, **kwds): + if cls._gorg is Deque: + return collections.deque(*args, **kwds) + return _generic_new(collections.deque, cls, *args, **kwds) + + +if hasattr(typing, "ContextManager"): + ContextManager = typing.ContextManager +elif hasattr(contextlib, "AbstractContextManager"): + + class ContextManager( + typing.Generic[T_co], + metaclass=_ExtensionsGenericMeta, + extra=contextlib.AbstractContextManager, + ): + __slots__ = () + + +else: + + class ContextManager(typing.Generic[T_co]): + __slots__ = () + + def __enter__(self): + return self + + @abc.abstractmethod + def __exit__(self, exc_type, exc_value, traceback): + return None + + @classmethod + def __subclasshook__(cls, C): + if cls is ContextManager: + # In Python 3.6+, it is possible to set a method to None to + # explicitly indicate that the class does not implement an ABC + # (https://bugs.python.org/issue25958), but we do not support + # that pattern here because this fallback class is only used + # in Python 3.5 and earlier. 
+ if any("__enter__" in B.__dict__ for B in C.__mro__) and any( + "__exit__" in B.__dict__ for B in C.__mro__ + ): + return True + return NotImplemented + + +if hasattr(typing, "AsyncContextManager"): + AsyncContextManager = typing.AsyncContextManager + __all__.append("AsyncContextManager") +elif hasattr(contextlib, "AbstractAsyncContextManager"): + + class AsyncContextManager( + typing.Generic[T_co], + metaclass=_ExtensionsGenericMeta, + extra=contextlib.AbstractAsyncContextManager, + ): + __slots__ = () + + __all__.append("AsyncContextManager") + +else: + + class AsyncContextManager(typing.Generic[T_co]): + __slots__ = () + + async def __aenter__(self): + return self + + @abc.abstractmethod + async def __aexit__(self, exc_type, exc_value, traceback): + return None + + @classmethod + def __subclasshook__(cls, C): + if cls is AsyncContextManager: + return _check_methods_in_mro(C, "__aenter__", "__aexit__") + return NotImplemented + + __all__.append("AsyncContextManager") + + +if hasattr(typing, "DefaultDict"): + DefaultDict = typing.DefaultDict +elif _geqv_defined: + + class DefaultDict( + collections.defaultdict, + typing.MutableMapping[KT, VT], + metaclass=_ExtensionsGenericMeta, + extra=collections.defaultdict, + ): + + __slots__ = () + + def __new__(cls, *args, **kwds): + if _geqv(cls, DefaultDict): + return collections.defaultdict(*args, **kwds) + return _generic_new(collections.defaultdict, cls, *args, **kwds) + + +else: + + class DefaultDict( + collections.defaultdict, + typing.MutableMapping[KT, VT], + metaclass=_ExtensionsGenericMeta, + extra=collections.defaultdict, + ): + + __slots__ = () + + def __new__(cls, *args, **kwds): + if cls._gorg is DefaultDict: + return collections.defaultdict(*args, **kwds) + return _generic_new(collections.defaultdict, cls, *args, **kwds) + + +if hasattr(typing, "Counter"): + Counter = typing.Counter +elif (3, 5, 0) <= sys.version_info[:3] <= (3, 5, 1): + assert _geqv_defined + _TInt = typing.TypeVar("_TInt") + + class 
_CounterMeta(typing.GenericMeta): + """Metaclass for Counter""" + + def __getitem__(self, item): + return super().__getitem__((item, int)) + + class Counter( + collections.Counter, + typing.Dict[T, int], + metaclass=_CounterMeta, + extra=collections.Counter, + ): + + __slots__ = () + + def __new__(cls, *args, **kwds): + if _geqv(cls, Counter): + return collections.Counter(*args, **kwds) + return _generic_new(collections.Counter, cls, *args, **kwds) + + +elif _geqv_defined: + + class Counter( + collections.Counter, + typing.Dict[T, int], + metaclass=_ExtensionsGenericMeta, + extra=collections.Counter, + ): + + __slots__ = () + + def __new__(cls, *args, **kwds): + if _geqv(cls, Counter): + return collections.Counter(*args, **kwds) + return _generic_new(collections.Counter, cls, *args, **kwds) + + +else: + + class Counter( + collections.Counter, + typing.Dict[T, int], + metaclass=_ExtensionsGenericMeta, + extra=collections.Counter, + ): + + __slots__ = () + + def __new__(cls, *args, **kwds): + if cls._gorg is Counter: + return collections.Counter(*args, **kwds) + return _generic_new(collections.Counter, cls, *args, **kwds) + + +if hasattr(typing, "ChainMap"): + ChainMap = typing.ChainMap + __all__.append("ChainMap") +elif hasattr(collections, "ChainMap"): + # ChainMap only exists in 3.3+ + if _geqv_defined: + + class ChainMap( + collections.ChainMap, + typing.MutableMapping[KT, VT], + metaclass=_ExtensionsGenericMeta, + extra=collections.ChainMap, + ): + + __slots__ = () + + def __new__(cls, *args, **kwds): + if _geqv(cls, ChainMap): + return collections.ChainMap(*args, **kwds) + return _generic_new(collections.ChainMap, cls, *args, **kwds) + + else: + + class ChainMap( + collections.ChainMap, + typing.MutableMapping[KT, VT], + metaclass=_ExtensionsGenericMeta, + extra=collections.ChainMap, + ): + + __slots__ = () + + def __new__(cls, *args, **kwds): + if cls._gorg is ChainMap: + return collections.ChainMap(*args, **kwds) + return _generic_new(collections.ChainMap, 
cls, *args, **kwds) + + __all__.append("ChainMap") + + +if _define_guard("AsyncGenerator"): + + class AsyncGenerator( + AsyncIterator[T_co], + typing.Generic[T_co, T_contra], + metaclass=_ExtensionsGenericMeta, + extra=collections_abc.AsyncGenerator, + ): + __slots__ = () + + +if hasattr(typing, "NewType"): + NewType = typing.NewType +else: + + def NewType(name, tp): + """NewType creates simple unique types with almost zero + runtime overhead. NewType(name, tp) is considered a subtype of tp + by static type checkers. At runtime, NewType(name, tp) returns + a dummy function that simply returns its argument. Usage:: + + UserId = NewType('UserId', int) + + def name_by_id(user_id: UserId) -> str: + ... + + UserId('user') # Fails type check + + name_by_id(42) # Fails type check + name_by_id(UserId(42)) # OK + + num: int = UserId(5) + 1 + """ + + def new_type(x): + return x + + new_type.__name__ = name + new_type.__supertype__ = tp + return new_type + + +if hasattr(typing, "Text"): + Text = typing.Text +else: + Text = str + + +if hasattr(typing, "TYPE_CHECKING"): + TYPE_CHECKING = typing.TYPE_CHECKING +else: + # Constant that's True when type checking, but False here. 
+ TYPE_CHECKING = False + + +def _gorg(cls): + """This function exists for compatibility with old typing versions.""" + assert isinstance(cls, GenericMeta) + if hasattr(cls, "_gorg"): + return cls._gorg + while cls.__origin__ is not None: + cls = cls.__origin__ + return cls + + +if OLD_GENERICS: + + def _next_in_mro(cls): # noqa + """This function exists for compatibility with old typing versions.""" + next_in_mro = object + for i, c in enumerate(cls.__mro__[:-1]): + if isinstance(c, GenericMeta) and _gorg(c) is Generic: + next_in_mro = cls.__mro__[i + 1] + return next_in_mro + + +_PROTO_WHITELIST = [ + "Callable", + "Awaitable", + "Iterable", + "Iterator", + "AsyncIterable", + "AsyncIterator", + "Hashable", + "Sized", + "Container", + "Collection", + "Reversible", + "ContextManager", + "AsyncContextManager", +] + + +def _get_protocol_attrs(cls): + attrs = set() + for base in cls.__mro__[:-1]: # without object + if base.__name__ in ("Protocol", "Generic"): + continue + annotations = getattr(base, "__annotations__", {}) + for attr in list(base.__dict__.keys()) + list(annotations.keys()): + if not attr.startswith("_abc_") and attr not in ( + "__abstractmethods__", + "__annotations__", + "__weakref__", + "_is_protocol", + "_is_runtime_protocol", + "__dict__", + "__args__", + "__slots__", + "__next_in_mro__", + "__parameters__", + "__origin__", + "__orig_bases__", + "__extra__", + "__tree_hash__", + "__doc__", + "__subclasshook__", + "__init__", + "__new__", + "__module__", + "_MutableMapping__marker", + "_gorg", + ): + attrs.add(attr) + return attrs + + +def _is_callable_members_only(cls): + return all(callable(getattr(cls, attr, None)) for attr in _get_protocol_attrs(cls)) + + +if hasattr(typing, "Protocol"): + Protocol = typing.Protocol +elif HAVE_PROTOCOLS and not PEP_560: + + class _ProtocolMeta(GenericMeta): + """Internal metaclass for Protocol. + + This exists so Protocol classes can be generic without deriving + from Generic. 
+ """ + + if not OLD_GENERICS: + + def __new__( + cls, + name, + bases, + namespace, + tvars=None, + args=None, + origin=None, + extra=None, + orig_bases=None, + ): + # This is just a version copied from GenericMeta.__new__ that + # includes "Protocol" special treatment. (Comments removed for brevity.) + assert extra is None # Protocols should not have extra + if tvars is not None: + assert origin is not None + assert all(isinstance(t, TypeVar) for t in tvars), tvars + else: + tvars = _type_vars(bases) + gvars = None + for base in bases: + if base is Generic: + raise TypeError("Cannot inherit from plain Generic") + if isinstance(base, GenericMeta) and base.__origin__ in ( + Generic, + Protocol, + ): + if gvars is not None: + raise TypeError( + "Cannot inherit from Generic[...] or " + "Protocol[...] multiple times." + ) + gvars = base.__parameters__ + if gvars is None: + gvars = tvars + else: + tvarset = set(tvars) + gvarset = set(gvars) + if not tvarset <= gvarset: + raise TypeError( + "Some type variables (%s) " + "are not listed in %s[%s]" + % ( + ", ".join( + str(t) for t in tvars if t not in gvarset + ), + "Generic" + if any(b.__origin__ is Generic for b in bases) + else "Protocol", + ", ".join(str(g) for g in gvars), + ) + ) + tvars = gvars + + initial_bases = bases + if ( + extra is not None + and type(extra) is abc.ABCMeta + and extra not in bases + ): + bases = (extra,) + bases + bases = tuple( + _gorg(b) if isinstance(b, GenericMeta) else b for b in bases + ) + if any(isinstance(b, GenericMeta) and b is not Generic for b in bases): + bases = tuple(b for b in bases if b is not Generic) + namespace.update({"__origin__": origin, "__extra__": extra}) + self = super().__new__(cls, name, bases, namespace, _root=True) + super().__setattr__("_gorg", self if not origin else _gorg(origin)) + self.__parameters__ = tvars + self.__args__ = ( + tuple( + ... 
if a is _TypingEllipsis else () if a is _TypingEmpty else a + for a in args + ) + if args + else None + ) + self.__next_in_mro__ = _next_in_mro(self) + if orig_bases is None: + self.__orig_bases__ = initial_bases + elif origin is not None: + self._abc_registry = origin._abc_registry + self._abc_cache = origin._abc_cache + if hasattr(self, "_subs_tree"): + self.__tree_hash__ = ( + hash(self._subs_tree()) if origin else super().__hash__() + ) + return self + + def __init__(cls, *args, **kwargs): + super().__init__(*args, **kwargs) + if not cls.__dict__.get("_is_protocol", None): + cls._is_protocol = any( + b is Protocol + or isinstance(b, _ProtocolMeta) + and b.__origin__ is Protocol + for b in cls.__bases__ + ) + if cls._is_protocol: + for base in cls.__mro__[1:]: + if not ( + base in (object, Generic) + or base.__module__ == "collections.abc" + and base.__name__ in _PROTO_WHITELIST + or isinstance(base, TypingMeta) + and base._is_protocol + or isinstance(base, GenericMeta) + and base.__origin__ is Generic + ): + raise TypeError( + "Protocols can only inherit from other " + "protocols, got %r" % base + ) + + def _no_init(self, *args, **kwargs): + if type(self)._is_protocol: + raise TypeError("Protocols cannot be instantiated") + + cls.__init__ = _no_init + + def _proto_hook(other): + if not cls.__dict__.get("_is_protocol", None): + return NotImplemented + if not isinstance(other, type): + # Same error as for issubclass(1, int) + raise TypeError("issubclass() arg 1 must be a class") + for attr in _get_protocol_attrs(cls): + for base in other.__mro__: + if attr in base.__dict__: + if base.__dict__[attr] is None: + return NotImplemented + break + annotations = getattr(base, "__annotations__", {}) + if ( + isinstance(annotations, typing.Mapping) + and attr in annotations + and isinstance(other, _ProtocolMeta) + and other._is_protocol + ): + break + else: + return NotImplemented + return True + + if "__subclasshook__" not in cls.__dict__: + cls.__subclasshook__ = 
_proto_hook + + def __instancecheck__(self, instance): + # We need this method for situations where attributes are + # assigned in __init__. + if ( + not getattr(self, "_is_protocol", False) + or _is_callable_members_only(self) + ) and issubclass(type(instance), self): + return True + if self._is_protocol: + if all( + hasattr(instance, attr) + and ( + not callable(getattr(self, attr, None)) + or getattr(instance, attr) is not None + ) + for attr in _get_protocol_attrs(self) + ): + return True + return super().__instancecheck__(instance) + + def __subclasscheck__(self, cls): + if self.__origin__ is not None: + if sys._getframe(1).f_globals["__name__"] not in ["abc", "functools"]: + raise TypeError( + "Parameterized generics cannot be used with class " + "or instance checks" + ) + return False + if self.__dict__.get("_is_protocol", None) and not self.__dict__.get( + "_is_runtime_protocol", None + ): + if sys._getframe(1).f_globals["__name__"] in [ + "abc", + "functools", + "typing", + ]: + return False + raise TypeError( + "Instance and class checks can only be used with " + "@runtime protocols" + ) + if self.__dict__.get( + "_is_runtime_protocol", None + ) and not _is_callable_members_only(self): + if sys._getframe(1).f_globals["__name__"] in [ + "abc", + "functools", + "typing", + ]: + return super().__subclasscheck__(cls) + raise TypeError( + "Protocols with non-method members don't support issubclass()" + ) + return super().__subclasscheck__(cls) + + if not OLD_GENERICS: + + @_tp_cache + def __getitem__(self, params): + # We also need to copy this from GenericMeta.__getitem__ to get + # special treatment of "Protocol". (Comments removed for brevity.) + if not isinstance(params, tuple): + params = (params,) + if not params and _gorg(self) is not Tuple: + raise TypeError( + "Parameter list to %s[...] cannot be empty" % self.__qualname__ + ) + msg = "Parameters to generic types must be types." 
+ params = tuple(_type_check(p, msg) for p in params) + if self in (Generic, Protocol): + if not all(isinstance(p, TypeVar) for p in params): + raise TypeError( + "Parameters to %r[...] must all be type variables" % self + ) + if len(set(params)) != len(params): + raise TypeError( + "Parameters to %r[...] must all be unique" % self + ) + tvars = params + args = params + elif self in (Tuple, Callable): + tvars = _type_vars(params) + args = params + elif self.__origin__ in (Generic, Protocol): + raise TypeError( + "Cannot subscript already-subscripted %s" % repr(self) + ) + else: + _check_generic(self, params) + tvars = _type_vars(params) + args = params + + prepend = (self,) if self.__origin__ is None else () + return type(self)( + self.__name__, + prepend + self.__bases__, + _no_slots_copy(self.__dict__), + tvars=tvars, + args=args, + origin=self, + extra=self.__extra__, + orig_bases=self.__orig_bases__, + ) + + class Protocol(metaclass=_ProtocolMeta): + """Base class for protocol classes. Protocol classes are defined as:: + + class Proto(Protocol): + def meth(self) -> int: + ... + + Such classes are primarily used with static type checkers that recognize + structural subtyping (static duck-typing), for example:: + + class C: + def meth(self) -> int: + return 0 + + def func(x: Proto) -> int: + return x.meth() + + func(C()) # Passes static type check + + See PEP 544 for details. Protocol classes decorated with + @typing_extensions.runtime act as simple-minded runtime protocol that checks + only the presence of given attributes, ignoring their type signatures. + + Protocol classes can be generic, they are defined as:: + + class GenProto({bases}): + def meth(self) -> T: + ... 
+ """ + + __slots__ = () + _is_protocol = True + + def __new__(cls, *args, **kwds): + if _gorg(cls) is Protocol: + raise TypeError( + "Type Protocol cannot be instantiated; " + "it can be used only as a base class" + ) + if OLD_GENERICS: + return _generic_new(_next_in_mro(cls), cls, *args, **kwds) + return _generic_new(cls.__next_in_mro__, cls, *args, **kwds) + + if Protocol.__doc__ is not None: + Protocol.__doc__ = Protocol.__doc__.format( + bases="Protocol, Generic[T]" if OLD_GENERICS else "Protocol[T]" + ) + + +elif PEP_560: + from typing import _collect_type_vars, _GenericAlias, _type_check # noqa + + class _ProtocolMeta(abc.ABCMeta): + # This metaclass is a bit unfortunate and exists only because of the lack + # of __instancehook__. + def __instancecheck__(cls, instance): + # We need this method for situations where attributes are + # assigned in __init__. + if ( + not getattr(cls, "_is_protocol", False) + or _is_callable_members_only(cls) + ) and issubclass(type(instance), cls): + return True + if cls._is_protocol: + if all( + hasattr(instance, attr) + and ( + not callable(getattr(cls, attr, None)) + or getattr(instance, attr) is not None + ) + for attr in _get_protocol_attrs(cls) + ): + return True + return super().__instancecheck__(instance) + + class Protocol(metaclass=_ProtocolMeta): + # There is quite a lot of overlapping code with typing.Generic. + # Unfortunately it is hard to avoid this while these live in two different + # modules. The duplicated code will be removed when Protocol is moved to typing. + """Base class for protocol classes. Protocol classes are defined as:: + + class Proto(Protocol): + def meth(self) -> int: + ... + + Such classes are primarily used with static type checkers that recognize + structural subtyping (static duck-typing), for example:: + + class C: + def meth(self) -> int: + return 0 + + def func(x: Proto) -> int: + return x.meth() + + func(C()) # Passes static type check + + See PEP 544 for details. 
Protocol classes decorated with + @typing_extensions.runtime act as simple-minded runtime protocol that checks + only the presence of given attributes, ignoring their type signatures. + + Protocol classes can be generic, they are defined as:: + + class GenProto(Protocol[T]): + def meth(self) -> T: + ... + """ + __slots__ = () + _is_protocol = True + + def __new__(cls, *args, **kwds): + if cls is Protocol: + raise TypeError( + "Type Protocol cannot be instantiated; " + "it can only be used as a base class" + ) + return super().__new__(cls) + + @_tp_cache + def __class_getitem__(cls, params): + if not isinstance(params, tuple): + params = (params,) + if not params and cls is not Tuple: + raise TypeError( + "Parameter list to {}[...] cannot be empty".format(cls.__qualname__) + ) + msg = "Parameters to generic types must be types." + params = tuple(_type_check(p, msg) for p in params) + if cls is Protocol: + # Generic can only be subscripted with unique type variables. + if not all(isinstance(p, TypeVar) for p in params): + i = 0 + while isinstance(params[i], TypeVar): + i += 1 + raise TypeError( + "Parameters to Protocol[...] must all be type variables. " + "Parameter {} is {}".format(i + 1, params[i]) + ) + if len(set(params)) != len(params): + raise TypeError("Parameters to Protocol[...] must all be unique") + else: + # Subscripting a regular Generic subclass. + _check_generic(cls, params) + return _GenericAlias(cls, params) + + def __init_subclass__(cls, *args, **kwargs): + tvars = [] + if "__orig_bases__" in cls.__dict__: + error = Generic in cls.__orig_bases__ + else: + error = Generic in cls.__bases__ + if error: + raise TypeError("Cannot inherit from plain Generic") + if "__orig_bases__" in cls.__dict__: + tvars = _collect_type_vars(cls.__orig_bases__) + # Look for Generic[T1, ..., Tn] or Protocol[T1, ..., Tn]. + # If found, tvars must be a subset of it. + # If not found, tvars is it. 
+ # Also check for and reject plain Generic, + # and reject multiple Generic[...] and/or Protocol[...]. + gvars = None + for base in cls.__orig_bases__: + if isinstance(base, _GenericAlias) and base.__origin__ in ( + Generic, + Protocol, + ): + # for error messages + the_base = ( + "Generic" if base.__origin__ is Generic else "Protocol" + ) + if gvars is not None: + raise TypeError( + "Cannot inherit from Generic[...] " + "and/or Protocol[...] multiple types." + ) + gvars = base.__parameters__ + if gvars is None: + gvars = tvars + else: + tvarset = set(tvars) + gvarset = set(gvars) + if not tvarset <= gvarset: + s_vars = ", ".join(str(t) for t in tvars if t not in gvarset) + s_args = ", ".join(str(g) for g in gvars) + raise TypeError( + "Some type variables ({}) are " + "not listed in {}[{}]".format(s_vars, the_base, s_args) + ) + tvars = gvars + cls.__parameters__ = tuple(tvars) + + # Determine if this is a protocol or a concrete subclass. + if not cls.__dict__.get("_is_protocol", None): + cls._is_protocol = any(b is Protocol for b in cls.__bases__) + + # Set (or override) the protocol subclass hook. 
+ def _proto_hook(other): + if not cls.__dict__.get("_is_protocol", None): + return NotImplemented + if not getattr(cls, "_is_runtime_protocol", False): + if sys._getframe(2).f_globals["__name__"] in ["abc", "functools"]: + return NotImplemented + raise TypeError( + "Instance and class checks can only be used with " + "@runtime protocols" + ) + if not _is_callable_members_only(cls): + if sys._getframe(2).f_globals["__name__"] in ["abc", "functools"]: + return NotImplemented + raise TypeError( + "Protocols with non-method members " + "don't support issubclass()" + ) + if not isinstance(other, type): + # Same error as for issubclass(1, int) + raise TypeError("issubclass() arg 1 must be a class") + for attr in _get_protocol_attrs(cls): + for base in other.__mro__: + if attr in base.__dict__: + if base.__dict__[attr] is None: + return NotImplemented + break + annotations = getattr(base, "__annotations__", {}) + if ( + isinstance(annotations, typing.Mapping) + and attr in annotations + and isinstance(other, _ProtocolMeta) + and other._is_protocol + ): + break + else: + return NotImplemented + return True + + if "__subclasshook__" not in cls.__dict__: + cls.__subclasshook__ = _proto_hook + + # We have nothing more to do for non-protocols. + if not cls._is_protocol: + return + + # Check consistency of bases. 
+ for base in cls.__bases__: + if not ( + base in (object, Generic) + or base.__module__ == "collections.abc" + and base.__name__ in _PROTO_WHITELIST + or isinstance(base, _ProtocolMeta) + and base._is_protocol + ): + raise TypeError( + "Protocols can only inherit from other " + "protocols, got %r" % base + ) + + def _no_init(self, *args, **kwargs): + if type(self)._is_protocol: + raise TypeError("Protocols cannot be instantiated") + + cls.__init__ = _no_init + + +if hasattr(typing, "runtime_checkable"): + runtime_checkable = typing.runtime_checkable +elif HAVE_PROTOCOLS: + + def runtime_checkable(cls): + """Mark a protocol class as a runtime protocol, so that it + can be used with isinstance() and issubclass(). Raise TypeError + if applied to a non-protocol class. + + This allows a simple-minded structural check very similar to the + one-offs in collections.abc such as Hashable. + """ + if not isinstance(cls, _ProtocolMeta) or not cls._is_protocol: + raise TypeError( + "@runtime_checkable can be only applied to protocol classes, " + "got %r" % cls + ) + cls._is_runtime_protocol = True + return cls + + +if HAVE_PROTOCOLS: + # Exists for backwards compatibility. + runtime = runtime_checkable + + +if hasattr(typing, "SupportsIndex"): + SupportsIndex = typing.SupportsIndex +elif HAVE_PROTOCOLS: + + @runtime_checkable + class SupportsIndex(Protocol): + __slots__ = () + + @abc.abstractmethod + def __index__(self) -> int: + pass + + +if sys.version_info[:2] >= (3, 9): + # The standard library TypedDict in Python 3.8 does not store runtime information + # about which (if any) keys are optional. See https://bugs.python.org/issue38834 + TypedDict = typing.TypedDict +else: + + def _check_fails(cls, other): + try: + if sys._getframe(1).f_globals["__name__"] not in [ + "abc", + "functools", + "typing", + ]: + # Typed dicts are only for static structural subtyping. 
+ raise TypeError("TypedDict does not support instance and class checks") + except (AttributeError, ValueError): + pass + return False + + def _dict_new(*args, **kwargs): + if not args: + raise TypeError("TypedDict.__new__(): not enough arguments") + _, args = args[0], args[1:] # allow the "cls" keyword be passed + return dict(*args, **kwargs) + + _dict_new.__text_signature__ = "($cls, _typename, _fields=None, /, **kwargs)" + + def _typeddict_new(*args, total=True, **kwargs): + if not args: + raise TypeError("TypedDict.__new__(): not enough arguments") + _, args = args[0], args[1:] # allow the "cls" keyword be passed + if args: + typename, args = ( + args[0], + args[1:], + ) # allow the "_typename" keyword be passed + elif "_typename" in kwargs: + typename = kwargs.pop("_typename") + import warnings + + warnings.warn( + "Passing '_typename' as keyword argument is deprecated", + DeprecationWarning, + stacklevel=2, + ) + else: + raise TypeError( + "TypedDict.__new__() missing 1 required positional " + "argument: '_typename'" + ) + if args: + try: + (fields,) = args # allow the "_fields" keyword be passed + except ValueError: + raise TypeError( + "TypedDict.__new__() takes from 2 to 3 " + "positional arguments but {} " + "were given".format(len(args) + 2) + ) + elif "_fields" in kwargs and len(kwargs) == 1: + fields = kwargs.pop("_fields") + import warnings + + warnings.warn( + "Passing '_fields' as keyword argument is deprecated", + DeprecationWarning, + stacklevel=2, + ) + else: + fields = None + + if fields is None: + fields = kwargs + elif kwargs: + raise TypeError( + "TypedDict takes either a dict or keyword arguments, but not both" + ) + + ns = {"__annotations__": dict(fields), "__total__": total} + try: + # Setting correct module is necessary to make typed dict classes pickleable. 
+ ns["__module__"] = sys._getframe(1).f_globals.get("__name__", "__main__") + except (AttributeError, ValueError): + pass + + return _TypedDictMeta(typename, (), ns) + + _typeddict_new.__text_signature__ = ( + "($cls, _typename, _fields=None, /, *, total=True, **kwargs)" + ) + + class _TypedDictMeta(type): + def __new__(cls, name, bases, ns, total=True): + # Create new typed dict class object. + # This method is called directly when TypedDict is subclassed, + # or via _typeddict_new when TypedDict is instantiated. This way + # TypedDict supports all three syntaxes described in its docstring. + # Subclasses and instances of TypedDict return actual dictionaries + # via _dict_new. + ns["__new__"] = _typeddict_new if name == "TypedDict" else _dict_new + tp_dict = super().__new__(cls, name, (dict,), ns) + + annotations = {} + own_annotations = ns.get("__annotations__", {}) + own_annotation_keys = set(own_annotations.keys()) + msg = "TypedDict('Name', {f0: t0, f1: t1, ...}); each t must be a type" + own_annotations = { + n: typing._type_check(tp, msg) for n, tp in own_annotations.items() + } + required_keys = set() + optional_keys = set() + + for base in bases: + annotations.update(base.__dict__.get("__annotations__", {})) + required_keys.update(base.__dict__.get("__required_keys__", ())) + optional_keys.update(base.__dict__.get("__optional_keys__", ())) + + annotations.update(own_annotations) + if total: + required_keys.update(own_annotation_keys) + else: + optional_keys.update(own_annotation_keys) + + tp_dict.__annotations__ = annotations + tp_dict.__required_keys__ = frozenset(required_keys) + tp_dict.__optional_keys__ = frozenset(optional_keys) + if not hasattr(tp_dict, "__total__"): + tp_dict.__total__ = total + return tp_dict + + __instancecheck__ = __subclasscheck__ = _check_fails + + TypedDict = _TypedDictMeta("TypedDict", (dict,), {}) + TypedDict.__module__ = __name__ + TypedDict.__doc__ = """A simple typed name space. 
At runtime it is equivalent to a plain dict. + + TypedDict creates a dictionary type that expects all of its + instances to have a certain set of keys, with each key + associated with a value of a consistent type. This expectation + is not checked at runtime but is only enforced by type checkers. + Usage:: + + class Point2D(TypedDict): + x: int + y: int + label: str + + a: Point2D = {'x': 1, 'y': 2, 'label': 'good'} # OK + b: Point2D = {'z': 3, 'label': 'bad'} # Fails type check + + assert Point2D(x=1, y=2, label='first') == dict(x=1, y=2, label='first') + + The type info can be accessed via the Point2D.__annotations__ dict, and + the Point2D.__required_keys__ and Point2D.__optional_keys__ frozensets. + TypedDict supports two additional equivalent forms:: + + Point2D = TypedDict('Point2D', x=int, y=int, label=str) + Point2D = TypedDict('Point2D', {'x': int, 'y': int, 'label': str}) + + The class syntax is only supported in Python 3.6+, while two other + syntax forms work for Python 2.7 and 3.2+ + """ + + +# Python 3.9+ has PEP 593 (Annotated and modified get_type_hints) +if hasattr(typing, "Annotated"): + Annotated = typing.Annotated + get_type_hints = typing.get_type_hints + # Not exported and not a public API, but needed for get_origin() and get_args() + # to work. + _AnnotatedAlias = typing._AnnotatedAlias +elif PEP_560: + + class _AnnotatedAlias(typing._GenericAlias, _root=True): + """Runtime representation of an annotated type. + + At its core 'Annotated[t, dec1, dec2, ...]' is an alias for the type 't' + with extra annotations. The alias behaves like a normal typing alias, + instantiating is the same as instantiating the underlying type, binding + it to types is also the same. 
+ """ + + def __init__(self, origin, metadata): + if isinstance(origin, _AnnotatedAlias): + metadata = origin.__metadata__ + metadata + origin = origin.__origin__ + super().__init__(origin, origin) + self.__metadata__ = metadata + + def copy_with(self, params): + assert len(params) == 1 + new_type = params[0] + return _AnnotatedAlias(new_type, self.__metadata__) + + def __repr__(self): + return "typing_extensions.Annotated[{}, {}]".format( + typing._type_repr(self.__origin__), + ", ".join(repr(a) for a in self.__metadata__), + ) + + def __reduce__(self): + return operator.getitem, (Annotated, (self.__origin__,) + self.__metadata__) + + def __eq__(self, other): + if not isinstance(other, _AnnotatedAlias): + return NotImplemented + if self.__origin__ != other.__origin__: + return False + return self.__metadata__ == other.__metadata__ + + def __hash__(self): + return hash((self.__origin__, self.__metadata__)) + + class Annotated: + """Add context specific metadata to a type. + + Example: Annotated[int, runtime_check.Unsigned] indicates to the + hypothetical runtime_check module that this type is an unsigned int. + Every other consumer of this type can ignore this metadata and treat + this type as int. + + The first argument to Annotated must be a valid type (and will be in + the __origin__ field), the remaining arguments are kept as a tuple in + the __extra__ field. + + Details: + + - It's an error to call `Annotated` with less than two arguments. 
+ - Nested Annotated are flattened:: + + Annotated[Annotated[T, Ann1, Ann2], Ann3] == Annotated[T, Ann1, Ann2, Ann3] + + - Instantiating an annotated type is equivalent to instantiating the + underlying type:: + + Annotated[C, Ann1](5) == C(5) + + - Annotated can be used as a generic type alias:: + + Optimized = Annotated[T, runtime.Optimize()] + Optimized[int] == Annotated[int, runtime.Optimize()] + + OptimizedList = Annotated[List[T], runtime.Optimize()] + OptimizedList[int] == Annotated[List[int], runtime.Optimize()] + """ + + __slots__ = () + + def __new__(cls, *args, **kwargs): + raise TypeError("Type Annotated cannot be instantiated.") + + @_tp_cache + def __class_getitem__(cls, params): + if not isinstance(params, tuple) or len(params) < 2: + raise TypeError( + "Annotated[...] should be used " + "with at least two arguments (a type and an " + "annotation)." + ) + msg = "Annotated[t, ...]: t must be a type." + origin = typing._type_check(params[0], msg) + metadata = tuple(params[1:]) + return _AnnotatedAlias(origin, metadata) + + def __init_subclass__(cls, *args, **kwargs): + raise TypeError("Cannot subclass {}.Annotated".format(cls.__module__)) + + def _strip_annotations(t): + """Strips the annotations from a given type. + """ + if isinstance(t, _AnnotatedAlias): + return _strip_annotations(t.__origin__) + if isinstance(t, typing._GenericAlias): + stripped_args = tuple(_strip_annotations(a) for a in t.__args__) + if stripped_args == t.__args__: + return t + res = t.copy_with(stripped_args) + res._special = t._special + return res + return t + + def get_type_hints(obj, globalns=None, localns=None, include_extras=False): + """Return type hints for an object. + + This is often the same as obj.__annotations__, but it handles + forward references encoded as string literals, adds Optional[t] if a + default value equal to None is set and recursively replaces all + 'Annotated[T, ...]' with 'T' (unless 'include_extras=True'). 
+ + The argument may be a module, class, method, or function. The annotations + are returned as a dictionary. For classes, annotations include also + inherited members. + + TypeError is raised if the argument is not of a type that can contain + annotations, and an empty dictionary is returned if no annotations are + present. + + BEWARE -- the behavior of globalns and localns is counterintuitive + (unless you are familiar with how eval and exec work). The + search order is locals first, then globals. + + - If no dict arguments are passed, an attempt is made to use the + globals from obj (or the respective module's globals for classes), + and these are also used as the locals. If the object does not appear + to have globals, an empty dictionary is used. + + - If one dict argument is passed, it is used for both globals and + locals. + + - If two dict arguments are passed, they specify globals and + locals, respectively. + """ + hint = typing.get_type_hints(obj, globalns=globalns, localns=localns) + if include_extras: + return hint + return {k: _strip_annotations(t) for k, t in hint.items()} + + +elif HAVE_ANNOTATED: + + def _is_dunder(name): + """Returns True if name is a __dunder_variable_name__.""" + return len(name) > 4 and name.startswith("__") and name.endswith("__") + + # Prior to Python 3.7 types did not have `copy_with`. A lot of the equality + # checks, argument expansion etc. are done on the _subs_tre. As a result we + # can't provide a get_type_hints function that strips out annotations. 
+ + class AnnotatedMeta(typing.GenericMeta): + """Metaclass for Annotated""" + + def __new__(cls, name, bases, namespace, **kwargs): + if any(b is not object for b in bases): + raise TypeError("Cannot subclass " + str(Annotated)) + return super().__new__(cls, name, bases, namespace, **kwargs) + + @property + def __metadata__(self): + return self._subs_tree()[2] + + def _tree_repr(self, tree): + cls, origin, metadata = tree + if not isinstance(origin, tuple): + tp_repr = typing._type_repr(origin) + else: + tp_repr = origin[0]._tree_repr(origin) + metadata_reprs = ", ".join(repr(arg) for arg in metadata) + return "%s[%s, %s]" % (cls, tp_repr, metadata_reprs) + + def _subs_tree(self, tvars=None, args=None): # noqa + if self is Annotated: + return Annotated + res = super()._subs_tree(tvars=tvars, args=args) + # Flatten nested Annotated + if isinstance(res[1], tuple) and res[1][0] is Annotated: + sub_tp = res[1][1] + sub_annot = res[1][2] + return (Annotated, sub_tp, sub_annot + res[2]) + return res + + def _get_cons(self): + """Return the class used to create instance of this type.""" + if self.__origin__ is None: + raise TypeError( + "Cannot get the underlying type of a " + "non-specialized Annotated type." + ) + tree = self._subs_tree() + while isinstance(tree, tuple) and tree[0] is Annotated: + tree = tree[1] + if isinstance(tree, tuple): + return tree[0] + else: + return tree + + @_tp_cache + def __getitem__(self, params): + if not isinstance(params, tuple): + params = (params,) + if self.__origin__ is not None: # specializing an instantiated type + return super().__getitem__(params) + elif not isinstance(params, tuple) or len(params) < 2: + raise TypeError( + "Annotated[...] should be instantiated " + "with at least two arguments (a type and an " + "annotation)." + ) + else: + msg = "Annotated[t, ...]: t must be a type." 
+ tp = typing._type_check(params[0], msg) + metadata = tuple(params[1:]) + return type(self)( + self.__name__, + self.__bases__, + _no_slots_copy(self.__dict__), + tvars=_type_vars((tp,)), + # Metadata is a tuple so it won't be touched by _replace_args et al. + args=(tp, metadata), + origin=self, + ) + + def __call__(self, *args, **kwargs): + cons = self._get_cons() + result = cons(*args, **kwargs) + try: + result.__orig_class__ = self + except AttributeError: + pass + return result + + def __getattr__(self, attr): + # For simplicity we just don't relay all dunder names + if self.__origin__ is not None and not _is_dunder(attr): + return getattr(self._get_cons(), attr) + raise AttributeError(attr) + + def __setattr__(self, attr, value): + if _is_dunder(attr) or attr.startswith("_abc_"): + super().__setattr__(attr, value) + elif self.__origin__ is None: + raise AttributeError(attr) + else: + setattr(self._get_cons(), attr, value) + + def __instancecheck__(self, obj): + raise TypeError("Annotated cannot be used with isinstance().") + + def __subclasscheck__(self, cls): + raise TypeError("Annotated cannot be used with issubclass().") + + class Annotated(metaclass=AnnotatedMeta): + """Add context specific metadata to a type. + + Example: Annotated[int, runtime_check.Unsigned] indicates to the + hypothetical runtime_check module that this type is an unsigned int. + Every other consumer of this type can ignore this metadata and treat + this type as int. + + The first argument to Annotated must be a valid type, the remaining + arguments are kept as a tuple in the __metadata__ field. + + Details: + + - It's an error to call `Annotated` with less than two arguments. 
+ - Nested Annotated are flattened:: + + Annotated[Annotated[T, Ann1, Ann2], Ann3] == Annotated[T, Ann1, Ann2, Ann3] + + - Instantiating an annotated type is equivalent to instantiating the + underlying type:: + + Annotated[C, Ann1](5) == C(5) + + - Annotated can be used as a generic type alias:: + + Optimized = Annotated[T, runtime.Optimize()] + Optimized[int] == Annotated[int, runtime.Optimize()] + + OptimizedList = Annotated[List[T], runtime.Optimize()] + OptimizedList[int] == Annotated[List[int], runtime.Optimize()] + """ + + +# Python 3.8 has get_origin() and get_args() but those implementations aren't +# Annotated-aware, so we can't use those, only Python 3.9 versions will do. +if sys.version_info[:2] >= (3, 9): + get_origin = typing.get_origin + get_args = typing.get_args +elif PEP_560: + from typing import _GenericAlias # noqa + + def get_origin(tp): + """Get the unsubscripted version of a type. + + This supports generic types, Callable, Tuple, Union, Literal, Final, ClassVar + and Annotated. Return None for unsupported types. Examples:: + + get_origin(Literal[42]) is Literal + get_origin(int) is None + get_origin(ClassVar[int]) is ClassVar + get_origin(Generic) is Generic + get_origin(Generic[T]) is Generic + get_origin(Union[T, int]) is Union + get_origin(List[Tuple[T, T]][int]) == list + """ + if isinstance(tp, _AnnotatedAlias): + return Annotated + if isinstance(tp, _GenericAlias): + return tp.__origin__ + if tp is Generic: + return Generic + return None + + def get_args(tp): + """Get type arguments with all substitutions performed. + + For unions, basic simplifications used by Union constructor are performed. 
+ Examples:: + get_args(Dict[str, int]) == (str, int) + get_args(int) == () + get_args(Union[int, Union[T, int], str][int]) == (int, str) + get_args(Union[int, Tuple[T, int]][str]) == (int, Tuple[str, int]) + get_args(Callable[[], T][int]) == ([], int) + """ + if isinstance(tp, _AnnotatedAlias): + return (tp.__origin__,) + tp.__metadata__ + if isinstance(tp, _GenericAlias): + res = tp.__args__ + if get_origin(tp) is collections.abc.Callable and res[0] is not Ellipsis: + res = (list(res[:-1]), res[-1]) + return res + return () + + +if hasattr(typing, "TypeAlias"): + TypeAlias = typing.TypeAlias +elif sys.version_info[:2] >= (3, 9): + + class _TypeAliasForm(typing._SpecialForm, _root=True): + def __repr__(self): + return "typing_extensions." + self._name + + @_TypeAliasForm + def TypeAlias(self, parameters): + """Special marker indicating that an assignment should + be recognized as a proper type alias definition by type + checkers. + + For example:: + + Predicate: TypeAlias = Callable[..., bool] + + It's invalid when used anywhere except as in the example above. + """ + raise TypeError("{} is not subscriptable".format(self)) + + +elif sys.version_info[:2] >= (3, 7): + + class _TypeAliasForm(typing._SpecialForm, _root=True): + def __repr__(self): + return "typing_extensions." + self._name + + TypeAlias = _TypeAliasForm( + "TypeAlias", + doc="""Special marker indicating that an assignment should + be recognized as a proper type alias definition by type + checkers. 
+ + For example:: + + Predicate: TypeAlias = Callable[..., bool] + + It's invalid when used anywhere except as in the example + above.""", + ) + +elif hasattr(typing, "_FinalTypingBase"): + + class _TypeAliasMeta(typing.TypingMeta): + """Metaclass for TypeAlias""" + + def __repr__(self): + return "typing_extensions.TypeAlias" + + class _TypeAliasBase(typing._FinalTypingBase, metaclass=_TypeAliasMeta, _root=True): + """Special marker indicating that an assignment should + be recognized as a proper type alias definition by type + checkers. + + For example:: + + Predicate: TypeAlias = Callable[..., bool] + + It's invalid when used anywhere except as in the example above. + """ + + __slots__ = () + + def __instancecheck__(self, obj): + raise TypeError("TypeAlias cannot be used with isinstance().") + + def __subclasscheck__(self, cls): + raise TypeError("TypeAlias cannot be used with issubclass().") + + def __repr__(self): + return "typing_extensions.TypeAlias" + + TypeAlias = _TypeAliasBase(_root=True) +else: + + class _TypeAliasMeta(typing.TypingMeta): + """Metaclass for TypeAlias""" + + def __instancecheck__(self, obj): + raise TypeError("TypeAlias cannot be used with isinstance().") + + def __subclasscheck__(self, cls): + raise TypeError("TypeAlias cannot be used with issubclass().") + + def __call__(self, *args, **kwargs): + raise TypeError("Cannot instantiate TypeAlias") + + class TypeAlias(metaclass=_TypeAliasMeta, _root=True): + """Special marker indicating that an assignment should + be recognized as a proper type alias definition by type + checkers. + + For example:: + + Predicate: TypeAlias = Callable[..., bool] + + It's invalid when used anywhere except as in the example above. 
+ """ + + __slots__ = () diff --git a/setup.cfg b/setup.cfg index a47bc88d282ab..2d1c8037636de 100644 --- a/setup.cfg +++ b/setup.cfg @@ -68,6 +68,7 @@ omit = */tests/* pandas/_typing.py pandas/_version.py + pandas/_vendored/typing_extensions.py plugins = Cython.Coverage [coverage:report] @@ -99,7 +100,7 @@ directory = coverage_html_report # To be kept consistent with "Import Formatting" section in contributing.rst [isort] -known_pre_libs = pandas._config +known_pre_libs = pandas._config,pandas._vendored known_pre_core = pandas._libs,pandas._typing,pandas.util._*,pandas.compat,pandas.errors known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting @@ -113,7 +114,7 @@ combine_as_imports = True line_length = 88 force_sort_within_sections = True skip_glob = env, -skip = pandas/__init__.py +skip = pandas/__init__.py,pandas/_vendored/typing_extensions.py [mypy] ignore_missing_imports=True @@ -124,6 +125,10 @@ warn_redundant_casts = True warn_unused_ignores = True show_error_codes = True +[mypy-pandas._vendored.*] +check_untyped_defs=False +ignore_errors=True + [mypy-pandas.tests.*] check_untyped_defs=False From 9e88ea06e04e22068e4106ae85e043866842f02f Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Wed, 2 Sep 2020 01:52:08 +0200 Subject: [PATCH 0618/1025] Added numba as an argument (#35778) --- doc/source/user_guide/computation.rst | 3 +++ doc/source/user_guide/enhancingperf.rst | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index d7875e5b8d861..151ef36be7c98 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -361,6 +361,9 @@ compute the mean absolute deviation on a rolling basis: @savefig rolling_apply_ex.png s.rolling(window=60).apply(mad, raw=True).plot(style='k') +Using the Numba engine +~~~~~~~~~~~~~~~~~~~~~~ + .. 
versionadded:: 1.0 Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 24fcb369804c6..9e101c1a20371 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -373,6 +373,13 @@ nicer interface by passing/returning pandas objects. In this example, using Numba was faster than Cython. +Numba as an argument +~~~~~~~~~~~~~~~~~~~~ + +Additionally, we can leverage the power of `Numba `__ +by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools +` for an extensive example. + Vectorize ~~~~~~~~~ From 160345d584ba7398e1a2b5825adb315a4d4535e0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Sep 2020 18:56:40 -0700 Subject: [PATCH 0619/1025] REF: handle axis=None case inside DataFrame.any/all to simplify _reduce (#35899) * REF: remove unnecesary try/except * TST: add test for agg on ordered categorical cols (#35630) * TST: resample does not yield empty groups (#10603) (#35799) * revert accidental rebase * REF: handle axis=None cases inside DataFrame.all/any * annotate * dummy commit to force Travis Co-authored-by: Karthik Mathur <22126205+mathurk1@users.noreply.github.com> Co-authored-by: tkmz-n <60312218+tkmz-n@users.noreply.github.com> --- pandas/core/frame.py | 61 +++++++++++++++--------------------------- pandas/core/generic.py | 8 ++++++ 2 files changed, 30 insertions(+), 39 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 312d449e36022..e78c15d125e8d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8617,14 +8617,11 @@ def _reduce( cols = self.columns[~dtype_is_dt] self = self[cols] - if axis is None and filter_type == "bool": - labels = None - constructor = None - else: - # TODO: Make other agg func handle axis=None properly - axis = self._get_axis_number(axis) - labels = self._get_agg_axis(axis) - constructor = self._constructor + # TODO: 
Make other agg func handle axis=None properly + axis = self._get_axis_number(axis) + labels = self._get_agg_axis(axis) + constructor = self._constructor + assert axis in [0, 1] def func(values): if is_extension_array_dtype(values.dtype): @@ -8632,7 +8629,7 @@ def func(values): else: return op(values, axis=axis, skipna=skipna, **kwds) - def _get_data(axis_matters): + def _get_data(axis_matters: bool) -> "DataFrame": if filter_type is None: data = self._get_numeric_data() elif filter_type == "bool": @@ -8649,7 +8646,7 @@ def _get_data(axis_matters): raise NotImplementedError(msg) return data - if numeric_only is not None and axis in [0, 1]: + if numeric_only is not None: df = self if numeric_only is True: df = _get_data(axis_matters=True) @@ -8675,6 +8672,8 @@ def blk_func(values): out[:] = coerce_to_dtypes(out.values, df.dtypes) return out + assert numeric_only is None + if not self._is_homogeneous_type or self._mgr.any_extension_types: # try to avoid self.values call @@ -8702,40 +8701,24 @@ def blk_func(values): result = result.iloc[0].rename(None) return result - if numeric_only is None: - data = self - values = data.values - - try: - result = func(values) - - except TypeError: - # e.g. in nanops trying to convert strs to float + data = self + values = data.values - # TODO: why doesnt axis matter here? - data = _get_data(axis_matters=False) - labels = data._get_agg_axis(axis) + try: + result = func(values) - values = data.values - with np.errstate(all="ignore"): - result = func(values) + except TypeError: + # e.g. in nanops trying to convert strs to float - else: - if numeric_only: - data = _get_data(axis_matters=True) - labels = data._get_agg_axis(axis) + # TODO: why doesnt axis matter here? 
+ data = _get_data(axis_matters=False) + labels = data._get_agg_axis(axis) - values = data.values - else: - data = self - values = data.values - result = func(values) + values = data.values + with np.errstate(all="ignore"): + result = func(values) - if filter_type == "bool" and is_object_dtype(values) and axis is None: - # work around https://github.com/numpy/numpy/issues/10489 - # TODO: can we de-duplicate parts of this with the next blocK? - result = np.bool_(result) - elif hasattr(result, "dtype") and is_object_dtype(result.dtype): + if is_object_dtype(result.dtype): try: if filter_type is None: result = result.astype(np.float64) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3bad2d6dd18b9..c80a95f79a7a0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -11499,6 +11499,14 @@ def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs "Option bool_only is not implemented with option level." ) return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) + + if self.ndim > 1 and axis is None: + # Reduce along one dimension then the other, to simplify DataFrame._reduce + res = logical_func( + self, axis=0, bool_only=bool_only, skipna=skipna, **kwargs + ) + return logical_func(res, skipna=skipna, **kwargs) + return self._reduce( func, name=name, From 6a7a42deb36ace8b6a20eb72a6459712194414ee Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Sep 2020 20:18:16 -0700 Subject: [PATCH 0620/1025] BUG: BlockSlider not clearing index._cache (#35937) * REF: remove unnecesary try/except * TST: add test for agg on ordered categorical cols (#35630) * TST: resample does not yield empty groups (#10603) (#35799) * revert accidental rebase * BUG: BlockSlider not clearing index._cache * update whatsnew Co-authored-by: Karthik Mathur <22126205+mathurk1@users.noreply.github.com> Co-authored-by: tkmz-n <60312218+tkmz-n@users.noreply.github.com> --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/reduction.pyx | 
3 +++ pandas/tests/groupby/test_allowlist.py | 6 +++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 407e8ba029ada..fca7e7d209031 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -256,6 +256,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) - Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on. (:issue:`35792`) - Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. (:issue:`9959`) +- Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Reshaping diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 7b36bc8baf891..8161b5c5c2b11 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -53,6 +53,7 @@ cdef class _BaseGrouper: # to a 1-d ndarray like datetime / timedelta / period. object.__setattr__(cached_ityp, '_index_data', islider.buf) cached_ityp._engine.clear_mapping() + cached_ityp._cache.clear() # e.g. inferred_freq must go object.__setattr__(cached_typ._mgr._block, 'values', vslider.buf) object.__setattr__(cached_typ._mgr._block, 'mgr_locs', slice(len(vslider.buf))) @@ -71,6 +72,7 @@ cdef class _BaseGrouper: object res cached_ityp._engine.clear_mapping() + cached_ityp._cache.clear() # e.g. 
inferred_freq must go res = self.f(cached_typ) res = _extract_result(res) if not initialized: @@ -455,6 +457,7 @@ cdef class BlockSlider: object.__setattr__(self.index, '_index_data', self.idx_slider.buf) self.index._engine.clear_mapping() + self.index._cache.clear() # e.g. inferred_freq must go cdef reset(self): cdef: diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py index 0fd66cc047017..4a735fc7bb686 100644 --- a/pandas/tests/groupby/test_allowlist.py +++ b/pandas/tests/groupby/test_allowlist.py @@ -369,7 +369,6 @@ def test_groupby_selection_with_methods(df): "ffill", "bfill", "pct_change", - "tshift", ] for m in methods: @@ -379,6 +378,11 @@ def test_groupby_selection_with_methods(df): # should always be frames! tm.assert_frame_equal(res, exp) + # check that the index cache is cleared + with pytest.raises(ValueError, match="Freq was not set in the index"): + # GH#35937 + g.tshift() + # methods which aren't just .foo() tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0)) tm.assert_frame_equal(g.dtypes, g_exp.dtypes) From e1c380c6181e32167cdc86774d9f561239942b32 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Sep 2020 20:19:26 -0700 Subject: [PATCH 0621/1025] BUG: NDFrame.replace wrong exception type, wrong return when size==0 (#36045) * REF: remove unnecesary try/except * TST: add test for agg on ordered categorical cols (#35630) * TST: resample does not yield empty groups (#10603) (#35799) * revert accidental rebase * BUG: NDFrame.replace wrong exception type, wrong return when size==0 * bool->bool_t * whatsnew Co-authored-by: Karthik Mathur <22126205+mathurk1@users.noreply.github.com> Co-authored-by: tkmz-n <60312218+tkmz-n@users.noreply.github.com> --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/generic.py | 14 +++++++------ pandas/tests/series/methods/test_replace.py | 23 +++++++++++++++++++++ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst 
b/doc/source/whatsnew/v1.2.0.rst index fca7e7d209031..0cfe010b63a6f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -282,7 +282,7 @@ ExtensionArray Other ^^^^^ -- +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c80a95f79a7a0..233d48bfc85c3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6179,8 +6179,8 @@ def replace( self, to_replace=None, value=None, - inplace=False, - limit=None, + inplace: bool_t = False, + limit: Optional[int] = None, regex=False, method="pad", ): @@ -6256,7 +6256,7 @@ def replace( If True, in place. Note: this will modify any other views on this object (e.g. a column from a DataFrame). Returns the caller if this is True. - limit : int, default None + limit : int or None, default None Maximum size gap to forward or backward fill. 
regex : bool or same types as `to_replace`, default False Whether to interpret `to_replace` and/or `value` as regular @@ -6490,7 +6490,7 @@ def replace( inplace = validate_bool_kwarg(inplace, "inplace") if not is_bool(regex) and to_replace is not None: - raise AssertionError("'to_replace' must be 'None' if 'regex' is not a bool") + raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool") if value is None: # passing a single value that is scalar like @@ -6550,12 +6550,14 @@ def replace( # need a non-zero len on all axes if not self.size: - return self + if inplace: + return + return self.copy() if is_dict_like(to_replace): if is_dict_like(value): # {'A' : NA} -> {'A' : 0} # Note: Checking below for `in foo.keys()` instead of - # `in foo`is needed for when we have a Series and not dict + # `in foo` is needed for when we have a Series and not dict mapping = { col: (to_replace[col], value[col]) for col in to_replace.keys() diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index f78a28c66e946..ccaa005369a1c 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -397,6 +397,29 @@ def test_replace_invalid_to_replace(self): with pytest.raises(TypeError, match=msg): series.replace(lambda x: x.strip()) + @pytest.mark.parametrize("frame", [False, True]) + def test_replace_nonbool_regex(self, frame): + obj = pd.Series(["a", "b", "c "]) + if frame: + obj = obj.to_frame() + + msg = "'to_replace' must be 'None' if 'regex' is not a bool" + with pytest.raises(ValueError, match=msg): + obj.replace(to_replace=["a"], regex="foo") + + @pytest.mark.parametrize("frame", [False, True]) + def test_replace_empty_copy(self, frame): + obj = pd.Series([], dtype=np.float64) + if frame: + obj = obj.to_frame() + + res = obj.replace(4, 5, inplace=True) + assert res is None + + res = obj.replace(4, 5, inplace=False) + tm.assert_equal(res, obj) + assert res is not obj + def 
test_replace_only_one_dictlike_arg(self): # GH#33340 From ce7d07e035961c7030263f066069dd5988f6ab78 Mon Sep 17 00:00:00 2001 From: Jonathan Shreckengost Date: Wed, 2 Sep 2020 09:28:45 -0400 Subject: [PATCH 0622/1025] Comma cleanup for #35925 (#36058) --- pandas/tests/generic/test_finalize.py | 10 ++++------ pandas/tests/generic/test_to_xarray.py | 4 +--- pandas/tests/groupby/aggregate/test_numba.py | 8 ++++---- pandas/tests/groupby/test_apply.py | 4 +--- pandas/tests/groupby/test_categorical.py | 4 ++-- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_groupby_dropna.py | 4 +--- pandas/tests/groupby/test_groupby_subclass.py | 4 +--- pandas/tests/groupby/test_size.py | 2 +- 9 files changed, 16 insertions(+), 26 deletions(-) diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 4d0f1a326225d..8898619e374ab 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -123,7 +123,7 @@ (pd.DataFrame, frame_data, operator.methodcaller("sort_index")), (pd.DataFrame, frame_data, operator.methodcaller("nlargest", 1, "A")), (pd.DataFrame, frame_data, operator.methodcaller("nsmallest", 1, "A")), - (pd.DataFrame, frame_mi_data, operator.methodcaller("swaplevel"),), + (pd.DataFrame, frame_mi_data, operator.methodcaller("swaplevel")), pytest.param( ( pd.DataFrame, @@ -178,7 +178,7 @@ marks=not_implemented_mark, ), pytest.param( - (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack"),), + (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack")), marks=not_implemented_mark, ), pytest.param( @@ -317,7 +317,7 @@ marks=not_implemented_mark, ), pytest.param( - (pd.Series, ([1, 2],), operator.methodcaller("squeeze")), + (pd.Series, ([1, 2],), operator.methodcaller("squeeze")) # marks=not_implemented_mark, ), (pd.Series, ([1, 2],), operator.methodcaller("rename_axis", index="a")), @@ -733,9 +733,7 @@ def test_timedelta_property(attr): assert result.attrs == {"a": 1} 
-@pytest.mark.parametrize( - "method", [operator.methodcaller("total_seconds")], -) +@pytest.mark.parametrize("method", [operator.methodcaller("total_seconds")]) @not_implemented_mark def test_timedelta_methods(method): s = pd.Series(pd.timedelta_range("2000", periods=4)) diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py index ab56a752f7e90..a85d7ddc1ea53 100644 --- a/pandas/tests/generic/test_to_xarray.py +++ b/pandas/tests/generic/test_to_xarray.py @@ -47,9 +47,7 @@ def test_to_xarray_index_types(self, index): expected = df.copy() expected["f"] = expected["f"].astype(object) expected.columns.name = None - tm.assert_frame_equal( - result.to_dataframe(), expected, - ) + tm.assert_frame_equal(result.to_dataframe(), expected) @td.skip_if_no("xarray", min_version="0.7.0") def test_to_xarray(self): diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py index 29e65e938f6f9..c4266996748c2 100644 --- a/pandas/tests/groupby/aggregate/test_numba.py +++ b/pandas/tests/groupby/aggregate/test_numba.py @@ -57,7 +57,7 @@ def func_numba(values, index): func_numba = numba.jit(func_numba) data = DataFrame( - {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} grouped = data.groupby(0) @@ -90,7 +90,7 @@ def func_2(values, index): func_2 = numba.jit(func_2) data = DataFrame( - {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} grouped = data.groupby(0) @@ -121,7 +121,7 @@ def func_1(values, index): return np.mean(values) - 3.4 data = DataFrame( - {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + {0: 
["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] ) grouped = data.groupby(0) expected = grouped.agg(func_1, engine="numba") @@ -142,7 +142,7 @@ def func_1(values, index): ) def test_multifunc_notimplimented(agg_func): data = DataFrame( - {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] ) grouped = data.groupby(0) with pytest.raises(NotImplementedError, match="Numba engine can"): diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index a1dcb28a32c6c..3183305fe2933 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -946,9 +946,7 @@ def fct(group): tm.assert_series_equal(result, expected) -@pytest.mark.parametrize( - "function", [lambda gr: gr.index, lambda gr: gr.index + 1 - 1], -) +@pytest.mark.parametrize("function", [lambda gr: gr.index, lambda gr: gr.index + 1 - 1]) def test_apply_function_index_return(function): # GH: 22541 df = pd.DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"]) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 13a32e285e70a..711daf7fe415d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -17,7 +17,7 @@ def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN): - """ Reindex to a cartesian production for the groupers, + """Reindex to a cartesian production for the groupers, preserving the nature (Categorical) of each grouper """ @@ -1449,7 +1449,7 @@ def test_groupby_agg_categorical_columns(func, expected_values): result = df.groupby("groups").agg(func) expected = pd.DataFrame( - {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups"), + {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups") ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py 
b/pandas/tests/groupby/test_groupby.py index c743058c988b4..eec9e8064d584 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -676,7 +676,7 @@ def test_ops_not_as_index(reduction_func): if reduction_func in ("corrwith",): pytest.skip("Test not applicable") - if reduction_func in ("nth", "ngroup",): + if reduction_func in ("nth", "ngroup"): pytest.skip("Skip until behavior is determined (GH #5755)") df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"]) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index adf62c4723526..d1501111cb22b 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -246,9 +246,7 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): (pd.Period("2020-01-01"), pd.Period("2020-02-01")), ], ) -@pytest.mark.parametrize( - "dropna, values", [(True, [12, 3]), (False, [12, 3, 6],)], -) +@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])]) def test_groupby_dropna_datetime_like_data( dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2 ): diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 7271911c5f80f..cc7a79e976513 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -51,9 +51,7 @@ def test_groupby_preserves_subclass(obj, groupby_func): tm.assert_series_equal(result1, result2) -@pytest.mark.parametrize( - "obj", [DataFrame, tm.SubclassedDataFrame], -) +@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame]) def test_groupby_resample_preserves_subclass(obj): # GH28330 -- preserve subclass through groupby.resample() diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 9cff8b966dad0..ba27e5a24ba00 100644 --- 
a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -53,7 +53,7 @@ def test_size_on_categorical(as_index): result = df.groupby(["A", "B"], as_index=as_index).size() expected = DataFrame( - [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"], + [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"] ) expected["A"] = expected["A"].astype("category") if as_index: From bb88589bfea6a8d9f8eab627b8728781bf2e81e2 Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Wed, 2 Sep 2020 17:00:49 +0200 Subject: [PATCH 0623/1025] API: replace dropna=False option with na_sentinel=None in factorize (#35852) * remove \n from docstring * fix issue 17038 * revert change * revert change * add dropna doc for factorize * rephrase the doc * flake8 * fixup * use NaN * add dropna in series.factorize * black * add test * linting * linting * doct * fix black * fixup * fix doctest * add whatsnew * linting * fix test * try one time * hide dropna and use na_sentinel=None * update whatsnew * rename test function * remove dropna from factorize * update doc * docstring * update doc * add comment * code change on review * update doc * code change on review * minor move in whatsnew * add default example * doc * one more try * explicit doc * add space --- doc/source/whatsnew/v1.1.2.rst | 8 ++++++ pandas/core/algorithms.py | 33 +++++++++++++++++++--- pandas/core/base.py | 2 +- pandas/core/groupby/grouper.py | 7 ++++- pandas/tests/base/test_factorize.py | 13 +++++++++ pandas/tests/test_algos.py | 44 ++++++----------------------- 6 files changed, 66 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 9b1ad658d4666..fdfb084b47a89 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -35,6 +35,14 @@ Bug fixes .. --------------------------------------------------------------------------- +.. 
_whatsnew_112.other: + +Other +~~~~~ +- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize`(:issue:`35667`) + +.. --------------------------------------------------------------------------- + .. _whatsnew_112.contributors: Contributors diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6d6bb21165814..d2af6c132eca2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -526,9 +526,8 @@ def _factorize_array( def factorize( values, sort: bool = False, - na_sentinel: int = -1, + na_sentinel: Optional[int] = -1, size_hint: Optional[int] = None, - dropna: bool = True, ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]: """ Encode the object as an enumerated type or categorical variable. @@ -541,8 +540,11 @@ def factorize( Parameters ---------- {values}{sort} - na_sentinel : int, default -1 - Value to mark "not found". + na_sentinel : int or None, default -1 + Value to mark "not found". If None, will not drop the NaN + from the uniques of the values. + + .. versionchanged:: 1.1.2 {size_hint}\ Returns @@ -620,6 +622,22 @@ def factorize( array([0, 0, 1]...) >>> uniques Index(['a', 'c'], dtype='object') + + If NaN is in the values, and we want to include NaN in the uniques of the + values, it can be achieved by setting ``na_sentinel=None``. + + >>> values = np.array([1, 2, 1, np.nan]) + >>> codes, uniques = pd.factorize(values) # default: na_sentinel=-1 + >>> codes + array([ 0, 1, 0, -1]) + >>> uniques + array([1., 2.]) + + >>> codes, uniques = pd.factorize(values, na_sentinel=None) + >>> codes + array([0, 1, 0, 2]) + >>> uniques + array([ 1., 2., nan]) """ # Implementation notes: This method is responsible for 3 things # 1.) 
coercing data to array-like (ndarray, Index, extension array) @@ -633,6 +651,13 @@ def factorize( values = _ensure_arraylike(values) original = values + # GH35667, if na_sentinel=None, we will not dropna NaNs from the uniques + # of values, assign na_sentinel=-1 to replace code value for NaN. + dropna = True + if na_sentinel is None: + na_sentinel = -1 + dropna = False + if is_extension_array_dtype(values.dtype): values = extract_array(values) codes, uniques = values.factorize(na_sentinel=na_sentinel) diff --git a/pandas/core/base.py b/pandas/core/base.py index b62ef668df5e1..1926803d8f04b 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1398,7 +1398,7 @@ def memory_usage(self, deep=False): """ ), ) - def factorize(self, sort=False, na_sentinel=-1): + def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1): return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel) _shared_docs[ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 3017521c6a065..6678edc3821c8 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -587,8 +587,13 @@ def _make_codes(self) -> None: codes = self.grouper.codes_info uniques = self.grouper.result_index else: + # GH35667, replace dropna=False with na_sentinel=None + if not self.dropna: + na_sentinel = None + else: + na_sentinel = -1 codes, uniques = algorithms.factorize( - self.grouper, sort=self.sort, dropna=self.dropna + self.grouper, sort=self.sort, na_sentinel=na_sentinel ) uniques = Index(uniques, name=self.name) self._codes = codes diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py index 415a8b7e4362f..9fad9856d53cc 100644 --- a/pandas/tests/base/test_factorize.py +++ b/pandas/tests/base/test_factorize.py @@ -26,3 +26,16 @@ def test_factorize(index_or_series_obj, sort): tm.assert_numpy_array_equal(result_codes, expected_codes) tm.assert_index_equal(result_uniques, expected_uniques) + + +def 
test_series_factorize_na_sentinel_none(): + # GH35667 + values = np.array([1, 2, 1, np.nan]) + ser = pd.Series(values) + codes, uniques = ser.factorize(na_sentinel=None) + + expected_codes = np.array([0, 1, 0, 2], dtype="int64") + expected_uniques = pd.Index([1.0, 2.0, np.nan]) + + tm.assert_numpy_array_equal(codes, expected_codes) + tm.assert_index_equal(uniques, expected_uniques) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 67a2dc2303550..b4e97f1e341e4 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -340,73 +340,47 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques): tm.assert_extension_array_equal(uniques, expected_uniques) @pytest.mark.parametrize( - "data, dropna, expected_codes, expected_uniques", + "data, expected_codes, expected_uniques", [ ( ["a", None, "b", "a"], - True, - np.array([0, -1, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b"], dtype=object), - ), - ( - ["a", np.nan, "b", "a"], - True, - np.array([0, -1, 1, 0], dtype=np.dtype("intp")), - np.array(["a", "b"], dtype=object), - ), - ( - ["a", None, "b", "a"], - False, np.array([0, 2, 1, 0], dtype=np.dtype("intp")), np.array(["a", "b", np.nan], dtype=object), ), ( ["a", np.nan, "b", "a"], - False, np.array([0, 2, 1, 0], dtype=np.dtype("intp")), np.array(["a", "b", np.nan], dtype=object), ), ], ) - def test_object_factorize_dropna( - self, data, dropna, expected_codes, expected_uniques + def test_object_factorize_na_sentinel_none( + self, data, expected_codes, expected_uniques ): - codes, uniques = algos.factorize(data, dropna=dropna) + codes, uniques = algos.factorize(data, na_sentinel=None) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) @pytest.mark.parametrize( - "data, dropna, expected_codes, expected_uniques", + "data, expected_codes, expected_uniques", [ ( [1, None, 1, 2], - True, - np.array([0, -1, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2], 
dtype="O"), - ), - ( - [1, np.nan, 1, 2], - True, - np.array([0, -1, 0, 1], dtype=np.dtype("intp")), - np.array([1, 2], dtype=np.float64), - ), - ( - [1, None, 1, 2], - False, np.array([0, 2, 0, 1], dtype=np.dtype("intp")), np.array([1, 2, np.nan], dtype="O"), ), ( [1, np.nan, 1, 2], - False, np.array([0, 2, 0, 1], dtype=np.dtype("intp")), np.array([1, 2, np.nan], dtype=np.float64), ), ], ) - def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques): - codes, uniques = algos.factorize(data, dropna=dropna) + def test_int_factorize_na_sentinel_none( + self, data, expected_codes, expected_uniques + ): + codes, uniques = algos.factorize(data, na_sentinel=None) tm.assert_numpy_array_equal(uniques, expected_uniques) tm.assert_numpy_array_equal(codes, expected_codes) From a4f1338322d750daba7edd0e7ebe4c3913cb6a7b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Sep 2020 16:41:39 +0100 Subject: [PATCH 0624/1025] TYP: update setup.cfg (#36067) --- setup.cfg | 3 --- 1 file changed, 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index 2d1c8037636de..29c731848de8e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -321,9 +321,6 @@ check_untyped_defs=False [mypy-pandas.plotting._matplotlib.core] check_untyped_defs=False -[mypy-pandas.plotting._matplotlib.misc] -check_untyped_defs=False - [mypy-pandas.plotting._misc] check_untyped_defs=False From 3b46b97b026b486de3233c115bdee39928380043 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Sep 2020 16:42:30 +0100 Subject: [PATCH 0625/1025] TYP: statically define attributes in plotting._matplotlib.core (#36068) pandas\plotting\_matplotlib\core.py:231: error: "MPLPlot" has no attribute "style" [attr-defined] pandas\plotting\_matplotlib\core.py:232: error: "MPLPlot" has no attribute "style" [attr-defined] pandas\plotting\_matplotlib\core.py:233: error: "MPLPlot" has no attribute "style" [attr-defined] pandas\plotting\_matplotlib\core.py:235: error: "MPLPlot" has no attribute "style" [attr-defined] 
pandas\plotting\_matplotlib\core.py:385: error: "MPLPlot" has no attribute "label"; maybe "ylabel" or "xlabel"? [attr-defined] pandas\plotting\_matplotlib\core.py:553: error: "MPLPlot" has no attribute "mark_right" [attr-defined] pandas\plotting\_matplotlib\core.py:732: error: "MPLPlot" has no attribute "style" [attr-defined] pandas\plotting\_matplotlib\core.py:733: error: "MPLPlot" has no attribute "style" [attr-defined] pandas\plotting\_matplotlib\core.py:735: error: "MPLPlot" has no attribute "style" [attr-defined] pandas\plotting\_matplotlib\core.py:738: error: "MPLPlot" has no attribute "style" [attr-defined] pandas\plotting\_matplotlib\core.py:739: error: "MPLPlot" has no attribute "style" [attr-defined] pandas\plotting\_matplotlib\core.py:741: error: "MPLPlot" has no attribute "style" [attr-defined] pandas\plotting\_matplotlib\core.py:1008: error: "ScatterPlot" has no attribute "label" [attr-defined] pandas\plotting\_matplotlib\core.py:1075: error: "LinePlot" has no attribute "stacked" [attr-defined] pandas\plotting\_matplotlib\core.py:1180: error: "LinePlot" has no attribute "stacked" [attr-defined] pandas\plotting\_matplotlib\core.py:1269: error: "AreaPlot" has no attribute "stacked" [attr-defined] pandas\plotting\_matplotlib\core.py:1351: error: "BarPlot" has no attribute "stacked" [attr-defined] pandas\plotting\_matplotlib\core.py:1427: error: "BarPlot" has no attribute "stacked" [attr-defined] --- pandas/plotting/_matplotlib/core.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 93ba9bd26630b..5270c7362d29f 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -66,16 +66,6 @@ def _kind(self): _layout_type = "vertical" _default_rot = 0 orientation: Optional[str] = None - _pop_attributes = [ - "label", - "style", - "mark_right", - "stacked", - ] - _attr_defaults = { - "mark_right": True, - "stacked": 
False, - } def __init__( self, @@ -165,9 +155,10 @@ def __init__( self.logx = kwds.pop("logx", False) self.logy = kwds.pop("logy", False) self.loglog = kwds.pop("loglog", False) - for attr in self._pop_attributes: - value = kwds.pop(attr, self._attr_defaults.get(attr, None)) - setattr(self, attr, value) + self.label = kwds.pop("label", None) + self.style = kwds.pop("style", None) + self.mark_right = kwds.pop("mark_right", True) + self.stacked = kwds.pop("stacked", False) self.ax = ax self.fig = fig From 1e9b9e3d9ce8f6a6788c09cee583f57be066bfbc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 2 Sep 2020 08:43:16 -0700 Subject: [PATCH 0626/1025] BUG: frame._item_cache not cleared when Series is altered (#36051) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/generic.py | 4 ++++ pandas/tests/frame/test_missing.py | 15 +++++++++++---- .../tests/indexing/test_chaining_and_caching.py | 15 +++++++++++++++ 4 files changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index fdfb084b47a89..c740c7b3882c9 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -32,6 +32,7 @@ Bug fixes - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) +- Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`36051`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 233d48bfc85c3..486bea7cd1b47 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3315,6 +3315,10 @@ def _maybe_update_cacher( if len(self) == len(ref): # otherwise, either self or ref has swapped in new arrays ref._maybe_cache_changed(cacher[0], self) + else: + # GH#33675 we have swapped in a new array, so parent + # reference to self is now invalid + ref._item_cache.pop(cacher[0], None) if verify_is_copy: self._check_setitem_copy(stacklevel=5, t="referant") diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 9bf5d24085697..b4f91590e09d1 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -135,13 +135,20 @@ def test_drop_and_dropna_caching(self): df2 = df.copy() df["A"].dropna() tm.assert_series_equal(df["A"], original) - return_value = df["A"].dropna(inplace=True) - tm.assert_series_equal(df["A"], expected) + + ser = df["A"] + return_value = ser.dropna(inplace=True) + tm.assert_series_equal(ser, expected) + tm.assert_series_equal(df["A"], original) assert return_value is None + df2["A"].drop([1]) tm.assert_series_equal(df2["A"], original) - return_value = df2["A"].drop([1], inplace=True) - tm.assert_series_equal(df2["A"], original.drop([1])) + + ser = df2["A"] + return_value = ser.drop([1], inplace=True) + tm.assert_series_equal(ser, original.drop([1])) + tm.assert_series_equal(df2["A"], original) assert return_value is None def test_dropna_corner(self, float_frame): diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index fa5fe5ba5c384..9910ef1b04b1a 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -81,6 +81,21 @@ def test_setitem_cache_updating(self): tm.assert_frame_equal(out, expected) 
tm.assert_series_equal(out["A"], expected["A"]) + def test_altering_series_clears_parent_cache(self): + # GH #33675 + df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"]) + ser = df["A"] + + assert "A" in df._item_cache + + # Adding a new entry to ser swaps in a new array, so "A" needs to + # be removed from df._item_cache + ser["c"] = 5 + assert len(ser) == 3 + assert "A" not in df._item_cache + assert df["A"] is not ser + assert len(df["A"]) == 2 + class TestChaining: def test_setitem_chained_setfault(self): From 2b99012c2da87cdbd13a3004f7f17ce341c727e4 Mon Sep 17 00:00:00 2001 From: tiagohonorato <61059243+tiagohonorato@users.noreply.github.com> Date: Wed, 2 Sep 2020 13:14:00 -0300 Subject: [PATCH 0627/1025] CLN remove trailing commas (#36057) --- pandas/tests/arithmetic/test_interval.py | 4 +--- pandas/tests/arithmetic/test_numeric.py | 4 ++-- pandas/tests/arrays/boolean/test_logical.py | 4 +--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py index 50b5fe8e6f6b9..72ef7ea6bf8ca 100644 --- a/pandas/tests/arithmetic/test_interval.py +++ b/pandas/tests/arithmetic/test_interval.py @@ -156,9 +156,7 @@ def test_compare_scalar_other(self, op, array, other): expected = self.elementwise_comparison(op, array, other) tm.assert_numpy_array_equal(result, expected) - def test_compare_list_like_interval( - self, op, array, interval_constructor, - ): + def test_compare_list_like_interval(self, op, array, interval_constructor): # same endpoints other = interval_constructor(array.left, array.right) result = op(array, other) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 484f83deb0f55..ecac08ffe3ba2 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -99,7 +99,7 @@ class TestNumericArraylikeArithmeticWithDatetimeLike: # TODO: also check name retentention 
@pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) @pytest.mark.parametrize( - "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype), + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype) ) def test_mul_td64arr(self, left, box_cls): # GH#22390 @@ -119,7 +119,7 @@ def test_mul_td64arr(self, left, box_cls): # TODO: also check name retentention @pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series]) @pytest.mark.parametrize( - "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype), + "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype) ) def test_div_td64arr(self, left, box_cls): # GH#22390 diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py index e79262e1b7934..8ed1c27087b02 100644 --- a/pandas/tests/arrays/boolean/test_logical.py +++ b/pandas/tests/arrays/boolean/test_logical.py @@ -205,9 +205,7 @@ def test_kleene_xor_scalar(self, other, expected): a, pd.array([True, False, None], dtype="boolean") ) - @pytest.mark.parametrize( - "other", [True, False, pd.NA, [True, False, None] * 3], - ) + @pytest.mark.parametrize("other", [True, False, pd.NA, [True, False, None] * 3]) def test_no_masked_assumptions(self, other, all_logical_operators): # The logical operations should not assume that masked values are False! 
a = pd.arrays.BooleanArray( From 0f2377d8ea90fb7bfe2bb8132d80b5cc31dc42c3 Mon Sep 17 00:00:00 2001 From: Chuanzhu Xu Date: Wed, 2 Sep 2020 12:19:48 -0400 Subject: [PATCH 0628/1025] CLN remove unnecessary trailing commas in groupby tests (#36059) --- pandas/tests/groupby/test_timegrouper.py | 2 +- pandas/tests/groupby/transform/test_numba.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py index 84fd7a1bdfb05..4ccbc6a65fd88 100644 --- a/pandas/tests/groupby/test_timegrouper.py +++ b/pandas/tests/groupby/test_timegrouper.py @@ -780,6 +780,6 @@ def test_grouper_period_index(self): result = period_series.groupby(period_series.index.month).sum() expected = pd.Series( - range(0, periods), index=Index(range(1, periods + 1), name=index.name), + range(0, periods), index=Index(range(1, periods + 1), name=index.name) ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index ee482571e644d..87723cd7c8f50 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -56,7 +56,7 @@ def func(values, index): func = numba.jit(func) data = DataFrame( - {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} grouped = data.groupby(0) @@ -89,7 +89,7 @@ def func_2(values, index): func_2 = numba.jit(func_2) data = DataFrame( - {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] ) engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} grouped = data.groupby(0) @@ -120,7 +120,7 @@ def func_1(values, index): return values + 1 data = 
DataFrame( - {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1], + {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1] ) grouped = data.groupby(0) expected = grouped.transform(func_1, engine="numba") From b3480e98d2a06db9caeb165a8eadd9c7f215c113 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 2 Sep 2020 10:54:27 -0700 Subject: [PATCH 0629/1025] CLN: rename private functions used across modules (#36049) --- pandas/plotting/_matplotlib/boxplot.py | 16 +++++++-------- pandas/plotting/_matplotlib/core.py | 24 +++++++++++----------- pandas/plotting/_matplotlib/hist.py | 20 ++++++++++-------- pandas/plotting/_matplotlib/misc.py | 14 ++++++------- pandas/plotting/_matplotlib/style.py | 2 +- pandas/plotting/_matplotlib/tools.py | 20 +++++++++--------- pandas/tests/plotting/common.py | 16 +++++++-------- pandas/tests/plotting/test_misc.py | 20 +++++++++--------- pandas/tests/plotting/test_series.py | 28 +++++++++++++------------- 9 files changed, 82 insertions(+), 78 deletions(-) diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py index 01fe98a6f5403..8ceba22b1f7a4 100644 --- a/pandas/plotting/_matplotlib/boxplot.py +++ b/pandas/plotting/_matplotlib/boxplot.py @@ -12,8 +12,8 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import LinePlot, MPLPlot -from pandas.plotting._matplotlib.style import _get_standard_colors -from pandas.plotting._matplotlib.tools import _flatten, _subplots +from pandas.plotting._matplotlib.style import get_standard_colors +from pandas.plotting._matplotlib.tools import create_subplots, flatten_axes if TYPE_CHECKING: from matplotlib.axes import Axes @@ -84,7 +84,7 @@ def _validate_color_args(self): self.color = None # get standard colors for default - colors = _get_standard_colors(num_colors=3, colormap=self.colormap, color=None) + colors = get_standard_colors(num_colors=3, colormap=self.colormap, color=None) # 
use 2 colors by default, for box/whisker and median # flier colors isn't needed here # because it can be specified by ``sym`` kw @@ -200,11 +200,11 @@ def _grouped_plot_by_column( by = [by] columns = data._get_numeric_data().columns.difference(by) naxes = len(columns) - fig, axes = _subplots( + fig, axes = create_subplots( naxes=naxes, sharex=True, sharey=True, figsize=figsize, ax=ax, layout=layout ) - _axes = _flatten(axes) + _axes = flatten_axes(axes) ax_values = [] @@ -259,7 +259,7 @@ def _get_colors(): # num_colors=3 is required as method maybe_color_bp takes the colors # in positions 0 and 2. # if colors not provided, use same defaults as DataFrame.plot.box - result = _get_standard_colors(num_colors=3) + result = get_standard_colors(num_colors=3) result = np.take(result, [0, 0, 2]) result = np.append(result, "k") @@ -414,7 +414,7 @@ def boxplot_frame_groupby( ): if subplots is True: naxes = len(grouped) - fig, axes = _subplots( + fig, axes = create_subplots( naxes=naxes, squeeze=False, ax=ax, @@ -423,7 +423,7 @@ def boxplot_frame_groupby( figsize=figsize, layout=layout, ) - axes = _flatten(axes) + axes = flatten_axes(axes) ret = pd.Series(dtype=object) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 5270c7362d29f..2d64e1b051444 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -32,14 +32,14 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters -from pandas.plotting._matplotlib.style import _get_standard_colors +from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.tools import ( - _flatten, - _get_all_lines, - _get_xlim, - _handle_shared_axes, - _subplots, + create_subplots, + flatten_axes, format_date_labels, + get_all_lines, + get_xlim, + handle_shared_axes, table, ) @@ -306,7 
+306,7 @@ def _maybe_right_yaxis(self, ax: "Axes", axes_num): def _setup_subplots(self): if self.subplots: - fig, axes = _subplots( + fig, axes = create_subplots( naxes=self.nseries, sharex=self.sharex, sharey=self.sharey, @@ -325,7 +325,7 @@ def _setup_subplots(self): fig.set_size_inches(self.figsize) axes = self.ax - axes = _flatten(axes) + axes = flatten_axes(axes) valid_log = {False, True, "sym", None} input_log = {self.logx, self.logy, self.loglog} @@ -457,7 +457,7 @@ def _adorn_subplots(self): if len(self.axes) > 0: all_axes = self._get_subplots() nrows, ncols = self._get_axes_layout() - _handle_shared_axes( + handle_shared_axes( axarr=all_axes, nplots=len(all_axes), naxes=nrows * ncols, @@ -744,7 +744,7 @@ def _get_colors(self, num_colors=None, color_kwds="color"): if num_colors is None: num_colors = self.nseries - return _get_standard_colors( + return get_standard_colors( num_colors=num_colors, colormap=self.colormap, color=self.kwds.get(color_kwds), @@ -1123,8 +1123,8 @@ def _make_plot(self): # reset of xlim should be used for ts data # TODO: GH28021, should find a way to change view limit on xaxis - lines = _get_all_lines(ax) - left, right = _get_xlim(lines) + lines = get_all_lines(ax) + left, right = get_xlim(lines) ax.set_xlim(left, right) @classmethod diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py index ffd46d1b191db..89035552d4309 100644 --- a/pandas/plotting/_matplotlib/hist.py +++ b/pandas/plotting/_matplotlib/hist.py @@ -8,7 +8,11 @@ from pandas.io.formats.printing import pprint_thing from pandas.plotting._matplotlib.core import LinePlot, MPLPlot -from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots +from pandas.plotting._matplotlib.tools import ( + create_subplots, + flatten_axes, + set_ticks_props, +) if TYPE_CHECKING: from matplotlib.axes import Axes @@ -198,11 +202,11 @@ def _grouped_plot( grouped = grouped[column] naxes = len(grouped) - fig, axes = _subplots( + fig, axes = 
create_subplots( naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout ) - _axes = _flatten(axes) + _axes = flatten_axes(axes) for i, (key, group) in enumerate(grouped): ax = _axes[i] @@ -286,7 +290,7 @@ def plot_group(group, ax): rot=rot, ) - _set_ticks_props( + set_ticks_props( axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot ) @@ -337,7 +341,7 @@ def hist_series( ax.grid(grid) axes = np.array([ax]) - _set_ticks_props( + set_ticks_props( axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot ) @@ -419,7 +423,7 @@ def hist_frame( if naxes == 0: raise ValueError("hist method requires numerical columns, nothing to plot.") - fig, axes = _subplots( + fig, axes = create_subplots( naxes=naxes, ax=ax, squeeze=False, @@ -428,7 +432,7 @@ def hist_frame( figsize=figsize, layout=layout, ) - _axes = _flatten(axes) + _axes = flatten_axes(axes) can_set_label = "label" not in kwds @@ -442,7 +446,7 @@ def hist_frame( if legend: ax.legend() - _set_ticks_props( + set_ticks_props( axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot ) fig.subplots_adjust(wspace=0.3, hspace=0.3) diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py index c5e7c55970c3e..a1c62f9fce23c 100644 --- a/pandas/plotting/_matplotlib/misc.py +++ b/pandas/plotting/_matplotlib/misc.py @@ -10,8 +10,8 @@ from pandas.core.dtypes.missing import notna from pandas.io.formats.printing import pprint_thing -from pandas.plotting._matplotlib.style import _get_standard_colors -from pandas.plotting._matplotlib.tools import _set_ticks_props, _subplots +from pandas.plotting._matplotlib.style import get_standard_colors +from pandas.plotting._matplotlib.tools import create_subplots, set_ticks_props if TYPE_CHECKING: from matplotlib.axes import Axes @@ -36,7 +36,7 @@ def scatter_matrix( df = frame._get_numeric_data() n = df.columns.size naxes = n * n - fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, 
squeeze=False) + fig, axes = create_subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False) # no gaps between subplots fig.subplots_adjust(wspace=0, hspace=0) @@ -112,7 +112,7 @@ def scatter_matrix( locs = locs.astype(int) axes[0][0].yaxis.set_ticklabels(locs) - _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) + set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0) return axes @@ -147,7 +147,7 @@ def normalize(series): ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1]) to_plot: Dict[Label, List[List]] = {} - colors = _get_standard_colors( + colors = get_standard_colors( num_colors=len(classes), colormap=colormap, color_type="random", color=color ) @@ -255,7 +255,7 @@ def f(t): t = np.linspace(-np.pi, np.pi, samples) used_legends: Set[str] = set() - color_values = _get_standard_colors( + color_values = get_standard_colors( num_colors=len(classes), colormap=colormap, color_type="random", color=color ) colors = dict(zip(classes, color_values)) @@ -382,7 +382,7 @@ def parallel_coordinates( if ax is None: ax = plt.gca() - color_values = _get_standard_colors( + color_values = get_standard_colors( num_colors=len(classes), colormap=colormap, color_type="random", color=color ) diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 5f1105f0e4233..904a760a03e58 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -10,7 +10,7 @@ import pandas.core.common as com -def _get_standard_colors( +def get_standard_colors( num_colors=None, colormap=None, color_type: str = "default", color=None ): import matplotlib.pyplot as plt diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 4d643ffb734e4..98aaab6838fba 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -100,7 +100,7 @@ def _get_layout(nplots: int, layout=None, layout_type: str = "box") -> Tuple[int # copied from matplotlib/pyplot.py and 
modified for pandas.plotting -def _subplots( +def create_subplots( naxes: int, sharex: bool = False, sharey: bool = False, @@ -194,7 +194,7 @@ def _subplots( fig = plt.figure(**fig_kw) else: if is_list_like(ax): - ax = _flatten(ax) + ax = flatten_axes(ax) if layout is not None: warnings.warn( "When passing multiple axes, layout keyword is ignored", UserWarning @@ -221,7 +221,7 @@ def _subplots( if squeeze: return fig, ax else: - return fig, _flatten(ax) + return fig, flatten_axes(ax) else: warnings.warn( "To output multiple subplots, the figure containing " @@ -264,7 +264,7 @@ def _subplots( for ax in axarr[naxes:]: ax.set_visible(False) - _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey) + handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey) if squeeze: # Reshape the array to have the final desired dimension (nrow,ncol), @@ -297,7 +297,7 @@ def _remove_labels_from_axis(axis: "Axis"): axis.get_label().set_visible(False) -def _handle_shared_axes( +def handle_shared_axes( axarr: Iterable["Axes"], nplots: int, naxes: int, @@ -351,7 +351,7 @@ def _handle_shared_axes( _remove_labels_from_axis(ax.yaxis) -def _flatten(axes: Union["Axes", Sequence["Axes"]]) -> Sequence["Axes"]: +def flatten_axes(axes: Union["Axes", Sequence["Axes"]]) -> Sequence["Axes"]: if not is_list_like(axes): return np.array([axes]) elif isinstance(axes, (np.ndarray, ABCIndexClass)): @@ -359,7 +359,7 @@ def _flatten(axes: Union["Axes", Sequence["Axes"]]) -> Sequence["Axes"]: return np.array(axes) -def _set_ticks_props( +def set_ticks_props( axes: Union["Axes", Sequence["Axes"]], xlabelsize=None, xrot=None, @@ -368,7 +368,7 @@ def _set_ticks_props( ): import matplotlib.pyplot as plt - for ax in _flatten(axes): + for ax in flatten_axes(axes): if xlabelsize is not None: plt.setp(ax.get_xticklabels(), fontsize=xlabelsize) if xrot is not None: @@ -380,7 +380,7 @@ def _set_ticks_props( return axes -def _get_all_lines(ax: "Axes") -> List["Line2D"]: +def 
get_all_lines(ax: "Axes") -> List["Line2D"]: lines = ax.get_lines() if hasattr(ax, "right_ax"): @@ -392,7 +392,7 @@ def _get_all_lines(ax: "Axes") -> List["Line2D"]: return lines -def _get_xlim(lines: Iterable["Line2D"]) -> Tuple[float, float]: +def get_xlim(lines: Iterable["Line2D"]) -> Tuple[float, float]: left, right = np.inf, -np.inf for l in lines: x = l.get_xdata(orig=False) diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index 3b1ff233c5ec1..b753c96af6290 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -13,13 +13,13 @@ from pandas import DataFrame, Series import pandas._testing as tm -""" -This is a common base class used for various plotting tests -""" - @td.skip_if_no_mpl class TestPlotBase: + """ + This is a common base class used for various plotting tests + """ + def setup_method(self, method): import matplotlib as mpl @@ -330,7 +330,7 @@ def _check_axes_shape(self, axes, axes_num=None, layout=None, figsize=None): figsize : tuple expected figsize. 
default is matplotlib default """ - from pandas.plotting._matplotlib.tools import _flatten + from pandas.plotting._matplotlib.tools import flatten_axes if figsize is None: figsize = self.default_figsize @@ -343,7 +343,7 @@ def _check_axes_shape(self, axes, axes_num=None, layout=None, figsize=None): assert len(ax.get_children()) > 0 if layout is not None: - result = self._get_axes_layout(_flatten(axes)) + result = self._get_axes_layout(flatten_axes(axes)) assert result == layout tm.assert_numpy_array_equal( @@ -370,9 +370,9 @@ def _flatten_visible(self, axes): axes : matplotlib Axes object, or its list-like """ - from pandas.plotting._matplotlib.tools import _flatten + from pandas.plotting._matplotlib.tools import flatten_axes - axes = _flatten(axes) + axes = flatten_axes(axes) axes = [ax for ax in axes if ax.get_visible()] return axes diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index f5c1c58f3f7ed..130acaa8bcd58 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -353,7 +353,7 @@ def test_get_standard_colors_random_seed(self): # GH17525 df = DataFrame(np.zeros((10, 10))) - # Make sure that the random seed isn't reset by _get_standard_colors + # Make sure that the random seed isn't reset by get_standard_colors plotting.parallel_coordinates(df, 0) rand1 = random.random() plotting.parallel_coordinates(df, 0) @@ -361,19 +361,19 @@ def test_get_standard_colors_random_seed(self): assert rand1 != rand2 # Make sure it produces the same colors every time it's called - from pandas.plotting._matplotlib.style import _get_standard_colors + from pandas.plotting._matplotlib.style import get_standard_colors - color1 = _get_standard_colors(1, color_type="random") - color2 = _get_standard_colors(1, color_type="random") + color1 = get_standard_colors(1, color_type="random") + color2 = get_standard_colors(1, color_type="random") assert color1 == color2 def 
test_get_standard_colors_default_num_colors(self): - from pandas.plotting._matplotlib.style import _get_standard_colors + from pandas.plotting._matplotlib.style import get_standard_colors # Make sure the default color_types returns the specified amount - color1 = _get_standard_colors(1, color_type="default") - color2 = _get_standard_colors(9, color_type="default") - color3 = _get_standard_colors(20, color_type="default") + color1 = get_standard_colors(1, color_type="default") + color2 = get_standard_colors(9, color_type="default") + color3 = get_standard_colors(20, color_type="default") assert len(color1) == 1 assert len(color2) == 9 assert len(color3) == 20 @@ -401,10 +401,10 @@ def test_get_standard_colors_no_appending(self): # correctly. from matplotlib import cm - from pandas.plotting._matplotlib.style import _get_standard_colors + from pandas.plotting._matplotlib.style import get_standard_colors color_before = cm.gnuplot(range(5)) - color_after = _get_standard_colors(1, color=color_before) + color_after = get_standard_colors(1, color=color_before) assert len(color_after) == len(color_before) df = DataFrame(np.random.randn(48, 4), columns=list("ABCD")) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index cc00626e992f3..c296e2a6278c5 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -809,53 +809,53 @@ def test_series_grid_settings(self): @pytest.mark.slow def test_standard_colors(self): - from pandas.plotting._matplotlib.style import _get_standard_colors + from pandas.plotting._matplotlib.style import get_standard_colors for c in ["r", "red", "green", "#FF0000"]: - result = _get_standard_colors(1, color=c) + result = get_standard_colors(1, color=c) assert result == [c] - result = _get_standard_colors(1, color=[c]) + result = get_standard_colors(1, color=[c]) assert result == [c] - result = _get_standard_colors(3, color=c) + result = get_standard_colors(3, color=c) assert 
result == [c] * 3 - result = _get_standard_colors(3, color=[c]) + result = get_standard_colors(3, color=[c]) assert result == [c] * 3 @pytest.mark.slow def test_standard_colors_all(self): import matplotlib.colors as colors - from pandas.plotting._matplotlib.style import _get_standard_colors + from pandas.plotting._matplotlib.style import get_standard_colors # multiple colors like mediumaquamarine for c in colors.cnames: - result = _get_standard_colors(num_colors=1, color=c) + result = get_standard_colors(num_colors=1, color=c) assert result == [c] - result = _get_standard_colors(num_colors=1, color=[c]) + result = get_standard_colors(num_colors=1, color=[c]) assert result == [c] - result = _get_standard_colors(num_colors=3, color=c) + result = get_standard_colors(num_colors=3, color=c) assert result == [c] * 3 - result = _get_standard_colors(num_colors=3, color=[c]) + result = get_standard_colors(num_colors=3, color=[c]) assert result == [c] * 3 # single letter colors like k for c in colors.ColorConverter.colors: - result = _get_standard_colors(num_colors=1, color=c) + result = get_standard_colors(num_colors=1, color=c) assert result == [c] - result = _get_standard_colors(num_colors=1, color=[c]) + result = get_standard_colors(num_colors=1, color=[c]) assert result == [c] - result = _get_standard_colors(num_colors=3, color=c) + result = get_standard_colors(num_colors=3, color=c) assert result == [c] * 3 - result = _get_standard_colors(num_colors=3, color=[c]) + result = get_standard_colors(num_colors=3, color=[c]) assert result == [c] * 3 def test_series_plot_color_kwargs(self): From 9ee81f566fcdd74cc428cb670d0a321017a2ec7f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Sep 2020 19:44:39 +0100 Subject: [PATCH 0630/1025] TYP: misc typing in core\indexes\base.py (#35991) --- pandas/core/frame.py | 6 ++-- pandas/core/indexes/base.py | 51 ++++++++++++++++++++++++++------- pandas/core/indexes/interval.py | 6 +++- 3 files changed, 48 insertions(+), 15 
deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e78c15d125e8d..5b8c421db3ce1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1776,13 +1776,13 @@ def from_records( arrays = [data[k] for k in columns] else: arrays = [] - arr_columns = [] + arr_columns_list = [] for k, v in data.items(): if k in columns: - arr_columns.append(k) + arr_columns_list.append(k) arrays.append(v) - arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns) + arrays, arr_columns = reorder_arrays(arrays, arr_columns_list, columns) elif isinstance(data, (np.ndarray, DataFrame)): arrays, columns = to_arrays(data, columns) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a07c3328def54..48b02fc525cc1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -10,6 +10,8 @@ Hashable, List, Optional, + Sequence, + TypeVar, Union, ) import warnings @@ -22,7 +24,7 @@ from pandas._libs.tslibs import OutOfBoundsDatetime, Timestamp from pandas._libs.tslibs.period import IncompatibleFrequency from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import DtypeObj, Label +from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import InvalidIndexError @@ -98,7 +100,7 @@ ) if TYPE_CHECKING: - from pandas import Series + from pandas import RangeIndex, Series __all__ = ["Index"] @@ -188,6 +190,9 @@ def _new_Index(cls, d): return cls.__new__(cls, **d) +_IndexT = TypeVar("_IndexT", bound="Index") + + class Index(IndexOpsMixin, PandasObject): """ Immutable ndarray implementing an ordered, sliceable set. 
The basic object @@ -787,7 +792,13 @@ def repeat(self, repeats, axis=None): # -------------------------------------------------------------------- # Copying Methods - def copy(self, name=None, deep=False, dtype=None, names=None): + def copy( + self: _IndexT, + name: Optional[Label] = None, + deep: bool = False, + dtype: Optional[Dtype] = None, + names: Optional[Sequence[Label]] = None, + ) -> _IndexT: """ Make a copy of this object. @@ -949,10 +960,9 @@ def _format_with_header( # could have nans mask = isna(values) if mask.any(): - result = np.array(result) - result[mask] = na_rep - # error: "List[str]" has no attribute "tolist" - result = result.tolist() # type: ignore[attr-defined] + result_arr = np.array(result) + result_arr[mask] = na_rep + result = result_arr.tolist() else: result = trim_front(format_array(values, None, justify="left")) return header + result @@ -4913,7 +4923,13 @@ def _get_string_slice(self, key: str_t, use_lhs: bool = True, use_rhs: bool = Tr # overridden in DatetimeIndex, TimedeltaIndex and PeriodIndex raise NotImplementedError - def slice_indexer(self, start=None, end=None, step=None, kind=None): + def slice_indexer( + self, + start: Optional[Label] = None, + end: Optional[Label] = None, + step: Optional[int] = None, + kind: Optional[str_t] = None, + ) -> slice: """ Compute the slice indexer for input labels and step. @@ -5513,7 +5529,9 @@ def ensure_index_from_sequences(sequences, names=None): return MultiIndex.from_arrays(sequences, names=names) -def ensure_index(index_like, copy: bool = False): +def ensure_index( + index_like: Union[AnyArrayLike, Sequence], copy: bool = False +) -> Index: """ Ensure that we have an index from some index-like object. 
@@ -5549,7 +5567,18 @@ def ensure_index(index_like, copy: bool = False): index_like = index_like.copy() return index_like if hasattr(index_like, "name"): - return Index(index_like, name=index_like.name, copy=copy) + # https://github.com/python/mypy/issues/1424 + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # Sequence[Any]]" has no attribute "name" [union-attr] + # error: Item "Sequence[Any]" of "Union[ExtensionArray, Sequence[Any]]" + # has no attribute "name" [union-attr] + # error: "Sequence[Any]" has no attribute "name" [attr-defined] + # error: Item "Sequence[Any]" of "Union[Series, Sequence[Any]]" has no + # attribute "name" [union-attr] + # error: Item "Sequence[Any]" of "Union[Any, Sequence[Any]]" has no + # attribute "name" [union-attr] + name = index_like.name # type: ignore[union-attr, attr-defined] + return Index(index_like, name=name, copy=copy) if is_iterator(index_like): index_like = list(index_like) @@ -5604,7 +5633,7 @@ def _validate_join_method(method: str): raise ValueError(f"do not recognize join method {method}") -def default_index(n): +def default_index(n: int) -> "RangeIndex": from pandas.core.indexes.range import RangeIndex return RangeIndex(0, n, name=None) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 5d309ef7cd515..08f9bd51de77b 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1,7 +1,7 @@ """ define the IntervalIndex """ from operator import le, lt import textwrap -from typing import Any, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union, cast import numpy as np @@ -56,6 +56,9 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.core.ops import get_op_result_name +if TYPE_CHECKING: + from pandas import CategoricalIndex + _VALID_CLOSED = {"left", "right", "both", "neither"} _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -786,6 +789,7 @@ def get_indexer( 
right_indexer = self.right.get_indexer(target_as_index.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) elif is_categorical_dtype(target_as_index.dtype): + target_as_index = cast("CategoricalIndex", target_as_index) # get an indexer for unique categories then propagate to codes via take_1d categories_indexer = self.get_indexer(target_as_index.categories) indexer = take_1d(categories_indexer, target_as_index.codes, fill_value=-1) From 94b7f450f8202fda351dffc8ba2f8686c833e81a Mon Sep 17 00:00:00 2001 From: Byron Boulton Date: Wed, 2 Sep 2020 15:09:22 -0400 Subject: [PATCH 0631/1025] DOC: Fix typo of `=!` to `!=` in docstring (#36077) This fixes GH36075. --- pandas/core/ops/docstrings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/ops/docstrings.py b/pandas/core/ops/docstrings.py index 99c2fefc97ae7..e3a68ad328d55 100644 --- a/pandas/core/ops/docstrings.py +++ b/pandas/core/ops/docstrings.py @@ -611,7 +611,7 @@ def _make_flex_doc(op_name, typ): Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison operators. -Equivalent to `==`, `=!`, `<=`, `<`, `>=`, `>` with support to choose axis +Equivalent to `==`, `!=`, `<=`, `<`, `>=`, `>` with support to choose axis (rows or columns) and level for comparison. 
Parameters From 0a4af99d2912cb01dce897d9c285d6fcc2d49a3d Mon Sep 17 00:00:00 2001 From: Sarthak Vineet Kumar Date: Thu, 3 Sep 2020 00:51:29 +0530 Subject: [PATCH 0632/1025] cleared commas (#36073) Co-authored-by: Sarthak --- pandas/tests/scalar/test_na_scalar.py | 10 +++------- pandas/tests/scalar/timestamp/test_arithmetic.py | 4 ++-- pandas/tests/series/methods/test_argsort.py | 2 +- pandas/tests/series/methods/test_convert_dtypes.py | 4 ++-- pandas/tests/series/methods/test_drop_duplicates.py | 2 +- pandas/tests/series/methods/test_interpolate.py | 4 ++-- pandas/tests/series/methods/test_unstack.py | 6 ++---- pandas/tests/series/test_cumulative.py | 2 +- pandas/tests/test_algos.py | 2 +- 9 files changed, 15 insertions(+), 21 deletions(-) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index dc5eb15348c1b..0a7dfbee4e672 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -111,7 +111,7 @@ def test_pow_special(value, asarray): @pytest.mark.parametrize( - "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)], + "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)] ) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_special(value, asarray): @@ -128,9 +128,7 @@ def test_rpow_special(value, asarray): assert result == value -@pytest.mark.parametrize( - "value", [-1, -1.0, np.int_(-1), np.float_(-1)], -) +@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float_(-1)]) @pytest.mark.parametrize("asarray", [True, False]) def test_rpow_minus_one(value, asarray): if asarray: @@ -193,9 +191,7 @@ def test_logical_not(): assert ~NA is NA -@pytest.mark.parametrize( - "shape", [(3,), (3, 3), (1, 2, 3)], -) +@pytest.mark.parametrize("shape", [(3,), (3, 3), (1, 2, 3)]) def test_arithmetic_ndarray(shape, all_arithmetic_functions): op = all_arithmetic_functions a = np.zeros(shape) diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py 
b/pandas/tests/scalar/timestamp/test_arithmetic.py index 954301b979074..1e980b6e4559c 100644 --- a/pandas/tests/scalar/timestamp/test_arithmetic.py +++ b/pandas/tests/scalar/timestamp/test_arithmetic.py @@ -213,7 +213,7 @@ def test_add_int_with_freq(self, ts, other): with pytest.raises(TypeError, match=msg): other - ts - @pytest.mark.parametrize("shape", [(6,), (2, 3,)]) + @pytest.mark.parametrize("shape", [(6,), (2, 3)]) def test_addsub_m8ndarray(self, shape): # GH#33296 ts = Timestamp("2020-04-04 15:45") @@ -237,7 +237,7 @@ def test_addsub_m8ndarray(self, shape): with pytest.raises(TypeError, match=msg): other - ts - @pytest.mark.parametrize("shape", [(6,), (2, 3,)]) + @pytest.mark.parametrize("shape", [(6,), (2, 3)]) def test_addsub_m8ndarray_tzaware(self, shape): # GH#33296 ts = Timestamp("2020-04-04 15:45", tz="US/Pacific") diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py index 4353eb4c8cd64..ec9ba468c996c 100644 --- a/pandas/tests/series/methods/test_argsort.py +++ b/pandas/tests/series/methods/test_argsort.py @@ -9,7 +9,7 @@ class TestSeriesArgsort: def _check_accum_op(self, name, ser, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal( - func(ser).values, func(np.array(ser)), check_dtype=check_dtype, + func(ser).values, func(np.array(ser)), check_dtype=check_dtype ) # with missing values diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py index dd4bf642e68e8..8a915324a72c1 100644 --- a/pandas/tests/series/methods/test_convert_dtypes.py +++ b/pandas/tests/series/methods/test_convert_dtypes.py @@ -219,10 +219,10 @@ class TestSeriesConvertDtypes: pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), object, { - ((True,), (True, False), (True, False), (True, False),): np.dtype( + ((True,), (True, False), (True, False), (True, False)): np.dtype( "datetime64[ns]" ), - ((False,), (True, False), (True, False), (True, 
False),): np.dtype( + ((False,), (True, False), (True, False), (True, False)): np.dtype( "O" ), }, diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py index 40651c4342e8a..6eb0e09f12658 100644 --- a/pandas/tests/series/methods/test_drop_duplicates.py +++ b/pandas/tests/series/methods/test_drop_duplicates.py @@ -141,7 +141,7 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered): def test_drop_duplicates_categorical_bool(self, ordered): tc = Series( Categorical( - [True, False, True, False], categories=[True, False], ordered=ordered, + [True, False, True, False], categories=[True, False], ordered=ordered ) ) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index c4b10e0ccdc3e..cba9443005f2f 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -30,7 +30,7 @@ ] ) def nontemporal_method(request): - """ Fixture that returns an (method name, required kwargs) pair. + """Fixture that returns an (method name, required kwargs) pair. This fixture does not include method 'time' as a parameterization; that method requires a Series with a DatetimeIndex, and is generally tested @@ -60,7 +60,7 @@ def nontemporal_method(request): ] ) def interp_methods_ind(request): - """ Fixture that returns a (method name, required kwargs) pair to + """Fixture that returns a (method name, required kwargs) pair to be tested for various Index types. 
This fixture does not include methods - 'time', 'index', 'nearest', diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py index cdf6a16e88ad0..d651315d64561 100644 --- a/pandas/tests/series/methods/test_unstack.py +++ b/pandas/tests/series/methods/test_unstack.py @@ -75,9 +75,7 @@ def test_unstack_tuplename_in_multiindex(): expected = pd.DataFrame( [[1, 1, 1], [1, 1, 1], [1, 1, 1]], - columns=pd.MultiIndex.from_tuples( - [("a",), ("b",), ("c",)], names=[("A", "a")], - ), + columns=pd.MultiIndex.from_tuples([("a",), ("b",), ("c",)], names=[("A", "a")]), index=pd.Index([1, 2, 3], name=("B", "b")), ) tm.assert_frame_equal(result, expected) @@ -115,7 +113,7 @@ def test_unstack_mixed_type_name_in_multiindex( result = ser.unstack(unstack_idx) expected = pd.DataFrame( - expected_values, columns=expected_columns, index=expected_index, + expected_values, columns=expected_columns, index=expected_index ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py index 0b4c5f091106a..e070b86717503 100644 --- a/pandas/tests/series/test_cumulative.py +++ b/pandas/tests/series/test_cumulative.py @@ -17,7 +17,7 @@ def _check_accum_op(name, series, check_dtype=True): func = getattr(np, name) tm.assert_numpy_array_equal( - func(series).values, func(np.array(series)), check_dtype=check_dtype, + func(series).values, func(np.array(series)), check_dtype=check_dtype ) # with missing values diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index b4e97f1e341e4..59c6a5d53e7bb 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -944,7 +944,7 @@ def test_isin_int_df_string_search(self): @pytest.mark.xfail(reason="problem related with issue #34125") def test_isin_nan_df_string_search(self): """Comparing df with nan value (np.nan,2) with a string at isin() ("NaN") - -> should not match values because np.nan is not equal str NaN """ 
+ -> should not match values because np.nan is not equal str NaN""" df = pd.DataFrame({"values": [np.nan, 2]}) result = df.isin(["NaN"]) expected_false = pd.DataFrame({"values": [False, False]}) From 03d00b6c92f346c433cc617fd7c8599e1b856389 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 2 Sep 2020 22:31:48 +0100 Subject: [PATCH 0633/1025] TYP: Postponed Evaluation of Annotations (PEP 563) (#36034) --- pandas/core/algorithms.py | 10 ++++++---- pandas/core/construction.py | 13 ++++++------- pandas/core/frame.py | 5 +++-- pandas/core/generic.py | 16 +++++++++------- 4 files changed, 24 insertions(+), 20 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d2af6c132eca2..9d75d21c5637a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -2,6 +2,8 @@ Generic data algorithms. This module is experimental at the moment and not intended for public consumption """ +from __future__ import annotations + import operator from textwrap import dedent from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union @@ -707,7 +709,7 @@ def value_counts( normalize: bool = False, bins=None, dropna: bool = True, -) -> "Series": +) -> Series: """ Compute a histogram of the counts of non-null values. @@ -849,7 +851,7 @@ def duplicated(values, keep="first") -> np.ndarray: return f(values, keep=keep) -def mode(values, dropna: bool = True) -> "Series": +def mode(values, dropna: bool = True) -> Series: """ Returns the mode(s) of an array. 
@@ -1161,7 +1163,7 @@ class SelectNSeries(SelectN): nordered : Series """ - def compute(self, method: str) -> "Series": + def compute(self, method: str) -> Series: n = self.n dtype = self.obj.dtype @@ -1235,7 +1237,7 @@ def __init__(self, obj, n: int, keep: str, columns): columns = list(columns) self.columns = columns - def compute(self, method: str) -> "DataFrame": + def compute(self, method: str) -> DataFrame: from pandas import Int64Index diff --git a/pandas/core/construction.py b/pandas/core/construction.py index f145e76046bee..02b8ed17244cd 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -4,6 +4,7 @@ These should not depend on core.internals. """ +from __future__ import annotations from collections import abc from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast @@ -49,16 +50,14 @@ import pandas.core.common as com if TYPE_CHECKING: - from pandas.core.arrays import ExtensionArray # noqa: F401 - from pandas.core.indexes.api import Index # noqa: F401 - from pandas.core.series import Series # noqa: F401 + from pandas import ExtensionArray, Index, Series def array( data: Union[Sequence[object], AnyArrayLike], dtype: Optional[Dtype] = None, copy: bool = True, -) -> "ExtensionArray": +) -> ExtensionArray: """ Create an array. @@ -389,7 +388,7 @@ def extract_array(obj, extract_numpy: bool = False): def sanitize_array( data, - index: Optional["Index"], + index: Optional[Index], dtype: Optional[DtypeObj] = None, copy: bool = False, raise_cast_failure: bool = False, @@ -594,13 +593,13 @@ def is_empty_data(data: Any) -> bool: def create_series_with_explicit_dtype( data: Any = None, - index: Optional[Union[ArrayLike, "Index"]] = None, + index: Optional[Union[ArrayLike, Index]] = None, dtype: Optional[Dtype] = None, name: Optional[str] = None, copy: bool = False, fastpath: bool = False, dtype_if_empty: Dtype = object, -) -> "Series": +) -> Series: """ Helper to pass an explicit dtype when instantiating an empty Series. 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b8c421db3ce1..7832547685567 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8,6 +8,7 @@ alignment and a host of useful data manipulation methods having to do with the labeling information """ +from __future__ import annotations import collections from collections import abc @@ -885,7 +886,7 @@ def to_string( # ---------------------------------------------------------------------- @property - def style(self) -> "Styler": + def style(self) -> Styler: """ Returns a Styler object. @@ -6530,7 +6531,7 @@ def groupby( squeeze: bool = no_default, observed: bool = False, dropna: bool = True, - ) -> "DataFrameGroupBy": + ) -> DataFrameGroupBy: from pandas.core.groupby.generic import DataFrameGroupBy if squeeze is not no_default: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 486bea7cd1b47..fd924c964c1e1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import collections from datetime import timedelta import functools @@ -110,7 +112,7 @@ from pandas._libs.tslibs import BaseOffset from pandas.core.resample import Resampler - from pandas.core.series import Series # noqa: F401 + from pandas.core.series import Series from pandas.core.window.indexers import BaseIndexer # goal is to be able to define the docs close to function, while still being @@ -391,7 +393,7 @@ def _get_block_manager_axis(cls, axis: Axis) -> int: return m - axis return axis - def _get_axis_resolvers(self, axis: str) -> Dict[str, Union["Series", MultiIndex]]: + def _get_axis_resolvers(self, axis: str) -> Dict[str, Union[Series, MultiIndex]]: # index or columns axis_index = getattr(self, axis) d = dict() @@ -421,10 +423,10 @@ def _get_axis_resolvers(self, axis: str) -> Dict[str, Union["Series", MultiIndex d[axis] = dindex return d - def _get_index_resolvers(self) -> Dict[str, Union["Series", MultiIndex]]: + def _get_index_resolvers(self) -> 
Dict[str, Union[Series, MultiIndex]]: from pandas.core.computation.parsing import clean_column_name - d: Dict[str, Union["Series", MultiIndex]] = {} + d: Dict[str, Union[Series, MultiIndex]] = {} for axis_name in self._AXIS_ORDERS: d.update(self._get_axis_resolvers(axis_name)) @@ -660,7 +662,7 @@ def droplevel(self: FrameOrSeries, level, axis=0) -> FrameOrSeries: result = self.set_axis(new_labels, axis=axis, inplace=False) return result - def pop(self, item: Label) -> Union["Series", Any]: + def pop(self, item: Label) -> Union[Series, Any]: result = self[item] del self[item] if self.ndim == 2: @@ -7684,7 +7686,7 @@ def resample( level=None, origin: Union[str, TimestampConvertibleTypes] = "start_day", offset: Optional[TimedeltaConvertibleTypes] = None, - ) -> "Resampler": + ) -> Resampler: """ Resample time-series data. @@ -10457,7 +10459,7 @@ def mad(self, axis=None, skipna=None, level=None): @doc(Rolling) def rolling( self, - window: "Union[int, timedelta, BaseOffset, BaseIndexer]", + window: Union[int, timedelta, BaseOffset, BaseIndexer], min_periods: Optional[int] = None, center: bool_t = False, win_type: Optional[str] = None, From d60c3a10d7e1214efc88f330d2977bf3a685bf06 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 2 Sep 2020 14:36:37 -0700 Subject: [PATCH 0634/1025] BUG: Index.get_slice_bounds does not accept datetime.date or tz naive datetime.datetimes (#35848) --- doc/source/whatsnew/v1.2.0.rst | 3 +- pandas/core/indexes/datetimes.py | 5 +-- .../tests/indexes/base_class/test_indexing.py | 26 ++++++++++++ .../tests/indexes/datetimes/test_indexing.py | 42 ++++++++++++++++++- pandas/tests/indexes/test_numeric.py | 19 +++++++++ 5 files changed, 90 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/indexes/base_class/test_indexing.py diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 0cfe010b63a6f..9c8ee10a8a0af 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -176,7 
+176,8 @@ Datetimelike - Bug in :attr:`DatetimeArray.date` where a ``ValueError`` would be raised with a read-only backing array (:issue:`33530`) - Bug in ``NaT`` comparisons failing to raise ``TypeError`` on invalid inequality comparisons (:issue:`35046`) - Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) -- +- Bug in :meth:`DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`DatetimeIndex` (:issue:`35690`) +- Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e66f513e347a9..6dcb9250812d0 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -632,7 +632,7 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(orig_key) from err def _maybe_cast_for_get_loc(self, key) -> Timestamp: - # needed to localize naive datetimes + # needed to localize naive datetimes or dates (GH 35690) key = Timestamp(key) if key.tzinfo is None: key = key.tz_localize(self.tz) @@ -677,8 +677,7 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): if self._is_strictly_monotonic_decreasing and len(self) > 1: return upper if side == "left" else lower return lower if side == "left" else upper - else: - return label + return self._maybe_cast_for_get_loc(label) def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True): freq = getattr(self, "freqstr", getattr(self, "inferred_freq", None)) diff --git a/pandas/tests/indexes/base_class/test_indexing.py b/pandas/tests/indexes/base_class/test_indexing.py new file mode 100644 index 0000000000000..196c0401a72be --- /dev/null +++ b/pandas/tests/indexes/base_class/test_indexing.py @@ -0,0 +1,26 @@ +import pytest 
+ +from pandas import Index + + +class TestGetSliceBounds: + @pytest.mark.parametrize("kind", ["getitem", "loc", None]) + @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)]) + def test_get_slice_bounds_within(self, kind, side, expected): + index = Index(list("abcdef")) + result = index.get_slice_bound("e", kind=kind, side=side) + assert result == expected + + @pytest.mark.parametrize("kind", ["getitem", "loc", None]) + @pytest.mark.parametrize("side", ["left", "right"]) + @pytest.mark.parametrize( + "data, bound, expected", [(list("abcdef"), "x", 6), (list("bcdefg"), "a", 0)], + ) + def test_get_slice_bounds_outside(self, kind, side, expected, data, bound): + index = Index(data) + result = index.get_slice_bound(bound, kind=kind, side=side) + assert result == expected + + def test_get_slice_bounds_invalid_side(self): + with pytest.raises(ValueError, match="Invalid value for side kwarg"): + Index([]).get_slice_bound("a", kind=None, side="middle") diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py index 5d2c6daba3f57..539d9cb8f06a7 100644 --- a/pandas/tests/indexes/datetimes/test_indexing.py +++ b/pandas/tests/indexes/datetimes/test_indexing.py @@ -6,7 +6,7 @@ from pandas.errors import InvalidIndexError import pandas as pd -from pandas import DatetimeIndex, Index, Timestamp, date_range, notna +from pandas import DatetimeIndex, Index, Timestamp, bdate_range, date_range, notna import pandas._testing as tm from pandas.tseries.offsets import BDay, CDay @@ -665,3 +665,43 @@ def test_get_value(self): with tm.assert_produces_warning(FutureWarning): result = dti.get_value(ser, key.to_datetime64()) assert result == 7 + + +class TestGetSliceBounds: + @pytest.mark.parametrize("box", [date, datetime, Timestamp]) + @pytest.mark.parametrize("kind", ["getitem", "loc", None]) + @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)]) + def test_get_slice_bounds_datetime_within( + self, box, 
kind, side, expected, tz_aware_fixture + ): + # GH 35690 + index = bdate_range("2000-01-03", "2000-02-11").tz_localize(tz_aware_fixture) + result = index.get_slice_bound( + box(year=2000, month=1, day=7), kind=kind, side=side + ) + assert result == expected + + @pytest.mark.parametrize("box", [date, datetime, Timestamp]) + @pytest.mark.parametrize("kind", ["getitem", "loc", None]) + @pytest.mark.parametrize("side", ["left", "right"]) + @pytest.mark.parametrize("year, expected", [(1999, 0), (2020, 30)]) + def test_get_slice_bounds_datetime_outside( + self, box, kind, side, year, expected, tz_aware_fixture + ): + # GH 35690 + index = bdate_range("2000-01-03", "2000-02-11").tz_localize(tz_aware_fixture) + result = index.get_slice_bound( + box(year=year, month=1, day=7), kind=kind, side=side + ) + assert result == expected + + @pytest.mark.parametrize("box", [date, datetime, Timestamp]) + @pytest.mark.parametrize("kind", ["getitem", "loc", None]) + def test_slice_datetime_locs(self, box, kind, tz_aware_fixture): + # GH 34077 + index = DatetimeIndex(["2010-01-01", "2010-01-03"]).tz_localize( + tz_aware_fixture + ) + result = index.slice_locs(box(2010, 1, 1), box(2010, 1, 2)) + expected = (0, 1) + assert result == expected diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index e6f455e60eee3..1ffdbbc9afd3f 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -679,3 +679,22 @@ def test_float64_index_difference(): result = string_index.difference(float_index) tm.assert_index_equal(result, string_index) + + +class TestGetSliceBounds: + @pytest.mark.parametrize("kind", ["getitem", "loc", None]) + @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)]) + def test_get_slice_bounds_within(self, kind, side, expected): + index = Index(range(6)) + result = index.get_slice_bound(4, kind=kind, side=side) + assert result == expected + + @pytest.mark.parametrize("kind", ["getitem", "loc", 
None]) + @pytest.mark.parametrize("side", ["left", "right"]) + @pytest.mark.parametrize( + "bound, expected", [(-1, 0), (10, 6)], + ) + def test_get_slice_bounds_outside(self, kind, side, expected, bound): + index = Index(range(6)) + result = index.get_slice_bound(bound, kind=kind, side=side) + assert result == expected From 1a460150cb5e799d5e9dc0a323d25091804a7dd1 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 2 Sep 2020 19:56:33 -0700 Subject: [PATCH 0635/1025] REF: use BlockManager.apply for cython_agg_blocks, apply_blockwise (#35900) --- pandas/core/groupby/generic.py | 21 +++++---------------- pandas/core/internals/managers.py | 30 ++++++++++++++++++++++++------ pandas/core/window/rolling.py | 21 ++++----------------- 3 files changed, 33 insertions(+), 39 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a92e3af0764a7..537feace59fcb 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1035,8 +1035,6 @@ def _cython_agg_blocks( if numeric_only: data = data.get_numeric_data(copy=False) - agg_blocks: List["Block"] = [] - no_result = object() def cast_agg_result(result, values: ArrayLike, how: str) -> ArrayLike: @@ -1118,23 +1116,14 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: res_values = cast_agg_result(result, bvalues, how) return res_values - for i, block in enumerate(data.blocks): - try: - nbs = block.apply(blk_func) - except (NotImplementedError, TypeError): - # TypeError -> we may have an exception in trying to aggregate - # continue and exclude the block - # NotImplementedError -> "ohlc" with wrong dtype - pass - else: - agg_blocks.extend(nbs) + # TypeError -> we may have an exception in trying to aggregate + # continue and exclude the block + # NotImplementedError -> "ohlc" with wrong dtype + new_mgr = data.apply(blk_func, ignore_failures=True) - if not agg_blocks: + if not len(new_mgr): raise DataError("No numeric types to aggregate") - # reset the locs in the 
blocks to correspond to our - # current ordering - new_mgr = data._combine(agg_blocks) return new_mgr def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 389252e7ef0f2..2e3098d94afcb 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -350,7 +350,13 @@ def operate_blockwise(self, other: "BlockManager", array_op) -> "BlockManager": """ return operate_blockwise(self, other, array_op) - def apply(self: T, f, align_keys=None, **kwargs) -> T: + def apply( + self: T, + f, + align_keys: Optional[List[str]] = None, + ignore_failures: bool = False, + **kwargs, + ) -> T: """ Iterate over the blocks, collect and create a new BlockManager. @@ -358,6 +364,10 @@ def apply(self: T, f, align_keys=None, **kwargs) -> T: ---------- f : str or callable Name of the Block method to apply. + align_keys: List[str] or None, default None + ignore_failures: bool, default False + **kwargs + Keywords to pass to `f` Returns ------- @@ -387,12 +397,20 @@ def apply(self: T, f, align_keys=None, **kwargs) -> T: # otherwise we have an ndarray kwargs[k] = obj[b.mgr_locs.indexer] - if callable(f): - applied = b.apply(f, **kwargs) - else: - applied = getattr(b, f)(**kwargs) + try: + if callable(f): + applied = b.apply(f, **kwargs) + else: + applied = getattr(b, f)(**kwargs) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue result_blocks = _extend_blocks(applied, result_blocks) + if ignore_failures: + return self._combine(result_blocks) + if len(result_blocks) == 0: return self.make_empty(self.axes) @@ -704,7 +722,7 @@ def get_numeric_data(self, copy: bool = False) -> "BlockManager": self._consolidate_inplace() return self._combine([b for b in self.blocks if b.is_numeric], copy) - def _combine(self, blocks: List[Block], copy: bool = True) -> "BlockManager": + def _combine(self: T, blocks: List[Block], copy: bool = True) -> T: """ 
return a new manager with the blocks """ if len(blocks) == 0: return self.make_empty() diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index a3f60c0bc5098..558c0eeb0ea65 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -489,8 +489,6 @@ def _apply_blockwise( if self._selected_obj.ndim == 1: return self._apply_series(homogeneous_func) - # This isn't quite blockwise, since `blocks` is actually a collection - # of homogenenous DataFrames. _, obj = self._create_blocks(self._selected_obj) mgr = obj._mgr @@ -500,25 +498,14 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: res_values = homogeneous_func(values) return getattr(res_values, "T", res_values) - skipped: List[int] = [] - res_blocks: List["Block"] = [] - for i, blk in enumerate(mgr.blocks): - try: - nbs = blk.apply(hfunc) - - except (TypeError, NotImplementedError): - skipped.append(i) - continue - - res_blocks.extend(nbs) + new_mgr = mgr.apply(hfunc, ignore_failures=True) + out = obj._constructor(new_mgr) - if not len(res_blocks) and skipped: + if out.shape[1] == 0 and obj.shape[1] > 0: raise DataError("No numeric types to aggregate") - elif not len(res_blocks): + elif out.shape[1] == 0: return obj.astype("float64") - new_mgr = mgr._combine(res_blocks) - out = obj._constructor(new_mgr) self._insert_on_column(out, obj) return out From ec9699985ca072d73439a9ee172d08c3e11dc436 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 2 Sep 2020 22:00:00 -0500 Subject: [PATCH 0636/1025] Optionally disallow duplicate labels (#28394) --- doc/source/reference/frame.rst | 16 + .../reference/general_utility_functions.rst | 1 + doc/source/reference/series.rst | 15 + doc/source/user_guide/duplicates.rst | 210 ++++++++ doc/source/user_guide/index.rst | 1 + doc/source/whatsnew/v1.2.0.rst | 49 ++ pandas/__init__.py | 1 + pandas/_testing.py | 15 + pandas/core/api.py | 1 + pandas/core/flags.py | 113 +++++ pandas/core/frame.py | 11 +- pandas/core/generic.py | 130 ++++- 
pandas/core/indexes/base.py | 48 +- pandas/core/series.py | 10 +- pandas/errors/__init__.py | 21 + pandas/tests/api/test_api.py | 1 + pandas/tests/base/test_misc.py | 3 +- pandas/tests/frame/test_api.py | 27 ++ pandas/tests/generic/test_duplicate_labels.py | 450 ++++++++++++++++++ pandas/tests/generic/test_generic.py | 10 + pandas/tests/series/test_api.py | 26 + pandas/tests/test_flags.py | 48 ++ pandas/tests/util/test_assert_frame_equal.py | 15 + pandas/tests/util/test_assert_series_equal.py | 15 + 24 files changed, 1227 insertions(+), 10 deletions(-) create mode 100644 doc/source/user_guide/duplicates.rst create mode 100644 pandas/core/flags.py create mode 100644 pandas/tests/generic/test_duplicate_labels.py create mode 100644 pandas/tests/test_flags.py diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst index 4d9d18e3d204e..9a1ebc8d670dc 100644 --- a/doc/source/reference/frame.rst +++ b/doc/source/reference/frame.rst @@ -37,6 +37,7 @@ Attributes and underlying data DataFrame.shape DataFrame.memory_usage DataFrame.empty + DataFrame.set_flags Conversion ~~~~~~~~~~ @@ -276,6 +277,21 @@ Time Series-related DataFrame.tz_convert DataFrame.tz_localize +.. _api.frame.flags: + +Flags +~~~~~ + +Flags refer to attributes of the pandas object. Properties of the dataset (like +the date it was recorded, the URL it was accessed from, etc.) should be stored +in :attr:`DataFrame.attrs`. + +.. autosummary:: + :toctree: api/ + + Flags + + ..
_api.frame.metadata: Metadata diff --git a/doc/source/reference/general_utility_functions.rst b/doc/source/reference/general_utility_functions.rst index c1759110b94ad..3cba0a81a7011 100644 --- a/doc/source/reference/general_utility_functions.rst +++ b/doc/source/reference/general_utility_functions.rst @@ -37,6 +37,7 @@ Exceptions and warnings errors.AccessorRegistrationWarning errors.DtypeWarning + errors.DuplicateLabelError errors.EmptyDataError errors.InvalidIndexError errors.MergeError diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index ae3e121ca8212..5131d35334693 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -39,6 +39,8 @@ Attributes Series.empty Series.dtypes Series.name + Series.flags + Series.set_flags Conversion ---------- @@ -527,6 +529,19 @@ Sparse-dtype specific methods and attributes are provided under the Series.sparse.from_coo Series.sparse.to_coo +.. _api.series.flags: + +Flags +~~~~~ + +Flags refer to attributes of the pandas object. Properties of the dataset (like +the date it was recorded, the URL it was accessed from, etc.) should be stored +in :attr:`Series.attrs`. + +.. autosummary:: + :toctree: api/ + + Flags .. _api.series.metadata: diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst new file mode 100644 index 0000000000000..b65822fab2b23 --- /dev/null +++ b/doc/source/user_guide/duplicates.rst @@ -0,0 +1,210 @@ +.. _duplicates: + +**************** +Duplicate Labels +**************** + +:class:`Index` objects are not required to be unique; you can have duplicate row +or column labels. This may be a bit confusing at first. If you're familiar with +SQL, you know that row labels are similar to a primary key on a table, and you +would never want duplicates in a SQL table. But one of pandas' roles is to clean +messy, real-world data before it goes to some downstream system.
And real-world +data has duplicates, even in fields that are supposed to be unique. + +This section describes how duplicate labels change the behavior of certain +operations, and how to prevent duplicates from arising during operations, or to +detect them if they do. + +.. ipython:: python + + import pandas as pd + import numpy as np + +Consequences of Duplicate Labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some pandas methods (:meth:`Series.reindex` for example) just don't work with +duplicates present. The output can't be determined, and so pandas raises. + +.. ipython:: python + :okexcept: + + s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b']) + s1.reindex(['a', 'b', 'c']) + +Other methods, like indexing, can give very surprising results. Typically +indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame`` +with a scalar will return a ``Series``. Slicing a ``Series`` with a scalar will +return a scalar. But with duplicates, this isn't the case. + +.. ipython:: python + + df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'A', 'B']) + df1 + +We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series`` + +.. ipython:: python + + df1['B'] # a series + +But slicing ``'A'`` returns a ``DataFrame`` + + +.. ipython:: python + + df1['A'] # a DataFrame + +This applies to row labels as well + +.. ipython:: python + + df2 = pd.DataFrame({"A": [0, 1, 2]}, index=['a', 'a', 'b']) + df2 + df2.loc['b', 'A'] # a scalar + df2.loc['a', 'A'] # a Series + +Duplicate Label Detection +~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can check whether an :class:`Index` (storing the row or column labels) is +unique with :attr:`Index.is_unique`: + +.. ipython:: python + + df2 + df2.index.is_unique + df2.columns.is_unique + +.. note:: + + Checking whether an index is unique is somewhat expensive for large datasets. + Pandas does cache this result, so re-checking on the same index is very fast.
+ +:meth:`Index.duplicated` will return a boolean ndarray indicating whether a +label is repeated. + +.. ipython:: python + + df2.index.duplicated() + +Which can be used as a boolean filter to drop duplicate rows. + +.. ipython:: python + + df2.loc[~df2.index.duplicated(), :] + +If you need additional logic to handle duplicate labels, rather than just +dropping the repeats, using :meth:`~DataFrame.groupby` on the index is a common +trick. For example, we'll resolve duplicates by taking the average of all rows +with the same label. + +.. ipython:: python + + df2.groupby(level=0).mean() + +.. _duplicates.disallow: + +Disallowing Duplicate Labels +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. versionadded:: 1.2.0 + +As noted above, handling duplicates is an important feature when reading in raw +data. That said, you may want to avoid introducing duplicates as part of a data +processing pipeline (from methods like :meth:`pandas.concat`, +:meth:`~DataFrame.rename`, etc.). Both :class:`Series` and :class:`DataFrame` +*disallow* duplicate labels by calling ``.set_flags(allows_duplicate_labels=False)``. +(the default is to allow them). If there are duplicate labels, an exception +will be raised. + +.. ipython:: python + :okexcept: + + pd.Series( + [0, 1, 2], + index=['a', 'b', 'b'] + ).set_flags(allows_duplicate_labels=False) + +This applies to both row and column labels for a :class:`DataFrame` + +.. ipython:: python + :okexcept: + + pd.DataFrame( + [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"], + ).set_flags(allows_duplicate_labels=False) + +This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`, +which indicates whether that object can have duplicate labels. + +.. 
ipython:: python + + df = ( + pd.DataFrame({"A": [0, 1, 2, 3]}, + index=['x', 'y', 'X', 'Y']) + .set_flags(allows_duplicate_labels=False) + ) + df + df.flags.allows_duplicate_labels + +:meth:`DataFrame.set_flags` can be used to return a new ``DataFrame`` with attributes +like ``allows_duplicate_labels`` set to some value + +.. ipython:: python + + df2 = df.set_flags(allows_duplicate_labels=True) + df2.flags.allows_duplicate_labels + +The new ``DataFrame`` returned is a view on the same data as the old ``DataFrame``. +Or the property can just be set directly on the same object + + +.. ipython:: python + + df2.flags.allows_duplicate_labels = False + df2.flags.allows_duplicate_labels + +When processing raw, messy data you might initially read in the messy data +(which potentially has duplicate labels), deduplicate, and then disallow duplicates +going forward, to ensure that your data pipeline doesn't introduce duplicates. + + +.. code-block:: python + + >>> raw = pd.read_csv("...") + >>> deduplicated = raw.groupby(level=0).first() # remove duplicates + >>> deduplicated.flags.allows_duplicate_labels = False # disallow going forward + +Setting ``allows_duplicate_labels=False`` on a ``Series`` or ``DataFrame`` with duplicate +labels or performing an operation that introduces duplicate labels on a ``Series`` or +``DataFrame`` that disallows duplicates will raise an +:class:`errors.DuplicateLabelError`. + +.. ipython:: python + :okexcept: + + df.rename(str.upper) + +This error message contains the labels that are duplicated, and the numeric positions +of all the duplicates (including the "original") in the ``Series`` or ``DataFrame`` + +Duplicate Label Propagation +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In general, disallowing duplicates is "sticky". It's preserved through +operations. + +.. ipython:: python + :okexcept: + + s1 = pd.Series(0, index=['a', 'b']).set_flags(allows_duplicate_labels=False) + s1 + s1.head().rename({"a": "b"}) + +..
warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 8226e72779588..2fc9e066e6712 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -33,6 +33,7 @@ Further information on any specific method can be obtained in the reshaping text missing_data + duplicates categorical integer_na boolean diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9c8ee10a8a0af..7c083b95b21f3 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,6 +13,53 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +.. _whatsnew_120.duplicate_labels: + +Optionally disallow duplicate labels +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`Series` and :class:`DataFrame` can now be created with ``allows_duplicate_labels=False`` flag to +control whether the index or columns can contain duplicate labels (:issue:`28394`). This can be used to +prevent accidental introduction of duplicate labels, which can affect downstream operations. + +By default, duplicates continue to be allowed + +.. ipython:: python + + pd.Series([1, 2], index=['a', 'a']) + +.. ipython:: python + :okexcept: + + pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False) + +Pandas will propagate the ``allows_duplicate_labels`` property through many operations. + +.. ipython:: python + :okexcept: + + a = ( + pd.Series([1, 2], index=['a', 'b']) + .set_flags(allows_duplicate_labels=False) + ) + a + # An operation introducing duplicates + a.reindex(['a', 'b', 'a']) + +.. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. 
In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. + +See :ref:`duplicates` for more. + +The ``allows_duplicate_labels`` flag is stored in the new :attr:`DataFrame.flags` +attribute. This stores global attributes that apply to the *pandas object*. This +differs from :attr:`DataFrame.attrs`, which stores information that applies to +the dataset. + Passing arguments to fsspec backends ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -53,6 +100,8 @@ For example: Other enhancements ^^^^^^^^^^^^^^^^^^ + +- Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - - diff --git a/pandas/__init__.py b/pandas/__init__.py index 36576da74c75d..2737bcd8f9ccf 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -100,6 +100,7 @@ to_datetime, to_timedelta, # misc + Flags, Grouper, factorize, unique, diff --git a/pandas/_testing.py b/pandas/_testing.py index b402b040d9268..04d36749a3d8c 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1225,6 +1225,7 @@ def assert_series_equal( check_categorical=True, check_category_order=True, check_freq=True, + check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="Series", @@ -1271,6 +1272,11 @@ def assert_series_equal( .. versionadded:: 1.0.2 check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + check_flags : bool, default True + Whether to check the `flags` attribute. + + .. versionadded:: 1.2.0 + rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. 
@@ -1307,6 +1313,9 @@ def assert_series_equal( msg2 = f"{len(right)}, {right.index}" raise_assert_detail(obj, "Series length are different", msg1, msg2) + if check_flags: + assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + # index comparison assert_index_equal( left.index, @@ -1429,6 +1438,7 @@ def assert_frame_equal( check_categorical=True, check_like=False, check_freq=True, + check_flags=True, rtol=1.0e-5, atol=1.0e-8, obj="DataFrame", @@ -1490,6 +1500,8 @@ def assert_frame_equal( (same as in columns) - same labels must be with the same data. check_freq : bool, default True Whether to check the `freq` attribute on a DatetimeIndex or TimedeltaIndex. + check_flags : bool, default True + Whether to check the `flags` attribute. rtol : float, default 1e-5 Relative tolerance. Only used when check_exact is False. @@ -1563,6 +1575,9 @@ def assert_frame_equal( if check_like: left, right = left.reindex_like(right), right + if check_flags: + assert left.flags == right.flags, f"{repr(left.flags)} != {repr(right.flags)}" + # index comparison assert_index_equal( left.index, diff --git a/pandas/core/api.py b/pandas/core/api.py index b0b65f9d0be34..348e9206d6e19 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -26,6 +26,7 @@ ) from pandas.core.arrays.string_ import StringDtype from pandas.core.construction import array +from pandas.core.flags import Flags from pandas.core.groupby import Grouper, NamedAgg from pandas.core.indexes.api import ( CategoricalIndex, diff --git a/pandas/core/flags.py b/pandas/core/flags.py new file mode 100644 index 0000000000000..15966d8ddce2a --- /dev/null +++ b/pandas/core/flags.py @@ -0,0 +1,113 @@ +import weakref + + +class Flags: + """ + Flags that apply to pandas objects. + + .. versionadded:: 1.2.0 + + Parameters + ---------- + obj : Series or DataFrame + The object these flags are associated with + allows_duplicate_labels : bool, default True + Whether to allow duplicate labels in this object. 
By default, + duplicate labels are permitted. Setting this to ``False`` will + cause an :class:`errors.DuplicateLabelError` to be raised when + `index` (or columns for DataFrame) is not unique, or any + subsequent operation on it introduces duplicates. + See :ref:`duplicates.disallow` for more. + + .. warning:: + + This is an experimental feature. Currently, many methods fail to + propagate the ``allows_duplicate_labels`` value. In future versions + it is expected that every method taking or returning one or more + DataFrame or Series objects will propagate ``allows_duplicate_labels``. + + Notes + ----- + Attributes can be set in two ways + + >>> df = pd.DataFrame() + >>> df.flags + + >>> df.flags.allows_duplicate_labels = False + >>> df.flags + + + >>> df.flags['allows_duplicate_labels'] = True + >>> df.flags + + """ + + _keys = {"allows_duplicate_labels"} + + def __init__(self, obj, *, allows_duplicate_labels): + self._allows_duplicate_labels = allows_duplicate_labels + self._obj = weakref.ref(obj) + + @property + def allows_duplicate_labels(self) -> bool: + """ + Whether this object allows duplicate labels. + + Setting ``allows_duplicate_labels=False`` ensures that the + index (and columns of a DataFrame) are unique. Most methods + that accept and return a Series or DataFrame will propagate + the value of ``allows_duplicate_labels``. + + See :ref:`duplicates` for more. + + See Also + -------- + DataFrame.attrs : Set global metadata on this object. + DataFrame.set_flags : Set global flags on this object. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}, index=['a', 'a']) + >>> df.flags.allows_duplicate_labels + True + >>> df.flags.allows_duplicate_labels = False + Traceback (most recent call last): + ... + pandas.errors.DuplicateLabelError: Index has duplicates.
+ positions + label + a [0, 1] + """ + return self._allows_duplicate_labels + + @allows_duplicate_labels.setter + def allows_duplicate_labels(self, value: bool): + value = bool(value) + obj = self._obj() + if obj is None: + raise ValueError("This flag's object has been deleted.") + + if not value: + for ax in obj.axes: + ax._maybe_check_unique() + + self._allows_duplicate_labels = value + + def __getitem__(self, key): + if key not in self._keys: + raise KeyError(key) + + return getattr(self, key) + + def __setitem__(self, key, value): + if key not in self._keys: + raise ValueError(f"Unknown flag {key}. Must be one of {self._keys}") + setattr(self, key, value) + + def __repr__(self): + return f"" + + def __eq__(self, other): + if isinstance(other, type(self)): + return self.allows_duplicate_labels == other.allows_duplicate_labels + return False diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7832547685567..b4c12b9e52f56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -458,7 +458,9 @@ def __init__( if isinstance(data, BlockManager): if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath - NDFrame.__init__(self, data) + NDFrame.__init__( + self, data, + ) return mgr = self._init_mgr( @@ -3659,6 +3661,11 @@ def insert(self, loc, column, value, allow_duplicates=False) -> None: value : int, Series, or array-like allow_duplicates : bool, optional """ + if allow_duplicates and not self.flags.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'allow_duplicates=True' when " + "'self.flags.allows_duplicate_labels' is False." 
+ ) self._ensure_valid_index(value) value = self._sanitize_column(column, value, broadcast=False) self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates) @@ -4559,6 +4566,7 @@ def set_index( 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list): keys = [keys] @@ -4804,6 +4812,7 @@ class max type monkey mammal NaN jump """ inplace = validate_bool_kwarg(inplace, "inplace") + self._check_inplace_and_allows_duplicate_labels(inplace) if inplace: new_obj = self else: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fd924c964c1e1..c9eb4a34683f8 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -94,6 +94,7 @@ from pandas.core.base import PandasObject, SelectionMixin import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype +from pandas.core.flags import Flags from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import Period, PeriodIndex @@ -188,6 +189,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): "_metadata", "__array_struct__", "__array_interface__", + "_flags", ] _internal_names_set: Set[str] = set(_internal_names) _accessors: Set[str] = set() @@ -217,6 +219,7 @@ def __init__( else: attrs = dict(attrs) object.__setattr__(self, "_attrs", attrs) + object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True)) @classmethod def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: @@ -237,15 +240,20 @@ def _init_mgr(cls, mgr, axes, dtype=None, copy: bool = False) -> BlockManager: return mgr # ---------------------------------------------------------------------- + # attrs and flags @property def attrs(self) -> Dict[Optional[Hashable], Any]: """ - Dictionary of global attributes on this object. 
+ Dictionary of global attributes of this dataset. .. warning:: attrs is experimental and may change without warning. + + See Also + -------- + DataFrame.flags """ if self._attrs is None: self._attrs = {} @@ -255,6 +263,96 @@ def attrs(self) -> Dict[Optional[Hashable], Any]: def attrs(self, value: Mapping[Optional[Hashable], Any]) -> None: self._attrs = dict(value) + @property + def flags(self) -> Flags: + """ + Get the properties associated with this pandas object. + + The available flags are + + * :attr:`Flags.allows_duplicate_labels` + + See Also + -------- + Flags + DataFrame.attrs + + Notes + ----- + "Flags" differ from "metadata". Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.flags + + + Flags can be get or set using ``.`` + + >>> df.flags.allows_duplicate_labels + True + >>> df.flags.allows_duplicate_labels = False + + Or by slicing with a key + + >>> df.flags["allows_duplicate_labels"] + False + >>> df.flags["allows_duplicate_labels"] = True + """ + return self._flags + + def set_flags( + self: FrameOrSeries, + *, + copy: bool = False, + allows_duplicate_labels: Optional[bool] = None, + ) -> FrameOrSeries: + """ + Return a new object with updated flags. + + Parameters + ---------- + allows_duplicate_labels : bool, optional + Whether the returned object allows duplicate labels. + + Returns + ------- + Series or DataFrame + The same type as the caller. + + See Also + -------- + DataFrame.attrs : Global metadata applying to this dataset. + DataFrame.flags : Global flags applying to this object. + + Notes + ----- + This method returns a new object that's a view on the same data + as the input. Mutating the input or the output values will be reflected + in the other. + + This method is intended to be used in method chains. + + "Flags" differ from "metadata". 
Flags reflect properties of the + pandas object (the Series or DataFrame). Metadata refer to properties + of the dataset, and should be stored in :attr:`DataFrame.attrs`. + + Examples + -------- + >>> df = pd.DataFrame({"A": [1, 2]}) + >>> df.flags.allows_duplicate_labels + True + >>> df2 = df.set_flags(allows_duplicate_labels=False) + >>> df2.flags.allows_duplicate_labels + False + """ + df = self.copy(deep=copy) + if allows_duplicate_labels is not None: + df.flags["allows_duplicate_labels"] = allows_duplicate_labels + return df + @classmethod def _validate_dtype(cls, dtype): """ validate the passed dtype """ @@ -557,6 +655,11 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): -------- %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s. """ + self._check_inplace_and_allows_duplicate_labels(inplace) + return self._set_axis_nocheck(labels, axis, inplace) + + def _set_axis_nocheck(self, labels, axis: Axis, inplace: bool): + # NDFrame.rename with inplace=False calls set_axis(inplace=True) on a copy. 
if inplace: setattr(self, self._get_axis_name(axis), labels) else: @@ -926,6 +1029,7 @@ def rename( else: index = mapper + self._check_inplace_and_allows_duplicate_labels(inplace) result = self if inplace else self.copy(deep=copy) for axis_no, replacements in enumerate((index, columns)): @@ -950,7 +1054,7 @@ def rename( raise KeyError(f"{missing_labels} not found in axis") new_index = ax._transform_index(f, level) - result.set_axis(new_index, axis=axis_no, inplace=True) + result._set_axis_nocheck(new_index, axis=axis_no, inplace=True) result._clear_item_cache() if inplace: @@ -1828,11 +1932,11 @@ def __getstate__(self) -> Dict[str, Any]: _typ=self._typ, _metadata=self._metadata, attrs=self.attrs, + _flags={k: self.flags[k] for k in self.flags._keys}, **meta, ) def __setstate__(self, state): - if isinstance(state, BlockManager): self._mgr = state elif isinstance(state, dict): @@ -1843,6 +1947,8 @@ def __setstate__(self, state): if typ is not None: attrs = state.get("_attrs", {}) object.__setattr__(self, "_attrs", attrs) + flags = state.get("_flags", dict(allows_duplicate_labels=True)) + object.__setattr__(self, "_flags", Flags(self, **flags)) # set in the order of internal names # to avoid definitional recursion @@ -1850,7 +1956,7 @@ def __setstate__(self, state): # defined meta = set(self._internal_names + self._metadata) for k in list(meta): - if k in state: + if k in state and k != "_flags": v = state[k] object.__setattr__(self, k, v) @@ -3802,6 +3908,13 @@ def __delitem__(self, key) -> None: # ---------------------------------------------------------------------- # Unsorted + def _check_inplace_and_allows_duplicate_labels(self, inplace): + if inplace and not self.flags.allows_duplicate_labels: + raise ValueError( + "Cannot specify 'inplace=True' when " + "'self.flags.allows_duplicate_labels' is False." + ) + def get(self, key, default=None): """ Get item from object for given key (ex: DataFrame column). 
@@ -5163,10 +5276,19 @@ def __finalize__( if isinstance(other, NDFrame): for name in other.attrs: self.attrs[name] = other.attrs[name] + + self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels # For subclasses using _metadata. for name in self._metadata: assert isinstance(name, str) object.__setattr__(self, name, getattr(other, name, None)) + + if method == "concat": + allows_duplicate_labels = all( + x.flags.allows_duplicate_labels for x in other.objs + ) + self.flags.allows_duplicate_labels = allows_duplicate_labels + return self def __getattr__(self, name: str): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 48b02fc525cc1..65b5dfb6df911 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,7 +27,7 @@ from pandas._typing import AnyArrayLike, Dtype, DtypeObj, Label from pandas.compat import set_function_name from pandas.compat.numpy import function as nv -from pandas.errors import InvalidIndexError +from pandas.errors import DuplicateLabelError, InvalidIndexError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc from pandas.core.dtypes import concat as _concat @@ -488,6 +488,52 @@ def _simple_new(cls, values, name: Label = None): def _constructor(self): return type(self) + def _maybe_check_unique(self): + """ + Check that an Index has no duplicates. + + This is typically only called via + `NDFrame.flags.allows_duplicate_labels.setter` when it's set to + True (duplicates aren't allowed). + + Raises + ------ + DuplicateLabelError + When the index is not unique. + """ + if not self.is_unique: + msg = """Index has duplicates.""" + duplicates = self._format_duplicate_message() + msg += "\n{}".format(duplicates) + + raise DuplicateLabelError(msg) + + def _format_duplicate_message(self): + """ + Construct the DataFrame for a DuplicateLabelError. + + This returns a DataFrame indicating the labels and positions + of duplicates in an index. 
This should only be called when it's + already known that duplicates are present. + + Examples + -------- + >>> idx = pd.Index(['a', 'b', 'a']) + >>> idx._format_duplicate_message() + positions + label + a [0, 2] + """ + from pandas import Series + + duplicates = self[self.duplicated(keep="first")].unique() + assert len(duplicates) + + out = Series(np.arange(len(self))).groupby(self).agg(list)[duplicates] + if self.nlevels == 1: + out = out.rename_axis("label") + return out.to_frame(name="positions") + # -------------------------------------------------------------------- # Index Internals Methods diff --git a/pandas/core/series.py b/pandas/core/series.py index a8a2d300fa168..9d84ce4b9ab2e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -201,7 +201,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # Constructors def __init__( - self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False + self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False, ): if ( @@ -211,7 +211,9 @@ def __init__( and copy is False ): # GH#33357 called with just the SingleBlockManager - NDFrame.__init__(self, data) + NDFrame.__init__( + self, data, + ) self.name = name return @@ -330,7 +332,9 @@ def __init__( data = SingleBlockManager.from_array(data, index) - generic.NDFrame.__init__(self, data) + generic.NDFrame.__init__( + self, data, + ) self.name = name self._set_axis(0, index, fastpath=True) diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 6ac3004d29996..15389ca2c3e61 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -202,6 +202,27 @@ class NumbaUtilError(Exception): """ +class DuplicateLabelError(ValueError): + """ + Error raised when an operation would introduce duplicate labels. + + .. versionadded:: 1.2.0 + + Examples + -------- + >>> s = pd.Series([0, 1, 2], index=['a', 'b', 'c']).set_flags( + ... allows_duplicate_labels=False + ... 
) + >>> s.reindex(['a', 'a', 'b']) + Traceback (most recent call last): + ... + DuplicateLabelError: Index has duplicates. + positions + label + a [0, 1] + """ + + class InvalidIndexError(Exception): """ Exception raised when attemping to use an invalid index key. diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 1d25336cd3b70..54da13c3c620b 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -61,6 +61,7 @@ class TestPDApi(Base): "ExcelFile", "ExcelWriter", "Float64Index", + "Flags", "Grouper", "HDFStore", "Index", diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 78a830c7f43d8..9523fba953ad0 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -99,7 +99,7 @@ def test_ndarray_compat_properties(index_or_series_obj): assert getattr(obj, p, None) is not None # deprecated properties - for p in ["flags", "strides", "itemsize", "base", "data"]: + for p in ["strides", "itemsize", "base", "data"]: assert not hasattr(obj, p) msg = "can only convert an array of size 1 to a Python scalar" @@ -116,6 +116,7 @@ def test_ndarray_compat_properties(index_or_series_obj): @pytest.mark.skipif(PYPY, reason="not relevant for PyPy") def test_memory_usage(index_or_series_obj): obj = index_or_series_obj + res = obj.memory_usage() res_deep = obj.memory_usage(deep=True) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index 0716cf5e27119..b1c31a6f90133 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -553,6 +553,33 @@ def test_attrs(self): result = df.rename(columns=str) assert result.attrs == {"version": 1} + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) + def test_set_flags(self, allows_duplicate_labels): + df = pd.DataFrame({"A": [1, 2]}) + result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) + if allows_duplicate_labels is None: + # We don't update when it's not 
provided + assert result.flags.allows_duplicate_labels is True + else: + assert result.flags.allows_duplicate_labels is allows_duplicate_labels + + # We made a copy + assert df is not result + + # We didn't mutate df + assert df.flags.allows_duplicate_labels is True + + # But we didn't copy data + result.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + + # Now we do copy. + result = df.set_flags( + copy=True, allows_duplicate_labels=allows_duplicate_labels + ) + result.iloc[0, 0] = 10 + assert df.iloc[0, 0] == 0 + def test_cache_on_copy(self): # GH 31784 _item_cache not cleared on copy causes incorrect reads after updates df = DataFrame({"a": [1]}) diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py new file mode 100644 index 0000000000000..97468e1f10a8b --- /dev/null +++ b/pandas/tests/generic/test_duplicate_labels.py @@ -0,0 +1,450 @@ +"""Tests dealing with the NDFrame.allows_duplicates.""" +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + +not_implemented = pytest.mark.xfail(reason="Not implemented.") + +# ---------------------------------------------------------------------------- +# Preservation + + +class TestPreserves: + @pytest.mark.parametrize( + "cls, data", + [ + (pd.Series, np.array([])), + (pd.Series, [1, 2]), + (pd.DataFrame, {}), + (pd.DataFrame, {"A": [1, 2]}), + ], + ) + def test_construction_ok(self, cls, data): + result = cls(data) + assert result.flags.allows_duplicate_labels is True + + result = cls(data).set_flags(allows_duplicate_labels=False) + assert result.flags.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "func", + [ + operator.itemgetter(["a"]), + operator.methodcaller("add", 1), + operator.methodcaller("rename", str.upper), + operator.methodcaller("rename", "name"), + pytest.param(operator.methodcaller("abs"), marks=not_implemented), + # TODO: test np.abs + ], + ) + def test_preserved_series(self, func): + s 
= pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + assert func(s).flags.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])] + ) + # TODO: frame + @not_implemented + def test_align(self, other): + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + a, b = s.align(other) + assert a.flags.allows_duplicate_labels is False + assert b.flags.allows_duplicate_labels is False + + def test_preserved_frame(self): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ) + assert df.loc[["a"]].flags.allows_duplicate_labels is False + assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False + + @not_implemented + def test_to_frame(self): + s = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False) + assert s.to_frame().flags.allows_duplicate_labels is False + + @pytest.mark.parametrize("func", ["add", "sub"]) + @pytest.mark.parametrize( + "frame", [False, pytest.param(True, marks=not_implemented)] + ) + @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")]) + def test_binops(self, func, other, frame): + df = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ) + if frame: + df = df.to_frame() + if isinstance(other, pd.Series) and frame: + other = other.to_frame() + func = operator.methodcaller(func, other) + assert df.flags.allows_duplicate_labels is False + assert func(df).flags.allows_duplicate_labels is False + + @not_implemented + def test_preserve_getitem(self): + df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) + assert df[["A"]].flags.allows_duplicate_labels is False + assert df["A"].flags.allows_duplicate_labels is False + assert df.loc[0].flags.allows_duplicate_labels is False + assert df.loc[[0]].flags.allows_duplicate_labels is False + assert df.loc[0, 
["A"]].flags.allows_duplicate_labels is False + + @pytest.mark.xfail(reason="Unclear behavior.") + def test_ndframe_getitem_caching_issue(self): + # NDFrame.__getitem__ will cache the first df['A']. May need to + # invalidate that cache? Update the cached entries? + df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False) + assert df["A"].flags.allows_duplicate_labels is False + df.flags.allows_duplicate_labels = True + assert df["A"].flags.allows_duplicate_labels is True + + @pytest.mark.parametrize( + "objs, kwargs", + [ + # Series + ( + [ + pd.Series(1, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=["c", "d"]).set_flags( + allows_duplicate_labels=False + ), + ], + {}, + ), + ( + [ + pd.Series(1, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + ], + {"ignore_index": True}, + ), + ( + [ + pd.Series(1, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + ], + {"axis": 1}, + ), + # Frame + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"A": [1, 2]}, index=["c", "d"]).set_flags( + allows_duplicate_labels=False + ), + ], + {}, + ), + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + ], + {"ignore_index": True}, + ), + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"B": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + ], + {"axis": 1}, + ), + # Series / Frame + ( + [ + pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.Series([1, 2], index=["a", "b"], 
name="B",).set_flags( + allows_duplicate_labels=False, + ), + ], + {"axis": 1}, + ), + ], + ) + def test_concat(self, objs, kwargs): + result = pd.concat(objs, **kwargs) + assert result.flags.allows_duplicate_labels is False + + @pytest.mark.parametrize( + "left, right, kwargs, expected", + [ + # false false false + pytest.param( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags( + allows_duplicate_labels=False + ), + dict(left_index=True, right_index=True), + False, + marks=not_implemented, + ), + # false true false + pytest.param( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), + dict(left_index=True, right_index=True), + False, + marks=not_implemented, + ), + # true true true + ( + pd.DataFrame({"A": [0, 1]}, index=["a", "b"]), + pd.DataFrame({"B": [0, 1]}, index=["a", "d"]), + dict(left_index=True, right_index=True), + True, + ), + ], + ) + def test_merge(self, left, right, kwargs, expected): + result = pd.merge(left, right, **kwargs) + assert result.flags.allows_duplicate_labels is expected + + @not_implemented + def test_groupby(self): + # XXX: This is under tested + # TODO: + # - apply + # - transform + # - Should passing a grouper that disallows duplicates propagate? 
+ df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False) + result = df.groupby([0, 0, 1]).agg("count") + assert result.flags.allows_duplicate_labels is False + + @pytest.mark.parametrize("frame", [True, False]) + @not_implemented + def test_window(self, frame): + df = pd.Series( + 1, + index=pd.date_range("2000", periods=12), + name="A", + allows_duplicate_labels=False, + ) + if frame: + df = df.to_frame() + assert df.rolling(3).mean().flags.allows_duplicate_labels is False + assert df.ewm(3).mean().flags.allows_duplicate_labels is False + assert df.expanding(3).mean().flags.allows_duplicate_labels is False + + +# ---------------------------------------------------------------------------- +# Raises + + +class TestRaises: + @pytest.mark.parametrize( + "cls, axes", + [ + (pd.Series, {"index": ["a", "a"], "dtype": float}), + (pd.DataFrame, {"index": ["a", "a"]}), + (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}), + (pd.DataFrame, {"columns": ["b", "b"]}), + ], + ) + def test_set_flags_with_duplicates(self, cls, axes): + result = cls(**axes) + assert result.flags.allows_duplicate_labels is True + + with pytest.raises(pd.errors.DuplicateLabelError): + cls(**axes).set_flags(allows_duplicate_labels=False) + + @pytest.mark.parametrize( + "data", + [ + pd.Series(index=[0, 0], dtype=float), + pd.DataFrame(index=[0, 0]), + pd.DataFrame(columns=[0, 0]), + ], + ) + def test_setting_allows_duplicate_labels_raises(self, data): + with pytest.raises(pd.errors.DuplicateLabelError): + data.flags.allows_duplicate_labels = False + + assert data.flags.allows_duplicate_labels is True + + @pytest.mark.parametrize( + "func", [operator.methodcaller("append", pd.Series(0, index=["a", "b"]))] + ) + def test_series_raises(self, func): + s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False) + with pytest.raises(pd.errors.DuplicateLabelError): + func(s) + + @pytest.mark.parametrize( + "getter, target", + [ + 
(operator.itemgetter(["A", "A"]), None), + # loc + (operator.itemgetter(["a", "a"]), "loc"), + pytest.param( + operator.itemgetter(("a", ["A", "A"])), "loc", marks=not_implemented + ), + pytest.param( + operator.itemgetter((["a", "a"], "A")), "loc", marks=not_implemented + ), + # iloc + (operator.itemgetter([0, 0]), "iloc"), + pytest.param( + operator.itemgetter((0, [0, 0])), "iloc", marks=not_implemented + ), + pytest.param( + operator.itemgetter(([0, 0], 0)), "iloc", marks=not_implemented + ), + ], + ) + def test_getitem_raises(self, getter, target): + df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags( + allows_duplicate_labels=False + ) + if target: + # df, df.loc, or df.iloc + target = getattr(df, target) + else: + target = df + + with pytest.raises(pd.errors.DuplicateLabelError): + getter(target) + + @pytest.mark.parametrize( + "objs, kwargs", + [ + ( + [ + pd.Series(1, index=[0, 1], name="a").set_flags( + allows_duplicate_labels=False + ), + pd.Series(2, index=[0, 1], name="a").set_flags( + allows_duplicate_labels=False + ), + ], + {"axis": 1}, + ) + ], + ) + def test_concat_raises(self, objs, kwargs): + with pytest.raises(pd.errors.DuplicateLabelError): + pd.concat(objs, **kwargs) + + @not_implemented + def test_merge_raises(self): + a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags( + allows_duplicate_labels=False + ) + b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"]) + with pytest.raises(pd.errors.DuplicateLabelError): + pd.merge(a, b, left_index=True, right_index=True) + + +@pytest.mark.parametrize( + "idx", + [ + pd.Index([1, 1]), + pd.Index(["a", "a"]), + pd.Index([1.1, 1.1]), + pd.PeriodIndex([pd.Period("2000", "D")] * 2), + pd.DatetimeIndex([pd.Timestamp("2000")] * 2), + pd.TimedeltaIndex([pd.Timedelta("1D")] * 2), + pd.CategoricalIndex(["a", "a"]), + pd.IntervalIndex([pd.Interval(0, 1)] * 2), + pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]), + ], + ids=lambda x: type(x).__name__, +) +def 
test_raises_basic(idx): + with pytest.raises(pd.errors.DuplicateLabelError): + pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False) + + with pytest.raises(pd.errors.DuplicateLabelError): + pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False) + + with pytest.raises(pd.errors.DuplicateLabelError): + pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False) + + +def test_format_duplicate_labels_message(): + idx = pd.Index(["a", "b", "a", "b", "c"]) + result = idx._format_duplicate_message() + expected = pd.DataFrame( + {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label") + ) + tm.assert_frame_equal(result, expected) + + +def test_format_duplicate_labels_message_multi(): + idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]]) + result = idx._format_duplicate_message() + expected = pd.DataFrame( + {"positions": [[0, 2], [1, 3]]}, + index=pd.MultiIndex.from_product([["A"], ["a", "b"]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_dataframe_insert_raises(): + df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False) + with pytest.raises(ValueError, match="Cannot specify"): + df.insert(0, "A", [3, 4], allow_duplicates=True) + + +@pytest.mark.parametrize( + "method, frame_only", + [ + (operator.methodcaller("set_index", "A", inplace=True), True), + (operator.methodcaller("set_axis", ["A", "B"], inplace=True), False), + (operator.methodcaller("reset_index", inplace=True), True), + (operator.methodcaller("rename", lambda x: x, inplace=True), False), + ], +) +def test_inplace_raises(method, frame_only): + df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags( + allows_duplicate_labels=False + ) + s = df["A"] + s.flags.allows_duplicate_labels = False + msg = "Cannot specify" + + with pytest.raises(ValueError, match=msg): + method(df) + if not frame_only: + with pytest.raises(ValueError, match=msg): + method(s) + + +def test_pickle(): + a = 
pd.Series([1, 2]).set_flags(allows_duplicate_labels=False) + b = tm.round_trip_pickle(a) + tm.assert_series_equal(a, b) + + a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False) + b = tm.round_trip_pickle(a) + tm.assert_frame_equal(a, b) diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 5e66925a38ec6..23bb673586768 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -887,3 +887,13 @@ def test_axis_numbers_deprecated(self, box): obj = box(dtype=object) with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): obj._AXIS_NUMBERS + + @pytest.mark.parametrize("as_frame", [True, False]) + def test_flags_identity(self, as_frame): + s = pd.Series([1, 2]) + if as_frame: + s = s.to_frame() + + assert s.flags is s.flags + s2 = s.copy() + assert s2.flags is not s.flags diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index d81e8a4f82ffb..a69c0ee75eaba 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -524,6 +524,32 @@ def test_attrs(self): result = s + 1 assert result.attrs == {"version": 1} + @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) + def test_set_flags(self, allows_duplicate_labels): + df = pd.Series([1, 2]) + result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels) + if allows_duplicate_labels is None: + # We don't update when it's not provided + assert result.flags.allows_duplicate_labels is True + else: + assert result.flags.allows_duplicate_labels is allows_duplicate_labels + + # We made a copy + assert df is not result + # We didn't mutate df + assert df.flags.allows_duplicate_labels is True + + # But we didn't copy data + result.iloc[0] = 0 + assert df.iloc[0] == 0 + + # Now we do copy. 
+ result = df.set_flags( + copy=True, allows_duplicate_labels=allows_duplicate_labels + ) + result.iloc[0] = 10 + assert df.iloc[0] == 0 + class TestCategoricalSeries: @pytest.mark.parametrize( diff --git a/pandas/tests/test_flags.py b/pandas/tests/test_flags.py new file mode 100644 index 0000000000000..f6e3ae4980afb --- /dev/null +++ b/pandas/tests/test_flags.py @@ -0,0 +1,48 @@ +import pytest + +import pandas as pd + + +class TestFlags: + def test_equality(self): + a = pd.DataFrame().set_flags(allows_duplicate_labels=True).flags + b = pd.DataFrame().set_flags(allows_duplicate_labels=False).flags + + assert a == a + assert b == b + assert a != b + assert a != 2 + + def test_set(self): + df = pd.DataFrame().set_flags(allows_duplicate_labels=True) + a = df.flags + a.allows_duplicate_labels = False + assert a.allows_duplicate_labels is False + a["allows_duplicate_labels"] = True + assert a.allows_duplicate_labels is True + + def test_repr(self): + a = repr(pd.DataFrame({"A"}).set_flags(allows_duplicate_labels=True).flags) + assert a == "" + a = repr(pd.DataFrame({"A"}).set_flags(allows_duplicate_labels=False).flags) + assert a == "" + + def test_obj_ref(self): + df = pd.DataFrame() + flags = df.flags + del df + with pytest.raises(ValueError, match="object has been deleted"): + flags.allows_duplicate_labels = True + + def test_getitem(self): + df = pd.DataFrame() + flags = df.flags + assert flags["allows_duplicate_labels"] is True + flags["allows_duplicate_labels"] = False + assert flags["allows_duplicate_labels"] is False + + with pytest.raises(KeyError): + flags["a"] + + with pytest.raises(ValueError): + flags["a"] = 10 diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 3aa3c64923b14..5174ff005b5fb 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -268,3 +268,18 @@ def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): left = 
pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64") right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) tm.assert_frame_equal(left, right, check_dtype=False) + + +def test_allows_duplicate_labels(): + left = pd.DataFrame() + right = pd.DataFrame().set_flags(allows_duplicate_labels=False) + tm.assert_frame_equal(left, left) + tm.assert_frame_equal(right, right) + tm.assert_frame_equal(left, right, check_flags=False) + tm.assert_frame_equal(right, left, check_flags=False) + + with pytest.raises(AssertionError, match=" Date: Wed, 2 Sep 2020 23:06:18 -0400 Subject: [PATCH 0637/1025] BUG/ENH: compression for google cloud storage in to_csv (#35681) --- doc/source/whatsnew/v1.2.0.rst | 2 + pandas/_typing.py | 29 +++++++- pandas/core/frame.py | 13 ++-- pandas/core/generic.py | 2 + pandas/io/common.py | 104 ++++++++++++++++++++++------ pandas/io/excel/_base.py | 4 +- pandas/io/feather_format.py | 23 +++--- pandas/io/formats/csvs.py | 11 +-- pandas/io/json/_json.py | 19 +++-- pandas/io/orc.py | 4 +- pandas/io/parquet.py | 14 ++-- pandas/io/parsers.py | 11 +-- pandas/io/pickle.py | 28 +++++--- pandas/io/sas/sas7bdat.py | 2 +- pandas/io/sas/sas_xport.py | 9 +-- pandas/io/sas/sasreader.py | 16 +++-- pandas/io/stata.py | 15 ++-- pandas/tests/io/test_common.py | 43 ++++++++---- pandas/tests/io/test_compression.py | 10 +++ pandas/tests/io/test_gcs.py | 92 ++++++++++++++++++------ 20 files changed, 321 insertions(+), 130 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7c083b95b21f3..b07351d05defb 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -290,6 +290,8 @@ I/O - In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) - :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) - :meth:`to_csv` passes compression arguments for `'gzip'` always to `gzip.GzipFile` (:issue:`28103`) +- :meth:`to_csv` 
did not support zip compression for binary file object not having a filename (:issue: `35058`) +- :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) Plotting ^^^^^^^^ diff --git a/pandas/_typing.py b/pandas/_typing.py index 1b972030ef5a5..f8af92e07c674 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,4 +1,6 @@ +from dataclasses import dataclass from datetime import datetime, timedelta, tzinfo +from io import IOBase from pathlib import Path from typing import ( IO, @@ -8,6 +10,7 @@ Callable, Collection, Dict, + Generic, Hashable, List, Mapping, @@ -62,7 +65,8 @@ "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool]] ] DtypeObj = Union[np.dtype, "ExtensionDtype"] -FilePathOrBuffer = Union[str, Path, IO[AnyStr]] +FilePathOrBuffer = Union[str, Path, IO[AnyStr], IOBase] +FileOrBuffer = Union[str, IO[AnyStr], IOBase] # FrameOrSeriesUnion means either a DataFrame or a Series. E.g. # `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series @@ -114,3 +118,26 @@ # compression keywords and compression CompressionDict = Mapping[str, Optional[Union[str, int, bool]]] CompressionOptions = Optional[Union[str, CompressionDict]] + + +# let's bind types +ModeVar = TypeVar("ModeVar", str, None, Optional[str]) +EncodingVar = TypeVar("EncodingVar", str, None, Optional[str]) + + +@dataclass +class IOargs(Generic[ModeVar, EncodingVar]): + """ + Return value of io/common.py:get_filepath_or_buffer. + + Note (copy&past from io/parsers): + filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] + though mypy handling of conditional imports is difficult. 
+ See https://github.com/python/mypy/issues/1297 + """ + + filepath_or_buffer: FileOrBuffer + encoding: EncodingVar + compression: CompressionOptions + should_close: bool + mode: Union[ModeVar, str] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b4c12b9e52f56..c48bec9b670ad 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2284,14 +2284,11 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, should_close = get_filepath_or_buffer( - buf, mode=mode, storage_options=storage_options - ) - assert buf is not None # Help mypy. - assert not isinstance(buf, str) - buf.writelines(result) - if should_close: - buf.close() + ioargs = get_filepath_or_buffer(buf, mode=mode, storage_options=storage_options) + assert not isinstance(ioargs.filepath_or_buffer, str) + ioargs.filepath_or_buffer.writelines(result) + if ioargs.should_close: + ioargs.filepath_or_buffer.close() return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c9eb4a34683f8..e22f9567ee955 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4,6 +4,7 @@ from datetime import timedelta import functools import gc +from io import StringIO import json import operator import pickle @@ -3357,6 +3358,7 @@ def to_csv( formatter.save() if path_or_buf is None: + assert isinstance(formatter.path_or_buf, StringIO) return formatter.path_or_buf.getvalue() return None diff --git a/pandas/io/common.py b/pandas/io/common.py index d1305c9cabe0e..97dbc7f1031a2 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -27,12 +27,17 @@ uses_params, uses_relative, ) +import warnings import zipfile from pandas._typing import ( CompressionDict, CompressionOptions, + EncodingVar, + FileOrBuffer, FilePathOrBuffer, + IOargs, + ModeVar, StorageOptions, ) from pandas.compat import _get_lzma_file, _import_lzma @@ -69,9 +74,7 @@ def is_url(url) -> bool: return 
parse_url(url).scheme in _VALID_URLS -def _expand_user( - filepath_or_buffer: FilePathOrBuffer[AnyStr], -) -> FilePathOrBuffer[AnyStr]: +def _expand_user(filepath_or_buffer: FileOrBuffer[AnyStr]) -> FileOrBuffer[AnyStr]: """ Return the argument with an initial component of ~ or ~user replaced by that user's home directory. @@ -101,7 +104,7 @@ def validate_header_arg(header) -> None: def stringify_path( filepath_or_buffer: FilePathOrBuffer[AnyStr], -) -> FilePathOrBuffer[AnyStr]: +) -> FileOrBuffer[AnyStr]: """ Attempt to convert a path-like object to a string. @@ -134,9 +137,9 @@ def stringify_path( # "__fspath__" [union-attr] # error: Item "IO[bytes]" of "Union[str, Path, IO[bytes]]" has no # attribute "__fspath__" [union-attr] - return filepath_or_buffer.__fspath__() # type: ignore[union-attr] + filepath_or_buffer = filepath_or_buffer.__fspath__() # type: ignore[union-attr] elif isinstance(filepath_or_buffer, pathlib.Path): - return str(filepath_or_buffer) + filepath_or_buffer = str(filepath_or_buffer) return _expand_user(filepath_or_buffer) @@ -162,13 +165,13 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: ) -def get_filepath_or_buffer( +def get_filepath_or_buffer( # type: ignore[assignment] filepath_or_buffer: FilePathOrBuffer, - encoding: Optional[str] = None, + encoding: EncodingVar = None, compression: CompressionOptions = None, - mode: Optional[str] = None, + mode: ModeVar = None, storage_options: StorageOptions = None, -): +) -> IOargs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. @@ -191,14 +194,35 @@ def get_filepath_or_buffer( .. versionadded:: 1.2.0 - Returns - ------- - Tuple[FilePathOrBuffer, str, CompressionOptions, bool] - Tuple containing the filepath or buffer, the encoding, the compression - and should_close. + ..versionchange:: 1.2.0 + + Returns the dataclass IOargs. 
""" filepath_or_buffer = stringify_path(filepath_or_buffer) + # bz2 and xz do not write the byte order mark for utf-16 and utf-32 + # print a warning when writing such files + compression_method = infer_compression( + filepath_or_buffer, get_compression_method(compression)[0] + ) + if ( + mode + and "w" in mode + and compression_method in ["bz2", "xz"] + and encoding in ["utf-16", "utf-32"] + ): + warnings.warn( + f"{compression} will not write the byte order mark for {encoding}", + UnicodeWarning, + ) + + # Use binary mode when converting path-like objects to file-like objects (fsspec) + # except when text mode is explicitly requested. The original mode is returned if + # fsspec is not used. + fsspec_mode = mode or "rb" + if "t" not in fsspec_mode and "b" not in fsspec_mode: + fsspec_mode += "b" + if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged if storage_options: @@ -212,7 +236,13 @@ def get_filepath_or_buffer( compression = "gzip" reader = BytesIO(req.read()) req.close() - return reader, encoding, compression, True + return IOargs( + filepath_or_buffer=reader, + encoding=encoding, + compression=compression, + should_close=True, + mode=fsspec_mode, + ) if is_fsspec_url(filepath_or_buffer): assert isinstance( @@ -244,7 +274,7 @@ def get_filepath_or_buffer( try: file_obj = fsspec.open( - filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) ).open() # GH 34626 Reads from Public Buckets without Credentials needs anon=True except tuple(err_types_to_retry_with_anon): @@ -255,23 +285,41 @@ def get_filepath_or_buffer( storage_options = dict(storage_options) storage_options["anon"] = True file_obj = fsspec.open( - filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) ).open() - return file_obj, encoding, compression, True 
+ return IOargs( + filepath_or_buffer=file_obj, + encoding=encoding, + compression=compression, + should_close=True, + mode=fsspec_mode, + ) elif storage_options: raise ValueError( "storage_options passed with file object or non-fsspec file path" ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): - return _expand_user(filepath_or_buffer), None, compression, False + return IOargs( + filepath_or_buffer=_expand_user(filepath_or_buffer), + encoding=encoding, + compression=compression, + should_close=False, + mode=mode, + ) if not is_file_like(filepath_or_buffer): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) - return filepath_or_buffer, None, compression, False + return IOargs( + filepath_or_buffer=filepath_or_buffer, + encoding=encoding, + compression=compression, + should_close=False, + mode=mode, + ) def file_path_to_url(path: str) -> str: @@ -452,6 +500,15 @@ def get_handle( need_text_wrapping = (BufferedIOBase, RawIOBase, S3File) except ImportError: need_text_wrapping = (BufferedIOBase, RawIOBase) + # fsspec is an optional dependency. If it is available, add its file-object + # class to the list of classes that need text wrapping. If fsspec is too old and is + # needed, get_filepath_or_buffer would already have thrown an exception. 
+ try: + from fsspec.spec import AbstractFileSystem + + need_text_wrapping = (*need_text_wrapping, AbstractFileSystem) + except ImportError: + pass handles: List[Union[IO, _MMapWrapper]] = list() f = path_or_buf @@ -583,12 +640,15 @@ def __init__( self.archive_name = archive_name kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} kwargs_zip.update(kwargs) - super().__init__(file, mode, **kwargs_zip) + super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] def write(self, data): archive_name = self.filename if self.archive_name is not None: archive_name = self.archive_name + if archive_name is None: + # ZipFile needs a non-empty string + archive_name = "zip" super().writestr(archive_name, data) @property diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ead36c95556b1..9bc1d7fedcb31 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -352,9 +352,9 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer = get_filepath_or_buffer( filepath_or_buffer, storage_options=storage_options - ) + ).filepath_or_buffer if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index fb606b5ec8aef..a98eebe1c6a2a 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -34,9 +34,7 @@ def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kw import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer( - path, mode="wb", storage_options=storage_options - ) + ioargs = get_filepath_or_buffer(path, mode="wb", storage_options=storage_options) if not 
isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -74,7 +72,11 @@ def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kw if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_feather(df, path, **kwargs) + feather.write_feather(df, ioargs.filepath_or_buffer, **kwargs) + + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) + ioargs.filepath_or_buffer.close() def read_feather( @@ -122,14 +124,15 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer( - path, storage_options=storage_options - ) + ioargs = get_filepath_or_buffer(path, storage_options=storage_options) - df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) + df = feather.read_feather( + ioargs.filepath_or_buffer, columns=columns, use_threads=bool(use_threads) + ) # s3fs only validates the credentials when the file is closed. 
- if should_close: - path.close() + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) + ioargs.filepath_or_buffer.close() return df diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index c462a96da7133..270caec022fef 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -62,14 +62,19 @@ def __init__( # Extract compression mode as given, if dict compression, self.compression_args = get_compression_method(compression) + self.compression = infer_compression(path_or_buf, compression) - self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( path_or_buf, encoding=encoding, - compression=compression, + compression=self.compression, mode=mode, storage_options=storage_options, ) + self.path_or_buf = ioargs.filepath_or_buffer + self.should_close = ioargs.should_close + self.mode = ioargs.mode + self.sep = sep self.na_rep = na_rep self.float_format = float_format @@ -78,12 +83,10 @@ def __init__( self.header = header self.index = index self.index_label = index_label - self.mode = mode if encoding is None: encoding = "utf-8" self.encoding = encoding self.errors = errors - self.compression = infer_compression(self.path_or_buf, compression) if quoting is None: quoting = csvlib.QUOTE_MINIMAL diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index fe5e172655ae1..7a3b76ff7e3d0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -58,12 +58,14 @@ def to_json( ) if path_or_buf is not None: - path_or_buf, _, _, should_close = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( path_or_buf, compression=compression, mode="wt", storage_options=storage_options, ) + path_or_buf = ioargs.filepath_or_buffer + should_close = ioargs.should_close if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") @@ -102,6 +104,8 @@ def to_json( fh.write(s) finally: fh.close() + for handle in handles: + 
handle.close() elif path_or_buf is None: return s else: @@ -615,7 +619,7 @@ def read_json( compression_method, compression = get_compression_method(compression) compression_method = infer_compression(path_or_buf, compression_method) compression = dict(compression, method=compression_method) - filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, @@ -623,7 +627,7 @@ def read_json( ) json_reader = JsonReader( - filepath_or_buffer, + ioargs.filepath_or_buffer, orient=orient, typ=typ, dtype=dtype, @@ -633,10 +637,10 @@ def read_json( numpy=numpy, precise_float=precise_float, date_unit=date_unit, - encoding=encoding, + encoding=ioargs.encoding, lines=lines, chunksize=chunksize, - compression=compression, + compression=ioargs.compression, nrows=nrows, ) @@ -644,8 +648,9 @@ def read_json( return json_reader result = json_reader.read() - if should_close: - filepath_or_buffer.close() + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) + ioargs.filepath_or_buffer.close() return result diff --git a/pandas/io/orc.py b/pandas/io/orc.py index b556732e4d116..f1b1aa6a43cb5 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -50,7 +50,7 @@ def read_orc( import pyarrow.orc - path, _, _, _ = get_filepath_or_buffer(path) - orc_file = pyarrow.orc.ORCFile(path) + ioargs = get_filepath_or_buffer(path) + orc_file = pyarrow.orc.ORCFile(ioargs.filepath_or_buffer) result = orc_file.read(columns=columns, **kwargs).to_pandas() return result diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f2ce2f056ce82..07f2078931687 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -9,7 +9,7 @@ from pandas import DataFrame, get_option -from pandas.io.common import _expand_user, get_filepath_or_buffer, is_fsspec_url +from pandas.io.common import get_filepath_or_buffer, is_fsspec_url, stringify_path def get_engine(engine: str) -> 
"BaseImpl": @@ -113,7 +113,7 @@ def write( raise ValueError( "storage_options passed with file object or non-fsspec file path" ) - path = _expand_user(path) + path = stringify_path(path) if partition_cols is not None: # writes to multiple files under the given path self.api.parquet.write_to_dataset( @@ -143,10 +143,12 @@ def read( ) fs = kwargs.pop("filesystem", None) should_close = False - path = _expand_user(path) + path = stringify_path(path) if not fs: - path, _, _, should_close = get_filepath_or_buffer(path) + ioargs = get_filepath_or_buffer(path) + path = ioargs.filepath_or_buffer + should_close = ioargs.should_close kwargs["use_pandas_metadata"] = True result = self.api.parquet.read_table( @@ -205,7 +207,7 @@ def write( raise ValueError( "storage_options passed with file object or non-fsspec file path" ) - path, _, _, _ = get_filepath_or_buffer(path) + path = get_filepath_or_buffer(path).filepath_or_buffer with catch_warnings(record=True): self.api.write( @@ -228,7 +230,7 @@ def read( ).open() parquet_file = self.api.ParquetFile(path, open_with=open_with) else: - path, _, _, _ = get_filepath_or_buffer(path) + path = get_filepath_or_buffer(path).filepath_or_buffer parquet_file = self.api.ParquetFile(path) return parquet_file.to_pandas(columns=columns, **kwargs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9ad527684120e..c6ef5221e7ead 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -432,10 +432,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] # though mypy handling of conditional imports is difficult. 
# See https://github.com/python/mypy/issues/1297 - fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( filepath_or_buffer, encoding, compression, storage_options=storage_options ) - kwds["compression"] = compression + kwds["compression"] = ioargs.compression if kwds.get("date_parser", None) is not None: if isinstance(kwds["parse_dates"], bool): @@ -450,7 +450,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _validate_names(kwds.get("names", None)) # Create the parser. - parser = TextFileReader(fp_or_buf, **kwds) + parser = TextFileReader(ioargs.filepath_or_buffer, **kwds) if chunksize or iterator: return parser @@ -460,9 +460,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): finally: parser.close() - if should_close: + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) try: - fp_or_buf.close() + ioargs.filepath_or_buffer.close() except ValueError: pass diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index fc1d2e385cf72..857a2d1b69be4 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -86,15 +86,18 @@ def to_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( filepath_or_buffer, compression=compression, mode="wb", storage_options=storage_options, ) - if not isinstance(fp_or_buf, str) and compression == "infer": + compression = ioargs.compression + if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer": compression = None - f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False) + f, fh = get_handle( + ioargs.filepath_or_buffer, "wb", compression=compression, is_text=False + ) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: @@ -105,9 +108,10 @@ def to_pickle( f.close() for _f in fh: _f.close() - if should_close: + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) try: 
- fp_or_buf.close() + ioargs.filepath_or_buffer.close() except ValueError: pass @@ -189,12 +193,15 @@ def read_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( filepath_or_buffer, compression=compression, storage_options=storage_options ) - if not isinstance(fp_or_buf, str) and compression == "infer": + compression = ioargs.compression + if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer": compression = None - f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False) + f, fh = get_handle( + ioargs.filepath_or_buffer, "rb", compression=compression, is_text=False + ) # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes @@ -222,8 +229,9 @@ def read_pickle( f.close() for _f in fh: _f.close() - if should_close: + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) try: - fp_or_buf.close() + ioargs.filepath_or_buffer.close() except ValueError: pass diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 3d9be7c15726b..76dac39d1889f 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -137,7 +137,7 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) + self._path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer if isinstance(self._path_or_buf, str): self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 6cf248b748107..e4d9324ce5130 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -253,12 +253,9 @@ def __init__( self._chunksize = chunksize if isinstance(filepath_or_buffer, str): - ( - filepath_or_buffer, - encoding, - compression, - should_close, - ) = 
get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) + filepath_or_buffer = get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding + ).filepath_or_buffer if isinstance(filepath_or_buffer, (str, bytes)): self.filepath_or_buffer = open(filepath_or_buffer, "rb") diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index fffdebda8c87a..ae9457a8e3147 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -109,22 +109,26 @@ def read_sas( else: raise ValueError("unable to infer format of SAS file") - filepath_or_buffer, _, _, should_close = get_filepath_or_buffer( - filepath_or_buffer, encoding - ) + ioargs = get_filepath_or_buffer(filepath_or_buffer, encoding) reader: ReaderBase if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader reader = XportReader( - filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ioargs.filepath_or_buffer, + index=index, + encoding=ioargs.encoding, + chunksize=chunksize, ) elif format.lower() == "sas7bdat": from pandas.io.sas.sas7bdat import SAS7BDATReader reader = SAS7BDATReader( - filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ioargs.filepath_or_buffer, + index=index, + encoding=ioargs.encoding, + chunksize=chunksize, ) else: raise ValueError("unknown SAS format") @@ -134,6 +138,6 @@ def read_sas( data = reader.read() - if should_close: + if ioargs.should_close: reader.close() return data diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ec3819f1673a8..0074ebc4decb0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1069,9 +1069,9 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close = get_filepath_or_buffer( + path_or_buf = get_filepath_or_buffer( path_or_buf, storage_options=storage_options - ) + ).filepath_or_buffer if isinstance(path_or_buf, (str, bytes)): 
self.path_or_buf = open(path_or_buf, "rb") @@ -1979,11 +1979,16 @@ def _open_file_binary_write( compression_typ, compression_args = get_compression_method(compression) compression_typ = infer_compression(fname, compression_typ) compression = dict(compression_args, method=compression_typ) - path_or_buf, _, compression, _ = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( fname, mode="wb", compression=compression, storage_options=storage_options, ) - f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False) - return f, True, compression + f, _ = get_handle( + ioargs.filepath_or_buffer, + "wb", + compression=ioargs.compression, + is_text=False, + ) + return f, True, ioargs.compression else: raise TypeError("fname must be a binary file, buffer or path-like.") diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 5ce2233bc0cd0..85a12a13d19fb 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -105,21 +105,21 @@ def test_infer_compression_from_path(self, extension, expected, path_type): compression = icom.infer_compression(path, compression="infer") assert compression == expected - def test_get_filepath_or_buffer_with_path(self): - filename = "~/sometest" - filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(filename) - assert filepath_or_buffer != filename - assert os.path.isabs(filepath_or_buffer) - assert os.path.expanduser(filename) == filepath_or_buffer - assert not should_close + @pytest.mark.parametrize("path_type", [str, CustomFSPath, Path]) + def test_get_filepath_or_buffer_with_path(self, path_type): + # ignore LocalPath: it creates strange paths: /absolute/~/sometest + filename = path_type("~/sometest") + ioargs = icom.get_filepath_or_buffer(filename) + assert ioargs.filepath_or_buffer != filename + assert os.path.isabs(ioargs.filepath_or_buffer) + assert os.path.expanduser(filename) == ioargs.filepath_or_buffer + assert not ioargs.should_close def 
test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() - filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer( - input_buffer - ) - assert filepath_or_buffer == input_buffer - assert not should_close + ioargs = icom.get_filepath_or_buffer(input_buffer) + assert ioargs.filepath_or_buffer == input_buffer + assert not ioargs.should_close def test_iterator(self): reader = pd.read_csv(StringIO(self.data1), chunksize=1) @@ -389,6 +389,25 @@ def test_binary_mode(self): df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + @pytest.mark.parametrize("encoding", ["utf-16", "utf-32"]) + @pytest.mark.parametrize("compression_", ["bz2", "xz"]) + def test_warning_missing_utf_bom(self, encoding, compression_): + """ + bz2 and xz do not write the byte order mark (BOM) for utf-16/32. + + https://stackoverflow.com/questions/55171439 + + GH 35681 + """ + df = tm.makeDataFrame() + with tm.ensure_clean() as path: + with tm.assert_produces_warning(UnicodeWarning): + df.to_csv(path, compression=compression_, encoding=encoding) + + # reading should fail (otherwise we wouldn't need the warning) + with pytest.raises(Exception): + pd.read_csv(path, compression=compression_, encoding=encoding) + def test_is_fsspec_url(): assert icom.is_fsspec_url("gcs://pandas/somethingelse.com") diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index bc14b485f75e5..31e9ad4cf4416 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -124,6 +124,8 @@ def test_compression_binary(compression_only): GH22555 """ df = tm.makeDataFrame() + + # with a file with tm.ensure_clean() as path: with open(path, mode="wb") as file: df.to_csv(file, mode="wb", compression=compression_only) @@ -132,6 +134,14 @@ def test_compression_binary(compression_only): df, pd.read_csv(path, index_col=0, compression=compression_only) ) + # with BytesIO + file = io.BytesIO() + df.to_csv(file, 
mode="wb", compression=compression_only) + file.seek(0) # file shouldn't be closed + tm.assert_frame_equal( + df, pd.read_csv(file, index_col=0, compression=compression_only) + ) + def test_gzip_reproducibility_file_name(): """ diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index eacf4fa08545d..18b5743a3375a 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -9,12 +9,32 @@ from pandas.util import _test_decorators as td -@td.skip_if_no("gcsfs") -def test_read_csv_gcs(monkeypatch): +@pytest.fixture +def gcs_buffer(monkeypatch): + """Emulate GCS using a binary buffer.""" from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state + gcs_buffer = BytesIO() + gcs_buffer.close = lambda: True + + class MockGCSFileSystem(AbstractFileSystem): + def open(*args, **kwargs): + gcs_buffer.seek(0) + return gcs_buffer + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + + return gcs_buffer + + +@td.skip_if_no("gcsfs") +def test_read_csv_gcs(gcs_buffer): + from fsspec import registry + + registry.target.clear() # noqa # remove state + df1 = DataFrame( { "int": [1, 3], @@ -24,21 +44,19 @@ def test_read_csv_gcs(monkeypatch): } ) - class MockGCSFileSystem(AbstractFileSystem): - def open(*args, **kwargs): - return BytesIO(df1.to_csv(index=False).encode()) + gcs_buffer.write(df1.to_csv(index=False).encode()) - monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) tm.assert_frame_equal(df1, df2) @td.skip_if_no("gcsfs") -def test_to_csv_gcs(monkeypatch): - from fsspec import AbstractFileSystem, registry +def test_to_csv_gcs(gcs_buffer): + from fsspec import registry registry.target.clear() # noqa # remove state + df1 = DataFrame( { "int": [1, 3], @@ -47,29 +65,57 @@ def test_to_csv_gcs(monkeypatch): "dt": date_range("2018-06-18", periods=2), } ) - s = BytesIO() - s.close = lambda: True - - class 
MockGCSFileSystem(AbstractFileSystem): - def open(*args, **kwargs): - s.seek(0) - return s - monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) df1.to_csv("gs://test/test.csv", index=True) - def mock_get_filepath_or_buffer(*args, **kwargs): - return BytesIO(df1.to_csv(index=True).encode()), None, None, False - - monkeypatch.setattr( - "pandas.io.common.get_filepath_or_buffer", mock_get_filepath_or_buffer - ) - df2 = read_csv("gs://test/test.csv", parse_dates=["dt"], index_col=0) tm.assert_frame_equal(df1, df2) +@td.skip_if_no("gcsfs") +@pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) +def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding): + """ + Compression and encoding should with GCS. + + GH 35677 (to_csv, compression), GH 26124 (to_csv, encoding), and + GH 32392 (read_csv, encoding) + """ + from fsspec import registry + + registry.target.clear() # noqa # remove state + df = tm.makeDataFrame() + + # reference of compressed and encoded file + compression = {"method": compression_only} + if compression_only == "gzip": + compression["mtime"] = 1 # be reproducible + buffer = BytesIO() + df.to_csv(buffer, compression=compression, encoding=encoding, mode="wb") + + # write compressed file with explicit compression + path_gcs = "gs://test/test.csv" + df.to_csv(path_gcs, compression=compression, encoding=encoding) + assert gcs_buffer.getvalue() == buffer.getvalue() + read_df = read_csv( + path_gcs, index_col=0, compression=compression_only, encoding=encoding + ) + tm.assert_frame_equal(df, read_df) + + # write compressed file with implicit compression + if compression_only == "gzip": + compression_only = "gz" + compression["method"] = "infer" + path_gcs += f".{compression_only}" + df.to_csv( + path_gcs, compression=compression, encoding=encoding, + ) + assert gcs_buffer.getvalue() == buffer.getvalue() + read_df = read_csv(path_gcs, index_col=0, compression="infer", encoding=encoding) + tm.assert_frame_equal(df, read_df) + + 
@td.skip_if_no("fastparquet") @td.skip_if_no("gcsfs") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): From 8985c61e4f889d889ca93eb5d7878734b89a7579 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 3 Sep 2020 11:20:51 +0100 Subject: [PATCH 0638/1025] CI: MyPy fixup (#36085) --- pandas/io/common.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 97dbc7f1031a2..9328f90ce67a3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -165,11 +165,16 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: ) -def get_filepath_or_buffer( # type: ignore[assignment] +# https://github.com/python/mypy/issues/8708 +# error: Incompatible default for argument "encoding" (default has type "None", +# argument has type "str") +# error: Incompatible default for argument "mode" (default has type "None", +# argument has type "str") +def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, - encoding: EncodingVar = None, + encoding: EncodingVar = None, # type: ignore[assignment] compression: CompressionOptions = None, - mode: ModeVar = None, + mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, ) -> IOargs[ModeVar, EncodingVar]: """ From 188c0d4d1aa9f0f8ce42351a60033b6aaa332f80 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Thu, 3 Sep 2020 18:14:09 +0200 Subject: [PATCH 0639/1025] Update contributing_docstring.rst (#36087) --- doc/source/development/contributing_docstring.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 0c780ad5f5847..33f30e1d97512 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -32,18 +32,18 @@ The next example gives an idea of what a docstring looks like: 
Parameters ---------- num1 : int - First number to add + First number to add. num2 : int - Second number to add + Second number to add. Returns ------- int - The sum of `num1` and `num2` + The sum of `num1` and `num2`. See Also -------- - subtract : Subtract one integer from another + subtract : Subtract one integer from another. Examples -------- @@ -998,4 +998,4 @@ mapping function names to docstrings. Wherever possible, we prefer using See ``pandas.core.generic.NDFrame.fillna`` for an example template, and ``pandas.core.series.Series.fillna`` and ``pandas.core.generic.frame.fillna`` -for the filled versions. \ No newline at end of file +for the filled versions. From 06495cfe1d106728b664a30fab8ff53f918ecc7f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 3 Sep 2020 17:29:46 +0100 Subject: [PATCH 0640/1025] DOC: minor fixes to whatsnew\v1.1.2.rst (#36086) --- doc/source/whatsnew/v1.1.2.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index c740c7b3882c9..ac9fe9d2fca26 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -30,9 +30,9 @@ Bug fixes - Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) -- Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) +- Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) -- Bug in 
:class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`36051`) +- Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) .. --------------------------------------------------------------------------- @@ -40,7 +40,7 @@ Bug fixes Other ~~~~~ -- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize`(:issue:`35667`) +- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize` (:issue:`35667`) .. --------------------------------------------------------------------------- From 0909db6faff594f8c856e7c35d450a318ee09cc5 Mon Sep 17 00:00:00 2001 From: Sarthak Vineet Kumar Date: Thu, 3 Sep 2020 22:03:46 +0530 Subject: [PATCH 0641/1025] CLN remove unnecessary trailing commas in pandas/io (#36052) --- pandas/io/sas/sas_xport.py | 2 +- pandas/io/stata.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index e4d9324ce5130..1a4ba544f5d59 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -244,7 +244,7 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None, + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None ): self._encoding = encoding diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 0074ebc4decb0..34d520004cc65 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1980,7 +1980,7 @@ def 
_open_file_binary_write( compression_typ = infer_compression(fname, compression_typ) compression = dict(compression_args, method=compression_typ) ioargs = get_filepath_or_buffer( - fname, mode="wb", compression=compression, storage_options=storage_options, + fname, mode="wb", compression=compression, storage_options=storage_options ) f, _ = get_handle( ioargs.filepath_or_buffer, From 6f3f868f39b6c9205088c02e53c9ae23740c425b Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 3 Sep 2020 17:35:04 +0100 Subject: [PATCH 0642/1025] DOC: add mypy version to whatsnew\v1.2.0.rst (#36090) --- doc/source/whatsnew/v1.2.0.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b07351d05defb..e65daa439a225 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -136,6 +136,8 @@ If installed, we now require: +-----------------+-----------------+----------+---------+ | pytest (dev) | 5.0.1 | | X | +-----------------+-----------------+----------+---------+ +| mypy (dev) | 0.782 | | X | ++-----------------+-----------------+----------+---------+ For `optional libraries `_ the general recommendation is to use the latest version. The following table lists the lowest version per library that is currently being tested throughout the development of pandas. From 3d13985185d8b9dda823327f50410c3d9442a133 Mon Sep 17 00:00:00 2001 From: timhunderwood <43515959+timhunderwood@users.noreply.github.com> Date: Thu, 3 Sep 2020 17:58:36 +0100 Subject: [PATCH 0643/1025] DOC: Add Notes about difference to numpy behaviour for ddof in std() GH35985 (#35986) * DOC: Add Notes about difference to numpy behaviour for ddof. GH35985. * remove trailing whitespace. * Update pandas/core/generic.py wording change. Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> * Make wording simpler and remove reference to normalization. * Make wording simpler and remove reference to normalization. 
Co-authored-by: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> --- pandas/core/generic.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e22f9567ee955..6c8780a0fc186 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10832,7 +10832,12 @@ def _doc_parms(cls): Returns ------- -%(name1)s or %(name2)s (if level specified)\n""" +%(name1)s or %(name2)s (if level specified) + +Notes +----- +To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the +default `ddof=1`)\n""" _bool_doc = """ %(desc)s From fe567596b8d0eb9d0be6a576719cb1eeb9ce4e7f Mon Sep 17 00:00:00 2001 From: Jeet Parekh <12874561+jeet-parekh@users.noreply.github.com> Date: Fri, 4 Sep 2020 19:58:15 +0530 Subject: [PATCH 0644/1025] BUG: groupby and agg on read-only array gives ValueError: buffer source array is read-only (#36061) --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/_libs/groupby.pyx | 32 ++++++++------- pandas/tests/groupby/aggregate/test_cython.py | 41 +++++++++++++++++++ 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index ac9fe9d2fca26..7195f3d7a3885 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -18,7 +18,7 @@ Fixed regressions - Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) -- +- Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 38cb973d6dde9..a83634aad3ce2 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -229,7 +229,7 @@ def group_cumprod_float64(float64_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_cumsum(numeric[:, :] out, - numeric[:, :] values, + ndarray[numeric, ndim=2] values, const int64_t[:] labels, int ngroups, is_datetimelike, @@ -472,7 +472,7 @@ ctypedef fused complexfloating_t: @cython.boundscheck(False) def _group_add(complexfloating_t[:, :] out, int64_t[:] counts, - complexfloating_t[:, :] values, + ndarray[complexfloating_t, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=0): """ @@ -483,8 +483,9 @@ def _group_add(complexfloating_t[:, :] out, complexfloating_t val, count complexfloating_t[:, :] sumx int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) - if len(values) != len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -530,7 +531,7 @@ group_add_complex128 = _group_add['double complex'] @cython.boundscheck(False) def _group_prod(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=0): """ @@ -541,8 +542,9 @@ def _group_prod(floating[:, :] out, floating val, count floating[:, :] prodx int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) - if not len(values) == len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -582,7 +584,7 @@ group_prod_float64 = _group_prod['double'] @cython.cdivision(True) def _group_var(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t 
min_count=-1, int64_t ddof=1): @@ -591,10 +593,11 @@ def _group_var(floating[:, :] out, floating val, ct, oldmean floating[:, :] mean int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -639,7 +642,7 @@ group_var_float64 = _group_var['double'] @cython.boundscheck(False) def _group_mean(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): cdef: @@ -647,10 +650,11 @@ def _group_mean(floating[:, :] out, floating val, count floating[:, :] sumx int64_t[:, :] nobs + Py_ssize_t len_values = len(values), len_labels = len(labels) assert min_count == -1, "'min_count' only used in add and prod" - if not len(values) == len(labels): + if len_values != len_labels: raise ValueError("len(index) != len(labels)") nobs = np.zeros((out).shape, dtype=np.int64) @@ -689,7 +693,7 @@ group_mean_float64 = _group_mean['double'] @cython.boundscheck(False) def _group_ohlc(floating[:, :] out, int64_t[:] counts, - floating[:, :] values, + ndarray[floating, ndim=2] values, const int64_t[:] labels, Py_ssize_t min_count=-1): """ @@ -740,7 +744,7 @@ group_ohlc_float64 = _group_ohlc['double'] @cython.boundscheck(False) @cython.wraparound(False) def group_quantile(ndarray[float64_t] out, - numeric[:] values, + ndarray[numeric, ndim=1] values, ndarray[int64_t] labels, ndarray[uint8_t] mask, float64_t q, @@ -1072,7 +1076,7 @@ def group_nth(rank_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_rank(float64_t[:, :] out, - rank_t[:, :] values, + ndarray[rank_t, ndim=2] values, const int64_t[:] labels, int ngroups, bint is_datetimelike, object ties_method="average", @@ -1424,7 +1428,7 @@ def group_min(groupby_t[:, :] out, 
@cython.boundscheck(False) @cython.wraparound(False) def group_cummin(groupby_t[:, :] out, - groupby_t[:, :] values, + ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, int ngroups, bint is_datetimelike): @@ -1484,7 +1488,7 @@ def group_cummin(groupby_t[:, :] out, @cython.boundscheck(False) @cython.wraparound(False) def group_cummax(groupby_t[:, :] out, - groupby_t[:, :] values, + ndarray[groupby_t, ndim=2] values, const int64_t[:] labels, int ngroups, bint is_datetimelike): diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py index 5ddda264642de..87ebd8b5a27fb 100644 --- a/pandas/tests/groupby/aggregate/test_cython.py +++ b/pandas/tests/groupby/aggregate/test_cython.py @@ -236,3 +236,44 @@ def test_cython_with_timestamp_and_nat(op, data): result = df.groupby("a").aggregate(op) tm.assert_frame_equal(expected, result) + + +@pytest.mark.parametrize( + "agg", + [ + "min", + "max", + "count", + "sum", + "prod", + "var", + "mean", + "median", + "ohlc", + "cumprod", + "cumsum", + "shift", + "any", + "all", + "quantile", + "first", + "last", + "rank", + "cummin", + "cummax", + ], +) +def test_read_only_buffer_source_agg(agg): + # https://github.com/pandas-dev/pandas/issues/36014 + df = DataFrame( + { + "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0], + "species": ["setosa", "setosa", "setosa", "setosa", "setosa"], + } + ) + df._mgr.blocks[0].values.flags.writeable = False + + result = df.groupby(["species"]).agg({"sepal_length": agg}) + expected = df.copy().groupby(["species"]).agg({"sepal_length": agg}) + + tm.assert_equal(result, expected) From 09c2355a326bcb87be31da38485e808bb625b5ba Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 4 Sep 2020 09:32:41 -0500 Subject: [PATCH 0645/1025] CLN: use IS64 instead of is_platform_32bit #36108 (#36109) --- pandas/_libs/missing.pyx | 4 ++-- pandas/compat/__init__.py | 24 +------------------ pandas/tests/frame/test_api.py | 5 ++-- 
.../indexes/interval/test_interval_tree.py | 8 +++---- pandas/tests/indexing/test_coercion.py | 4 ++-- pandas/tests/io/formats/test_format.py | 4 ++-- pandas/tests/io/json/test_pandas.py | 8 +++---- pandas/tests/io/json/test_ujson.py | 12 ++++------ pandas/tests/test_algos.py | 6 ++--- pandas/util/_test_decorators.py | 4 ++-- 10 files changed, 26 insertions(+), 53 deletions(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 771e8053ac9be..abf38265ddc6d 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -18,7 +18,7 @@ from pandas._libs.tslibs.nattype cimport ( from pandas._libs.tslibs.np_datetime cimport get_datetime64_value, get_timedelta64_value from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op -from pandas.compat import is_platform_32bit +from pandas.compat import IS64 cdef: float64_t INF = np.inf @@ -26,7 +26,7 @@ cdef: int64_t NPY_NAT = util.get_nat() - bint is_32bit = is_platform_32bit() + bint is_32bit = not IS64 cpdef bint checknull(object val): diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index ab2835932c95d..f2018a5c01711 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -8,7 +8,6 @@ * platform checker """ import platform -import struct import sys import warnings @@ -20,14 +19,6 @@ IS64 = sys.maxsize > 2 ** 32 -# ---------------------------------------------------------------------------- -# functions largely based / taken from the six module - -# Much of the code in this module comes from Benjamin Peterson's six library. -# The license for this library can be found in LICENSES/SIX and the code can be -# found at https://bitbucket.org/gutworth/six - - def set_function_name(f: F, name: str, cls) -> F: """ Bind the name/qualname attributes of the function. 
@@ -38,7 +29,6 @@ def set_function_name(f: F, name: str, cls) -> F: return f -# https://github.com/pandas-dev/pandas/pull/9123 def is_platform_little_endian() -> bool: """ Checking if the running platform is little endian. @@ -72,7 +62,7 @@ def is_platform_linux() -> bool: bool True if the running platform is linux. """ - return sys.platform == "linux2" + return sys.platform == "linux" def is_platform_mac() -> bool: @@ -87,18 +77,6 @@ def is_platform_mac() -> bool: return sys.platform == "darwin" -def is_platform_32bit() -> bool: - """ - Checking if the running platform is 32-bit. - - Returns - ------- - bool - True if the running platform is 32-bit. - """ - return struct.calcsize("P") * 8 < 64 - - def _import_lzma(): """ Importing the `lzma` module. diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index b1c31a6f90133..8b5d0c7ade56c 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -6,11 +6,12 @@ import numpy as np import pytest +from pandas.compat import IS64, is_platform_windows import pandas.util._test_decorators as td from pandas.util._test_decorators import async_mark, skip_if_no import pandas as pd -from pandas import Categorical, DataFrame, Series, compat, date_range, timedelta_range +from pandas import Categorical, DataFrame, Series, date_range, timedelta_range import pandas._testing as tm @@ -254,7 +255,7 @@ def test_itertuples(self, float_frame): assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)] # repr with int on 32-bit/windows - if not (compat.is_platform_windows() or compat.is_platform_32bit()): + if not (is_platform_windows() or not IS64): assert ( repr(list(df.itertuples(name=None))) == "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]" diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 476ec1dd10b4b..ab6eac482211d 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ 
b/pandas/tests/indexes/interval/test_interval_tree.py @@ -4,8 +4,8 @@ import pytest from pandas._libs.interval import IntervalTree +from pandas.compat import IS64 -from pandas import compat import pandas._testing as tm @@ -14,9 +14,7 @@ def skipif_32bit(param): Skip parameters in a parametrize on 32bit systems. Specifically used here to skip leaf_size parameters related to GH 23440. """ - marks = pytest.mark.skipif( - compat.is_platform_32bit(), reason="GH 23440: int type mismatch on 32bit" - ) + marks = pytest.mark.skipif(not IS64, reason="GH 23440: int type mismatch on 32bit") return pytest.param(param, marks=marks) @@ -181,7 +179,7 @@ def test_is_overlapping_trivial(self, closed, left, right): tree = IntervalTree(left, right, closed=closed) assert tree.is_overlapping is False - @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440") + @pytest.mark.skipif(not IS64, reason="GH 23440") def test_construction_overflow(self): # GH 25485 left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101 diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 1512c88a68778..1c5f00ff754a4 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -5,7 +5,7 @@ import numpy as np import pytest -import pandas.compat as compat +from pandas.compat import IS64, is_platform_windows import pandas as pd import pandas._testing as tm @@ -1041,7 +1041,7 @@ def test_replace_series(self, how, to_key, from_key): from_key == "complex128" and to_key in ("int64", "float64") ): - if compat.is_platform_32bit() or compat.is_platform_windows(): + if not IS64 or is_platform_windows(): pytest.skip(f"32-bit platform buggy: {from_key} -> {to_key}") # Expected: do not downcast by replacement diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index a0f475acc4cbb..7daed015f4c57 100644 --- a/pandas/tests/io/formats/test_format.py +++ 
b/pandas/tests/io/formats/test_format.py @@ -18,7 +18,7 @@ import pytest import pytz -from pandas.compat import is_platform_32bit, is_platform_windows +from pandas.compat import IS64, is_platform_windows import pandas.util._test_decorators as td import pandas as pd @@ -41,7 +41,7 @@ import pandas.io.formats.format as fmt import pandas.io.formats.printing as printing -use_32bit_repr = is_platform_windows() or is_platform_32bit() +use_32bit_repr = is_platform_windows() or not IS64 @pytest.fixture(params=["string", "pathlike", "buffer"]) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 2022abbaee323..59d64e1a6e909 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from pandas.compat import is_platform_32bit, is_platform_windows +from pandas.compat import IS64, is_platform_windows import pandas.util._test_decorators as td import pandas as pd @@ -154,7 +154,7 @@ def test_roundtrip_intframe(self, orient, convert_axes, numpy, dtype, int_frame) expected = int_frame if ( numpy - and (is_platform_32bit() or is_platform_windows()) + and (not IS64 or is_platform_windows()) and not dtype and orient != "split" ): @@ -361,9 +361,7 @@ def test_frame_infinity(self, orient, inf, dtype): result = read_json(df.to_json(), dtype=dtype) assert np.isnan(result.iloc[0, 2]) - @pytest.mark.skipif( - is_platform_32bit(), reason="not compliant on 32-bit, xref #15865" - ) + @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865") @pytest.mark.parametrize( "value,precision,expected_val", [ diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index f969cbca9f427..e2007e07c572a 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -15,7 +15,7 @@ import pandas._libs.json as ujson from pandas._libs.tslib import Timestamp -import pandas.compat as compat +from pandas.compat import 
IS64, is_platform_windows from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range import pandas._testing as tm @@ -53,7 +53,7 @@ def get_int32_compat_dtype(numpy, orient): # See GH#32527 dtype = np.int64 if not ((numpy is None or orient == "index") or (numpy is True and orient is None)): - if compat.is_platform_windows(): + if is_platform_windows(): dtype = np.int32 else: dtype = np.intp @@ -62,9 +62,7 @@ def get_int32_compat_dtype(numpy, orient): class TestUltraJSONTests: - @pytest.mark.skipif( - compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865" - ) + @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865") def test_encode_decimal(self): sut = decimal.Decimal("1337.1337") encoded = ujson.encode(sut, double_precision=15) @@ -561,7 +559,7 @@ def test_encode_long_conversion(self): assert long_input == ujson.decode(output) @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) - @pytest.mark.xfail(not compat.IS64, reason="GH-35288") + @pytest.mark.xfail(not IS64, reason="GH-35288") def test_dumps_ints_larger_than_maxsize(self, bigNum): # GH34395 bigNum = sys.maxsize + 1 @@ -703,7 +701,7 @@ def test_int_array(self, any_int_dtype): tm.assert_numpy_array_equal(arr_input, arr_output) def test_int_max(self, any_int_dtype): - if any_int_dtype in ("int64", "uint64") and compat.is_platform_32bit(): + if any_int_dtype in ("int64", "uint64") and not IS64: pytest.skip("Cannot test 64-bit integer on 32-bit platform") klass = np.dtype(any_int_dtype).type diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 59c6a5d53e7bb..72a679d980641 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -8,6 +8,7 @@ from pandas._libs import algos as libalgos, hashtable as ht from pandas._libs.groupby import group_var_float32, group_var_float64 +from pandas.compat import IS64 from pandas.compat.numpy import np_array_datetime64_compat import 
pandas.util._test_decorators as td @@ -29,7 +30,6 @@ IntervalIndex, Series, Timestamp, - compat, ) import pandas._testing as tm import pandas.core.algorithms as algos @@ -1137,7 +1137,7 @@ def test_dropna(self): ) # 32-bit linux has a different ordering - if not compat.is_platform_32bit(): + if IS64: result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False) expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan]) tm.assert_series_equal(result, expected) @@ -1170,7 +1170,7 @@ def test_value_counts_uint64(self): result = algos.value_counts(arr) # 32-bit linux has a different ordering - if not compat.is_platform_32bit(): + if IS64: tm.assert_series_equal(result, expected) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index ca7b99492bbf7..78facd6694635 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -31,7 +31,7 @@ def test_foo(): import numpy as np import pytest -from pandas.compat import is_platform_32bit, is_platform_windows +from pandas.compat import IS64, is_platform_windows from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import _np_version @@ -180,7 +180,7 @@ def skip_if_no(package: str, min_version: Optional[str] = None): _skip_if_no_mpl(), reason="Missing matplotlib dependency" ) skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present") -skip_if_32bit = pytest.mark.skipif(is_platform_32bit(), reason="skipping for 32 bit") +skip_if_32bit = pytest.mark.skipif(not IS64, reason="skipping for 32 bit") skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows") skip_if_windows_python_3 = pytest.mark.skipif( is_platform_windows(), reason="not used on win32" From 8db14d17e248b6facebc9d14c20313fd0ca85092 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 4 Sep 2020 17:17:09 +0200 Subject: [PATCH 0646/1025] REGR: ensure closed attribute of IntervalIndex is preserved in pickle 
roundtrip (#36118) --- doc/source/whatsnew/v1.1.2.rst | 3 ++- pandas/core/indexes/interval.py | 2 +- pandas/tests/indexes/interval/test_interval.py | 7 +++++++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 7195f3d7a3885..232d0c4b4bbcd 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -18,8 +18,9 @@ Fixed regressions - Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) +- Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) - +- .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 08f9bd51de77b..419ff81a2a478 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -193,7 +193,7 @@ def func(intvidx_self, other, sort=False): class IntervalIndex(IntervalMixin, ExtensionIndex): _typ = "intervalindex" _comparables = ["name"] - _attributes = ["name"] + _attributes = ["name", "closed"] # we would like our indexing holder to defer to us _defer_to_indexing = True diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 2755b186f3eae..a20e542b1edd7 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -874,6 +874,13 @@ def test_get_value_non_scalar_errors(self, key): with tm.assert_produces_warning(FutureWarning): idx.get_value(s, key) + @pytest.mark.parametrize("closed", ["left", "right", "both"]) + def test_pickle_round_trip_closed(self, closed): + # https://github.com/pandas-dev/pandas/issues/35658 + idx = IntervalIndex.from_tuples([(1, 2), (2, 3)], closed=closed) + result = tm.round_trip_pickle(idx) + tm.assert_index_equal(result, idx) + def test_dir(): # GH#27571 dir(interval_index) should not raise From fc38fa0c9014d6c57e2a3ab1be32f23339b2d482 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 4 Sep 2020 18:03:45 +0100 Subject: [PATCH 0647/1025] TYP: misc fixes for numpy types 2 (#36099) --- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/common.py | 5 ++--- pandas/core/reshape/merge.py | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e6b4cb598989b..1489e08d82bf0 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -651,7 +651,7 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> Tuple[DtypeObj, If False, scalar 
belongs to pandas extension types is inferred as object """ - dtype = np.dtype(object) + dtype: DtypeObj = np.dtype(object) # a 1-element ndarray if isinstance(val, np.ndarray): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 1e70ff90fcd44..0bf032725547e 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -108,7 +108,7 @@ def ensure_str(value: Union[bytes, Any]) -> str: return value -def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.array: +def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.ndarray: """ Ensure that an dtype array of some integer dtype has an int64 dtype if possible. @@ -1388,8 +1388,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: # guess this return arr_or_dtype.is_object and arr_or_dtype.inferred_type == "boolean" elif is_extension_array_dtype(arr_or_dtype): - dtype = getattr(arr_or_dtype, "dtype", arr_or_dtype) - return dtype._is_boolean + return getattr(arr_or_dtype, "dtype", arr_or_dtype)._is_boolean return issubclass(dtype.type, np.bool_) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 01e20f49917ac..602ff226f8878 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1870,7 +1870,7 @@ def _right_outer_join(x, y, max_groups): def _factorize_keys( lk: ArrayLike, rk: ArrayLike, sort: bool = True, how: str = "inner" -) -> Tuple[np.array, np.array, int]: +) -> Tuple[np.ndarray, np.ndarray, int]: """ Encode left and right keys as enumerated types. 
From 2c35c44d160b487b65c00711551ad877f432a8a2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 4 Sep 2020 10:34:49 -0700 Subject: [PATCH 0648/1025] TYP: io (#36120) --- pandas/io/excel/_util.py | 12 +++++++----- pandas/io/formats/css.py | 9 ++++++--- setup.cfg | 3 --- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 285aeaf7d4c6e..a4b5b61734ab7 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,3 +1,5 @@ +from typing import List + from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_integer, is_list_like @@ -56,7 +58,7 @@ def get_writer(engine_name): raise ValueError(f"No Excel writer '{engine_name}'") from err -def _excel2num(x): +def _excel2num(x: str) -> int: """ Convert Excel column name like 'AB' to 0-based column index. @@ -88,7 +90,7 @@ def _excel2num(x): return index - 1 -def _range2cols(areas): +def _range2cols(areas: str) -> List[int]: """ Convert comma separated list of column names and ranges to indices. @@ -109,12 +111,12 @@ def _range2cols(areas): >>> _range2cols('A,C,Z:AB') [0, 2, 25, 26, 27] """ - cols = [] + cols: List[int] = [] for rng in areas.split(","): if ":" in rng: - rng = rng.split(":") - cols.extend(range(_excel2num(rng[0]), _excel2num(rng[1]) + 1)) + rngs = rng.split(":") + cols.extend(range(_excel2num(rngs[0]), _excel2num(rngs[1]) + 1)) else: cols.append(_excel2num(rng)) diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index 4d6f03489725f..2e9ee192a1182 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -3,6 +3,7 @@ """ import re +from typing import Optional import warnings @@ -93,6 +94,7 @@ def __call__(self, declarations_str, inherited=None): props[prop] = val # 2. 
resolve relative font size + font_size: Optional[float] if props.get("font-size"): if "font-size" in inherited: em_pt = inherited["font-size"] @@ -173,10 +175,11 @@ def _error(): warnings.warn(f"Unhandled size: {repr(in_val)}", CSSWarning) return self.size_to_pt("1!!default", conversions=conversions) - try: - val, unit = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val).groups() - except AttributeError: + match = re.match(r"^(\S*?)([a-zA-Z%!].*)", in_val) + if match is None: return _error() + + val, unit = match.groups() if val == "": # hack for 'large' etc. val = 1 diff --git a/setup.cfg b/setup.cfg index 29c731848de8e..e7d7df7ff19a2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -279,9 +279,6 @@ check_untyped_defs=False [mypy-pandas.io.formats.console] check_untyped_defs=False -[mypy-pandas.io.formats.css] -check_untyped_defs=False - [mypy-pandas.io.formats.csvs] check_untyped_defs=False From e2b6c6014d081190e5fad277ef5fde5c02eabfe3 Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Fri, 4 Sep 2020 16:29:25 -0400 Subject: [PATCH 0649/1025] BUG: incorrect year returned in isocalendar for certain dates (#36050) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/_libs/tslibs/ccalendar.pyx | 4 ++-- pandas/tests/series/test_datetime_values.py | 3 +++ pandas/tests/tslibs/test_ccalendar.py | 15 +++++++++++++++ 4 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 232d0c4b4bbcd..39850905f60fa 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -33,6 +33,7 @@ Bug fixes - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) +- Bug in 
:meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/ccalendar.pyx b/pandas/_libs/tslibs/ccalendar.pyx index 6cce2f5e1fd95..d8c83daa661a3 100644 --- a/pandas/_libs/tslibs/ccalendar.pyx +++ b/pandas/_libs/tslibs/ccalendar.pyx @@ -201,10 +201,10 @@ cpdef iso_calendar_t get_iso_calendar(int year, int month, int day) nogil: iso_week = 1 iso_year = year - if iso_week == 1 and doy > 7: + if iso_week == 1 and month == 12: iso_year += 1 - elif iso_week >= 52 and doy < 7: + elif iso_week >= 52 and month == 1: iso_year -= 1 return iso_year, iso_week, dow + 1 diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index d2ad9c8c398ea..723bd303b1974 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -682,6 +682,9 @@ def test_setitem_with_different_tz(self): [[pd.NaT], [[np.NaN, np.NaN, np.NaN]]], [["2019-12-31", "2019-12-29"], [[2020, 1, 2], [2019, 52, 7]]], [["2010-01-01", pd.NaT], [[2009, 53, 5], [np.NaN, np.NaN, np.NaN]]], + # see GH#36032 + [["2016-01-08", "2016-01-04"], [[2016, 1, 5], [2016, 1, 1]]], + [["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]], ], ) def test_isocalendar(self, input_series, expected_output): diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py index aab86d3a2df69..1ff700fdc23a3 100644 --- a/pandas/tests/tslibs/test_ccalendar.py +++ b/pandas/tests/tslibs/test_ccalendar.py @@ -1,10 +1,13 @@ from datetime import date, datetime +from hypothesis import given, strategies as st import numpy as np import pytest from pandas._libs.tslibs import 
ccalendar +import pandas as pd + @pytest.mark.parametrize( "date_tuple,expected", @@ -48,3 +51,15 @@ def test_dt_correct_iso_8601_year_week_and_day(input_date_tuple, expected_iso_tu expected_from_date_isocalendar = date(*input_date_tuple).isocalendar() assert result == expected_from_date_isocalendar assert result == expected_iso_tuple + + +@given( + st.datetimes( + min_value=pd.Timestamp.min.to_pydatetime(warn=False), + max_value=pd.Timestamp.max.to_pydatetime(warn=False), + ) +) +def test_isocalendar(dt): + expected = dt.isocalendar() + result = ccalendar.get_iso_calendar(dt.year, dt.month, dt.day) + assert result == expected From 266d410ca7ff9a9a47d00d15d0d2c2bd7da48df8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 4 Sep 2020 13:47:12 -0700 Subject: [PATCH 0650/1025] REF: use BlockManager.apply for DataFrameGroupBy.count (#35924) --- pandas/core/groupby/generic.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 537feace59fcb..53edd056a6802 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -77,7 +77,7 @@ from pandas.core.groupby.numba_ import generate_numba_func, split_for_numba from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase -from pandas.core.internals import BlockManager, make_block +from pandas.core.internals import BlockManager from pandas.core.series import Series from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba @@ -1750,20 +1750,24 @@ def count(self) -> DataFrame: ids, _, ngroups = self.grouper.group_info mask = ids != -1 - # TODO(2DEA): reshape would not be necessary with 2D EAs - vals = ((mask & ~isna(blk.values).reshape(blk.shape)) for blk in data.blocks) - locs = (blk.mgr_locs for blk in data.blocks) + def hfunc(bvalues: ArrayLike) -> ArrayLike: + # TODO(2DEA): reshape would not be necessary with 2D EAs + if 
bvalues.ndim == 1: + # EA + masked = mask & ~isna(bvalues).reshape(1, -1) + else: + masked = mask & ~isna(bvalues) - counted = ( - lib.count_level_2d(x, labels=ids, max_bin=ngroups, axis=1) for x in vals - ) - blocks = [make_block(val, placement=loc) for val, loc in zip(counted, locs)] + counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups, axis=1) + return counted + + new_mgr = data.apply(hfunc) # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in # _wrap_agged_blocks() returns. GH 35028 with com.temp_setattr(self, "observed", True): - result = self._wrap_agged_blocks(blocks, items=data.items) + result = self._wrap_agged_blocks(new_mgr.blocks, items=data.items) return self._reindex_output(result, fill_value=0) From 0f6cc434211d10515b1af2ac05b84c1aea9682a4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 4 Sep 2020 22:48:43 +0200 Subject: [PATCH 0651/1025] REGR: fix consolidation/cache issue with take operation (#36114) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/generic.py | 2 ++ pandas/tests/frame/test_block_internals.py | 23 ++++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 39850905f60fa..d1a66256454ca 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. 
using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) +- Fix regression in invalid cache after an indexing operation; this can manifest when setting which does not update the data (:issue:`35521`) - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6c8780a0fc186..2af323ccc1dd3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3534,6 +3534,8 @@ class max_speed nv.validate_take(tuple(), kwargs) + self._consolidate_inplace() + new_data = self._mgr.take( indices, axis=self._get_block_manager_axis(axis), verify=True ) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 00cfa6265934f..4a85da72bc8b1 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -658,3 +658,26 @@ def test_update_inplace_sets_valid_block_values(): # smoketest for OP bug from GH#35731 assert df.isnull().sum().sum() == 0 + + +def test_nonconsolidated_item_cache_take(): + # https://github.com/pandas-dev/pandas/issues/35521 + + # create non-consolidated dataframe with object dtype columns + df = pd.DataFrame() + df["col1"] = pd.Series(["a"], dtype=object) + df["col2"] = pd.Series([0], dtype=object) + + # access column (item cache) + df["col1"] == "A" + # take operation + # (regression was that this consolidated but didn't reset item cache, + # resulting in an invalid cache and the .at operation not working properly) + df[df["col2"] == 0] + + # 
now setting value should update actual dataframe + df.at[0, "col1"] = "A" + + expected = pd.DataFrame({"col1": ["A"], "col2": [0]}, dtype=object) + tm.assert_frame_equal(df, expected) + assert df.at[0, "col1"] == "A" From 209fb9a823bf290e3ca6a303c17e2adc68369ee4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 4 Sep 2020 13:51:21 -0700 Subject: [PATCH 0652/1025] De-privatize (#36107) --- pandas/_libs/indexing.pyx | 2 +- pandas/_libs/tslibs/parsing.pyx | 4 ++-- pandas/core/arrays/datetimes.py | 4 ++-- pandas/core/arrays/timedeltas.py | 8 ++++---- pandas/core/common.py | 4 ++-- pandas/core/computation/common.py | 2 +- pandas/core/computation/ops.py | 6 +++--- pandas/core/computation/pytables.py | 8 ++++---- pandas/core/dtypes/common.py | 16 ++++++++-------- pandas/core/dtypes/inference.py | 8 ++++---- pandas/core/dtypes/missing.py | 8 ++++---- pandas/core/groupby/ops.py | 8 +++----- pandas/core/indexes/timedeltas.py | 4 ++-- pandas/core/indexing.py | 10 +++++----- pandas/core/internals/blocks.py | 14 +++++++------- pandas/core/internals/concat.py | 4 ++-- pandas/core/internals/managers.py | 6 +++--- pandas/core/nanops.py | 6 +++--- pandas/core/reshape/melt.py | 4 ++-- pandas/core/reshape/util.py | 4 ++-- pandas/core/tools/datetimes.py | 8 ++++---- pandas/io/formats/format.py | 8 ++++---- pandas/tests/dtypes/test_common.py | 8 ++++---- pandas/tests/tslibs/test_parsing.py | 10 +++++----- 24 files changed, 81 insertions(+), 83 deletions(-) diff --git a/pandas/_libs/indexing.pyx b/pandas/_libs/indexing.pyx index f9aedeb8ad93e..7966fe8d4f045 100644 --- a/pandas/_libs/indexing.pyx +++ b/pandas/_libs/indexing.pyx @@ -1,4 +1,4 @@ -cdef class _NDFrameIndexerBase: +cdef class NDFrameIndexerBase: """ A base class for _NDFrameIndexer for fast instantiation and attribute access. 
""" diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 7478179df3b75..aeb1be121bc9e 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -771,7 +771,7 @@ class _timelex: _DATEUTIL_LEXER_SPLIT = _timelex.split -def _format_is_iso(f) -> bint: +def format_is_iso(f: str) -> bint: """ Does format match the iso8601 set that can be handled by the C parser? Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different @@ -789,7 +789,7 @@ def _format_is_iso(f) -> bint: return False -def _guess_datetime_format( +def guess_datetime_format( dt_str, bint dayfirst=False, dt_str_parse=du_parse, diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8b2bb7832b5d0..1bea3a9eb137e 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -602,9 +602,9 @@ def astype(self, dtype, copy=True): # Rendering Methods def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format import _get_format_datetime64_from_values + from pandas.io.formats.format import get_format_datetime64_from_values - fmt = _get_format_datetime64_from_values(self, date_format) + fmt = get_format_datetime64_from_values(self, date_format) return tslib.format_array_from_datetime( self.asi8.ravel(), tz=self.tz, format=fmt, na_rep=na_rep diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 3e21d01355dda..2d694c469b3a9 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -379,14 +379,14 @@ def median( # Rendering Methods def _formatter(self, boxed=False): - from pandas.io.formats.format import _get_format_timedelta64 + from pandas.io.formats.format import get_format_timedelta64 - return _get_format_timedelta64(self, box=True) + return get_format_timedelta64(self, box=True) def _format_native_types(self, na_rep="NaT", date_format=None, **kwargs): - from pandas.io.formats.format 
import _get_format_timedelta64 + from pandas.io.formats.format import get_format_timedelta64 - formatter = _get_format_timedelta64(self._data, na_rep) + formatter = get_format_timedelta64(self._data, na_rep) return np.array([formatter(x) for x in self._data.ravel()]).reshape(self.shape) # ---------------------------------------------------------------- diff --git a/pandas/core/common.py b/pandas/core/common.py index 6fd4700ab7f3f..279d512e5a046 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -31,7 +31,7 @@ ABCIndexClass, ABCSeries, ) -from pandas.core.dtypes.inference import _iterable_not_string +from pandas.core.dtypes.inference import iterable_not_string from pandas.core.dtypes.missing import isna, isnull, notnull # noqa @@ -61,7 +61,7 @@ def flatten(l): flattened : generator """ for el in l: - if _iterable_not_string(el): + if iterable_not_string(el): for s in flatten(el): yield s else: diff --git a/pandas/core/computation/common.py b/pandas/core/computation/common.py index 327ec21c3c11c..8a9583c465f50 100644 --- a/pandas/core/computation/common.py +++ b/pandas/core/computation/common.py @@ -5,7 +5,7 @@ from pandas._config import get_option -def _ensure_decoded(s): +def ensure_decoded(s): """ If we have bytes, decode them to unicode. 
""" diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index e55df1e1d8155..b2144c45c6323 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -15,7 +15,7 @@ from pandas.core.dtypes.common import is_list_like, is_scalar import pandas.core.common as com -from pandas.core.computation.common import _ensure_decoded, result_type_many +from pandas.core.computation.common import ensure_decoded, result_type_many from pandas.core.computation.scope import _DEFAULT_GLOBALS from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded @@ -466,7 +466,7 @@ def stringify(value): v = rhs.value if isinstance(v, (int, float)): v = stringify(v) - v = Timestamp(_ensure_decoded(v)) + v = Timestamp(ensure_decoded(v)) if v.tz is not None: v = v.tz_convert("UTC") self.rhs.update(v) @@ -475,7 +475,7 @@ def stringify(value): v = lhs.value if isinstance(v, (int, float)): v = stringify(v) - v = Timestamp(_ensure_decoded(v)) + v = Timestamp(ensure_decoded(v)) if v.tz is not None: v = v.tz_convert("UTC") self.lhs.update(v) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index f1b11a6869c2b..8dd7c1a22d0ae 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -14,7 +14,7 @@ import pandas as pd import pandas.core.common as com from pandas.core.computation import expr, ops, scope as _scope -from pandas.core.computation.common import _ensure_decoded +from pandas.core.computation.common import ensure_decoded from pandas.core.computation.expr import BaseExprVisitor from pandas.core.computation.ops import UndefinedVariableError, is_term from pandas.core.construction import extract_array @@ -189,12 +189,12 @@ def stringify(value): encoder = pprint_thing return encoder(value) - kind = _ensure_decoded(self.kind) - meta = _ensure_decoded(self.meta) + kind = ensure_decoded(self.kind) + meta = ensure_decoded(self.meta) if kind == "datetime64" or kind == 
"datetime": if isinstance(v, (int, float)): v = stringify(v) - v = _ensure_decoded(v) + v = ensure_decoded(v) v = Timestamp(v) if v.tz is not None: v = v.tz_convert("UTC") diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0bf032725547e..6ad46eb967275 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -635,8 +635,8 @@ def is_dtype_equal(source, target) -> bool: False """ try: - source = _get_dtype(source) - target = _get_dtype(target) + source = get_dtype(source) + target = get_dtype(target) return source == target except (TypeError, AttributeError): @@ -984,10 +984,10 @@ def is_datetime64_ns_dtype(arr_or_dtype) -> bool: if arr_or_dtype is None: return False try: - tipo = _get_dtype(arr_or_dtype) + tipo = get_dtype(arr_or_dtype) except TypeError: if is_datetime64tz_dtype(arr_or_dtype): - tipo = _get_dtype(arr_or_dtype.dtype) + tipo = get_dtype(arr_or_dtype.dtype) else: return False return tipo == DT64NS_DTYPE or getattr(tipo, "base", None) == DT64NS_DTYPE @@ -1372,7 +1372,7 @@ def is_bool_dtype(arr_or_dtype) -> bool: if arr_or_dtype is None: return False try: - dtype = _get_dtype(arr_or_dtype) + dtype = get_dtype(arr_or_dtype) except TypeError: return False @@ -1557,13 +1557,13 @@ def _is_dtype(arr_or_dtype, condition) -> bool: if arr_or_dtype is None: return False try: - dtype = _get_dtype(arr_or_dtype) + dtype = get_dtype(arr_or_dtype) except (TypeError, ValueError, UnicodeEncodeError): return False return condition(dtype) -def _get_dtype(arr_or_dtype) -> DtypeObj: +def get_dtype(arr_or_dtype) -> DtypeObj: """ Get the dtype instance associated with an array or dtype object. 
@@ -1694,7 +1694,7 @@ def infer_dtype_from_object(dtype): try: return infer_dtype_from_object(getattr(np, dtype)) except (AttributeError, TypeError): - # Handles cases like _get_dtype(int) i.e., + # Handles cases like get_dtype(int) i.e., # Python objects that are valid dtypes # (unlike user-defined types, in general) # diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index d1607b5ede6c3..329c4445b05bc 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -68,7 +68,7 @@ def is_number(obj) -> bool: return isinstance(obj, (Number, np.number)) -def _iterable_not_string(obj) -> bool: +def iterable_not_string(obj) -> bool: """ Check if the object is an iterable but not a string. @@ -83,11 +83,11 @@ def _iterable_not_string(obj) -> bool: Examples -------- - >>> _iterable_not_string([1, 2, 3]) + >>> iterable_not_string([1, 2, 3]) True - >>> _iterable_not_string("foo") + >>> iterable_not_string("foo") False - >>> _iterable_not_string(1) + >>> iterable_not_string(1) False """ return isinstance(obj, abc.Iterable) and not isinstance(obj, str) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index f59bb31af2828..163500525dbd8 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -338,7 +338,7 @@ def notna(obj): notnull = notna -def _isna_compat(arr, fill_value=np.nan) -> bool: +def isna_compat(arr, fill_value=np.nan) -> bool: """ Parameters ---------- @@ -496,7 +496,7 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool: return array_equivalent(left, right, dtype_equal=True) -def _infer_fill_value(val): +def infer_fill_value(val): """ infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like if we are a NaT, return the correct dtyped @@ -516,11 +516,11 @@ def _infer_fill_value(val): return np.nan -def _maybe_fill(arr, fill_value=np.nan): +def maybe_fill(arr, fill_value=np.nan): """ if we have a compatible fill_value and arr dtype, 
then fill """ - if _isna_compat(arr, fill_value): + if isna_compat(arr, fill_value): arr.fill(fill_value) return arr diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4dd5b7f30e7f0..c076b6e2e181b 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -37,7 +37,7 @@ is_timedelta64_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.missing import _maybe_fill, isna +from pandas.core.dtypes.missing import isna, maybe_fill import pandas.core.algorithms as algorithms from pandas.core.base import SelectionMixin @@ -524,13 +524,11 @@ def _cython_operation( codes, _, _ = self.group_info if kind == "aggregate": - result = _maybe_fill( - np.empty(out_shape, dtype=out_dtype), fill_value=np.nan - ) + result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": - result = _maybe_fill( + result = maybe_fill( np.empty_like(values, dtype=out_dtype), fill_value=np.nan ) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index dccc8369c5366..85c8396dfd1fe 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -177,9 +177,9 @@ def _simple_new(cls, values: TimedeltaArray, name: Label = None): @property def _formatter_func(self): - from pandas.io.formats.format import _get_format_timedelta64 + from pandas.io.formats.format import get_format_timedelta64 - return _get_format_timedelta64(self, box=True) + return get_format_timedelta64(self, box=True) # ------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index dd81823055390..cfb17b9498a36 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -4,7 +4,7 @@ from pandas._config.config import option_context -from pandas._libs.indexing import _NDFrameIndexerBase +from 
pandas._libs.indexing import NDFrameIndexerBase from pandas._libs.lib import item_from_zerodim from pandas.errors import AbstractMethodError, InvalidIndexError from pandas.util._decorators import doc @@ -22,7 +22,7 @@ ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCMultiIndex, ABCSeries -from pandas.core.dtypes.missing import _infer_fill_value, isna +from pandas.core.dtypes.missing import infer_fill_value, isna import pandas.core.common as com from pandas.core.construction import array as pd_array @@ -583,7 +583,7 @@ def iat(self) -> "_iAtIndexer": return _iAtIndexer("iat", self) -class _LocationIndexer(_NDFrameIndexerBase): +class _LocationIndexer(NDFrameIndexerBase): _valid_types: str axis = None @@ -1604,7 +1604,7 @@ def _setitem_with_indexer(self, indexer, value): return # add a new item with the dtype setup - self.obj[key] = _infer_fill_value(value) + self.obj[key] = infer_fill_value(value) new_indexer = convert_from_missing_indexer_tuple( indexer, self.obj.axes @@ -2017,7 +2017,7 @@ def _align_frame(self, indexer, df: ABCDataFrame): raise ValueError("Incompatible indexer with DataFrame") -class _ScalarAccessIndexer(_NDFrameIndexerBase): +class _ScalarAccessIndexer(NDFrameIndexerBase): """ Access scalars quickly. 
""" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ad388ef3f53b0..b2305736f9d46 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -56,7 +56,7 @@ ABCPandasArray, ABCSeries, ) -from pandas.core.dtypes.missing import _isna_compat, is_valid_nat_for_dtype, isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat import pandas.core.algorithms as algos from pandas.core.array_algos.transforms import shift @@ -487,7 +487,7 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] ): return blocks - return _extend_blocks([b.downcast(downcast) for b in blocks]) + return extend_blocks([b.downcast(downcast) for b in blocks]) def downcast(self, dtypes=None): """ try to downcast each item to the dict of dtypes if present """ @@ -2474,7 +2474,7 @@ def _maybe_downcast(self, blocks: List["Block"], downcast=None) -> List["Block"] return blocks # split and convert the blocks - return _extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) + return extend_blocks([b.convert(datetime=True, numeric=False) for b in blocks]) def _can_hold_element(self, element: Any) -> bool: return True @@ -2503,7 +2503,7 @@ def replace(self, to_replace, value, inplace=False, regex=False, convert=True): result = b._replace_single( to_rep, v, inplace=inplace, regex=regex, convert=convert ) - result_blocks = _extend_blocks(result, result_blocks) + result_blocks = extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks @@ -2514,7 +2514,7 @@ def replace(self, to_replace, value, inplace=False, regex=False, convert=True): result = b._replace_single( to_rep, value, inplace=inplace, regex=regex, convert=convert ) - result_blocks = _extend_blocks(result, result_blocks) + result_blocks = extend_blocks(result, result_blocks) blocks = result_blocks return result_blocks @@ -2769,7 +2769,7 @@ def make_block(values, placement, klass=None, ndim=None, 
dtype=None): # ----------------------------------------------------------------- -def _extend_blocks(result, blocks=None): +def extend_blocks(result, blocks=None): """ return a new extended blocks, given the result """ if blocks is None: blocks = [] @@ -2860,7 +2860,7 @@ def _putmask_smart(v: np.ndarray, mask: np.ndarray, n) -> np.ndarray: else: # make sure that we have a nullable type # if we have nulls - if not _isna_compat(v, nn[0]): + if not isna_compat(v, nn[0]): pass elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): # only compare integers/floats diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 88839d2211f81..b45f0890cafa4 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -10,7 +10,7 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - _get_dtype, + get_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, @@ -200,7 +200,7 @@ def dtype(self): if not self.needs_filling: return self.block.dtype else: - return _get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) + return get_dtype(maybe_promote(self.block.dtype, self.block.fill_value)[0]) @cache_readonly def is_na(self): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 2e3098d94afcb..753b949f7c802 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -54,8 +54,8 @@ DatetimeTZBlock, ExtensionBlock, ObjectValuesExtensionBlock, - _extend_blocks, _safe_reshape, + extend_blocks, get_block_type, make_block, ) @@ -406,7 +406,7 @@ def apply( if not ignore_failures: raise continue - result_blocks = _extend_blocks(applied, result_blocks) + result_blocks = extend_blocks(applied, result_blocks) if ignore_failures: return self._combine(result_blocks) @@ -1868,7 +1868,7 @@ def _consolidate(blocks): merged_blocks = _merge_blocks( list(group_blocks), dtype=dtype, 
can_consolidate=_can_consolidate ) - new_blocks = _extend_blocks(merged_blocks, new_blocks) + new_blocks = extend_blocks(merged_blocks, new_blocks) return new_blocks diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index e3f16a3ef4f90..6fdde22a1c514 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -13,7 +13,7 @@ from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask from pandas.core.dtypes.common import ( - _get_dtype, + get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, @@ -678,7 +678,7 @@ def _get_counts_nanvar( count : scalar or array d : scalar or array """ - dtype = _get_dtype(dtype) + dtype = get_dtype(dtype) count = _get_counts(value_counts, mask, axis, dtype=dtype) d = count - dtype.type(ddof) @@ -1234,7 +1234,7 @@ def _get_counts( ------- count : scalar or array """ - dtype = _get_dtype(dtype) + dtype = get_dtype(dtype) if axis is None: if mask is not None: n = mask.size - mask.sum() diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 8724f7674f0c8..33ce5ed49b9c2 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -14,7 +14,7 @@ import pandas.core.common as com from pandas.core.indexes.api import Index, MultiIndex from pandas.core.reshape.concat import concat -from pandas.core.reshape.util import _tile_compat +from pandas.core.reshape.util import tile_compat from pandas.core.shared_docs import _shared_docs from pandas.core.tools.numeric import to_numeric @@ -136,7 +136,7 @@ def melt( result = frame._constructor(mdata, columns=mcolumns) if not ignore_index: - result.index = _tile_compat(frame.index, K) + result.index = tile_compat(frame.index, K) return result diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index 6949270317f7c..a1bf3f8ee4119 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -48,10 +48,10 @@ def cartesian_product(X): # if any factor is empty, the cartesian product is empty b = 
np.zeros_like(cumprodX) - return [_tile_compat(np.repeat(x, b[i]), np.product(a[i])) for i, x in enumerate(X)] + return [tile_compat(np.repeat(x, b[i]), np.product(a[i])) for i, x in enumerate(X)] -def _tile_compat(arr, num: int): +def tile_compat(arr, num: int): """ Index compat for np.tile. diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 8fcc5f74ea897..09a53d5a10ae6 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -20,8 +20,8 @@ from pandas._libs.tslibs import Timestamp, conversion, parsing from pandas._libs.tslibs.parsing import ( # noqa DateParseError, - _format_is_iso, - _guess_datetime_format, + format_is_iso, + guess_datetime_format, ) from pandas._libs.tslibs.strptime import array_strptime from pandas._typing import ArrayLike, Label, Timezone @@ -73,7 +73,7 @@ def _guess_datetime_format_for_array(arr, **kwargs): # Try to guess the format based on the first non-NaN element non_nan_elements = notna(arr).nonzero()[0] if len(non_nan_elements): - return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs) + return guess_datetime_format(arr[non_nan_elements[0]], **kwargs) def should_cache( @@ -387,7 +387,7 @@ def _convert_listlike_datetimes( # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case - format_is_iso8601 = _format_is_iso(format) + format_is_iso8601 = format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 1616c5345a899..dfd3e317196e2 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1475,7 +1475,7 @@ def _format_strings(self) -> List[str]: fmt_values = format_array_from_datetime( values.asi8.ravel(), - format=_get_format_datetime64_from_values(values, self.date_format), + format=get_format_datetime64_from_values(values, self.date_format), 
na_rep=self.nat_rep, ).reshape(values.shape) return fmt_values.tolist() @@ -1638,7 +1638,7 @@ def _get_format_datetime64( return lambda x, tz=None: _format_datetime64(x, tz=tz, nat_rep=nat_rep) -def _get_format_datetime64_from_values( +def get_format_datetime64_from_values( values: Union[np.ndarray, DatetimeArray, DatetimeIndex], date_format: Optional[str] ) -> Optional[str]: """ given values and a date_format, return a string format """ @@ -1679,13 +1679,13 @@ def __init__( self.box = box def _format_strings(self) -> List[str]: - formatter = self.formatter or _get_format_timedelta64( + formatter = self.formatter or get_format_timedelta64( self.values, nat_rep=self.nat_rep, box=self.box ) return [formatter(x) for x in self.values] -def _get_format_timedelta64( +def get_format_timedelta64( values: Union[np.ndarray, TimedeltaIndex, TimedeltaArray], nat_rep: str = "NaT", box: bool = False, diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index a6c526fcb008a..2db9a9a403e1c 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -649,8 +649,8 @@ def test_is_complex_dtype(): (IntervalDtype(), IntervalDtype()), ], ) -def test__get_dtype(input_param, result): - assert com._get_dtype(input_param) == result +def test_get_dtype(input_param, result): + assert com.get_dtype(input_param) == result @pytest.mark.parametrize( @@ -664,12 +664,12 @@ def test__get_dtype(input_param, result): (pd.DataFrame([1, 2]), "data type not understood"), ], ) -def test__get_dtype_fails(input_param, expected_error_message): +def test_get_dtype_fails(input_param, expected_error_message): # python objects # 2020-02-02 npdev changed error message expected_error_message += f"|Cannot interpret '{input_param}' as a data type" with pytest.raises(TypeError, match=expected_error_message): - com._get_dtype(input_param) + com.get_dtype(input_param) @pytest.mark.parametrize( diff --git a/pandas/tests/tslibs/test_parsing.py 
b/pandas/tests/tslibs/test_parsing.py index dc7421ea63464..70fa724464226 100644 --- a/pandas/tests/tslibs/test_parsing.py +++ b/pandas/tests/tslibs/test_parsing.py @@ -148,14 +148,14 @@ def test_parsers_month_freq(date_str, expected): ], ) def test_guess_datetime_format_with_parseable_formats(string, fmt): - result = parsing._guess_datetime_format(string) + result = parsing.guess_datetime_format(string) assert result == fmt @pytest.mark.parametrize("dayfirst,expected", [(True, "%d/%m/%Y"), (False, "%m/%d/%Y")]) def test_guess_datetime_format_with_dayfirst(dayfirst, expected): ambiguous_string = "01/01/2011" - result = parsing._guess_datetime_format(ambiguous_string, dayfirst=dayfirst) + result = parsing.guess_datetime_format(ambiguous_string, dayfirst=dayfirst) assert result == expected @@ -169,7 +169,7 @@ def test_guess_datetime_format_with_dayfirst(dayfirst, expected): ], ) def test_guess_datetime_format_with_locale_specific_formats(string, fmt): - result = parsing._guess_datetime_format(string) + result = parsing.guess_datetime_format(string) assert result == fmt @@ -189,7 +189,7 @@ def test_guess_datetime_format_with_locale_specific_formats(string, fmt): def test_guess_datetime_format_invalid_inputs(invalid_dt): # A datetime string must include a year, month and a day for it to be # guessable, in addition to being a string that looks like a datetime. 
- assert parsing._guess_datetime_format(invalid_dt) is None + assert parsing.guess_datetime_format(invalid_dt) is None @pytest.mark.parametrize( @@ -205,7 +205,7 @@ def test_guess_datetime_format_invalid_inputs(invalid_dt): ) def test_guess_datetime_format_no_padding(string, fmt): # see gh-11142 - result = parsing._guess_datetime_format(string) + result = parsing.guess_datetime_format(string) assert result == fmt From 1f6fd496cb72b3a1503419ba1e1f978fd428831f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 4 Sep 2020 13:52:13 -0700 Subject: [PATCH 0653/1025] De-privatize functions in io.excel (#36104) --- pandas/io/excel/_base.py | 16 ++++++++-------- pandas/io/excel/_odswriter.py | 4 ++-- pandas/io/excel/_openpyxl.py | 4 ++-- pandas/io/excel/_util.py | 18 +++++------------- pandas/io/excel/_xlsxwriter.py | 4 ++-- pandas/io/excel/_xlwt.py | 4 ++-- 6 files changed, 21 insertions(+), 29 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9bc1d7fedcb31..74eb65521f5b2 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -24,11 +24,11 @@ validate_header_arg, ) from pandas.io.excel._util import ( - _fill_mi_header, - _get_default_writer, - _maybe_convert_usecols, - _pop_header_name, + fill_mi_header, + get_default_writer, get_writer, + maybe_convert_usecols, + pop_header_name, ) from pandas.io.parsers import TextParser @@ -454,7 +454,7 @@ def parse( sheet = self.get_sheet_by_index(asheetname) data = self.get_sheet_data(sheet, convert_float) - usecols = _maybe_convert_usecols(usecols) + usecols = maybe_convert_usecols(usecols) if not data: output[asheetname] = DataFrame() @@ -473,10 +473,10 @@ def parse( if is_integer(skiprows): row += skiprows - data[row], control_row = _fill_mi_header(data[row], control_row) + data[row], control_row = fill_mi_header(data[row], control_row) if index_col is not None: - header_name, _ = _pop_header_name(data[row], index_col) + header_name, _ = pop_header_name(data[row], index_col) 
header_names.append(header_name) if is_list_like(index_col): @@ -645,7 +645,7 @@ def __new__(cls, path, engine=None, **kwargs): try: engine = config.get_option(f"io.excel.{ext}.writer") if engine == "auto": - engine = _get_default_writer(ext) + engine = get_default_writer(ext) except KeyError as err: raise ValueError(f"No engine for filetype: '{ext}'") from err cls = get_writer(engine) diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index f39391ae1fe7f..e7684012c1d4c 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -5,7 +5,7 @@ import pandas._libs.json as json from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import _validate_freeze_panes +from pandas.io.excel._util import validate_freeze_panes from pandas.io.formats.excel import ExcelCell @@ -59,7 +59,7 @@ def write_cells( wks = Table(name=sheet_name) self.sheets[sheet_name] = wks - if _validate_freeze_panes(freeze_panes): + if validate_freeze_panes(freeze_panes): assert freeze_panes is not None self._create_freeze_panes(sheet_name, freeze_panes) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 3c67902d41baa..89b581da6ed31 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -6,7 +6,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ExcelWriter, _BaseExcelReader -from pandas.io.excel._util import _validate_freeze_panes +from pandas.io.excel._util import validate_freeze_panes if TYPE_CHECKING: from openpyxl.descriptors.serialisable import Serialisable @@ -385,7 +385,7 @@ def write_cells( wks.title = sheet_name self.sheets[sheet_name] = wks - if _validate_freeze_panes(freeze_panes): + if validate_freeze_panes(freeze_panes): wks.freeze_panes = wks.cell( row=freeze_panes[0] + 1, column=freeze_panes[1] + 1 ) diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index a4b5b61734ab7..47105916a9c78 100644 --- 
a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -23,7 +23,7 @@ def register_writer(klass): _writers[engine_name] = klass -def _get_default_writer(ext): +def get_default_writer(ext): """ Return the default writer for the given extension. @@ -123,7 +123,7 @@ def _range2cols(areas: str) -> List[int]: return cols -def _maybe_convert_usecols(usecols): +def maybe_convert_usecols(usecols): """ Convert `usecols` into a compatible format for parsing in `parsers.py`. @@ -152,7 +152,7 @@ def _maybe_convert_usecols(usecols): return usecols -def _validate_freeze_panes(freeze_panes): +def validate_freeze_panes(freeze_panes): if freeze_panes is not None: if len(freeze_panes) == 2 and all( isinstance(item, int) for item in freeze_panes @@ -169,15 +169,7 @@ def _validate_freeze_panes(freeze_panes): return False -def _trim_excel_header(row): - # trim header row so auto-index inference works - # xlrd uses '' , openpyxl None - while len(row) > 0 and (row[0] == "" or row[0] is None): - row = row[1:] - return row - - -def _fill_mi_header(row, control_row): +def fill_mi_header(row, control_row): """ Forward fill blank entries in row but only inside the same parent index. @@ -210,7 +202,7 @@ def _fill_mi_header(row, control_row): return row, control_row -def _pop_header_name(row, index_col): +def pop_header_name(row, index_col): """ Pop the header name for MultiIndex parsing. 
diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index bdbb006ae93dc..53f0c94d12e4c 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -3,7 +3,7 @@ import pandas._libs.json as json from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import _validate_freeze_panes +from pandas.io.excel._util import validate_freeze_panes class _XlsxStyler: @@ -208,7 +208,7 @@ def write_cells( style_dict = {"null": None} - if _validate_freeze_panes(freeze_panes): + if validate_freeze_panes(freeze_panes): wks.freeze_panes(*(freeze_panes)) for cell in cells: diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index e1f72eb533c51..faebe526d17bd 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -3,7 +3,7 @@ import pandas._libs.json as json from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import _validate_freeze_panes +from pandas.io.excel._util import validate_freeze_panes if TYPE_CHECKING: from xlwt import XFStyle @@ -48,7 +48,7 @@ def write_cells( wks = self.book.add_sheet(sheet_name) self.sheets[sheet_name] = wks - if _validate_freeze_panes(freeze_panes): + if validate_freeze_panes(freeze_panes): wks.set_panes_frozen(True) wks.set_horz_split_pos(freeze_panes[0]) wks.set_vert_split_pos(freeze_panes[1]) From a2c64c8866fa8f2e27578c2588c462d971fbe7eb Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 4 Sep 2020 21:59:51 +0100 Subject: [PATCH 0654/1025] TYP: activate Check for missing error codes (#36088) --- ci/code_checks.sh | 7 +++---- pandas/core/arrays/datetimelike.py | 5 ++--- pandas/core/groupby/categorical.py | 6 ++++-- pandas/io/common.py | 6 +++++- pandas/plotting/_matplotlib/core.py | 2 +- 5 files changed, 15 insertions(+), 11 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 852f66763683b..2e0f27fefca0b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -230,10 +230,9 @@ if [[ -z "$CHECK" || "$CHECK" == 
"patterns" ]]; then invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" - # https://github.com/python/mypy/issues/7384 - # MSG='Check for missing error codes with # type: ignore' ; echo $MSG - # invgrep -R --include="*.py" -P '# type: ignore(?!\[)' pandas - # RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for missing error codes with # type: ignore' ; echo $MSG + invgrep -R --include="*.py" -P '# type:\s?ignore(?!\[)' pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG invgrep -R --include=*.{py,pyx} '\.__class__' pandas diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1b5e1d81f00d6..5a44f87400b79 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -468,10 +468,9 @@ def _ndarray(self) -> np.ndarray: def _from_backing_data(self: _T, arr: np.ndarray) -> _T: # Note: we do not retain `freq` + # error: Too many arguments for "NDArrayBackedExtensionArray" # error: Unexpected keyword argument "dtype" for "NDArrayBackedExtensionArray" - # TODO: add my error code - # https://github.com/python/mypy/issues/7384 - return type(self)(arr, dtype=self.dtype) # type: ignore + return type(self)(arr, dtype=self.dtype) # type: ignore[call-arg] # ------------------------------------------------------------------ diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 4d5acf527a867..3f04339803bf6 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -98,8 +98,10 @@ def recode_from_groupby( """ # we re-order to the original category orderings if sort: - return ci.set_categories(c.categories) # type: ignore [attr-defined] + # error: "CategoricalIndex" has no attribute "set_categories" + return ci.set_categories(c.categories) # type: ignore[attr-defined] # we are not sorting, so add unobserved to the end new_cats = 
c.categories[~c.categories.isin(ci.categories)] - return ci.add_categories(new_cats) # type: ignore [attr-defined] + # error: "CategoricalIndex" has no attribute "add_categories" + return ci.add_categories(new_cats) # type: ignore[attr-defined] diff --git a/pandas/io/common.py b/pandas/io/common.py index 9328f90ce67a3..2b13d54ec3aed 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -374,7 +374,11 @@ def get_compression_method( if isinstance(compression, Mapping): compression_args = dict(compression) try: - compression_method = compression_args.pop("method") # type: ignore + # error: Incompatible types in assignment (expression has type + # "Union[str, int, None]", variable has type "Optional[str]") + compression_method = compression_args.pop( # type: ignore[assignment] + "method" + ) except KeyError as err: raise ValueError("If mapping, compression must have key 'method'") from err else: diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 2d64e1b051444..147e4efd74bc3 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -656,7 +656,7 @@ def _plot(cls, ax: "Axes", x, y, style=None, is_errorbar: bool = False, **kwds): if style is not None: args = (x, y, style) else: - args = (x, y) # type:ignore[assignment] + args = (x, y) # type: ignore[assignment] return ax.plot(*args, **kwds) def _get_index_name(self) -> Optional[str]: From 6afb1d8709d0a635a7c3cfc14366c2b8f9249624 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 4 Sep 2020 14:57:24 -0700 Subject: [PATCH 0655/1025] CLN: remove xfails/skips for no-longer-supported numpys (#36128) --- pandas/plotting/_matplotlib/style.py | 2 +- pandas/tests/arrays/sparse/test_array.py | 5 +- pandas/tests/frame/test_analytics.py | 58 ++++-------------------- pandas/tests/io/formats/test_to_csv.py | 4 -- pandas/tests/series/test_analytics.py | 5 -- 5 files changed, 10 insertions(+), 64 deletions(-) diff --git 
a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py index 904a760a03e58..3e0954ef3d74d 100644 --- a/pandas/plotting/_matplotlib/style.py +++ b/pandas/plotting/_matplotlib/style.py @@ -11,7 +11,7 @@ def get_standard_colors( - num_colors=None, colormap=None, color_type: str = "default", color=None + num_colors: int, colormap=None, color_type: str = "default", color=None ): import matplotlib.pyplot as plt diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 04215bfe1bedb..ece9367cea7fe 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -194,8 +194,7 @@ def test_constructor_inferred_fill_value(self, data, fill_value): @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) @pytest.mark.parametrize( - "size", - [pytest.param(0, marks=td.skip_if_np_lt("1.16", reason="NumPy-11383")), 10], + "size", [0, 10], ) @td.skip_if_no_scipy def test_from_spmatrix(self, size, format): @@ -904,7 +903,6 @@ def test_all(self, data, pos, neg): ([1.0, 2.0, 1.0], 1.0, 0.0), ], ) - @td.skip_if_np_lt("1.15") # prior didn't dispatch def test_numpy_all(self, data, pos, neg): # GH 17570 out = np.all(SparseArray(data)) @@ -956,7 +954,6 @@ def test_any(self, data, pos, neg): ([0.0, 2.0, 0.0], 2.0, 0.0), ], ) - @td.skip_if_np_lt("1.15") # prior didn't dispatch def test_numpy_any(self, data, pos, neg): # GH 17570 out = np.any(SparseArray(data)) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b0ba0d991c9b0..f21b1d3dfe487 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1060,54 +1060,14 @@ def test_any_all_bool_only(self): (np.any, {"A": pd.Series([0.0, 1.0], dtype="float")}, True), (np.all, {"A": pd.Series([0, 1], dtype=int)}, False), (np.any, {"A": pd.Series([0, 1], dtype=int)}, True), - pytest.param( - np.all, - {"A": pd.Series([0, 1], dtype="M8[ns]")}, - False, - 
marks=[td.skip_if_np_lt("1.15")], - ), - pytest.param( - np.any, - {"A": pd.Series([0, 1], dtype="M8[ns]")}, - True, - marks=[td.skip_if_np_lt("1.15")], - ), - pytest.param( - np.all, - {"A": pd.Series([1, 2], dtype="M8[ns]")}, - True, - marks=[td.skip_if_np_lt("1.15")], - ), - pytest.param( - np.any, - {"A": pd.Series([1, 2], dtype="M8[ns]")}, - True, - marks=[td.skip_if_np_lt("1.15")], - ), - pytest.param( - np.all, - {"A": pd.Series([0, 1], dtype="m8[ns]")}, - False, - marks=[td.skip_if_np_lt("1.15")], - ), - pytest.param( - np.any, - {"A": pd.Series([0, 1], dtype="m8[ns]")}, - True, - marks=[td.skip_if_np_lt("1.15")], - ), - pytest.param( - np.all, - {"A": pd.Series([1, 2], dtype="m8[ns]")}, - True, - marks=[td.skip_if_np_lt("1.15")], - ), - pytest.param( - np.any, - {"A": pd.Series([1, 2], dtype="m8[ns]")}, - True, - marks=[td.skip_if_np_lt("1.15")], - ), + pytest.param(np.all, {"A": pd.Series([0, 1], dtype="M8[ns]")}, False,), + pytest.param(np.any, {"A": pd.Series([0, 1], dtype="M8[ns]")}, True,), + pytest.param(np.all, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True,), + pytest.param(np.any, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True,), + pytest.param(np.all, {"A": pd.Series([0, 1], dtype="m8[ns]")}, False,), + pytest.param(np.any, {"A": pd.Series([0, 1], dtype="m8[ns]")}, True,), + pytest.param(np.all, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True,), + pytest.param(np.any, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True,), (np.all, {"A": pd.Series([0, 1], dtype="category")}, False), (np.any, {"A": pd.Series([0, 1], dtype="category")}, True), (np.all, {"A": pd.Series([1, 2], dtype="category")}, True), @@ -1120,8 +1080,6 @@ def test_any_all_bool_only(self): "B": pd.Series([10, 20], dtype="m8[ns]"), }, True, - # In 1.13.3 and 1.14 np.all(df) returns a Timedelta here - marks=[td.skip_if_np_lt("1.15")], ), ], ) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 753b8b6eda9c5..c40935b2cc5dd 100644 --- 
a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -11,10 +11,6 @@ class TestToCSV: - @pytest.mark.xfail( - (3, 6, 5) > sys.version_info, - reason=("Python csv library bug (see https://bugs.python.org/issue32255)"), - ) def test_to_csv_with_single_column(self): # see gh-18676, https://bugs.python.org/issue32255 # diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index ab8618eb0a7d4..e39083b709f38 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import DataFrame, Series import pandas._testing as tm @@ -130,7 +128,6 @@ def test_is_monotonic(self): @pytest.mark.parametrize("func", [np.any, np.all]) @pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())]) - @td.skip_if_np_lt("1.15") def test_validate_any_all_out_keepdims_raises(self, kwargs, func): s = pd.Series([1, 2]) param = list(kwargs)[0] @@ -144,7 +141,6 @@ def test_validate_any_all_out_keepdims_raises(self, kwargs, func): with pytest.raises(ValueError, match=msg): func(s, **kwargs) - @td.skip_if_np_lt("1.15") def test_validate_sum_initial(self): s = pd.Series([1, 2]) msg = ( @@ -167,7 +163,6 @@ def test_validate_median_initial(self): # method instead of the ufunc. 
s.median(overwrite_input=True) - @td.skip_if_np_lt("1.15") def test_validate_stat_keepdims(self): s = pd.Series([1, 2]) msg = ( From 85b61801786e960b3d9460e44359283ab37eb693 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 4 Sep 2020 19:51:04 -0700 Subject: [PATCH 0656/1025] De-privatize (#36130) --- pandas/core/dtypes/dtypes.py | 4 +-- pandas/core/indexes/datetimes.py | 4 +-- pandas/core/indexing.py | 4 +-- pandas/core/util/hashing.py | 8 ++--- pandas/io/formats/format.py | 4 +-- pandas/io/formats/style.py | 20 ++++++------- pandas/plotting/_matplotlib/core.py | 29 +++++++++---------- pandas/plotting/_matplotlib/timeseries.py | 10 +++---- .../tests/indexing/multiindex/test_slice.py | 4 +-- pandas/tests/indexing/test_indexing.py | 12 ++++---- 10 files changed, 48 insertions(+), 51 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8dc500dddeafa..e321fdd9b3a9b 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -395,7 +395,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: from pandas.core.dtypes.common import DT64NS_DTYPE, is_datetime64tz_dtype from pandas.core.util.hashing import ( - _combine_hash_arrays, + combine_hash_arrays, hash_array, hash_tuples, ) @@ -427,7 +427,7 @@ def _hash_categories(categories, ordered: Ordered = True) -> int: ) else: cat_array = [cat_array] - hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) + hashed = combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) return np.bitwise_xor.reduce(hashed) @classmethod diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 6dcb9250812d0..3fd93a8159041 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -354,9 +354,9 @@ def _mpl_repr(self): @property def _formatter_func(self): - from pandas.io.formats.format import _get_format_datetime64 + from pandas.io.formats.format import get_format_datetime64 - formatter 
= _get_format_datetime64(is_dates_only=self._is_dates_only) + formatter = get_format_datetime64(is_dates_only=self._is_dates_only) return lambda x: f"'{formatter(x, tz=self.tz)}'" # -------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index cfb17b9498a36..fe2fec1c52063 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2291,7 +2291,7 @@ def need_slice(obj) -> bool: ) -def _non_reducing_slice(slice_): +def non_reducing_slice(slice_): """ Ensure that a slice doesn't reduce to a Series or Scalar. @@ -2330,7 +2330,7 @@ def pred(part) -> bool: return tuple(slice_) -def _maybe_numeric_slice(df, slice_, include_bool=False): +def maybe_numeric_slice(df, slice_, include_bool: bool = False): """ Want nice defaults for background_gradient that don't break with non-numeric data. But if slice_ is passed go with that. diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index d79b9f4092325..df082c7285ae8 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -24,7 +24,7 @@ _default_hash_key = "0123456789123456" -def _combine_hash_arrays(arrays, num_items: int): +def combine_hash_arrays(arrays, num_items: int): """ Parameters ---------- @@ -108,7 +108,7 @@ def hash_pandas_object( for _ in [None] ) arrays = itertools.chain([h], index_iter) - h = _combine_hash_arrays(arrays, 2) + h = combine_hash_arrays(arrays, 2) h = Series(h, index=obj.index, dtype="uint64", copy=False) @@ -131,7 +131,7 @@ def hash_pandas_object( # keep `hashes` specifically a generator to keep mypy happy _hashes = itertools.chain(hashes, index_hash_generator) hashes = (x for x in _hashes) - h = _combine_hash_arrays(hashes, num_items) + h = combine_hash_arrays(hashes, num_items) h = Series(h, index=obj.index, dtype="uint64", copy=False) else: @@ -175,7 +175,7 @@ def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key): hashes = ( _hash_categorical(cat, 
encoding=encoding, hash_key=hash_key) for cat in vals ) - h = _combine_hash_arrays(hashes, len(vals)) + h = combine_hash_arrays(hashes, len(vals)) if is_tuple: h = h[0] diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index dfd3e317196e2..75228a865c6cc 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1626,7 +1626,7 @@ def _format_datetime64_dateonly( return x._date_repr -def _get_format_datetime64( +def get_format_datetime64( is_dates_only: bool, nat_rep: str = "NaT", date_format: None = None ) -> Callable: @@ -1658,7 +1658,7 @@ def _format_strings(self) -> List[str]: """ we by definition have a TZ """ values = self.values.astype(object) is_dates_only = _is_dates_only(values) - formatter = self.formatter or _get_format_datetime64( + formatter = self.formatter or get_format_datetime64( is_dates_only, date_format=self.date_format ) fmt_values = [formatter(x) for x in values] diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 3bbb5271bce61..023557dd6494d 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -36,7 +36,7 @@ import pandas.core.common as com from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame -from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice +from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") @@ -475,7 +475,7 @@ def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Style row_locs = range(len(self.data)) col_locs = range(len(self.data.columns)) else: - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) if len(subset) == 1: subset = subset, self.data.columns @@ -633,7 +633,7 @@ def _apply( **kwargs, ) -> "Styler": subset = slice(None) if subset is None else subset - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) data = 
self.data.loc[subset] if axis is not None: result = data.apply(func, axis=axis, result_type="expand", **kwargs) @@ -725,7 +725,7 @@ def _applymap(self, func: Callable, subset=None, **kwargs) -> "Styler": func = partial(func, **kwargs) # applymap doesn't take kwargs? if subset is None: subset = pd.IndexSlice[:] - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) result = self.data.loc[subset].applymap(func) self._update_ctx(result) return self @@ -985,7 +985,7 @@ def hide_columns(self, subset) -> "Styler": ------- self : Styler """ - subset = _non_reducing_slice(subset) + subset = non_reducing_slice(subset) hidden_df = self.data.loc[subset] self.hidden_columns = self.columns.get_indexer_for(hidden_df.columns) return self @@ -1087,8 +1087,8 @@ def background_gradient( of the data is extended by ``low * (x.max() - x.min())`` and ``high * (x.max() - x.min())`` before normalizing. """ - subset = _maybe_numeric_slice(self.data, subset) - subset = _non_reducing_slice(subset) + subset = maybe_numeric_slice(self.data, subset) + subset = non_reducing_slice(subset) self.apply( self._background_gradient, cmap=cmap, @@ -1322,8 +1322,8 @@ def bar( "(eg: color=['#d65f5f', '#5fba7d'])" ) - subset = _maybe_numeric_slice(self.data, subset) - subset = _non_reducing_slice(subset) + subset = maybe_numeric_slice(self.data, subset) + subset = non_reducing_slice(subset) self.apply( self._bar, subset=subset, @@ -1390,7 +1390,7 @@ def _highlight_handler( axis: Optional[Axis] = None, max_: bool = True, ) -> "Styler": - subset = _non_reducing_slice(_maybe_numeric_slice(self.data, subset)) + subset = non_reducing_slice(maybe_numeric_slice(self.data, subset)) self.apply( self._highlight_extrema, color=color, axis=axis, subset=subset, max_=max_ ) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 147e4efd74bc3..c1ba7881165f1 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -33,6 +33,13 
@@ from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters from pandas.plotting._matplotlib.style import get_standard_colors +from pandas.plotting._matplotlib.timeseries import ( + decorate_axes, + format_dateaxis, + maybe_convert_index, + maybe_resample, + use_dynamic_x, +) from pandas.plotting._matplotlib.tools import ( create_subplots, flatten_axes, @@ -1074,15 +1081,11 @@ def _is_ts_plot(self) -> bool: return not self.x_compat and self.use_index and self._use_dynamic_x() def _use_dynamic_x(self): - from pandas.plotting._matplotlib.timeseries import _use_dynamic_x - - return _use_dynamic_x(self._get_ax(0), self.data) + return use_dynamic_x(self._get_ax(0), self.data) def _make_plot(self): if self._is_ts_plot(): - from pandas.plotting._matplotlib.timeseries import _maybe_convert_index - - data = _maybe_convert_index(self._get_ax(0), self.data) + data = maybe_convert_index(self._get_ax(0), self.data) x = data.index # dummy, not used plotf = self._ts_plot @@ -1142,24 +1145,18 @@ def _plot( @classmethod def _ts_plot(cls, ax: "Axes", x, data, style=None, **kwds): - from pandas.plotting._matplotlib.timeseries import ( - _decorate_axes, - _maybe_resample, - format_dateaxis, - ) - # accept x to be consistent with normal plot func, # x is not passed to tsplot as it uses data.index as x coordinate # column_num must be in kwds for stacking purpose - freq, data = _maybe_resample(data, ax, kwds) + freq, data = maybe_resample(data, ax, kwds) # Set ax with freq info - _decorate_axes(ax, freq, kwds) + decorate_axes(ax, freq, kwds) # digging deeper if hasattr(ax, "left_ax"): - _decorate_axes(ax.left_ax, freq, kwds) + decorate_axes(ax.left_ax, freq, kwds) if hasattr(ax, "right_ax"): - _decorate_axes(ax.right_ax, freq, kwds) + decorate_axes(ax.right_ax, freq, kwds) ax._plot_data.append((data, cls._kind, kwds)) lines = cls._plot(ax, data.index, data.values, style=style, **kwds) diff --git 
a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index fd89a093d25a4..f8faac6a6a026 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -32,7 +32,7 @@ # Plotting functions and monkey patches -def _maybe_resample(series: "Series", ax: "Axes", kwargs): +def maybe_resample(series: "Series", ax: "Axes", kwargs): # resample against axes freq if necessary freq, ax_freq = _get_freq(ax, series) @@ -105,7 +105,7 @@ def _replot_ax(ax: "Axes", freq, kwargs): ax._plot_data = [] ax.clear() - _decorate_axes(ax, freq, kwargs) + decorate_axes(ax, freq, kwargs) lines = [] labels = [] @@ -128,7 +128,7 @@ def _replot_ax(ax: "Axes", freq, kwargs): return lines, labels -def _decorate_axes(ax: "Axes", freq, kwargs): +def decorate_axes(ax: "Axes", freq, kwargs): """Initialize axes for time-series plotting""" if not hasattr(ax, "_plot_data"): ax._plot_data = [] @@ -193,7 +193,7 @@ def _get_freq(ax: "Axes", series: "Series"): return freq, ax_freq -def _use_dynamic_x(ax: "Axes", data: FrameOrSeriesUnion) -> bool: +def use_dynamic_x(ax: "Axes", data: FrameOrSeriesUnion) -> bool: freq = _get_index_freq(data.index) ax_freq = _get_ax_freq(ax) @@ -235,7 +235,7 @@ def _get_index_freq(index: "Index") -> Optional[BaseOffset]: return freq -def _maybe_convert_index(ax: "Axes", data): +def maybe_convert_index(ax: "Axes", data): # tsplot converts automatically, but don't want to convert index # over and over for DataFrames if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)): diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py index 532bb4f2e6dac..ec0391a2ccc26 100644 --- a/pandas/tests/indexing/multiindex/test_slice.py +++ b/pandas/tests/indexing/multiindex/test_slice.py @@ -6,7 +6,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp import pandas._testing as tm -from pandas.core.indexing import 
_non_reducing_slice +from pandas.core.indexing import non_reducing_slice from pandas.tests.indexing.common import _mklbl @@ -739,7 +739,7 @@ def test_non_reducing_slice_on_multiindex(self): df = pd.DataFrame(dic, index=[0, 1]) idx = pd.IndexSlice slice_ = idx[:, idx["b", "d"]] - tslice_ = _non_reducing_slice(slice_) + tslice_ = non_reducing_slice(slice_) result = df.loc[tslice_] expected = pd.DataFrame({("b", "d"): [4, 1]}) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 5b7f013d5de31..a080c5d169215 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -12,7 +12,7 @@ import pandas as pd from pandas import DataFrame, Index, NaT, Series import pandas._testing as tm -from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice +from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice from pandas.tests.indexing.common import _mklbl # ------------------------------------------------------------------------ @@ -822,7 +822,7 @@ def test_range_in_series_indexing(self, size): def test_non_reducing_slice(self, slc): df = DataFrame([[0, 1], [2, 3]]) - tslice_ = _non_reducing_slice(slc) + tslice_ = non_reducing_slice(slc) assert isinstance(df.loc[tslice_], DataFrame) def test_list_slice(self): @@ -831,18 +831,18 @@ def test_list_slice(self): df = DataFrame({"A": [1, 2], "B": [3, 4]}, index=["A", "B"]) expected = pd.IndexSlice[:, ["A"]] for subset in slices: - result = _non_reducing_slice(subset) + result = non_reducing_slice(subset) tm.assert_frame_equal(df.loc[result], df.loc[expected]) def test_maybe_numeric_slice(self): df = DataFrame({"A": [1, 2], "B": ["c", "d"], "C": [True, False]}) - result = _maybe_numeric_slice(df, slice_=None) + result = maybe_numeric_slice(df, slice_=None) expected = pd.IndexSlice[:, ["A"]] assert result == expected - result = _maybe_numeric_slice(df, None, include_bool=True) + result = maybe_numeric_slice(df, None, 
include_bool=True) expected = pd.IndexSlice[:, ["A", "C"]] - result = _maybe_numeric_slice(df, [1]) + result = maybe_numeric_slice(df, [1]) expected = [1] assert result == expected From fdb79116fade7ea1aaec702d4a95dace727e6e4f Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 5 Sep 2020 03:56:30 +0100 Subject: [PATCH 0657/1025] TYP: misc fixes for numpy types (#36098) --- pandas/_typing.py | 2 +- pandas/core/algorithms.py | 7 +++---- pandas/core/arrays/categorical.py | 2 +- pandas/core/construction.py | 6 ++++-- pandas/core/dtypes/cast.py | 4 ++-- 5 files changed, 11 insertions(+), 10 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index f8af92e07c674..74bfc9134c3af 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -62,7 +62,7 @@ # other Dtype = Union[ - "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool]] + "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool, object]] ] DtypeObj = Union[np.dtype, "ExtensionDtype"] FilePathOrBuffer = Union[str, Path, IO[AnyStr], IOBase] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9d75d21c5637a..f297c7165208f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -6,7 +6,7 @@ import operator from textwrap import dedent -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -60,7 +60,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import DataFrame, Series + from pandas import Categorical, DataFrame, Series _shared_docs: Dict[str, str] = {} @@ -429,8 +429,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: if is_categorical_dtype(comps): # TODO(extension) # handle categoricals - # error: "ExtensionArray" has no attribute "isin" [attr-defined] - return comps.isin(values) # type: ignore[attr-defined] + 
return cast("Categorical", comps).isin(values) comps, dtype = _ensure_data(comps) values, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 27b1afdb438cb..ec85ec47d625c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2316,7 +2316,7 @@ def _concat_same_type(self, to_concat): return union_categoricals(to_concat) - def isin(self, values): + def isin(self, values) -> np.ndarray: """ Check whether `values` are contained in Categorical. diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 02b8ed17244cd..9d6c2789af25b 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -335,7 +335,7 @@ def array( return result -def extract_array(obj, extract_numpy: bool = False): +def extract_array(obj: AnyArrayLike, extract_numpy: bool = False) -> ArrayLike: """ Extract the ndarray or ExtensionArray from a Series or Index. @@ -383,7 +383,9 @@ def extract_array(obj, extract_numpy: bool = False): if extract_numpy and isinstance(obj, ABCPandasArray): obj = obj.to_numpy() - return obj + # error: Incompatible return value type (got "Index", expected "ExtensionArray") + # error: Incompatible return value type (got "Series", expected "ExtensionArray") + return obj # type: ignore[return-value] def sanitize_array( diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1489e08d82bf0..7c5aafcbbc7e9 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1488,7 +1488,7 @@ def find_common_type(types: List[DtypeObj]) -> DtypeObj: if has_bools: for t in types: if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t): - return object + return np.dtype("object") return np.find_common_type(types, []) @@ -1550,7 +1550,7 @@ def construct_1d_arraylike_from_scalar( elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): # we need to coerce to object dtype to avoid # to allow numpy to 
take our string as a scalar value - dtype = object + dtype = np.dtype("object") if not isna(value): value = ensure_str(value) From fe1504214bad30a4dff38a3ddd69c65452e936da Mon Sep 17 00:00:00 2001 From: Jonathan Shreckengost Date: Fri, 4 Sep 2020 23:10:49 -0400 Subject: [PATCH 0658/1025] Comma cleanup (#36082) --- .../tests/indexes/datetimes/test_datetime.py | 2 +- .../tests/indexes/datetimes/test_timezones.py | 2 +- .../tests/indexes/multi/test_constructors.py | 6 +++--- pandas/tests/indexes/multi/test_isin.py | 2 +- pandas/tests/indexes/test_base.py | 2 +- .../indexes/timedeltas/test_scalar_compat.py | 8 ++++---- .../indexes/timedeltas/test_searchsorted.py | 2 +- pandas/tests/indexing/common.py | 4 +--- pandas/tests/indexing/test_callable.py | 18 ++++++------------ pandas/tests/indexing/test_check_indexer.py | 8 +++----- pandas/tests/indexing/test_coercion.py | 2 +- pandas/tests/indexing/test_floats.py | 14 ++++---------- 12 files changed, 27 insertions(+), 43 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index 7bb1d98086a91..8e2ac4feb7ded 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -51,7 +51,7 @@ def test_reindex_with_same_tz(self): "2010-01-02 00:00:00", ] expected1 = DatetimeIndex( - expected_list1, dtype="datetime64[ns, UTC]", freq=None, + expected_list1, dtype="datetime64[ns, UTC]", freq=None ) expected2 = np.array([0] + [-1] * 21 + [23], dtype=np.dtype("intp")) tm.assert_index_equal(result1, expected1) diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py index ea68e8759c123..233835bb4b5f7 100644 --- a/pandas/tests/indexes/datetimes/test_timezones.py +++ b/pandas/tests/indexes/datetimes/test_timezones.py @@ -799,7 +799,7 @@ def test_dti_from_tzaware_datetime(self, tz): @pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"]) def 
test_dti_tz_constructors(self, tzstr): - """ Test different DatetimeIndex constructions with timezone + """Test different DatetimeIndex constructions with timezone Follow-up of GH#4229 """ arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"] diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py index 1157c7f8bb962..16af884c89e9e 100644 --- a/pandas/tests/indexes/multi/test_constructors.py +++ b/pandas/tests/indexes/multi/test_constructors.py @@ -741,18 +741,18 @@ def test_raise_invalid_sortorder(): with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"): MultiIndex( - levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2, + levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2 ) with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"): MultiIndex( - levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1, + levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1 ) def test_datetimeindex(): idx1 = pd.DatetimeIndex( - ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo", + ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo" ) idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern") idx = MultiIndex.from_arrays([idx1, idx2]) diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py index 122263e6ec198..b369b9a50954e 100644 --- a/pandas/tests/indexes/multi/test_isin.py +++ b/pandas/tests/indexes/multi/test_isin.py @@ -78,7 +78,7 @@ def test_isin_level_kwarg(): @pytest.mark.parametrize( "labels,expected,level", [ - ([("b", np.nan)], np.array([False, False, True]), None,), + ([("b", np.nan)], np.array([False, False, True]), None), ([np.nan, "a"], np.array([True, True, False]), 0), (["d", np.nan], np.array([False, True, True]), 1), ], diff --git a/pandas/tests/indexes/test_base.py 
b/pandas/tests/indexes/test_base.py index aee4b16621b4d..7720db9d98ebf 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2426,7 +2426,7 @@ def test_index_with_tuple_bool(self): # TODO: remove tupleize_cols=False once correct behaviour is restored # TODO: also this op right now produces FutureWarning from numpy idx = Index([("a", "b"), ("b", "c"), ("c", "a")], tupleize_cols=False) - result = idx == ("c", "a",) + result = idx == ("c", "a") expected = np.array([False, False, True]) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py index 16c19b8d00380..6a2238d90b590 100644 --- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py +++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py @@ -104,18 +104,18 @@ def test_round(self): "L", t1a, TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] ), ), ( "S", t1a, TimedeltaIndex( - ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"], + ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"] ), ), - ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),), - ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),), + ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), + ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])), ("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")), ]: diff --git a/pandas/tests/indexes/timedeltas/test_searchsorted.py b/pandas/tests/indexes/timedeltas/test_searchsorted.py index 4806a9acff96f..3cf45931cf6b7 100644 --- a/pandas/tests/indexes/timedeltas/test_searchsorted.py +++ b/pandas/tests/indexes/timedeltas/test_searchsorted.py @@ -17,7 +17,7 @@ def test_searchsorted_different_argument_classes(self, klass): tm.assert_numpy_array_equal(result, expected) @pytest.mark.parametrize( - "arg", 
[[1, 2], ["a", "b"], [Timestamp("2020-01-01", tz="Europe/London")] * 2], + "arg", [[1, 2], ["a", "b"], [Timestamp("2020-01-01", tz="Europe/London")] * 2] ) def test_searchsorted_invalid_argument_dtype(self, arg): idx = TimedeltaIndex(["1 day", "2 days", "3 days"]) diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py index 9cc031001f81c..656d25bec2a6b 100644 --- a/pandas/tests/indexing/common.py +++ b/pandas/tests/indexing/common.py @@ -144,9 +144,7 @@ def check_values(self, f, func, values=False): tm.assert_almost_equal(result, expected) - def check_result( - self, method, key, typs=None, axes=None, fails=None, - ): + def check_result(self, method, key, typs=None, axes=None, fails=None): def _eq(axis, obj, key): """ compare equal for these 2 keys """ axified = _axify(obj, key, axis) diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index 621417eb38d94..bf51c3e5d1695 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -17,15 +17,11 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[ - lambda x: x.A > 2, - ] # noqa: E231 - tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 + res = df.loc[lambda x: x.A > 2] # noqa: E231 + tm.assert_frame_equal(res, df.loc[df.A > 2]) # noqa: E231 - res = df.loc[ - lambda x: x.A > 2, - ] # noqa: E231 - tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231 + res = df.loc[lambda x: x.A > 2] # noqa: E231 + tm.assert_frame_equal(res, df.loc[df.A > 2]) # noqa: E231 res = df.loc[lambda x: x.B == "b", :] tm.assert_frame_equal(res, df.loc[df.B == "b", :]) @@ -94,10 +90,8 @@ def test_frame_loc_callable_labels(self): res = df.loc[lambda x: ["A", "C"]] tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[ - lambda x: ["A", "C"], - ] # noqa: E231 - tm.assert_frame_equal(res, df.loc[["A", "C"],]) # noqa: E231 + res = df.loc[lambda x: 
["A", "C"]] # noqa: E231 + tm.assert_frame_equal(res, df.loc[["A", "C"]]) # noqa: E231 res = df.loc[lambda x: ["A", "C"], :] tm.assert_frame_equal(res, df.loc[["A", "C"], :]) diff --git a/pandas/tests/indexing/test_check_indexer.py b/pandas/tests/indexing/test_check_indexer.py index 69d4065234d93..865ecb129cdfa 100644 --- a/pandas/tests/indexing/test_check_indexer.py +++ b/pandas/tests/indexing/test_check_indexer.py @@ -32,7 +32,7 @@ def test_valid_input(indexer, expected): @pytest.mark.parametrize( - "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")], + "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")] ) def test_boolean_na_returns_indexer(indexer): # https://github.com/pandas-dev/pandas/issues/31503 @@ -61,7 +61,7 @@ def test_bool_raise_length(indexer): @pytest.mark.parametrize( - "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")], + "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")] ) def test_int_raise_missing_values(indexer): array = np.array([1, 2, 3]) @@ -89,9 +89,7 @@ def test_raise_invalid_array_dtypes(indexer): check_array_indexer(array, indexer) -@pytest.mark.parametrize( - "indexer", [None, Ellipsis, slice(0, 3), (None,)], -) +@pytest.mark.parametrize("indexer", [None, Ellipsis, slice(0, 3), (None,)]) def test_pass_through_non_array_likes(indexer): array = np.array([1, 2, 3]) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index 1c5f00ff754a4..752ecd47fe089 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -87,7 +87,7 @@ def _assert_setitem_series_conversion( # tm.assert_series_equal(temp, expected_series) @pytest.mark.parametrize( - "val,exp_dtype", [(1, object), (1.1, object), (1 + 1j, object), (True, object)], + "val,exp_dtype", [(1, object), (1.1, object), (1 + 1j, object), (True, object)] ) def test_setitem_series_object(self, val, exp_dtype): obj = 
pd.Series(list("abcd")) diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py index 18b9898e7d800..c48e0a129e161 100644 --- a/pandas/tests/indexing/test_floats.py +++ b/pandas/tests/indexing/test_floats.py @@ -181,9 +181,7 @@ def test_scalar_with_mixed(self): expected = 3 assert result == expected - @pytest.mark.parametrize( - "index_func", [tm.makeIntIndex, tm.makeRangeIndex], - ) + @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) @pytest.mark.parametrize("klass", [Series, DataFrame]) def test_scalar_integer(self, index_func, klass): @@ -405,7 +403,7 @@ def test_slice_integer(self): @pytest.mark.parametrize("l", [slice(2, 4.0), slice(2.0, 4), slice(2.0, 4.0)]) def test_integer_positional_indexing(self, l): - """ make sure that we are raising on positional indexing + """make sure that we are raising on positional indexing w.r.t. an integer index """ s = Series(range(2, 6), index=range(2, 6)) @@ -425,9 +423,7 @@ def test_integer_positional_indexing(self, l): with pytest.raises(TypeError, match=msg): s.iloc[l] - @pytest.mark.parametrize( - "index_func", [tm.makeIntIndex, tm.makeRangeIndex], - ) + @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) def test_slice_integer_frame_getitem(self, index_func): # similar to above, but on the getitem dim (of a DataFrame) @@ -486,9 +482,7 @@ def test_slice_integer_frame_getitem(self, index_func): s[l] @pytest.mark.parametrize("l", [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)]) - @pytest.mark.parametrize( - "index_func", [tm.makeIntIndex, tm.makeRangeIndex], - ) + @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex]) def test_float_slice_getitem_with_integer_index_raises(self, l, index_func): # similar to above, but on the getitem dim (of a DataFrame) From 265a2ca148d1017cfb638bccc91015c411ce9724 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 4 Sep 2020 20:11:39 -0700 Subject: [PATCH 0659/1025] CLN: remove 
unused args/kwargs (#36129) --- pandas/core/groupby/generic.py | 1 + pandas/core/groupby/groupby.py | 2 ++ pandas/core/groupby/ops.py | 8 ++++---- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 53edd056a6802..173ff99912f05 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1084,6 +1084,7 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: assert how == "ohlc" raise + # We get here with a) EADtypes and b) object dtype obj: Union[Series, DataFrame] # call our grouper again with only this block if isinstance(bvalues, ExtensionArray): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 651af2d314251..6ef2e67030881 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1012,6 +1012,8 @@ def _agg_general( # raised in _get_cython_function, in some cases can # be trimmed by implementing cython funcs for more dtypes pass + else: + raise # apply a non-cython aggregation result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c076b6e2e181b..e9525f03368fa 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -601,7 +601,7 @@ def _transform( return result - def agg_series(self, obj: Series, func: F, *args, **kwargs): + def agg_series(self, obj: Series, func: F): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 @@ -649,7 +649,7 @@ def _aggregate_series_fast(self, obj: Series, func: F): result, counts = grouper.get_result() return result, counts - def _aggregate_series_pure_python(self, obj: Series, func: F, *args, **kwargs): + def _aggregate_series_pure_python(self, obj: Series, func: F): group_index, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) @@ -658,7 +658,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F, *args, **kwargs): splitter = 
get_splitter(obj, group_index, ngroups, axis=0) for label, group in splitter: - res = func(group, *args, **kwargs) + res = func(group) if result is None: if isinstance(res, (Series, Index, np.ndarray)): @@ -835,7 +835,7 @@ def groupings(self) -> "List[grouper.Grouping]": for lvl, name in zip(self.levels, self.names) ] - def agg_series(self, obj: Series, func: F, *args, **kwargs): + def agg_series(self, obj: Series, func: F): # Caller is responsible for checking ngroups != 0 assert self.ngroups != 0 assert len(self.bins) > 0 # otherwise we'd get IndexError in get_result From 94cd6e1b270f53a579691db1bf22dfd0f0222a7b Mon Sep 17 00:00:00 2001 From: David Kwong Date: Sat, 5 Sep 2020 13:15:03 +1000 Subject: [PATCH 0660/1025] BUG: Fix DataFrame.groupby().apply() for NaN groups with dropna=False (#35951) --- doc/source/whatsnew/v1.2.0.rst | 3 +- pandas/core/reshape/concat.py | 6 ++- pandas/tests/groupby/test_groupby_dropna.py | 53 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e65daa439a225..aa3255e673797 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -214,7 +214,8 @@ Performance improvements Bug fixes ~~~~~~~~~ - +- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) +- Categorical ^^^^^^^^^^^ diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 299b68c6e71e0..9b94dae8556f6 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -11,6 +11,7 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import isna from pandas.core.arrays.categorical import ( factorize_from_iterable, @@ -624,10 +625,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde for hlevel, level in zip(zipped, levels): 
to_concat = [] for key, index in zip(hlevel, indexes): - mask = level == key + # Find matching codes, include matching nan values as equal. + mask = (isna(level) & isna(key)) | (level == key) if not mask.any(): raise ValueError(f"Key {key} not in level {level}") - i = np.nonzero(level == key)[0][0] + i = np.nonzero(mask)[0][0] to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index d1501111cb22b..66db06eeebdfb 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -274,3 +274,56 @@ def test_groupby_dropna_datetime_like_data( expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) tm.assert_frame_equal(grouped, expected) + + +@pytest.mark.parametrize( + "dropna, data, selected_data, levels", + [ + pytest.param( + False, + {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, + {"values": [0, 1, 0, 0]}, + ["a", "b", np.nan], + id="dropna_false_has_nan", + ), + pytest.param( + True, + {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, + {"values": [0, 1, 0]}, + None, + id="dropna_true_has_nan", + ), + pytest.param( + # no nan in "groups"; dropna=True|False should be same. + False, + {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, + {"values": [0, 1, 0, 0]}, + None, + id="dropna_false_no_nan", + ), + pytest.param( + # no nan in "groups"; dropna=True|False should be same. 
+ True, + {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, + {"values": [0, 1, 0, 0]}, + None, + id="dropna_true_no_nan", + ), + ], +) +def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels): + # GH 35889 + + df = pd.DataFrame(data) + gb = df.groupby("groups", dropna=dropna) + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) + + mi_tuples = tuple(zip(data["groups"], selected_data["values"])) + mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) + # Since right now, by default MI will drop NA from levels when we create MI + # via `from_*`, so we need to add NA for level manually afterwards. + if not dropna and levels: + mi = mi.set_levels(levels, level="groups") + + expected = pd.DataFrame(selected_data, index=mi) + tm.assert_frame_equal(result, expected) From 844ea85c10f9d87d18ad9d408d20b9afd0c07698 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Sat, 5 Sep 2020 05:18:12 +0200 Subject: [PATCH 0661/1025] Bug 29764 groupby loses index name sometimes (#36121) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/groupby/generic.py | 1 + pandas/tests/groupby/test_groupby.py | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index aa3255e673797..3b252202c14c5 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -312,6 +312,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on. (:issue:`35792`) - Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. 
(:issue:`9959`) - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) +- Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Reshaping diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 173ff99912f05..b855ce65f41b2 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1695,6 +1695,7 @@ def _wrap_transformed_output( """ indexed_output = {key.position: val for key, val in output.items()} columns = Index(key.label for key in output) + columns.name = self.obj.columns.name result = self.obj._constructor(indexed_output) result.columns = columns diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index eec9e8064d584..e0196df7ceac0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2111,3 +2111,26 @@ def test_subsetting_columns_keeps_attrs(klass, attr, value): expected = df.groupby("a", **{attr: value}) result = expected[["b"]] if klass is DataFrame else expected["b"] assert getattr(result, attr) == getattr(expected, attr) + + +@pytest.mark.parametrize("func", ["sum", "any", "shift"]) +def test_groupby_column_index_name_lost(func): + # GH: 29764 groupby loses index sometimes + expected = pd.Index(["a"], name="idx") + df = pd.DataFrame([[1]], columns=expected) + df_grouped = df.groupby([1]) + result = getattr(df_grouped, func)().columns + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +def test_groupby_column_index_name_lost_fill_funcs(func): + # GH: 29764 groupby loses index sometimes + df = pd.DataFrame( + [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]], + columns=pd.Index(["type", "a", "b"], name="idx"), + ) + df_grouped = df.groupby(["type"])[["a", "b"]] + result = 
getattr(df_grouped, func)().columns + expected = pd.Index(["a", "b"], name="idx") + tm.assert_index_equal(result, expected) From e67b81f84ddc4102e6aed32375fc95ff7fc8b79d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 5 Sep 2020 04:18:59 +0100 Subject: [PATCH 0662/1025] STY: add code check for use of builtin filter function (#36089) --- ci/code_checks.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 2e0f27fefca0b..6006d09bc3e78 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -179,6 +179,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.py" -E "super\(\w*, (self|cls)\)" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of builtin filter function' ; echo $MSG + invgrep -R --include="*.py" -P '(? Date: Fri, 4 Sep 2020 20:21:49 -0700 Subject: [PATCH 0663/1025] BUG: df.replace with numeric values and str to_replace (#36093) --- doc/source/user_guide/missing_data.rst | 26 ----- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/array_algos/replace.py | 95 ++++++++++++++++++ pandas/core/generic.py | 14 --- pandas/core/internals/blocks.py | 27 ++++- pandas/core/internals/managers.py | 104 +------------------- pandas/tests/frame/methods/test_replace.py | 15 ++- pandas/tests/series/methods/test_replace.py | 5 +- 8 files changed, 136 insertions(+), 151 deletions(-) create mode 100644 pandas/core/array_algos/replace.py diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 2e68a0598bb71..28206192dd161 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -689,32 +689,6 @@ You can also operate on the DataFrame in place: df.replace(1.5, np.nan, inplace=True) -.. warning:: - - When replacing multiple ``bool`` or ``datetime64`` objects, the first - argument to ``replace`` (``to_replace``) must match the type of the value - being replaced. For example, - - .. 
code-block:: python - - >>> s = pd.Series([True, False, True]) - >>> s.replace({'a string': 'new value', True: False}) # raises - TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' - - will raise a ``TypeError`` because one of the ``dict`` keys is not of the - correct type for replacement. - - However, when replacing a *single* object such as, - - .. ipython:: python - - s = pd.Series([True, False, True]) - s.replace('a string', 'another string') - - the original ``NDFrame`` object will be returned untouched. We're working on - unifying this API, but for backwards compatibility reasons we cannot break - the latter behavior. See :issue:`6354` for more details. - Missing data casting rules and indexing --------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3b252202c14c5..8b28a4439e1da 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -339,6 +339,7 @@ ExtensionArray Other ^^^^^ - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) +- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py new file mode 100644 index 0000000000000..6ac3cc1f9f2fe --- /dev/null +++ b/pandas/core/array_algos/replace.py @@ -0,0 +1,95 @@ +""" +Methods used by Block.replace and related methods. 
+""" +import operator +import re +from typing import Optional, Pattern, Union + +import numpy as np + +from pandas._typing import ArrayLike, Scalar + +from pandas.core.dtypes.common import ( + is_datetimelike_v_numeric, + is_numeric_v_string_like, + is_scalar, +) +from pandas.core.dtypes.missing import isna + + +def compare_or_regex_search( + a: ArrayLike, + b: Union[Scalar, Pattern], + regex: bool = False, + mask: Optional[ArrayLike] = None, +) -> Union[ArrayLike, bool]: + """ + Compare two array_like inputs of the same shape or two scalar values + + Calls operator.eq or re.search, depending on regex argument. If regex is + True, perform an element-wise regex matching. + + Parameters + ---------- + a : array_like + b : scalar or regex pattern + regex : bool, default False + mask : array_like or None (default) + + Returns + ------- + mask : array_like of bool + """ + + def _check_comparison_types( + result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] + ): + """ + Raises an error if the two arrays (a,b) cannot be compared. + Otherwise, returns the comparison result as expected. 
+ """ + if is_scalar(result) and isinstance(a, np.ndarray): + type_names = [type(a).__name__, type(b).__name__] + + if isinstance(a, np.ndarray): + type_names[0] = f"ndarray(dtype={a.dtype})" + + raise TypeError( + f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" + ) + + if not regex: + op = lambda x: operator.eq(x, b) + else: + op = np.vectorize( + lambda x: bool(re.search(b, x)) + if isinstance(x, str) and isinstance(b, (str, Pattern)) + else False + ) + + # GH#32621 use mask to avoid comparing to NAs + if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): + mask = np.reshape(~(isna(a)), a.shape) + if isinstance(a, np.ndarray): + a = a[mask] + + if is_numeric_v_string_like(a, b): + # GH#29553 avoid deprecation warnings from numpy + return np.zeros(a.shape, dtype=bool) + + elif is_datetimelike_v_numeric(a, b): + # GH#29553 avoid deprecation warnings from numpy + _check_comparison_types(False, a, b) + return False + + result = op(a) + + if isinstance(result, np.ndarray) and mask is not None: + # The shape of the mask can differ to that of the result + # since we may compare only a subset of a's or b's elements + tmp = np.zeros(mask.shape, dtype=np.bool_) + tmp[mask] = result + result = tmp + + _check_comparison_types(result, a, b) + return result diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 2af323ccc1dd3..93c945638a174 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6561,20 +6561,6 @@ def replace( 1 new new 2 bait xyz - Note that when replacing multiple ``bool`` or ``datetime64`` objects, - the data types in the `to_replace` parameter must match the data - type of the value being replaced: - - >>> df = pd.DataFrame({{'A': [True, False, True], - ... 'B': [False, True, False]}}) - >>> df.replace({{'a string': 'new value', True: False}}) # raises - Traceback (most recent call last): - ... 
- TypeError: Cannot compare types 'ndarray(dtype=bool)' and 'str' - - This raises a ``TypeError`` because one of the ``dict`` keys is not of - the correct type for replacement. - Compare the behavior of ``s.replace({{'a': None}})`` and ``s.replace('a', None)`` to understand the peculiarities of the `to_replace` parameter: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b2305736f9d46..3bcd4debbf41a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -11,7 +11,7 @@ from pandas._libs.internals import BlockPlacement from pandas._libs.tslibs import conversion from pandas._libs.tslibs.timezones import tz_compare -from pandas._typing import ArrayLike +from pandas._typing import ArrayLike, Scalar from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -59,6 +59,7 @@ from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, isna_compat import pandas.core.algorithms as algos +from pandas.core.array_algos.replace import compare_or_regex_search from pandas.core.array_algos.transforms import shift from pandas.core.arrays import ( Categorical, @@ -792,7 +793,6 @@ def _replace_list( self, src_list: List[Any], dest_list: List[Any], - masks: List[np.ndarray], inplace: bool = False, regex: bool = False, ) -> List["Block"]: @@ -801,11 +801,28 @@ def _replace_list( """ src_len = len(src_list) - 1 + def comp(s: Scalar, mask: np.ndarray, regex: bool = False) -> np.ndarray: + """ + Generate a bool array by perform an equality check, or perform + an element-wise regular expression matching + """ + if isna(s): + return ~mask + + s = com.maybe_box_datetimelike(s) + return compare_or_regex_search(self.values, s, regex, mask) + + # Calculate the mask once, prior to the call of comp + # in order to avoid repeating the same computations + mask = ~isna(self.values) + + masks = [comp(s, mask, regex) for s in src_list] + rb = [self if inplace else self.copy()] for i, (src, 
dest) in enumerate(zip(src_list, dest_list)): new_rb: List["Block"] = [] for blk in rb: - m = masks[i][blk.mgr_locs.indexer] + m = masks[i] convert = i == src_len # only convert once at the end result = blk._replace_coerce( mask=m, @@ -2908,7 +2925,9 @@ def _extract_bool_array(mask: ArrayLike) -> np.ndarray: """ if isinstance(mask, ExtensionArray): # We could have BooleanArray, Sparse[bool], ... - mask = np.asarray(mask, dtype=np.bool_) + # Except for BooleanArray, this is equivalent to just + # np.asarray(mask, dtype=bool) + mask = mask.to_numpy(dtype=bool, na_value=False) assert isinstance(mask, np.ndarray), type(mask) assert mask.dtype == bool, mask.dtype diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 753b949f7c802..57a4a8c2ace8a 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1,14 +1,11 @@ from collections import defaultdict import itertools -import operator -import re from typing import ( Any, DefaultDict, Dict, List, Optional, - Pattern, Sequence, Tuple, TypeVar, @@ -19,7 +16,7 @@ import numpy as np from pandas._libs import internals as libinternals, lib -from pandas._typing import ArrayLike, DtypeObj, Label, Scalar +from pandas._typing import ArrayLike, DtypeObj, Label from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -29,12 +26,9 @@ ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, - is_datetimelike_v_numeric, is_dtype_equal, is_extension_array_dtype, is_list_like, - is_numeric_v_string_like, - is_scalar, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.dtypes import ExtensionDtype @@ -44,7 +38,6 @@ import pandas.core.algorithms as algos from pandas.core.arrays.sparse import SparseDtype from pandas.core.base import PandasObject -import pandas.core.common as com from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices from pandas.core.indexes.api 
import Index, ensure_index @@ -628,31 +621,10 @@ def replace_list( """ do a list replace """ inplace = validate_bool_kwarg(inplace, "inplace") - # figure out our mask apriori to avoid repeated replacements - values = self.as_array() - - def comp(s: Scalar, mask: np.ndarray, regex: bool = False): - """ - Generate a bool array by perform an equality check, or perform - an element-wise regular expression matching - """ - if isna(s): - return ~mask - - s = com.maybe_box_datetimelike(s) - return _compare_or_regex_search(values, s, regex, mask) - - # Calculate the mask once, prior to the call of comp - # in order to avoid repeating the same computations - mask = ~isna(values) - - masks = [comp(s, mask, regex) for s in src_list] - bm = self.apply( "_replace_list", src_list=src_list, dest_list=dest_list, - masks=masks, inplace=inplace, regex=regex, ) @@ -1900,80 +1872,6 @@ def _merge_blocks( return blocks -def _compare_or_regex_search( - a: ArrayLike, - b: Union[Scalar, Pattern], - regex: bool = False, - mask: Optional[ArrayLike] = None, -) -> Union[ArrayLike, bool]: - """ - Compare two array_like inputs of the same shape or two scalar values - - Calls operator.eq or re.search, depending on regex argument. If regex is - True, perform an element-wise regex matching. - - Parameters - ---------- - a : array_like - b : scalar or regex pattern - regex : bool, default False - mask : array_like or None (default) - - Returns - ------- - mask : array_like of bool - """ - - def _check_comparison_types( - result: Union[ArrayLike, bool], a: ArrayLike, b: Union[Scalar, Pattern] - ): - """ - Raises an error if the two arrays (a,b) cannot be compared. - Otherwise, returns the comparison result as expected. 
- """ - if is_scalar(result) and isinstance(a, np.ndarray): - type_names = [type(a).__name__, type(b).__name__] - - if isinstance(a, np.ndarray): - type_names[0] = f"ndarray(dtype={a.dtype})" - - raise TypeError( - f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}" - ) - - if not regex: - op = lambda x: operator.eq(x, b) - else: - op = np.vectorize( - lambda x: bool(re.search(b, x)) - if isinstance(x, str) and isinstance(b, (str, Pattern)) - else False - ) - - # GH#32621 use mask to avoid comparing to NAs - if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): - mask = np.reshape(~(isna(a)), a.shape) - if isinstance(a, np.ndarray): - a = a[mask] - - if is_datetimelike_v_numeric(a, b) or is_numeric_v_string_like(a, b): - # GH#29553 avoid deprecation warnings from numpy - _check_comparison_types(False, a, b) - return False - - result = op(a) - - if isinstance(result, np.ndarray) and mask is not None: - # The shape of the mask can differ to that of the result - # since we may compare only a subset of a's or b's elements - tmp = np.zeros(mask.shape, dtype=np.bool_) - tmp[mask] = result - result = tmp - - _check_comparison_types(result, a, b) - return result - - def _fast_count_smallints(arr: np.ndarray) -> np.ndarray: """Faster version of set(arr) for sequences of small numbers.""" counts = np.bincount(arr.astype(np.int_)) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 83dfd42ae2a6e..ea2488dfc0877 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1131,8 +1131,19 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): df = DataFrame({0: [True, False], 1: [False, True]}) - with pytest.raises(TypeError, match="Cannot compare types .+"): - df.replace({"asdf": "asdb", True: "yes"}) + result = df.replace({"asdf": "asdb", True: "yes"}) + expected = DataFrame({0: ["yes", False], 
1: [False, "yes"]}) + tm.assert_frame_equal(result, expected) + + def test_replace_dict_strings_vs_ints(self): + # GH#34789 + df = pd.DataFrame({"Y0": [1, 2], "Y1": [3, 4]}) + result = df.replace({"replace_string": "test"}) + + tm.assert_frame_equal(result, df) + + result = df["Y0"].replace({"replace_string": "test"}) + tm.assert_series_equal(result, df["Y0"]) def test_replace_truthy(self): df = DataFrame({"a": [True, True]}) diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index ccaa005369a1c..e255d46e81851 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -218,8 +218,9 @@ def test_replace_bool_with_bool(self): def test_replace_with_dict_with_bool_keys(self): s = pd.Series([True, False, True]) - with pytest.raises(TypeError, match="Cannot compare types .+"): - s.replace({"asdf": "asdb", True: "yes"}) + result = s.replace({"asdf": "asdb", True: "yes"}) + expected = pd.Series(["yes", False, "yes"]) + tm.assert_series_equal(result, expected) def test_replace2(self): N = 100 From e6fad94948e711d2565ae3783eed275a85847943 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 4 Sep 2020 22:33:49 -0500 Subject: [PATCH 0664/1025] CLN: resolve UserWarning in `pandas/plotting/_matplotlib/core.py` #35945 (#35946) --- pandas/plotting/_matplotlib/core.py | 2 +- pandas/tests/plotting/test_frame.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index c1ba7881165f1..f0b35e1cd2a74 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1223,8 +1223,8 @@ def get_label(i): if self._need_to_set_index: xticks = ax.get_xticks() xticklabels = [get_label(x) for x in xticks] - ax.set_xticklabels(xticklabels) ax.xaxis.set_major_locator(FixedLocator(xticks)) + ax.set_xticklabels(xticklabels) condition = ( not self._use_dynamic_x() diff 
--git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index ee43e5d7072fe..9ab697cb57690 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -2796,10 +2796,12 @@ def test_table(self): _check_plot_works(df.plot, table=True) _check_plot_works(df.plot, table=df) - ax = df.plot() - assert len(ax.tables) == 0 - plotting.table(ax, df.T) - assert len(ax.tables) == 1 + # GH 35945 UserWarning + with tm.assert_produces_warning(None): + ax = df.plot() + assert len(ax.tables) == 0 + plotting.table(ax, df.T) + assert len(ax.tables) == 1 def test_errorbar_scatter(self): df = DataFrame(np.random.randn(5, 2), index=range(5), columns=["x", "y"]) From fe5933f188fc04ec92bfe9c42228c8a5536b81b6 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Sat, 5 Sep 2020 06:50:57 -0400 Subject: [PATCH 0665/1025] add note about missing values to Categorical docstring (#36125) --- pandas/core/arrays/categorical.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ec85ec47d625c..c3c9009dda659 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -280,6 +280,19 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject): ['a', 'b', 'c', 'a', 'b', 'c'] Categories (3, object): ['a', 'b', 'c'] + Missing values are not included as a category. + + >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) + >>> c + [1, 2, 3, 1, 2, 3, NaN] + Categories (3, int64): [1, 2, 3] + + However, their presence is indicated in the `codes` attribute + by code `-1`. + + >>> c.codes + array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) + Ordered `Categoricals` can be sorted according to the custom order of the categories and can have a min and max value. 
From 79b2c306766e8a8b88b8cd8714e76e96e83b6f77 Mon Sep 17 00:00:00 2001 From: Sarthak Vineet Kumar Date: Sat, 5 Sep 2020 18:09:11 +0530 Subject: [PATCH 0666/1025] CLN removing trailing commas (#36101) --- pandas/tests/io/test_sql.py | 3 --- pandas/tests/io/test_stata.py | 4 ++-- pandas/tests/plotting/test_frame.py | 4 ++-- pandas/tests/resample/test_datetime_index.py | 10 ++++------ .../tests/reshape/merge/test_merge_index_as_string.py | 4 ++-- pandas/tests/reshape/test_crosstab.py | 4 ++-- pandas/tests/reshape/test_get_dummies.py | 2 +- 7 files changed, 13 insertions(+), 18 deletions(-) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a7e3162ed7b73..1edcc937f72c3 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -2349,9 +2349,6 @@ def date_format(dt): def format_query(sql, *args): - """ - - """ processed_args = [] for arg in args: if isinstance(arg, float) and isna(arg): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 6d7fec803a8e0..88f61390957a6 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1153,7 +1153,7 @@ def test_read_chunks_117( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True ) pos += chunksize @@ -1251,7 +1251,7 @@ def test_read_chunks_115( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True, + from_frame, chunk, check_dtype=False, check_datetimelike_compat=True ) pos += chunksize diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 9ab697cb57690..128a7bdb6730a 100644 --- a/pandas/tests/plotting/test_frame.py +++ 
b/pandas/tests/plotting/test_frame.py @@ -1321,7 +1321,7 @@ def test_scatter_with_c_column_name_with_colors(self, cmap): def test_plot_scatter_with_s(self): # this refers to GH 32904 - df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"],) + df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"]) ax = df.plot.scatter(x="a", y="b", s="c") tm.assert_numpy_array_equal(df["c"].values, right=ax.collections[0].get_sizes()) @@ -1716,7 +1716,7 @@ def test_hist_df(self): def test_hist_weights(self, weights): # GH 33173 np.random.seed(0) - df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100,)))) + df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100)))) ax1 = _check_plot_works(df.plot, kind="hist", weights=weights) ax2 = _check_plot_works(df.plot, kind="hist") diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index e7637a598403f..59a0183304c76 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -124,7 +124,7 @@ def test_resample_integerarray(): result = ts.resample("3T").mean() expected = Series( - [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64", + [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64" ) tm.assert_series_equal(result, expected) @@ -764,7 +764,7 @@ def test_resample_origin(): @pytest.mark.parametrize( - "origin", ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()], + "origin", ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()] ) def test_resample_bad_origin(origin): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") @@ -777,9 +777,7 @@ def test_resample_bad_origin(origin): ts.resample("5min", origin=origin) -@pytest.mark.parametrize( - "offset", ["invalid_value", "12dayys", "2000-30-30", object()], -) +@pytest.mark.parametrize("offset", ["invalid_value", "12dayys", 
"2000-30-30", object()]) def test_resample_bad_offset(offset): rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s") ts = Series(np.random.randn(len(rng)), index=rng) @@ -1595,7 +1593,7 @@ def test_downsample_dst_at_midnight(): "America/Havana", ambiguous=True ) dti = pd.DatetimeIndex(dti, freq="D") - expected = DataFrame([7.5, 28.0, 44.5], index=dti,) + expected = DataFrame([7.5, 28.0, 44.5], index=dti) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py index 08614d04caf4b..d20d93370ec7e 100644 --- a/pandas/tests/reshape/merge/test_merge_index_as_string.py +++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py @@ -29,7 +29,7 @@ def df2(): @pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) def left_df(request, df1): - """ Construct left test DataFrame with specified levels + """Construct left test DataFrame with specified levels (any of 'outer', 'inner', and 'v1') """ levels = request.param @@ -41,7 +41,7 @@ def left_df(request, df1): @pytest.fixture(params=[[], ["outer"], ["outer", "inner"]]) def right_df(request, df2): - """ Construct right test DataFrame with specified levels + """Construct right test DataFrame with specified levels (any of 'outer', 'inner', and 'v2') """ levels = request.param diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 6f5550a6f8209..1aadcfdc30f1b 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -354,7 +354,7 @@ def test_crosstab_normalize(self): crosstab(df.a, df.b, normalize="columns"), ) tm.assert_frame_equal( - crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index"), + crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index") ) row_normal_margins = DataFrame( @@ -377,7 +377,7 @@ def test_crosstab_normalize(self): crosstab(df.a, df.b, normalize="index", 
margins=True), row_normal_margins ) tm.assert_frame_equal( - crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins, + crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins ) tm.assert_frame_equal( crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index c003bfa6a239a..ce13762ea8f86 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -161,7 +161,7 @@ def test_get_dummies_unicode(self, sparse): s = [e, eacute, eacute] res = get_dummies(s, prefix="letter", sparse=sparse) exp = DataFrame( - {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8, + {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8 ) if sparse: exp = exp.apply(SparseArray, fill_value=0) From e595d782bfbfcff4e0f51aca72faaab8414ca4b9 Mon Sep 17 00:00:00 2001 From: Thomas Dickson Date: Sat, 5 Sep 2020 15:44:26 +0100 Subject: [PATCH 0667/1025] Updated series documentation to close #35406 (#36139) --- pandas/core/series.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 9d84ce4b9ab2e..d8fdaa2a60252 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -164,9 +164,9 @@ class Series(base.IndexOpsMixin, generic.NDFrame): index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to - RangeIndex (0, 1, 2, ..., n) if not provided. If both a dict and index - sequence are used, the index will override the keys found in the - dict. + RangeIndex (0, 1, 2, ..., n) if not provided. If data is dict-like + and index is None, then the values in the index are used to + reindex the Series after it is created using the keys in the data. 
dtype : str, numpy.dtype, or ExtensionDtype, optional Data type for the output Series. If not specified, this will be inferred from `data`. From 48814fa41bda92ef2cf487910389f3d0136645ee Mon Sep 17 00:00:00 2001 From: joooeey Date: Sat, 5 Sep 2020 16:49:09 +0200 Subject: [PATCH 0668/1025] BUG: repair 'style' kwd handling in DataFrame.plot (#21003) (#33821) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/plotting/_matplotlib/core.py | 27 ++++++++++++++++----------- pandas/tests/plotting/test_frame.py | 18 ++++++++++++++++++ pandas/tests/plotting/test_series.py | 2 +- 4 files changed, 36 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8b28a4439e1da..39e53daf516c4 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -299,7 +299,7 @@ I/O Plotting ^^^^^^^^ -- +- Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`) - Groupby/resample/rolling diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index f0b35e1cd2a74..def4a1dc3f5c4 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1,4 +1,3 @@ -import re from typing import TYPE_CHECKING, List, Optional, Tuple import warnings @@ -55,6 +54,15 @@ from matplotlib.axis import Axis +def _color_in_style(style: str) -> bool: + """ + Check if there is a color letter in the style string. 
+ """ + from matplotlib.colors import BASE_COLORS + + return not set(BASE_COLORS).isdisjoint(style) + + class MPLPlot: """ Base class for assembling a pandas plot using matplotlib @@ -200,8 +208,6 @@ def __init__( self._validate_color_args() def _validate_color_args(self): - import matplotlib.colors - if ( "color" in self.kwds and self.nseries == 1 @@ -233,13 +239,12 @@ def _validate_color_args(self): styles = [self.style] # need only a single match for s in styles: - for char in s: - if char in matplotlib.colors.BASE_COLORS: - raise ValueError( - "Cannot pass 'style' string with a color symbol and " - "'color' keyword argument. Please use one or the other or " - "pass 'style' without a color symbol" - ) + if _color_in_style(s): + raise ValueError( + "Cannot pass 'style' string with a color symbol and " + "'color' keyword argument. Please use one or the " + "other or pass 'style' without a color symbol" + ) def _iter_data(self, data=None, keep_index=False, fillna=None): if data is None: @@ -739,7 +744,7 @@ def _apply_style_colors(self, colors, kwds, col_num, label): style = self.style has_color = "color" in kwds or self.colormap is not None - nocolor_style = style is None or re.match("[a-z]+", style) is None + nocolor_style = style is None or not _color_in_style(style) if (has_color or self.subplots) and nocolor_style: if isinstance(colors, dict): kwds["color"] = colors[label] diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 128a7bdb6730a..3b3902647390d 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -205,6 +205,24 @@ def test_color_and_style_arguments(self): with pytest.raises(ValueError): df.plot(color=["red", "black"], style=["k-", "r--"]) + @pytest.mark.parametrize( + "color, expected", + [ + ("green", ["green"] * 4), + (["yellow", "red", "green", "blue"], ["yellow", "red", "green", "blue"]), + ], + ) + def test_color_and_marker(self, color, expected): + # GH 21003 + df = 
DataFrame(np.random.random((7, 4))) + ax = df.plot(color=color, style="d--") + # check colors + result = [i.get_color() for i in ax.lines] + assert result == expected + # check markers and linestyles + assert all(i.get_linestyle() == "--" for i in ax.lines) + assert all(i.get_marker() == "d" for i in ax.lines) + def test_nonnumeric_exclude(self): df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]}) ax = df.plot() diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index c296e2a6278c5..85c06b2e7b748 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -958,7 +958,7 @@ def test_plot_no_numeric_data(self): def test_style_single_ok(self): s = pd.Series([1, 2]) ax = s.plot(style="s", color="C3") - assert ax.lines[0].get_color() == ["C3"] + assert ax.lines[0].get_color() == "C3" @pytest.mark.parametrize( "index_name, old_label, new_label", From 1d08f07fd601f6a5fb6b2324896597d52753b703 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Sat, 5 Sep 2020 10:50:03 -0400 Subject: [PATCH 0669/1025] BUG/ENH: to_pickle/read_pickle support compression for file ojects (#35736) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_typing.py | 4 ++-- pandas/core/frame.py | 4 ++-- pandas/io/common.py | 24 +++++++++--------------- pandas/io/formats/csvs.py | 15 ++++----------- pandas/io/json/_json.py | 11 ++--------- pandas/io/parsers.py | 13 +++++-------- pandas/io/pickle.py | 10 ++-------- pandas/io/stata.py | 30 +++++------------------------- pandas/tests/io/test_pickle.py | 29 +++++++++++++++++++++++++++++ 10 files changed, 61 insertions(+), 80 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 39e53daf516c4..b1229a5d5823d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -295,6 +295,7 @@ I/O - :meth:`to_csv` passes compression arguments for `'gzip'` always to `gzip.GzipFile` (:issue:`28103`) - 
:meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) +- :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) Plotting ^^^^^^^^ diff --git a/pandas/_typing.py b/pandas/_typing.py index 74bfc9134c3af..b237013ac7805 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -116,7 +116,7 @@ # compression keywords and compression -CompressionDict = Mapping[str, Optional[Union[str, int, bool]]] +CompressionDict = Dict[str, Any] CompressionOptions = Optional[Union[str, CompressionDict]] @@ -138,6 +138,6 @@ class IOargs(Generic[ModeVar, EncodingVar]): filepath_or_buffer: FileOrBuffer encoding: EncodingVar - compression: CompressionOptions + compression: CompressionDict should_close: bool mode: Union[ModeVar, str] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c48bec9b670ad..1713743b98bff 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -27,7 +27,6 @@ Iterable, Iterator, List, - Mapping, Optional, Sequence, Set, @@ -49,6 +48,7 @@ ArrayLike, Axes, Axis, + CompressionOptions, Dtype, FilePathOrBuffer, FrameOrSeriesUnion, @@ -2062,7 +2062,7 @@ def to_stata( variable_labels: Optional[Dict[Label, str]] = None, version: Optional[int] = 114, convert_strl: Optional[Sequence[Label]] = None, - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ) -> None: """ diff --git a/pandas/io/common.py b/pandas/io/common.py index 2b13d54ec3aed..a80b89569f429 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -205,11 +205,13 @@ def get_filepath_or_buffer( """ filepath_or_buffer = stringify_path(filepath_or_buffer) + # 
handle compression dict + compression_method, compression = get_compression_method(compression) + compression_method = infer_compression(filepath_or_buffer, compression_method) + compression = dict(compression, method=compression_method) + # bz2 and xz do not write the byte order mark for utf-16 and utf-32 # print a warning when writing such files - compression_method = infer_compression( - filepath_or_buffer, get_compression_method(compression)[0] - ) if ( mode and "w" in mode @@ -238,7 +240,7 @@ def get_filepath_or_buffer( content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": # Override compression based on Content-Encoding header - compression = "gzip" + compression = {"method": "gzip"} reader = BytesIO(req.read()) req.close() return IOargs( @@ -374,11 +376,7 @@ def get_compression_method( if isinstance(compression, Mapping): compression_args = dict(compression) try: - # error: Incompatible types in assignment (expression has type - # "Union[str, int, None]", variable has type "Optional[str]") - compression_method = compression_args.pop( # type: ignore[assignment] - "method" - ) + compression_method = compression_args.pop("method") except KeyError as err: raise ValueError("If mapping, compression must have key 'method'") from err else: @@ -652,12 +650,8 @@ def __init__( super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] def write(self, data): - archive_name = self.filename - if self.archive_name is not None: - archive_name = self.archive_name - if archive_name is None: - # ZipFile needs a non-empty string - archive_name = "zip" + # ZipFile needs a non-empty string + archive_name = self.archive_name or self.filename or "zip" super().writestr(archive_name, data) @property diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 270caec022fef..15cd5c026c6b6 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -21,12 +21,7 @@ ) from pandas.core.dtypes.missing import notna 
-from pandas.io.common import ( - get_compression_method, - get_filepath_or_buffer, - get_handle, - infer_compression, -) +from pandas.io.common import get_filepath_or_buffer, get_handle class CSVFormatter: @@ -60,17 +55,15 @@ def __init__( if path_or_buf is None: path_or_buf = StringIO() - # Extract compression mode as given, if dict - compression, self.compression_args = get_compression_method(compression) - self.compression = infer_compression(path_or_buf, compression) - ioargs = get_filepath_or_buffer( path_or_buf, encoding=encoding, - compression=self.compression, + compression=compression, mode=mode, storage_options=storage_options, ) + self.compression = ioargs.compression.pop("method") + self.compression_args = ioargs.compression self.path_or_buf = ioargs.filepath_or_buffer self.should_close = ioargs.should_close self.mode = ioargs.mode diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 7a3b76ff7e3d0..a4d923fdbe45a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -19,12 +19,7 @@ from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.reshape.concat import concat -from pandas.io.common import ( - get_compression_method, - get_filepath_or_buffer, - get_handle, - infer_compression, -) +from pandas.io.common import get_compression_method, get_filepath_or_buffer, get_handle from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema from pandas.io.parsers import _validate_integer @@ -66,6 +61,7 @@ def to_json( ) path_or_buf = ioargs.filepath_or_buffer should_close = ioargs.should_close + compression = ioargs.compression if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") @@ -616,9 +612,6 @@ def read_json( if encoding is None: encoding = "utf-8" - compression_method, compression = get_compression_method(compression) - compression_method = 
infer_compression(path_or_buf, compression_method) - compression = dict(compression, method=compression_method) ioargs = get_filepath_or_buffer( path_or_buf, encoding=encoding, diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c6ef5221e7ead..a0466c5ac6b57 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -63,12 +63,7 @@ from pandas.core.series import Series from pandas.core.tools import datetimes as tools -from pandas.io.common import ( - get_filepath_or_buffer, - get_handle, - infer_compression, - validate_header_arg, -) +from pandas.io.common import get_filepath_or_buffer, get_handle, validate_header_arg from pandas.io.date_converters import generic_parser # BOM character (byte order mark) @@ -424,9 +419,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding - compression = kwds.get("compression", "infer") - compression = infer_compression(filepath_or_buffer, compression) # TODO: get_filepath_or_buffer could return # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] @@ -1976,6 +1969,10 @@ def __init__(self, src, **kwds): encoding = kwds.get("encoding") + # parsers.TextReader doesn't support compression dicts + if isinstance(kwds.get("compression"), dict): + kwds["compression"] = kwds["compression"]["method"] + if kwds.get("compression") is None and encoding: if isinstance(src, str): src = open(src, "rb") diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 857a2d1b69be4..655deb5ca3779 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -92,11 +92,8 @@ def to_pickle( mode="wb", storage_options=storage_options, ) - compression = ioargs.compression - if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer": - compression = None f, fh = get_handle( - ioargs.filepath_or_buffer, "wb", compression=compression, is_text=False + ioargs.filepath_or_buffer, "wb", compression=ioargs.compression, 
is_text=False ) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL @@ -196,11 +193,8 @@ def read_pickle( ioargs = get_filepath_or_buffer( filepath_or_buffer, compression=compression, storage_options=storage_options ) - compression = ioargs.compression - if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer": - compression = None f, fh = get_handle( - ioargs.filepath_or_buffer, "rb", compression=compression, is_text=False + ioargs.filepath_or_buffer, "rb", compression=ioargs.compression, is_text=False ) # 1) try standard library Pickle diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 34d520004cc65..b3b16e04a5d9e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -16,18 +16,7 @@ from pathlib import Path import struct import sys -from typing import ( - Any, - AnyStr, - BinaryIO, - Dict, - List, - Mapping, - Optional, - Sequence, - Tuple, - Union, -) +from typing import Any, AnyStr, BinaryIO, Dict, List, Optional, Sequence, Tuple, Union import warnings from dateutil.relativedelta import relativedelta @@ -58,13 +47,7 @@ from pandas.core.indexes.base import Index from pandas.core.series import Series -from pandas.io.common import ( - get_compression_method, - get_filepath_or_buffer, - get_handle, - infer_compression, - stringify_path, -) +from pandas.io.common import get_filepath_or_buffer, get_handle, stringify_path _version_error = ( "Version of given Stata file is {version}. 
pandas supports importing " @@ -1976,9 +1959,6 @@ def _open_file_binary_write( return fname, False, None # type: ignore[return-value] elif isinstance(fname, (str, Path)): # Extract compression mode as given, if dict - compression_typ, compression_args = get_compression_method(compression) - compression_typ = infer_compression(fname, compression_typ) - compression = dict(compression_args, method=compression_typ) ioargs = get_filepath_or_buffer( fname, mode="wb", compression=compression, storage_options=storage_options ) @@ -2235,7 +2215,7 @@ def __init__( time_stamp: Optional[datetime.datetime] = None, data_label: Optional[str] = None, variable_labels: Optional[Dict[Label, str]] = None, - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): super().__init__() @@ -3118,7 +3098,7 @@ def __init__( data_label: Optional[str] = None, variable_labels: Optional[Dict[Label, str]] = None, convert_strl: Optional[Sequence[Label]] = None, - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): # Copy to new list since convert_strl might be modified later @@ -3523,7 +3503,7 @@ def __init__( variable_labels: Optional[Dict[Label, str]] = None, convert_strl: Optional[Sequence[Label]] = None, version: Optional[int] = None, - compression: Union[str, Mapping[str, str], None] = "infer", + compression: CompressionOptions = "infer", storage_options: StorageOptions = None, ): if version is None: diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 6331113ab8945..d1c6705dd7a6f 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -14,7 +14,9 @@ import datetime import glob import gzip +import io import os +from pathlib import Path import pickle import shutil from warnings import catch_warnings, simplefilter @@ -486,3 +488,30 @@ def 
test_read_pickle_with_subclass(): tm.assert_series_equal(result[0], expected[0]) assert isinstance(result[1], MyTz) + + +def test_pickle_binary_object_compression(compression): + """ + Read/write from binary file-objects w/wo compression. + + GH 26237, GH 29054, and GH 29570 + """ + df = tm.makeDataFrame() + + # reference for compression + with tm.ensure_clean() as path: + df.to_pickle(path, compression=compression) + reference = Path(path).read_bytes() + + # write + buffer = io.BytesIO() + df.to_pickle(buffer, compression=compression) + buffer.seek(0) + + # gzip and zip safe the filename: cannot compare the compressed content + assert buffer.getvalue() == reference or compression in ("gzip", "zip") + + # read + read_df = pd.read_pickle(buffer, compression=compression) + buffer.seek(0) + tm.assert_frame_equal(df, read_df) From e4db5f1f4f7e4a3b59a9cdd8c9b20e2b5fdc646c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 5 Sep 2020 15:55:11 +0100 Subject: [PATCH 0670/1025] TYP: Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias (#36137) --- ci/code_checks.sh | 8 ++++++++ pandas/core/apply.py | 16 +++++++--------- pandas/core/groupby/generic.py | 10 +++++----- pandas/core/groupby/grouper.py | 2 +- pandas/core/reshape/merge.py | 10 +++++----- pandas/core/reshape/pivot.py | 4 ++-- pandas/io/pytables.py | 6 +++--- 7 files changed, 31 insertions(+), 25 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 6006d09bc3e78..8ee579cd25203 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -230,6 +230,9 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include=*.{py,pyx} '!r}' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" + # ------------------------------------------------------------------------- + # Type annotations + MSG='Check for use of comment-based annotation syntax' ; echo $MSG invgrep -R --include="*.py" -P '# type: (?!ignore)' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -238,6 +241,11 @@ if [[ 
-z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.py" -P '# type:\s?ignore(?!\[)' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of Union[Series, DataFrame] instead of FrameOrSeriesUnion alias' ; echo $MSG + invgrep -R --include="*.py" --exclude=_typing.py -E 'Union\[.*(Series.*DataFrame|DataFrame.*Series).*\]' pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + + # ------------------------------------------------------------------------- MSG='Check for use of foo.__class__ instead of type(foo)' ; echo $MSG invgrep -R --include=*.{py,pyx} '\.__class__' pandas RET=$(($RET + $?)) ; echo $MSG "DONE" diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 99a9e1377563c..bbf832f33065b 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1,12 +1,12 @@ import abc import inspect -from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Dict, Iterator, Optional, Tuple, Type import numpy as np from pandas._config import option_context -from pandas._typing import Axis +from pandas._typing import Axis, FrameOrSeriesUnion from pandas.util._decorators import cache_readonly from pandas.core.dtypes.common import is_dict_like, is_list_like, is_sequence @@ -73,7 +73,7 @@ def series_generator(self) -> Iterator["Series"]: @abc.abstractmethod def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + ) -> FrameOrSeriesUnion: pass # --------------------------------------------------------------- @@ -289,9 +289,7 @@ def apply_series_generator(self) -> Tuple[ResType, "Index"]: return results, res_index - def wrap_results( - self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + def wrap_results(self, results: ResType, res_index: "Index") -> FrameOrSeriesUnion: from pandas import Series # see if we can infer the results @@ -335,7 +333,7 @@ def result_columns(self) -> "Index": def 
wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + ) -> FrameOrSeriesUnion: """ return the results for the rows """ if self.result_type == "reduce": @@ -408,9 +406,9 @@ def result_columns(self) -> "Index": def wrap_results_for_axis( self, results: ResType, res_index: "Index" - ) -> Union["Series", "DataFrame"]: + ) -> FrameOrSeriesUnion: """ return the results for the columns """ - result: Union["Series", "DataFrame"] + result: FrameOrSeriesUnion # we have requested to expand if self.result_type == "expand": diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b855ce65f41b2..260e21b1f2593 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -308,7 +308,7 @@ def _aggregate_multiple_funcs(self, arg): arg = zip(columns, arg) - results: Dict[base.OutputKey, Union[Series, DataFrame]] = {} + results: Dict[base.OutputKey, FrameOrSeriesUnion] = {} for idx, (name, func) in enumerate(arg): obj = self @@ -332,7 +332,7 @@ def _wrap_series_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Optional[Index], - ) -> Union[Series, DataFrame]: + ) -> FrameOrSeriesUnion: """ Wraps the output of a SeriesGroupBy operation into the expected result. @@ -355,7 +355,7 @@ def _wrap_series_output( indexed_output = {key.position: val for key, val in output.items()} columns = Index(key.label for key in output) - result: Union[Series, DataFrame] + result: FrameOrSeriesUnion if len(output) > 1: result = self.obj._constructor_expanddim(indexed_output, index=index) result.columns = columns @@ -373,7 +373,7 @@ def _wrap_aggregated_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Optional[Index], - ) -> Union[Series, DataFrame]: + ) -> FrameOrSeriesUnion: """ Wraps the output of a SeriesGroupBy aggregation into the expected result. 
@@ -1085,7 +1085,7 @@ def blk_func(bvalues: ArrayLike) -> ArrayLike: raise # We get here with a) EADtypes and b) object dtype - obj: Union[Series, DataFrame] + obj: FrameOrSeriesUnion # call our grouper again with only this block if isinstance(bvalues, ExtensionArray): # TODO(EA2D): special case not needed with 2D EAs diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 6678edc3821c8..59ea7781025c4 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -393,7 +393,7 @@ class Grouping: ---------- index : Index grouper : - obj Union[DataFrame, Series]: + obj : DataFrame or Series name : Label level : observed : bool, default False diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 602ff226f8878..f1c5486222ea1 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -6,14 +6,14 @@ import datetime from functools import partial import string -from typing import TYPE_CHECKING, Optional, Tuple, Union +from typing import TYPE_CHECKING, Optional, Tuple import warnings import numpy as np from pandas._libs import Timedelta, hashtable as libhashtable, lib import pandas._libs.join as libjoin -from pandas._typing import ArrayLike, FrameOrSeries +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.errors import MergeError from pandas.util._decorators import Appender, Substitution @@ -51,7 +51,7 @@ from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: - from pandas import DataFrame, Series # noqa:F401 + from pandas import DataFrame # noqa:F401 @Substitution("\nleft : DataFrame") @@ -575,8 +575,8 @@ class _MergeOperation: def __init__( self, - left: Union["Series", "DataFrame"], - right: Union["Series", "DataFrame"], + left: FrameOrSeriesUnion, + right: FrameOrSeriesUnion, how: str = "inner", on=None, left_on=None, diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 
969ac56e41860..842a42f80e1b7 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -12,7 +12,7 @@ import numpy as np -from pandas._typing import Label +from pandas._typing import FrameOrSeriesUnion, Label from pandas.util._decorators import Appender, Substitution from pandas.core.dtypes.cast import maybe_downcast_to_dtype @@ -200,7 +200,7 @@ def pivot_table( def _add_margins( - table: Union["Series", "DataFrame"], + table: FrameOrSeriesUnion, data, values, rows, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0913627324c48..e850a101a0a63 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -16,7 +16,7 @@ from pandas._libs import lib, writers as libwriters from pandas._libs.tslibs import timezones -from pandas._typing import ArrayLike, FrameOrSeries, Label +from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion, Label from pandas.compat._optional import import_optional_dependency from pandas.compat.pickle_compat import patch_pickle from pandas.errors import PerformanceWarning @@ -2566,7 +2566,7 @@ class Fixed: pandas_kind: str format_type: str = "fixed" # GH#30962 needed by dask - obj_type: Type[Union[DataFrame, Series]] + obj_type: Type[FrameOrSeriesUnion] ndim: int encoding: str parent: HDFStore @@ -4442,7 +4442,7 @@ class AppendableFrameTable(AppendableTable): pandas_kind = "frame_table" table_type = "appendable_frame" ndim = 2 - obj_type: Type[Union[DataFrame, Series]] = DataFrame + obj_type: Type[FrameOrSeriesUnion] = DataFrame @property def is_transposed(self) -> bool: From d68f30fdf4dec5e4cdb4b1f74c8abfbf52746301 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 5 Sep 2020 16:03:54 +0100 Subject: [PATCH 0671/1025] TYP: remove string literals for type annotations in pandas\core\frame.py (#36140) --- pandas/core/frame.py | 104 +++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 
1713743b98bff..29d6fb9aa7d56 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -420,7 +420,7 @@ class DataFrame(NDFrame): _typ = "dataframe" @property - def _constructor(self) -> Type["DataFrame"]: + def _constructor(self) -> Type[DataFrame]: return DataFrame _constructor_sliced: Type[Series] = Series @@ -1233,7 +1233,7 @@ def __rmatmul__(self, other): # IO methods (to / from other formats) @classmethod - def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> "DataFrame": + def from_dict(cls, data, orient="columns", dtype=None, columns=None) -> DataFrame: """ Construct DataFrame from dict of array-like or dicts. @@ -1671,7 +1671,7 @@ def from_records( columns=None, coerce_float=False, nrows=None, - ) -> "DataFrame": + ) -> DataFrame: """ Convert structured or record ndarray to DataFrame. @@ -2012,7 +2012,7 @@ def _from_arrays( index, dtype: Optional[Dtype] = None, verify_integrity: bool = True, - ) -> "DataFrame": + ) -> DataFrame: """ Create DataFrame from a list of arrays corresponding to the columns. @@ -2720,7 +2720,7 @@ def memory_usage(self, index=True, deep=False) -> Series: ).append(result) return result - def transpose(self, *args, copy: bool = False) -> "DataFrame": + def transpose(self, *args, copy: bool = False) -> DataFrame: """ Transpose index and columns. @@ -2843,7 +2843,7 @@ def transpose(self, *args, copy: bool = False) -> "DataFrame": return result.__finalize__(self, method="transpose") @property - def T(self) -> "DataFrame": + def T(self) -> DataFrame: return self.transpose() # ---------------------------------------------------------------------- @@ -3503,7 +3503,7 @@ def eval(self, expr, inplace=False, **kwargs): return _eval(expr, inplace=inplace, **kwargs) - def select_dtypes(self, include=None, exclude=None) -> "DataFrame": + def select_dtypes(self, include=None, exclude=None) -> DataFrame: """ Return a subset of the DataFrame's columns based on the column dtypes. 
@@ -3667,7 +3667,7 @@ def insert(self, loc, column, value, allow_duplicates=False) -> None: value = self._sanitize_column(column, value, broadcast=False) self._mgr.insert(loc, column, value, allow_duplicates=allow_duplicates) - def assign(self, **kwargs) -> "DataFrame": + def assign(self, **kwargs) -> DataFrame: r""" Assign new columns to a DataFrame. @@ -3965,7 +3965,7 @@ def _reindex_columns( allow_dups=False, ) - def _reindex_multi(self, axes, copy, fill_value) -> "DataFrame": + def _reindex_multi(self, axes, copy, fill_value) -> DataFrame: """ We are guaranteed non-Nones in the axes. """ @@ -3998,7 +3998,7 @@ def align( limit=None, fill_axis=0, broadcast_axis=None, - ) -> "DataFrame": + ) -> DataFrame: return super().align( other, join=join, @@ -4067,7 +4067,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): ("tolerance", None), ], ) - def reindex(self, *args, **kwargs) -> "DataFrame": + def reindex(self, *args, **kwargs) -> DataFrame: axes = validate_axis_style_args(self, args, kwargs, "labels", "reindex") kwargs.update(axes) # Pop these, since the values are in `kwargs` under different names @@ -4229,7 +4229,7 @@ def rename( inplace: bool = False, level: Optional[Level] = None, errors: str = "ignore", - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: """ Alter axes labels. 
@@ -4357,7 +4357,7 @@ def fillna( inplace=False, limit=None, downcast=None, - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: return super().fillna( value=value, method=method, @@ -4465,7 +4465,7 @@ def _replace_columnwise( return res.__finalize__(self) @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) - def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> "DataFrame": + def shift(self, periods=1, freq=None, axis=0, fill_value=None) -> DataFrame: return super().shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) @@ -4666,7 +4666,7 @@ def reset_index( inplace: bool = False, col_level: Hashable = 0, col_fill: Label = "", - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: """ Reset the index, or a level of it. @@ -4910,20 +4910,20 @@ def _maybe_casted_values(index, labels=None): # Reindex-based selection methods @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isna(self) -> "DataFrame": + def isna(self) -> DataFrame: result = self._constructor(self._data.isna(func=isna)) return result.__finalize__(self, method="isna") @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) - def isnull(self) -> "DataFrame": + def isnull(self) -> DataFrame: return self.isna() @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notna(self) -> "DataFrame": + def notna(self) -> DataFrame: return ~self.isna() @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) - def notnull(self) -> "DataFrame": + def notnull(self) -> DataFrame: return ~self.isna() def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): @@ -5074,7 +5074,7 @@ def drop_duplicates( keep: Union[str, bool] = "first", inplace: bool = False, ignore_index: bool = False, - ) -> Optional["DataFrame"]: + ) -> Optional[DataFrame]: """ Return DataFrame with duplicate rows removed. 
@@ -5168,7 +5168,7 @@ def duplicated( self, subset: Optional[Union[Hashable, Sequence[Hashable]]] = None, keep: Union[str, bool] = "first", - ) -> "Series": + ) -> Series: """ Return boolean Series denoting duplicate rows. @@ -5619,7 +5619,7 @@ def value_counts( return counts - def nlargest(self, n, columns, keep="first") -> "DataFrame": + def nlargest(self, n, columns, keep="first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in descending order. @@ -5728,7 +5728,7 @@ def nlargest(self, n, columns, keep="first") -> "DataFrame": """ return algorithms.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() - def nsmallest(self, n, columns, keep="first") -> "DataFrame": + def nsmallest(self, n, columns, keep="first") -> DataFrame: """ Return the first `n` rows ordered by `columns` in ascending order. @@ -5830,7 +5830,7 @@ def nsmallest(self, n, columns, keep="first") -> "DataFrame": self, n=n, keep=keep, columns=columns ).nsmallest() - def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": + def swaplevel(self, i=-2, j=-1, axis=0) -> DataFrame: """ Swap levels i and j in a MultiIndex on a particular axis. @@ -5861,7 +5861,7 @@ def swaplevel(self, i=-2, j=-1, axis=0) -> "DataFrame": result.columns = result.columns.swaplevel(i, j) return result - def reorder_levels(self, order, axis=0) -> "DataFrame": + def reorder_levels(self, order, axis=0) -> DataFrame: """ Rearrange index levels using input order. May not drop or duplicate levels. 
@@ -5894,7 +5894,7 @@ def reorder_levels(self, order, axis=0) -> "DataFrame": # ---------------------------------------------------------------------- # Arithmetic / combination related - def _combine_frame(self, other: "DataFrame", func, fill_value=None): + def _combine_frame(self, other: DataFrame, func, fill_value=None): # at this point we have `self._indexed_same(other)` if fill_value is None: @@ -5914,7 +5914,7 @@ def _arith_op(left, right): new_data = ops.dispatch_to_series(self, other, _arith_op) return new_data - def _construct_result(self, result) -> "DataFrame": + def _construct_result(self, result) -> DataFrame: """ Wrap the result of an arithmetic, comparison, or logical operation. @@ -6031,11 +6031,11 @@ def _construct_result(self, result) -> "DataFrame": @Appender(_shared_docs["compare"] % _shared_doc_kwargs) def compare( self, - other: "DataFrame", + other: DataFrame, align_axis: Axis = 1, keep_shape: bool = False, keep_equal: bool = False, - ) -> "DataFrame": + ) -> DataFrame: return super().compare( other=other, align_axis=align_axis, @@ -6044,8 +6044,8 @@ def compare( ) def combine( - self, other: "DataFrame", func, fill_value=None, overwrite=True - ) -> "DataFrame": + self, other: DataFrame, func, fill_value=None, overwrite=True + ) -> DataFrame: """ Perform column-wise combine with another DataFrame. @@ -6212,7 +6212,7 @@ def combine( # convert_objects just in case return self._constructor(result, index=new_index, columns=new_columns) - def combine_first(self, other: "DataFrame") -> "DataFrame": + def combine_first(self, other: DataFrame) -> DataFrame: """ Update null elements with value in the same location in `other`. 
@@ -6718,7 +6718,7 @@ def groupby( @Substitution("") @Appender(_shared_docs["pivot"]) - def pivot(self, index=None, columns=None, values=None) -> "DataFrame": + def pivot(self, index=None, columns=None, values=None) -> DataFrame: from pandas.core.reshape.pivot import pivot return pivot(self, index=index, columns=columns, values=values) @@ -6870,7 +6870,7 @@ def pivot_table( dropna=True, margins_name="All", observed=False, - ) -> "DataFrame": + ) -> DataFrame: from pandas.core.reshape.pivot import pivot_table return pivot_table( @@ -7056,7 +7056,7 @@ def stack(self, level=-1, dropna=True): def explode( self, column: Union[str, Tuple], ignore_index: bool = False - ) -> "DataFrame": + ) -> DataFrame: """ Transform each element of a list-like to a row, replicating index values. @@ -7211,7 +7211,7 @@ def melt( value_name="value", col_level=None, ignore_index=True, - ) -> "DataFrame": + ) -> DataFrame: return melt( self, @@ -7299,7 +7299,7 @@ def melt( 1 255.0""" ), ) - def diff(self, periods: int = 1, axis: Axis = 0) -> "DataFrame": + def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: bm_axis = self._get_block_manager_axis(axis) self._consolidate_inplace() @@ -7462,7 +7462,7 @@ def _aggregate(self, arg, axis=0, *args, **kwargs): klass=_shared_doc_kwargs["klass"], axis=_shared_doc_kwargs["axis"], ) - def transform(self, func, axis=0, *args, **kwargs) -> "DataFrame": + def transform(self, func, axis=0, *args, **kwargs) -> DataFrame: axis = self._get_axis_number(axis) if axis == 1: return self.T.transform(func, *args, **kwargs).T @@ -7616,7 +7616,7 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): ) return op.get_result() - def applymap(self, func) -> "DataFrame": + def applymap(self, func) -> DataFrame: """ Apply a function to a Dataframe elementwise. 
@@ -7678,7 +7678,7 @@ def infer(x): def append( self, other, ignore_index=False, verify_integrity=False, sort=False - ) -> "DataFrame": + ) -> DataFrame: """ Append rows of `other` to the end of caller, returning a new object. @@ -7818,7 +7818,7 @@ def append( def join( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False - ) -> "DataFrame": + ) -> DataFrame: """ Join columns of another DataFrame. @@ -8009,7 +8009,7 @@ def merge( copy=True, indicator=False, validate=None, - ) -> "DataFrame": + ) -> DataFrame: from pandas.core.reshape.merge import merge return merge( @@ -8028,7 +8028,7 @@ def merge( validate=validate, ) - def round(self, decimals=0, *args, **kwargs) -> "DataFrame": + def round(self, decimals=0, *args, **kwargs) -> DataFrame: """ Round a DataFrame to a variable number of decimal places. @@ -8142,7 +8142,7 @@ def _series_round(s, decimals): # ---------------------------------------------------------------------- # Statistical methods, etc. - def corr(self, method="pearson", min_periods=1) -> "DataFrame": + def corr(self, method="pearson", min_periods=1) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. @@ -8233,7 +8233,7 @@ def corr(self, method="pearson", min_periods=1) -> "DataFrame": def cov( self, min_periods: Optional[int] = None, ddof: Optional[int] = 1 - ) -> "DataFrame": + ) -> DataFrame: """ Compute pairwise covariance of columns, excluding NA/null values. 
@@ -8636,7 +8636,7 @@ def func(values): else: return op(values, axis=axis, skipna=skipna, **kwds) - def _get_data(axis_matters: bool) -> "DataFrame": + def _get_data(axis_matters: bool) -> DataFrame: if filter_type is None: data = self._get_numeric_data() elif filter_type == "bool": @@ -8937,7 +8937,7 @@ def _get_agg_axis(self, axis_num: int) -> Index: else: raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") - def mode(self, axis=0, numeric_only=False, dropna=True) -> "DataFrame": + def mode(self, axis=0, numeric_only=False, dropna=True) -> DataFrame: """ Get the mode(s) of each element along the selected axis. @@ -9122,7 +9122,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): def to_timestamp( self, freq=None, how: str = "start", axis: Axis = 0, copy: bool = True - ) -> "DataFrame": + ) -> DataFrame: """ Cast to DatetimeIndex of timestamps, at *beginning* of period. @@ -9151,7 +9151,7 @@ def to_timestamp( setattr(new_obj, axis_name, new_ax) return new_obj - def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> "DataFrame": + def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> DataFrame: """ Convert DataFrame from DatetimeIndex to PeriodIndex. @@ -9180,7 +9180,7 @@ def to_period(self, freq=None, axis: Axis = 0, copy: bool = True) -> "DataFrame" setattr(new_obj, axis_name, new_ax) return new_obj - def isin(self, values) -> "DataFrame": + def isin(self, values) -> DataFrame: """ Whether each element in the DataFrame is contained in values. @@ -9287,10 +9287,10 @@ def isin(self, values) -> "DataFrame": _info_axis_number = 1 _info_axis_name = "columns" - index: "Index" = properties.AxisProperty( + index: Index = properties.AxisProperty( axis=1, doc="The index (row labels) of the DataFrame." ) - columns: "Index" = properties.AxisProperty( + columns: Index = properties.AxisProperty( axis=0, doc="The column labels of the DataFrame." 
) From 5b1a063d269d93e68e80587294c8bf60d1d8e9c6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 5 Sep 2020 10:53:21 -0700 Subject: [PATCH 0672/1025] STY+CI: check for private function access across modules (#36144) --- Makefile | 7 +++ ci/code_checks.sh | 8 ++++ pandas/_libs/algos.pyx | 14 +++--- pandas/core/algorithms.py | 8 ++-- pandas/core/arrays/sparse/array.py | 2 +- pandas/core/internals/blocks.py | 2 +- pandas/core/missing.py | 2 +- pandas/plotting/_matplotlib/compat.py | 10 ++-- pandas/plotting/_matplotlib/core.py | 4 +- pandas/plotting/_matplotlib/tools.py | 2 +- pandas/tests/plotting/common.py | 8 ++-- pandas/tests/plotting/test_frame.py | 4 +- pandas/tests/plotting/test_misc.py | 4 +- scripts/validate_unwanted_patterns.py | 69 +++++++++++++++++++++++++-- 14 files changed, 111 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index f26689ab65ba5..4a9a48992f92f 100644 --- a/Makefile +++ b/Makefile @@ -25,3 +25,10 @@ doc: cd doc; \ python make.py clean; \ python make.py html + +check: + python3 scripts/validate_unwanted_patterns.py \ + --validation-type="private_function_across_module" \ + --included-file-extensions="py" \ + --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored \ + pandas/ diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 8ee579cd25203..875f1dbb83ce3 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -116,6 +116,14 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then fi RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of private module attribute access' ; echo $MSG + if [[ "$GITHUB_ACTIONS" == "true" ]]; then + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" --included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored --format="##[error]{source_path}:{line_number}:{msg}" pandas/ + else + $BASE_DIR/scripts/validate_unwanted_patterns.py --validation-type="private_function_across_module" 
--included-file-extensions="py" --excluded-file-paths=pandas/tests,asv_bench/,pandas/_vendored pandas/ + fi + RET=$(($RET + $?)) ; echo $MSG "DONE" + echo "isort --version-number" isort --version-number diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 0a70afda893cf..c4723a5f064c7 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -412,7 +412,7 @@ ctypedef fused algos_t: uint8_t -def _validate_limit(nobs: int, limit=None) -> int: +def validate_limit(nobs: int, limit=None) -> int: """ Check that the `limit` argument is a positive integer. @@ -452,7 +452,7 @@ def pad(ndarray[algos_t] old, ndarray[algos_t] new, limit=None): indexer = np.empty(nright, dtype=np.int64) indexer[:] = -1 - lim = _validate_limit(nright, limit) + lim = validate_limit(nright, limit) if nleft == 0 or nright == 0 or new[nright - 1] < old[0]: return indexer @@ -509,7 +509,7 @@ def pad_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) val = values[0] for i in range(N): @@ -537,7 +537,7 @@ def pad_2d_inplace(algos_t[:, :] values, const uint8_t[:, :] mask, limit=None): if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) for j in range(K): fill_count = 0 @@ -593,7 +593,7 @@ def backfill(ndarray[algos_t] old, ndarray[algos_t] new, limit=None) -> ndarray: indexer = np.empty(nright, dtype=np.int64) indexer[:] = -1 - lim = _validate_limit(nright, limit) + lim = validate_limit(nright, limit) if nleft == 0 or nright == 0 or new[0] > old[nleft - 1]: return indexer @@ -651,7 +651,7 @@ def backfill_inplace(algos_t[:] values, const uint8_t[:] mask, limit=None): if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) val = values[N - 1] for i in range(N - 1, -1, -1): @@ -681,7 +681,7 @@ def backfill_2d_inplace(algos_t[:, :] values, if N == 0: return - lim = _validate_limit(N, limit) + lim = validate_limit(N, limit) for j 
in range(K): fill_count = 0 diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f297c7165208f..50ec3714f454b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -60,7 +60,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import Categorical, DataFrame, Series + from pandas import Categorical, DataFrame, Series # noqa:F401 _shared_docs: Dict[str, str] = {} @@ -767,7 +767,7 @@ def value_counts( counts = result._values else: - keys, counts = _value_counts_arraylike(values, dropna) + keys, counts = value_counts_arraylike(values, dropna) result = Series(counts, index=keys, name=name) @@ -780,8 +780,8 @@ def value_counts( return result -# Called once from SparseArray -def _value_counts_arraylike(values, dropna: bool): +# Called once from SparseArray, otherwise could be private +def value_counts_arraylike(values, dropna: bool): """ Parameters ---------- diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 1531f7b292365..47c960dc969d6 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -735,7 +735,7 @@ def value_counts(self, dropna=True): """ from pandas import Index, Series - keys, counts = algos._value_counts_arraylike(self.sp_values, dropna=dropna) + keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna) fcounts = self.sp_index.ngaps if fcounts > 0: if self._null_fill_value and dropna: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3bcd4debbf41a..9f4e535dc787d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -390,7 +390,7 @@ def fillna( mask = isna(self.values) if limit is not None: - limit = libalgos._validate_limit(None, limit=limit) + limit = libalgos.validate_limit(None, limit=limit) mask[mask.cumsum(self.ndim - 1) > limit] = False if not self._can_hold_na: diff --git a/pandas/core/missing.py b/pandas/core/missing.py 
index 7802c5cbdbfb3..be66b19d10064 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -228,7 +228,7 @@ def interpolate_1d( ) # default limit is unlimited GH #16282 - limit = algos._validate_limit(nobs=None, limit=limit) + limit = algos.validate_limit(nobs=None, limit=limit) # These are sets of index pointers to invalid values... i.e. {0, 1, etc... all_nans = set(np.flatnonzero(invalid)) diff --git a/pandas/plotting/_matplotlib/compat.py b/pandas/plotting/_matplotlib/compat.py index 7f107f18eca25..964596d9b6319 100644 --- a/pandas/plotting/_matplotlib/compat.py +++ b/pandas/plotting/_matplotlib/compat.py @@ -17,8 +17,8 @@ def inner(): return inner -_mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge) -_mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) -_mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) -_mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge) -_mpl_ge_3_3_0 = _mpl_version("3.3.0", operator.ge) +mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge) +mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge) +mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge) +mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge) +mpl_ge_3_3_0 = _mpl_version("3.3.0", operator.ge) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index def4a1dc3f5c4..8275c0991e464 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -29,7 +29,7 @@ import pandas.core.common as com from pandas.io.formats.printing import pprint_thing -from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 +from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0 from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters from pandas.plotting._matplotlib.style import get_standard_colors from pandas.plotting._matplotlib.timeseries import ( @@ -944,7 +944,7 @@ def _plot_colorbar(self, ax: "Axes", **kwds): img = ax.collections[-1] cbar = self.fig.colorbar(img, ax=ax, **kwds) - if _mpl_ge_3_0_0(): + if 
mpl_ge_3_0_0(): # The workaround below is no longer necessary. return diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 98aaab6838fba..c5b44f37150bb 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -307,7 +307,7 @@ def handle_shared_axes( sharey: bool, ): if nplots > 1: - if compat._mpl_ge_3_2_0(): + if compat.mpl_ge_3_2_0(): row_num = lambda x: x.get_subplotspec().rowspan.start col_num = lambda x: x.get_subplotspec().colspan.start else: diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py index b753c96af6290..9301a29933d45 100644 --- a/pandas/tests/plotting/common.py +++ b/pandas/tests/plotting/common.py @@ -28,10 +28,10 @@ def setup_method(self, method): mpl.rcdefaults() - self.mpl_ge_2_2_3 = compat._mpl_ge_2_2_3() - self.mpl_ge_3_0_0 = compat._mpl_ge_3_0_0() - self.mpl_ge_3_1_0 = compat._mpl_ge_3_1_0() - self.mpl_ge_3_2_0 = compat._mpl_ge_3_2_0() + self.mpl_ge_2_2_3 = compat.mpl_ge_2_2_3() + self.mpl_ge_3_0_0 = compat.mpl_ge_3_0_0() + self.mpl_ge_3_1_0 = compat.mpl_ge_3_1_0() + self.mpl_ge_3_2_0 = compat.mpl_ge_3_2_0() self.bp_n_objects = 7 self.polycollection_factor = 2 diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 3b3902647390d..d2b22c7a4c2e3 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -51,7 +51,7 @@ def _assert_xtickslabels_visibility(self, axes, expected): @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True) @pytest.mark.slow def test_plot(self): - from pandas.plotting._matplotlib.compat import _mpl_ge_3_1_0 + from pandas.plotting._matplotlib.compat import mpl_ge_3_1_0 df = self.tdf _check_plot_works(df.plot, grid=False) @@ -69,7 +69,7 @@ def test_plot(self): self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) df = DataFrame({"x": [1, 2], "y": [3, 4]}) - if _mpl_ge_3_1_0(): + if mpl_ge_3_1_0(): msg = "'Line2D' object has no property 
'blarg'" else: msg = "Unknown property blarg" diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 130acaa8bcd58..0208ab3e0225b 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -96,7 +96,7 @@ def test_bootstrap_plot(self): class TestDataFramePlots(TestPlotBase): @td.skip_if_no_scipy def test_scatter_matrix_axis(self): - from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0 + from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0 scatter_matrix = plotting.scatter_matrix @@ -105,7 +105,7 @@ def test_scatter_matrix_axis(self): # we are plotting multiples on a sub-plot with tm.assert_produces_warning( - UserWarning, raise_on_extra_warnings=_mpl_ge_3_0_0() + UserWarning, raise_on_extra_warnings=mpl_ge_3_0_0() ): axes = _check_plot_works( scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1 diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 193fef026a96b..1a6d8cc8b9914 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -16,9 +16,7 @@ import sys import token import tokenize -from typing import IO, Callable, FrozenSet, Iterable, List, Tuple - -PATHS_TO_IGNORE: Tuple[str, ...] = ("asv_bench/env",) +from typing import IO, Callable, FrozenSet, Iterable, List, Set, Tuple def _get_literal_string_prefix_len(token_string: str) -> int: @@ -114,6 +112,58 @@ def bare_pytest_raises(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: ) +PRIVATE_FUNCTIONS_ALLOWED = {"sys._getframe"} # no known alternative + + +def private_function_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: + """ + Checking that a private function is not used across modules. + Parameters + ---------- + file_obj : IO + File-like object containing the Python code to validate. + Yields + ------ + line_number : int + Line number of the private function that is used across modules. 
+ msg : str + Explanation of the error. + """ + contents = file_obj.read() + tree = ast.parse(contents) + + imported_modules: Set[str] = set() + + for node in ast.walk(tree): + if isinstance(node, (ast.Import, ast.ImportFrom)): + for module in node.names: + module_fqdn = module.name if module.asname is None else module.asname + imported_modules.add(module_fqdn) + + if not isinstance(node, ast.Call): + continue + + try: + module_name = node.func.value.id + function_name = node.func.attr + except AttributeError: + continue + + # Exception section # + + # (Debatable) Class case + if module_name[0].isupper(): + continue + # (Debatable) Dunder methods case + elif function_name.startswith("__") and function_name.endswith("__"): + continue + elif module_name + "." + function_name in PRIVATE_FUNCTIONS_ALLOWED: + continue + + if module_name in imported_modules and function_name.startswith("_"): + yield (node.lineno, f"Private function '{module_name}.{function_name}'") + + def strings_to_concatenate(file_obj: IO[str]) -> Iterable[Tuple[int, str]]: """ This test case is necessary after 'Black' (https://github.com/psf/black), @@ -293,6 +343,7 @@ def main( source_path: str, output_format: str, file_extensions_to_check: str, + excluded_file_paths: str, ) -> bool: """ Main entry point of the script. @@ -305,6 +356,10 @@ def main( Source path representing path to a file/directory. output_format : str Output format of the error message. + file_extensions_to_check : str + Comma separated values of what file extensions to check. + excluded_file_paths : str + Comma separated values of what file paths to exclude during the check. 
Returns ------- @@ -325,6 +380,7 @@ def main( FILE_EXTENSIONS_TO_CHECK: FrozenSet[str] = frozenset( file_extensions_to_check.split(",") ) + PATHS_TO_IGNORE = frozenset(excluded_file_paths.split(",")) if os.path.isfile(source_path): file_path = source_path @@ -362,6 +418,7 @@ def main( if __name__ == "__main__": available_validation_types: List[str] = [ "bare_pytest_raises", + "private_function_across_module", "strings_to_concatenate", "strings_with_wrong_placed_whitespace", ] @@ -389,6 +446,11 @@ def main( default="py,pyx,pxd,pxi", help="Coma seperated file extensions to check.", ) + parser.add_argument( + "--excluded-file-paths", + default="asv_bench/env", + help="Comma separated file extensions to check.", + ) args = parser.parse_args() @@ -398,5 +460,6 @@ def main( source_path=args.path, output_format=args.format, file_extensions_to_check=args.included_file_extensions, + excluded_file_paths=args.excluded_file_paths, ) ) From 0e632f21f04f9c21e152e712bdd0b711064d2e99 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 5 Sep 2020 10:54:30 -0700 Subject: [PATCH 0673/1025] CLN: unused case in compare_or_regex_search (#36143) --- pandas/core/array_algos/replace.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 6ac3cc1f9f2fe..09f9aefd64096 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -3,7 +3,7 @@ """ import operator import re -from typing import Optional, Pattern, Union +from typing import Pattern, Union import numpy as np @@ -14,14 +14,10 @@ is_numeric_v_string_like, is_scalar, ) -from pandas.core.dtypes.missing import isna def compare_or_regex_search( - a: ArrayLike, - b: Union[Scalar, Pattern], - regex: bool = False, - mask: Optional[ArrayLike] = None, + a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike, ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or 
two scalar values @@ -33,8 +29,8 @@ def compare_or_regex_search( ---------- a : array_like b : scalar or regex pattern - regex : bool, default False - mask : array_like or None (default) + regex : bool + mask : array_like Returns ------- @@ -68,8 +64,6 @@ def _check_comparison_types( ) # GH#32621 use mask to avoid comparing to NAs - if mask is None and isinstance(a, np.ndarray) and not isinstance(b, np.ndarray): - mask = np.reshape(~(isna(a)), a.shape) if isinstance(a, np.ndarray): a = a[mask] From b8bf25da50458a33d9e3358ecc135fbeabaed04d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 5 Sep 2020 12:40:06 -0700 Subject: [PATCH 0674/1025] REF: window/test_dtypes.py with pytest idioms (#35918) --- pandas/tests/window/conftest.py | 31 +++ pandas/tests/window/test_dtypes.py | 315 ++++++++--------------------- 2 files changed, 118 insertions(+), 228 deletions(-) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index eb8252d5731be..7f03fa2a5ea0d 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -308,3 +308,34 @@ def which(request): def halflife_with_times(request): """Halflife argument for EWM when times is specified.""" return request.param + + +@pytest.fixture( + params=[ + "object", + "category", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float16", + "float32", + "float64", + "m8[ns]", + "M8[ns]", + pytest.param( + "datetime64[ns, UTC]", + marks=pytest.mark.skip( + "direct creation of extension dtype datetime64[ns, UTC] " + "is not supported ATM" + ), + ), + ] +) +def dtypes(request): + """Dtypes for window tests""" + return request.param diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 0aa5bf019ff5e..245b48b351684 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -1,5 +1,3 @@ -from itertools import product - import numpy as np import pytest @@ -10,234 +8,95 @@ 
# gh-12373 : rolling functions error on float32 data # make sure rolling functions works for different dtypes # -# NOTE that these are yielded tests and so _create_data -# is explicitly called. -# # further note that we are only checking rolling for fully dtype # compliance (though both expanding and ewm inherit) -class Dtype: - window = 2 - - funcs = { - "count": lambda v: v.count(), - "max": lambda v: v.max(), - "min": lambda v: v.min(), - "sum": lambda v: v.sum(), - "mean": lambda v: v.mean(), - "std": lambda v: v.std(), - "var": lambda v: v.var(), - "median": lambda v: v.median(), - } - - def get_expects(self): - expects = { - "sr1": { - "count": Series([1, 2, 2, 2, 2], dtype="float64"), - "max": Series([np.nan, 1, 2, 3, 4], dtype="float64"), - "min": Series([np.nan, 0, 1, 2, 3], dtype="float64"), - "sum": Series([np.nan, 1, 3, 5, 7], dtype="float64"), - "mean": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), - "std": Series([np.nan] + [np.sqrt(0.5)] * 4, dtype="float64"), - "var": Series([np.nan, 0.5, 0.5, 0.5, 0.5], dtype="float64"), - "median": Series([np.nan, 0.5, 1.5, 2.5, 3.5], dtype="float64"), +def get_dtype(dtype, coerce_int=None): + if coerce_int is False and "int" in dtype: + return None + if dtype != "category": + return np.dtype(dtype) + return dtype + + +@pytest.mark.parametrize( + "method, data, expected_data, coerce_int", + [ + ("count", np.arange(5), [1, 2, 2, 2, 2], True), + ("count", np.arange(10, 0, -2), [1, 2, 2, 2, 2], True), + ("count", [0, 1, 2, np.nan, 4], [1, 2, 2, 1, 1], False), + ("max", np.arange(5), [np.nan, 1, 2, 3, 4], True), + ("max", np.arange(10, 0, -2), [np.nan, 10, 8, 6, 4], True), + ("max", [0, 1, 2, np.nan, 4], [np.nan, 1, 2, np.nan, np.nan], False), + ("min", np.arange(5), [np.nan, 0, 1, 2, 3], True), + ("min", np.arange(10, 0, -2), [np.nan, 8, 6, 4, 2], True), + ("min", [0, 1, 2, np.nan, 4], [np.nan, 0, 1, np.nan, np.nan], False), + ("sum", np.arange(5), [np.nan, 1, 3, 5, 7], True), + ("sum", np.arange(10, 0, -2), 
[np.nan, 18, 14, 10, 6], True), + ("sum", [0, 1, 2, np.nan, 4], [np.nan, 1, 3, np.nan, np.nan], False), + ("mean", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True), + ("mean", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True), + ("mean", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False), + ("std", np.arange(5), [np.nan] + [np.sqrt(0.5)] * 4, True), + ("std", np.arange(10, 0, -2), [np.nan] + [np.sqrt(2)] * 4, True), + ( + "std", + [0, 1, 2, np.nan, 4], + [np.nan] + [np.sqrt(0.5)] * 2 + [np.nan] * 2, + False, + ), + ("var", np.arange(5), [np.nan, 0.5, 0.5, 0.5, 0.5], True), + ("var", np.arange(10, 0, -2), [np.nan, 2, 2, 2, 2], True), + ("var", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 0.5, np.nan, np.nan], False), + ("median", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True), + ("median", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True), + ("median", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False), + ], +) +def test_series_dtypes(method, data, expected_data, coerce_int, dtypes): + s = Series(data, dtype=get_dtype(dtypes, coerce_int=coerce_int)) + if dtypes in ("m8[ns]", "M8[ns]") and method != "count": + msg = "No numeric types to aggregate" + with pytest.raises(DataError, match=msg): + getattr(s.rolling(2), method)() + else: + result = getattr(s.rolling(2), method)() + expected = Series(expected_data, dtype="float64") + tm.assert_almost_equal(result, expected) + + +@pytest.mark.parametrize( + "method, expected_data", + [ + ("count", {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}), + ("max", {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}), + ("min", {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}), + ( + "sum", + {0: Series([np.nan, 2, 6, 10, 14]), 1: Series([np.nan, 4, 8, 12, 16])}, + ), + ("mean", {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}), + ( + "std", + { + 0: Series([np.nan] + [np.sqrt(2)] * 4), + 1: Series([np.nan] + [np.sqrt(2)] * 4), }, - "sr2": { - "count": 
Series([1, 2, 2, 2, 2], dtype="float64"), - "max": Series([np.nan, 10, 8, 6, 4], dtype="float64"), - "min": Series([np.nan, 8, 6, 4, 2], dtype="float64"), - "sum": Series([np.nan, 18, 14, 10, 6], dtype="float64"), - "mean": Series([np.nan, 9, 7, 5, 3], dtype="float64"), - "std": Series([np.nan] + [np.sqrt(2)] * 4, dtype="float64"), - "var": Series([np.nan, 2, 2, 2, 2], dtype="float64"), - "median": Series([np.nan, 9, 7, 5, 3], dtype="float64"), - }, - "sr3": { - "count": Series([1, 2, 2, 1, 1], dtype="float64"), - "max": Series([np.nan, 1, 2, np.nan, np.nan], dtype="float64"), - "min": Series([np.nan, 0, 1, np.nan, np.nan], dtype="float64"), - "sum": Series([np.nan, 1, 3, np.nan, np.nan], dtype="float64"), - "mean": Series([np.nan, 0.5, 1.5, np.nan, np.nan], dtype="float64"), - "std": Series( - [np.nan] + [np.sqrt(0.5)] * 2 + [np.nan] * 2, dtype="float64" - ), - "var": Series([np.nan, 0.5, 0.5, np.nan, np.nan], dtype="float64"), - "median": Series([np.nan, 0.5, 1.5, np.nan, np.nan], dtype="float64"), - }, - "df": { - "count": DataFrame( - {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, - dtype="float64", - ), - "max": DataFrame( - {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}, - dtype="float64", - ), - "min": DataFrame( - {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}, - dtype="float64", - ), - "sum": DataFrame( - { - 0: Series([np.nan, 2, 6, 10, 14]), - 1: Series([np.nan, 4, 8, 12, 16]), - }, - dtype="float64", - ), - "mean": DataFrame( - {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, - dtype="float64", - ), - "std": DataFrame( - { - 0: Series([np.nan] + [np.sqrt(2)] * 4), - 1: Series([np.nan] + [np.sqrt(2)] * 4), - }, - dtype="float64", - ), - "var": DataFrame( - {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, - dtype="float64", - ), - "median": DataFrame( - {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, - dtype="float64", - ), - }, - } - return 
expects - - def _create_dtype_data(self, dtype): - sr1 = Series(np.arange(5), dtype=dtype) - sr2 = Series(np.arange(10, 0, -2), dtype=dtype) - sr3 = sr1.copy() - sr3[3] = np.NaN - df = DataFrame(np.arange(10).reshape((5, 2)), dtype=dtype) - - data = {"sr1": sr1, "sr2": sr2, "sr3": sr3, "df": df} - - return data - - def _create_data(self): - self.data = self._create_dtype_data(self.dtype) - self.expects = self.get_expects() - - def test_dtypes(self): - self._create_data() - for f_name, d_name in product(self.funcs.keys(), self.data.keys()): - - f = self.funcs[f_name] - d = self.data[d_name] - exp = self.expects[d_name][f_name] - self.check_dtypes(f, f_name, d, d_name, exp) - - def check_dtypes(self, f, f_name, d, d_name, exp): - roll = d.rolling(window=self.window) - result = f(roll) - tm.assert_almost_equal(result, exp) - - -class TestDtype_object(Dtype): - dtype = object - - -class Dtype_integer(Dtype): - pass - - -class TestDtype_int8(Dtype_integer): - dtype = np.int8 - - -class TestDtype_int16(Dtype_integer): - dtype = np.int16 - - -class TestDtype_int32(Dtype_integer): - dtype = np.int32 - - -class TestDtype_int64(Dtype_integer): - dtype = np.int64 - - -class Dtype_uinteger(Dtype): - pass - - -class TestDtype_uint8(Dtype_uinteger): - dtype = np.uint8 - - -class TestDtype_uint16(Dtype_uinteger): - dtype = np.uint16 - - -class TestDtype_uint32(Dtype_uinteger): - dtype = np.uint32 - - -class TestDtype_uint64(Dtype_uinteger): - dtype = np.uint64 - - -class Dtype_float(Dtype): - pass - - -class TestDtype_float16(Dtype_float): - dtype = np.float16 - - -class TestDtype_float32(Dtype_float): - dtype = np.float32 - - -class TestDtype_float64(Dtype_float): - dtype = np.float64 - - -class TestDtype_category(Dtype): - dtype = "category" - include_df = False - - def _create_dtype_data(self, dtype): - sr1 = Series(range(5), dtype=dtype) - sr2 = Series(range(10, 0, -2), dtype=dtype) - - data = {"sr1": sr1, "sr2": sr2} - - return data - - -class DatetimeLike(Dtype): - def 
check_dtypes(self, f, f_name, d, d_name, exp): - - roll = d.rolling(window=self.window) - if f_name == "count": - result = f(roll) - tm.assert_almost_equal(result, exp) - - else: - msg = "No numeric types to aggregate" - with pytest.raises(DataError, match=msg): - f(roll) - - -class TestDtype_timedelta(DatetimeLike): - dtype = np.dtype("m8[ns]") - - -class TestDtype_datetime(DatetimeLike): - dtype = np.dtype("M8[ns]") - - -class TestDtype_datetime64UTC(DatetimeLike): - dtype = "datetime64[ns, UTC]" - - def _create_data(self): - pytest.skip( - "direct creation of extension dtype " - "datetime64[ns, UTC] is not supported ATM" - ) + ), + ("var", {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}), + ("median", {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}), + ], +) +def test_dataframe_dtypes(method, expected_data, dtypes): + if dtypes == "category": + pytest.skip("Category dataframe testing not implemented.") + df = DataFrame(np.arange(10).reshape((5, 2)), dtype=get_dtype(dtypes)) + if dtypes in ("m8[ns]", "M8[ns]") and method != "count": + msg = "No numeric types to aggregate" + with pytest.raises(DataError, match=msg): + getattr(df.rolling(2), method)() + else: + result = getattr(df.rolling(2), method)() + expected = DataFrame(expected_data, dtype="float64") + tm.assert_frame_equal(result, expected) From 2a0e640ab07a6df684080bbf8616f8aca17eefc4 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Sat, 5 Sep 2020 14:50:43 -0500 Subject: [PATCH 0675/1025] DOC: add userwarning doc about mpl #35684 (#36145) --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b1229a5d5823d..d7d2e3cf876ca 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -301,7 +301,7 @@ Plotting ^^^^^^^^ - Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` 
(:issue:`21003`) -- +- :meth:`DataFrame.plot` and :meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and FixedLocator (:issue:`35684` and :issue:`35945`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ From 90d55ffcb71e7c3a00b068a49f9c53fc3642fd08 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 5 Sep 2020 12:55:36 -0700 Subject: [PATCH 0676/1025] BUG: item_cache invalidation in get_numeric_data (#35882) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/internals/managers.py | 1 - pandas/tests/frame/methods/test_cov_corr.py | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index d1a66256454ca..6935a64c7572f 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -36,6 +36,7 @@ Bug fixes - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) - Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) +- Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 57a4a8c2ace8a..13bc6a2e82195 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -691,7 +691,6 @@ def get_numeric_data(self, copy: bool = False) -> "BlockManager": copy : bool, default False Whether to copy the blocks """ - self._consolidate_inplace() return self._combine([b for b in self.blocks if b.is_numeric], copy) def _combine(self: T, blocks: List[Block], copy: bool = True) -> T: diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index d3548b639572d..f307acd8c2178 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -191,6 +191,23 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) + def test_corr_item_cache(self): + # Check that corr does not lead to incorrect entries in item_cache + + df = pd.DataFrame({"A": range(10)}) + df["B"] = range(10)[::-1] + + ser = df["A"] # populate item_cache + assert len(df._mgr.blocks) == 2 + + _ = df.corr() + + # Check that the corr didnt break link between ser and df + ser.values[0] = 99 + assert df.loc[0, "A"] == 99 + assert df["A"] is ser + assert df.values[0, 0] == 99 + class TestDataFrameCorrWith: def test_corrwith(self, datetime_frame): From c5afc12e1cb07fcf05b49f96326aa6b131ac03c7 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 5 Sep 2020 17:18:51 -0400 Subject: [PATCH 0677/1025] Make MultiIndex.get_loc raise for unhashable type (#35914) Co-authored-by: Jeff Reback --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/indexes/multi.py | 5 +++-- pandas/tests/frame/indexing/test_indexing.py | 2 +- 
pandas/tests/indexing/multiindex/test_multiindex.py | 8 ++++++++ pandas/tests/series/indexing/test_setitem.py | 11 ++++++++++- 5 files changed, 23 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 6935a64c7572f..c6cfcc6730112 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -17,6 +17,7 @@ Fixed regressions - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) +- Regression where :meth:`MultiIndex.get_loc` would return a slice spanning the full index when passed an empty list (:issue:`35878`) - Fix regression in invalid cache after an indexing operation; this can manifest when setting which does not update the data (:issue:`35521`) - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f66b009e6d505..080ece8547479 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2725,6 +2725,8 @@ def get_loc(self, key, method=None): "currently supported for MultiIndex" ) + hash(key) + def _maybe_to_slice(loc): """convert integer indexer to boolean mask or slice if possible""" if not isinstance(loc, np.ndarray) or loc.dtype != "int64": @@ -2739,8 +2741,7 @@ def _maybe_to_slice(loc): mask[loc] = True return mask - if not isinstance(key, (tuple, list)): - # not including list here breaks some indexing, xref #30892 + if not isinstance(key, tuple): loc = self._get_level_indexer(key, level=0) return 
_maybe_to_slice(loc) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index d27487dfb8aaa..e4549dfb3e68d 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -2111,7 +2111,7 @@ def test_type_error_multiindex(self): ) dg = df.pivot_table(index="i", columns="c", values=["x", "y"]) - with pytest.raises(TypeError, match="is an invalid key"): + with pytest.raises(TypeError, match="unhashable type"): dg[:, 0] index = Index(range(2), name="i") diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 5e5fcd3db88d8..4565d79c632de 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas._libs.index as _index from pandas.errors import PerformanceWarning @@ -83,3 +84,10 @@ def test_nested_tuples_duplicates(self): df3 = df.copy(deep=True) df3.loc[[(dti[0], "a")], "c2"] = 1.0 tm.assert_frame_equal(df3, expected) + + def test_multiindex_get_loc_list_raises(self): + # https://github.com/pandas-dev/pandas/issues/35878 + idx = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)]) + msg = "unhashable type" + with pytest.raises(TypeError, match=msg): + idx.get_loc([]) diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 3463de25ad91b..593d1c78a19e2 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -1,6 +1,7 @@ import numpy as np -from pandas import NaT, Series, date_range +from pandas import MultiIndex, NaT, Series, date_range +import pandas.testing as tm class TestSetitemDT64Values: @@ -17,3 +18,11 @@ def test_setitem_none_nan(self): series[5:7] = np.nan assert series[6] is NaT + + def test_setitem_multiindex_empty_slice(self): + # 
https://github.com/pandas-dev/pandas/issues/35878 + idx = MultiIndex.from_tuples([("a", 1), ("b", 2)]) + result = Series([1, 2], index=idx) + expected = result.copy() + result.loc[[]] = 0 + tm.assert_series_equal(result, expected) From e4ede3414b920ab2c674162a9c646436930236f2 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 5 Sep 2020 18:36:42 -0400 Subject: [PATCH 0678/1025] ENH: Make explode work for sets (#35637) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/reshape.pyx | 6 ++++-- pandas/core/frame.py | 7 ++++--- pandas/core/series.py | 7 ++++--- pandas/tests/frame/methods/test_explode.py | 8 ++++++++ pandas/tests/series/methods/test_explode.py | 8 ++++++++ 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d7d2e3cf876ca..ff9e803b4990a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -103,7 +103,7 @@ Other enhancements - Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a ``Series`` or ``DataFrame`` (:issue:`28394`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) -- +- :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - .. 
_whatsnew_120.api_breaking.python: diff --git a/pandas/_libs/reshape.pyx b/pandas/_libs/reshape.pyx index 5c6c15fb50fed..75dbb4b74aabd 100644 --- a/pandas/_libs/reshape.pyx +++ b/pandas/_libs/reshape.pyx @@ -124,7 +124,8 @@ def explode(ndarray[object] values): counts = np.zeros(n, dtype='int64') for i in range(n): v = values[i] - if c_is_list_like(v, False): + + if c_is_list_like(v, True): if len(v): counts[i] += len(v) else: @@ -138,8 +139,9 @@ def explode(ndarray[object] values): for i in range(n): v = values[i] - if c_is_list_like(v, False): + if c_is_list_like(v, True): if len(v): + v = list(v) for j in range(len(v)): result[count] = v[j] count += 1 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 29d6fb9aa7d56..150d6e24dbb86 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7091,10 +7091,11 @@ def explode( Notes ----- - This routine will explode list-likes including lists, tuples, + This routine will explode list-likes including lists, tuples, sets, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged. Empty list-likes will - result in a np.nan for that row. + be object. Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of rows in the + output will be non-deterministic when exploding sets. Examples -------- diff --git a/pandas/core/series.py b/pandas/core/series.py index d8fdaa2a60252..6cbd93135a2ca 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3829,10 +3829,11 @@ def explode(self, ignore_index: bool = False) -> "Series": Notes ----- - This routine will explode list-likes including lists, tuples, + This routine will explode list-likes including lists, tuples, sets, Series, and np.ndarray. The result dtype of the subset rows will - be object. Scalars will be returned unchanged. Empty list-likes will - result in a np.nan for that row. + be object. 
Scalars will be returned unchanged, and empty list-likes will + result in a np.nan for that row. In addition, the ordering of elements in + the output will be non-deterministic when exploding sets. Examples -------- diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index 2bbe8ac2d5b81..bd0901387eeed 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -172,3 +172,11 @@ def test_ignore_index(): {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3] ) tm.assert_frame_equal(result, expected) + + +def test_explode_sets(): + # https://github.com/pandas-dev/pandas/issues/35614 + df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1]) + result = df.explode(column="a").sort_values(by="a") + expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py index 4b65e042f7b02..1f0fbd1cc5ecb 100644 --- a/pandas/tests/series/methods/test_explode.py +++ b/pandas/tests/series/methods/test_explode.py @@ -126,3 +126,11 @@ def test_ignore_index(): result = s.explode(ignore_index=True) expected = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=object) tm.assert_series_equal(result, expected) + + +def test_explode_sets(): + # https://github.com/pandas-dev/pandas/issues/35614 + s = pd.Series([{"a", "b", "c"}], index=[1]) + result = s.explode().sort_values() + expected = pd.Series(["a", "b", "c"], index=[1, 1, 1]) + tm.assert_series_equal(result, expected) From 212e33ba1e8250f6f6efe3be161090e1ad611bcb Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 5 Sep 2020 19:13:44 -0400 Subject: [PATCH 0679/1025] BUG: Don't raise when constructing Series from ordered set (#36054) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/construction.py | 9 ++++++--- 
pandas/tests/series/test_constructors.py | 10 ++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index c6cfcc6730112..b8f6d0e52d058 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -35,6 +35,7 @@ Bug fixes - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) +- Bug in :class:`Series` constructor incorrectly raising a ``TypeError`` when passed an ordered set (:issue:`36044`) - Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) - Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 9d6c2789af25b..3812c306b8eb4 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -438,7 +438,12 @@ def sanitize_array( subarr = subarr.copy() return subarr - elif isinstance(data, (list, tuple)) and len(data) > 0: + elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: + if isinstance(data, set): + # Raise only for unordered sets, e.g., not for dict_keys + raise TypeError("Set type is unordered") + data = list(data) + if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: @@ -450,8 +455,6 @@ def sanitize_array( # GH#16804 arr = np.arange(data.start, data.stop, data.step, 
dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) - elif isinstance(data, abc.Set): - raise TypeError("Set type is unordered") elif lib.is_scalar(data) and index is not None and dtype is not None: data = maybe_cast_to_datetime(data, dtype) if not lib.is_scalar(data): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index bcf7039ec9039..ce078059479b4 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1464,3 +1464,13 @@ def test_constructor_sparse_datetime64(self, values): arr = pd.arrays.SparseArray(values, dtype=dtype) expected = pd.Series(arr) tm.assert_series_equal(result, expected) + + def test_construction_from_ordered_collection(self): + # https://github.com/pandas-dev/pandas/issues/36044 + result = Series({"a": 1, "b": 2}.keys()) + expected = Series(["a", "b"]) + tm.assert_series_equal(result, expected) + + result = Series({"a": 1, "b": 2}.values()) + expected = Series([1, 2]) + tm.assert_series_equal(result, expected) From 4f62f5b5ed75da11a645da57ac8e016449fa89a6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 6 Sep 2020 18:58:32 +0200 Subject: [PATCH 0680/1025] REGR: append tz-aware DataFrame with tz-naive values (#36115) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/dtypes/concat.py | 6 ++++-- pandas/core/internals/concat.py | 8 ++++++-- pandas/tests/reshape/test_concat.py | 17 +++++++++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index b8f6d0e52d058..f0adc951a5f99 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -16,6 +16,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :meth:`DatetimeIndex.intersection` incorrectly raising ``AssertionError`` when intersecting against a list (:issue:`35876`) - Fix regression in updating a column inplace (e.g. 
using ``df['col'].fillna(.., inplace=True)``) (:issue:`35731`) +- Fix regression in :meth:`DataFrame.append` mixing tz-aware and tz-naive datetime columns (:issue:`35460`) - Performance regression for :meth:`RangeIndex.format` (:issue:`35712`) - Regression where :meth:`MultiIndex.get_loc` would return a slice spanning the full index when passed an empty list (:issue:`35878`) - Fix regression in invalid cache after an indexing operation; this can manifest when setting which does not update the data (:issue:`35521`) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9902016475b22..dd005752a4832 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -148,15 +148,17 @@ def is_nonempty(x) -> bool: any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) if any_ea: + # we ignore axis here, as internally concatting with EAs is always + # for axis=0 if not single_dtype: target_dtype = find_common_type([x.dtype for x in to_concat]) to_concat = [_cast_to_common_type(arr, target_dtype) for arr in to_concat] - if isinstance(to_concat[0], ExtensionArray) and axis == 0: + if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: - return np.concatenate(to_concat, axis=axis) + return np.concatenate(to_concat) elif _contains_datetime or "timedelta" in typs: return concat_datetime(to_concat, axis=axis, typs=typs) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index b45f0890cafa4..513c5fed1ca62 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -24,7 +24,7 @@ from pandas.core.dtypes.missing import isna import pandas.core.algorithms as algos -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import DatetimeArray, ExtensionArray from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager @@ -335,9 +335,13 @@ def 
_concatenate_join_units(join_units, concat_axis, copy): # the non-EA values are 2D arrays with shape (1, n) to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] concat_values = concat_compat(to_concat, axis=0) - if not isinstance(concat_values, ExtensionArray): + if not isinstance(concat_values, ExtensionArray) or ( + isinstance(concat_values, DatetimeArray) and concat_values.tz is None + ): # if the result of concat is not an EA but an ndarray, reshape to # 2D to put it a non-EA Block + # special case DatetimeArray, which *is* an EA, but is put in a + # consolidated 2D block concat_values = np.atleast_2d(concat_values) else: concat_values = concat_compat(to_concat, axis=concat_axis) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 38cf2cc2402a1..90705f827af25 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1110,6 +1110,23 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): result = df.append([s, s], ignore_index=True) tm.assert_frame_equal(result, expected) + def test_append_empty_tz_frame_with_datetime64ns(self): + # https://github.com/pandas-dev/pandas/issues/35460 + df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + + # pd.NaT gets inferred as tz-naive, so append result is tz-naive + result = df.append({"a": pd.NaT}, ignore_index=True) + expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) + + # also test with typed value to append + df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]") + result = df.append( + pd.Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True + ) + expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]") + tm.assert_frame_equal(result, expected) + class TestConcatenate: def test_concat_copy(self): From 6032bedf9cf1b3e95fe610b7f0d2d232d1b12881 Mon Sep 17 00:00:00 2001 From: Daniel Saxton 
<2658661+dsaxton@users.noreply.github.com> Date: Sun, 6 Sep 2020 12:59:43 -0400 Subject: [PATCH 0681/1025] BUG: Respect errors="ignore" during extension astype (#35979) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/internals/blocks.py | 9 ++++++-- pandas/tests/frame/methods/test_astype.py | 22 +++++++++++++++++++ pandas/tests/series/methods/test_astype.py | 25 +++++++++++++++++++++- 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index f0adc951a5f99..1e946d325ace1 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -34,6 +34,7 @@ Bug fixes - Bug in :meth:`DataFrame.eval` with ``object`` dtype column binary operations (:issue:`35794`) - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) +- Bug in :meth:`Series.astype` and :meth:`DataFrame.astype` not respecting the ``errors`` argument when set to ``"ignore"`` for extension dtypes (:issue:`35471`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`) - Bug in :meth:`Float64Index.__contains__` incorrectly raising ``TypeError`` instead of returning ``False`` (:issue:`35788`) - Bug in :class:`Series` constructor incorrectly raising a ``TypeError`` when passed an ordered set (:issue:`36044`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9f4e535dc787d..263c7c2b6940a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -581,8 +581,13 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): # force the copy here if self.is_extension: - # TODO: Should we try/except this astype? 
- values = self.values.astype(dtype) + try: + values = self.values.astype(dtype) + except (ValueError, TypeError): + if errors == "ignore": + values = self.values + else: + raise else: if issubclass(dtype.type, str): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index b0fd0496ea81e..d3f256259b15f 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -8,6 +8,7 @@ CategoricalDtype, DataFrame, DatetimeTZDtype, + Interval, IntervalDtype, NaT, Series, @@ -565,3 +566,24 @@ def test_astype_empty_dtype_dict(self): result = df.astype(dict()) tm.assert_frame_equal(result, df) assert result is not df + + @pytest.mark.parametrize( + "df", + [ + DataFrame(Series(["x", "y", "z"], dtype="string")), + DataFrame(Series(["x", "y", "z"], dtype="category")), + DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])), + DataFrame(Series(3 * [Interval(0, 1)])), + ], + ) + @pytest.mark.parametrize("errors", ["raise", "ignore"]) + def test_astype_ignores_errors_for_extension_dtypes(self, df, errors): + # https://github.com/pandas-dev/pandas/issues/35471 + if errors == "ignore": + expected = df + result = df.astype(float, errors=errors) + tm.assert_frame_equal(result, expected) + else: + msg = "(Cannot cast)|(could not convert)" + with pytest.raises((ValueError, TypeError), match=msg): + df.astype(float, errors=errors) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 9fdc4179de2e1..b9d90a9fc63dd 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -1,4 +1,6 @@ -from pandas import Series, date_range +import pytest + +from pandas import Interval, Series, Timestamp, date_range import pandas._testing as tm @@ -23,3 +25,24 @@ def test_astype_dt64tz_to_str(self): dtype=object, ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "values", + [ + Series(["x", 
"y", "z"], dtype="string"), + Series(["x", "y", "z"], dtype="category"), + Series(3 * [Timestamp("2020-01-01", tz="UTC")]), + Series(3 * [Interval(0, 1)]), + ], + ) + @pytest.mark.parametrize("errors", ["raise", "ignore"]) + def test_astype_ignores_errors_for_extension_dtypes(self, values, errors): + # https://github.com/pandas-dev/pandas/issues/35471 + if errors == "ignore": + expected = values + result = values.astype(float, errors="ignore") + tm.assert_series_equal(result, expected) + else: + msg = "(Cannot cast)|(could not convert)" + with pytest.raises((ValueError, TypeError), match=msg): + values.astype(float, errors=errors) From 2a1593cbaeac412c67e67d87b450ef53f50a93d6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 6 Sep 2020 10:05:35 -0700 Subject: [PATCH 0682/1025] De-privatize imported names (#36156) --- pandas/_libs/hashtable.pyx | 4 ++-- pandas/_libs/hashtable_class_helper.pxi.in | 6 +++--- pandas/_libs/hashtable_func_helper.pxi.in | 2 +- pandas/_libs/parsers.pyx | 8 ++++---- pandas/_testing.py | 8 ++++---- pandas/compat/__init__.py | 4 ++-- pandas/core/algorithms.py | 4 ++-- pandas/core/arrays/base.py | 4 ++-- pandas/core/arrays/masked.py | 4 ++-- pandas/core/computation/check.py | 10 +++++----- pandas/core/computation/eval.py | 6 +++--- pandas/core/computation/expressions.py | 10 +++++----- pandas/core/computation/ops.py | 6 +++--- pandas/core/frame.py | 4 ++-- pandas/core/indexes/multi.py | 4 ++-- pandas/core/internals/__init__.py | 4 ++-- pandas/core/internals/blocks.py | 4 ++-- pandas/core/internals/managers.py | 6 +++--- pandas/core/sorting.py | 2 +- pandas/core/window/common.py | 4 ++-- pandas/core/window/ewm.py | 6 +++--- pandas/core/window/rolling.py | 6 +++--- pandas/io/common.py | 6 +++--- pandas/io/excel/_base.py | 2 +- pandas/io/excel/_odfreader.py | 4 ++-- pandas/io/excel/_openpyxl.py | 4 ++-- pandas/io/excel/_pyxlsb.py | 4 ++-- pandas/io/excel/_xlrd.py | 4 ++-- pandas/io/formats/format.py | 8 ++++---- pandas/io/formats/printing.py 
| 4 ++-- pandas/tests/computation/test_compat.py | 6 +++--- pandas/tests/computation/test_eval.py | 12 ++++++------ pandas/tests/extension/json/array.py | 2 +- pandas/tests/frame/test_arithmetic.py | 4 ++-- pandas/tests/frame/test_query_eval.py | 6 +++--- pandas/tests/io/formats/test_format.py | 2 +- pandas/tests/io/test_pickle.py | 6 +++--- pandas/tests/test_algos.py | 2 +- .../moments/test_moments_consistency_rolling.py | 4 ++-- pandas/tests/window/test_pairwise.py | 2 +- pandas/util/_test_decorators.py | 4 ++-- 41 files changed, 101 insertions(+), 101 deletions(-) diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index ffaf6d6505955..5a0cddb0af197 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -56,7 +56,7 @@ from pandas._libs.missing cimport checknull cdef int64_t NPY_NAT = util.get_nat() -_SIZE_HINT_LIMIT = (1 << 20) + 7 +SIZE_HINT_LIMIT = (1 << 20) + 7 cdef Py_ssize_t _INIT_VEC_CAP = 128 @@ -176,7 +176,7 @@ def unique_label_indices(const int64_t[:] labels): ndarray[int64_t, ndim=1] arr Int64VectorData *ud = idx.data - kh_resize_int64(table, min(n, _SIZE_HINT_LIMIT)) + kh_resize_int64(table, min(n, SIZE_HINT_LIMIT)) with nogil: for i in range(n): diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index e0e026fe7cb5e..5e4da96d57e42 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -268,7 +268,7 @@ cdef class {{name}}HashTable(HashTable): def __cinit__(self, int64_t size_hint=1): self.table = kh_init_{{dtype}}() if size_hint is not None: - size_hint = min(size_hint, _SIZE_HINT_LIMIT) + size_hint = min(size_hint, SIZE_HINT_LIMIT) kh_resize_{{dtype}}(self.table, size_hint) def __len__(self) -> int: @@ -603,7 +603,7 @@ cdef class StringHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_str() if size_hint is not None: - size_hint = min(size_hint, _SIZE_HINT_LIMIT) + size_hint = 
min(size_hint, SIZE_HINT_LIMIT) kh_resize_str(self.table, size_hint) def __dealloc__(self): @@ -916,7 +916,7 @@ cdef class PyObjectHashTable(HashTable): def __init__(self, int64_t size_hint=1): self.table = kh_init_pymap() if size_hint is not None: - size_hint = min(size_hint, _SIZE_HINT_LIMIT) + size_hint = min(size_hint, SIZE_HINT_LIMIT) kh_resize_pymap(self.table, size_hint) def __dealloc__(self): diff --git a/pandas/_libs/hashtable_func_helper.pxi.in b/pandas/_libs/hashtable_func_helper.pxi.in index 0cc0a6b192df5..fcd081f563f92 100644 --- a/pandas/_libs/hashtable_func_helper.pxi.in +++ b/pandas/_libs/hashtable_func_helper.pxi.in @@ -138,7 +138,7 @@ def duplicated_{{dtype}}(const {{c_type}}[:] values, object keep='first'): kh_{{ttype}}_t *table = kh_init_{{ttype}}() ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool') - kh_resize_{{ttype}}(table, min(n, _SIZE_HINT_LIMIT)) + kh_resize_{{ttype}}(table, min(n, SIZE_HINT_LIMIT)) if keep not in ('last', 'first', False): raise ValueError('keep must be either "first", "last" or False') diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index fa77af6bd5a25..811e28b830921 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -67,7 +67,7 @@ from pandas._libs.khash cimport ( khiter_t, ) -from pandas.compat import _get_lzma_file, _import_lzma +from pandas.compat import get_lzma_file, import_lzma from pandas.errors import DtypeWarning, EmptyDataError, ParserError, ParserWarning from pandas.core.dtypes.common import ( @@ -82,7 +82,7 @@ from pandas.core.dtypes.common import ( ) from pandas.core.dtypes.concat import union_categoricals -lzma = _import_lzma() +lzma = import_lzma() cdef: float64_t INF = np.inf @@ -638,9 +638,9 @@ cdef class TextReader: f'zip file {zip_names}') elif self.compression == 'xz': if isinstance(source, str): - source = _get_lzma_file(lzma)(source, 'rb') + source = get_lzma_file(lzma)(source, 'rb') else: - source = _get_lzma_file(lzma)(filename=source) + 
source = get_lzma_file(lzma)(filename=source) else: raise ValueError(f'Unrecognized compression type: ' f'{self.compression}') diff --git a/pandas/_testing.py b/pandas/_testing.py index 04d36749a3d8c..7dba578951deb 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -25,7 +25,7 @@ from pandas._libs.lib import no_default import pandas._libs.testing as _testing from pandas._typing import Dtype, FilePathOrBuffer, FrameOrSeries -from pandas.compat import _get_lzma_file, _import_lzma +from pandas.compat import get_lzma_file, import_lzma from pandas.core.dtypes.common import ( is_bool, @@ -70,7 +70,7 @@ from pandas.io.common import urlopen from pandas.io.formats.printing import pprint_thing -lzma = _import_lzma() +lzma = import_lzma() _N = 30 _K = 4 @@ -243,7 +243,7 @@ def decompress_file(path, compression): elif compression == "bz2": f = bz2.BZ2File(path, "rb") elif compression == "xz": - f = _get_lzma_file(lzma)(path, "rb") + f = get_lzma_file(lzma)(path, "rb") elif compression == "zip": zip_file = zipfile.ZipFile(path) zip_names = zip_file.namelist() @@ -288,7 +288,7 @@ def write_to_compressed(compression, path, data, dest="test"): elif compression == "bz2": compress_method = bz2.BZ2File elif compression == "xz": - compress_method = _get_lzma_file(lzma) + compress_method = get_lzma_file(lzma) else: raise ValueError(f"Unrecognized compression type: {compression}") diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index f2018a5c01711..57e378758cc78 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -77,7 +77,7 @@ def is_platform_mac() -> bool: return sys.platform == "darwin" -def _import_lzma(): +def import_lzma(): """ Importing the `lzma` module. @@ -97,7 +97,7 @@ def _import_lzma(): warnings.warn(msg) -def _get_lzma_file(lzma): +def get_lzma_file(lzma): """ Importing the `LZMAFile` class from the `lzma` module. 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 50ec3714f454b..57e63daff29e4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -462,7 +462,7 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: return f(comps, values) -def _factorize_array( +def factorize_array( values, na_sentinel: int = -1, size_hint=None, na_value=None, mask=None ) -> Tuple[np.ndarray, np.ndarray]: """ @@ -671,7 +671,7 @@ def factorize( else: na_value = None - codes, uniques = _factorize_array( + codes, uniques = factorize_array( values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value ) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 8193d65b3b30c..0c8efda5fc588 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -31,7 +31,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ops -from pandas.core.algorithms import _factorize_array, unique +from pandas.core.algorithms import factorize_array, unique from pandas.core.missing import backfill_1d, pad_1d from pandas.core.sorting import nargminmax, nargsort @@ -845,7 +845,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "ExtensionArray" # Complete control over factorization. 
arr, na_value = self._values_for_factorize() - codes, uniques = _factorize_array( + codes, uniques = factorize_array( arr, na_sentinel=na_sentinel, na_value=na_value ) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1237dea5c1a64..31274232e2525 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -17,7 +17,7 @@ from pandas.core.dtypes.missing import isna, notna from pandas.core import nanops -from pandas.core.algorithms import _factorize_array, take +from pandas.core.algorithms import factorize_array, take from pandas.core.array_algos import masked_reductions from pandas.core.arrays import ExtensionArray, ExtensionOpsMixin from pandas.core.indexers import check_array_indexer @@ -287,7 +287,7 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ExtensionArray]: arr = self._data mask = self._mask - codes, uniques = _factorize_array(arr, na_sentinel=na_sentinel, mask=mask) + codes, uniques = factorize_array(arr, na_sentinel=na_sentinel, mask=mask) # the hashtables don't handle all different types of bits uniques = uniques.astype(self.dtype.numpy_dtype, copy=False) diff --git a/pandas/core/computation/check.py b/pandas/core/computation/check.py index 4d205909b9e2e..6c7261b3b33c9 100644 --- a/pandas/core/computation/check.py +++ b/pandas/core/computation/check.py @@ -1,10 +1,10 @@ from pandas.compat._optional import import_optional_dependency ne = import_optional_dependency("numexpr", raise_on_missing=False, on_version="warn") -_NUMEXPR_INSTALLED = ne is not None -if _NUMEXPR_INSTALLED: - _NUMEXPR_VERSION = ne.__version__ +NUMEXPR_INSTALLED = ne is not None +if NUMEXPR_INSTALLED: + NUMEXPR_VERSION = ne.__version__ else: - _NUMEXPR_VERSION = None + NUMEXPR_VERSION = None -__all__ = ["_NUMEXPR_INSTALLED", "_NUMEXPR_VERSION"] +__all__ = ["NUMEXPR_INSTALLED", "NUMEXPR_VERSION"] diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index b74f99fca21c7..f6a7935142a32 100644 --- 
a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -38,10 +38,10 @@ def _check_engine(engine: Optional[str]) -> str: str Engine name. """ - from pandas.core.computation.check import _NUMEXPR_INSTALLED + from pandas.core.computation.check import NUMEXPR_INSTALLED if engine is None: - engine = "numexpr" if _NUMEXPR_INSTALLED else "python" + engine = "numexpr" if NUMEXPR_INSTALLED else "python" if engine not in _engines: valid_engines = list(_engines.keys()) @@ -53,7 +53,7 @@ def _check_engine(engine: Optional[str]) -> str: # that won't necessarily be import-able) # Could potentially be done on engine instantiation if engine == "numexpr": - if not _NUMEXPR_INSTALLED: + if not NUMEXPR_INSTALLED: raise ImportError( "'numexpr' is not installed or an unsupported version. Cannot use " "engine='numexpr' for query/eval if 'numexpr' is not installed" diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index a9c0cb0571446..d2c08c343ab4b 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -15,15 +15,15 @@ from pandas.core.dtypes.generic import ABCDataFrame -from pandas.core.computation.check import _NUMEXPR_INSTALLED +from pandas.core.computation.check import NUMEXPR_INSTALLED from pandas.core.ops import roperator -if _NUMEXPR_INSTALLED: +if NUMEXPR_INSTALLED: import numexpr as ne _TEST_MODE = None _TEST_RESULT: List[bool] = list() -_USE_NUMEXPR = _NUMEXPR_INSTALLED +_USE_NUMEXPR = NUMEXPR_INSTALLED _evaluate = None _where = None @@ -40,7 +40,7 @@ def set_use_numexpr(v=True): # set/unset to use numexpr global _USE_NUMEXPR - if _NUMEXPR_INSTALLED: + if NUMEXPR_INSTALLED: _USE_NUMEXPR = v # choose what we are going to do @@ -53,7 +53,7 @@ def set_use_numexpr(v=True): def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset - if _NUMEXPR_INSTALLED and _USE_NUMEXPR: + if NUMEXPR_INSTALLED and _USE_NUMEXPR: if n is None: n = 
ne.detect_number_of_cores() ne.set_num_threads(n) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index b2144c45c6323..1fb3910b8577d 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -600,11 +600,11 @@ def __repr__(self) -> str: class FuncNode: def __init__(self, name: str): - from pandas.core.computation.check import _NUMEXPR_INSTALLED, _NUMEXPR_VERSION + from pandas.core.computation.check import NUMEXPR_INSTALLED, NUMEXPR_VERSION if name not in _mathops or ( - _NUMEXPR_INSTALLED - and _NUMEXPR_VERSION < LooseVersion("2.6.9") + NUMEXPR_INSTALLED + and NUMEXPR_VERSION < LooseVersion("2.6.9") and name in ("floor", "ceil") ): raise ValueError(f'"{name}" is not a supported function') diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 150d6e24dbb86..e1a889bf79d95 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5257,7 +5257,7 @@ def duplicated( 4 True dtype: bool """ - from pandas._libs.hashtable import _SIZE_HINT_LIMIT, duplicated_int64 + from pandas._libs.hashtable import SIZE_HINT_LIMIT, duplicated_int64 from pandas.core.sorting import get_group_index @@ -5266,7 +5266,7 @@ def duplicated( def f(vals): labels, shape = algorithms.factorize( - vals, size_hint=min(len(self), _SIZE_HINT_LIMIT) + vals, size_hint=min(len(self), SIZE_HINT_LIMIT) ) return labels.astype("i8", copy=False), len(shape) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 080ece8547479..e49a23935efbd 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1342,9 +1342,9 @@ def format( ) if adjoin: - from pandas.io.formats.format import _get_adjustment + from pandas.io.formats.format import get_adjustment - adj = _get_adjustment() + adj = get_adjustment() return adj.adjoin(space, *result_levels).split("\n") else: return result_levels diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index e12e0d7760ea7..fbccac1c2af67 100644 
--- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -10,8 +10,8 @@ IntBlock, ObjectBlock, TimeDeltaBlock, - _safe_reshape, make_block, + safe_reshape, ) from pandas.core.internals.concat import concatenate_block_managers from pandas.core.internals.managers import ( @@ -33,7 +33,7 @@ "IntBlock", "ObjectBlock", "TimeDeltaBlock", - "_safe_reshape", + "safe_reshape", "make_block", "BlockManager", "SingleBlockManager", diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 263c7c2b6940a..c8da04fbbf987 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1678,7 +1678,7 @@ def putmask( if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask): new = new[mask] - mask = _safe_reshape(mask, new_values.shape) + mask = safe_reshape(mask, new_values.shape) new_values[mask] = new return [self.make_block(values=new_values)] @@ -2820,7 +2820,7 @@ def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: return values -def _safe_reshape(arr, new_shape): +def safe_reshape(arr, new_shape): """ If possible, reshape `arr` to have shape `new_shape`, with a couple of exceptions (see gh-13012): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 13bc6a2e82195..3f446874ffd0e 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -47,10 +47,10 @@ DatetimeTZBlock, ExtensionBlock, ObjectValuesExtensionBlock, - _safe_reshape, extend_blocks, get_block_type, make_block, + safe_reshape, ) from pandas.core.internals.ops import blockwise_all, operate_blockwise @@ -1015,7 +1015,7 @@ def value_getitem(placement): else: if value.ndim == self.ndim - 1: - value = _safe_reshape(value, (1,) + value.shape) + value = safe_reshape(value, (1,) + value.shape) def value_getitem(placement): return value @@ -1138,7 +1138,7 @@ def insert(self, loc: int, item: Label, value, allow_duplicates: bool = False): if value.ndim == 
self.ndim - 1 and not is_extension_array_dtype(value.dtype): # TODO(EA2D): special case not needed with 2D EAs - value = _safe_reshape(value, (1,) + value.shape) + value = safe_reshape(value, (1,) + value.shape) block = make_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1)) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 8bdd466ae6f33..d03b2f29521b7 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -520,7 +520,7 @@ def compress_group_index(group_index, sort: bool = True): space can be huge, so this function compresses it, by computing offsets (comp_ids) into the list of unique labels (obs_group_ids). """ - size_hint = min(len(group_index), hashtable._SIZE_HINT_LIMIT) + size_hint = min(len(group_index), hashtable.SIZE_HINT_LIMIT) table = hashtable.Int64HashTable(size_hint) group_index = ensure_int64(group_index) diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 2f3058db4493b..df60d2dcf5e84 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -92,7 +92,7 @@ def f(x, name=name, *args): return self._groupby.apply(f) -def _flex_binary_moment(arg1, arg2, f, pairwise=False): +def flex_binary_moment(arg1, arg2, f, pairwise=False): if not ( isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) @@ -222,7 +222,7 @@ def dataframe_from_int_dict(data, frame_template): return dataframe_from_int_dict(results, arg1) else: - return _flex_binary_moment(arg2, arg1, f) + return flex_binary_moment(arg2, arg1, f) def zsqrt(x): diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 1913b51a68c15..2bd36d8bff155 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -15,7 +15,7 @@ import pandas.core.common as common from pandas.core.window.common import _doc_template, _shared_docs, zsqrt -from pandas.core.window.rolling import _flex_binary_moment, _Rolling +from pandas.core.window.rolling import _Rolling, flex_binary_moment _bias_template = """ 
Parameters @@ -416,7 +416,7 @@ def _get_cov(X, Y): ) return X._wrap_result(cov) - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) ) @@ -470,6 +470,6 @@ def _cov(x, y): corr = cov / zsqrt(x_var * y_var) return X._wrap_result(corr) - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 558c0eeb0ea65..4c4ec4d700b7f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -54,8 +54,8 @@ from pandas.core.window.common import ( WindowGroupByMixin, _doc_template, - _flex_binary_moment, _shared_docs, + flex_binary_moment, zsqrt, ) from pandas.core.window.indexers import ( @@ -1774,7 +1774,7 @@ def _get_cov(X, Y): bias_adj = count / (count - ddof) return (mean(X * Y) - mean(X) * mean(Y)) * bias_adj - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise) ) @@ -1913,7 +1913,7 @@ def _get_corr(a, b): return a.cov(b, **kwargs) / (a.std(**kwargs) * b.std(**kwargs)) - return _flex_binary_moment( + return flex_binary_moment( self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise) ) diff --git a/pandas/io/common.py b/pandas/io/common.py index a80b89569f429..3f130401558dd 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -40,12 +40,12 @@ ModeVar, StorageOptions, ) -from pandas.compat import _get_lzma_file, _import_lzma +from pandas.compat import get_lzma_file, import_lzma from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_file_like -lzma = _import_lzma() +lzma = import_lzma() _VALID_URLS = set(uses_relative + uses_netloc + uses_params) @@ -562,7 +562,7 @@ def get_handle( # XZ Compression elif compression == "xz": - f = _get_lzma_file(lzma)(path_or_buf, 
mode) + f = get_lzma_file(lzma)(path_or_buf, mode) # Unrecognized Compression else: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 74eb65521f5b2..87343c22ad4e9 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -346,7 +346,7 @@ def read_excel( ) -class _BaseExcelReader(metaclass=abc.ABCMeta): +class BaseExcelReader(metaclass=abc.ABCMeta): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 6cbca59aed97e..02575ab878f6e 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -7,10 +7,10 @@ import pandas as pd -from pandas.io.excel._base import _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader -class _ODFReader(_BaseExcelReader): +class _ODFReader(BaseExcelReader): """ Read tables out of OpenDocument formatted files. 
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 89b581da6ed31..f395127902101 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -5,7 +5,7 @@ from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import ExcelWriter, _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader, ExcelWriter from pandas.io.excel._util import validate_freeze_panes if TYPE_CHECKING: @@ -438,7 +438,7 @@ def write_cells( setattr(xcell, k, v) -class _OpenpyxlReader(_BaseExcelReader): +class _OpenpyxlReader(BaseExcelReader): def __init__( self, filepath_or_buffer: FilePathOrBuffer, diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index c15a52abe4d53..069c3a2eaa643 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -3,10 +3,10 @@ from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader -class _PyxlsbReader(_BaseExcelReader): +class _PyxlsbReader(BaseExcelReader): def __init__( self, filepath_or_buffer: FilePathOrBuffer, diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index a7fb519af61c6..9057106fb08e5 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -5,10 +5,10 @@ from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency -from pandas.io.excel._base import _BaseExcelReader +from pandas.io.excel._base import BaseExcelReader -class _XlrdReader(_BaseExcelReader): +class _XlrdReader(BaseExcelReader): def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): """ Reader using xlrd engine. 
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 75228a865c6cc..2cc8e7ec906be 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -256,7 +256,7 @@ def __init__( float_format = get_option("display.float_format") self.float_format = float_format self.dtype = dtype - self.adj = _get_adjustment() + self.adj = get_adjustment() self._chk_truncate() @@ -439,7 +439,7 @@ def _get_pad(t): return [x.rjust(_get_pad(x)) for x in texts] -def _get_adjustment() -> TextAdjustment: +def get_adjustment() -> TextAdjustment: use_east_asian_width = get_option("display.unicode.east_asian_width") if use_east_asian_width: return EastAsianTextAdjustment() @@ -628,7 +628,7 @@ def __init__( self.columns = frame.columns self._chk_truncate() - self.adj = _get_adjustment() + self.adj = get_adjustment() def _chk_truncate(self) -> None: """ @@ -1735,7 +1735,7 @@ def _make_fixed_width( return strings if adj is None: - adj = _get_adjustment() + adj = get_adjustment() max_len = max(adj.len(x) for x in strings) diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index 23daab725ec65..edc6fbfff61d7 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -321,7 +321,7 @@ def format_object_summary( summary string """ from pandas.io.formats.console import get_console_size - from pandas.io.formats.format import _get_adjustment + from pandas.io.formats.format import get_adjustment display_width, _ = get_console_size() if display_width is None: @@ -350,7 +350,7 @@ def format_object_summary( is_truncated = n > max_seq_items # adj can optionally handle unicode eastern asian width - adj = _get_adjustment() + adj = get_adjustment() def _extend_line( s: str, line: str, value: str, display_width: int, next_line_prefix: str diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py index b3fbd8c17d8bf..ead102f532a20 100644 --- a/pandas/tests/computation/test_compat.py +++ 
b/pandas/tests/computation/test_compat.py @@ -12,16 +12,16 @@ def test_compat(): # test we have compat with our version of nu - from pandas.core.computation.check import _NUMEXPR_INSTALLED + from pandas.core.computation.check import NUMEXPR_INSTALLED try: import numexpr as ne ver = ne.__version__ if LooseVersion(ver) < LooseVersion(VERSIONS["numexpr"]): - assert not _NUMEXPR_INSTALLED + assert not NUMEXPR_INSTALLED else: - assert _NUMEXPR_INSTALLED + assert NUMEXPR_INSTALLED except ImportError: pytest.skip("not testing numexpr version compat") diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 853ab00853d1b..49066428eb16c 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -18,7 +18,7 @@ from pandas import DataFrame, Series, compat, date_range import pandas._testing as tm from pandas.core.computation import pytables -from pandas.core.computation.check import _NUMEXPR_VERSION +from pandas.core.computation.check import NUMEXPR_VERSION from pandas.core.computation.engines import NumExprClobberingError, _engines import pandas.core.computation.expr as expr from pandas.core.computation.expr import ( @@ -26,7 +26,7 @@ PandasExprVisitor, PythonExprVisitor, ) -from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR +from pandas.core.computation.expressions import _USE_NUMEXPR, NUMEXPR_INSTALLED from pandas.core.computation.ops import ( _arith_ops_syms, _binary_math_ops, @@ -43,7 +43,7 @@ marks=pytest.mark.skipif( engine == "numexpr" and not _USE_NUMEXPR, reason=f"numexpr enabled->{_USE_NUMEXPR}, " - f"installed->{_NUMEXPR_INSTALLED}", + f"installed->{NUMEXPR_INSTALLED}", ), ) for engine in _engines @@ -60,15 +60,15 @@ def parser(request): @pytest.fixture def ne_lt_2_6_9(): - if _NUMEXPR_INSTALLED and _NUMEXPR_VERSION >= LooseVersion("2.6.9"): + if NUMEXPR_INSTALLED and NUMEXPR_VERSION >= LooseVersion("2.6.9"): pytest.skip("numexpr is >= 2.6.9") return 
"numexpr" @pytest.fixture def unary_fns_for_ne(): - if _NUMEXPR_INSTALLED: - if _NUMEXPR_VERSION >= LooseVersion("2.6.9"): + if NUMEXPR_INSTALLED: + if NUMEXPR_VERSION >= LooseVersion("2.6.9"): return _unary_math_ops else: return tuple(x for x in _unary_math_ops if x not in ("floor", "ceil")) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 447a6108fc3c7..e3cdeb9c1951f 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -189,7 +189,7 @@ def _concat_same_type(cls, to_concat): def _values_for_factorize(self): frozen = self._values_for_argsort() if len(frozen) == 0: - # _factorize_array expects 1-d array, this is a len-0 2-d array. + # factorize_array expects 1-d array, this is a len-0 2-d array. frozen = frozen.ravel() return frozen, () diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index e17357e9845b5..70d0b4e9e835c 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -11,7 +11,7 @@ from pandas import DataFrame, MultiIndex, Series import pandas._testing as tm import pandas.core.common as com -from pandas.core.computation.expressions import _MIN_ELEMENTS, _NUMEXPR_INSTALLED +from pandas.core.computation.expressions import _MIN_ELEMENTS, NUMEXPR_INSTALLED from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int # ------------------------------------------------------------------- @@ -375,7 +375,7 @@ def test_floordiv_axis0(self): result2 = df.floordiv(ser.values, axis=0) tm.assert_frame_equal(result2, expected) - @pytest.mark.skipif(not _NUMEXPR_INSTALLED, reason="numexpr not installed") + @pytest.mark.skipif(not NUMEXPR_INSTALLED, reason="numexpr not installed") @pytest.mark.parametrize("opname", ["floordiv", "pow"]) def test_floordiv_axis0_numexpr_path(self, opname): # case that goes through numexpr and has to fall back to masked_arith_op diff --git 
a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 56d178daee7fd..2994482fa5139 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -9,7 +9,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, date_range import pandas._testing as tm -from pandas.core.computation.check import _NUMEXPR_INSTALLED +from pandas.core.computation.check import NUMEXPR_INSTALLED PARSERS = "python", "pandas" ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne) @@ -39,7 +39,7 @@ def setup_method(self, method): def test_query_default(self): # GH 12749 - # this should always work, whether _NUMEXPR_INSTALLED or not + # this should always work, whether NUMEXPR_INSTALLED or not df = self.df result = df.query("A>0") tm.assert_frame_equal(result, self.expected1) @@ -65,7 +65,7 @@ def test_query_python(self): def test_query_numexpr(self): df = self.df - if _NUMEXPR_INSTALLED: + if NUMEXPR_INSTALLED: result = df.query("A>0", engine="numexpr") tm.assert_frame_equal(result, self.expected1) result = df.eval("A+1", engine="numexpr") diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 7daed015f4c57..595b8c55c8ec7 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -226,7 +226,7 @@ def test_repr_truncation(self): r = repr(df) r = r[r.find("\n") + 1 :] - adj = fmt._get_adjustment() + adj = fmt.get_adjustment() for line, value in zip(r.split("\n"), df["B"]): if adj.len(value) + 1 > max_len: diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index d1c6705dd7a6f..2241fe7013568 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -24,7 +24,7 @@ import pytest -from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian +from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian import 
pandas.util._test_decorators as td import pandas as pd @@ -33,7 +33,7 @@ from pandas.tseries.offsets import Day, MonthEnd -lzma = _import_lzma() +lzma = import_lzma() @pytest.fixture(scope="module") @@ -268,7 +268,7 @@ def compress_file(self, src_path, dest_path, compression): with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f: f.write(src_path, os.path.basename(src_path)) elif compression == "xz": - f = _get_lzma_file(lzma)(dest_path, "w") + f = get_lzma_file(lzma)(dest_path, "w") else: msg = f"Unrecognized compression type: {compression}" raise ValueError(msg) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 72a679d980641..ec7413514d430 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -303,7 +303,7 @@ def test_parametrized_factorize_na_value_default(self, data): ], ) def test_parametrized_factorize_na_value(self, data, na_value): - codes, uniques = algos._factorize_array(data, na_value=na_value) + codes, uniques = algos.factorize_array(data, na_value=na_value) expected_uniques = data[[1, 3]] expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp) tm.assert_numpy_array_equal(codes, expected_codes) diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index 158b994cf03ae..dfcbdde466d44 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -10,7 +10,7 @@ import pandas as pd from pandas import DataFrame, DatetimeIndex, Index, Series import pandas._testing as tm -from pandas.core.window.common import _flex_binary_moment +from pandas.core.window.common import flex_binary_moment from pandas.tests.window.common import ( check_pairwise_moment, moments_consistency_cov_data, @@ -150,7 +150,7 @@ def test_flex_binary_moment(): # don't blow the stack msg = "arguments to moment function must be of type 
np.ndarray/Series/DataFrame" with pytest.raises(TypeError, match=msg): - _flex_binary_moment(5, 6, None) + flex_binary_moment(5, 6, None) def test_corr_sanity(): diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 7425cc5df4c2f..7f4e85b385b2d 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -41,7 +41,7 @@ def compare(self, result, expected): @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()]) def test_no_flex(self, f): - # DataFrame methods (which do not call _flex_binary_moment()) + # DataFrame methods (which do not call flex_binary_moment()) results = [f(df) for df in self.df1s] for (df, result) in zip(self.df1s, results): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 78facd6694635..94c252eca1671 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -35,7 +35,7 @@ def test_foo(): from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import _np_version -from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR +from pandas.core.computation.expressions import _USE_NUMEXPR, NUMEXPR_INSTALLED def safe_import(mod_name: str, min_version: Optional[str] = None): @@ -196,7 +196,7 @@ def skip_if_no(package: str, min_version: Optional[str] = None): ) skip_if_no_ne = pytest.mark.skipif( not _USE_NUMEXPR, - reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{_NUMEXPR_INSTALLED}", + reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{NUMEXPR_INSTALLED}", ) From fb26735158a30702a8bfa35c4169403f75f7435f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 6 Sep 2020 10:08:26 -0700 Subject: [PATCH 0683/1025] REF: share more EA methods (#36154) --- pandas/core/arrays/_mixins.py | 33 +++++++- pandas/core/arrays/categorical.py | 126 ++--------------------------- pandas/core/arrays/datetimelike.py | 28 ++----- pandas/core/arrays/numpy_.py | 12 
+-- 4 files changed, 45 insertions(+), 154 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 2976747d66dfa..8b79f8ce66756 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -4,9 +4,10 @@ from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError -from pandas.util._decorators import cache_readonly +from pandas.util._decorators import cache_readonly, doc -from pandas.core.algorithms import take, unique +from pandas.core.algorithms import searchsorted, take, unique +from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray _T = TypeVar("_T", bound="NDArrayBackedExtensionArray") @@ -120,3 +121,31 @@ def repeat(self: _T, repeats, axis=None) -> _T: def unique(self: _T) -> _T: new_data = unique(self._ndarray) return self._from_backing_data(new_data) + + @classmethod + @doc(ExtensionArray._concat_same_type) + def _concat_same_type(cls, to_concat, axis: int = 0): + dtypes = {str(x.dtype) for x in to_concat} + if len(dtypes) != 1: + raise ValueError("to_concat must have the same dtype (tz)", dtypes) + + new_values = [x._ndarray for x in to_concat] + new_values = np.concatenate(new_values, axis=axis) + return to_concat[0]._from_backing_data(new_values) + + @doc(ExtensionArray.searchsorted) + def searchsorted(self, value, side="left", sorter=None): + return searchsorted(self._ndarray, value, side=side, sorter=sorter) + + @doc(ExtensionArray.shift) + def shift(self, periods=1, fill_value=None, axis=0): + + fill_value = self._validate_shift_value(fill_value) + new_values = shift(self._ndarray, periods, axis, fill_value) + + return self._from_backing_data(new_values) + + def _validate_shift_value(self, fill_value): + # TODO: after deprecation in datetimelikearraymixin is enforced, + # we can remove this and ust validate_fill_value directly + return self._validate_fill_value(fill_value) diff --git 
a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c3c9009dda659..02305479bef67 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -44,8 +44,7 @@ from pandas.core.accessor import PandasDelegate, delegate_names import pandas.core.algorithms as algorithms from pandas.core.algorithms import _get_data_algo, factorize, take_1d, unique1d -from pandas.core.array_algos.transforms import shift -from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.base import ( ExtensionArray, NoNewAttributesMixin, @@ -1193,35 +1192,6 @@ def map(self, mapper): __le__ = _cat_compare_op(operator.le) __ge__ = _cat_compare_op(operator.ge) - def shift(self, periods, fill_value=None): - """ - Shift Categorical by desired number of periods. - - Parameters - ---------- - periods : int - Number of periods to move, can be positive or negative - fill_value : object, optional - The scalar value to use for newly introduced missing values. - - .. versionadded:: 0.24.0 - - Returns - ------- - shifted : Categorical - """ - # since categoricals always have ndim == 1, an axis parameter - # doesn't make any sense here. - codes = self.codes - if codes.ndim > 1: - raise NotImplementedError("Categorical with ndim > 1.") - - fill_value = self._validate_fill_value(fill_value) - - codes = shift(codes, periods, axis=0, fill_value=fill_value) - - return self._constructor(codes, dtype=self.dtype, fastpath=True) - def _validate_fill_value(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our @@ -1383,20 +1353,6 @@ def notna(self): notnull = notna - def dropna(self): - """ - Return the Categorical without null values. - - Missing values (-1 in .codes) are detected. 
- - Returns - ------- - valid : Categorical - """ - result = self[self.notna()] - - return result - def value_counts(self, dropna=True): """ Return a Series containing counts of each category. @@ -1749,81 +1705,6 @@ def fillna(self, value=None, method=None, limit=None): return self._constructor(codes, dtype=self.dtype, fastpath=True) - def take(self: _T, indexer, allow_fill: bool = False, fill_value=None) -> _T: - """ - Take elements from the Categorical. - - Parameters - ---------- - indexer : sequence of int - The indices in `self` to take. The meaning of negative values in - `indexer` depends on the value of `allow_fill`. - allow_fill : bool, default False - How to handle negative values in `indexer`. - - * False: negative values in `indices` indicate positional indices - from the right. This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate missing values - (the default). These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - .. versionchanged:: 1.0.0 - - Default value changed from ``True`` to ``False``. - - fill_value : object - The value to use for `indices` that are missing (-1), when - ``allow_fill=True``. This should be the category, i.e. a value - in ``self.categories``, not a code. - - Returns - ------- - Categorical - This Categorical will have the same categories and ordered as - `self`. - - See Also - -------- - Series.take : Similar method for Series. - numpy.ndarray.take : Similar method for NumPy arrays. - - Examples - -------- - >>> cat = pd.Categorical(['a', 'a', 'b']) - >>> cat - ['a', 'a', 'b'] - Categories (2, object): ['a', 'b'] - - Specify ``allow_fill==False`` to have negative indices mean indexing - from the right. 
- - >>> cat.take([0, -1, -2], allow_fill=False) - ['a', 'b', 'a'] - Categories (2, object): ['a', 'b'] - - With ``allow_fill=True``, indices equal to ``-1`` mean "missing" - values that should be filled with the `fill_value`, which is - ``np.nan`` by default. - - >>> cat.take([0, -1, -1], allow_fill=True) - ['a', NaN, NaN] - Categories (2, object): ['a', 'b'] - - The fill value can be specified. - - >>> cat.take([0, -1, -1], allow_fill=True, fill_value='a') - ['a', 'a', 'a'] - Categories (2, object): ['a', 'b'] - - Specifying a fill value that's not in ``self.categories`` - will raise a ``ValueError``. - """ - return NDArrayBackedExtensionArray.take( - self, indexer, allow_fill=allow_fill, fill_value=fill_value - ) - # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat @@ -1861,6 +1742,9 @@ def __contains__(self, key) -> bool: return contains(self, key, container=self._codes) + # ------------------------------------------------------------------ + # Rendering Methods + def _tidy_repr(self, max_vals=10, footer=True) -> str: """ a short repr displaying only max_vals and an optional (but default @@ -1959,6 +1843,8 @@ def __repr__(self) -> str: return result + # ------------------------------------------------------------------ + def _maybe_coerce_indexer(self, indexer): """ return an indexer coerced to the codes dtype diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 5a44f87400b79..a5b8032974fa4 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -54,9 +54,8 @@ from pandas.core import missing, nanops, ops from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts -from pandas.core.array_algos.transforms import shift from pandas.core.arrays._mixins import _T, NDArrayBackedExtensionArray -from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays.base import ExtensionOpsMixin import 
pandas.core.common as com from pandas.core.construction import array, extract_array from pandas.core.indexers import check_array_indexer @@ -672,18 +671,11 @@ def view(self, dtype=None): @classmethod def _concat_same_type(cls, to_concat, axis: int = 0): - - # do not pass tz to set because tzlocal cannot be hashed - dtypes = {str(x.dtype) for x in to_concat} - if len(dtypes) != 1: - raise ValueError("to_concat must have the same dtype (tz)", dtypes) + new_obj = super()._concat_same_type(to_concat, axis) obj = to_concat[0] dtype = obj.dtype - i8values = [x.asi8 for x in to_concat] - values = np.concatenate(i8values, axis=axis) - new_freq = None if is_period_dtype(dtype): new_freq = obj.freq @@ -697,11 +689,13 @@ def _concat_same_type(cls, to_concat, axis: int = 0): if all(pair[0][-1] + obj.freq == pair[1][0] for pair in pairs): new_freq = obj.freq - return cls._simple_new(values, dtype=dtype, freq=new_freq) + new_obj._freq = new_freq + return new_obj def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT: - values = self.asi8.copy() - return type(self)._simple_new(values, dtype=self.dtype, freq=self.freq) + new_obj = super().copy() + new_obj._freq = self.freq + return new_obj def _values_for_factorize(self): return self.asi8, iNaT @@ -713,14 +707,6 @@ def _from_factorized(cls, values, original): def _values_for_argsort(self): return self._data - @Appender(ExtensionArray.shift.__doc__) - def shift(self, periods=1, fill_value=None, axis=0): - - fill_value = self._validate_shift_value(fill_value) - new_values = shift(self._data, periods, axis, fill_value) - - return type(self)._simple_new(new_values, dtype=self.dtype) - # ------------------------------------------------------------------ # Validation Methods # TODO: try to de-duplicate these, ensure identical behavior diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 23a4a70734c81..588d68514649a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -7,7 +7,6 @@ 
from pandas._libs import lib from pandas._typing import Scalar from pandas.compat.numpy import function as nv -from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype @@ -16,10 +15,9 @@ from pandas import compat from pandas.core import nanops, ops -from pandas.core.algorithms import searchsorted from pandas.core.array_algos import masked_reductions from pandas.core.arrays._mixins import NDArrayBackedExtensionArray -from pandas.core.arrays.base import ExtensionArray, ExtensionOpsMixin +from pandas.core.arrays.base import ExtensionOpsMixin from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer from pandas.core.missing import backfill_1d, pad_1d @@ -189,10 +187,6 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "PandasArray def _from_factorized(cls, values, original) -> "PandasArray": return cls(values) - @classmethod - def _concat_same_type(cls, to_concat) -> "PandasArray": - return cls(np.concatenate(to_concat)) - def _from_backing_data(self, arr: np.ndarray) -> "PandasArray": return type(self)(arr) @@ -423,10 +417,6 @@ def to_numpy( return result - @doc(ExtensionArray.searchsorted) - def searchsorted(self, value, side="left", sorter=None): - return searchsorted(self.to_numpy(), value, side=side, sorter=sorter) - # ------------------------------------------------------------------------ # Ops From 2298225edd3ffc5e49b80bc9b04e99331d4bb4a5 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 6 Sep 2020 13:11:04 -0400 Subject: [PATCH 0684/1025] CLN: Separate transform tests (#36146) --- pandas/tests/frame/apply/test_frame_apply.py | 49 +------------ .../tests/frame/apply/test_frame_transform.py | 72 +++++++++++++++++++ pandas/tests/frame/common.py | 24 +++++++ .../tests/series/apply/test_series_apply.py | 31 +------- 
.../series/apply/test_series_transform.py | 59 +++++++++++++++ 5 files changed, 157 insertions(+), 78 deletions(-) create mode 100644 pandas/tests/frame/apply/test_frame_transform.py create mode 100644 pandas/tests/series/apply/test_series_transform.py diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 5a1e448beb40f..bc09501583e2c 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1,7 +1,6 @@ from collections import OrderedDict from datetime import datetime from itertools import chain -import operator import warnings import numpy as np @@ -14,6 +13,7 @@ import pandas._testing as tm from pandas.core.apply import frame_apply from pandas.core.base import SpecificationError +from pandas.tests.frame.common import zip_frames @pytest.fixture @@ -1058,25 +1058,6 @@ def test_consistency_for_boxed(self, box, int_frame_const_col): tm.assert_frame_equal(result, expected) -def zip_frames(frames, axis=1): - """ - take a list of frames, zip them together under the - assumption that these all have the first frames' index/columns. 
- - Returns - ------- - new_frame : DataFrame - """ - if axis == 1: - columns = frames[0].columns - zipped = [f.loc[:, c] for c in columns for f in frames] - return pd.concat(zipped, axis=1) - else: - index = frames[0].index - zipped = [f.loc[i, :] for i in index for f in frames] - return pd.DataFrame(zipped) - - class TestDataFrameAggregate: def test_agg_transform(self, axis, float_frame): other_axis = 1 if axis in {0, "index"} else 0 @@ -1087,16 +1068,10 @@ def test_agg_transform(self, axis, float_frame): f_sqrt = np.sqrt(float_frame) # ufunc - result = float_frame.transform(np.sqrt, axis=axis) expected = f_sqrt.copy() - tm.assert_frame_equal(result, expected) - result = float_frame.apply(np.sqrt, axis=axis) tm.assert_frame_equal(result, expected) - result = float_frame.transform(np.sqrt, axis=axis) - tm.assert_frame_equal(result, expected) - # list-like result = float_frame.apply([np.sqrt], axis=axis) expected = f_sqrt.copy() @@ -1110,9 +1085,6 @@ def test_agg_transform(self, axis, float_frame): ) tm.assert_frame_equal(result, expected) - result = float_frame.transform([np.sqrt], axis=axis) - tm.assert_frame_equal(result, expected) - # multiple items in list # these are in the order as if we are applying both # functions per series and then concatting @@ -1128,38 +1100,19 @@ def test_agg_transform(self, axis, float_frame): ) tm.assert_frame_equal(result, expected) - result = float_frame.transform([np.abs, "sqrt"], axis=axis) - tm.assert_frame_equal(result, expected) - def test_transform_and_agg_err(self, axis, float_frame): # cannot both transform and agg - msg = "transforms cannot produce aggregated results" - with pytest.raises(ValueError, match=msg): - float_frame.transform(["max", "min"], axis=axis) - msg = "cannot combine transform and aggregation operations" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): float_frame.agg(["max", "sqrt"], axis=axis) - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - 
float_frame.transform(["max", "sqrt"], axis=axis) - df = pd.DataFrame({"A": range(5), "B": 5}) def f(): with np.errstate(all="ignore"): df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis) - @pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) - def test_transform_method_name(self, method): - # GH 19760 - df = pd.DataFrame({"A": [-1, 2]}) - result = df.transform(method) - expected = operator.methodcaller(method)(df) - tm.assert_frame_equal(result, expected) - def test_demo(self): # demonstration tests df = pd.DataFrame({"A": range(5), "B": 5}) diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py new file mode 100644 index 0000000000000..3a345215482ed --- /dev/null +++ b/pandas/tests/frame/apply/test_frame_transform.py @@ -0,0 +1,72 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.frame.common import zip_frames + + +def test_agg_transform(axis, float_frame): + other_axis = 1 if axis in {0, "index"} else 0 + + with np.errstate(all="ignore"): + + f_abs = np.abs(float_frame) + f_sqrt = np.sqrt(float_frame) + + # ufunc + result = float_frame.transform(np.sqrt, axis=axis) + expected = f_sqrt.copy() + tm.assert_frame_equal(result, expected) + + result = float_frame.transform(np.sqrt, axis=axis) + tm.assert_frame_equal(result, expected) + + # list-like + expected = f_sqrt.copy() + if axis in {0, "index"}: + expected.columns = pd.MultiIndex.from_product( + [float_frame.columns, ["sqrt"]] + ) + else: + expected.index = pd.MultiIndex.from_product([float_frame.index, ["sqrt"]]) + result = float_frame.transform([np.sqrt], axis=axis) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as if we are applying both + # functions per series and then concatting + expected = zip_frames([f_abs, f_sqrt], axis=other_axis) + if axis in {0, "index"}: + 
expected.columns = pd.MultiIndex.from_product( + [float_frame.columns, ["absolute", "sqrt"]] + ) + else: + expected.index = pd.MultiIndex.from_product( + [float_frame.index, ["absolute", "sqrt"]] + ) + result = float_frame.transform([np.abs, "sqrt"], axis=axis) + tm.assert_frame_equal(result, expected) + + +def test_transform_and_agg_err(axis, float_frame): + # cannot both transform and agg + msg = "transforms cannot produce aggregated results" + with pytest.raises(ValueError, match=msg): + float_frame.transform(["max", "min"], axis=axis) + + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + float_frame.transform(["max", "sqrt"], axis=axis) + + +@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"]) +def test_transform_method_name(method): + # GH 19760 + df = pd.DataFrame({"A": [-1, 2]}) + result = df.transform(method) + expected = operator.methodcaller(method)(df) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py index 463a140972ab5..73e60ff389038 100644 --- a/pandas/tests/frame/common.py +++ b/pandas/tests/frame/common.py @@ -1,3 +1,8 @@ +from typing import List + +from pandas import DataFrame, concat + + def _check_mixed_float(df, dtype=None): # float16 are most likely to be upcasted to float32 dtypes = dict(A="float32", B="float32", C="float16", D="float64") @@ -29,3 +34,22 @@ def _check_mixed_int(df, dtype=None): assert df.dtypes["C"] == dtypes["C"] if dtypes.get("D"): assert df.dtypes["D"] == dtypes["D"] + + +def zip_frames(frames: List[DataFrame], axis: int = 1) -> DataFrame: + """ + take a list of frames, zip them together under the + assumption that these all have the first frames' index/columns. 
+ + Returns + ------- + new_frame : DataFrame + """ + if axis == 1: + columns = frames[0].columns + zipped = [f.loc[:, c] for c in columns for f in frames] + return concat(zipped, axis=1) + else: + index = frames[0].index + zipped = [f.loc[i, :] for i in index for f in frames] + return DataFrame(zipped) diff --git a/pandas/tests/series/apply/test_series_apply.py b/pandas/tests/series/apply/test_series_apply.py index 308398642895c..b948317f32062 100644 --- a/pandas/tests/series/apply/test_series_apply.py +++ b/pandas/tests/series/apply/test_series_apply.py @@ -209,25 +209,16 @@ def test_transform(self, string_series): f_abs = np.abs(string_series) # ufunc - result = string_series.transform(np.sqrt) expected = f_sqrt.copy() - tm.assert_series_equal(result, expected) - result = string_series.apply(np.sqrt) tm.assert_series_equal(result, expected) # list-like - result = string_series.transform([np.sqrt]) + result = string_series.apply([np.sqrt]) expected = f_sqrt.to_frame().copy() expected.columns = ["sqrt"] tm.assert_frame_equal(result, expected) - result = string_series.transform([np.sqrt]) - tm.assert_frame_equal(result, expected) - - result = string_series.transform(["sqrt"]) - tm.assert_frame_equal(result, expected) - # multiple items in list # these are in the order as if we are applying both functions per # series and then concatting @@ -236,10 +227,6 @@ def test_transform(self, string_series): result = string_series.apply([np.sqrt, np.abs]) tm.assert_frame_equal(result, expected) - result = string_series.transform(["sqrt", "abs"]) - expected.columns = ["sqrt", "abs"] - tm.assert_frame_equal(result, expected) - # dict, provide renaming expected = pd.concat([f_sqrt, f_abs], axis=1) expected.columns = ["foo", "bar"] @@ -250,19 +237,11 @@ def test_transform(self, string_series): def test_transform_and_agg_error(self, string_series): # we are trying to transform with an aggregator - msg = "transforms cannot produce aggregated results" - with 
pytest.raises(ValueError, match=msg): - string_series.transform(["min", "max"]) - msg = "cannot combine transform and aggregation" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): string_series.agg(["sqrt", "max"]) - with pytest.raises(ValueError, match=msg): - with np.errstate(all="ignore"): - string_series.transform(["sqrt", "max"]) - msg = "cannot perform both aggregation and transformation" with pytest.raises(ValueError, match=msg): with np.errstate(all="ignore"): @@ -463,14 +442,6 @@ def test_agg_cython_table_raises(self, series, func, expected): # e.g. Series('a b'.split()).cumprod() will raise series.agg(func) - def test_transform_none_to_type(self): - # GH34377 - df = pd.DataFrame({"a": [None]}) - - msg = "DataFrame constructor called with incompatible data and dtype" - with pytest.raises(TypeError, match=msg): - df.transform({"a": int}) - class TestSeriesMap: def test_map(self, datetime_series): diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py new file mode 100644 index 0000000000000..8bc3d2dc4d0db --- /dev/null +++ b/pandas/tests/series/apply/test_series_transform.py @@ -0,0 +1,59 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_transform(string_series): + # transforming functions + + with np.errstate(all="ignore"): + f_sqrt = np.sqrt(string_series) + f_abs = np.abs(string_series) + + # ufunc + result = string_series.transform(np.sqrt) + expected = f_sqrt.copy() + tm.assert_series_equal(result, expected) + + # list-like + result = string_series.transform([np.sqrt]) + expected = f_sqrt.to_frame().copy() + expected.columns = ["sqrt"] + tm.assert_frame_equal(result, expected) + + result = string_series.transform([np.sqrt]) + tm.assert_frame_equal(result, expected) + + result = string_series.transform(["sqrt"]) + tm.assert_frame_equal(result, expected) + + # multiple items in list + # these are in the order as 
if we are applying both functions per + # series and then concatting + expected = pd.concat([f_sqrt, f_abs], axis=1) + result = string_series.transform(["sqrt", "abs"]) + expected.columns = ["sqrt", "abs"] + tm.assert_frame_equal(result, expected) + + +def test_transform_and_agg_error(string_series): + # we are trying to transform with an aggregator + msg = "transforms cannot produce aggregated results" + with pytest.raises(ValueError, match=msg): + string_series.transform(["min", "max"]) + + msg = "cannot combine transform and aggregation operations" + with pytest.raises(ValueError, match=msg): + with np.errstate(all="ignore"): + string_series.transform(["sqrt", "max"]) + + +def test_transform_none_to_type(): + # GH34377 + df = pd.DataFrame({"a": [None]}) + + msg = "DataFrame constructor called with incompatible data and dtype" + with pytest.raises(TypeError, match=msg): + df.transform({"a": int}) From e53d53adb75277b8eb2fa4c527810e52ecb3cf0a Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sun, 6 Sep 2020 13:24:03 -0400 Subject: [PATCH 0685/1025] CLN: _wrap_applied_output (#36160) --- pandas/core/groupby/generic.py | 191 ++++++++++++++++----------------- 1 file changed, 91 insertions(+), 100 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 260e21b1f2593..72003eab24b29 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1192,113 +1192,104 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) - else: - key_index = self.grouper.result_index if self.as_index else None - - if isinstance(first_not_none, Series): - # this is to silence a DeprecationWarning - # TODO: Remove when default dtype of empty Series is object - kwargs = first_not_none._construct_axes_dict() - 
backup = create_series_with_explicit_dtype( - dtype_if_empty=object, **kwargs - ) - - values = [x if (x is not None) else backup for x in values] - v = values[0] - - if not isinstance(v, (np.ndarray, Index, Series)) and self.as_index: - # values are not series or array-like but scalars - # self._selection_name not passed through to Series as the - # result should not take the name of original selection - # of columns - return self.obj._constructor_sliced(values, index=key_index) + key_index = self.grouper.result_index if self.as_index else None + + if isinstance(first_not_none, Series): + # this is to silence a DeprecationWarning + # TODO: Remove when default dtype of empty Series is object + kwargs = first_not_none._construct_axes_dict() + backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs) + + values = [x if (x is not None) else backup for x in values] + + v = values[0] + + if not isinstance(v, (np.ndarray, Index, Series)) and self.as_index: + # values are not series or array-like but scalars + # self._selection_name not passed through to Series as the + # result should not take the name of original selection + # of columns + return self.obj._constructor_sliced(values, index=key_index) + + if isinstance(v, Series): + all_indexed_same = all_indexes_same((x.index for x in values)) + + # GH3596 + # provide a reduction (Frame -> Series) if groups are + # unique + if self.squeeze: + applied_index = self._selected_obj._get_axis(self.axis) + singular_series = len(values) == 1 and applied_index.nlevels == 1 + + # assign the name to this series + if singular_series: + values[0].name = keys[0] + + # GH2893 + # we have series in the values array, we want to + # produce a series: + # if any of the sub-series are not indexed the same + # OR we don't have a multi-index and we have only a + # single values + return self._concat_objects( + keys, values, not_indexed_same=not_indexed_same + ) + # still a series + # path added as of GH 5545 + elif 
all_indexed_same: + from pandas.core.reshape.concat import concat + + return concat(values) + + if not all_indexed_same: + # GH 8467 + return self._concat_objects(keys, values, not_indexed_same=True) + + # Combine values + # vstack+constructor is faster than concat and handles MI-columns + stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = v.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] else: - if isinstance(v, Series): - all_indexed_same = all_indexes_same((x.index for x in values)) - - # GH3596 - # provide a reduction (Frame -> Series) if groups are - # unique - if self.squeeze: - applied_index = self._selected_obj._get_axis(self.axis) - singular_series = ( - len(values) == 1 and applied_index.nlevels == 1 - ) - - # assign the name to this series - if singular_series: - values[0].name = keys[0] - - # GH2893 - # we have series in the values array, we want to - # produce a series: - # if any of the sub-series are not indexed the same - # OR we don't have a multi-index and we have only a - # single values - return self._concat_objects( - keys, values, not_indexed_same=not_indexed_same - ) - - # still a series - # path added as of GH 5545 - elif all_indexed_same: - from pandas.core.reshape.concat import concat - - return concat(values) - - if not all_indexed_same: - # GH 8467 - return self._concat_objects(keys, values, not_indexed_same=True) - - # Combine values - # vstack+constructor is faster than concat and handles MI-columns - stacked_values = np.vstack([np.asarray(v) for v in values]) - - if self.axis == 0: - index = key_index - columns = v.index.copy() - if columns.name is None: - # GH6124 - propagate name of Series when it's consistent - names = {v.name for v in values} - if len(names) == 1: - columns.name = list(names)[0] - else: - index = v.index - columns 
= key_index - stacked_values = stacked_values.T - - result = self.obj._constructor( - stacked_values, index=index, columns=columns - ) + index = v.index + columns = key_index + stacked_values = stacked_values.T - elif not self.as_index: - # We add grouping column below, so create a frame here - result = DataFrame( - values, index=key_index, columns=[self._selection] - ) - else: - # GH#1738: values is list of arrays of unequal lengths - # fall through to the outer else clause - # TODO: sure this is right? we used to do this - # after raising AttributeError above - return self.obj._constructor_sliced( - values, index=key_index, name=self._selection_name - ) + result = self.obj._constructor(stacked_values, index=index, columns=columns) - # if we have date/time like in the original, then coerce dates - # as we are stacking can easily have object dtypes here - so = self._selected_obj - if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): - result = _recast_datetimelike_result(result) - else: - result = result._convert(datetime=True) + elif not self.as_index: + # We add grouping column below, so create a frame here + result = DataFrame(values, index=key_index, columns=[self._selection]) + else: + # GH#1738: values is list of arrays of unequal lengths + # fall through to the outer else clause + # TODO: sure this is right? 
we used to do this + # after raising AttributeError above + return self.obj._constructor_sliced( + values, index=key_index, name=self._selection_name + ) + + # if we have date/time like in the original, then coerce dates + # as we are stacking can easily have object dtypes here + so = self._selected_obj + if so.ndim == 2 and so.dtypes.apply(needs_i8_conversion).any(): + result = _recast_datetimelike_result(result) + else: + result = result._convert(datetime=True) - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) + if not self.as_index: + self._insert_inaxis_grouper_inplace(result) - return self._reindex_output(result) + return self._reindex_output(result) def _transform_general( self, func, *args, engine="cython", engine_kwargs=None, **kwargs From e03d2245d15492ec05ebc641e89e6e3f943d7212 Mon Sep 17 00:00:00 2001 From: Alex Kirko Date: Sun, 6 Sep 2020 20:47:40 +0300 Subject: [PATCH 0686/1025] BUG: allow missing values in Index when calling Index.sort_values (#35604) --- doc/source/whatsnew/v1.2.0.rst | 3 +- pandas/conftest.py | 23 ++++++++ pandas/core/indexes/base.py | 27 ++++++++-- .../tests/indexes/interval/test_interval.py | 2 +- pandas/tests/indexes/period/test_ops.py | 16 ++++-- pandas/tests/indexes/test_common.py | 52 ++++++++++++++++++- 6 files changed, 112 insertions(+), 11 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ff9e803b4990a..b4fdbf9588ffe 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -270,8 +270,9 @@ Interval Indexing ^^^^^^^^ + - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) -- +- Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the 
end of the sort order. (:issue:`35584`) - Missing diff --git a/pandas/conftest.py b/pandas/conftest.py index 0878380d00837..5474005a63b8e 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -437,6 +437,29 @@ def index(request): index_fixture2 = index +@pytest.fixture(params=indices_dict.keys()) +def index_with_missing(request): + """ + Fixture for indices with missing values + """ + if request.param in ["int", "uint", "range", "empty", "repeats"]: + pytest.xfail("missing values not supported") + # GH 35538. Use deep copy to avoid illusive bug on np-dev + # Azure pipeline that writes into indices_dict despite copy + ind = indices_dict[request.param].copy(deep=True) + vals = ind.values + if request.param in ["tuples", "mi-with-dt64tz-level", "multi"]: + # For setting missing values in the top level of MultiIndex + vals = ind.tolist() + vals[0] = tuple([None]) + vals[0][1:] + vals[-1] = tuple([None]) + vals[-1][1:] + return MultiIndex.from_tuples(vals) + else: + vals[0] = None + vals[-1] = None + return type(ind)(vals) + + # ---------------------------------------------------------------- # Series' # ---------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 65b5dfb6df911..a1bc8a4659b24 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -88,7 +88,7 @@ import pandas.core.missing as missing from pandas.core.ops import get_op_result_name from pandas.core.ops.invalid import make_invalid_op -from pandas.core.sorting import ensure_key_mapped +from pandas.core.sorting import ensure_key_mapped, nargsort from pandas.core.strings import StringMethods from pandas.io.formats.printing import ( @@ -4443,7 +4443,11 @@ def asof_locs(self, where, mask): return result def sort_values( - self, return_indexer=False, ascending=True, key: Optional[Callable] = None + self, + return_indexer=False, + ascending=True, + na_position: str_t = "last", + key: Optional[Callable] = 
None, ): """ Return a sorted copy of the index. @@ -4457,6 +4461,12 @@ def sort_values( Should the indices that would sort the index be returned. ascending : bool, default True Should the index values be sorted in an ascending order. + na_position : {'first' or 'last'}, default 'last' + Argument 'first' puts NaNs at the beginning, 'last' puts NaNs at + the end. + + .. versionadded:: 1.2.0 + key : callable, optional If not None, apply the key function to the index values before sorting. This is similar to the `key` argument in the @@ -4497,9 +4507,16 @@ def sort_values( """ idx = ensure_key_mapped(self, key) - _as = idx.argsort() - if not ascending: - _as = _as[::-1] + # GH 35584. Sort missing values according to na_position kwarg + # ignore na_position for MutiIndex + if not isinstance(self, ABCMultiIndex): + _as = nargsort( + items=idx, ascending=ascending, na_position=na_position, key=key + ) + else: + _as = idx.argsort() + if not ascending: + _as = _as[::-1] sorted_index = self.take(_as) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index a20e542b1edd7..42849e0bbb5c7 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -618,7 +618,7 @@ def test_sort_values(self, closed): expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan]) tm.assert_index_equal(result, expected) - result = index.sort_values(ascending=False) + result = index.sort_values(ascending=False, na_position="first") expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)]) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py index e7dd76584d780..d1b34c315b682 100644 --- a/pandas/tests/indexes/period/test_ops.py +++ b/pandas/tests/indexes/period/test_ops.py @@ -174,9 +174,6 @@ def _check_freq(index, expected_index): ordered, indexer = idx.sort_values(return_indexer=True, 
ascending=False) tm.assert_index_equal(ordered, expected[::-1]) - - exp = np.array([2, 1, 3, 4, 0]) - tm.assert_numpy_array_equal(indexer, exp, check_dtype=False) _check_freq(ordered, idx) pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D") @@ -333,3 +330,16 @@ def test_freq_setter_deprecated(self): # warning for setter with pytest.raises(AttributeError, match="can't set attribute"): idx.freq = pd.offsets.Day() + + +@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)") +def test_order_stability_compat(): + # GH 35584. The new implementation of sort_values for Index.sort_values + # is stable when sorting in descending order. Datetime-like sort_values + # currently aren't stable. xfail should be removed after + # the implementations' behavior is synchronized (xref GH 35922) + pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A") + iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx") + ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False) + ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False) + tm.assert_numpy_array_equal(indexer1, indexer2) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index db260b71e7186..aa6b395176b06 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -13,7 +13,14 @@ from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion import pandas as pd -from pandas import CategoricalIndex, MultiIndex, RangeIndex +from pandas import ( + CategoricalIndex, + DatetimeIndex, + MultiIndex, + PeriodIndex, + RangeIndex, + TimedeltaIndex, +) import pandas._testing as tm @@ -391,3 +398,46 @@ def test_astype_preserves_name(self, index, dtype): assert result.names == index.names else: assert result.name == index.name + + +@pytest.mark.parametrize("na_position", [None, "middle"]) +def test_sort_values_invalid_na_position(index_with_missing, 
na_position): + if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # datetime-like indices will get na_position kwarg as part of + # synchronizing duplicate-sorting behavior, because we currently expect + # them, other indices, and Series to sort differently (xref 35922) + pytest.xfail("sort_values does not support na_position kwarg") + elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): + pytest.xfail("missing value sorting order not defined for index type") + + if na_position not in ["first", "last"]: + with pytest.raises( + ValueError, match=f"invalid na_position: {na_position}", + ): + index_with_missing.sort_values(na_position=na_position) + + +@pytest.mark.parametrize("na_position", ["first", "last"]) +def test_sort_values_with_missing(index_with_missing, na_position): + # GH 35584. Test that sort_values works with missing values, + # sort non-missing and place missing according to na_position + + if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): + # datetime-like indices will get na_position kwarg as part of + # synchronizing duplicate-sorting behavior, because we currently expect + # them, other indices, and Series to sort differently (xref 35922) + pytest.xfail("sort_values does not support na_position kwarg") + elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)): + pytest.xfail("missing value sorting order not defined for index type") + + missing_count = np.sum(index_with_missing.isna()) + not_na_vals = index_with_missing[index_with_missing.notna()].values + sorted_values = np.sort(not_na_vals) + if na_position == "first": + sorted_values = np.concatenate([[None] * missing_count, sorted_values]) + else: + sorted_values = np.concatenate([sorted_values, [None] * missing_count]) + expected = type(index_with_missing)(sorted_values) + + result = index_with_missing.sort_values(na_position=na_position) + tm.assert_index_equal(result, expected) From 
4c1e46d8ecb89f2854e2c8446b3fb0c90fd58c24 Mon Sep 17 00:00:00 2001 From: Honfung Wong Date: Mon, 7 Sep 2020 01:49:26 +0800 Subject: [PATCH 0687/1025] BUG: extra leading space in to_string when index=False (#36094) --- doc/source/whatsnew/v1.2.0.rst | 5 ++- pandas/io/formats/format.py | 28 +++++++++++----- pandas/tests/io/formats/test_format.py | 42 +++++++++++++++++++++--- pandas/tests/io/formats/test_to_latex.py | 22 ++++++------- 4 files changed, 71 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b4fdbf9588ffe..9a778acba4764 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -214,8 +214,6 @@ Performance improvements Bug fixes ~~~~~~~~~ -- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) -- Categorical ^^^^^^^^^^^ @@ -257,7 +255,7 @@ Conversion Strings ^^^^^^^ - +- Bug in :meth:`Series.to_string`, :meth:`DataFrame.to_string`, and :meth:`DataFrame.to_latex` adding a leading space when ``index=False`` (:issue:`24980`) - - @@ -315,6 +313,7 @@ Groupby/resample/rolling - Bug when subsetting columns on a :class:`~pandas.core.groupby.DataFrameGroupBy` (e.g. ``df.groupby('a')[['b']])``) would reset the attributes ``axis``, ``dropna``, ``group_keys``, ``level``, ``mutated``, ``sort``, and ``squeeze`` to their default values. 
(:issue:`9959`) - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) +- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) - Reshaping diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 2cc8e7ec906be..13fa8908ff450 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -345,6 +345,7 @@ def _get_formatted_values(self) -> List[str]: None, float_format=self.float_format, na_rep=self.na_rep, + leading_space=self.index, ) def to_string(self) -> str: @@ -960,6 +961,7 @@ def _format_col(self, i: int) -> List[str]: na_rep=self.na_rep, space=self.col_space.get(frame.columns[i]), decimal=self.decimal, + leading_space=self.index, ) def to_html( @@ -1111,7 +1113,7 @@ def format_array( space: Optional[Union[str, int]] = None, justify: str = "right", decimal: str = ".", - leading_space: Optional[bool] = None, + leading_space: Optional[bool] = True, quoting: Optional[int] = None, ) -> List[str]: """ @@ -1127,7 +1129,7 @@ def format_array( space justify decimal - leading_space : bool, optional + leading_space : bool, optional, default True Whether the array should be formatted with a leading space. When an array as a column of a Series or DataFrame, we do want the leading space to pad between columns. 
@@ -1194,7 +1196,7 @@ def __init__( decimal: str = ".", quoting: Optional[int] = None, fixed_width: bool = True, - leading_space: Optional[bool] = None, + leading_space: Optional[bool] = True, ): self.values = values self.digits = digits @@ -1397,9 +1399,11 @@ def format_values_with(float_format): float_format: Optional[FloatFormatType] if self.float_format is None: if self.fixed_width: - float_format = partial( - "{value: .{digits:d}f}".format, digits=self.digits - ) + if self.leading_space is True: + fmt_str = "{value: .{digits:d}f}" + else: + fmt_str = "{value:.{digits:d}f}" + float_format = partial(fmt_str.format, digits=self.digits) else: float_format = self.float_format else: @@ -1431,7 +1435,11 @@ def format_values_with(float_format): ).any() if has_small_values or (too_long and has_large_values): - float_format = partial("{value: .{digits:d}e}".format, digits=self.digits) + if self.leading_space is True: + fmt_str = "{value: .{digits:d}e}" + else: + fmt_str = "{value:.{digits:d}e}" + float_format = partial(fmt_str.format, digits=self.digits) formatted_values = format_values_with(float_format) return formatted_values @@ -1446,7 +1454,11 @@ def _format_strings(self) -> List[str]: class IntArrayFormatter(GenericArrayFormatter): def _format_strings(self) -> List[str]: - formatter = self.formatter or (lambda x: f"{x: d}") + if self.leading_space is False: + formatter_str = lambda x: f"{x:d}".format(x=x) + else: + formatter_str = lambda x: f"{x: d}".format(x=x) + formatter = self.formatter or formatter_str fmt_values = [formatter(x) for x in self.values] return fmt_values diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index 595b8c55c8ec7..a7ee10e198a52 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -1546,11 +1546,11 @@ def test_to_string_no_index(self): df_s = df.to_string(index=False) # Leading space is expected for positive numbers. 
- expected = " x y z\n 11 33 AAA\n 22 -44 " + expected = " x y z\n11 33 AAA\n22 -44 " assert df_s == expected df_s = df[["y", "x", "z"]].to_string(index=False) - expected = " y x z\n 33 11 AAA\n-44 22 " + expected = " y x z\n 33 11 AAA\n-44 22 " assert df_s == expected def test_to_string_line_width_no_index(self): @@ -1565,7 +1565,7 @@ def test_to_string_line_width_no_index(self): df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]}) df_s = df.to_string(line_width=1, index=False) - expected = " x \\\n 11 \n 22 \n 33 \n\n y \n 4 \n 5 \n 6 " + expected = " x \\\n11 \n22 \n33 \n\n y \n 4 \n 5 \n 6 " assert df_s == expected @@ -2269,7 +2269,7 @@ def test_to_string_without_index(self): # GH 11729 Test index=False option s = Series([1, 2, 3, 4]) result = s.to_string(index=False) - expected = " 1\n" + " 2\n" + " 3\n" + " 4" + expected = "1\n" + "2\n" + "3\n" + "4" assert result == expected def test_unicode_name_in_footer(self): @@ -3399,3 +3399,37 @@ def test_filepath_or_buffer_bad_arg_raises(float_frame, method): msg = "buf is not a file name and it has no write method" with pytest.raises(TypeError, match=msg): getattr(float_frame, method)(buf=object()) + + +@pytest.mark.parametrize( + "input_array, expected", + [ + ("a", "a"), + (["a", "b"], "a\nb"), + ([1, "a"], "1\na"), + (1, "1"), + ([0, -1], " 0\n-1"), + (1.0, "1.0"), + ([" a", " b"], " a\n b"), + ([".1", "1"], ".1\n 1"), + (["10", "-10"], " 10\n-10"), + ], +) +def test_format_remove_leading_space_series(input_array, expected): + # GH: 24980 + s = pd.Series(input_array).to_string(index=False) + assert s == expected + + +@pytest.mark.parametrize( + "input_array, expected", + [ + ({"A": ["a"]}, "A\na"), + ({"A": ["a", "b"], "B": ["c", "dd"]}, "A B\na c\nb dd"), + ({"A": ["a", 1], "B": ["aa", 1]}, "A B\na aa\n1 1"), + ], +) +def test_format_remove_leading_space_dataframe(input_array, expected): + # GH: 24980 + df = pd.DataFrame(input_array).to_string(index=False) + assert df == expected diff --git 
a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 96a9ed2b86cf4..9dfd851e91c65 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -50,10 +50,10 @@ def test_to_latex(self, float_frame): withoutindex_result = df.to_latex(index=False) withoutindex_expected = r"""\begin{tabular}{rl} \toprule - a & b \\ + a & b \\ \midrule - 1 & b1 \\ - 2 & b2 \\ + 1 & b1 \\ + 2 & b2 \\ \bottomrule \end{tabular} """ @@ -413,7 +413,7 @@ def test_to_latex_longtable(self): withoutindex_result = df.to_latex(index=False, longtable=True) withoutindex_expected = r"""\begin{longtable}{rl} \toprule - a & b \\ + a & b \\ \midrule \endhead \midrule @@ -423,8 +423,8 @@ def test_to_latex_longtable(self): \bottomrule \endlastfoot - 1 & b1 \\ - 2 & b2 \\ + 1 & b1 \\ + 2 & b2 \\ \end{longtable} """ @@ -663,8 +663,8 @@ def test_to_latex_no_header(self): withoutindex_result = df.to_latex(index=False, header=False) withoutindex_expected = r"""\begin{tabular}{rl} \toprule - 1 & b1 \\ - 2 & b2 \\ +1 & b1 \\ +2 & b2 \\ \bottomrule \end{tabular} """ @@ -690,10 +690,10 @@ def test_to_latex_specified_header(self): withoutindex_result = df.to_latex(header=["AA", "BB"], index=False) withoutindex_expected = r"""\begin{tabular}{rl} \toprule -AA & BB \\ +AA & BB \\ \midrule - 1 & b1 \\ - 2 & b2 \\ + 1 & b1 \\ + 2 & b2 \\ \bottomrule \end{tabular} """ From a876d80cd4d265cb10adf5f8216def955015776c Mon Sep 17 00:00:00 2001 From: Harsh Sharma <51477130+hs2361@users.noreply.github.com> Date: Mon, 7 Sep 2020 16:46:11 +0530 Subject: [PATCH 0688/1025] =?UTF-8?q?BUG:=20shows=20correct=20package=20na?= =?UTF-8?q?me=20when=20import=5Foptional=5Fdependency=20is=20ca=E2=80=A6?= =?UTF-8?q?=20(#36134)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/compat/_optional.py | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) 
diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 1e946d325ace1..da261907565a1 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -41,6 +41,7 @@ Bug fixes - Bug in :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returned incorrect year for certain dates (:issue:`36032`) - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) - Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) +- Bug in :meth:`import_optional_dependency` returning incorrect package names in cases where package name is different from import name (:issue:`35948`) .. --------------------------------------------------------------------------- diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 689c7c889ef66..40688a3978cfc 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -33,6 +33,19 @@ "numba": "0.46.0", } +# A mapping from import name to package name (on PyPI) for packages where +# these two names are different. + +INSTALL_MAPPING = { + "bs4": "beautifulsoup4", + "bottleneck": "Bottleneck", + "lxml.etree": "lxml", + "odf": "odfpy", + "pandas_gbq": "pandas-gbq", + "sqlalchemy": "SQLAlchemy", + "jinja2": "Jinja2", +} + def _get_version(module: types.ModuleType) -> str: version = getattr(module, "__version__", None) @@ -82,9 +95,13 @@ def import_optional_dependency( is False, or when the package's version is too old and `on_version` is ``'warn'``. """ + + package_name = INSTALL_MAPPING.get(name) + install_name = package_name if package_name is not None else name + msg = ( - f"Missing optional dependency '{name}'. {extra} " - f"Use pip or conda to install {name}." + f"Missing optional dependency '{install_name}'. {extra} " + f"Use pip or conda to install {install_name}." 
) try: module = importlib.import_module(name) From 2057b2d9c07f6423591a65ed3e4ce89322a95a66 Mon Sep 17 00:00:00 2001 From: ivanovmg <41443370+ivanovmg@users.noreply.github.com> Date: Tue, 8 Sep 2020 01:58:50 +0700 Subject: [PATCH 0689/1025] REF: simplify latex formatting (#35872) --- pandas/io/formats/format.py | 7 +- pandas/io/formats/latex.py | 778 +++++++++++++++++------ pandas/tests/io/formats/test_to_latex.py | 102 +++ 3 files changed, 694 insertions(+), 193 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 13fa8908ff450..f31e60a43e391 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -939,17 +939,18 @@ def to_latex( """ from pandas.io.formats.latex import LatexFormatter - return LatexFormatter( + latex_formatter = LatexFormatter( self, - column_format=column_format, longtable=longtable, + column_format=column_format, multicolumn=multicolumn, multicolumn_format=multicolumn_format, multirow=multirow, caption=caption, label=label, position=position, - ).get_result(buf=buf, encoding=encoding) + ) + return latex_formatter.get_result(buf=buf, encoding=encoding) def _format_col(self, i: int) -> List[str]: frame = self.tr_frame diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 715b8bbdf5672..8080d953da308 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -1,7 +1,8 @@ """ Module for formatting output data in Latex. """ -from typing import IO, List, Optional, Tuple +from abc import ABC, abstractmethod +from typing import IO, Iterator, List, Optional, Type import numpy as np @@ -10,56 +11,95 @@ from pandas.io.formats.format import DataFrameFormatter, TableFormatter -class LatexFormatter(TableFormatter): - """ - Used to render a DataFrame to a LaTeX tabular/longtable environment output. +class RowStringConverter(ABC): + r"""Converter for dataframe rows into LaTeX strings. 
Parameters ---------- formatter : `DataFrameFormatter` - column_format : str, default None - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' for 3 columns - longtable : boolean, default False - Use a longtable environment instead of tabular. + Instance of `DataFrameFormatter`. + multicolumn: bool, optional + Whether to use \multicolumn macro. + multicolumn_format: str, optional + Multicolumn format. + multirow: bool, optional + Whether to use \multirow macro. - See Also - -------- - HTMLFormatter """ def __init__( self, formatter: DataFrameFormatter, - column_format: Optional[str] = None, - longtable: bool = False, multicolumn: bool = False, multicolumn_format: Optional[str] = None, multirow: bool = False, - caption: Optional[str] = None, - label: Optional[str] = None, - position: Optional[str] = None, ): self.fmt = formatter self.frame = self.fmt.frame - self.bold_rows = self.fmt.bold_rows - self.column_format = column_format - self.longtable = longtable self.multicolumn = multicolumn self.multicolumn_format = multicolumn_format self.multirow = multirow - self.caption = caption - self.label = label - self.escape = self.fmt.escape - self.position = position - self._table_float = any(p is not None for p in (caption, label, position)) + self.clinebuf: List[List[int]] = [] + self.strcols = self._get_strcols() + self.strrows: List[List[str]] = ( + list(zip(*self.strcols)) # type: ignore[arg-type] + ) + + def get_strrow(self, row_num: int) -> str: + """Get string representation of the row.""" + row = self.strrows[row_num] + + is_multicol = ( + row_num < self.column_levels and self.fmt.header and self.multicolumn + ) + + is_multirow = ( + row_num >= self.header_levels + and self.fmt.index + and self.multirow + and self.index_levels > 1 + ) + + is_cline_maybe_required = is_multirow and row_num < len(self.strrows) - 1 + + crow = self._preprocess_row(row) + + if is_multicol: + crow = self._format_multicolumn(crow) + if is_multirow: + crow = 
self._format_multirow(crow, row_num) + + lst = [] + lst.append(" & ".join(crow)) + lst.append(" \\\\") + if is_cline_maybe_required: + cline = self._compose_cline(row_num, len(self.strcols)) + lst.append(cline) + return "".join(lst) + + @property + def _header_row_num(self) -> int: + """Number of rows in header.""" + return self.header_levels if self.fmt.header else 0 + + @property + def index_levels(self) -> int: + """Integer number of levels in index.""" + return self.frame.index.nlevels + + @property + def column_levels(self) -> int: + return self.frame.columns.nlevels + + @property + def header_levels(self) -> int: + nlevels = self.column_levels + if self.fmt.has_index_names and self.fmt.show_index_names: + nlevels += 1 + return nlevels - def write_result(self, buf: IO[str]) -> None: - """ - Render a DataFrame to a LaTeX tabular, longtable, or table/tabular - environment output. - """ - # string representation of the columns + def _get_strcols(self) -> List[List[str]]: + """String representation of the columns.""" if len(self.frame.columns) == 0 or len(self.frame.index) == 0: info_line = ( f"Empty {type(self.frame).__name__}\n" @@ -70,12 +110,6 @@ def write_result(self, buf: IO[str]) -> None: else: strcols = self.fmt._to_str_columns() - def get_col_type(dtype): - if issubclass(dtype.type, np.number): - return "r" - else: - return "l" - # reestablish the MultiIndex that has been joined by _to_str_column if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): out = self.frame.index.format( @@ -107,89 +141,19 @@ def pad_empties(x): # Get rid of old multiindex column and add new ones strcols = out + strcols[1:] + return strcols - if self.column_format is None: - dtypes = self.frame.dtypes._values - column_format = "".join(map(get_col_type, dtypes)) - if self.fmt.index: - index_format = "l" * self.frame.index.nlevels - column_format = index_format + column_format - elif not isinstance(self.column_format, str): # pragma: no cover - raise AssertionError( - 
f"column_format must be str or unicode, not {type(column_format)}" - ) + def _preprocess_row(self, row: List[str]) -> List[str]: + """Preprocess elements of the row.""" + if self.fmt.escape: + crow = _escape_symbols(row) else: - column_format = self.column_format - - self._write_tabular_begin(buf, column_format) - - buf.write("\\toprule\n") + crow = [x if x else "{}" for x in row] + if self.fmt.bold_rows and self.fmt.index: + crow = _convert_to_bold(crow, self.index_levels) + return crow - ilevels = self.frame.index.nlevels - clevels = self.frame.columns.nlevels - nlevels = clevels - if self.fmt.has_index_names and self.fmt.show_index_names: - nlevels += 1 - strrows = list(zip(*strcols)) - self.clinebuf: List[List[int]] = [] - - for i, row in enumerate(strrows): - if i == nlevels and self.fmt.header: - buf.write("\\midrule\n") # End of header - if self.longtable: - buf.write("\\endhead\n") - buf.write("\\midrule\n") - buf.write( - f"\\multicolumn{{{len(row)}}}{{r}}" - "{{Continued on next page}} \\\\\n" - ) - buf.write("\\midrule\n") - buf.write("\\endfoot\n\n") - buf.write("\\bottomrule\n") - buf.write("\\endlastfoot\n") - if self.escape: - # escape backslashes first - crow = [ - ( - x.replace("\\", "\\textbackslash ") - .replace("_", "\\_") - .replace("%", "\\%") - .replace("$", "\\$") - .replace("#", "\\#") - .replace("{", "\\{") - .replace("}", "\\}") - .replace("~", "\\textasciitilde ") - .replace("^", "\\textasciicircum ") - .replace("&", "\\&") - if (x and x != "{}") - else "{}" - ) - for x in row - ] - else: - crow = [x if x else "{}" for x in row] - if self.bold_rows and self.fmt.index: - # bold row labels - crow = [ - f"\\textbf{{{x}}}" - if j < ilevels and x.strip() not in ["", "{}"] - else x - for j, x in enumerate(crow) - ] - if i < clevels and self.fmt.header and self.multicolumn: - # sum up columns to multicolumns - crow = self._format_multicolumn(crow, ilevels) - if i >= nlevels and self.fmt.index and self.multirow and ilevels > 1: - # sum up rows 
to multirows - crow = self._format_multirow(crow, ilevels, i, strrows) - buf.write(" & ".join(crow)) - buf.write(" \\\\\n") - if self.multirow and i < len(strrows) - 1: - self._print_cline(buf, i, len(strcols)) - - self._write_tabular_end(buf) - - def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: + def _format_multicolumn(self, row: List[str]) -> List[str]: r""" Combine columns belonging to a group to a single multicolumn entry according to self.multicolumn_format @@ -199,7 +163,7 @@ def _format_multicolumn(self, row: List[str], ilevels: int) -> List[str]: will become \multicolumn{3}{l}{a} & b & \multicolumn{2}{l}{c} """ - row2 = list(row[:ilevels]) + row2 = row[: self.index_levels] ncol = 1 coltext = "" @@ -214,7 +178,7 @@ def append_col(): else: row2.append(coltext) - for c in row[ilevels:]: + for c in row[self.index_levels :]: # if next col has text, write the previous if c.strip(): if coltext: @@ -229,9 +193,7 @@ def append_col(): append_col() return row2 - def _format_multirow( - self, row: List[str], ilevels: int, i: int, rows: List[Tuple[str, ...]] - ) -> List[str]: + def _format_multirow(self, row: List[str], i: int) -> List[str]: r""" Check following rows, whether row should be a multirow @@ -241,10 +203,10 @@ def _format_multirow( b & 0 & \cline{1-2} b & 0 & """ - for j in range(ilevels): + for j in range(self.index_levels): if row[j].strip(): nrow = 1 - for r in rows[i + 1 :]: + for r in self.strrows[i + 1 :]: if not r[j].strip(): nrow += 1 else: @@ -256,88 +218,524 @@ def _format_multirow( self.clinebuf.append([i + nrow - 1, j + 1]) return row - def _print_cline(self, buf: IO[str], i: int, icol: int) -> None: + def _compose_cline(self, i: int, icol: int) -> str: """ - Print clines after multirow-blocks are finished. + Create clines after multirow-blocks are finished. 
""" + lst = [] for cl in self.clinebuf: if cl[0] == i: - buf.write(f"\\cline{{{cl[1]:d}-{icol:d}}}\n") - # remove entries that have been written to buffer - self.clinebuf = [x for x in self.clinebuf if x[0] != i] + lst.append(f"\n\\cline{{{cl[1]:d}-{icol:d}}}") + # remove entries that have been written to buffer + self.clinebuf = [x for x in self.clinebuf if x[0] != i] + return "".join(lst) + + +class RowStringIterator(RowStringConverter): + """Iterator over rows of the header or the body of the table.""" + + @abstractmethod + def __iter__(self) -> Iterator[str]: + """Iterate over LaTeX string representations of rows.""" + + +class RowHeaderIterator(RowStringIterator): + """Iterator for the table header rows.""" + + def __iter__(self) -> Iterator[str]: + for row_num in range(len(self.strrows)): + if row_num < self._header_row_num: + yield self.get_strrow(row_num) + + +class RowBodyIterator(RowStringIterator): + """Iterator for the table body rows.""" + + def __iter__(self) -> Iterator[str]: + for row_num in range(len(self.strrows)): + if row_num >= self._header_row_num: + yield self.get_strrow(row_num) - def _write_tabular_begin(self, buf, column_format: str): - """ - Write the beginning of a tabular environment or - nested table/tabular environments including caption and label. + +class TableBuilderAbstract(ABC): + """ + Abstract table builder producing string representation of LaTeX table. + + Parameters + ---------- + formatter : `DataFrameFormatter` + Instance of `DataFrameFormatter`. + column_format: str, optional + Column format, for example, 'rcl' for three columns. + multicolumn: bool, optional + Use multicolumn to enhance MultiIndex columns. + multicolumn_format: str, optional + The alignment for multicolumns, similar to column_format. + multirow: bool, optional + Use multirow to enhance MultiIndex rows. + caption: str, optional + Table caption. + label: str, optional + LaTeX label. + position: str, optional + Float placement specifier, for example, 'htb'. 
+ """ + + def __init__( + self, + formatter: DataFrameFormatter, + column_format: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + position: Optional[str] = None, + ): + self.fmt = formatter + self.column_format = column_format + self.multicolumn = multicolumn + self.multicolumn_format = multicolumn_format + self.multirow = multirow + self.caption = caption + self.label = label + self.position = position + + def get_result(self) -> str: + """String representation of LaTeX table.""" + elements = [ + self.env_begin, + self.top_separator, + self.header, + self.middle_separator, + self.env_body, + self.bottom_separator, + self.env_end, + ] + result = "\n".join([item for item in elements if item]) + trailing_newline = "\n" + result += trailing_newline + return result + + @property + @abstractmethod + def env_begin(self) -> str: + """Beginning of the environment.""" + + @property + @abstractmethod + def top_separator(self) -> str: + """Top level separator.""" + + @property + @abstractmethod + def header(self) -> str: + """Header lines.""" + + @property + @abstractmethod + def middle_separator(self) -> str: + """Middle level separator.""" + + @property + @abstractmethod + def env_body(self) -> str: + """Environment body.""" + + @property + @abstractmethod + def bottom_separator(self) -> str: + """Bottom level separator.""" + + @property + @abstractmethod + def env_end(self) -> str: + """End of the environment.""" + + +class GenericTableBuilder(TableBuilderAbstract): + """Table builder producing string representation of LaTeX table.""" + + @property + def header(self) -> str: + iterator = self._create_row_iterator(over="header") + return "\n".join(list(iterator)) + + @property + def top_separator(self) -> str: + return "\\toprule" + + @property + def middle_separator(self) -> str: + return "\\midrule" if self._is_separator_required() 
else "" + + @property + def env_body(self) -> str: + iterator = self._create_row_iterator(over="body") + return "\n".join(list(iterator)) + + def _is_separator_required(self) -> bool: + return bool(self.header and self.env_body) + + @property + def _position_macro(self) -> str: + r"""Position macro, extracted from self.position, like [h].""" + return f"[{self.position}]" if self.position else "" + + @property + def _caption_macro(self) -> str: + r"""Caption macro, extracted from self.caption, like \caption{cap}.""" + return f"\\caption{{{self.caption}}}" if self.caption else "" + + @property + def _label_macro(self) -> str: + r"""Label macro, extracted from self.label, like \label{ref}.""" + return f"\\label{{{self.label}}}" if self.label else "" + + def _create_row_iterator(self, over: str) -> RowStringIterator: + """Create iterator over header or body of the table. Parameters ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. - column_format : str - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' - for 3 columns + over : {'body', 'header'} + Over what to iterate. + + Returns + ------- + RowStringIterator + Iterator over body or header. 
""" - if self._table_float: - # then write output in a nested table/tabular or longtable environment - if self.caption is None: - caption_ = "" - else: - caption_ = f"\n\\caption{{{self.caption}}}" + iterator_kind = self._select_iterator(over) + return iterator_kind( + formatter=self.fmt, + multicolumn=self.multicolumn, + multicolumn_format=self.multicolumn_format, + multirow=self.multirow, + ) + + def _select_iterator(self, over: str) -> Type[RowStringIterator]: + """Select proper iterator over table rows.""" + if over == "header": + return RowHeaderIterator + elif over == "body": + return RowBodyIterator + else: + msg = f"'over' must be either 'header' or 'body', but {over} was provided" + raise ValueError(msg) + + +class LongTableBuilder(GenericTableBuilder): + """Concrete table builder for longtable. + + >>> from pandas import DataFrame + >>> from pandas.io.formats import format as fmt + >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + >>> formatter = fmt.DataFrameFormatter(df) + >>> builder = LongTableBuilder(formatter, caption='caption', label='lab', + ... 
column_format='lrl') + >>> table = builder.get_result() + >>> print(table) + \\begin{longtable}{lrl} + \\caption{caption} + \\label{lab}\\\\ + \\toprule + {} & a & b \\\\ + \\midrule + \\endhead + \\midrule + \\multicolumn{3}{r}{{Continued on next page}} \\\\ + \\midrule + \\endfoot + + \\bottomrule + \\endlastfoot + 0 & 1 & b1 \\\\ + 1 & 2 & b2 \\\\ + \\end{longtable} + + """ - if self.label is None: - label_ = "" - else: - label_ = f"\n\\label{{{self.label}}}" + @property + def env_begin(self) -> str: + first_row = ( + f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}" + ) + elements = [first_row, f"{self._caption_and_label()}"] + return "\n".join([item for item in elements if item]) + + def _caption_and_label(self) -> str: + if self.caption or self.label: + double_backslash = "\\\\" + elements = [f"{self._caption_macro}", f"{self._label_macro}"] + caption_and_label = "\n".join([item for item in elements if item]) + caption_and_label += double_backslash + return caption_and_label + else: + return "" + + @property + def middle_separator(self) -> str: + iterator = self._create_row_iterator(over="header") + elements = [ + "\\midrule", + "\\endhead", + "\\midrule", + f"\\multicolumn{{{len(iterator.strcols)}}}{{r}}" + "{{Continued on next page}} \\\\", + "\\midrule", + "\\endfoot\n", + "\\bottomrule", + "\\endlastfoot", + ] + if self._is_separator_required(): + return "\n".join(elements) + return "" + + @property + def bottom_separator(self) -> str: + return "" + + @property + def env_end(self) -> str: + return "\\end{longtable}" + + +class RegularTableBuilder(GenericTableBuilder): + """Concrete table builder for regular table. + + >>> from pandas import DataFrame + >>> from pandas.io.formats import format as fmt + >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + >>> formatter = fmt.DataFrameFormatter(df) + >>> builder = RegularTableBuilder(formatter, caption='caption', label='lab', + ... 
column_format='lrc') + >>> table = builder.get_result() + >>> print(table) + \\begin{table} + \\centering + \\caption{caption} + \\label{lab} + \\begin{tabular}{lrc} + \\toprule + {} & a & b \\\\ + \\midrule + 0 & 1 & b1 \\\\ + 1 & 2 & b2 \\\\ + \\bottomrule + \\end{tabular} + \\end{table} + + """ - if self.position is None: - position_ = "" - else: - position_ = f"[{self.position}]" + @property + def env_begin(self) -> str: + elements = [ + f"\\begin{{table}}{self._position_macro}", + "\\centering", + f"{self._caption_macro}", + f"{self._label_macro}", + f"\\begin{{tabular}}{{{self.column_format}}}", + ] + return "\n".join([item for item in elements if item]) + + @property + def bottom_separator(self) -> str: + return "\\bottomrule" + + @property + def env_end(self) -> str: + return "\n".join(["\\end{tabular}", "\\end{table}"]) + + +class TabularBuilder(GenericTableBuilder): + """Concrete table builder for tabular environment. + + >>> from pandas import DataFrame + >>> from pandas.io.formats import format as fmt + >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + >>> formatter = fmt.DataFrameFormatter(df) + >>> builder = TabularBuilder(formatter, column_format='lrc') + >>> table = builder.get_result() + >>> print(table) + \\begin{tabular}{lrc} + \\toprule + {} & a & b \\\\ + \\midrule + 0 & 1 & b1 \\\\ + 1 & 2 & b2 \\\\ + \\bottomrule + \\end{tabular} + + """ - if self.longtable: - table_ = f"\\begin{{longtable}}{position_}{{{column_format}}}" - tabular_ = "\n" - else: - table_ = f"\\begin{{table}}{position_}\n\\centering" - tabular_ = f"\n\\begin{{tabular}}{{{column_format}}}\n" - - if self.longtable and (self.caption is not None or self.label is not None): - # a double-backslash is required at the end of the line - # as discussed here: - # https://tex.stackexchange.com/questions/219138 - backlash_ = "\\\\" - else: - backlash_ = "" - buf.write(f"{table_}{caption_}{label_}{backlash_}{tabular_}") - else: - if self.longtable: - tabletype_ = "longtable" - else: 
- tabletype_ = "tabular" - buf.write(f"\\begin{{{tabletype_}}}{{{column_format}}}\n") + @property + def env_begin(self) -> str: + return f"\\begin{{tabular}}{{{self.column_format}}}" + + @property + def bottom_separator(self) -> str: + return "\\bottomrule" + + @property + def env_end(self) -> str: + return "\\end{tabular}" + + +class LatexFormatter(TableFormatter): + """ + Used to render a DataFrame to a LaTeX tabular/longtable environment output. + + Parameters + ---------- + formatter : `DataFrameFormatter` + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' for 3 columns + + See Also + -------- + HTMLFormatter + """ + + def __init__( + self, + formatter: DataFrameFormatter, + longtable: bool = False, + column_format: Optional[str] = None, + multicolumn: bool = False, + multicolumn_format: Optional[str] = None, + multirow: bool = False, + caption: Optional[str] = None, + label: Optional[str] = None, + position: Optional[str] = None, + ): + self.fmt = formatter + self.frame = self.fmt.frame + self.longtable = longtable + self.column_format = column_format # type: ignore[assignment] + self.multicolumn = multicolumn + self.multicolumn_format = multicolumn_format + self.multirow = multirow + self.caption = caption + self.label = label + self.position = position - def _write_tabular_end(self, buf): + def write_result(self, buf: IO[str]) -> None: """ - Write the end of a tabular environment or nested table/tabular - environment. + Render a DataFrame to a LaTeX tabular, longtable, or table/tabular + environment output. + """ + table_string = self.builder.get_result() + buf.write(table_string) - Parameters - ---------- - buf : string or file handle - File path or object. If not specified, the result is returned as - a string. + @property + def builder(self) -> TableBuilderAbstract: + """Concrete table builder. 
+ Returns + ------- + TableBuilder """ + builder = self._select_builder() + return builder( + formatter=self.fmt, + column_format=self.column_format, + multicolumn=self.multicolumn, + multicolumn_format=self.multicolumn_format, + multirow=self.multirow, + caption=self.caption, + label=self.label, + position=self.position, + ) + + def _select_builder(self) -> Type[TableBuilderAbstract]: + """Select proper table builder.""" if self.longtable: - buf.write("\\end{longtable}\n") + return LongTableBuilder + if any([self.caption, self.label, self.position]): + return RegularTableBuilder + return TabularBuilder + + @property + def column_format(self) -> str: + """Column format.""" + return self._column_format + + @column_format.setter + def column_format(self, input_column_format: Optional[str]) -> None: + """Setter for column format.""" + if input_column_format is None: + self._column_format = ( + self._get_index_format() + self._get_column_format_based_on_dtypes() + ) + elif not isinstance(input_column_format, str): + raise ValueError( + f"column_format must be str or unicode, " + f"not {type(input_column_format)}" + ) else: - buf.write("\\bottomrule\n") - buf.write("\\end{tabular}\n") - if self._table_float: - buf.write("\\end{table}\n") - else: - pass + self._column_format = input_column_format + + def _get_column_format_based_on_dtypes(self) -> str: + """Get column format based on data type. + + Right alignment for numbers and left - for strings. + """ + + def get_col_type(dtype): + if issubclass(dtype.type, np.number): + return "r" + return "l" + + dtypes = self.frame.dtypes._values + return "".join(map(get_col_type, dtypes)) + + def _get_index_format(self) -> str: + """Get index column format.""" + return "l" * self.frame.index.nlevels if self.fmt.index else "" + + +def _escape_symbols(row: List[str]) -> List[str]: + """Carry out string replacements for special symbols. + + Parameters + ---------- + row : list + List of string, that may contain special symbols. 
+ + Returns + ------- + list + list of strings with the special symbols replaced. + """ + return [ + ( + x.replace("\\", "\\textbackslash ") + .replace("_", "\\_") + .replace("%", "\\%") + .replace("$", "\\$") + .replace("#", "\\#") + .replace("{", "\\{") + .replace("}", "\\}") + .replace("~", "\\textasciitilde ") + .replace("^", "\\textasciicircum ") + .replace("&", "\\&") + if (x and x != "{}") + else "{}" + ) + for x in row + ] + + +def _convert_to_bold(crow: List[str], ilevels: int) -> List[str]: + """Convert elements in ``crow`` to bold.""" + return [ + f"\\textbf{{{x}}}" if j < ilevels and x.strip() not in ["", "{}"] else x + for j, x in enumerate(crow) + ] + + +if __name__ == "__main__": + import doctest + + doctest.testmod() diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 9dfd851e91c65..a98644250b328 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -7,6 +7,14 @@ from pandas import DataFrame, Series import pandas._testing as tm +from pandas.io.formats.format import DataFrameFormatter +from pandas.io.formats.latex import ( + RegularTableBuilder, + RowBodyIterator, + RowHeaderIterator, + RowStringConverter, +) + class TestToLatex: def test_to_latex_filename(self, float_frame): @@ -60,6 +68,16 @@ def test_to_latex(self, float_frame): assert withoutindex_result == withoutindex_expected + @pytest.mark.parametrize( + "bad_column_format", + [5, 1.2, ["l", "r"], ("r", "c"), {"r", "c", "l"}, dict(a="r", b="l")], + ) + def test_to_latex_bad_column_format(self, bad_column_format): + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + msg = r"column_format must be str or unicode" + with pytest.raises(ValueError, match=msg): + df.to_latex(column_format=bad_column_format) + def test_to_latex_format(self, float_frame): # GH Bug #9402 float_frame.to_latex(column_format="ccc") @@ -930,3 +948,87 @@ def test_to_latex_multindex_header(self): \end{tabular} """ assert 
observed == expected + + +class TestTableBuilder: + @pytest.fixture + def dataframe(self): + return DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + @pytest.fixture + def table_builder(self, dataframe): + return RegularTableBuilder(formatter=DataFrameFormatter(dataframe)) + + def test_create_row_iterator(self, table_builder): + iterator = table_builder._create_row_iterator(over="header") + assert isinstance(iterator, RowHeaderIterator) + + def test_create_body_iterator(self, table_builder): + iterator = table_builder._create_row_iterator(over="body") + assert isinstance(iterator, RowBodyIterator) + + def test_create_body_wrong_kwarg_raises(self, table_builder): + with pytest.raises(ValueError, match="must be either 'header' or 'body'"): + table_builder._create_row_iterator(over="SOMETHING BAD") + + +class TestRowStringConverter: + @pytest.mark.parametrize( + "row_num, expected", + [ + (0, r"{} & Design & ratio & xy \\"), + (1, r"0 & 1 & 4 & 10 \\"), + (2, r"1 & 2 & 5 & 11 \\"), + ], + ) + def test_get_strrow_normal_without_escape(self, row_num, expected): + df = DataFrame({r"Design": [1, 2, 3], r"ratio": [4, 5, 6], r"xy": [10, 11, 12]}) + row_string_converter = RowStringConverter( + formatter=DataFrameFormatter(df, escape=True), + ) + assert row_string_converter.get_strrow(row_num=row_num) == expected + + @pytest.mark.parametrize( + "row_num, expected", + [ + (0, r"{} & Design \# & ratio, \% & x\&y \\"), + (1, r"0 & 1 & 4 & 10 \\"), + (2, r"1 & 2 & 5 & 11 \\"), + ], + ) + def test_get_strrow_normal_with_escape(self, row_num, expected): + df = DataFrame( + {r"Design #": [1, 2, 3], r"ratio, %": [4, 5, 6], r"x&y": [10, 11, 12]} + ) + row_string_converter = RowStringConverter( + formatter=DataFrameFormatter(df, escape=True), + ) + assert row_string_converter.get_strrow(row_num=row_num) == expected + + @pytest.mark.parametrize( + "row_num, expected", + [ + (0, r"{} & \multicolumn{2}{r}{c1} & \multicolumn{2}{r}{c2} & c3 \\"), + (1, r"{} & 0 & 1 & 0 & 1 & 0 \\"), + (2, r"0 
& 0 & 5 & 0 & 5 & 0 \\"), + ], + ) + def test_get_strrow_multindex_multicolumn(self, row_num, expected): + df = DataFrame( + { + ("c1", 0): {x: x for x in range(5)}, + ("c1", 1): {x: x + 5 for x in range(5)}, + ("c2", 0): {x: x for x in range(5)}, + ("c2", 1): {x: x + 5 for x in range(5)}, + ("c3", 0): {x: x for x in range(5)}, + } + ) + + row_string_converter = RowStringConverter( + formatter=DataFrameFormatter(df), + multicolumn=True, + multicolumn_format="r", + multirow=True, + ) + + assert row_string_converter.get_strrow(row_num=row_num) == expected From 0251dd895bab7bbc98c2bcc9eb2afdb5b7d68393 Mon Sep 17 00:00:00 2001 From: Jonathan Shreckengost Date: Mon, 7 Sep 2020 15:04:42 -0400 Subject: [PATCH 0690/1025] Comma cleanup (#36168) --- pandas/tests/indexing/test_iloc.py | 2 +- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/indexing/test_loc.py | 33 +++++++------------- pandas/tests/internals/test_internals.py | 8 ++--- pandas/tests/io/formats/test_css.py | 12 +++---- pandas/tests/io/formats/test_info.py | 12 +++---- pandas/tests/io/json/test_compression.py | 2 +- pandas/tests/io/json/test_pandas.py | 10 ++---- pandas/tests/io/parser/test_c_parser_only.py | 4 +-- pandas/tests/io/parser/test_parse_dates.py | 4 +-- pandas/tests/io/parser/test_usecols.py | 2 +- 11 files changed, 34 insertions(+), 57 deletions(-) diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 4fae01ec710fd..bfb62835add93 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -56,7 +56,7 @@ def test_is_scalar_access(self): assert ser.iloc._is_scalar_access((1,)) df = ser.to_frame() - assert df.iloc._is_scalar_access((1, 0,)) + assert df.iloc._is_scalar_access((1, 0)) def test_iloc_exceeds_bounds(self): diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index a080c5d169215..ca8a3ddc95575 100644 --- a/pandas/tests/indexing/test_indexing.py +++ 
b/pandas/tests/indexing/test_indexing.py @@ -1004,7 +1004,7 @@ def test_extension_array_cross_section(): def test_extension_array_cross_section_converts(): # all numeric columns -> numeric series df = pd.DataFrame( - {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"], + {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"] ) result = df.loc["a"] expected = pd.Series([1, 1], dtype="Int64", index=["A", "B"], name="a") diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 193800fae751f..e42d9679464d8 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -29,13 +29,11 @@ def test_loc_getitem_label_out_of_range(self): # out of range label self.check_result( - "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError, + "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError ) self.check_result("loc", "f", typs=["floats"], fails=KeyError) self.check_result("loc", "f", typs=["floats"], fails=KeyError) - self.check_result( - "loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError, - ) + self.check_result("loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError) self.check_result("loc", 20, typs=["labels"], fails=KeyError) self.check_result("loc", 20, typs=["ts"], axes=0, fails=KeyError) self.check_result("loc", 20, typs=["floats"], axes=0, fails=KeyError) @@ -46,26 +44,24 @@ def test_loc_getitem_label_list(self): pass def test_loc_getitem_label_list_with_missing(self): + self.check_result("loc", [0, 1, 2], typs=["empty"], fails=KeyError) self.check_result( - "loc", [0, 1, 2], typs=["empty"], fails=KeyError, - ) - self.check_result( - "loc", [0, 2, 10], typs=["ints", "uints", "floats"], axes=0, fails=KeyError, + "loc", [0, 2, 10], typs=["ints", "uints", "floats"], axes=0, fails=KeyError ) self.check_result( - "loc", [3, 6, 7], typs=["ints", "uints", "floats"], axes=1, fails=KeyError, + "loc", [3, 6, 7], 
typs=["ints", "uints", "floats"], axes=1, fails=KeyError ) # GH 17758 - MultiIndex and missing keys self.check_result( - "loc", [(1, 3), (1, 4), (2, 5)], typs=["multi"], axes=0, fails=KeyError, + "loc", [(1, 3), (1, 4), (2, 5)], typs=["multi"], axes=0, fails=KeyError ) def test_loc_getitem_label_list_fails(self): # fails self.check_result( - "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, fails=KeyError, + "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, fails=KeyError ) def test_loc_getitem_label_array_like(self): @@ -95,18 +91,14 @@ def test_loc_getitem_label_slice(self): ) self.check_result( - "loc", slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError, + "loc", slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError ) - self.check_result( - "loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError, - ) - self.check_result( - "loc", slice(2, 8), typs=["mixed"], axes=1, fails=KeyError, - ) + self.check_result("loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError) + self.check_result("loc", slice(2, 8), typs=["mixed"], axes=1, fails=KeyError) self.check_result( - "loc", slice(2, 4, 2), typs=["mixed"], axes=0, fails=TypeError, + "loc", slice(2, 4, 2), typs=["mixed"], axes=0, fails=TypeError ) def test_setitem_from_duplicate_axis(self): @@ -669,8 +661,7 @@ def test_loc_setitem_with_scalar_index(self, indexer, value): (1, ["A", "B", "C"]), np.array([7, 8, 9], dtype=np.int64), pd.DataFrame( - [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], - columns=["A", "B", "C"], + [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], columns=["A", "B", "C"] ), ), ( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 06ccdd2484a2a..1d73d1e35728b 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -892,16 +892,16 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): fill_value, ) assert_reindex_indexer_is_ok( - mgr, ax, 
mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value, + mgr, ax, mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value ) assert_reindex_indexer_is_ok( - mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value, + mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value ) assert_reindex_indexer_is_ok( mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value ) assert_reindex_indexer_is_ok( - mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value, + mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value ) assert_reindex_indexer_is_ok( mgr, @@ -913,7 +913,7 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value): if mgr.shape[ax] >= 3: assert_reindex_indexer_is_ok( - mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value, + mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value ) diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index 9383f86e335fa..785904fafd31a 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -99,11 +99,11 @@ def test_css_side_shorthands(shorthand, expansions): top, right, bottom, left = expansions assert_resolves( - f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}, + f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"} ) assert_resolves( - f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}, + f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"} ) assert_resolves( @@ -189,9 +189,7 @@ def test_css_absolute_font_size(size, relative_to, resolved): inherited = None else: inherited = {"font-size": relative_to} - assert_resolves( - f"font-size: {size}", {"font-size": resolved}, inherited=inherited, - ) + assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited) @pytest.mark.parametrize( @@ -225,6 +223,4 @@ def test_css_relative_font_size(size, 
relative_to, resolved): inherited = None else: inherited = {"font-size": relative_to} - assert_resolves( - f"font-size: {size}", {"font-size": resolved}, inherited=inherited, - ) + assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited) diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py index 877bd1650ae60..7000daeb9b575 100644 --- a/pandas/tests/io/formats/test_info.py +++ b/pandas/tests/io/formats/test_info.py @@ -299,7 +299,7 @@ def test_info_memory_usage(): DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True) DataFrame(1, index=["a"], columns=["A"]).index.nbytes df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"], + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] ) df.index.nbytes df.memory_usage(index=True) @@ -336,7 +336,7 @@ def test_info_memory_usage_deep_pypy(): @pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design") def test_usage_via_getsizeof(): df = DataFrame( - data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"], + data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"] ) mem = df.memory_usage(deep=True).sum() # sys.getsizeof will call the .memory_usage with @@ -359,16 +359,14 @@ def test_info_memory_usage_qualified(): buf = StringIO() df = DataFrame( - 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]), + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]) ) df.info(buf=buf) assert "+" not in buf.getvalue() buf = StringIO() df = DataFrame( - 1, - columns=list("ab"), - index=MultiIndex.from_product([range(3), ["foo", "bar"]]), + 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]]) ) df.info(buf=buf) assert "+" in buf.getvalue() @@ -384,7 +382,7 @@ def memory_usage(f): N = 100 M = len(uppercase) index = MultiIndex.from_product( - [list(uppercase), date_range("20160101", 
periods=N)], names=["id", "date"], + [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"] ) df = DataFrame({"value": np.random.randn(N * M)}, index=index) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index c0e3220454bf1..a41af9886c617 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -45,7 +45,7 @@ def test_with_s3_url(compression, s3_resource, s3so): s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) roundtripped_df = pd.read_json( - "s3://pandas-test/test-1", compression=compression, storage_options=s3so, + "s3://pandas-test/test-1", compression=compression, storage_options=s3so ) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 59d64e1a6e909..13152f01abb04 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -745,11 +745,7 @@ def test_reconstruction_index(self): def test_path(self, float_frame, int_frame, datetime_frame): with tm.ensure_clean("test.json") as path: - for df in [ - float_frame, - int_frame, - datetime_frame, - ]: + for df in [float_frame, int_frame, datetime_frame]: df.to_json(path) read_json(path) @@ -1706,9 +1702,7 @@ def test_to_s3(self, s3_resource, s3so): # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) - df.to_json( - f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so, - ) + df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so) timeout = 5 while True: if target_file in ( diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 50179fc1ec4b8..50d5fb3e49c2a 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -646,9 +646,7 @@ def test_1000_sep_with_decimal( 
tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "float_precision", [None, "high", "round_trip"], -) +@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) @pytest.mark.parametrize( "value,expected", [ diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index ed947755e3419..833186b69c63b 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1439,7 +1439,7 @@ def test_parse_timezone(all_parsers): end="2018-01-04 09:05:00", freq="1min", tz=pytz.FixedOffset(540), - ), + ) ), freq=None, ) @@ -1553,5 +1553,5 @@ def test_missing_parse_dates_column_raises( msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" with pytest.raises(ValueError, match=msg): parser.read_csv( - content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates, + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates ) diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py index d4e049cc3fcc2..7e9c9866a666d 100644 --- a/pandas/tests/io/parser/test_usecols.py +++ b/pandas/tests/io/parser/test_usecols.py @@ -199,7 +199,7 @@ def test_usecols_with_whitespace(all_parsers): # Column selection by index. ([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])), # Column selection by name. 
- (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),), + (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])), ], ) def test_usecols_with_integer_like_header(all_parsers, usecols, expected): From acd173fe94b9edd7387d726ede4ab282eeb61c2a Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 7 Sep 2020 20:05:50 +0100 Subject: [PATCH 0691/1025] TST: test_datetime64_factorize on 32bit (#36192) --- pandas/tests/test_algos.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ec7413514d430..a2c2ae22a0b62 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -256,7 +256,7 @@ def test_datetime64_factorize(self, writable): # GH35650 Verify whether read-only datetime64 array can be factorized data = np.array([np.datetime64("2020-01-01T00:00:00.000")]) data.setflags(write=writable) - expected_codes = np.array([0], dtype=np.int64) + expected_codes = np.array([0], dtype=np.intp) expected_uniques = np.array( ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]" ) From be3c917961e9181b1f6d0cff7bf00cd35c7c6719 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 7 Sep 2020 20:39:15 +0100 Subject: [PATCH 0692/1025] TST: update test_series_factorize_na_sentinel_none for 32bit (#36191) --- pandas/tests/base/test_factorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py index 9fad9856d53cc..f8cbadb987d29 100644 --- a/pandas/tests/base/test_factorize.py +++ b/pandas/tests/base/test_factorize.py @@ -34,7 +34,7 @@ def test_series_factorize_na_sentinel_none(): ser = pd.Series(values) codes, uniques = ser.factorize(na_sentinel=None) - expected_codes = np.array([0, 1, 0, 2], dtype="int64") + expected_codes = np.array([0, 1, 0, 2], dtype=np.intp) expected_uniques = pd.Index([1.0, 2.0, np.nan]) tm.assert_numpy_array_equal(codes, 
expected_codes) From d2c937f93b6c73290da43154f849d05fe0caf28c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 7 Sep 2020 20:41:42 +0100 Subject: [PATCH 0693/1025] DOC: move release note for #36155 (#36187) --- doc/source/whatsnew/v1.1.2.rst | 1 + doc/source/whatsnew/v1.2.0.rst | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index da261907565a1..e9cba3de56920 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -50,6 +50,7 @@ Bug fixes Other ~~~~~ - :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize` (:issue:`35667`) +- :meth:`DataFrame.plot` and meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and FixedLocator (:issue:`35684` and :issue:`35945`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9a778acba4764..ccaae9f996425 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -300,7 +300,6 @@ Plotting ^^^^^^^^ - Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`) -- meth:`DataFrame.plot` and meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and FixedLocator (:issue:`35684` and :issue:`35945`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ From 1e2f9eb9c8b10985419fd010710b63d0909b577e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 12:56:18 -0700 Subject: [PATCH 0694/1025] REF: use _validate_foo pattern in Categorical (#36181) --- pandas/core/arrays/categorical.py | 31 ++++++++++++++++++++++--------- pandas/core/indexes/category.py | 11 +++-------- 2 files changed, 25 insertions(+), 17 deletions(-) diff 
--git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 02305479bef67..228e630f95863 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1192,6 +1192,26 @@ def map(self, mapper): __le__ = _cat_compare_op(operator.le) __ge__ = _cat_compare_op(operator.ge) + def _validate_insert_value(self, value) -> int: + code = self.categories.get_indexer([value]) + if (code == -1) and not (is_scalar(value) and isna(value)): + raise TypeError( + "cannot insert an item into a CategoricalIndex " + "that is not already an existing category" + ) + return code[0] + + def _validate_searchsorted_value(self, value): + # searchsorted is very performance sensitive. By converting codes + # to same dtype as self.codes, we get much faster performance. + if is_scalar(value): + codes = self.categories.get_loc(value) + codes = self.codes.dtype.type(codes) + else: + locs = [self.categories.get_loc(x) for x in value] + codes = np.array(locs, dtype=self.codes.dtype) + return codes + def _validate_fill_value(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our @@ -1299,15 +1319,8 @@ def memory_usage(self, deep=False): @doc(_shared_docs["searchsorted"], klass="Categorical") def searchsorted(self, value, side="left", sorter=None): - # searchsorted is very performance sensitive. By converting codes - # to same dtype as self.codes, we get much faster performance. 
- if is_scalar(value): - codes = self.categories.get_loc(value) - codes = self.codes.dtype.type(codes) - else: - locs = [self.categories.get_loc(x) for x in value] - codes = np.array(locs, dtype=self.codes.dtype) - return self.codes.searchsorted(codes, side=side, sorter=sorter) + value = self._validate_searchsorted_value(value) + return self.codes.searchsorted(value, side=side, sorter=sorter) def isna(self): """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index cbb30763797d1..d38f77aaceb01 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -20,7 +20,7 @@ pandas_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, notna from pandas.core import accessor from pandas.core.algorithms import take_1d @@ -734,15 +734,10 @@ def insert(self, loc: int, item): ValueError if the item is not in the categories """ - code = self.categories.get_indexer([item]) - if (code == -1) and not (is_scalar(item) and isna(item)): - raise TypeError( - "cannot insert an item into a CategoricalIndex " - "that is not already an existing category" - ) + code = self._data._validate_insert_value(item) codes = self.codes - codes = np.concatenate((codes[:loc], code, codes[loc:])) + codes = np.concatenate((codes[:loc], [code], codes[loc:])) return self._create_from_codes(codes) def _concat(self, to_concat, name): From f5e6c6b94167f3678f9a39a9b174de01fba31fe7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 13:46:33 -0700 Subject: [PATCH 0695/1025] DTA/TDA/PA use self._data instead of self.asi8 for self._ndarray (#36171) --- pandas/core/arrays/datetimelike.py | 50 +++++++++++--------- pandas/core/arrays/datetimes.py | 4 ++ pandas/core/arrays/period.py | 4 ++ pandas/core/arrays/timedeltas.py | 4 ++ pandas/tests/frame/indexing/test_datetime.py | 4 +- 5 files changed, 43 
insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a5b8032974fa4..a218745db0a44 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -27,7 +27,7 @@ from pandas.compat import set_function_name from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning -from pandas.util._decorators import Appender, Substitution +from pandas.util._decorators import Appender, Substitution, cache_readonly from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import ( @@ -175,6 +175,14 @@ def _scalar_from_string(self, value: str) -> DTScalarOrNaT: """ raise AbstractMethodError(self) + @classmethod + def _rebox_native(cls, value: int) -> Union[int, np.datetime64, np.timedelta64]: + """ + Box an integer unboxed via _unbox_scalar into the native type for + the underlying ndarray. + """ + raise AbstractMethodError(cls) + def _unbox_scalar(self, value: DTScalarOrNaT) -> int: """ Unbox the integer value of a scalar `value`. 
@@ -458,18 +466,15 @@ class DatetimeLikeArrayMixin( # ------------------------------------------------------------------ # NDArrayBackedExtensionArray compat - # TODO: make this a cache_readonly; need to get around _index_data - # kludge in libreduction - @property + @cache_readonly def _ndarray(self) -> np.ndarray: - # NB: A bunch of Interval tests fail if we use ._data - return self.asi8 + return self._data def _from_backing_data(self: _T, arr: np.ndarray) -> _T: # Note: we do not retain `freq` - # error: Too many arguments for "NDArrayBackedExtensionArray" - # error: Unexpected keyword argument "dtype" for "NDArrayBackedExtensionArray" - return type(self)(arr, dtype=self.dtype) # type: ignore[call-arg] + return type(self)._simple_new( # type: ignore[attr-defined] + arr, dtype=self.dtype + ) # ------------------------------------------------------------------ @@ -526,7 +531,7 @@ def __array__(self, dtype=None) -> np.ndarray: # used for Timedelta/DatetimeArray, overwritten by PeriodArray if is_object_dtype(dtype): return np.array(list(self), dtype=object) - return self._data + return self._ndarray def __getitem__(self, key): """ @@ -536,7 +541,7 @@ def __getitem__(self, key): if lib.is_integer(key): # fast-path - result = self._data[key] + result = self._ndarray[key] if self.ndim == 1: return self._box_func(result) return self._simple_new(result, dtype=self.dtype) @@ -557,7 +562,7 @@ def __getitem__(self, key): key = check_array_indexer(self, key) freq = self._get_getitem_freq(key) - result = self._data[key] + result = self._ndarray[key] if lib.is_scalar(result): return self._box_func(result) return self._simple_new(result, dtype=self.dtype, freq=freq) @@ -612,7 +617,7 @@ def __setitem__( value = self._validate_setitem_value(value) key = check_array_indexer(self, key) - self._data[key] = value + self._ndarray[key] = value self._maybe_clear_freq() def _maybe_clear_freq(self): @@ -663,8 +668,8 @@ def astype(self, dtype, copy=True): def view(self, dtype=None): if 
dtype is None or dtype is self.dtype: - return type(self)(self._data, dtype=self.dtype) - return self._data.view(dtype=dtype) + return type(self)(self._ndarray, dtype=self.dtype) + return self._ndarray.view(dtype=dtype) # ------------------------------------------------------------------ # ExtensionArray Interface @@ -705,7 +710,7 @@ def _from_factorized(cls, values, original): return cls(values, dtype=original.dtype) def _values_for_argsort(self): - return self._data + return self._ndarray # ------------------------------------------------------------------ # Validation Methods @@ -722,7 +727,7 @@ def _validate_fill_value(self, fill_value): Returns ------- - fill_value : np.int64 + fill_value : np.int64, np.datetime64, or np.timedelta64 Raises ------ @@ -736,7 +741,8 @@ def _validate_fill_value(self, fill_value): fill_value = self._validate_scalar(fill_value, msg) except TypeError as err: raise ValueError(msg) from err - return self._unbox(fill_value) + rv = self._unbox(fill_value) + return self._rebox_native(rv) def _validate_shift_value(self, fill_value): # TODO(2.0): once this deprecation is enforced, use _validate_fill_value @@ -951,9 +957,9 @@ def value_counts(self, dropna=False): from pandas import Index, Series if dropna: - values = self[~self.isna()]._data + values = self[~self.isna()]._ndarray else: - values = self._data + values = self._ndarray cls = type(self) @@ -1044,9 +1050,9 @@ def fillna(self, value=None, method=None, limit=None): else: func = missing.backfill_1d - values = self._data + values = self._ndarray if not is_period_dtype(self.dtype): - # For PeriodArray self._data is i8, which gets copied + # For PeriodArray self._ndarray is i8, which gets copied # by `func`. Otherwise we need to make a copy manually # to avoid modifying `self` in-place. 
values = values.copy() diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 1bea3a9eb137e..d913e7be9ae5f 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -446,6 +446,10 @@ def _generate_range( # ----------------------------------------------------------------- # DatetimeLike Interface + @classmethod + def _rebox_native(cls, value: int) -> np.datetime64: + return np.int64(value).view("M8[ns]") + def _unbox_scalar(self, value): if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timestamp.") diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index cc39ffb5d1203..c3a9430736969 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -253,6 +253,10 @@ def _generate_range(cls, start, end, periods, freq, fields): # ----------------------------------------------------------------- # DatetimeLike Interface + @classmethod + def _rebox_native(cls, value: int) -> np.int64: + return np.int64(value) + def _unbox_scalar(self, value: Union[Period, NaTType]) -> int: if value is NaT: return value.value diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 2d694c469b3a9..485ebb49a376d 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -271,6 +271,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None): # ---------------------------------------------------------------- # DatetimeLike Interface + @classmethod + def _rebox_native(cls, value: int) -> np.timedelta64: + return np.int64(value).view("m8[ns]") + def _unbox_scalar(self, value): if not isinstance(value, self._scalar_type) and value is not NaT: raise ValueError("'value' should be a Timedelta.") diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py index 1937a4c380dc9..1866ac341def6 100644 --- 
a/pandas/tests/frame/indexing/test_datetime.py +++ b/pandas/tests/frame/indexing/test_datetime.py @@ -23,7 +23,9 @@ def test_setitem(self, timezone_frame): b1 = df._mgr.blocks[1] b2 = df._mgr.blocks[2] tm.assert_extension_array_equal(b1.values, b2.values) - assert id(b1.values._data.base) != id(b2.values._data.base) + b1base = b1.values._data.base + b2base = b2.values._data.base + assert b1base is None or (id(b1base) != id(b2base)) # with nan df2 = df.copy() From e3769e9cca3904637d5f1f1d61d2704b1ae1f962 Mon Sep 17 00:00:00 2001 From: Thomas Dickson Date: Mon, 7 Sep 2020 21:47:39 +0100 Subject: [PATCH 0696/1025] TST verify groupby doesn't alter unit64s to floats #30859 (#36164) --- pandas/tests/groupby/test_groupby.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e0196df7ceac0..69397228dd941 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1183,6 +1183,18 @@ def test_groupby_dtype_inference_empty(): tm.assert_frame_equal(result, expected, by_blocks=True) +def test_groupby_unit64_float_conversion(): + #  GH: 30859 groupby converts unit64 to floats sometimes + df = pd.DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]}) + result = df.groupby(["first", "second"])["value"].max() + expected = pd.Series( + [16148277970000000000], + pd.MultiIndex.from_product([[1], [1]], names=["first", "second"]), + name="value", + ) + tm.assert_series_equal(result, expected) + + def test_groupby_list_infer_array_like(df): result = df.groupby(list(df["A"])).mean() expected = df.groupby(df["A"]).mean() From b9c3d7f131fc4edc4a764e03e839d5b0531c2814 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Mon, 7 Sep 2020 23:06:29 +0200 Subject: [PATCH 0697/1025] Fix compressed multiindex for output of groupby.rolling (#36152) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/window/rolling.py 
| 10 +++++----- pandas/tests/window/test_grouper.py | 21 +++++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index e9cba3de56920..28ce49c11b3f0 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions - Regression in :meth:`DataFrame.replace` where a ``TypeError`` would be raised when attempting to replace elements of type :class:`Interval` (:issue:`35931`) - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` (:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) +- Fixed regression in :meth:`Series.groupby.rolling` number of levels of :class:`MultiIndex` in input was compressed to one (:issue:`36018`) - .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 4c4ec4d700b7f..235bd5364af02 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2211,17 +2211,17 @@ def _apply( # Compose MultiIndex result from grouping levels then rolling level # Aggregate the MultiIndex data as tuples then the level names grouped_object_index = self.obj.index - grouped_index_name = [grouped_object_index.name] + grouped_index_name = [*grouped_object_index.names] groupby_keys = [grouping.name for grouping in self._groupby.grouper._groupings] result_index_names = groupby_keys + grouped_index_name result_index_data = [] for key, values in self._groupby.grouper.indices.items(): for value in values: - if not is_list_like(key): - data = [key, grouped_object_index[value]] - else: - data = [*key, grouped_object_index[value]] + data = [ + *com.maybe_make_list(key), + *com.maybe_make_list(grouped_object_index[value]), + ] 
result_index_data.append(tuple(data)) result_index = MultiIndex.from_tuples( diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 170bf100b3891..cb85ad7584da7 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -372,3 +372,24 @@ def test_groupby_subset_rolling_subset_with_closed(self): name="column1", ) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("func", ["max", "min"]) + def test_groupby_rolling_index_changed(self, func): + # GH: #36018 nlevels of MultiIndex changed + ds = Series( + [1, 2, 2], + index=pd.MultiIndex.from_tuples( + [("a", "x"), ("a", "y"), ("c", "z")], names=["1", "2"] + ), + name="a", + ) + + result = getattr(ds.groupby(ds).rolling(2), func)() + expected = Series( + [np.nan, np.nan, 2.0], + index=pd.MultiIndex.from_tuples( + [(1, "a", "x"), (2, "a", "y"), (2, "c", "z")], names=["a", "1", "2"] + ), + name="a", + ) + tm.assert_series_equal(result, expected) From dbf489e26a67c9908a6cbbc2f2e2aabc1878ce87 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Mon, 7 Sep 2020 23:11:29 +0200 Subject: [PATCH 0698/1025] TST: DataFrame.replace: TypeError: Cannot compare types 'ndarray(dtype=int64)' and 'unicode' (#36202) --- pandas/tests/frame/methods/test_replace.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index ea2488dfc0877..a77753ed9f9d0 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1599,3 +1599,11 @@ def test_replace_intervals(self): result = df.replace({"a": {pd.Interval(0, 1): "x"}}) expected = pd.DataFrame({"a": ["x", "x"]}) tm.assert_frame_equal(result, expected) + + def test_replace_unicode(self): + # GH: 16784 + columns_values_map = {"positive": {"正面": 1, "中立": 1, "负面": 0}} + df1 = pd.DataFrame({"positive": np.ones(3)}) + result = 
df1.replace(columns_values_map) + expected = pd.DataFrame({"positive": np.ones(3)}) + tm.assert_frame_equal(result, expected) From ca0399f1d95eff6d2bc3b0cb19a99a59a7feadc4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 14:43:12 -0700 Subject: [PATCH 0699/1025] REF: collect methods by topic (#36173) --- pandas/core/arrays/categorical.py | 148 +++++++++++++++++------------- pandas/core/indexes/category.py | 29 +++--- pandas/core/indexes/datetimes.py | 3 + pandas/core/indexes/interval.py | 64 +++++++------ pandas/core/indexes/multi.py | 2 + pandas/core/indexes/numeric.py | 42 +++++---- pandas/core/indexes/period.py | 67 +++++++------- pandas/core/indexes/range.py | 5 + pandas/core/indexes/timedeltas.py | 5 + 9 files changed, 212 insertions(+), 153 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 228e630f95863..58847528d2183 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -393,56 +393,6 @@ def __init__( self._dtype = self._dtype.update_dtype(dtype) self._codes = coerce_indexer_dtype(codes, dtype.categories) - @property - def categories(self): - """ - The categories of this categorical. - - Setting assigns new values to each category (effectively a rename of - each individual category). - - The assigned value has to be a list-like object. All items must be - unique and the number of items in the new categories must be the same - as the number of items in the old categories. - - Assigning to `categories` is a inplace operation! - - Raises - ------ - ValueError - If the new categories do not validate as categories or if the - number of new categories is unequal the number of old categories - - See Also - -------- - rename_categories : Rename categories. - reorder_categories : Reorder categories. - add_categories : Add new categories. - remove_categories : Remove the specified categories. - remove_unused_categories : Remove categories which are not used. 
- set_categories : Set the categories to the specified ones. - """ - return self.dtype.categories - - @categories.setter - def categories(self, categories): - new_dtype = CategoricalDtype(categories, ordered=self.ordered) - if self.dtype.categories is not None and len(self.dtype.categories) != len( - new_dtype.categories - ): - raise ValueError( - "new categories need to have the same number of " - "items as the old categories!" - ) - self._dtype = new_dtype - - @property - def ordered(self) -> Ordered: - """ - Whether the categories have an ordered relationship. - """ - return self.dtype.ordered - @property def dtype(self) -> CategoricalDtype: """ @@ -458,10 +408,6 @@ def _constructor(self) -> Type["Categorical"]: def _from_sequence(cls, scalars, dtype=None, copy=False): return Categorical(scalars, dtype=dtype) - def _formatter(self, boxed=False): - # Defer to CategoricalFormatter's formatter. - return None - def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: """ Coerce this type to another dtype @@ -640,6 +586,59 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): return cls(codes, dtype=dtype, fastpath=True) + # ------------------------------------------------------------------ + # Categories/Codes/Ordered + + @property + def categories(self): + """ + The categories of this categorical. + + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. + + Assigning to `categories` is a inplace operation! + + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See Also + -------- + rename_categories : Rename categories. + reorder_categories : Reorder categories. 
+ add_categories : Add new categories. + remove_categories : Remove the specified categories. + remove_unused_categories : Remove categories which are not used. + set_categories : Set the categories to the specified ones. + """ + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if self.dtype.categories is not None and len(self.dtype.categories) != len( + new_dtype.categories + ): + raise ValueError( + "new categories need to have the same number of " + "items as the old categories!" + ) + self._dtype = new_dtype + + @property + def ordered(self) -> Ordered: + """ + Whether the categories have an ordered relationship. + """ + return self.dtype.ordered + @property def codes(self) -> np.ndarray: """ @@ -1104,6 +1103,8 @@ def remove_unused_categories(self, inplace=False): if not inplace: return cat + # ------------------------------------------------------------------ + def map(self, mapper): """ Map categories using input correspondence (dict, Series, or function). @@ -1192,6 +1193,9 @@ def map(self, mapper): __le__ = _cat_compare_op(operator.le) __ge__ = _cat_compare_op(operator.ge) + # ------------------------------------------------------------- + # Validators; ideally these can be de-duplicated + def _validate_insert_value(self, value) -> int: code = self.categories.get_indexer([value]) if (code == -1) and not (is_scalar(value) and isna(value)): @@ -1241,6 +1245,8 @@ def _validate_fill_value(self, fill_value): ) return fill_value + # ------------------------------------------------------------- + def __array__(self, dtype=None) -> np.ndarray: """ The numpy array interface. @@ -1758,6 +1764,10 @@ def __contains__(self, key) -> bool: # ------------------------------------------------------------------ # Rendering Methods + def _formatter(self, boxed=False): + # Defer to CategoricalFormatter's formatter. 
+ return None + def _tidy_repr(self, max_vals=10, footer=True) -> str: """ a short repr displaying only max_vals and an optional (but default @@ -1987,7 +1997,9 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: result = dict(zip(categories, _result)) return result - # reduction ops # + # ------------------------------------------------------------------ + # Reductions + def _reduce(self, name: str, skipna: bool = True, **kwargs): func = getattr(self, name, None) if func is None: @@ -2090,6 +2102,9 @@ def mode(self, dropna=True): codes = sorted(htable.mode_int64(ensure_int64(codes), dropna)) return self._constructor(values=codes, dtype=self.dtype, fastpath=True) + # ------------------------------------------------------------------ + # ExtensionArray Interface + def unique(self): """ Return the ``Categorical`` which ``categories`` and ``codes`` are @@ -2179,6 +2194,18 @@ def equals(self, other: object) -> bool: return np.array_equal(self._codes, other_codes) return False + @property + def _can_hold_na(self): + return True + + @classmethod + def _concat_same_type(self, to_concat): + from pandas.core.dtypes.concat import union_categoricals + + return union_categoricals(to_concat) + + # ------------------------------------------------------------------ + def is_dtype_equal(self, other): """ Returns True if categoricals are the same dtype @@ -2217,17 +2244,6 @@ def describe(self): return result - # Implement the ExtensionArray interface - @property - def _can_hold_na(self): - return True - - @classmethod - def _concat_same_type(self, to_concat): - from pandas.core.dtypes.concat import union_categoricals - - return union_categoricals(to_concat) - def isin(self, values) -> np.ndarray: """ Check whether `values` are contained in Categorical. 
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d38f77aaceb01..7509cb35069e8 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -433,11 +433,6 @@ def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.astype("object") - def _maybe_cast_indexer(self, key): - code = self.categories.get_loc(key) - code = self.codes.dtype.type(code) - return code - @doc(Index.where) def where(self, cond, other=None): # TODO: Investigate an alternative implementation with @@ -537,6 +532,14 @@ def _reindex_non_unique(self, target): return new_target, indexer, new_indexer + # -------------------------------------------------------------------- + # Indexing Methods + + def _maybe_cast_indexer(self, key): + code = self.categories.get_loc(key) + code = self.codes.dtype.type(code) + return code + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): method = missing.clean_reindex_fill_method(method) @@ -619,6 +622,15 @@ def _convert_arr_indexer(self, keyarr): def _convert_index_indexer(self, keyarr): return self._shallow_copy(keyarr) + @doc(Index._maybe_cast_slice_bound) + def _maybe_cast_slice_bound(self, label, side, kind): + if kind == "loc": + return label + + return super()._maybe_cast_slice_bound(label, side, kind) + + # -------------------------------------------------------------------- + def take_nd(self, *args, **kwargs): """Alias for `take`""" warnings.warn( @@ -628,13 +640,6 @@ def take_nd(self, *args, **kwargs): ) return self.take(*args, **kwargs) - @doc(Index._maybe_cast_slice_bound) - def _maybe_cast_slice_bound(self, label, side, kind): - if kind == "loc": - return label - - return super()._maybe_cast_slice_bound(label, side, kind) - def map(self, mapper): """ Map values using input correspondence (a dict, Series, or function). 
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 3fd93a8159041..f0b80c2852bd5 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -509,6 +509,9 @@ def snap(self, freq="S"): dta = DatetimeArray(snapped, dtype=self.dtype) return DatetimeIndex._simple_new(dta, name=self.name) + # -------------------------------------------------------------------- + # Indexing Methods + def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): """ Calculate datetime bounds for parsed time string and its resolution. diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 419ff81a2a478..3f72577c9420e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -57,7 +57,7 @@ from pandas.core.ops import get_op_result_name if TYPE_CHECKING: - from pandas import CategoricalIndex + from pandas import CategoricalIndex # noqa:F401 _VALID_CLOSED = {"left", "right", "both", "neither"} _index_doc_kwargs = dict(ibase._index_doc_kwargs) @@ -515,28 +515,6 @@ def is_overlapping(self) -> bool: # GH 23309 return self._engine.is_overlapping - def _should_fallback_to_positional(self) -> bool: - # integer lookups in Series.__getitem__ are unambiguously - # positional in this case - return self.dtype.subtype.kind in ["m", "M"] - - def _maybe_cast_slice_bound(self, label, side, kind): - return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) - - @Appender(Index._convert_list_indexer.__doc__) - def _convert_list_indexer(self, keyarr): - """ - we are passed a list-like indexer. Return the - indexer for matching intervals. - """ - locs = self.get_indexer_for(keyarr) - - # we have missing values - if (locs == -1).any(): - raise KeyError - - return locs - def _can_reindex(self, indexer: np.ndarray) -> None: """ Check if we are allowing reindexing with this particular indexer. 
@@ -668,6 +646,9 @@ def _searchsorted_monotonic(self, label, side, exclude_label=False): return sub_idx._searchsorted_monotonic(label, side) + # -------------------------------------------------------------------- + # Indexing Methods + def get_loc( self, key, method: Optional[str] = None, tolerance=None ) -> Union[int, slice, np.ndarray]: @@ -885,6 +866,30 @@ def _convert_slice_indexer(self, key: slice, kind: str): return super()._convert_slice_indexer(key, kind) + def _should_fallback_to_positional(self) -> bool: + # integer lookups in Series.__getitem__ are unambiguously + # positional in this case + return self.dtype.subtype.kind in ["m", "M"] + + def _maybe_cast_slice_bound(self, label, side, kind): + return getattr(self, side)._maybe_cast_slice_bound(label, side, kind) + + @Appender(Index._convert_list_indexer.__doc__) + def _convert_list_indexer(self, keyarr): + """ + we are passed a list-like indexer. Return the + indexer for matching intervals. + """ + locs = self.get_indexer_for(keyarr) + + # we have missing values + if (locs == -1).any(): + raise KeyError + + return locs + + # -------------------------------------------------------------------- + @Appender(Index.where.__doc__) def where(self, cond, other=None): if other is None: @@ -1030,6 +1035,9 @@ def equals(self, other: object) -> bool: and self.closed == other.closed ) + # -------------------------------------------------------------------- + # Set Operations + @Appender(Index.intersection.__doc__) @SetopCheck(op_name="intersection") def intersection( @@ -1115,6 +1123,12 @@ def func(self, other, sort=sort): return func + union = _setop("union") + difference = _setop("difference") + symmetric_difference = _setop("symmetric_difference") + + # -------------------------------------------------------------------- + @property def is_all_dates(self) -> bool: """ @@ -1123,10 +1137,6 @@ def is_all_dates(self) -> bool: """ return False - union = _setop("union") - difference = _setop("difference") - 
symmetric_difference = _setop("symmetric_difference") - # TODO: arithmetic operations # GH#30817 until IntervalArray implements inequalities, get them from Index diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index e49a23935efbd..9630e154ccd17 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3154,6 +3154,8 @@ def _update_indexer(idxr, indexer=indexer): return indexer._values + # -------------------------------------------------------------------- + def _reorder_indexer( self, seq: Tuple[Union[Scalar, Iterable, AnyArrayLike], ...], diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index cd3f1f51a86d2..079f43cb2c66b 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -97,6 +97,9 @@ def _validate_dtype(cls, dtype: Dtype) -> None: f"Incorrect `dtype` passed: expected {expected}, received {dtype}" ) + # ---------------------------------------------------------------- + # Indexing Methods + @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side, kind): assert kind in ["loc", "getitem", None] @@ -104,6 +107,8 @@ def _maybe_cast_slice_bound(self, label, side, kind): # we will try to coerce to integers return self._maybe_cast_indexer(label) + # ---------------------------------------------------------------- + @doc(Index._shallow_copy) def _shallow_copy(self, values=None, name: Label = lib.no_default): if values is not None and not self._can_hold_na and values.dtype.kind == "f": @@ -293,6 +298,9 @@ class UInt64Index(IntegerIndex): _engine_type = libindex.UInt64Engine _default_dtype = np.dtype(np.uint64) + # ---------------------------------------------------------------- + # Indexing Methods + @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): # Cast the indexer to uint64 if possible so that the values returned @@ -314,6 +322,8 @@ def _convert_index_indexer(self, keyarr): return keyarr.astype(np.uint64) return 
keyarr + # ---------------------------------------------------------------- + def _wrap_joined_index(self, joined, other): name = get_op_result_name(self, other) return UInt64Index(joined, name=name) @@ -385,6 +395,22 @@ def _convert_slice_indexer(self, key: slice, kind: str): # translate to locations return self.slice_indexer(key.start, key.stop, key.step, kind=kind) + @doc(Index.get_loc) + def get_loc(self, key, method=None, tolerance=None): + if is_bool(key): + # Catch this to avoid accidentally casting to 1.0 + raise KeyError(key) + + if is_float(key) and np.isnan(key): + nan_idxs = self._nan_idxs + if not len(nan_idxs): + raise KeyError(key) + elif len(nan_idxs) == 1: + return nan_idxs[0] + return nan_idxs + + return super().get_loc(key, method=method, tolerance=tolerance) + # ---------------------------------------------------------------- def _format_native_types( @@ -409,22 +435,6 @@ def __contains__(self, other: Any) -> bool: return is_float(other) and np.isnan(other) and self.hasnans - @doc(Index.get_loc) - def get_loc(self, key, method=None, tolerance=None): - if is_bool(key): - # Catch this to avoid accidentally casting to 1.0 - raise KeyError(key) - - if is_float(key) and np.isnan(key): - nan_idxs = self._nan_idxs - if not len(nan_idxs): - raise KeyError(key) - elif len(nan_idxs) == 1: - return nan_idxs[0] - return nan_idxs - - return super().get_loc(key, method=method, tolerance=tolerance) - @cache_readonly def is_unique(self) -> bool: return super().is_unique and self._nan_idxs.size < 2 diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index cdb502199c6f1..5282b6f0154b4 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -433,6 +433,41 @@ def inferred_type(self) -> str: # indexing return "period" + def insert(self, loc, item): + if not isinstance(item, Period) or self.freq != item.freq: + return self.astype(object).insert(loc, item) + + i8result = np.concatenate( + (self[:loc].asi8, 
np.array([item.ordinal]), self[loc:].asi8) + ) + arr = type(self._data)._simple_new(i8result, dtype=self.dtype) + return type(self)._simple_new(arr, name=self.name) + + def join(self, other, how="left", level=None, return_indexers=False, sort=False): + """ + See Index.join + """ + self._assert_can_do_setop(other) + + if not isinstance(other, PeriodIndex): + return self.astype(object).join( + other, how=how, level=level, return_indexers=return_indexers, sort=sort + ) + + # _assert_can_do_setop ensures we have matching dtype + result = Int64Index.join( + self, + other, + how=how, + level=level, + return_indexers=return_indexers, + sort=sort, + ) + return result + + # ------------------------------------------------------------------------ + # Indexing Methods + @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): target = ensure_index(target) @@ -607,38 +642,6 @@ def _get_string_slice(self, key: str, use_lhs: bool = True, use_rhs: bool = True except KeyError as err: raise KeyError(key) from err - def insert(self, loc, item): - if not isinstance(item, Period) or self.freq != item.freq: - return self.astype(object).insert(loc, item) - - i8result = np.concatenate( - (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) - ) - arr = type(self._data)._simple_new(i8result, dtype=self.dtype) - return type(self)._simple_new(arr, name=self.name) - - def join(self, other, how="left", level=None, return_indexers=False, sort=False): - """ - See Index.join - """ - self._assert_can_do_setop(other) - - if not isinstance(other, PeriodIndex): - return self.astype(object).join( - other, how=how, level=level, return_indexers=return_indexers, sort=sort - ) - - # _assert_can_do_setop ensures we have matching dtype - result = Int64Index.join( - self, - other, - how=how, - level=level, - return_indexers=return_indexers, - sort=sort, - ) - return result - # 
------------------------------------------------------------------------ # Set Operation Methods diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index f1457a9aac62b..684691501de5c 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -338,6 +338,9 @@ def __contains__(self, key: Any) -> bool: return False return key in self._range + # -------------------------------------------------------------------- + # Indexing Methods + @doc(Int64Index.get_loc) def get_loc(self, key, method=None, tolerance=None): if method is None and tolerance is None: @@ -379,6 +382,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): locs[valid] = len(self) - 1 - locs[valid] return ensure_platform_int(locs) + # -------------------------------------------------------------------- + def tolist(self): return list(self._range) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 85c8396dfd1fe..df08fda78823d 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -202,6 +202,9 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ return is_timedelta64_dtype(dtype) + # ------------------------------------------------------------------- + # Indexing Methods + def get_loc(self, key, method=None, tolerance=None): """ Get integer location for requested label @@ -248,6 +251,8 @@ def _maybe_cast_slice_bound(self, label, side: str, kind): return label + # ------------------------------------------------------------------- + def is_type_compatible(self, typ) -> bool: return typ == self.inferred_type or typ == "timedelta" From 501bac04d03a58f30cecc33bd9116177f63c2110 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 15:41:00 -0700 Subject: [PATCH 0700/1025] REF: implement Categorical._validate_setitem_value (#36180) --- pandas/core/arrays/categorical.py | 35 +++++++++++++++--------------- pandas/core/arrays/datetimelike.py | 16 
+++++++++----- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 58847528d2183..b732db4c66003 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -9,7 +9,7 @@ from pandas._config import get_option -from pandas._libs import NaT, algos as libalgos, hashtable as htable +from pandas._libs import NaT, algos as libalgos, hashtable as htable, lib from pandas._typing import ArrayLike, Dtype, Ordered, Scalar from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly, deprecate_kwarg, doc @@ -1868,14 +1868,6 @@ def __repr__(self) -> str: # ------------------------------------------------------------------ - def _maybe_coerce_indexer(self, indexer): - """ - return an indexer coerced to the codes dtype - """ - if isinstance(indexer, np.ndarray) and indexer.dtype.kind == "i": - indexer = indexer.astype(self._codes.dtype) - return indexer - def __getitem__(self, key): """ Return an item. 
@@ -1905,6 +1897,11 @@ def __setitem__(self, key, value): If (one or more) Value is not in categories or if a assigned `Categorical` does not have the same categories """ + key = self._validate_setitem_key(key) + value = self._validate_setitem_value(value) + self._ndarray[key] = value + + def _validate_setitem_value(self, value): value = extract_array(value, extract_numpy=True) # require identical categories set @@ -1934,12 +1931,19 @@ def __setitem__(self, key, value): "category, set the categories first" ) - # set by position - if isinstance(key, (int, np.integer)): + lindexer = self.categories.get_indexer(rvalue) + if isinstance(lindexer, np.ndarray) and lindexer.dtype.kind == "i": + lindexer = lindexer.astype(self._ndarray.dtype) + + return lindexer + + def _validate_setitem_key(self, key): + if lib.is_integer(key): + # set by position pass - # tuple of indexers (dataframe) elif isinstance(key, tuple): + # tuple of indexers (dataframe) # only allow 1 dimensional slicing, but can # in a 2-d case be passed (slice(None),....) 
if len(key) == 2: @@ -1951,17 +1955,14 @@ def __setitem__(self, key, value): else: raise AssertionError("invalid slicing for a 1-ndim categorical") - # slicing in Series or Categorical elif isinstance(key, slice): + # slicing in Series or Categorical pass # else: array of True/False in Series or Categorical - lindexer = self.categories.get_indexer(rvalue) - lindexer = self._maybe_coerce_indexer(lindexer) - key = check_array_indexer(self, key) - self._codes[key] = lindexer + return key def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a218745db0a44..2626890c2dbe5 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -546,6 +546,15 @@ def __getitem__(self, key): return self._box_func(result) return self._simple_new(result, dtype=self.dtype) + key = self._validate_getitem_key(key) + result = self._ndarray[key] + if lib.is_scalar(result): + return self._box_func(result) + + freq = self._get_getitem_freq(key) + return self._simple_new(result, dtype=self.dtype, freq=freq) + + def _validate_getitem_key(self, key): if com.is_bool_indexer(key): # first convert to boolean, because check_array_indexer doesn't # allow object dtype @@ -560,12 +569,7 @@ def __getitem__(self, key): pass else: key = check_array_indexer(self, key) - - freq = self._get_getitem_freq(key) - result = self._ndarray[key] - if lib.is_scalar(result): - return self._box_func(result) - return self._simple_new(result, dtype=self.dtype, freq=freq) + return key def _get_getitem_freq(self, key): """ From 78c2a351b3c3626cd73ec5942d2d0c0d00ec14fc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 7 Sep 2020 17:15:51 -0700 Subject: [PATCH 0701/1025] COMPAT: match numpy behavior for searchsorted on dt64/td64 (#36176) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/datetimelike.py | 7 +++---- pandas/tests/arrays/test_datetimelike.py | 11 ++++++++--- 3 files 
changed, 12 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ccaae9f996425..2afa1f1a6199e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -228,6 +228,7 @@ Datetimelike - Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) - Bug in :meth:`DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`DatetimeIndex` (:issue:`35690`) - Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) +- Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64`` or ``timedelta64`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2626890c2dbe5..6477b94a823ce 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -862,7 +862,8 @@ def _validate_searchsorted_value(self, value): # TODO: cast_str? we accept it for scalar value = self._validate_listlike(value, "searchsorted") - return self._unbox(value) + rv = self._unbox(value) + return self._rebox_native(rv) def _validate_setitem_value(self, value): msg = ( @@ -941,9 +942,7 @@ def searchsorted(self, value, side="left", sorter=None): Array of insertion points with the same shape as `value`. 
""" value = self._validate_searchsorted_value(value) - - # TODO: Use datetime64 semantics for sorting, xref GH#29844 - return self.asi8.searchsorted(value, side=side, sorter=sorter) + return self._data.searchsorted(value, side=side, sorter=sorter) def value_counts(self, dropna=False): """ diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index b1ab700427c28..292557fc04258 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -241,10 +241,15 @@ def test_searchsorted(self): expected = np.array([2, 3], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) - # Following numpy convention, NaT goes at the beginning - # (unlike NaN which goes at the end) + # GH#29884 match numpy convention on whether NaT goes + # at the end or the beginning result = arr.searchsorted(pd.NaT) - assert result == 0 + if _np_version_under1p18 or self.array_cls is PeriodArray: + # Following numpy convention, NaT goes at the beginning + # (unlike NaN which goes at the end) + assert result == 0 + else: + assert result == 10 def test_getitem_2d(self, arr1d): # 2d slicing on a 1D array From 07c275ebbde9002a3082e4c495a08cd7be8a3884 Mon Sep 17 00:00:00 2001 From: Nidhi Zare Date: Tue, 8 Sep 2020 06:00:19 +0530 Subject: [PATCH 0702/1025] pandas docs json_normalize example (#36194) Co-authored-by: Nidhi Zare --- pandas/io/json/_normalize.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 44765dbe74b46..2e1fc57e88ed1 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -176,7 +176,7 @@ def _json_normalize( ... 'fitness': {'height': 130, 'weight': 60}}, ... {'id': 2, 'name': 'Faye Raker', ... 
'fitness': {'height': 130, 'weight': 60}}] - >>> json_normalize(data, max_level=0) + >>> pandas.json_normalize(data, max_level=0) fitness id name 0 {'height': 130, 'weight': 60} 1.0 Cole Volk 1 {'height': 130, 'weight': 60} NaN Mose Reg @@ -191,7 +191,7 @@ def _json_normalize( ... 'fitness': {'height': 130, 'weight': 60}}, ... {'id': 2, 'name': 'Faye Raker', ... 'fitness': {'height': 130, 'weight': 60}}] - >>> json_normalize(data, max_level=1) + >>> pandas.json_normalize(data, max_level=1) fitness.height fitness.weight id name 0 130 60 1.0 Cole Volk 1 130 60 NaN Mose Reg @@ -208,7 +208,7 @@ def _json_normalize( ... 'info': {'governor': 'John Kasich'}, ... 'counties': [{'name': 'Summit', 'population': 1234}, ... {'name': 'Cuyahoga', 'population': 1337}]}] - >>> result = json_normalize(data, 'counties', ['state', 'shortname', + >>> result = pandas.json_normalize(data, 'counties', ['state', 'shortname', ... ['info', 'governor']]) >>> result name population state shortname info.governor @@ -219,7 +219,7 @@ def _json_normalize( 4 Cuyahoga 1337 Ohio OH John Kasich >>> data = {'A': [1, 2]} - >>> json_normalize(data, 'A', record_prefix='Prefix.') + >>> pandas.json_normalize(data, 'A', record_prefix='Prefix.') Prefix.0 0 1 1 2 From e500b4e731fb87c85c1b88fbb4e4fa682c5dcfe5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 8 Sep 2020 02:51:20 -0700 Subject: [PATCH 0703/1025] BUG: GroupbyRolling with an empty frame (#36208) Co-authored-by: Matt Roeschke --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/core/window/rolling.py | 10 ++++++---- pandas/tests/window/test_grouper.py | 12 ++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 28ce49c11b3f0..f13d38d1f8f76 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -24,7 +24,7 @@ Fixed regressions - Fix regression in pickle roundtrip of the ``closed`` attribute of :class:`IntervalIndex` 
(:issue:`35658`) - Fixed regression in :meth:`DataFrameGroupBy.agg` where a ``ValueError: buffer source array is read-only`` would be raised when the underlying array is read-only (:issue:`36014`) - Fixed regression in :meth:`Series.groupby.rolling` number of levels of :class:`MultiIndex` in input was compressed to one (:issue:`36018`) -- +- Fixed regression in :class:`DataFrameGroupBy` on an empty :class:`DataFrame` (:issue:`36197`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 235bd5364af02..9466ada3f4578 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -2240,10 +2240,12 @@ def _create_blocks(self, obj: FrameOrSeriesUnion): """ # Ensure the object we're rolling over is monotonically sorted relative # to the groups - groupby_order = np.concatenate( - list(self._groupby.grouper.indices.values()) - ).astype(np.int64) - obj = obj.take(groupby_order) + # GH 36197 + if not obj.empty: + groupby_order = np.concatenate( + list(self._groupby.grouper.indices.values()) + ).astype(np.int64) + obj = obj.take(groupby_order) return super()._create_blocks(obj) def _get_cython_func_type(self, func: str) -> Callable: diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index cb85ad7584da7..786cf68d28871 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -393,3 +393,15 @@ def test_groupby_rolling_index_changed(self, func): name="a", ) tm.assert_series_equal(result, expected) + + def test_groupby_rolling_empty_frame(self): + # GH 36197 + expected = pd.DataFrame({"s1": []}) + result = expected.groupby("s1").rolling(window=1).sum() + expected.index = pd.MultiIndex.from_tuples([], names=["s1", None]) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({"s1": [], "s2": []}) + result = expected.groupby(["s1", "s2"]).rolling(window=1).sum() + 
expected.index = pd.MultiIndex.from_tuples([], names=["s1", "s2", None]) + tm.assert_frame_equal(result, expected) From 098d2f781ecaa63c8824ecbf1d79d2562b6a4b39 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 8 Sep 2020 11:22:20 +0100 Subject: [PATCH 0704/1025] DOC: doc fix (#36205) --- doc/source/whatsnew/v1.1.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index f13d38d1f8f76..0e4a88f3ee56b 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -51,7 +51,7 @@ Bug fixes Other ~~~~~ - :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values and remove ``dropna`` keyword which was unintentionally exposed to public facing API in 1.1 version from :meth:`factorize` (:issue:`35667`) -- :meth:`DataFrame.plot` and meth:`Series.plot` raise ``UserWarning`` about usage of FixedFormatter and FixedLocator (:issue:`35684` and :issue:`35945`) +- :meth:`DataFrame.plot` and :meth:`Series.plot` raise ``UserWarning`` about usage of ``FixedFormatter`` and ``FixedLocator`` (:issue:`35684` and :issue:`35945`) .. --------------------------------------------------------------------------- From ce92c0a7e26872fd49b3567de9f223c8bc9d0b33 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 8 Sep 2020 12:50:13 +0100 Subject: [PATCH 0705/1025] DOC: release date for 1.1.2 (#36182) --- doc/source/whatsnew/v1.1.2.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 0e4a88f3ee56b..a214ad9762733 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -1,7 +1,7 @@ .. _whatsnew_112: -What's new in 1.1.2 (??) ------------------------- +What's new in 1.1.2 (September 8, 2020) +--------------------------------------- These are the changes in pandas 1.1.2. 
See :ref:`release` for a full changelog including other versions of pandas. From dcb600441024fe1aa49a027dc502fb7924200922 Mon Sep 17 00:00:00 2001 From: Yanxian Lin Date: Tue, 8 Sep 2020 06:01:43 -0700 Subject: [PATCH 0706/1025] Fixed pandas.json_normalize doctests errors` (#36207) --- pandas/io/json/_normalize.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py index 2e1fc57e88ed1..3ed0b5851b395 100644 --- a/pandas/io/json/_normalize.py +++ b/pandas/io/json/_normalize.py @@ -163,11 +163,11 @@ def _json_normalize( >>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, ... {'name': {'given': 'Mose', 'family': 'Regner'}}, ... {'id': 2, 'name': 'Faye Raker'}] - >>> pandas.json_normalize(data) - id name name.family name.first name.given name.last - 0 1.0 NaN NaN Coleen NaN Volk - 1 NaN NaN Regner NaN Mose NaN - 2 2.0 Faye Raker NaN NaN NaN NaN + >>> pd.json_normalize(data) + id name.first name.last name.given name.family name + 0 1.0 Coleen Volk NaN NaN NaN + 1 NaN NaN NaN Mose Regner NaN + 2 2.0 NaN NaN NaN NaN Faye Raker >>> data = [{'id': 1, ... 'name': "Cole Volk", @@ -176,11 +176,11 @@ def _json_normalize( ... 'fitness': {'height': 130, 'weight': 60}}, ... {'id': 2, 'name': 'Faye Raker', ... 'fitness': {'height': 130, 'weight': 60}}] - >>> pandas.json_normalize(data, max_level=0) - fitness id name - 0 {'height': 130, 'weight': 60} 1.0 Cole Volk - 1 {'height': 130, 'weight': 60} NaN Mose Reg - 2 {'height': 130, 'weight': 60} 2.0 Faye Raker + >>> pd.json_normalize(data, max_level=0) + id name fitness + 0 1.0 Cole Volk {'height': 130, 'weight': 60} + 1 NaN Mose Reg {'height': 130, 'weight': 60} + 2 2.0 Faye Raker {'height': 130, 'weight': 60} Normalizes nested data up to level 1. @@ -191,11 +191,11 @@ def _json_normalize( ... 'fitness': {'height': 130, 'weight': 60}}, ... {'id': 2, 'name': 'Faye Raker', ... 
'fitness': {'height': 130, 'weight': 60}}] - >>> pandas.json_normalize(data, max_level=1) - fitness.height fitness.weight id name - 0 130 60 1.0 Cole Volk - 1 130 60 NaN Mose Reg - 2 130 60 2.0 Faye Raker + >>> pd.json_normalize(data, max_level=1) + id name fitness.height fitness.weight + 0 1.0 Cole Volk 130 60 + 1 NaN Mose Reg 130 60 + 2 2.0 Faye Raker 130 60 >>> data = [{'state': 'Florida', ... 'shortname': 'FL', @@ -208,7 +208,7 @@ def _json_normalize( ... 'info': {'governor': 'John Kasich'}, ... 'counties': [{'name': 'Summit', 'population': 1234}, ... {'name': 'Cuyahoga', 'population': 1337}]}] - >>> result = pandas.json_normalize(data, 'counties', ['state', 'shortname', + >>> result = pd.json_normalize(data, 'counties', ['state', 'shortname', ... ['info', 'governor']]) >>> result name population state shortname info.governor @@ -219,7 +219,7 @@ def _json_normalize( 4 Cuyahoga 1337 Ohio OH John Kasich >>> data = {'A': [1, 2]} - >>> pandas.json_normalize(data, 'A', record_prefix='Prefix.') + >>> pd.json_normalize(data, 'A', record_prefix='Prefix.') Prefix.0 0 1 1 2 From 3c9592007b01004e879c06d93f3522facda7dc2b Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 8 Sep 2020 09:45:08 -0400 Subject: [PATCH 0707/1025] BUG: copying series into empty dataframe does not preserve dataframe index name (#36141) --- doc/source/whatsnew/v1.1.2.rst | 1 + pandas/core/frame.py | 8 +++++--- pandas/tests/indexing/test_partial.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index a214ad9762733..c6a08f4fb852a 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -43,6 +43,7 @@ Bug fixes - Bug in :class:`DataFrame` indexing returning an incorrect :class:`Series` in some cases when the series has been altered and a cache not invalidated (:issue:`33675`) - Bug in :meth:`DataFrame.corr` causing subsequent indexing lookups to be incorrect (:issue:`35882`) - 
Bug in :meth:`import_optional_dependency` returning incorrect package names in cases where package name is different from import name (:issue:`35948`) +- Bug when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`31368`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e1a889bf79d95..59cf4c0e2f81d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3206,9 +3206,11 @@ def _ensure_valid_index(self, value): "and a value that cannot be converted to a Series" ) from err - self._mgr = self._mgr.reindex_axis( - value.index.copy(), axis=1, fill_value=np.nan - ) + # GH31368 preserve name of index + index_copy = value.index.copy() + index_copy.name = self.index.name + + self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) def _box_col_values(self, values, loc: int) -> Series: """ diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 350f86b4e9fd0..7afbbc2b9ab2b 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -660,3 +660,15 @@ def test_indexing_timeseries_regression(self): expected = Series(rng, index=rng) tm.assert_series_equal(result, expected) + + def test_index_name_empty(self): + # GH 31368 + df = pd.DataFrame({}, index=pd.RangeIndex(0, name="df_index")) + series = pd.Series(1.23, index=pd.RangeIndex(4, name="series_index")) + + df["series"] = series + expected = pd.DataFrame( + {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index") + ) + + tm.assert_frame_equal(df, expected) From 12d68ce1f8b30551dc4d9c4aaa5c480353734db6 Mon Sep 17 00:00:00 2001 From: tiagohonorato <61059243+tiagohonorato@users.noreply.github.com> Date: Tue, 8 Sep 2020 12:28:21 -0300 Subject: [PATCH 0708/1025] CLN remove trailing commas (#36222) --- pandas/tests/io/pytables/test_timezones.py | 4 ++-- 
pandas/tests/io/test_feather.py | 2 +- pandas/tests/io/test_s3.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 38d32b0bdc8a3..1c29928991cde 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -110,7 +110,7 @@ def test_append_with_timezones_dateutil(setup_path): dti = dti._with_freq(None) # freq doesnt round-trip # GH 4098 example - df = DataFrame(dict(A=Series(range(3), index=dti,))) + df = DataFrame(dict(A=Series(range(3), index=dti))) _maybe_remove(store, "df") store.put("df", df) @@ -197,7 +197,7 @@ def test_append_with_timezones_pytz(setup_path): dti = dti._with_freq(None) # freq doesnt round-trip # GH 4098 example - df = DataFrame(dict(A=Series(range(3), index=dti,))) + df = DataFrame(dict(A=Series(range(3), index=dti))) _maybe_remove(store, "df") store.put("df", df) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a8a5c8f00e6bf..c1e63f512b53e 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -76,7 +76,7 @@ def test_basic(self): pd.Timestamp("20130103"), ], "dtns": pd.DatetimeIndex( - list(pd.date_range("20130101", periods=3, freq="ns")), freq=None, + list(pd.date_range("20130101", periods=3, freq="ns")), freq=None ), } ) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index a137e76b1696b..0ee6cb0796644 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -43,6 +43,6 @@ def test_read_with_creds_from_pub_bucket(): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") df = read_csv( - "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None, + "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None ) assert len(df) == 5 From 1284767b71b05faf9bd93664b2f110aa7dfb1115 Mon Sep 17 
00:00:00 2001 From: jbrockmendel Date: Tue, 8 Sep 2020 08:29:03 -0700 Subject: [PATCH 0709/1025] CLN: remove unused return value in _create_blocks (#36196) --- pandas/core/window/rolling.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 9466ada3f4578..5a7482076903c 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -234,7 +234,7 @@ def _validate_get_window_bounds_signature(window: BaseIndexer) -> None: f"get_window_bounds" ) - def _create_blocks(self, obj: FrameOrSeriesUnion): + def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: """ Split data into blocks & return conformed data. """ @@ -242,9 +242,8 @@ def _create_blocks(self, obj: FrameOrSeriesUnion): if self.on is not None and not isinstance(self.on, Index): if obj.ndim == 2: obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) - blocks = obj._to_dict_of_blocks(copy=False).values() - return blocks, obj + return obj def _gotitem(self, key, ndim, subset=None): """ @@ -333,7 +332,7 @@ def __repr__(self) -> str: def __iter__(self): window = self._get_window(win_type=None) - _, obj = self._create_blocks(self._selected_obj) + obj = self._create_data(self._selected_obj) index = self._get_window_indexer(window=window) start, end = index.get_window_bounds( @@ -469,7 +468,7 @@ def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series": """ Series version of _apply_blockwise """ - _, obj = self._create_blocks(self._selected_obj) + obj = self._create_data(self._selected_obj) try: values = self._prep_values(obj.values) @@ -489,7 +488,7 @@ def _apply_blockwise( if self._selected_obj.ndim == 1: return self._apply_series(homogeneous_func) - _, obj = self._create_blocks(self._selected_obj) + obj = self._create_data(self._selected_obj) mgr = obj._mgr def hfunc(bvalues: ArrayLike) -> ArrayLike: @@ -1268,7 +1267,7 @@ def count(self): # 
implementations shouldn't end up here assert not isinstance(self.window, BaseIndexer) - _, obj = self._create_blocks(self._selected_obj) + obj = self._create_data(self._selected_obj) def hfunc(values: np.ndarray) -> np.ndarray: result = notna(values) @@ -2234,7 +2233,7 @@ def _apply( def _constructor(self): return Rolling - def _create_blocks(self, obj: FrameOrSeriesUnion): + def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: """ Split data into blocks & return conformed data. """ @@ -2246,7 +2245,7 @@ def _create_blocks(self, obj: FrameOrSeriesUnion): list(self._groupby.grouper.indices.values()) ).astype(np.int64) obj = obj.take(groupby_order) - return super()._create_blocks(obj) + return super()._create_data(obj) def _get_cython_func_type(self, func: str) -> Callable: """ From 6bc2c1a929d5c91907790ddc8022af130f5aed5c Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Tue, 8 Sep 2020 11:30:36 -0400 Subject: [PATCH 0710/1025] Make to_numeric default to correct precision (#36149) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/_libs/src/parse_helper.h | 4 +- pandas/tests/tools/test_to_numeric.py | 58 +++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2afa1f1a6199e..2aac2596c18cb 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -245,7 +245,7 @@ Timezones Numeric ^^^^^^^ -- +- Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) - Conversion diff --git a/pandas/_libs/src/parse_helper.h b/pandas/_libs/src/parse_helper.h index 2ada0a4bd173d..d161c4e29fe15 100644 --- a/pandas/_libs/src/parse_helper.h +++ b/pandas/_libs/src/parse_helper.h @@ -18,7 +18,9 @@ int to_double(char *item, double *p_value, char sci, char decimal, char *p_end = NULL; int error = 0; - *p_value = xstrtod(item, &p_end, decimal, sci, '\0', 1, &error, maybe_int); + /* Switch to precise xstrtod GH 31364 */ + *p_value = 
precise_xstrtod(item, &p_end, decimal, sci, '\0', 1, + &error, maybe_int); return (error == 0) && (!*p_end); } diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index 263887a8ea36e..450076f2824ad 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -649,3 +649,61 @@ def test_failure_to_convert_uint64_string_to_NaN(): ser = Series([32, 64, np.nan]) result = to_numeric(pd.Series(["32", "64", "uint64"]), errors="coerce") tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize( + "strrep", + [ + "243.164", + "245.968", + "249.585", + "259.745", + "265.742", + "272.567", + "279.196", + "280.366", + "275.034", + "271.351", + "272.889", + "270.627", + "280.828", + "290.383", + "308.153", + "319.945", + "336.0", + "344.09", + "351.385", + "356.178", + "359.82", + "361.03", + "367.701", + "380.812", + "387.98", + "391.749", + "391.171", + "385.97", + "385.345", + "386.121", + "390.996", + "399.734", + "413.073", + "421.532", + "430.221", + "437.092", + "439.746", + "446.01", + "451.191", + "460.463", + "469.779", + "472.025", + "479.49", + "474.864", + "467.54", + "471.978", + ], +) +def test_precision_float_conversion(strrep): + # GH 31364 + result = to_numeric(strrep) + + assert result == float(strrep) From 61e97ce2eb31e5fbd05cac5daf8041334aca468d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Sep 2020 15:24:54 -0700 Subject: [PATCH 0711/1025] REF: implement Categorical._box_func, make _box_func a method (#36206) --- pandas/core/arrays/categorical.py | 10 ++++++---- pandas/core/arrays/datetimelike.py | 3 +-- pandas/core/arrays/datetimes.py | 6 +++--- pandas/core/arrays/period.py | 5 ++--- pandas/core/arrays/timedeltas.py | 18 +++++++++++++----- pandas/core/indexes/datetimelike.py | 10 +++++----- 6 files changed, 30 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b732db4c66003..f20d3d5e316f8 100644 
--- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1734,6 +1734,11 @@ def _ndarray(self) -> np.ndarray: def _from_backing_data(self, arr: np.ndarray) -> "Categorical": return self._constructor(arr, dtype=self.dtype, fastpath=True) + def _box_func(self, i: int): + if i == -1: + return np.NaN + return self.categories[i] + # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1874,10 +1879,7 @@ def __getitem__(self, key): """ if isinstance(key, (int, np.integer)): i = self._codes[key] - if i == -1: - return np.nan - else: - return self.categories[i] + return self._box_func(i) key = check_array_indexer(self, key) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6477b94a823ce..ba5bfc108f16b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -478,8 +478,7 @@ def _from_backing_data(self: _T, arr: np.ndarray) -> _T: # ------------------------------------------------------------------ - @property - def _box_func(self): + def _box_func(self, x): """ box function to get object from internal representation """ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index d913e7be9ae5f..9f10cc84dcfcc 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -7,6 +7,7 @@ from pandas._libs import lib, tslib from pandas._libs.tslibs import ( NaT, + NaTType, Resolution, Timestamp, conversion, @@ -475,9 +476,8 @@ def _maybe_clear_freq(self): # ----------------------------------------------------------------- # Descriptive Properties - @property - def _box_func(self): - return lambda x: Timestamp(x, freq=self.freq, tz=self.tz) + def _box_func(self, x) -> Union[Timestamp, NaTType]: + return Timestamp(x, freq=self.freq, tz=self.tz) @property def dtype(self) -> Union[np.dtype, DatetimeTZDtype]: diff --git 
a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index c3a9430736969..eea11bde77030 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -484,9 +484,8 @@ def _time_shift(self, periods, freq=None): values[self._isnan] = iNaT return type(self)(values, freq=self.freq) - @property - def _box_func(self): - return lambda x: Period._from_ordinal(ordinal=x, freq=self.freq) + def _box_func(self, x) -> Union[Period, NaTType]: + return Period._from_ordinal(ordinal=x, freq=self.freq) def asfreq(self, freq=None, how: str = "E") -> "PeriodArray": """ diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 485ebb49a376d..5e3c0f2b8d876 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1,10 +1,19 @@ from datetime import timedelta -from typing import List +from typing import List, Union import numpy as np from pandas._libs import lib, tslibs -from pandas._libs.tslibs import NaT, Period, Tick, Timedelta, Timestamp, iNaT, to_offset +from pandas._libs.tslibs import ( + NaT, + NaTType, + Period, + Tick, + Timedelta, + Timestamp, + iNaT, + to_offset, +) from pandas._libs.tslibs.conversion import precision_from_unit from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import array_to_timedelta64, parse_timedelta_unit @@ -108,9 +117,8 @@ class TimedeltaArray(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): # Note: ndim must be defined to ensure NaT.__richcmp(TimedeltaArray) # operates pointwise. 
- @property - def _box_func(self): - return lambda x: Timedelta(x, unit="ns") + def _box_func(self, x) -> Union[Timedelta, NaTType]: + return Timedelta(x, unit="ns") @property def dtype(self): diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e7e93068d9175..54c8ed60b6097 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -81,7 +81,7 @@ def wrapper(left, right): DatetimeLikeArrayMixin, cache=True, ) -@inherit_names(["mean", "asi8", "freq", "freqstr", "_box_func"], DatetimeLikeArrayMixin) +@inherit_names(["mean", "asi8", "freq", "freqstr"], DatetimeLikeArrayMixin) class DatetimeIndexOpsMixin(ExtensionIndex): """ Common ops mixin to support a unified interface datetimelike Index. @@ -244,7 +244,7 @@ def min(self, axis=None, skipna=True, *args, **kwargs): # quick check if len(i8) and self.is_monotonic: if i8[0] != iNaT: - return self._box_func(i8[0]) + return self._data._box_func(i8[0]) if self.hasnans: if skipna: @@ -253,7 +253,7 @@ def min(self, axis=None, skipna=True, *args, **kwargs): return self._na_value else: min_stamp = i8.min() - return self._box_func(min_stamp) + return self._data._box_func(min_stamp) except ValueError: return self._na_value @@ -301,7 +301,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): # quick check if len(i8) and self.is_monotonic: if i8[-1] != iNaT: - return self._box_func(i8[-1]) + return self._data._box_func(i8[-1]) if self.hasnans: if skipna: @@ -310,7 +310,7 @@ def max(self, axis=None, skipna=True, *args, **kwargs): return self._na_value else: max_stamp = i8.max() - return self._box_func(max_stamp) + return self._data._box_func(max_stamp) except ValueError: return self._na_value From f1ddf4f2a5dcb8a11e59d7f0c1c1f13c231ba386 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Sep 2020 15:41:30 -0700 Subject: [PATCH 0712/1025] STY: de-privatize names imported across modules (#36178) --- pandas/__init__.py | 6 +-- pandas/_testing.py | 2 
+- pandas/compat/numpy/__init__.py | 10 ++--- pandas/core/array_algos/masked_reductions.py | 4 +- pandas/core/arrays/sparse/accessor.py | 8 ++-- pandas/core/arrays/sparse/scipy_sparse.py | 4 +- pandas/core/common.py | 4 +- pandas/core/computation/engines.py | 4 +- pandas/core/computation/expr.py | 24 +++++------ pandas/core/computation/expressions.py | 12 +++--- pandas/core/computation/ops.py | 38 ++++++++--------- pandas/core/computation/scope.py | 4 +- pandas/core/dtypes/cast.py | 4 +- pandas/core/dtypes/common.py | 2 +- pandas/core/generic.py | 4 +- pandas/core/index.py | 2 +- pandas/core/indexes/base.py | 4 +- pandas/core/indexes/multi.py | 4 +- pandas/core/nanops.py | 6 +-- pandas/core/ops/__init__.py | 12 +++--- pandas/core/reshape/merge.py | 2 +- pandas/io/formats/format.py | 4 +- pandas/io/json/_json.py | 6 +-- pandas/io/parsers.py | 6 +-- pandas/tests/arrays/test_datetimelike.py | 4 +- pandas/tests/computation/test_eval.py | 42 +++++++++---------- pandas/tests/frame/test_arithmetic.py | 2 +- pandas/tests/generic/test_generic.py | 6 +-- pandas/tests/indexes/common.py | 4 +- pandas/tests/indexes/multi/test_analytics.py | 4 +- pandas/tests/indexes/test_numpy_compat.py | 8 ++-- pandas/tests/indexing/test_loc.py | 4 +- .../tests/scalar/timedelta/test_arithmetic.py | 6 ++- pandas/tests/test_common.py | 4 +- pandas/tests/test_expressions.py | 2 +- pandas/util/_test_decorators.py | 6 +-- 36 files changed, 135 insertions(+), 133 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 2737bcd8f9ccf..70bb0c8a2cb51 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -20,9 +20,9 @@ # numpy compat from pandas.compat.numpy import ( - _np_version_under1p17, - _np_version_under1p18, - _is_numpy_dev, + np_version_under1p17 as _np_version_under1p17, + np_version_under1p18 as _np_version_under1p18, + is_numpy_dev as _is_numpy_dev, ) try: diff --git a/pandas/_testing.py b/pandas/_testing.py index 7dba578951deb..9db0c3496e290 100644 --- 
a/pandas/_testing.py +++ b/pandas/_testing.py @@ -2713,7 +2713,7 @@ def use_numexpr(use, min_elements=None): if min_elements is None: min_elements = expr._MIN_ELEMENTS - olduse = expr._USE_NUMEXPR + olduse = expr.USE_NUMEXPR oldmin = expr._MIN_ELEMENTS expr.set_use_numexpr(use) expr._MIN_ELEMENTS = min_elements diff --git a/pandas/compat/numpy/__init__.py b/pandas/compat/numpy/__init__.py index 08d06da93bb45..a2444b7ba5a0d 100644 --- a/pandas/compat/numpy/__init__.py +++ b/pandas/compat/numpy/__init__.py @@ -8,11 +8,11 @@ # numpy versioning _np_version = np.__version__ _nlv = LooseVersion(_np_version) -_np_version_under1p17 = _nlv < LooseVersion("1.17") -_np_version_under1p18 = _nlv < LooseVersion("1.18") +np_version_under1p17 = _nlv < LooseVersion("1.17") +np_version_under1p18 = _nlv < LooseVersion("1.18") _np_version_under1p19 = _nlv < LooseVersion("1.19") _np_version_under1p20 = _nlv < LooseVersion("1.20") -_is_numpy_dev = ".dev" in str(_nlv) +is_numpy_dev = ".dev" in str(_nlv) _min_numpy_ver = "1.16.5" @@ -65,6 +65,6 @@ def np_array_datetime64_compat(arr, *args, **kwargs): __all__ = [ "np", "_np_version", - "_np_version_under1p17", - "_is_numpy_dev", + "np_version_under1p17", + "is_numpy_dev", ] diff --git a/pandas/core/array_algos/masked_reductions.py b/pandas/core/array_algos/masked_reductions.py index 1b9ed014f27b7..3f4625e2b712a 100644 --- a/pandas/core/array_algos/masked_reductions.py +++ b/pandas/core/array_algos/masked_reductions.py @@ -8,7 +8,7 @@ import numpy as np from pandas._libs import missing as libmissing -from pandas.compat.numpy import _np_version_under1p17 +from pandas.compat.numpy import np_version_under1p17 from pandas.core.nanops import check_below_min_count @@ -46,7 +46,7 @@ def _sumprod( if check_below_min_count(values.shape, mask, min_count): return libmissing.NA - if _np_version_under1p17: + if np_version_under1p17: return func(values[~mask]) else: return func(values, where=~mask) diff --git a/pandas/core/arrays/sparse/accessor.py 
b/pandas/core/arrays/sparse/accessor.py index da8d695c59b9e..ec4b0fd89860c 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -88,9 +88,9 @@ def from_coo(cls, A, dense_index=False): dtype: Sparse[float64, nan] """ from pandas import Series - from pandas.core.arrays.sparse.scipy_sparse import _coo_to_sparse_series + from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series - result = _coo_to_sparse_series(A, dense_index=dense_index) + result = coo_to_sparse_series(A, dense_index=dense_index) result = Series(result.array, index=result.index, copy=False) return result @@ -168,9 +168,9 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): >>> columns [('a', 0), ('a', 1), ('b', 0), ('b', 1)] """ - from pandas.core.arrays.sparse.scipy_sparse import _sparse_series_to_coo + from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo - A, rows, columns = _sparse_series_to_coo( + A, rows, columns = sparse_series_to_coo( self._parent, row_levels, column_levels, sort_labels=sort_labels ) return A, rows, columns diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index eafd782dc9b9c..56c678c88b9c7 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -85,7 +85,7 @@ def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): return values, i_coord, j_coord, i_labels, j_labels -def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): +def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): """ Convert a sparse Series to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column @@ -113,7 +113,7 @@ def _sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=F return sparse_matrix, rows, columns -def _coo_to_sparse_series(A, dense_index: bool = False): +def 
coo_to_sparse_series(A, dense_index: bool = False): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. diff --git a/pandas/core/common.py b/pandas/core/common.py index 279d512e5a046..968fb180abcd0 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -16,7 +16,7 @@ from pandas._libs import lib, tslibs from pandas._typing import AnyArrayLike, Scalar, T -from pandas.compat.numpy import _np_version_under1p18 +from pandas.compat.numpy import np_version_under1p18 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import ( @@ -425,7 +425,7 @@ def random_state(state=None): if ( is_integer(state) or is_array_like(state) - or (not _np_version_under1p18 and isinstance(state, np.random.BitGenerator)) + or (not np_version_under1p18 and isinstance(state, np.random.BitGenerator)) ): return np.random.RandomState(state) elif isinstance(state, np.random.RandomState): diff --git a/pandas/core/computation/engines.py b/pandas/core/computation/engines.py index 9c5388faae1bd..0cdc0f530a7f3 100644 --- a/pandas/core/computation/engines.py +++ b/pandas/core/computation/engines.py @@ -6,11 +6,11 @@ from typing import Dict, Type from pandas.core.computation.align import align_terms, reconstruct_object -from pandas.core.computation.ops import _mathops, _reductions +from pandas.core.computation.ops import MATHOPS, REDUCTIONS import pandas.io.formats.printing as printing -_ne_builtins = frozenset(_mathops + _reductions) +_ne_builtins = frozenset(MATHOPS + REDUCTIONS) class NumExprClobberingError(NameError): diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index df71b4fe415f8..8cff6abc071ca 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -12,7 +12,13 @@ import pandas.core.common as com from pandas.core.computation.ops import ( - _LOCAL_TAG, + ARITH_OPS_SYMS, + BOOL_OPS_SYMS, + CMP_OPS_SYMS, + LOCAL_TAG, + MATHOPS, + REDUCTIONS, + UNARY_OPS_SYMS, BinOp, 
Constant, Div, @@ -21,12 +27,6 @@ Term, UnaryOp, UndefinedVariableError, - _arith_ops_syms, - _bool_ops_syms, - _cmp_ops_syms, - _mathops, - _reductions, - _unary_ops_syms, is_term, ) from pandas.core.computation.parsing import clean_backtick_quoted_toks, tokenize_string @@ -101,7 +101,7 @@ def _replace_locals(tok: Tuple[int, str]) -> Tuple[int, str]: """ toknum, tokval = tok if toknum == tokenize.OP and tokval == "@": - return tokenize.OP, _LOCAL_TAG + return tokenize.OP, LOCAL_TAG return toknum, tokval @@ -338,7 +338,7 @@ class BaseExprVisitor(ast.NodeVisitor): const_type: Type[Term] = Constant term_type = Term - binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms + binary_ops = CMP_OPS_SYMS + BOOL_OPS_SYMS + ARITH_OPS_SYMS binary_op_nodes = ( "Gt", "Lt", @@ -362,7 +362,7 @@ class BaseExprVisitor(ast.NodeVisitor): ) binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes)) - unary_ops = _unary_ops_syms + unary_ops = UNARY_OPS_SYMS unary_op_nodes = "UAdd", "USub", "Invert", "Not" unary_op_nodes_map = {k: v for k, v in zip(unary_ops, unary_op_nodes)} @@ -494,7 +494,7 @@ def _maybe_evaluate_binop( if self.engine != "pytables": if ( - res.op in _cmp_ops_syms + res.op in CMP_OPS_SYMS and getattr(lhs, "is_datetime", False) or getattr(rhs, "is_datetime", False) ): @@ -726,7 +726,7 @@ def visitor(x, y): _python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"]) -_numexpr_supported_calls = frozenset(_reductions + _mathops) +_numexpr_supported_calls = frozenset(REDUCTIONS + MATHOPS) @disallow( diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index d2c08c343ab4b..0032fe97b8b33 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -23,7 +23,7 @@ _TEST_MODE = None _TEST_RESULT: List[bool] = list() -_USE_NUMEXPR = NUMEXPR_INSTALLED +USE_NUMEXPR = NUMEXPR_INSTALLED _evaluate = None _where = None @@ -39,21 +39,21 @@ def set_use_numexpr(v=True): # set/unset to use 
numexpr - global _USE_NUMEXPR + global USE_NUMEXPR if NUMEXPR_INSTALLED: - _USE_NUMEXPR = v + USE_NUMEXPR = v # choose what we are going to do global _evaluate, _where - _evaluate = _evaluate_numexpr if _USE_NUMEXPR else _evaluate_standard - _where = _where_numexpr if _USE_NUMEXPR else _where_standard + _evaluate = _evaluate_numexpr if USE_NUMEXPR else _evaluate_standard + _where = _where_numexpr if USE_NUMEXPR else _where_standard def set_numexpr_threads(n=None): # if we are using numexpr, set the threads to n # otherwise reset - if NUMEXPR_INSTALLED and _USE_NUMEXPR: + if NUMEXPR_INSTALLED and USE_NUMEXPR: if n is None: n = ne.detect_number_of_cores() ne.set_num_threads(n) diff --git a/pandas/core/computation/ops.py b/pandas/core/computation/ops.py index 1fb3910b8577d..5759cd17476d6 100644 --- a/pandas/core/computation/ops.py +++ b/pandas/core/computation/ops.py @@ -16,11 +16,11 @@ import pandas.core.common as com from pandas.core.computation.common import ensure_decoded, result_type_many -from pandas.core.computation.scope import _DEFAULT_GLOBALS +from pandas.core.computation.scope import DEFAULT_GLOBALS from pandas.io.formats.printing import pprint_thing, pprint_thing_encoded -_reductions = ("sum", "prod") +REDUCTIONS = ("sum", "prod") _unary_math_ops = ( "sin", @@ -46,10 +46,10 @@ ) _binary_math_ops = ("arctan2",) -_mathops = _unary_math_ops + _binary_math_ops +MATHOPS = _unary_math_ops + _binary_math_ops -_LOCAL_TAG = "__pd_eval_local_" +LOCAL_TAG = "__pd_eval_local_" class UndefinedVariableError(NameError): @@ -80,13 +80,13 @@ def __init__(self, name, env, side=None, encoding=None): self.env = env self.side = side tname = str(name) - self.is_local = tname.startswith(_LOCAL_TAG) or tname in _DEFAULT_GLOBALS + self.is_local = tname.startswith(LOCAL_TAG) or tname in DEFAULT_GLOBALS self._value = self._resolve_name() self.encoding = encoding @property def local_name(self) -> str: - return self.name.replace(_LOCAL_TAG, "") + return self.name.replace(LOCAL_TAG, 
"") def __repr__(self) -> str: return pprint_thing(self.name) @@ -220,7 +220,7 @@ def __repr__(self) -> str: @property def return_type(self): # clobber types to bool if the op is a boolean operator - if self.op in (_cmp_ops_syms + _bool_ops_syms): + if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS): return np.bool_ return result_type_many(*(term.type for term in com.flatten(self))) @@ -280,7 +280,7 @@ def _not_in(x, y): return x not in y -_cmp_ops_syms = (">", "<", ">=", "<=", "==", "!=", "in", "not in") +CMP_OPS_SYMS = (">", "<", ">=", "<=", "==", "!=", "in", "not in") _cmp_ops_funcs = ( operator.gt, operator.lt, @@ -291,13 +291,13 @@ def _not_in(x, y): _in, _not_in, ) -_cmp_ops_dict = dict(zip(_cmp_ops_syms, _cmp_ops_funcs)) +_cmp_ops_dict = dict(zip(CMP_OPS_SYMS, _cmp_ops_funcs)) -_bool_ops_syms = ("&", "|", "and", "or") +BOOL_OPS_SYMS = ("&", "|", "and", "or") _bool_ops_funcs = (operator.and_, operator.or_, operator.and_, operator.or_) -_bool_ops_dict = dict(zip(_bool_ops_syms, _bool_ops_funcs)) +_bool_ops_dict = dict(zip(BOOL_OPS_SYMS, _bool_ops_funcs)) -_arith_ops_syms = ("+", "-", "*", "/", "**", "//", "%") +ARITH_OPS_SYMS = ("+", "-", "*", "/", "**", "//", "%") _arith_ops_funcs = ( operator.add, operator.sub, @@ -307,12 +307,12 @@ def _not_in(x, y): operator.floordiv, operator.mod, ) -_arith_ops_dict = dict(zip(_arith_ops_syms, _arith_ops_funcs)) +_arith_ops_dict = dict(zip(ARITH_OPS_SYMS, _arith_ops_funcs)) -_special_case_arith_ops_syms = ("**", "//", "%") +SPECIAL_CASE_ARITH_OPS_SYMS = ("**", "//", "%") _special_case_arith_ops_funcs = (operator.pow, operator.floordiv, operator.mod) _special_case_arith_ops_dict = dict( - zip(_special_case_arith_ops_syms, _special_case_arith_ops_funcs) + zip(SPECIAL_CASE_ARITH_OPS_SYMS, _special_case_arith_ops_funcs) ) _binary_ops_dict = {} @@ -530,9 +530,9 @@ def __init__(self, lhs, rhs): _cast_inplace(com.flatten(self), acceptable_dtypes, np.float_) -_unary_ops_syms = ("+", "-", "~", "not") +UNARY_OPS_SYMS = ("+", "-", "~", 
"not") _unary_ops_funcs = (operator.pos, operator.neg, operator.invert, operator.invert) -_unary_ops_dict = dict(zip(_unary_ops_syms, _unary_ops_funcs)) +_unary_ops_dict = dict(zip(UNARY_OPS_SYMS, _unary_ops_funcs)) class UnaryOp(Op): @@ -561,7 +561,7 @@ def __init__(self, op: str, operand): except KeyError as err: raise ValueError( f"Invalid unary operator {repr(op)}, " - f"valid operators are {_unary_ops_syms}" + f"valid operators are {UNARY_OPS_SYMS}" ) from err def __call__(self, env): @@ -602,7 +602,7 @@ class FuncNode: def __init__(self, name: str): from pandas.core.computation.check import NUMEXPR_INSTALLED, NUMEXPR_VERSION - if name not in _mathops or ( + if name not in MATHOPS or ( NUMEXPR_INSTALLED and NUMEXPR_VERSION < LooseVersion("2.6.9") and name in ("floor", "ceil") diff --git a/pandas/core/computation/scope.py b/pandas/core/computation/scope.py index 83bf92ad737e4..2925f583bfc56 100644 --- a/pandas/core/computation/scope.py +++ b/pandas/core/computation/scope.py @@ -53,7 +53,7 @@ def _raw_hex_id(obj) -> str: return "".join(_replacer(x) for x in packed) -_DEFAULT_GLOBALS = { +DEFAULT_GLOBALS = { "Timestamp": Timestamp, "datetime": datetime.datetime, "True": True, @@ -114,7 +114,7 @@ def __init__( # shallow copy because we don't want to keep filling this up with what # was there before if there are multiple calls to Scope/_ensure_scope - self.scope = DeepChainMap(_DEFAULT_GLOBALS.copy()) + self.scope = DeepChainMap(DEFAULT_GLOBALS.copy()) self.target = target if isinstance(local_dict, Scope): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 7c5aafcbbc7e9..8f9c0cf7a01db 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -22,9 +22,9 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - _POSSIBLY_CAST_DTYPES, DT64NS_DTYPE, INT64_DTYPE, + POSSIBLY_CAST_DTYPES, TD64NS_DTYPE, ensure_int8, ensure_int16, @@ -1188,7 +1188,7 @@ def maybe_castable(arr) -> bool: elif 
kind == "m": return is_timedelta64_ns_dtype(arr.dtype) - return arr.dtype.name not in _POSSIBLY_CAST_DTYPES + return arr.dtype.name not in POSSIBLY_CAST_DTYPES def maybe_infer_to_datetimelike(value, convert_dates: bool = False): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 6ad46eb967275..5987fdabf78bb 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -43,7 +43,7 @@ is_sequence, ) -_POSSIBLY_CAST_DTYPES = { +POSSIBLY_CAST_DTYPES = { np.dtype(t).name for t in [ "O", diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 93c945638a174..40f0c6200e835 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -102,7 +102,7 @@ import pandas.core.indexing as indexing from pandas.core.internals import BlockManager from pandas.core.missing import find_valid_index -from pandas.core.ops import _align_method_FRAME +from pandas.core.ops import align_method_FRAME from pandas.core.shared_docs import _shared_docs from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window @@ -7402,7 +7402,7 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): if isinstance(self, ABCSeries): threshold = self._constructor(threshold, index=self.index) else: - threshold = _align_method_FRAME(self, threshold, axis, flex=None)[1] + threshold = align_method_FRAME(self, threshold, axis, flex=None)[1] return self.where(subset, threshold, axis=axis, inplace=inplace) def clip( diff --git a/pandas/core/index.py b/pandas/core/index.py index a315b9619b0e7..44f434e038a4b 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -19,7 +19,7 @@ ensure_index_from_sequences, get_objs_combined_axis, ) -from pandas.core.indexes.multi import _sparsify # noqa:F401 +from pandas.core.indexes.multi import sparsify_labels # noqa:F401 # GH#30193 warnings.warn( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a1bc8a4659b24..526dae7e256b7 100644 --- 
a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3586,7 +3586,7 @@ def join(self, other, how="left", level=None, return_indexers=False, sort=False) def _join_multi(self, other, how, return_indexers=True): from pandas.core.indexes.multi import MultiIndex - from pandas.core.reshape.merge import _restore_dropped_levels_multijoin + from pandas.core.reshape.merge import restore_dropped_levels_multijoin # figure out join names self_names = set(com.not_none(*self.names)) @@ -3622,7 +3622,7 @@ def _join_multi(self, other, how, return_indexers=True): # common levels, ldrop_names, rdrop_names dropped_names = ldrop_names + rdrop_names - levels, codes, names = _restore_dropped_levels_multijoin( + levels, codes, names = restore_dropped_levels_multijoin( self, other, dropped_names, join_idx, lidx, ridx ) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 9630e154ccd17..deeb7ff50b88c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1337,7 +1337,7 @@ def format( if sparsify in [False, lib.no_default]: sentinel = sparsify # little bit of a kludge job for #1217 - result_levels = _sparsify( + result_levels = sparsify_labels( result_levels, start=int(names), sentinel=sentinel ) @@ -3692,7 +3692,7 @@ def _add_numeric_methods_disabled(cls): MultiIndex._add_logical_methods_disabled() -def _sparsify(label_list, start: int = 0, sentinel=""): +def sparsify_labels(label_list, start: int = 0, sentinel=""): pivoted = list(zip(*label_list)) k = len(label_list) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 6fdde22a1c514..64470da2fb910 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -11,7 +11,7 @@ from pandas._typing import ArrayLike, Dtype, DtypeObj, F, Scalar from pandas.compat._optional import import_optional_dependency -from pandas.core.dtypes.cast import _int64_max, maybe_upcast_putmask +from pandas.core.dtypes.cast import maybe_upcast_putmask from pandas.core.dtypes.common 
import ( get_dtype, is_any_int_dtype, @@ -185,7 +185,7 @@ def _get_fill_value( else: if fill_value_typ == "+inf": # need the max int here - return _int64_max + return np.iinfo(np.int64).max else: return iNaT @@ -346,7 +346,7 @@ def _wrap_results(result, dtype: DtypeObj, fill_value=None): result = np.nan # raise if we have a timedelta64[ns] which is too large - if np.fabs(result) > _int64_max: + if np.fabs(result) > np.iinfo(np.int64).max: raise ValueError("overflow in timedelta operation") result = Timedelta(result, unit="ns") diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 60f3d23aaed13..8fcbee6a20ac3 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -306,7 +306,7 @@ def dispatch_to_series(left, right, func, axis: Optional[int] = None): def _align_method_SERIES(left: "Series", right, align_asobject: bool = False): """ align lhs and rhs Series """ - # ToDo: Different from _align_method_FRAME, list, tuple and ndarray + # ToDo: Different from align_method_FRAME, list, tuple and ndarray # are not coerced here # because Series has inconsistencies described in #13637 @@ -430,7 +430,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis=0): # DataFrame -def _align_method_FRAME( +def align_method_FRAME( left, right, axis, flex: Optional[bool] = False, level: Level = None ): """ @@ -571,7 +571,7 @@ def _frame_arith_method_with_reindex( new_right = right.iloc[:, rcols] result = op(new_left, new_right) - # Do the join on the columns instead of using _align_method_FRAME + # Do the join on the columns instead of using align_method_FRAME # to avoid constructing two potentially large/sparse DataFrames join_columns, _, _ = left.columns.join( right.columns, how="outer", level=None, return_indexers=True @@ -644,7 +644,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): # TODO: why are we passing flex=True instead of flex=not special? 
# 15 tests fail if we pass flex=not special instead - self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) + self, other = align_method_FRAME(self, other, axis, flex=True, level=level) if isinstance(other, ABCDataFrame): # Another DataFrame @@ -680,7 +680,7 @@ def _flex_comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): def f(self, other, axis=default_axis, level=None): axis = self._get_axis_number(axis) if axis is not None else 1 - self, other = _align_method_FRAME(self, other, axis, flex=True, level=level) + self, other = align_method_FRAME(self, other, axis, flex=True, level=level) new_data = dispatch_to_series(self, other, op, axis=axis) return self._construct_result(new_data) @@ -698,7 +698,7 @@ def _comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): def f(self, other): axis = 1 # only relevant for Series other case - self, other = _align_method_FRAME(self, other, axis, level=None, flex=False) + self, other = align_method_FRAME(self, other, axis, level=None, flex=False) # See GH#4537 for discussion of scalar op behavior new_data = dispatch_to_series(self, other, op, axis=axis) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index f1c5486222ea1..030dec369c2be 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1350,7 +1350,7 @@ def _get_join_indexers( return join_func(lkey, rkey, count, **kwargs) -def _restore_dropped_levels_multijoin( +def restore_dropped_levels_multijoin( left: MultiIndex, right: MultiIndex, dropped_level_names, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index f31e60a43e391..444afcee49a61 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -995,7 +995,7 @@ def to_html( ) def _get_formatted_column_labels(self, frame: "DataFrame") -> List[List[str]]: - from pandas.core.indexes.multi import _sparsify + from pandas.core.indexes.multi import sparsify_labels columns = frame.columns @@ -1021,7 
+1021,7 @@ def space_format(x, y): zip(*[[space_format(x, y) for y in x] for x in fmt_columns]) ) if self.sparsify and len(str_columns): - str_columns = _sparsify(str_columns) + str_columns = sparsify_labels(str_columns) str_columns = [list(x) for x in zip(*str_columns)] else: diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index a4d923fdbe45a..e3000788cb33a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -22,7 +22,7 @@ from pandas.io.common import get_compression_method, get_filepath_or_buffer, get_handle from pandas.io.json._normalize import convert_to_line_delimits from pandas.io.json._table_schema import build_table_schema, parse_table_schema -from pandas.io.parsers import _validate_integer +from pandas.io.parsers import validate_integer loads = json.loads dumps = json.dumps @@ -698,11 +698,11 @@ def __init__( self.file_handles: List[IO] = [] if self.chunksize is not None: - self.chunksize = _validate_integer("chunksize", self.chunksize, 1) + self.chunksize = validate_integer("chunksize", self.chunksize, 1) if not self.lines: raise ValueError("chunksize can only be passed if lines=True") if self.nrows is not None: - self.nrows = _validate_integer("nrows", self.nrows, 0) + self.nrows = validate_integer("nrows", self.nrows, 0) if not self.lines: raise ValueError("nrows can only be passed if lines=True") diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a0466c5ac6b57..4c619a636f057 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -361,7 +361,7 @@ ) -def _validate_integer(name, val, min_val=0): +def validate_integer(name, val, min_val=0): """ Checks whether the 'name' parameter for parsing is either an integer OR float that can SAFELY be cast to an integer @@ -436,7 +436,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Extract some of the arguments (pass chunksize on). 
iterator = kwds.get("iterator", False) - chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1) + chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1) nrows = kwds.get("nrows", None) # Check for duplicates in names. @@ -1179,7 +1179,7 @@ def _failover_to_python(self): raise AbstractMethodError(self) def read(self, nrows=None): - nrows = _validate_integer("nrows", nrows) + nrows = validate_integer("nrows", nrows) ret = self._engine.read(nrows) # May alter columns / col_dict diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 292557fc04258..d2d3766959fbf 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -4,7 +4,7 @@ import pytest from pandas._libs import OutOfBoundsDatetime -from pandas.compat.numpy import _np_version_under1p18 +from pandas.compat.numpy import np_version_under1p18 import pandas as pd import pandas._testing as tm @@ -960,7 +960,7 @@ def test_invalid_nat_setitem_array(array, non_casting_nats): ], ) def test_to_numpy_extra(array): - if _np_version_under1p18: + if np_version_under1p18: # np.isnan(NaT) raises, so use pandas' isnan = pd.isna else: diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index 49066428eb16c..72dc04e68c154 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -26,12 +26,12 @@ PandasExprVisitor, PythonExprVisitor, ) -from pandas.core.computation.expressions import _USE_NUMEXPR, NUMEXPR_INSTALLED +from pandas.core.computation.expressions import NUMEXPR_INSTALLED, USE_NUMEXPR from pandas.core.computation.ops import ( - _arith_ops_syms, + ARITH_OPS_SYMS, + SPECIAL_CASE_ARITH_OPS_SYMS, _binary_math_ops, _binary_ops_dict, - _special_case_arith_ops_syms, _unary_math_ops, ) @@ -41,8 +41,8 @@ pytest.param( engine, marks=pytest.mark.skipif( - engine == "numexpr" and not _USE_NUMEXPR, - reason=f"numexpr 
enabled->{_USE_NUMEXPR}, " + engine == "numexpr" and not USE_NUMEXPR, + reason=f"numexpr enabled->{USE_NUMEXPR}, " f"installed->{NUMEXPR_INSTALLED}", ), ) @@ -114,7 +114,7 @@ def _is_py3_complex_incompat(result, expected): return isinstance(expected, (complex, np.complexfloating)) and np.isnan(result) -_good_arith_ops = set(_arith_ops_syms).difference(_special_case_arith_ops_syms) +_good_arith_ops = set(ARITH_OPS_SYMS).difference(SPECIAL_CASE_ARITH_OPS_SYMS) @td.skip_if_no_ne @@ -158,10 +158,10 @@ def setup_data(self): self.rhses = self.pandas_rhses + self.scalar_rhses def setup_ops(self): - self.cmp_ops = expr._cmp_ops_syms + self.cmp_ops = expr.CMP_OPS_SYMS self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = expr._bool_ops_syms - self.special_case_ops = _special_case_arith_ops_syms + self.bin_ops = expr.BOOL_OPS_SYMS + self.special_case_ops = SPECIAL_CASE_ARITH_OPS_SYMS self.arith_ops = _good_arith_ops self.unary_ops = "-", "~", "not " @@ -774,10 +774,10 @@ def setup_class(cls): cls.parser = "python" def setup_ops(self): - self.cmp_ops = [op for op in expr._cmp_ops_syms if op not in ("in", "not in")] + self.cmp_ops = [op for op in expr.CMP_OPS_SYMS if op not in ("in", "not in")] self.cmp2_ops = self.cmp_ops[::-1] - self.bin_ops = [op for op in expr._bool_ops_syms if op not in ("and", "or")] - self.special_case_ops = _special_case_arith_ops_syms + self.bin_ops = [op for op in expr.BOOL_OPS_SYMS if op not in ("and", "or")] + self.special_case_ops = SPECIAL_CASE_ARITH_OPS_SYMS self.arith_ops = _good_arith_ops self.unary_ops = "+", "-", "~" @@ -1135,7 +1135,7 @@ class TestOperationsNumExprPandas: def setup_class(cls): cls.engine = "numexpr" cls.parser = "pandas" - cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + cls.arith_ops = expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS @classmethod def teardown_class(cls): @@ -1177,7 +1177,7 @@ def test_simple_arith_ops(self): assert y == expec def test_simple_bool_ops(self): - for op, lhs, rhs in 
product(expr._bool_ops_syms, (True, False), (True, False)): + for op, lhs, rhs in product(expr.BOOL_OPS_SYMS, (True, False), (True, False)): ex = f"{lhs} {op} {rhs}" res = self.eval(ex) exp = eval(ex) @@ -1185,7 +1185,7 @@ def test_simple_bool_ops(self): def test_bool_ops_with_constants(self): for op, lhs, rhs in product( - expr._bool_ops_syms, ("True", "False"), ("True", "False") + expr.BOOL_OPS_SYMS, ("True", "False"), ("True", "False") ): ex = f"{lhs} {op} {rhs}" res = self.eval(ex) @@ -1637,7 +1637,7 @@ def setup_class(cls): cls.parser = "python" cls.arith_ops = [ op - for op in expr._arith_ops_syms + expr._cmp_ops_syms + for op in expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS if op not in ("in", "not in") ] @@ -1697,7 +1697,7 @@ def test_fails_pipe(self): def test_bool_ops_with_constants(self): for op, lhs, rhs in product( - expr._bool_ops_syms, ("True", "False"), ("True", "False") + expr.BOOL_OPS_SYMS, ("True", "False"), ("True", "False") ): ex = f"{lhs} {op} {rhs}" if op in ("and", "or"): @@ -1710,7 +1710,7 @@ def test_bool_ops_with_constants(self): assert res == exp def test_simple_bool_ops(self): - for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)): + for op, lhs, rhs in product(expr.BOOL_OPS_SYMS, (True, False), (True, False)): ex = f"lhs {op} rhs" if op in ("and", "or"): msg = "'BoolOp' nodes are not implemented" @@ -1729,7 +1729,7 @@ def setup_class(cls): cls.engine = cls.parser = "python" cls.arith_ops = [ op - for op in expr._arith_ops_syms + expr._cmp_ops_syms + for op in expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS if op not in ("in", "not in") ] @@ -1740,7 +1740,7 @@ def setup_class(cls): super().setup_class() cls.engine = "python" cls.parser = "pandas" - cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms + cls.arith_ops = expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS @td.skip_if_no_ne @@ -2020,7 +2020,7 @@ def test_equals_various(other): df = DataFrame({"A": ["a", "b", "c"]}) result = df.eval(f"A == {other}") expected = 
Series([False, False, False], name="A") - if _USE_NUMEXPR: + if USE_NUMEXPR: # https://github.com/pandas-dev/pandas/issues/10239 # lose name with numexpr engine. Remove when that's fixed. expected.name = None diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 70d0b4e9e835c..6dd8d890e8a4b 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1417,7 +1417,7 @@ def test_alignment_non_pandas(self): columns = ["X", "Y", "Z"] df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns) - align = pd.core.ops._align_method_FRAME + align = pd.core.ops.align_method_FRAME for val in [ [1, 2, 3], (1, 2, 3), diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 23bb673586768..2c2584e8dee01 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p17 +from pandas.compat.numpy import np_version_under1p17 from pandas.core.dtypes.common import is_scalar @@ -652,12 +652,12 @@ def test_sample(sel): pytest.param( "np.random.MT19937", 3, - marks=pytest.mark.skipif(_np_version_under1p17, reason="NumPy<1.17"), + marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"), ), pytest.param( "np.random.PCG64", 11, - marks=pytest.mark.skipif(_np_version_under1p17, reason="NumPy<1.17"), + marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"), ), ], ) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index e95e7267f17ec..11dc232af8de4 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -5,7 +5,7 @@ import pytest from pandas._libs import iNaT -from pandas.compat.numpy import _is_numpy_dev +from pandas.compat.numpy import is_numpy_dev from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_datetime64tz_dtype @@ 
-475,7 +475,7 @@ def test_intersection_base(self, index, request): for case in cases: # https://github.com/pandas-dev/pandas/issues/35481 if ( - _is_numpy_dev + is_numpy_dev and isinstance(case, Series) and isinstance(index, UInt64Index) ): diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py index 9e4e73e793bac..d661a56311e6c 100644 --- a/pandas/tests/indexes/multi/test_analytics.py +++ b/pandas/tests/indexes/multi/test_analytics.py @@ -1,7 +1,7 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p17 +from pandas.compat.numpy import np_version_under1p17 import pandas as pd from pandas import Index, MultiIndex, date_range, period_range @@ -240,7 +240,7 @@ def test_numpy_ufuncs(idx, func): # test ufuncs of numpy. see: # https://numpy.org/doc/stable/reference/ufuncs.html - if _np_version_under1p17: + if np_version_under1p17: expected_exception = AttributeError msg = f"'tuple' object has no attribute '{func.__name__}'" else: diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index 043539c173427..a83684464caf6 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas.compat.numpy import np_version_under1p17, np_version_under1p18 + from pandas import ( DatetimeIndex, Float64Index, @@ -9,8 +11,6 @@ PeriodIndex, TimedeltaIndex, UInt64Index, - _np_version_under1p17, - _np_version_under1p18, ) import pandas._testing as tm from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin @@ -83,12 +83,12 @@ def test_numpy_ufuncs_other(index, func): if func in [np.isfinite, np.isnan, np.isinf]: pytest.xfail(reason="__array_ufunc__ is not defined") - if not _np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]: + if not np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]: # numpy 1.18(dev) changed isinf and 
isnan to not raise on dt64/tfd64 result = func(index) assert isinstance(result, np.ndarray) - elif not _np_version_under1p17 and func in [np.isfinite]: + elif not np_version_under1p17 and func in [np.isfinite]: # ok under numpy >= 1.17 # Results in bool array result = func(index) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e42d9679464d8..9a6f30ec920cc 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -5,7 +5,7 @@ import numpy as np import pytest -from pandas.compat.numpy import _is_numpy_dev +from pandas.compat.numpy import is_numpy_dev import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range @@ -938,7 +938,7 @@ def test_loc_setitem_empty_append(self): df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) - @pytest.mark.xfail(_is_numpy_dev, reason="gh-35481") + @pytest.mark.xfail(is_numpy_dev, reason="gh-35481") def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index cb33f99d9bd91..d4d7e4b85268f 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -7,8 +7,10 @@ import numpy as np import pytest +from pandas.compat.numpy import is_numpy_dev + import pandas as pd -from pandas import NaT, Timedelta, Timestamp, _is_numpy_dev, compat, offsets +from pandas import NaT, Timedelta, Timestamp, compat, offsets import pandas._testing as tm from pandas.core import ops @@ -426,7 +428,7 @@ def test_td_div_numeric_scalar(self): np.float64("NaN"), marks=pytest.mark.xfail( # Works on numpy dev only in python 3.9 - _is_numpy_dev and not compat.PY39, + is_numpy_dev and not compat.PY39, raises=RuntimeWarning, reason="https://github.com/pandas-dev/pandas/issues/31992", ), diff --git a/pandas/tests/test_common.py 
b/pandas/tests/test_common.py index 3d45a1f7389b7..f7f3f1fa0c13d 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -6,7 +6,7 @@ import numpy as np import pytest -from pandas.compat.numpy import _np_version_under1p17 +from pandas.compat.numpy import np_version_under1p17 import pandas as pd from pandas import Series, Timestamp @@ -72,7 +72,7 @@ def test_random_state(): # Check BitGenerators # GH32503 - if not _np_version_under1p17: + if not np_version_under1p17: assert ( com.random_state(npr.MT19937(3)).uniform() == npr.RandomState(npr.MT19937(3)).uniform() diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index 2368e93ddc256..da7f8b9b4a721 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -35,7 +35,7 @@ ) -@pytest.mark.skipif(not expr._USE_NUMEXPR, reason="not using numexpr") +@pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr") class TestExpressions: def setup_method(self, method): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 94c252eca1671..e9deaf3fe67de 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -35,7 +35,7 @@ def test_foo(): from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import _np_version -from pandas.core.computation.expressions import _USE_NUMEXPR, NUMEXPR_INSTALLED +from pandas.core.computation.expressions import NUMEXPR_INSTALLED, USE_NUMEXPR def safe_import(mod_name: str, min_version: Optional[str] = None): @@ -195,8 +195,8 @@ def skip_if_no(package: str, min_version: Optional[str] = None): _skip_if_no_scipy(), reason="Missing SciPy requirement" ) skip_if_no_ne = pytest.mark.skipif( - not _USE_NUMEXPR, - reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{NUMEXPR_INSTALLED}", + not USE_NUMEXPR, + reason=f"numexpr enabled->{USE_NUMEXPR}, installed->{NUMEXPR_INSTALLED}", ) From 5e957b8f4864cfd2b948d01389eb6336ca7cde9c Mon 
Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 8 Sep 2020 23:43:52 +0100 Subject: [PATCH 0713/1025] DOC: Start 1.1.3 (#36183) --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.1.2.rst | 2 +- doc/source/whatsnew/v1.1.3.rst | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v1.1.3.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 1827d151579a1..933ed3cb8babf 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.3 v1.1.2 v1.1.1 v1.1.0 diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index c6a08f4fb852a..81b8e7df11625 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -61,4 +61,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.1..v1.1.2|HEAD +.. contributors:: v1.1.1..v1.1.2 diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst new file mode 100644 index 0000000000000..e3161012da5d1 --- /dev/null +++ b/doc/source/whatsnew/v1.1.3.rst @@ -0,0 +1,42 @@ +.. _whatsnew_113: + +What's new in 1.1.3 (??) +------------------------ + +These are the changes in pandas 1.1.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_113.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_113.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_113.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_113.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v1.1.2..v1.1.3|HEAD From f7fef6e0fac51465b8842b2743fe84e0ca78945a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 8 Sep 2020 15:53:03 -0700 Subject: [PATCH 0714/1025] CLN: re-use invalid_comparison in Categorical comparisons (#36229) --- pandas/core/arrays/categorical.py | 10 +--------- pandas/tests/arrays/categorical/test_operators.py | 13 +++++-------- pandas/tests/series/test_arithmetic.py | 2 +- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f20d3d5e316f8..a2b5b54c55490 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -119,15 +119,7 @@ def func(self, other): ret[mask] = False return ret else: - if opname == "__eq__": - return np.zeros(len(self), dtype=bool) - elif opname == "__ne__": - return np.ones(len(self), dtype=bool) - else: - raise TypeError( - f"Cannot compare a Categorical for op {opname} with a " - "scalar, which is not a category." 
- ) + return ops.invalid_comparison(self, other, op) else: # allow categorical vs object dtype array comparisons for equality # these are only positional comparisons diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 6ea003c122eea..bc5fb51883b3d 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -171,17 +171,14 @@ def test_comparison_with_unknown_scalars(self): # for unequal comps, but not for equal/not equal cat = Categorical([1, 2, 3], ordered=True) - msg = ( - "Cannot compare a Categorical for op __{}__ with a scalar, " - "which is not a category" - ) - with pytest.raises(TypeError, match=msg.format("lt")): + msg = "Invalid comparison between dtype=category and int" + with pytest.raises(TypeError, match=msg): cat < 4 - with pytest.raises(TypeError, match=msg.format("gt")): + with pytest.raises(TypeError, match=msg): cat > 4 - with pytest.raises(TypeError, match=msg.format("gt")): + with pytest.raises(TypeError, match=msg): 4 < cat - with pytest.raises(TypeError, match=msg.format("lt")): + with pytest.raises(TypeError, match=msg): 4 > cat tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False])) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index ef2bafd4ea2ad..c937e357b9dbc 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -501,7 +501,7 @@ def test_unequal_categorical_comparison_raises_type_error(self): # for unequal comps, but not for equal/not equal cat = Series(Categorical(list("abc"), ordered=True)) - msg = "Cannot compare a Categorical for op.+with a scalar" + msg = "Invalid comparison between dtype=category and str" with pytest.raises(TypeError, match=msg): cat < "d" with pytest.raises(TypeError, match=msg): From d8434f9bdccb38e80858b7b38a48d0686982d57b Mon Sep 17 00:00:00 2001 From: attack68 
<24256554+attack68@users.noreply.github.com> Date: Wed, 9 Sep 2020 01:56:46 +0200 Subject: [PATCH 0715/1025] CLN: w3 formatting (#36223) --- pandas/io/formats/style.py | 2 +- pandas/tests/io/formats/test_style.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 023557dd6494d..b27a4e036e137 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -327,7 +327,7 @@ def format_attr(pair): colspan = col_lengths.get((r, c), 0) if colspan > 1: es["attributes"] = [ - format_attr({"key": "colspan", "value": colspan}) + format_attr({"key": "colspan", "value": f'"{colspan}"'}) ] row_es.append(es) head.append(row_es) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 6025649e9dbec..de549ec3eb75e 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1691,6 +1691,12 @@ def test_no_cell_ids(self): s = styler.render() # render twice to ensure ctx is not updated assert s.find('

    ') != -1 + def test_colspan_w3(self): + # GH 36223 + df = pd.DataFrame(data=[[1, 2]], columns=[["l0", "l0"], ["l1a", "l1b"]]) + s = Styler(df, uuid="_", cell_ids=False) + assert 'l0` elements. + + Parameters + ---------- + classes : DataFrame + DataFrame containing strings that will be translated to CSS classes, + mapped by identical column and index values that must exist on the + underlying `Styler` data. None, NaN values, and empty strings will + be ignored and not affect the rendered HTML. + + Returns + ------- + self : Styler + + Examples + -------- + >>> df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"]) + >>> classes = pd.DataFrame([ + ... ["min-val red", "", "blue"], + ... ["red", None, "blue max-val"] + ... ], index=df.index, columns=df.columns) + >>> df.style.set_td_classes(classes) + + Using `MultiIndex` columns and a `classes` `DataFrame` as a subset of the + underlying, + + >>> df = pd.DataFrame([[1,2],[3,4]], index=["a", "b"], + ... columns=[["level0", "level0"], ["level1a", "level1b"]]) + >>> classes = pd.DataFrame(["min-val"], index=["a"], + ... columns=[["level0"],["level1a"]]) + >>> df.style.set_td_classes(classes) + + Form of the output with new additional css classes, + + >>> df = pd.DataFrame([[1]]) + >>> css = pd.DataFrame(["other-class"]) + >>> s = Styler(df, uuid="_", cell_ids=False).set_td_classes(css) + >>> s.hide_index().render() + '' + '' + ' ' + ' ' + ' ' + ' ' + ' ' + ' ' + '
    0
    1
    ' + + """ + classes = classes.reindex_like(self.data) + + mask = (classes.isna()) | (classes.eq("")) + self.cell_context["data"] = { + r: {c: [str(classes.iloc[r, c])]} + for r, rn in enumerate(classes.index) + for c, cn in enumerate(classes.columns) + if not mask.iloc[r, c] + } + + return self + def render(self, **kwargs) -> str: """ Render the built up styles to HTML. @@ -609,6 +675,7 @@ def clear(self) -> None: Returns None. """ self.ctx.clear() + self.cell_context = {} self._todo = [] def _compute(self): diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index de549ec3eb75e..e7583e1ce2ce2 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1691,6 +1691,27 @@ def test_no_cell_ids(self): s = styler.render() # render twice to ensure ctx is not updated assert s.find('
    ') != -1 + @pytest.mark.parametrize( + "classes", + [ + DataFrame( + data=[["", "test-class"], [np.nan, None]], + columns=["A", "B"], + index=["a", "b"], + ), + DataFrame(data=[["test-class"]], columns=["B"], index=["a"]), + DataFrame(data=[["test-class", "unused"]], columns=["B", "C"], index=["a"]), + ], + ) + def test_set_data_classes(self, classes): + # GH 36159 + df = DataFrame(data=[[0, 1], [2, 3]], columns=["A", "B"], index=["a", "b"]) + s = Styler(df, uuid="_", cell_ids=False).set_td_classes(classes).render() + assert '0123
    ` tag if specified. - - .. versionadded:: 0.23.0 - render_links : bool, default False Convert URLs to HTML links. @@ -3700,10 +3682,6 @@ def assign(self, **kwargs) -> DataFrame: Later items in '\*\*kwargs' may refer to newly created or modified columns in 'df'; items are computed and assigned into 'df' in order. - .. versionchanged:: 0.23.0 - - Keyword argument order is maintained. - Examples -------- >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, @@ -6604,9 +6582,6 @@ def groupby( specified, all remaining columns will be used and the result will have hierarchically indexed columns. - .. versionchanged:: 0.23.0 - Also accept list of column names. - Returns ------- DataFrame @@ -7200,14 +7175,7 @@ def unstack(self, level=-1, fill_value=None): return unstack(self, level, fill_value) - @Appender( - _shared_docs["melt"] - % dict( - caller="df.melt(", - versionadded="\n .. versionadded:: 0.20.0\n", - other="melt", - ) - ) + @Appender(_shared_docs["melt"] % dict(caller="df.melt(", other="melt",)) def melt( self, id_vars=None, @@ -7418,7 +7386,6 @@ def _gotitem( axis=_shared_doc_kwargs["axis"], see_also=_agg_summary_and_see_also_doc, examples=_agg_examples_doc, - versionadded="\n.. versionadded:: 0.20.0\n", ) def aggregate(self, func=None, axis=0, *args, **kwargs): axis = self._get_axis_number(axis) @@ -7518,9 +7485,6 @@ def apply(self, func, axis=0, raw=False, result_type=None, args=(), **kwds): applied function: list-like results will be returned as a Series of those. However if the apply function returns a Series these are expanded to columns. - - .. versionadded:: 0.23.0 - args : tuple Positional arguments to pass to `func` in addition to the array/series. @@ -7720,7 +7684,6 @@ def append( sort : bool, default False Sort columns if the columns of `self` and `other` are not aligned. - .. versionadded:: 0.23.0 .. versionchanged:: 1.0.0 Changed to not sort by default. 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d7b82923e7488..d78fa42cd1056 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2214,8 +2214,6 @@ def to_json( Describing the data, where data component is like ``orient='records'``. - .. versionchanged:: 0.20.0 - date_format : {None, 'epoch', 'iso'} Type of date conversion. 'epoch' = epoch milliseconds, 'iso' = ISO8601. The default depends on the `orient`. For @@ -2251,9 +2249,6 @@ def to_json( Whether to include the index values in the JSON string. Not including the index (``index=False``) is only supported when orient is 'split' or 'table'. - - .. versionadded:: 0.23.0 - indent : int, optional Length of whitespace used to indent each record. @@ -3011,9 +3006,6 @@ def to_latex( into a main LaTeX document or read from an external file with ``\input{table.tex}``. - .. versionchanged:: 0.20.2 - Added to Series. - .. versionchanged:: 1.0.0 Added caption and label arguments. @@ -6404,9 +6396,6 @@ def replace( The method to use when for replacement, when `to_replace` is a scalar, list or tuple and `value` is ``None``. - .. versionchanged:: 0.23.0 - Added to DataFrame. - Returns ------- {klass} @@ -6836,8 +6825,6 @@ def interpolate( (interpolate). * 'outside': Only fill NaNs outside valid values (extrapolate). - .. versionadded:: 0.23.0 - downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. **kwargs @@ -11349,12 +11336,6 @@ def _doc_parms(cls): min_count : int, default 0 The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. - - .. versionadded:: 0.22.0 - - Added with the default being 0. This means the sum of an all-NA - or empty Series is 0, and the product of an all-NA or empty - Series is 1. 
""" diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 30bd53a3ddff1..ceee78bfebe68 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -215,8 +215,6 @@ class providing the base-class of operations. Apply a function `func` with arguments to this %(klass)s object and return the function's result. -%(versionadded)s - Use `.pipe` when you want to improve readability by chaining together functions that expect Series, DataFrames, GroupBy or Resampler objects. Instead of writing @@ -709,7 +707,6 @@ def __getattr__(self, attr: str): @Substitution( klass="GroupBy", - versionadded=".. versionadded:: 0.21.0", examples="""\ >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]}) >>> df diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 04a63beb2ef45..3d177e08bb0f5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1553,8 +1553,6 @@ def droplevel(self, level=0): If resulting index has only 1 level left, the result will be of Index type, not MultiIndex. - .. versionadded:: 0.23.1 (support for non-MultiIndex) - Parameters ---------- level : int, str, or list-like, default 0 @@ -2296,8 +2294,6 @@ def unique(self, level=None): level : int or str, optional, default None Only return values from specified level (for MultiIndex). - .. versionadded:: 0.23.0 - Returns ------- Index without duplicates diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 29b7bd7a63faa..f881f79cb5c1d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -208,7 +208,6 @@ def _assure_grouper(self): @Substitution( klass="Resampler", - versionadded=".. versionadded:: 0.23.0", examples=""" >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, ... 
index=pd.date_range('2012-08-02', periods=4)) @@ -283,7 +282,6 @@ def pipe(self, func, *args, **kwargs): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="DataFrame", axis="", ) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 9b94dae8556f6..dd4bcf77641ef 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -122,7 +122,6 @@ def concat( This has no effect when ``join='inner'``, which already preserves the order of the non-concatenation axis. - .. versionadded:: 0.23.0 .. versionchanged:: 1.0.0 Changed to not sort by default. diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 33ce5ed49b9c2..7f5fb6b45f014 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -22,10 +22,7 @@ from pandas import DataFrame, Series # noqa: F401 -@Appender( - _shared_docs["melt"] - % dict(caller="pd.melt(df, ", versionadded="", other="DataFrame.melt") -) +@Appender(_shared_docs["melt"] % dict(caller="pd.melt(df, ", other="DataFrame.melt")) def melt( frame: "DataFrame", id_vars=None, @@ -274,12 +271,10 @@ def wide_to_long( A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the negated character class '\\D+'. You can also further disambiguate - suffixes, for example, if your wide variables are of the form - A-one, B-two,.., and you have an unrelated column A-rating, you can - ignore the last one by specifying `suffix='(!?one|two)'`. - - .. versionchanged:: 0.23.0 - When all suffixes are numeric, they are cast to int64/float64. + suffixes, for example, if your wide variables are of the form A-one, + B-two,.., and you have an unrelated column A-rating, you can ignore the + last one by specifying `suffix='(!?one|two)'`. When all suffixes are + numeric, they are cast to int64/float64. 
Returns ------- diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e81dd8f0c735c..6ddf53b6493e3 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -764,8 +764,6 @@ def get_dummies( dtype : dtype, default np.uint8 Data type for new columns. Only a single dtype is allowed. - .. versionadded:: 0.23.0 - Returns ------- DataFrame diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index f7723bee532ff..077ad057f6e1d 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -84,8 +84,6 @@ def cut( Whether the first interval should be left-inclusive or not. duplicates : {default 'raise', 'drop'}, optional If bin edges are not unique, raise ValueError or drop non-uniques. - - .. versionadded:: 0.23.0 ordered : bool, default True Whether the labels are ordered or not. Applies to returned types Categorical and Series (with Categorical dtype). If True, diff --git a/pandas/core/series.py b/pandas/core/series.py index ef9ade5c7bb15..1d2c1ef59d299 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -118,7 +118,6 @@ optional_mapper="", optional_labels="", optional_axis="", - versionadded_to_excel="\n .. versionadded:: 0.20.0\n", ) @@ -157,12 +156,8 @@ class Series(base.IndexOpsMixin, generic.NDFrame): Parameters ---------- data : array-like, Iterable, dict, or scalar value - Contains data stored in Series. - - .. versionchanged:: 0.23.0 - If data is a dict, argument order is maintained for Python 3.6 - and later. - + Contains data stored in Series. If data is a dict, argument order is + maintained. index : array-like or Index (1d) Values must be hashable and have the same length as `data`. Non-unique index values are allowed. Will default to @@ -4047,7 +4042,6 @@ def _gotitem(self, key, ndim, subset=None) -> "Series": axis=_shared_doc_kwargs["axis"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="\n.. 
versionadded:: 0.20.0\n", ) def aggregate(self, func=None, axis=0, *args, **kwargs): # Validate the axis parameter diff --git a/pandas/core/shared_docs.py b/pandas/core/shared_docs.py index 244ee3aa298db..14363dabfcdf3 100644 --- a/pandas/core/shared_docs.py +++ b/pandas/core/shared_docs.py @@ -6,7 +6,7 @@ "aggregate" ] = """\ Aggregate using one or more operations over the specified axis. -{versionadded} + Parameters ---------- func : function, str, list or dict @@ -119,8 +119,6 @@ This only applies if any of the groupers are Categoricals. If True: only show observed values for categorical groupers. If False: show all values for categorical groupers. - - .. versionadded:: 0.23.0 dropna : bool, default True If True, and if group keys contain NA values, NA values together with row/column will be dropped. @@ -154,7 +152,7 @@ columns, considered measured variables (`value_vars`), are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. -%(versionadded)s + Parameters ---------- id_vars : tuple, list, or ndarray, optional diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 4decd86764ccc..ab6c9cfb51414 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -602,8 +602,6 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): - Cannot be set to False if `pat` is a compiled regex or `repl` is a callable. - .. versionadded:: 0.23.0 - Returns ------- Series or Index of object @@ -2374,7 +2372,6 @@ def cat(self, others=None, sep=None, na_rep=None, join="left"): to match the length of the calling Series/Index). To disable alignment, use `.values` on any Series/Index/DataFrame in `others`. - .. versionadded:: 0.23.0 .. versionchanged:: 1.0.0 Changed default of `join` from None to `'left'`. 
diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 09a53d5a10ae6..ddb44898dbfad 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -680,8 +680,6 @@ def to_datetime( used when there are at least 50 values. The presence of out-of-bounds values will render the cache unusable and may slow down parsing. - .. versionadded:: 0.23.0 - .. versionchanged:: 0.25.0 - changed default value from False to True. diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 4282cb41c4e91..34d9d9d8c00ef 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -278,7 +278,6 @@ def _constructor(self): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="Series/Dataframe", axis="", ) diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 46e002324ec75..319944fd48eae 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -117,7 +117,6 @@ def _get_window(self, other=None, **kwargs): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="Series/Dataframe", axis="", ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index d094cc7d70a21..00fdf0813b027 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1162,7 +1162,6 @@ def _get_window( _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="Series/DataFrame", axis="", ) @@ -1650,8 +1649,6 @@ def kurt(self, **kwargs): quantile : float Quantile to compute. 0 <= quantile <= 1. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} - .. 
versionadded:: 0.23.0 - This optional parameter specifies the interpolation method to use, when the desired quantile lies between two data points `i` and `j`: @@ -2043,7 +2040,6 @@ def _validate_freq(self): _shared_docs["aggregate"], see_also=_agg_see_also_doc, examples=_agg_examples_doc, - versionadded="", klass="Series/Dataframe", axis="", ) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e9634ff0e9a05..65e95fd321772 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -124,9 +124,6 @@ Rows to skip at the beginning (0-indexed). nrows : int, default None Number of rows to parse. - - .. versionadded:: 0.23.0 - na_values : scalar, str, list-like, or dict, default None Additional strings to recognize as NA/NaN. If dict passed, specific per-column NA values. By default the following values are interpreted diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 5c3a309b0e310..b5f0bc0a832c2 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -1027,8 +1027,6 @@ def hide_index(self) -> "Styler": """ Hide any indices from rendering. - .. versionadded:: 0.23.0 - Returns ------- self : Styler @@ -1040,8 +1038,6 @@ def hide_columns(self, subset) -> "Styler": """ Hide columns from rendering. - .. versionadded:: 0.23.0 - Parameters ---------- subset : IndexSlice diff --git a/pandas/io/html.py b/pandas/io/html.py index 8354cf413814e..40fde224a7ae9 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -161,8 +161,6 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - .. versionadded:: 0.23.0 - Attributes ---------- io : str or file-like @@ -181,8 +179,6 @@ class _HtmlFrameParser: displayed_only : bool Whether or not items with "display:none" should be ignored - .. 
versionadded:: 0.23.0 - Notes ----- To subclass this class effectively you must override the following methods: diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index e3000788cb33a..c3977f89ac42f 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -428,9 +428,6 @@ def read_json( - The DataFrame columns must be unique for orients ``'index'``, ``'columns'``, and ``'records'``. - .. versionadded:: 0.23.0 - 'table' as an allowed value for the ``orient`` argument - typ : {'frame', 'series'}, default 'frame' The type of object to recover. diff --git a/pandas/io/stata.py b/pandas/io/stata.py index b3b16e04a5d9e..df5f6c3d53d30 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -164,8 +164,6 @@ path_or_buf : path (string), buffer or path object string, path object (pathlib.Path or py._path.local.LocalPath) or object implementing a binary read() functions. - - .. versionadded:: 0.23.0 support for pathlib, py.path. {_statafile_processing_params1} {_statafile_processing_params2} {_chunksize_params} @@ -2122,9 +2120,6 @@ class StataWriter(StataParser): object implementing a binary write() functions. If using a buffer then the buffer will not be automatically closed after the file is written. - - .. versionadded:: 0.23.0 support for pathlib, py.path. - data : DataFrame Input to save convert_dates : dict @@ -3000,8 +2995,6 @@ class StataWriter117(StataWriter): """ A class for writing Stata binary dta files in Stata 13 format (117) - .. versionadded:: 0.23.0 - Parameters ---------- fname : path (string), buffer or path object diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 45a3818492b44..d02f12a8e1029 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -542,12 +542,8 @@ def boxplot_frame_groupby( The layout of the plot: (rows, columns). sharex : bool, default False Whether x-axes will be shared among subplots. - - .. 
versionadded:: 0.23.1 sharey : bool, default True Whether y-axes will be shared among subplots. - - .. versionadded:: 0.23.1 backend : str, default None Backend to use instead of the backend specified in the option ``plotting.backend``. For instance, 'matplotlib'. Alternatively, to From 11f0a907f1aa7bce534479e9edefde975898ecf5 Mon Sep 17 00:00:00 2001 From: Thomas Dickson Date: Sun, 13 Sep 2020 21:22:36 +0100 Subject: [PATCH 0769/1025] ERR: Cartesian product error (#36335) --- pandas/core/reshape/util.py | 3 +++ pandas/tests/reshape/test_util.py | 10 ++++++++++ 2 files changed, 13 insertions(+) diff --git a/pandas/core/reshape/util.py b/pandas/core/reshape/util.py index a1bf3f8ee4119..d2c08712abacd 100644 --- a/pandas/core/reshape/util.py +++ b/pandas/core/reshape/util.py @@ -39,6 +39,9 @@ def cartesian_product(X): lenX = np.fromiter((len(x) for x in X), dtype=np.intp) cumprodX = np.cumproduct(lenX) + if np.any(cumprodX < 0): + raise ValueError("Product space too large to allocate arrays!") + a = np.roll(cumprodX, 1) a[0] = 1 diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py index 9d074b5ade425..0acadc54cec0c 100644 --- a/pandas/tests/reshape/test_util.py +++ b/pandas/tests/reshape/test_util.py @@ -65,3 +65,13 @@ def test_invalid_input(self, X): with pytest.raises(TypeError, match=msg): cartesian_product(X=X) + + def test_exceed_product_space(self): + # GH31355: raise useful error when produce space is too large + msg = "Product space too large to allocate arrays!" 
+ + with pytest.raises(ValueError, match=msg): + dims = [np.arange(0, 22, dtype=np.int16) for i in range(12)] + [ + (np.arange(15128, dtype=np.int16)), + ] + cartesian_product(X=dims) From 0cf5534598bf8afaec6eb206a2c208f985f6e8b1 Mon Sep 17 00:00:00 2001 From: Rohith295 <57575037+Rohith295@users.noreply.github.com> Date: Sun, 13 Sep 2020 22:28:44 +0200 Subject: [PATCH 0770/1025] Pd.series.map performance (#34948) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/series.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 89d94dc0cabd6..dbc88d0b371e8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -207,6 +207,7 @@ Performance improvements - Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) +- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/series.py b/pandas/core/series.py index 1d2c1ef59d299..69376d8bf80d1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -357,15 +357,19 @@ def _init_dict(self, data, index=None, dtype=None): # Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')] # raises KeyError), so we iterate the entire dict, and align if data: - keys, values = zip(*data.items()) - values = list(values) + # GH:34717, issue was using zip to extract key and values from data. + # using generators in effects the performance. 
+ # Below is the new way of extracting the keys and values + + keys = tuple(data.keys()) + values = list(data.values()) # Generating list of values- faster way elif index is not None: # fastpath for Series(data=None). Just use broadcasting a scalar # instead of reindexing. values = na_value_for_dtype(dtype) keys = index else: - keys, values = [], [] + keys, values = tuple([]), [] # Input is now list-like, so rely on "standard" construction: From b0669debb28a6d5e826a25e6a4b39f4d7cd59b6e Mon Sep 17 00:00:00 2001 From: Dan Moore <9156191+drmrd@users.noreply.github.com> Date: Sun, 13 Sep 2020 16:29:41 -0400 Subject: [PATCH 0771/1025] BUG: Ensure read_spss accepts pathlib Paths (GH33666) (#36174) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/io/spss.py | 4 +++- pandas/tests/io/test_spss.py | 7 +++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index d789518f93f6d..8e283aec39786 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -26,6 +26,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Bug in :func:`read_spss` where passing a ``pathlib.Path`` as ``path`` would raise a ``TypeError`` (:issue:`33666`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) diff --git a/pandas/io/spss.py b/pandas/io/spss.py index 9605faeb36590..79cdfbf15392a 100644 --- a/pandas/io/spss.py +++ b/pandas/io/spss.py @@ -7,6 +7,8 @@ from pandas.core.api import DataFrame +from pandas.io.common import stringify_path + def read_spss( path: Union[str, Path], @@ -40,6 +42,6 @@ def read_spss( usecols = list(usecols) # pyreadstat requires a list df, _ = pyreadstat.read_sav( - path, usecols=usecols, apply_value_formats=convert_categoricals + 
stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals ) return df diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py index 013f56f83c5ec..a4894ff66ab9f 100644 --- a/pandas/tests/io/test_spss.py +++ b/pandas/tests/io/test_spss.py @@ -1,3 +1,5 @@ +from pathlib import Path + import numpy as np import pytest @@ -7,9 +9,10 @@ pyreadstat = pytest.importorskip("pyreadstat") -def test_spss_labelled_num(datapath): +@pytest.mark.parametrize("path_klass", [lambda p: p, Path]) +def test_spss_labelled_num(path_klass, datapath): # test file from the Haven project (https://haven.tidyverse.org/) - fname = datapath("io", "data", "spss", "labelled-num.sav") + fname = path_klass(datapath("io", "data", "spss", "labelled-num.sav")) df = pd.read_spss(fname, convert_categoricals=True) expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0]) From a886c8be3155b7a406a9e5b1e4ef07f15a490185 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 13 Sep 2020 13:37:18 -0700 Subject: [PATCH 0772/1025] BUG: iloc.__setitem__ with DataFrame value, multiple blocks, non-unique columns (#36337) --- pandas/core/indexing.py | 44 +++++++++++++++++++++++------- pandas/tests/indexing/test_iloc.py | 14 ++++++++++ 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 64da27a6574a6..9ecad335e2c3c 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1690,18 +1690,42 @@ def _setitem_with_indexer(self, indexer, value): sub_indexer = list(indexer) multiindex_indexer = isinstance(labels, ABCMultiIndex) # TODO: we are implicitly assuming value.columns is unique + unique_cols = value.columns.is_unique + + if not unique_cols and value.columns.equals(self.obj.columns): + # We assume we are already aligned, see + # test_iloc_setitem_frame_duplicate_columns_multiple_blocks + for loc in ilocs: + item = item_labels[loc] + if item in value: + sub_indexer[info_axis] = item + v = 
self._align_series( + tuple(sub_indexer), + value.iloc[:, loc], + multiindex_indexer, + ) + else: + v = np.nan - for loc in ilocs: - item = item_labels[loc] - if item in value: - sub_indexer[info_axis] = item - v = self._align_series( - tuple(sub_indexer), value[item], multiindex_indexer - ) - else: - v = np.nan + self._setitem_single_column(loc, v, pi) - self._setitem_single_column(loc, v, pi) + elif not unique_cols: + raise ValueError( + "Setting with non-unique columns is not allowed." + ) + + else: + for loc in ilocs: + item = item_labels[loc] + if item in value: + sub_indexer[info_axis] = item + v = self._align_series( + tuple(sub_indexer), value[item], multiindex_indexer + ) + else: + v = np.nan + + self._setitem_single_column(loc, v, pi) # we have an equal len ndarray/convertible to our labels # hasattr first, to avoid coercing to ndarray without reason. diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index bfb62835add93..d3d455f83c41a 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -369,6 +369,20 @@ def test_iloc_setitem_dups(self): df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) tm.assert_frame_equal(df, expected) + def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self): + # Same as the "assign back to self" check in test_iloc_setitem_dups + # but on a DataFrame with multiple blocks + df = pd.DataFrame([[0, 1], [2, 3]], columns=["B", "B"]) + + df.iloc[:, 0] = df.iloc[:, 0].astype("f8") + assert len(df._mgr.blocks) == 2 + expected = df.copy() + + # assign back to self + df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]] + + tm.assert_frame_equal(df, expected) + # TODO: GH#27620 this test used to compare iloc against ix; check if this # is redundant with another test comparing iloc against loc def test_iloc_getitem_frame(self): From fc4d3b3692822f2a7c2cc195828db81fabed4abe Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 13 Sep 2020 
21:46:08 +0100 Subject: [PATCH 0773/1025] BUG: xticks unnecessarily rotated (#34334) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/plotting/_matplotlib/core.py | 10 +++++--- pandas/tests/plotting/test_datetimelike.py | 28 +++++++++++++++++++++- pandas/tests/plotting/test_frame.py | 13 ++++++---- pandas/tests/plotting/test_series.py | 5 ++++ 5 files changed, 49 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index dbc88d0b371e8..bb79b91096867 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -305,6 +305,7 @@ I/O Plotting ^^^^^^^^ +- Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`) - Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`) Groupby/resample/rolling diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 8275c0991e464..602b42022f561 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1231,11 +1231,15 @@ def get_label(i): ax.xaxis.set_major_locator(FixedLocator(xticks)) ax.set_xticklabels(xticklabels) + # If the index is an irregular time series, then by default + # we rotate the tick labels. The exception is if there are + # subplots which don't share their x-axes, in which we case + # we don't rotate the ticklabels as by default the subplots + # would be too close together. 
condition = ( not self._use_dynamic_x() - and data.index.is_all_dates - and not self.subplots - or (self.subplots and self.sharex) + and (data.index.is_all_dates and self.use_index) + and (not self.subplots or (self.subplots and self.sharex)) ) index_name = self._get_index_name() diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py index ecf378d4fc04a..78aa1887f5611 100644 --- a/pandas/tests/plotting/test_datetimelike.py +++ b/pandas/tests/plotting/test_datetimelike.py @@ -9,7 +9,7 @@ from pandas._libs.tslibs import BaseOffset, to_offset import pandas.util._test_decorators as td -from pandas import DataFrame, Index, NaT, Series, isna +from pandas import DataFrame, Index, NaT, Series, isna, to_datetime import pandas._testing as tm from pandas.core.indexes.datetimes import DatetimeIndex, bdate_range, date_range from pandas.core.indexes.period import Period, PeriodIndex, period_range @@ -1494,6 +1494,32 @@ def test_matplotlib_scatter_datetime64(self): expected = "2017-12-12" assert label.get_text() == expected + def test_check_xticks_rot(self): + # https://github.com/pandas-dev/pandas/issues/29460 + # regular time series + x = to_datetime(["2020-05-01", "2020-05-02", "2020-05-03"]) + df = DataFrame({"x": x, "y": [1, 2, 3]}) + axes = df.plot(x="x", y="y") + self._check_ticks_props(axes, xrot=0) + + # irregular time series + x = to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"]) + df = DataFrame({"x": x, "y": [1, 2, 3]}) + axes = df.plot(x="x", y="y") + self._check_ticks_props(axes, xrot=30) + + # use timeseries index or not + axes = df.set_index("x").plot(y="y", use_index=True) + self._check_ticks_props(axes, xrot=30) + axes = df.set_index("x").plot(y="y", use_index=False) + self._check_ticks_props(axes, xrot=0) + + # separate subplots + axes = df.plot(x="x", y="y", subplots=True, sharex=True) + self._check_ticks_props(axes, xrot=30) + axes = df.plot(x="x", y="y", subplots=True, sharex=False) + 
self._check_ticks_props(axes, xrot=0) + def _check_plot_works(f, freq=None, series=None, *args, **kwargs): import matplotlib.pyplot as plt diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index d2b22c7a4c2e3..ca4c2bdcc2fe1 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -48,7 +48,6 @@ def _assert_xtickslabels_visibility(self, axes, expected): for ax, exp in zip(axes, expected): self._check_visible(ax.get_xticklabels(), visible=exp) - @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True) @pytest.mark.slow def test_plot(self): from pandas.plotting._matplotlib.compat import mpl_ge_3_1_0 @@ -66,6 +65,7 @@ def test_plot(self): with tm.assert_produces_warning(UserWarning): axes = _check_plot_works(df.plot, subplots=True, use_index=False) + self._check_ticks_props(axes, xrot=0) self._check_axes_shape(axes, axes_num=4, layout=(4, 1)) df = DataFrame({"x": [1, 2], "y": [3, 4]}) @@ -78,7 +78,8 @@ def test_plot(self): df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10])) - _check_plot_works(df.plot, use_index=True) + ax = _check_plot_works(df.plot, use_index=True) + self._check_ticks_props(ax, xrot=0) _check_plot_works(df.plot, sort_columns=False) _check_plot_works(df.plot, yticks=[1, 5, 10]) _check_plot_works(df.plot, xticks=[1, 5, 10]) @@ -110,7 +111,8 @@ def test_plot(self): tuples = zip(string.ascii_letters[:10], range(10)) df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples)) - _check_plot_works(df.plot, use_index=True) + ax = _check_plot_works(df.plot, use_index=True) + self._check_ticks_props(ax, xrot=0) # unicode index = MultiIndex.from_tuples( @@ -304,12 +306,14 @@ def test_xcompat(self): ax = df.plot(x_compat=True) lines = ax.get_lines() assert not isinstance(lines[0].get_xdata(), PeriodIndex) + self._check_ticks_props(ax, xrot=30) tm.close() pd.plotting.plot_params["xaxis.compat"] = True ax = df.plot() lines = ax.get_lines() 
assert not isinstance(lines[0].get_xdata(), PeriodIndex) + self._check_ticks_props(ax, xrot=30) tm.close() pd.plotting.plot_params["x_compat"] = False @@ -325,12 +329,14 @@ def test_xcompat(self): ax = df.plot() lines = ax.get_lines() assert not isinstance(lines[0].get_xdata(), PeriodIndex) + self._check_ticks_props(ax, xrot=30) tm.close() ax = df.plot() lines = ax.get_lines() assert not isinstance(lines[0].get_xdata(), PeriodIndex) assert isinstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex) + self._check_ticks_props(ax, xrot=0) def test_period_compat(self): # GH 9012 @@ -486,7 +492,6 @@ def test_groupby_boxplot_sharex(self): expected = [False, False, True, True] self._assert_xtickslabels_visibility(axes, expected) - @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True) @pytest.mark.slow def test_subplots_timeseries(self): idx = date_range(start="2014-07-01", freq="M", periods=10) diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py index 85c06b2e7b748..d56c882471a9a 100644 --- a/pandas/tests/plotting/test_series.py +++ b/pandas/tests/plotting/test_series.py @@ -109,6 +109,7 @@ def test_ts_area_lim(self): line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin <= line[0] assert xmax >= line[-1] + self._check_ticks_props(ax, xrot=0) tm.close() # GH 7471 @@ -118,6 +119,7 @@ def test_ts_area_lim(self): line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin <= line[0] assert xmax >= line[-1] + self._check_ticks_props(ax, xrot=30) tm.close() tz_ts = self.ts.copy() @@ -128,6 +130,7 @@ def test_ts_area_lim(self): line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin <= line[0] assert xmax >= line[-1] + self._check_ticks_props(ax, xrot=0) tm.close() _, ax = self.plt.subplots() @@ -136,6 +139,7 @@ def test_ts_area_lim(self): line = ax.get_lines()[0].get_data(orig=False)[0] assert xmin <= line[0] assert xmax >= line[-1] + self._check_ticks_props(ax, xrot=0) def test_label(self): s = Series([1, 2]) @@ -284,6 
+288,7 @@ def test_irregular_datetime(self): xp = DatetimeConverter.convert(datetime(1999, 1, 1), "", ax) ax.set_xlim("1/1/1999", "1/1/2001") assert xp == ax.get_xlim()[0] + self._check_ticks_props(ax, xrot=30) def test_unsorted_index_xlim(self): ser = Series( From 06f16e8406bbc94bc7de1e786509bbe55c0b04b1 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Mon, 14 Sep 2020 00:18:21 +0200 Subject: [PATCH 0774/1025] [TST]: Groupy raised ValueError for ffill with duplicate column names (#36326) --- pandas/tests/groupby/test_function.py | 45 -------------- pandas/tests/groupby/test_groupby.py | 20 ------- pandas/tests/groupby/test_missing.py | 84 +++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 65 deletions(-) create mode 100644 pandas/tests/groupby/test_missing.py diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 42945be923fa0..ab736b55b5743 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -495,51 +495,6 @@ def test_idxmin_idxmax_returns_int_types(func, values): tm.assert_frame_equal(result, expected) -def test_fill_consistency(): - - # GH9221 - # pass thru keyword arguments to the generated wrapper - # are set if the passed kw is None (only) - df = DataFrame( - index=pd.MultiIndex.from_product( - [["value1", "value2"], date_range("2014-01-01", "2014-01-06")] - ), - columns=Index(["1", "2"], name="id"), - ) - df["1"] = [ - np.nan, - 1, - np.nan, - np.nan, - 11, - np.nan, - np.nan, - 2, - np.nan, - np.nan, - 22, - np.nan, - ] - df["2"] = [ - np.nan, - 3, - np.nan, - np.nan, - 33, - np.nan, - np.nan, - 4, - np.nan, - np.nan, - 44, - np.nan, - ] - - expected = df.groupby(level=0, axis=0).fillna(method="ffill") - result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T - tm.assert_frame_equal(result, expected) - - def test_groupby_cumprod(): # GH 4095 df = pd.DataFrame({"key": ["b"] * 10, "value": 2}) diff --git 
a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 69397228dd941..313b0ea2434f9 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1961,13 +1961,6 @@ def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected): tm.assert_frame_equal(result, expected) -def test_ffill_missing_arguments(): - # GH 14955 - df = pd.DataFrame({"a": [1, 2], "b": [1, 1]}) - with pytest.raises(ValueError, match="Must specify a fill"): - df.groupby("b").fillna() - - def test_groupby_only_none_group(): # see GH21624 # this was crashing with "ValueError: Length of passed values is 1, index implies 0" @@ -2133,16 +2126,3 @@ def test_groupby_column_index_name_lost(func): df_grouped = df.groupby([1]) result = getattr(df_grouped, func)().columns tm.assert_index_equal(result, expected) - - -@pytest.mark.parametrize("func", ["ffill", "bfill"]) -def test_groupby_column_index_name_lost_fill_funcs(func): - # GH: 29764 groupby loses index sometimes - df = pd.DataFrame( - [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]], - columns=pd.Index(["type", "a", "b"], name="idx"), - ) - df_grouped = df.groupby(["type"])[["a", "b"]] - result = getattr(df_grouped, func)().columns - expected = pd.Index(["a", "b"], name="idx") - tm.assert_index_equal(result, expected) diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py new file mode 100644 index 0000000000000..116aed9935694 --- /dev/null +++ b/pandas/tests/groupby/test_missing.py @@ -0,0 +1,84 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas import DataFrame, Index, date_range +import pandas._testing as tm + + +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +def test_groupby_column_index_name_lost_fill_funcs(func): + # GH: 29764 groupby loses index sometimes + df = pd.DataFrame( + [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]], + columns=pd.Index(["type", "a", "b"], name="idx"), + ) + df_grouped = 
df.groupby(["type"])[["a", "b"]] + result = getattr(df_grouped, func)().columns + expected = pd.Index(["a", "b"], name="idx") + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize("func", ["ffill", "bfill"]) +def test_groupby_fill_duplicate_column_names(func): + # GH: 25610 ValueError with duplicate column names + df1 = pd.DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]}) + df2 = pd.DataFrame({"field1": [1, np.nan, 4]}) + df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"]) + expected = pd.DataFrame( + [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"] + ) + result = getattr(df_grouped, func)() + tm.assert_frame_equal(result, expected) + + +def test_ffill_missing_arguments(): + # GH 14955 + df = pd.DataFrame({"a": [1, 2], "b": [1, 1]}) + with pytest.raises(ValueError, match="Must specify a fill"): + df.groupby("b").fillna() + + +def test_fill_consistency(): + + # GH9221 + # pass thru keyword arguments to the generated wrapper + # are set if the passed kw is None (only) + df = DataFrame( + index=pd.MultiIndex.from_product( + [["value1", "value2"], date_range("2014-01-01", "2014-01-06")] + ), + columns=Index(["1", "2"], name="id"), + ) + df["1"] = [ + np.nan, + 1, + np.nan, + np.nan, + 11, + np.nan, + np.nan, + 2, + np.nan, + np.nan, + 22, + np.nan, + ] + df["2"] = [ + np.nan, + 3, + np.nan, + np.nan, + 33, + np.nan, + np.nan, + 4, + np.nan, + np.nan, + 44, + np.nan, + ] + + expected = df.groupby(level=0, axis=0).fillna(method="ffill") + result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T + tm.assert_frame_equal(result, expected) From e9ca8da3b1da05a2849dfd1e4650e82baeae8687 Mon Sep 17 00:00:00 2001 From: smartvinnetou <61093810+smartvinnetou@users.noreply.github.com> Date: Sun, 13 Sep 2020 23:20:55 +0100 Subject: [PATCH 0775/1025] CLN: replaced Appender with doc (#33633) --- pandas/core/generic.py | 159 ++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 82 deletions(-) diff --git 
a/pandas/core/generic.py b/pandas/core/generic.py index d78fa42cd1056..5336d0828881b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -54,12 +54,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, InvalidIndexError -from pandas.util._decorators import ( - Appender, - Substitution, - doc, - rewrite_axis_style_signature, -) +from pandas.util._decorators import Appender, doc, rewrite_axis_style_signature from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, @@ -2973,7 +2968,7 @@ class (index) object 'bird' 'bird' 'mammal' 'mammal' else: return xarray.Dataset.from_dataframe(self) - @Substitution(returns=fmt.return_docstring) + @doc(returns=fmt.return_docstring) def to_latex( self, buf=None, @@ -3002,9 +2997,9 @@ def to_latex( r""" Render object to a LaTeX tabular, longtable, or nested table/tabular. - Requires ``\usepackage{booktabs}``. The output can be copy/pasted + Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted into a main LaTeX document or read from an external file - with ``\input{table.tex}``. + with ``\input{{table.tex}}``. .. versionchanged:: 1.0.0 Added caption and label arguments. @@ -3024,13 +3019,13 @@ def to_latex( Write row names (index). na_rep : str, default 'NaN' Missing data representation. - formatters : list of functions or dict of {str: function}, optional + formatters : list of functions or dict of {{str: function}}, optional Formatter functions to apply to columns' elements by position or name. The result of each function must be a unicode string. List must be of length equal to the number of columns. float_format : one-parameter function or str, optional, default None Formatter for floating point numbers. 
For example - ``float_format="%%.2f"`` and ``float_format="{:0.2f}".format`` will + ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will both result in 0.1234 being formatted as 0.12. sparsify : bool, optional Set to False for a DataFrame with a hierarchical index to print @@ -3048,7 +3043,7 @@ def to_latex( longtable : bool, optional By default, the value will be read from the pandas config module. Use a longtable environment instead of tabular. Requires - adding a \usepackage{longtable} to your LaTeX preamble. + adding a \usepackage{{longtable}} to your LaTeX preamble. escape : bool, optional By default, the value will be read from the pandas config module. When set to False prevents from escaping latex special @@ -3066,24 +3061,24 @@ def to_latex( The default will be read from the config module. multirow : bool, default False Use \multirow to enhance MultiIndex rows. Requires adding a - \usepackage{multirow} to your LaTeX preamble. Will print + \usepackage{{multirow}} to your LaTeX preamble. Will print centered labels (instead of top-aligned) across the contained rows, separating groups via clines. The default will be read from the pandas config module. caption : str, optional - The LaTeX caption to be placed inside ``\caption{}`` in the output. + The LaTeX caption to be placed inside ``\caption{{}}`` in the output. .. versionadded:: 1.0.0 label : str, optional - The LaTeX label to be placed inside ``\label{}`` in the output. - This is used with ``\ref{}`` in the main ``.tex`` file. + The LaTeX label to be placed inside ``\label{{}}`` in the output. + This is used with ``\ref{{}}`` in the main ``.tex`` file. .. versionadded:: 1.0.0 position : str, optional The LaTeX positional argument for tables, to be placed after - ``\begin{}`` in the output. - %(returns)s + ``\begin{{}}`` in the output. 
+ {returns} See Also -------- DataFrame.to_string : Render a DataFrame to a console-friendly @@ -3092,18 +3087,18 @@ def to_latex( Examples -------- - >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], - ... 'mask': ['red', 'purple'], - ... 'weapon': ['sai', 'bo staff']}) + >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'], + ... mask=['red', 'purple'], + ... weapon=['sai', 'bo staff'])) >>> print(df.to_latex(index=False)) # doctest: +NORMALIZE_WHITESPACE - \begin{tabular}{lll} + \begin{{tabular}}{{lll}} \toprule name & mask & weapon \\ \midrule Raphael & red & sai \\ Donatello & purple & bo staff \\ \bottomrule - \end{tabular} + \end{{tabular}} """ # Get defaults from the pandas config if self.ndim == 1: @@ -6791,6 +6786,7 @@ def interpolate( `scipy.interpolate.BPoly.from_derivatives` which replaces 'piecewise_polynomial' interpolation method in scipy 0.18. + axis : {{0 or 'index', 1 or 'columns', None}}, default None Axis to interpolate along. limit : int, optional @@ -6827,7 +6823,7 @@ def interpolate( downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. - **kwargs + ``**kwargs`` : optional Keyword arguments to pass on to the interpolating function. Returns @@ -7243,11 +7239,11 @@ def isna(self: FrameOrSeries) -> FrameOrSeries: -------- Show which entries in a DataFrame are NA. - >>> df = pd.DataFrame({{'age': [5, 6, np.NaN], - ... 'born': [pd.NaT, pd.Timestamp('1939-05-27'), + >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], + ... born=[pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], - ... 'name': ['Alfred', 'Batman', ''], - ... 'toy': [None, 'Batmobile', 'Joker']}}) + ... name=['Alfred', 'Batman', ''], + ... toy=[None, 'Batmobile', 'Joker'])) >>> df age born name toy 0 5.0 NaT Alfred None @@ -7310,11 +7306,11 @@ def notna(self: FrameOrSeries) -> FrameOrSeries: -------- Show which entries in a DataFrame are not NA. - >>> df = pd.DataFrame({{'age': [5, 6, np.NaN], - ... 
'born': [pd.NaT, pd.Timestamp('1939-05-27'), + >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN], + ... born=[pd.NaT, pd.Timestamp('1939-05-27'), ... pd.Timestamp('1940-04-25')], - ... 'name': ['Alfred', 'Batman', ''], - ... 'toy': [None, 'Batmobile', 'Joker']}}) + ... name=['Alfred', 'Batman', ''], + ... toy=[None, 'Batmobile', 'Joker'])) >>> df age born name toy 0 5.0 NaT Alfred None @@ -10252,10 +10248,10 @@ def pct_change( Percentage change in French franc, Deutsche Mark, and Italian lira from 1980-01-01 to 1980-03-01. - >>> df = pd.DataFrame({ - ... 'FR': [4.0405, 4.0963, 4.3149], - ... 'GR': [1.7246, 1.7482, 1.8519], - ... 'IT': [804.74, 810.01, 860.13]}, + >>> df = pd.DataFrame(dict( + ... FR=[4.0405, 4.0963, 4.3149], + ... GR=[1.7246, 1.7482, 1.8519], + ... IT=[804.74, 810.01, 860.13]), ... index=['1980-01-01', '1980-02-01', '1980-03-01']) >>> df FR GR IT @@ -10272,10 +10268,10 @@ def pct_change( Percentage of change in GOOG and APPL stock volume. Shows computing the percentage change between columns. - >>> df = pd.DataFrame({ - ... '2016': [1769950, 30586265], - ... '2015': [1500923, 40912316], - ... '2014': [1371819, 41403351]}, + >>> df = pd.DataFrame(dict([ + ... ('2016', [1769950, 30586265]), + ... ('2015', [1500923, 40912316]), + ... ('2014', [1371819, 41403351])]), ... index=['GOOG', 'APPL']) >>> df 2016 2015 2014 @@ -10691,43 +10687,43 @@ def _doc_parms(cls): _num_doc = """ -%(desc)s +{desc} Parameters ---------- -axis : %(axis_descr)s +axis : {axis_descr} Axis for the function to be applied on. skipna : bool, default True Exclude NA/null values when computing the result. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a %(name1)s. + particular level, collapsing into a {name1}. numeric_only : bool, default None Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series. 
-%(min_count)s\ +{min_count}\ **kwargs Additional keyword arguments to be passed to the function. Returns ------- -%(name1)s or %(name2)s (if level specified)\ -%(see_also)s\ -%(examples)s +{name1} or {name2} (if level specified)\ +{see_also}\ +{examples} """ _num_ddof_doc = """ -%(desc)s +{desc} Parameters ---------- -axis : %(axis_descr)s +axis : {axis_descr} skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a %(name1)s. + particular level, collapsing into a {name1}. ddof : int, default 1 Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. @@ -10737,7 +10733,7 @@ def _doc_parms(cls): Returns ------- -%(name1)s or %(name2)s (if level specified) +{name1} or {name2} (if level specified) Notes ----- @@ -10745,11 +10741,11 @@ def _doc_parms(cls): default `ddof=1`)\n""" _bool_doc = """ -%(desc)s +{desc} Parameters ---------- -axis : {0 or 'index', 1 or 'columns', None}, default 0 +axis : {{0 or 'index', 1 or 'columns', None}}, default 0 Indicate which axis or axes should be reduced. * 0 / 'index' : reduce the index, return a Series whose index is the @@ -10763,24 +10759,24 @@ def _doc_parms(cls): then use only boolean data. Not implemented for Series. skipna : bool, default True Exclude NA/null values. If the entire row/column is NA and skipna is - True, then the result will be %(empty_value)s, as for an empty row/column. + True, then the result will be {empty_value}, as for an empty row/column. If skipna is False, then NA are treated as True, because these are not equal to zero. level : int or level name, default None If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a %(name1)s. + particular level, collapsing into a {name1}. 
**kwargs : any, default None Additional keywords have no effect but might be accepted for compatibility with NumPy. Returns ------- -%(name1)s or %(name2)s - If level is specified, then, %(name2)s is returned; otherwise, %(name1)s +{name1} or {name2} + If level is specified, then, {name2} is returned; otherwise, {name1} is returned. -%(see_also)s -%(examples)s""" +{see_also} +{examples}""" _all_desc = """\ Return whether all elements are True, potentially over an axis. @@ -10843,14 +10839,14 @@ def _doc_parms(cls): """ _cnum_doc = """ -Return cumulative %(desc)s over a DataFrame or Series axis. +Return cumulative {desc} over a DataFrame or Series axis. Returns a DataFrame or Series of the same size containing the cumulative -%(desc)s. +{desc}. Parameters ---------- -axis : {0 or 'index', 1 or 'columns'}, default 0 +axis : {{0 or 'index', 1 or 'columns'}}, default 0 The index or the name of the axis. 0 is equivalent to None or 'index'. skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result @@ -10861,21 +10857,21 @@ def _doc_parms(cls): Returns ------- -%(name1)s or %(name2)s - Return cumulative %(desc)s of %(name1)s or %(name2)s. +{name1} or {name2} + Return cumulative {desc} of {name1} or {name2}. See Also -------- -core.window.Expanding.%(accum_func_name)s : Similar functionality +core.window.Expanding.{accum_func_name} : Similar functionality but ignores ``NaN`` values. -%(name2)s.%(accum_func_name)s : Return the %(desc)s over - %(name2)s axis. -%(name2)s.cummax : Return cumulative maximum over %(name2)s axis. -%(name2)s.cummin : Return cumulative minimum over %(name2)s axis. -%(name2)s.cumsum : Return cumulative sum over %(name2)s axis. -%(name2)s.cumprod : Return cumulative product over %(name2)s axis. +{name2}.{accum_func_name} : Return the {desc} over + {name2} axis. +{name2}.cummax : Return cumulative maximum over {name2} axis. +{name2}.cummin : Return cumulative minimum over {name2} axis. 
+{name2}.cumsum : Return cumulative sum over {name2} axis. +{name2}.cumprod : Return cumulative product over {name2} axis. -%(examples)s""" +{examples}""" _cummin_examples = """\ Examples @@ -11350,7 +11346,8 @@ def _make_min_count_stat_function( see_also: str = "", examples: str = "", ) -> Callable: - @Substitution( + @doc( + _num_doc, desc=desc, name1=name1, name2=name2, @@ -11359,7 +11356,6 @@ def _make_min_count_stat_function( see_also=see_also, examples=examples, ) - @Appender(_num_doc) def stat_func( self, axis=None, @@ -11406,7 +11402,8 @@ def _make_stat_function( see_also: str = "", examples: str = "", ) -> Callable: - @Substitution( + @doc( + _num_doc, desc=desc, name1=name1, name2=name2, @@ -11415,7 +11412,6 @@ def _make_stat_function( see_also=see_also, examples=examples, ) - @Appender(_num_doc) def stat_func( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -11439,8 +11435,7 @@ def stat_func( def _make_stat_function_ddof( cls, name: str, name1: str, name2: str, axis_descr: str, desc: str, func: Callable ) -> Callable: - @Substitution(desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) - @Appender(_num_ddof_doc) + @doc(_num_ddof_doc, desc=desc, name1=name1, name2=name2, axis_descr=axis_descr) def stat_func( self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs ): @@ -11471,7 +11466,8 @@ def _make_cum_function( accum_func_name: str, examples: str, ) -> Callable: - @Substitution( + @doc( + _cnum_doc, desc=desc, name1=name1, name2=name2, @@ -11479,7 +11475,6 @@ def _make_cum_function( accum_func_name=accum_func_name, examples=examples, ) - @Appender(_cnum_doc) def cum_func(self, axis=None, skipna=True, *args, **kwargs): skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name) if axis is None: @@ -11517,7 +11512,8 @@ def _make_logical_function( examples: str, empty_value: bool, ) -> Callable: - @Substitution( + @doc( + _bool_doc, desc=desc, name1=name1, name2=name2, @@ -11526,7 +11522,6 
@@ def _make_logical_function( examples=examples, empty_value=empty_value, ) - @Appender(_bool_doc) def logical_func(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): nv.validate_logical_func(tuple(), kwargs, fname=name) if level is not None: From d6ae8419a9216c74d97388c44c14ab01644bfa92 Mon Sep 17 00:00:00 2001 From: rxxg Date: Mon, 14 Sep 2020 00:28:12 +0200 Subject: [PATCH 0776/1025] Ensure resource closure in all exceptional circumstances during construction (#35566) (#35587) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/sas/sas7bdat.py | 8 ++++++-- pandas/io/sas/sas_xport.py | 6 +++++- pandas/io/sas/sasreader.py | 10 +++++----- pandas/tests/io/sas/data/corrupt.sas7bdat | Bin 0 -> 292 bytes pandas/tests/io/sas/test_sas7bdat.py | 8 ++++++++ 6 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 pandas/tests/io/sas/data/corrupt.sas7bdat diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bb79b91096867..bbee0062cf0ce 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -294,6 +294,7 @@ MultiIndex I/O ^^^ +- :func:`read_sas` no longer leaks resources on failure (:issue:`35566`) - Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) - In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) - :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 76dac39d1889f..f2ee642d8fd42 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -142,8 +142,12 @@ def __init__( self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf - self._get_properties() - self._parse_metadata() + try: + self._get_properties() + self._parse_metadata() + except Exception: + self.close() + raise 
def column_data_lengths(self): """Return a numpy int64 array of the column data lengths""" diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 1a4ba544f5d59..9727ec930119b 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -264,7 +264,11 @@ def __init__( # should already be opened in binary mode in Python 3. self.filepath_or_buffer = filepath_or_buffer - self._read_header() + try: + self._read_header() + except Exception: + self.close() + raise def close(self): self.filepath_or_buffer.close() diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index ae9457a8e3147..31d1a6ad471ea 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -136,8 +136,8 @@ def read_sas( if iterator or chunksize: return reader - data = reader.read() - - if ioargs.should_close: - reader.close() - return data + try: + return reader.read() + finally: + if ioargs.should_close: + reader.close() diff --git a/pandas/tests/io/sas/data/corrupt.sas7bdat b/pandas/tests/io/sas/data/corrupt.sas7bdat new file mode 100644 index 0000000000000000000000000000000000000000..2941ffe3ecdf5c72773d9727c689b90dced8ebdf GIT binary patch literal 292 zcmZQzK!8K98WT2)2%g_NiGzXjxM7ckynvvR5`(cZBa;yeTp2SXinuYOvN3}l1A_oF zBTPxKW3Ymor;n?%e^5|pK!^e^08@{Pc5w`G1nC9I$L;E@kE^@m2&2Jz6!tj4Xce&S gj10_R0SIBKXJBGr=xY{XW)dG96lQ3KBu5Gp0N4g1L;wH) literal 0 HcmV?d00001 diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index 8c14f9de9f61c..9de6ca75fd4d9 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -217,6 +217,14 @@ def test_zero_variables(datapath): pd.read_sas(fname) +def test_corrupt_read(datapath): + # We don't really care about the exact failure, the important thing is + # that the resource should be cleaned up afterwards (BUG #35566) + fname = datapath("io", "sas", "data", "corrupt.sas7bdat") + with pytest.raises(AttributeError): + pd.read_sas(fname) + + def 
round_datetime_to_ms(ts): if isinstance(ts, datetime): return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000) From 6b8131f9dfd404c35f59a5fe382a54f882e4af59 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Sun, 13 Sep 2020 18:54:41 -0400 Subject: [PATCH 0777/1025] Change default of float_precision for read_csv and read_table to "high" (#36228) --- doc/source/whatsnew/v1.2.0.rst | 13 ++++++++++ pandas/_libs/parsers.pyx | 7 +++-- pandas/io/parsers.py | 7 ++--- pandas/tests/io/parser/test_c_parser_only.py | 27 +++++++++++++++++--- 4 files changed, 46 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index bbee0062cf0ce..b2e724ad868ce 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -96,6 +96,19 @@ For example: buffer = io.BytesIO() data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") +:.. _whatsnew_read_csv_table_precision_default: + +Change in default floating precision for ``read_csv`` and ``read_table`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For the C parsing engine, the methods :meth:`read_csv` and :meth:`read_table` previously defaulted to a parser that +could read floating point numbers slightly incorrectly with respect to the last bit in precision. +The option ``floating_precision="high"`` has always been available to avoid this issue. +Beginning with this version, the default is now to use the more accurate parser by making +``floating_precision=None`` correspond to the high precision parser, and the new option +``floating_precision="legacy"`` to use the legacy parser. The change to using the higher precision +parser by default should have no impact on performance. (:issue:`17154`) + .. 
_whatsnew_120.enhancements.other: Other enhancements diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 811e28b830921..b87e46f9b6648 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -476,10 +476,13 @@ cdef class TextReader: if float_precision == "round_trip": # see gh-15140 self.parser.double_converter = round_trip - elif float_precision == "high": + elif float_precision == "legacy": + self.parser.double_converter = xstrtod + elif float_precision == "high" or float_precision is None: self.parser.double_converter = precise_xstrtod else: - self.parser.double_converter = xstrtod + raise ValueError(f'Unrecognized float_precision option: ' + f'{float_precision}') if isinstance(dtype, dict): dtype = {k: pandas_dtype(dtype[k]) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index b963d5be69b5f..2780b1a7f86c9 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -338,9 +338,9 @@ option can improve performance because there is no longer any I/O overhead. float_precision : str, optional Specifies which converter the C engine should use for floating-point - values. The options are `None` for the ordinary converter, - `high` for the high-precision converter, and `round_trip` for the - round-trip converter. + values. The options are `None` or `high` for the ordinary converter, + `legacy` for the original lower precision pandas converter, and + `round_trip` for the round-trip converter. Returns ------- @@ -2284,6 +2284,7 @@ def TextParser(*args, **kwds): values. The options are None for the ordinary converter, 'high' for the high-precision converter, and 'round_trip' for the round-trip converter. + .. 
versionchanged:: 1.2 """ kwds["engine"] = "python" return TextFileReader(*args, **kwds) diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 50d5fb3e49c2a..7c58afe867440 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -160,7 +160,9 @@ def test_precise_conversion(c_parser_only): # 25 decimal digits of precision text = f"a\n{num:.25}" - normal_val = float(parser.read_csv(StringIO(text))["a"][0]) + normal_val = float( + parser.read_csv(StringIO(text), float_precision="legacy")["a"][0] + ) precise_val = float( parser.read_csv(StringIO(text), float_precision="high")["a"][0] ) @@ -608,7 +610,7 @@ def test_unix_style_breaks(c_parser_only): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) @pytest.mark.parametrize( "data,thousands,decimal", [ @@ -646,7 +648,7 @@ def test_1000_sep_with_decimal( tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"]) +@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"]) @pytest.mark.parametrize( "value,expected", [ @@ -702,3 +704,22 @@ def test_1000_sep_decimal_float_precision( ) val = df.iloc[0, 0] assert val == expected + + +def test_float_precision_options(c_parser_only): + # GH 17154, 36228 + parser = c_parser_only + s = "foo\n243.164\n" + df = parser.read_csv(StringIO(s)) + df2 = parser.read_csv(StringIO(s), float_precision="high") + + tm.assert_frame_equal(df, df2) + + df3 = parser.read_csv(StringIO(s), float_precision="legacy") + + assert not df.iloc[0, 0] == df3.iloc[0, 0] + + msg = "Unrecognized float_precision option: junk" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(s), float_precision="junk") From 4ae3c689b933c93079d6e7aa6bf2ec847da70efa Mon Sep 
17 00:00:00 2001 From: Asish Mahapatra Date: Sun, 13 Sep 2020 18:57:44 -0400 Subject: [PATCH 0778/1025] BUG: read_excel for ods files raising UnboundLocalError in certain cases (#36175) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/excel/_odfreader.py | 26 +++++++++++++----------- pandas/tests/io/data/excel/gh-35802.ods | Bin 0 -> 12692 bytes pandas/tests/io/data/excel/gh-36122.ods | Bin 0 -> 8974 bytes pandas/tests/io/excel/test_readers.py | 17 ++++++++++++++++ 5 files changed, 32 insertions(+), 12 deletions(-) create mode 100755 pandas/tests/io/data/excel/gh-35802.ods create mode 100755 pandas/tests/io/data/excel/gh-36122.ods diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b2e724ad868ce..8b18b56929acd 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -315,6 +315,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) +- Bug in :meth:`read_excel` with `engine="odf"` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, and :issue:`35802`) Plotting ^^^^^^^^ diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index ffb599cdfaaf8..4f9f8a29c0010 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -197,22 +197,24 @@ def _get_cell_string_value(self, cell) -> str: Find and decode OpenDocument text:s tags that represent a run length encoded sequence of space characters. 
""" - from odf.element import Element, Text + from odf.element import Element from odf.namespaces import TEXTNS - from odf.text import P, S + from odf.text import S - text_p = P().qname text_s = S().qname - p = cell.childNodes[0] - value = [] - if p.qname == text_p: - for k, fragment in enumerate(p.childNodes): - if isinstance(fragment, Text): - value.append(fragment.data) - elif isinstance(fragment, Element): - if fragment.qname == text_s: - spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) + + for fragment in cell.childNodes: + if isinstance(fragment, Element): + if fragment.qname == text_s: + spaces = int(fragment.attributes.get((TEXTNS, "c"), 1)) value.append(" " * spaces) + else: + # recursive impl needed in case of nested fragments + # with multiple spaces + # https://github.com/pandas-dev/pandas/pull/36175#discussion_r484639704 + value.append(self._get_cell_string_value(fragment)) + else: + value.append(str(fragment)) return "".join(value) diff --git a/pandas/tests/io/data/excel/gh-35802.ods b/pandas/tests/io/data/excel/gh-35802.ods new file mode 100755 index 0000000000000000000000000000000000000000..f3ad061f1d995488bf029f926dc3eba60fdded2f GIT binary patch literal 12692 zcmdseWmp|c)-DoULU4C?cMnc*cXx-4ySozzPH=a3cZZ+>g1ft4nEB4+%(-Xg`R?Dl z_S3zqclUa$YgKiv)vMkuD+&4m83+gx2q@G5MJ~{iBa9XZ2L-YWd9BG4d!2h z{#N>IV{L3^;%NU5GzUgHfQ^l%zMehZ|C^SvrHvlI=>Jvg?M@y4dtI1+bBDGzwvM*% zs{iF4|4z>Vpa*bt_+1|)B;-G={4aC770CXkK0N~iBTJ(<1KHTq893V8znj;^$`TYv zRuba9SAam?cW?5aO@{q;_OFJvwYM>`H*)yX4gJyW{{Z`+-QP9jy^C!1tc@)HivjTd zrbaU>Jrg4bIzcmlm7cA`|0w^R6=2_#t@NzTjEx)swDyL^WAS6w{`3d|r`&-?;|u(B zU|{y-!%gOyoh(7BV00HLh^y^U@yc?!#Rr+Qo8=-*Yf-hS$9mX<;njs*ip&NSB*S8+ zb^BDDEwoK}%bZIK^;CE%IOpd6d6|^W&eoJ1I*EZaY_OISHQD4YDVN+bt%O}a*zLq% z`)i{hp>QJw&91jRFgeCIpo~@I0EbyIuDAF+;VYa)S=G>b_3~}O_4u*A{E<*!h64cp z*v-J27x`oasAX zBTCRsV?2>LXyI~w-F=~*c*WSLsC5#u^^(7Fb7+$r72iC)-jP-cZ$2kp(?V1j@PAH2s;WS}92kX;!lb!>h{-nVYW&Lna{ 
zc~oYce_}WQXhXtrNK0;;9D%%vF!Cy+;H50`)=CWu&-Q6==Ew5tK6WqRE?$=74hWiVtNIdhWtK?kbFS-bbi)0iEqq;EyeLdmU9UXsiLtP!P-ss5qx zpdIGq<)tbesXjnO0O^P5X9s$8I!ZrVkjb-ri!??-3{ttcsa&xd?#xF`#^IovWC;X* z4t$qL0}=b6OGqR3oPZs1D6|D;CFUO5W-uum&x!-dv^E{23*Tv(1g~_mHbJepdz*40 z6%>wcI!Fg2zXm~=LzK0VGL+0VclyYUbpp0q`P+f9Tke%yJ6rIDqq2L7Mt_E8*Llew zK#U(c5T< z=(h*V0pMzBTUz2fA1weO(S zW31pHXH8cVo*B<|(UsBFGSzX7==8e4Lu2I%I9iVUh{-znym`4ot*(#u^kgy%*RadY znG(($06&_sIXDYvzUSqn=x+0eoAu1qh2^sE>yg$L1EBRDU6(piM^a)&Bg^cxH-o}k zHLOwPbycw@E?6))l?~^0k;l4J#myWQHG0u-PR=!S z9blr)C2hNysS7kG~LH@ubo$g`B6DY^Z9ffmzj79Hp10 zF@Yz)7^oqCO%BIdzRsCebX^Ues;00i8fv`G3yiP(xFFj4)ojsl1#PC%Yoey8R=dLF z$XXL{iZ{BEePyi(u(%w)+(_=8qpi$r+qUXojdQcNbf!U_;bzU0*q}PZX??@nj{h$LWZ6iucf|5d{%ln zVQ#m9%3VFXJsS7E1)3_Y)C%Wg^ul&MMAfsJ4Usu-($ec93nWPBhPs)Zd3J*R_Dfwx zm7pBlO!+!KL2)RmJizbCQ-Dop%!`C=@zv{jdvQ$0gwIYF z7;3p@j88(sAA|1Q_?~e&>TebDHEr!U#aX0AV7M$ljUeEiXTVKJezk?wA{auH0?!~c-(49#^?oLpar6_E0Kco zs=5v0bB)f733SSpu2M0AcJiAbTW$cYX?+~={=(Zql|ZY3@TFNzP6n<5g1{y>FsfnC zG|1O>)}EZ4NJgM1pCdAiYfzXAjl;c*U=!8J(_d(!i$*@BR-eny+J(1zuTl7$K@pC0 z%#`d&D%(bd-xs2f9Bjo|95e?XMsuW6Kd7%+H4xqpQBrJCxoZ4^Sip%va;BDK^9y}IVvN2E z#_L$S=UW+|4aXjUl_JR1bekKEiF0Yvm=*lBONWN=%cmVJ$ux-6rJ<4LjG!6JVSglu z8aI(xy!|Hyy7Grj^}SG|z0gQ4HfI3!54M~ndip;8pMugY!bUMWY&*!8;F_%Tu>4OZ ztCQXdiO&Vw=atRvvbl^~RU&44$On-eflHu8u#4Wa#xFz;0jF*B)@nYB{A!^kHM1#t zQ819nFd?C``>~~xJACvjmHZwn1Bu=OeC@dMO9ET;-*!!hTYo0>&MUfR8tsui96_G`-rwInx1j&&gx5XJ{OnTH$ zY=Z5l9*qS47U;^)QFGT1ybH>~2P}T%d{u>(9FNkPnNnXjf@v3jDvX<-jmARH6(9Jy zRrpA&H-`r9UbT{@%d7ijF8AJ1!_W;%bZuF5eiO+lWQj5zgGg;t226`l^okKzUZ&eU zMSwG>iiP=mnp!TSajZnLQATsP4)$`k+A$O+yn57WWq`u;chw)iR_B~pN;4_l`^U}O zZ8>%!PcWvv1!N6?Yy#l!dcTZhR_fJt*4x_s1wL&?22UtuN<|bOKRMop-+ZY)~exh&w}vsu?QI2^ZpU+#f5Dm7@~1cUF$? 
zL5fuF`Fv1;-UM|m?WURWNR!argon5LQ_^|lT3%~n=t@6t*x|mXW3*4Hh=oz(R>@cu zrY<$BoxPVfGco}VeZ=~+ThUrfdm7|C&``w!=S0WT^6q{QpAK=SwpjU8XVnRBo1@{G zUj))&K2&KKb5Lze&+E%aS>#eNoQ|l;kK733eVRF4wG!)M{>OMNhEI!=bvrz#n=F$U zy*ilrZIswGbp-kDKQfqlUW4IKwAvAW}|==ytjFgNLtabl4!sZvHe4ic)5o|nU%$)^4yhcwc31- z1{;e9G@!h_S%v-FVxVen4BJAj9ut=zvClblt(!}uhZ)T6fo9s{C}Y^L{G%DM&xNd? zhF!iQ2*o;W>c&^z48SJBm&9L9H_*K?=V=NL#FTU{Jpph*?K)yD-5hEPF8d{WB<(G}V;xW=fm#CSeH|3trp z-?@T7(hcHlkp_uMT3+44--9m<7KMz0FRw@&r@x+9Mm|*FWyb-`T_=zV?4D?Yi_><2 zLcLg57T?Ct9B*OjL~AXOv@sDvZki=92RUW6nrY2KqCIXiWN4c(leuZnn!5H7O?KU0(0Y75&MF&GmKtpK+<69^d4txzehb*zgvI ztdVQ=WUoy9!@<_(iRhAn?eEPSNr7%wBrG)iv-ZxLn|il*@)T0z{Ml{X0xjj9lyXvj za$}6RsS-_wO)DJufnS9ZQ))?FN+ME&Qz-&o?Lu{t!=M^)ZzleyD?CEYOa*M30 zy|V)6pA@N6UPG5<_o9TTjcI@;rZXsSreaNJMEiLN|9!KEUL{8bS4iA9t>Xqas5?+p z;v52j6E4~?<2@M`}ReHC0#@~$!?dI#y&Z) z)#Q{~^sFk@guR?C>r@y|WmB#W^Rtk+=4Azdnc_&^{)g4Sn#?56wr2f^&O2-0&p&VjKF2&xR z=Ho41P%RIT6D^Z}KKO?=A-v~|5o}kv=TV_0!Si(+w$or&n-XL}gxT`eO*ePijwPaFB0BS(V5L>xCI-GycQ|8&Sq}50ZC9lHy=zHzJ{#f9 zID@F0csRxN*1!N>l=32xRmyG>7Plva4=5ZqUf$wcv8}N(KSomLZnIZ`_~-=AiaRrV3prNm|dvV_S9O-A+)H1ZR&{{dIdAyW}wq(E1 zU&GlnocMfZ;ur(3Ve&wKl&r3yy>L`hSsO@c^YLdQnsUmF*7CHV2K@3-0O(S;t1M~NM3`|DEX)-lGDqJ@I0r+7X1#nXLAJikeQXnI_yvnufyyyR zJ%7^F)ywjA!4lW>ww&=M>A)>cl!(JwV@;KP)#P;t9Mh1>!VB}&#vypjh7tp=&Pm!^ z+3fFADlPen+SZ>?pP}QSuj(AZ45ssAUx80!Ujvo4m==wl@I9Cw%+kOtl zjX~iA9?@b!-2NiqT>c_Eoc{SgANiGEDxyB?eI}2g2`C|wGmU^4$&Oh!K5i%p63mH7 z#e}JpiWKgVf&(*D*gW8kg2Eu9z;`E7xy6vu4GEUWZmYoP4D6OzS;_it+X&L6kn3}d zOabQ{h8&ZgU9xFANiPMwXy43<7~Fq{72eQsk?IaPTxg1IkMsq2`GW$cwI2n(31I|4 z8V(b#Ty)?EQ2=^G@uT*-5+zRK6xwbX0f!A-zw4oKS^)?xUqtz5KX93YJ7tXi5W9&| z2z2mbgTZ4(0+S=kg$OMxb_10Xk8d@SOIXX@c)<3SrEG#Bk0foYyQ{X}^VN67zK(OK z_-<)}ML2v7N`v`EPtPgn1LxnLgi0?GVUy-a@Og!ym8D3K6unRv3fx@WPeAq*`|{Q$ zJSrcLrBxWBhj{N{FP6DXZ-&}Kn75Q))`SppkO?97y3GRXVT-A{bx71OC0gv!Gld$9 zn}G6}$KLX^AdRytvNeUvQg41T8?{wsVjx&iVBoCI6YvVv1g>Q4Mp|i%hmF+nQ--e@ zeI|I9C$dDASYESHwy;r%8nsTPo4qgbfh_{iZK`8*eyHMleZ8l5ThjW1$s@$ZJ?*Iz 
zsJK4#v34PvWDf3>eb8vvfsC8-0yc^WQRs##FX>x(#yNx;zc#zq2By`Pm-!;CUdqh2 ziRIDF%k$L~>$SS7*fx+@9e}R8nH6=q7zB0wn$vV}+jJ0A;Is9oNIIbu5qLf->&N)x zO?4}#@A|D9m>9F<=Tn1y(}6SlM9XFQ>7`*#^}-?tq6%kPVUd9oC-+-U9?|vYEu5ri zy)7jBu}BlFP>Ws-i2IrG)g&TmHJMCAVKkWMIaZITunT=|fe2@NMt6hE%s%B05?l0e zhquZoIiB#F65x`x;yfZwXrF!j> zW7$$pPt6%syxsks~7A-4Gf`FX`VX^w7f=|Rh^iMNcB2AYj60#lZB#e;-d{dCZ z5NbKw5eTPZ=@gG~$rhmVaS1H1jd9_0OAyv>pqE>b>s{w7_m6^gV%I8F+C64N;4TEu z=7^MTWN%LM(-3_WCaz~X4+cf|uX!InmBz8Nrq%SIlg@i1A|Uo0QFqlumx~nJL?#v5 zp>d)2N!CoUK^s&J)l7{GkMkPbDCFEK91Jbc91jdhmneU0{h{t)UYpemR%8+1*;Ppc ztj0z8qP~T8QPCeiX1WdI3G_U`wOm)3D#KWM6@ZeSJ0c)cGQR!5r8Uz|tz4}7DDTN! zS)`LlsZ{D&v`T#xr7_?f#4ULlFZ6Ct_jjc1BO7le9dpX)_bw>!vr{9jA7 z&y&>9^#oFe`x!xLjV;_QK{6sN>$-}rR^pb~^;CTS!NH{=!bIp6p0qs<;rja_#~8&AK$esF`$T6?k)U64u| zEF{K=0#p9LU`-cKC#lziiRGOO;)}>OygdhQ?w)U`g$bjORh}%}S3M{2EfP!bo~Yx* z&s3QLSb-c|oMqoZl!Un&>D})L?viBu>xHI%-QrCEiSsZxhF<4IAYVPI1zuV)hDXCd zQ4<6M801jF>_`TClZB>J_q?;wB?Mr2pUe=UJQ{y8Kl-UJ)YdMDjf{CXvL&Ix2rJKm zdG-c9qfw>r2Ae&*0mcUnYYf2>+pjCN$*4ZetK09W@p=m-Icjj>ZVb-2-+8~M^PWhx zlar8ufH-i0|0liwcL`sZw{)I?jWytZRtDNsHEfnyk-gR`=$UmFa1HyUxtO&#vL%(X zG^T+ixsGVE@`ysINT7+gCB2{FDPr>thm_{a=<0hN9^oefgPXmY=oTzt1m(3?KZM^4 z)B7{-K+)8s{(5eiUl0Ga{g7&rkhw?HZ+ni7$;nT~O9TBIVKqXE5 zwImK1UxiZp99n{^h4ZT#M$4ER^>$t}lzm>Y67sQA%l(P{xG-wlBhhrNY?rohA{UZ) zcTi&z8d{PZDv406t)$*<+1G{LV)O)6O)V|qpaWGB9J#iT7q>?52nTX8G07q^41WiF zGC`>ou%4q9x`stXHN!r3WEAW5Am0k&Pljwt;X8f0j~m&X;QW%zT@CGg(H|5w$w$gI z8Gd}*Id&<5hb4V1N%q_aK~APDb@g22$=MD*XP#6XNx3Kzxrqy;z=ML8DJWj27^cyp zH-M-q24ljHX9odS7ZM4-@BRw9`}AciLwM|wL5enh-^@PwZZ%gE`B8ZR!{(TpxJv7LY*!cM(0n!I|~Dlivos%h9=- z6Ha!N!!FVdCF2)mzjNo1F_`{3kP1mFwC;U>&~6zvC&fd*QLW@b~4(-P>e(BN$5i& zg(Pe0u2Ft z!7CZQ3tBCiTP`{?PmQ^8=)5$B!1%BEJHRhrQ0cqBY&cYgR(T?M%!39|@fHW=qB&%# zMePpvJuP06VDf!zOhmYlnDVQfOj6Dc+HXX`=(j?HNh9G8KpDcX)nX=LQQ#@y4oy^kPV?@Xy6}0~| z2Ut{J;kVY6Q)y=`oR9rZT{R`!G)G;RW5?*vAwj5UUpZIG5eG0gy3I@VboqY;!XGIo zwFZ2OV)YTpb3Hl3K>cb^TEndvJiLFz)2H^JRuNgbIj}`;4$UJ}#QNI>_g8cB#C- zy;}dymh}a%m#3w0WW@!t;ApFJaeGSYm>$<1jG~{S;c{ 
z*Yja;U2@*r&`C04<#1XBgwx6DU z=x^gD2{U)Ff9j=U7j3?Fi@zarH2HM+<~M~8Xq zWU!js$zM8MG+i1Y(qmoc74+SM=&aSVLV*JT5xy<9|8s%(?>xx+imQVW0AOZq^3Nq9 zbt~s=PJ|aPZ{cX2Q!}L{Hx-#f+}d`4N*PG|ISqe8DZx6iKyAkh`0Q#qRawbojr+-i z09c$!(2(IyvTw@-)khs_g3XY)N6qu%m=>Sje0ajk$Dfm?+Frw+9zpfRIDUS#AF{L6v=;us6d|KS3tH1dWt#CP*#Yl=o;kR52!eQrpTiMB|3^UKG)5YZ5*sYloI_ z{o|4FkPyu}+oab|TLnH`wxrbpxJ06j!C5GIiriRja8o~d3SFCQJ*gk^Af67*~ADRmafOC0P6#zESTiJ!0cP zvv^+X=CcbguIHCT&#~|sond*AUQ6(wgIAV>rZCya$7*#hWTE_UEw9NLFFz4OUPP<* zffIy%L3D*&o&5*6E_}|f2U53vA={Naz3}hPjaG7D1o%zGy1?9a4hR~UNSxrBp=3jp z#@~W)*dcB>#phskGw_c(hvu6o_U+SzqE-g%+{6cUmbvYnvT88;7C@+D>5y$k$RzGJ zr(RtF#95@FGuacmLOjxEjRw8nVy0BRGhS{&&)gLa!P3VvqGb<}6`Uru5~l02e&SPJcSEn8^NiXv^1CKuCOZSD*Wl zJsW|Mn^g>%18HSHLh5GF5yZpa+y|!dRM8A_0cga130Wgqd=fut{Hv&Il9*VQ_L%iJwF>0#a*d(Jx^0*N;*pXru7 z)Ueal@}^kOo^e)uO?A||920N&i8J^*9414Cc@ix(Z+>gL5`J`Wmsrr`6OoawEVltW zyyAxLgsZr!w8?>})8nL(38k zK%MX09!;JXIx8U#mQHXPDT62vo?Fj!Cwif}JF>)RYC+%e^hHoHZ#!IisIh99{W_Gz z;l5#2*l0GZo2P+dNo2})zp3J(Igzt5> zW_B*k=GV&-`Q=hc-5ve zxe%@gKUz%cAUa@+Ruk4!wAt(e3z&?zd$@T3kK@4f(rX5($Y8iew=em&6z^VTQ0ynr zAw(}KF#DPMXG}lV5^RoAU>?5I&Z|MZ`ik8BX;&uKkSu-b70M+)okAsu-XO9MMZN9j zFTreE7viVz%CBJWPQsfkulx`U2*{KC4=3>l7pR}4rcdVW%=`9E*HScfw9>cMGqZG{ z1N>2>wY4@0m6aBSgT{O(7Ql&%3CX{GzrJn1Am4ZhK#$7&^=~vKZP_o1!q6Xp;o(s~ zBI6;!!F+;8MnXVBM8L+v!a~C!#>2(OC8a|lWI`q3AVGyD!9u0NK_Vl>W5!3}AVuRQ z!{DVrB_YHoC#NK&;i8~p!lL5Gp%*5i6Chv_qhS-I;#FYc7Uty<;1d$(mDc5w)#p_( z6=fojWF-*eW0w?QQ~gY*EI_Xy#I7j8ryCc~g|E6zfUaYhkz2I6qrbC$m{t*PgZw~H202G_5P^p zTdW;g?iihI9^Y)A+3qg;Hdve4Szj_-_kE&+X&I-HX-3+l~2= zt%aeDg~_9}>62d*+rQ?H*QXA**Uq=5uMSt&_f9tt&vwpkk1sBdt{yH9H_lE^FV1(L zFLxd;4=*k+&u$*C?w_6?FJE3>-Ur<4tIoKD(c4HX6&DgvbXhu{Qb`5?`jEBO zndJU*Nk5d_phN0l%$@D*ayEb0roe|$)3yrAc$+J>7BXR9YMD>RrO3S9LPl8E^7Ak|N8huR#O=+sFsX%IA%2FuxPz(oIL<(4dAAyv9%nU89>MMZbyv^!r!Q|*A z^m0vF!-W(x*}3E{s^z)Qq|epyyW-a3nFCmHwQDq{??Ii0GIKa4~ zfSTLYL*=WYk{P324l5t8$;zC+Qu|e*F9VOKYy8ibS00J_<%>-Z&kD`*63KSE@?vMv ziYbnTt+dL#vh?S-^?PZrLmm4L??V0StK)R<%>4uje}NLK$Xe~0+*dk6LYd;j;}e%| 
z%C#6Jw8^m-D&S=`GmFhr9sPP%+F@=f9C?UJ>WQ$LFNVvaR!?+f!JlbXP`DUbhBIrp zfMZRQc%OAFdA*&7!dUQ{%FQLM?osITF8py{Y-k9hd!TBF(kdf7&0nb8BV_3l)W7}! zyn3#rj(o(bU3oo~`l`LW^w8vezBv$$$LIBQ_y29-wcrjEm;`a#+0g3bZ3F{}3rh=? z3+np)=SUsp?J6l@MFARdX%V`AktE+(TxAJT*4^|7fwyUd7xiH))y32xFu55%xSe$a zU2#nHPyWAd90@^+C=tvST{q5eS5te{tfJD(;1ROz7Gl^QBq;g!6T0dNGFze_(k~sD6rx{?a8u3v4tUG$%fJ)6s33I>CZEtJ!G2|kw7KwMKmA=VzJbD z{c1-Fg(z1{2-w{q^Vbj)xym9x@DjjE911%|W<>@z*9Oo0@EM<*)6Vt$ydb4BPvFCt zElu_j?$fFU5ya;spD=i)teHLL2Oz;d9)3d0sIO(8t(LIC-82~qNL0!5V7}}ixVeP7 z^(?;D;znRG-JqJXDoR6L_70dZlRx{|?z`wK!{oZ50kg{+6+~(DFAK z^{;w=&R@LexqnL<#=m8~|El=snD*ZrxcX+iKPNl>(ELv^@9!!>N&j;M{0GV(X8Q9) i@;+(*mJq7{FonxXg1u#GfPg-|eQe+6A9$MgqyGa7Jc#%J literal 0 HcmV?d00001 diff --git a/pandas/tests/io/data/excel/gh-36122.ods b/pandas/tests/io/data/excel/gh-36122.ods new file mode 100755 index 0000000000000000000000000000000000000000..3dfdaf976da4589b48754606c8136196e68132c4 GIT binary patch literal 8974 zcmdUVby$>Z_wFE|ASu#FmvnbXhvd*8HFVdIBLdRW-Cfe%Fd*Hapn!CDOGy7*@3_v+}#v?jb+>h=73b*LJ!a%zq8}u4WCgF*h*=J2E;LnG8h_ z+4wM{`|k7m84phiF~P$-JnwI?NdLg$uK~|=l!P(Y5*Don%qiST|GZo(!8RXOoxBTr z)*D=z-=@NDNJH5#WmdCE$J@l%kUPygHCanXoJ4qN;gg$A+vs9L%cB?P$H0YbHBy!J z+%@SNzd|!v+ZS$oDdg_zFa$)RP!aQ!6?c4|;Uz>9b)e&RMugiLi9qlyZ$U=oV^FOU zG@zCw@~u!PqHMpfqtCNOd6_P@xNkt7w#?n79)0MTPZq!a0bSV0x`fO1J3w;+hD7WOZ za*mpX+7<)Z>8tS)L&=DxE?UA2KOu<_whI!HQ)m*6kv>R<8{CysvWd3Ytj~b!XQ98tQ5gWN4 z-KfMFwZ%WVLp+N53=#T@ZY;g_MZdwTl~3^cukD;acknd4-ieJ19NwEhrgc9-J}A*E zy@9=7>-RZV1GRT+UH-3Y{rK)t0Xw=`8H4Xvdsllie3}#cw#CzL5l;L;u|$ACbOqZo zNwSU0wDm*r=q?X>ZVbw&v0`ttdf2YF<8x4z-3VgVTiC{gC z0Ma3Lu6!I!A)9dioX(0Hv+}o>C)B%;3(!d&!A@hZzs#fQWPra)@MbjHGNK{svay){ zAU+HhD<_4v0eZG8o;RRd#)8Qnt|P=rb5%146z$9xW_s-0n8&WQb>Z9uog8LlK0GdK z-ELOMK8W1nIl)ISj@&}SJjm${K_4e&~hdi!j}ZbM_RYQyA37xLoDXI^B@;$^W~EbkBGu8jnnIGL8vR9+7>DR#8H* zDep2YrWqQ7jozvjXa)ltEYL95wJ-<_Z%b5`;$E0;$|uuJTV|Z|m5)k)$NwEy*xME7 z0DiiXBr^rGget***xW&nY{L|LCC90QHB$@S@_A0gf*FlH&YS-8FW0mZ(cfw6)fKRW 
z>A?ZX3q%;kp3L*=m7$3Z)+e`dB>rAvHg!>89IOz_@G?KfKvq;@7la5*RcVpT6hNbm z)Hlm(wr$MY^Q&#fuhJPo&)+j(3MOYrHCR!7_S5ec7TrtY&HlImtMGl3qDCW0wq%nw znMY^PL=I><*dJW!ZG_OO4^sQad+aQgd$M5A^yqHUYOOMbOzrT6MX#Px-qglf7h}<1OdVs4? zWkV@~wVGz|*hg?`78C@J8LiNIWqP!#ob7Df-c~)9EW%)|Fm9EIh3eSWt)T9ZxS|4# zyiP)S-V)GAj`;J`fRk^6G|CCnQO_Epja*!8-y+7yBaKx!fnDOuR5e&0%-gCePgDqAxjCV`{TWeR!Hgq!`bJe9+4;TA127s;i;5EOR}S+-)`%SeYH>> z-+t2ku2bGG3MD3#VJg?F$efex)%dWL^6X4Dgef3lt(qoV4DvmJfls%gw9UgkM}_LW z4z#ZaD}H3Kta8g%CT{1@jW285Rw6DRZPFAgkm;&3e*`5fjX5Ed*&VIE<3VXjuf~}7 zhPyYiP>cdjrcV?f4cdqK4_vmM2OF2u{4a$u&o=|LNQ1r@ogZHd0xyGAlKrZ#5h)ej zb%=>}hdHNGtdW;?qLS|#8)efh%tEgS3h1Fj&Vx)Abs zA2w9f#Th}3BvW4Xkcu^e%|mv64WuQU30GUDw=^pD3i%9U|P z1=7@HB2ST79yOiQEJ;d#PRm){kdbZ@mFshPJVJrNJaTd~tbaaps*x4g8LVI?!;z_n z7W-+uj}=nigBQo4DDeu)sw2@}fRTcf!5$r0{`U}8bv@YTh#=M%n&OFQ>cBNv)tK|SAQ2#7 z`)v%-7R#;m^AI+d$L-Sv;i6?boyKo4yCRC_D1^#3vNGmWiFD{wLn|RD7QHxrKk{^rei^m%GrGwc6N8FkJD2oyk+FAG--1{DjnX$M!Jd`*_|)SKnQ+ zQ$9Axpn+7l1UY}7QG1lrHeWaLOyla5)H=vV+@D z6M{$S=QM|#`QKKnPR)ZqS?8{9<-i}=IqSwYN}eo`QmJ>rP16p)pdwyaCB9aD@%D3a z;w=iy3z}>{p51qVzVM-sL9leqTl#_N-ybTu$KYnSCIBpnfCUV1NjfPO<`GPXUIjLD zKvU-o=OYFAW?)}hOc2K%O16}{C=OyLDQLYlFmSFE=qRb$T|0rPeZQ+yTsKuxCvODh z$U4zWA`*0Jz|4l@48cMm+SPT6|c3{_AD|^-=f!IUk z8!On~_)U-z>uEsxy1sU^As*Meitv=aB1@lJ^P5ghhb7r7T6uN}$0Y^aF+1Gy@$pmO0nDp8jzc}j(M)T;TF&ewOm4&K=2bcco~9*oZ8D02cpk0^ zQ9`?c_3TXe@-;-w4yjTN{_9GR5ZWR|H5xph`A2ozsN3wo3s!T7>)HoWpiYr!soW^+ zD|rup%op_PX(juUQ_rbKJHe1RkiW56w+P$X7}f}*+?8C-f^Si)0;^vlmVX7NCpdSz zJwJbb(|}G?J07pID&va*DI0|~c~_Qh`ZeSlxL9=i*Ez9KN=fJd5dg5z{F%6p zXlvtmAAa8*6rmb2;Au|umNRv`7;{@@N}j#R;u2ZvM4~C~fT`3?EV6itZjv?_A>dZo z{(}YB9I;sIv9*PS4<1ePg5^lx4?8Lr+F)`FvfQ<-ZN`fL{TdX-gK285&>OKmbz zULrI2c5r|`HLI&qf%hf(;B0+_$sQG+vREZ4j^!A2MP(}&Sl-@pxfQ|CmGyirGoRV_ zHEK(g*r-VXHhFO0>0%{5hCVvudFsAPrteFsS?OeVdJYpK$+@kk0bMuvAYvV_Jef9$ z@g5=`y7{$%4?E6T1sWDB`P|Z7vPxo&Dxp$O>UN{?&e)5z9AsGEi7x1q(9X^ix~8xk$j0jLMDBfVHS}U^bkH9tl-A4 z5;b_C8y(artDRh*G4~-W(tlI99zS}Bmz@==j_g`FNGJT5!>KQ$hlcC>s(IZwe{h@xD{Metvw{GS|jE*C9PiIcCF;_Qf;`SHTx+-3+(qtFtc7UD^V 
z*f7IQ&u6h=#{)#v>Kr_SwFWu3_y|54l&)6LnYbB9^oje`dHllaGJR9Qg1RBjaI997 zyd}dyU{!*Kz#EjQbaFj|ip|iP8SDKv(C~Jawv&`+q{`%_u*E}KDe>xfq4{2-1EWk4 z!^W}6ATD~AS(DdQ(57&tf%$B<5xHUe=&VHa4y^RpY1Uw#a{8XNs+NXnkMQoYr=?iv zKyx)kdH3o#jw`~s>9XF->)BJ!rIO86SC?RT)Rz%CS$OCH-{ELV8q&B|c~<BgY~+|s3pwbB7E4hZ*A9g&gRu+;E!Ju(NnFX-^)DyAhQ}ffm1CchYMyD6KdK6^ za#vbq&=L(Hwe@_PV@mq|#I=d60 zhJS&`M1exN)xn!xkDlb~@rEt9p{8o_zyLFCe%zWirUH*&3r z7!>UpU7S?ZIvo6_j~``A>Xyepje6A8-b#Sl;t{$@CC@5Ts`n|=`x+t;r!=~2dp@Oz zCp!ZxUf9`Jl~mV_<3)BcLrU;Mn58T&#d2Udcg+dNU-QB3^H=W_%W(~5zw4-&)pZBx zj%*VrmaZA>mObv$7S6}7^J%)Y9U1bk^2wQ<3|?W^eoMO}`>r46c<&RT8+f^-p@#xn*+YjX@QL$9nEb_{}ZZf>VT(s(ckQs z^?px1i1eR!#o+bj#L=2!PRN12d3yRFsaGqU!f>Ok%t+K`X$lHQpO@O8p#O@uNtZBn zHkN6=RK&Rwj+Ewz?!DWToxIm2uA*S8!yXk^4cGAWaFI;PP(GTCm&daQ>KuqKH%X7g z6xK&iB1iJm(2@4YB2^kwd`Waxn{Z*~<-zG;&dP>gQ?atI7W*}yV>O&WqA1??eWeL< zgSy$<0~OM30?3w@g?v)o;!tfk@XNNpo-A)1Td4PN+9vYWp5IciGrX?~?KnN*i%)jx z^RQ46HZuTMHdu$~@Jt!INRzsSBq_7+pQpU0-HZ`d2PEpRSP>UG-mvus zdj&8nJ4dvB6k-UHxGgg+Ou`#TZRRfH{$UTS>cH3@xJeC zh`*1|2XR;;@i8jLz}gpm*<;w^p!Ea!eV9k@cqdU1^XcJnS~+)k67Q7N4goCdqwJDa zG24B_JqG6US09_L6rSLr?>WW`tRTxq=di7&8hSiWjKXEO+RwUjq;W6lM?htW~e-0~1#=o;RaNkwjH z${KyOtt3YBettmn!gPWsxwOUA3$Es}K+c&QyR&?gvWIKGgU0L-jr<{<&iY5G*^Q(g z@4;OE#wZM%`9Z~iMFbNYo>%kI2p;CyB2Nl6D8r&$iwpg_+E<_Z`E^-*vKe9>-Hp{$ zQ?$RC)PG4)t*56f-#JxqOO-foQlQ_IyCDIjWm_Tyt{@Op_iHV>Zn_6j;BLPO(u!uF zci>W+Bp(dN1LjijGN(?yBq!2rH+mAFEdTs6`eHP+-+dgXYn_j;ci^F+7#TX(gAS3t zLXuu6{FBa!I9Uctp`=um^M>JwqkyL8A2I1)e0gyxK;R_We!25>m0_PHhBIdpvdj}f zEPPNMMA$W>&O-B@9q5;76^jC*&aed2WwAx(f_V!~%s z_|Iq<>Dbtq*m>Dl`8oJ`xdr(I#QCK(BzU-G1bJjc1;m9!#6(|8NlL0o2nzH zloe$)On`dUnmUF$7H$?AFYUEu?e!FG^pzoIY97|QHl~L9md@r5u3#%;4{Lo-7h7{k zupPwL-8;}TFx<;h&DQ}G_S!ty*FE0H&iAeNJ0JhBu(05;=$NR;gqY}r#Kge3tl*U5 z#MI1)%<{PW`mFe%_i6FDnaM?2@kO~AWo2dWODalB%kwK5tBbR%%ZkctTbi32p=}+F zJtK|19~+04TSk}q+v+CYu3KfAKMI@z}}^J#H*>S%3pX?J^V=V<-(a`*Ur_x$E?XXET> z`|{iR)$!)R@yYSk^~LGo_0`GE&CUHh-`@6X6B6D{@{qKcu!`%{?iU#aWgw>4r^kWM zRovnHo_lA625J%H1Mv;vwN^;Yl%KFK1qZy&d8gC%fCam>7+uKfJ9y5TxI7hht{ 
z3;bmMx-&Z~hx}P+%-pjx&jQak>NJ%0Dk*s1*_}Ku4Age1amX$_PsNez9*)^MxEDdj zQD3LEDAaOV%t|}&QmZRnN)@?U@HeVm6SPh|Ow(iFfS^@gnA(gCW1ae&t~0qXLf^SE ztCa2En7_D4%Kq|g2uCNe#`cRYR`l?IX;Z}ouR*Cc<9r2n+qMiTWtRZEd>yh5ZA-0qH-4X+&#pT6HMfAP@EXZKp-6bckBFrEyFTwO*Ve2jo zD~Xn~=^#OuIOFl_s%5R#RDCXw9SQM127E?F#UMgQ20C0C(8+77MjFuYdHA0RJgoSd z`dU4eglfA2M0}O1>Qyp<-Q@p5pFmBKP%J5==D^4yf;?n&&G5N0ZC z^)J_tyLt0v+?r?LWt~e=gThdrqCxlM0M@E<@c^ywnh&T>5^&ZYVA`I`s#PBCF_ zPLbtK7@@y&jEXWa4=@3L7Nzgk=coOUsQ-!jc?SS+ujBqSmAmS{i_m{>|0favxR;@S zTG(CnpG^Ntl>Rf>y-57i2JWi=B>PpQ{xi?thX;E%jsN8NyJ-DqroZ<{;5SUair9bV zxqo~8G^gM2+>h)(g7Ne^`F# zxW6jA_aVbiQ@CUKukW}M#eZz#AIM+z!g~eyr-j|U>+k-X@s}R_Pt>p3qx(a`PlMcr zG(VNaf8u@(dhT;eKP?OGZ<(fl0)IWy{C$ArcW(062OJoL@=qhadrkk=ODoF2KfJ%~$=%2J&gn!G-Cz9=7`lA` literal 0 HcmV?d00001 diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 431a50477fccc..33467be42dfd9 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -499,6 +499,23 @@ def test_reader_spaces(self, read_ext): ) tm.assert_frame_equal(actual, expected) + # gh-36122, gh-35802 + @pytest.mark.parametrize( + "basename,expected", + [ + ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})), + ("gh-36122", DataFrame(columns=["got 2nd sa"])), + ], + ) + def test_read_excel_ods_nested_xml(self, read_ext, basename, expected): + # see gh-35802 + engine = pd.read_excel.keywords["engine"] + if engine != "odf": + pytest.skip(f"Skipped for engine: {engine}") + + actual = pd.read_excel(basename + read_ext) + tm.assert_frame_equal(actual, expected) + def test_reading_all_sheets(self, read_ext): # Test reading all sheet names by setting sheet_name to None, # Ensure a dict is returned. 
From 4b16138a6a5f68dd9e6b59d356fc64e4db94c15f Mon Sep 17 00:00:00 2001 From: alexhtn Date: Mon, 14 Sep 2020 02:07:20 +0300 Subject: [PATCH 0779/1025] DOC: add type BinaryIO to path param #35505 (#35568) --- pandas/io/excel/_base.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 65e95fd321772..46327daac2e43 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -551,7 +551,7 @@ class ExcelWriter(metaclass=abc.ABCMeta): Parameters ---------- - path : str + path : str or typing.BinaryIO Path to xls or xlsx or ods file. engine : str (optional) Engine to use for writing. If None, defaults to @@ -606,6 +606,21 @@ class ExcelWriter(metaclass=abc.ABCMeta): >>> with ExcelWriter('path_to_file.xlsx', mode='a') as writer: ... df.to_excel(writer, sheet_name='Sheet3') + + You can store Excel file in RAM: + + >>> import io + >>> buffer = io.BytesIO() + >>> with pd.ExcelWriter(buffer) as writer: + ... df.to_excel(writer) + + You can pack Excel file into zip archive: + + >>> import zipfile + >>> with zipfile.ZipFile('path_to_file.zip', 'w') as zf: + ... with zf.open('filename.xlsx', 'w') as buffer: + ... with pd.ExcelWriter(buffer) as writer: + ... df.to_excel(writer) """ # Defining an ExcelWriter implementation (see abstract methods for more...) 
From d5df269176c8047b520a5621ea9c32ae87fa4720 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Sun, 13 Sep 2020 19:10:56 -0400 Subject: [PATCH 0780/1025] DOC: update DataFrame.to_feather docstring (#35408) --- pandas/core/frame.py | 6 +++--- pandas/io/feather_format.py | 11 +++++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 27f9d594aabc6..56dc5e54e1d59 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2202,14 +2202,14 @@ def to_stata( writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path, **kwargs) -> None: + def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None: """ Write a DataFrame to the binary Feather format. Parameters ---------- - path : str - String file path. + path : str or file-like object + If a string, it will be used as Root Directory path. **kwargs : Additional keywords passed to :func:`pyarrow.feather.write_feather`. Starting with pyarrow 0.17, this includes the `compression`, diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index a98eebe1c6a2a..ed3cd3cefe96e 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,6 +1,8 @@ """ feather-format compat """ -from pandas._typing import StorageOptions +from typing import AnyStr + +from pandas._typing import FilePathOrBuffer, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas import DataFrame, Int64Index, RangeIndex @@ -8,7 +10,12 @@ from pandas.io.common import get_filepath_or_buffer -def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kwargs): +def to_feather( + df: DataFrame, + path: FilePathOrBuffer[AnyStr], + storage_options: StorageOptions = None, + **kwargs, +): """ Write a DataFrame to the binary Feather format. 
From bec46e061c5e10febc8f0be3a1fc4a93b0765ff8 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Mon, 14 Sep 2020 01:25:29 +0200 Subject: [PATCH 0781/1025] Concatenating rows with Int64 datatype coerces to object --- pandas/tests/reshape/test_concat.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 90705f827af25..7d6611722d8b5 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2918,3 +2918,12 @@ def test_concat_frame_axis0_extension_dtypes(): result = pd.concat([df2, df1], ignore_index=True) expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64") tm.assert_frame_equal(result, expected) + + +def test_concat_preserves_extension_int64_dtype(): + # GH 24768 + df_a = pd.DataFrame({"a": [-1]}, dtype="Int64") + df_b = pd.DataFrame({"b": [1]}, dtype="Int64") + result = pd.concat([df_a, df_b], ignore_index=True) + expected = pd.DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64") + tm.assert_frame_equal(result, expected) From 32e441323371839d765c179aef2b269dfd6687eb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 13 Sep 2020 23:28:54 -0700 Subject: [PATCH 0782/1025] REF: use check_setitem_lengths in DTA.__setitem__ (#36339) --- pandas/core/arrays/datetimelike.py | 24 +++----------- pandas/core/indexers.py | 42 +++++++++++++++--------- pandas/tests/arrays/test_datetimelike.py | 10 ++++++ 3 files changed, 42 insertions(+), 34 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6f0e2a6a598fc..377996344dbbc 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1,6 +1,6 @@ from datetime import datetime, timedelta import operator -from typing import Any, Callable, Optional, Sequence, Tuple, Type, TypeVar, Union, cast +from typing import Any, Callable, Optional, Sequence, Tuple, Type, TypeVar, Union 
import warnings import numpy as np @@ -58,7 +58,7 @@ from pandas.core.arrays.base import ExtensionOpsMixin import pandas.core.common as com from pandas.core.construction import array, extract_array -from pandas.core.indexers import check_array_indexer +from pandas.core.indexers import check_array_indexer, check_setitem_lengths from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.ops.invalid import invalid_comparison, make_invalid_op @@ -605,23 +605,9 @@ def __setitem__( # to a period in from_sequence). For DatetimeArray, it's Timestamp... # I don't know if mypy can do that, possibly with Generics. # https://mypy.readthedocs.io/en/latest/generics.html - if is_list_like(value): - is_slice = isinstance(key, slice) - - if lib.is_scalar(key): - raise ValueError("setting an array element with a sequence.") - - if not is_slice: - key = cast(Sequence, key) - if len(key) != len(value) and not com.is_bool_indexer(key): - msg = ( - f"shape mismatch: value array of length '{len(key)}' " - "does not match indexing result of length " - f"'{len(value)}'." - ) - raise ValueError(msg) - elif not len(key): - return + no_op = check_setitem_lengths(key, value, self) + if no_op: + return value = self._validate_setitem_value(value) key = check_array_indexer(self, key) diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index d9aa02db3e42a..6c88ae1e03cda 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -114,7 +114,7 @@ def is_empty_indexer(indexer, arr_value: np.ndarray) -> bool: # Indexer Validation -def check_setitem_lengths(indexer, value, values) -> None: +def check_setitem_lengths(indexer, value, values) -> bool: """ Validate that value and indexer are the same length. @@ -133,34 +133,46 @@ def check_setitem_lengths(indexer, value, values) -> None: Returns ------- - None + bool + Whether this is an empty listlike setting which is a no-op. 
Raises ------ ValueError When the indexer is an ndarray or list and the lengths don't match. """ - # boolean with truth values == len of the value is ok too + no_op = False + if isinstance(indexer, (np.ndarray, list)): - if is_list_like(value) and len(indexer) != len(value): - if not ( - isinstance(indexer, np.ndarray) - and indexer.dtype == np.bool_ - and len(indexer[indexer]) == len(value) - ): - raise ValueError( - "cannot set using a list-like indexer " - "with a different length than the value" - ) + # We can ignore other listlikes becasue they are either + # a) not necessarily 1-D indexers, e.g. tuple + # b) boolean indexers e.g. BoolArray + if is_list_like(value): + if len(indexer) != len(value): + # boolean with truth values == len of the value is ok too + if not ( + isinstance(indexer, np.ndarray) + and indexer.dtype == np.bool_ + and len(indexer[indexer]) == len(value) + ): + raise ValueError( + "cannot set using a list-like indexer " + "with a different length than the value" + ) + if not len(indexer): + no_op = True elif isinstance(indexer, slice): - # slice - if is_list_like(value) and len(values): + if is_list_like(value): if len(value) != length_of_indexer(indexer, values): raise ValueError( "cannot set using a slice indexer with a " "different length than the value" ) + if not len(value): + no_op = True + + return no_op def validate_indices(indices: np.ndarray, n: int) -> None: diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 0ae6b5bde5297..83b98525d3e8a 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -338,6 +338,16 @@ def test_setitem_raises(self): with pytest.raises(TypeError, match="'value' should be a.* 'object'"): arr[0] = object() + msg = "cannot set using a list-like indexer with a different length" + with pytest.raises(ValueError, match=msg): + # GH#36339 + arr[[]] = [arr[1]] + + msg = "cannot set using a slice indexer with a 
different length than" + with pytest.raises(ValueError, match=msg): + # GH#36339 + arr[1:1] = arr[:3] + @pytest.mark.parametrize("box", [list, np.array, pd.Index, pd.Series]) def test_setitem_numeric_raises(self, arr1d, box): # We dont case e.g. int64 to our own dtype for setitem From 2096dfd7ed380035c755bd1693f09516f4525791 Mon Sep 17 00:00:00 2001 From: Kumar Shivam <53289673+kshivi99@users.noreply.github.com> Date: Tue, 15 Sep 2020 01:55:59 +0530 Subject: [PATCH 0783/1025] DOC: Added docstring for storage_options for read_csv GH36361 (#36364) * Update parsers.py DOC: Added docstring for storage_options GH36361 * Update parsers.py DOC: Added docstring for storage_options GH36361 * Update parsers.py removed trailing whitespace --- pandas/io/parsers.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 2780b1a7f86c9..0f1cce273a146 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -341,6 +341,15 @@ values. The options are `None` or `high` for the ordinary converter, `legacy` for the original lower precision pandas converter, and `round_trip` for the round-trip converter. +storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values. + + .. 
versionadded:: 1.2.0 Returns ------- From 88c3a1b7dbb7d98e636b1fe52e1db7965856c1b8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 14 Sep 2020 19:00:28 -0700 Subject: [PATCH 0784/1025] REF: _unbox_scalar, _unbox_listlike for Categorical (#36362) --- pandas/core/arrays/categorical.py | 29 +++++++++++++++++------------ pandas/core/arrays/datetimelike.py | 2 +- pandas/core/indexes/category.py | 6 ++---- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 45623f182144b..b6cd0e325f8a6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -93,7 +93,7 @@ def func(self, other): if is_scalar(other): if other in self.categories: - i = self.categories.get_loc(other) + i = self._unbox_scalar(other) ret = op(self._codes, i) if opname not in {"__eq__", "__ge__", "__gt__"}: @@ -1184,8 +1184,7 @@ def _validate_searchsorted_value(self, value): # searchsorted is very performance sensitive. By converting codes # to same dtype as self.codes, we get much faster performance. 
if is_scalar(value): - codes = self.categories.get_loc(value) - codes = self.codes.dtype.type(codes) + codes = self._unbox_scalar(value) else: locs = [self.categories.get_loc(x) for x in value] codes = np.array(locs, dtype=self.codes.dtype) @@ -1212,7 +1211,7 @@ def _validate_fill_value(self, fill_value): if isna(fill_value): fill_value = -1 elif fill_value in self.categories: - fill_value = self.categories.get_loc(fill_value) + fill_value = self._unbox_scalar(fill_value) else: raise ValueError( f"'fill_value={fill_value}' is not present " @@ -1680,7 +1679,7 @@ def fillna(self, value=None, method=None, limit=None): if isna(value): codes[mask] = -1 else: - codes[mask] = self.categories.get_loc(value) + codes[mask] = self._unbox_scalar(value) else: raise TypeError( @@ -1734,6 +1733,17 @@ def _validate_listlike(self, target: ArrayLike) -> np.ndarray: return codes + def _unbox_scalar(self, key) -> int: + # searchsorted is very performance sensitive. By converting codes + # to same dtype as self.codes, we get much faster performance. 
+ code = self.categories.get_loc(key) + code = self._codes.dtype.type(code) + return code + + def _unbox_listlike(self, value): + unboxed = self.categories.get_indexer(value) + return unboxed.astype(self._ndarray.dtype, copy=False) + # ------------------------------------------------------------------ def take_nd(self, indexer, allow_fill: bool = False, fill_value=None): @@ -1925,11 +1935,7 @@ def _validate_setitem_value(self, value): "category, set the categories first" ) - lindexer = self.categories.get_indexer(rvalue) - if isinstance(lindexer, np.ndarray) and lindexer.dtype.kind == "i": - lindexer = lindexer.astype(self._ndarray.dtype) - - return lindexer + return self._unbox_listlike(rvalue) def _validate_setitem_key(self, key): if lib.is_integer(key): @@ -2155,8 +2161,7 @@ def unique(self): return cat.set_categories(cat.categories.take(take_codes)) def _values_for_factorize(self): - codes = self.codes.astype("int64") - return codes, -1 + return self._ndarray, -1 @classmethod def _from_factorized(cls, uniques, original): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 377996344dbbc..14f713530868a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -697,7 +697,7 @@ def copy(self: DatetimeLikeArrayT) -> DatetimeLikeArrayT: return new_obj def _values_for_factorize(self): - return self.asi8, iNaT + return self._ndarray, iNaT @classmethod def _from_factorized(cls, values, original): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 85ef3e58576e3..829cf767c448f 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -512,10 +512,8 @@ def _reindex_non_unique(self, target): # -------------------------------------------------------------------- # Indexing Methods - def _maybe_cast_indexer(self, key): - code = self.categories.get_loc(key) - code = self.codes.dtype.type(code) - return code + def _maybe_cast_indexer(self, key) 
-> int: + return self._data._unbox_scalar(key) @Appender(_index_shared_docs["get_indexer"] % _index_doc_kwargs) def get_indexer(self, target, method=None, limit=None, tolerance=None): From 5d8009aa8355ca653872201ae201893494ecae52 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 14 Sep 2020 19:01:36 -0700 Subject: [PATCH 0785/1025] REF: _assert_can_do_op -> _validate_scalar (#36367) --- pandas/core/indexes/base.py | 8 +++++--- pandas/core/indexes/category.py | 5 +++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3d177e08bb0f5..15944565cb254 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2250,7 +2250,7 @@ def fillna(self, value=None, downcast=None): DataFrame.fillna : Fill NaN values of a DataFrame. Series.fillna : Fill NaN Values of a Series. """ - self._assert_can_do_op(value) + value = self._validate_scalar(value) if self.hasnans: result = self.putmask(self._isnan, value) if downcast is None: @@ -4053,12 +4053,14 @@ def _validate_fill_value(self, value): """ return value - def _assert_can_do_op(self, value): + def _validate_scalar(self, value): """ - Check value is valid for scalar op. + Check that this is a scalar value that we can use for setitem-like + operations without changing dtype. 
""" if not is_scalar(value): raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}") + return value @property def _has_complex_internals(self) -> bool: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 829cf767c448f..9e4714060e23e 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -380,8 +380,9 @@ def _isnan(self): @doc(Index.fillna) def fillna(self, value, downcast=None): - self._assert_can_do_op(value) - return CategoricalIndex(self._data.fillna(value), name=self.name) + value = self._validate_scalar(value) + cat = self._data.fillna(value) + return type(self)._simple_new(cat, name=self.name) @cache_readonly def _engine(self): From 7fe631ed474aa1d1b8df5e17eadda4b83e37037e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 14 Sep 2020 19:02:27 -0700 Subject: [PATCH 0786/1025] REF: share code for __setitem__ (#36366) --- pandas/core/arrays/_mixins.py | 12 ++++++++++++ pandas/core/arrays/categorical.py | 14 -------------- pandas/core/arrays/datetimelike.py | 4 +--- pandas/core/arrays/numpy_.py | 8 -------- 4 files changed, 13 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index e9d8671b69c78..284dd31ffcb59 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -9,6 +9,7 @@ from pandas.core.algorithms import take, unique from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray +from pandas.core.indexers import check_array_indexer _T = TypeVar("_T", bound="NDArrayBackedExtensionArray") @@ -156,3 +157,14 @@ def _validate_shift_value(self, fill_value): # TODO: after deprecation in datetimelikearraymixin is enforced, # we can remove this and ust validate_fill_value directly return self._validate_fill_value(fill_value) + + def __setitem__(self, key, value): + key = self._validate_setitem_key(key) + value = self._validate_setitem_value(value) + 
self._ndarray[key] = value + + def _validate_setitem_key(self, key): + return check_array_indexer(self, key) + + def _validate_setitem_value(self, value): + return value diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b6cd0e325f8a6..25073282ec0f6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1894,20 +1894,6 @@ def __getitem__(self, key): return result return self._from_backing_data(result) - def __setitem__(self, key, value): - """ - Item assignment. - - Raises - ------ - ValueError - If (one or more) Value is not in categories or if a assigned - `Categorical` does not have the same categories - """ - key = self._validate_setitem_key(key) - value = self._validate_setitem_value(value) - self._ndarray[key] = value - def _validate_setitem_value(self, value): value = extract_array(value, extract_numpy=True) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 14f713530868a..e8b1c12687584 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -609,9 +609,7 @@ def __setitem__( if no_op: return - value = self._validate_setitem_value(value) - key = check_array_indexer(self, key) - self._ndarray[key] = value + super().__setitem__(key, value) self._maybe_clear_freq() def _maybe_clear_freq(self): diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index d3fa87d5ea7ff..61ffa28d31ba0 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -259,11 +259,6 @@ def __getitem__(self, item): result = type(self)(result) return result - def __setitem__(self, key, value) -> None: - key = self._validate_setitem_key(key) - value = self._validate_setitem_value(value) - self._ndarray[key] = value - def _validate_setitem_value(self, value): value = extract_array(value, extract_numpy=True) @@ -271,9 +266,6 @@ def _validate_setitem_value(self, value): value = np.asarray(value, 
dtype=self._ndarray.dtype) return value - def _validate_setitem_key(self, key): - return check_array_indexer(self, key) - def isna(self) -> np.ndarray: return isna(self._ndarray) From 84a416bd8e37d3de20edf2391af763437399c93d Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 14 Sep 2020 21:04:46 -0500 Subject: [PATCH 0787/1025] CI: Add stale PR action (#36336) --- .github/workflows/stale-pr.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/stale-pr.yml diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml new file mode 100644 index 0000000000000..0cbe4b7dd4582 --- /dev/null +++ b/.github/workflows/stale-pr.yml @@ -0,0 +1,21 @@ +name: "Stale PRs" +on: + schedule: + # * is a special character in YAML so you have to quote this string + - cron: "0 */6 * * *" + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v3 + with: + repo-token: ${{ secrets.GITHUB_TOKEN }} + stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity." 
+ skip-stale-pr-message: false + stale-pr-label: "Stale" + exempt-pr-labels: "Needs Review,Blocked" + days-before-stale: 30 + days-before-close: -1 + remove-stale-when-updated: true + debug-only: true From d5f00bb36548a88a5002cc1cc68d810bb5a9daae Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 14 Sep 2020 21:06:06 -0500 Subject: [PATCH 0788/1025] BUG: add py39 compat check for ast.slice #32766 (#36080) --- pandas/core/computation/expr.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index f5897277d83bf..09fc53716dda9 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -10,6 +10,8 @@ import numpy as np +from pandas.compat import PY39 + import pandas.core.common as com from pandas.core.computation.ops import ( ARITH_OPS_SYMS, @@ -186,7 +188,6 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _stmt_nodes = _filter_nodes(ast.stmt) _expr_nodes = _filter_nodes(ast.expr) _expr_context_nodes = _filter_nodes(ast.expr_context) -_slice_nodes = _filter_nodes(ast.slice) _boolop_nodes = _filter_nodes(ast.boolop) _operator_nodes = _filter_nodes(ast.operator) _unary_op_nodes = _filter_nodes(ast.unaryop) @@ -197,6 +198,9 @@ def _filter_nodes(superclass, all_nodes=_all_nodes): _keyword_nodes = _filter_nodes(ast.keyword) _alias_nodes = _filter_nodes(ast.alias) +if not PY39: + _slice_nodes = _filter_nodes(ast.slice) + # nodes that we don't support directly but are needed for parsing _hacked_nodes = frozenset(["Assign", "Module", "Expr"]) From ffb52d3ca9dcf4829866a5a8929cffa7caa294fe Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Mon, 14 Sep 2020 21:11:47 -0500 Subject: [PATCH 0789/1025] Move sort index to generic (#36177) --- pandas/core/frame.py | 68 +++++++----------------------------------- pandas/core/generic.py | 47 +++++++++++++++++++++++++++++ pandas/core/series.py | 64 +++++++-------------------------------- pandas/core/sorting.py | 67 
+++++++++++++++++++++++++++++++++++++++-- 4 files changed, 134 insertions(+), 112 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 56dc5e54e1d59..bc0e55195fb3e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -143,7 +143,6 @@ ) from pandas.core.reshape.melt import melt from pandas.core.series import Series -from pandas.core.sorting import ensure_key_mapped from pandas.io.common import get_filepath_or_buffer from pandas.io.formats import console, format as fmt @@ -5448,62 +5447,17 @@ def sort_index( C 3 d 4 """ - # TODO: this can be combined with Series.sort_index impl as - # almost identical - - inplace = validate_bool_kwarg(inplace, "inplace") - - axis = self._get_axis_number(axis) - labels = self._get_axis(axis) - labels = ensure_key_mapped(labels, key, levels=level) - - # make sure that the axis is lexsorted to start - # if not we need to reconstruct to get the correct indexer - labels = labels._sort_levels_monotonic() - if level is not None: - new_axis, indexer = labels.sortlevel( - level, ascending=ascending, sort_remaining=sort_remaining - ) - - elif isinstance(labels, MultiIndex): - from pandas.core.sorting import lexsort_indexer - - indexer = lexsort_indexer( - labels._get_codes_for_sorting(), - orders=ascending, - na_position=na_position, - ) - else: - from pandas.core.sorting import nargsort - - # Check monotonic-ness before sort an index - # GH11080 - if (ascending and labels.is_monotonic_increasing) or ( - not ascending and labels.is_monotonic_decreasing - ): - if inplace: - return - else: - return self.copy() - - indexer = nargsort( - labels, kind=kind, ascending=ascending, na_position=na_position - ) - - baxis = self._get_block_manager_axis(axis) - new_data = self._mgr.take(indexer, axis=baxis, verify=False) - - # reconstruct axis if needed - new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() - - if ignore_index: - new_data.axes[1] = ibase.default_index(len(indexer)) - - result = 
self._constructor(new_data) - if inplace: - return self._update_inplace(result) - else: - return result.__finalize__(self, method="sort_index") + return super().sort_index( + axis, + level, + ascending, + inplace, + kind, + na_position, + sort_remaining, + ignore_index, + key, + ) def value_counts( self, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5336d0828881b..0a8dd578bf461 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -40,6 +40,7 @@ CompressionOptions, FilePathOrBuffer, FrameOrSeries, + IndexKeyFunc, IndexLabel, JSONSerializable, Label, @@ -92,6 +93,7 @@ import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.flags import Flags +from pandas.core.indexes import base as ibase from pandas.core.indexes.api import Index, MultiIndex, RangeIndex, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.period import Period, PeriodIndex @@ -100,6 +102,7 @@ from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME from pandas.core.shared_docs import _shared_docs +from pandas.core.sorting import get_indexer_indexer from pandas.core.window import Expanding, ExponentialMovingWindow, Rolling, Window from pandas.io.formats import format as fmt @@ -4409,6 +4412,50 @@ def sort_values( """ raise AbstractMethodError(self) + def sort_index( + self, + axis=0, + level=None, + ascending: bool_t = True, + inplace: bool_t = False, + kind: str = "quicksort", + na_position: str = "last", + sort_remaining: bool_t = True, + ignore_index: bool_t = False, + key: IndexKeyFunc = None, + ): + + inplace = validate_bool_kwarg(inplace, "inplace") + axis = self._get_axis_number(axis) + target = self._get_axis(axis) + + indexer = get_indexer_indexer( + target, level, ascending, kind, na_position, sort_remaining, key + ) + + if indexer is None: + if inplace: + return + else: + return self.copy() + + baxis = 
self._get_block_manager_axis(axis) + new_data = self._mgr.take(indexer, axis=baxis, verify=False) + + # reconstruct axis if needed + new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + + if ignore_index: + axis = 1 if isinstance(self, ABCDataFrame) else 0 + new_data.axes[axis] = ibase.default_index(len(indexer)) + + result = self._constructor(new_data) + + if inplace: + return self._update_inplace(result) + else: + return result.__finalize__(self, method="sort_index") + @doc( klass=_shared_doc_kwargs["klass"], axes=_shared_doc_kwargs["axes"], diff --git a/pandas/core/series.py b/pandas/core/series.py index 69376d8bf80d1..48fae9a0a91cd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3463,59 +3463,17 @@ def sort_index( dtype: int64 """ - # TODO: this can be combined with DataFrame.sort_index impl as - # almost identical - inplace = validate_bool_kwarg(inplace, "inplace") - # Validate the axis parameter - self._get_axis_number(axis) - index = ensure_key_mapped(self.index, key, levels=level) - - if level is not None: - new_index, indexer = index.sortlevel( - level, ascending=ascending, sort_remaining=sort_remaining - ) - - elif isinstance(index, MultiIndex): - from pandas.core.sorting import lexsort_indexer - - labels = index._sort_levels_monotonic() - - indexer = lexsort_indexer( - labels._get_codes_for_sorting(), - orders=ascending, - na_position=na_position, - ) - else: - from pandas.core.sorting import nargsort - - # Check monotonic-ness before sort an index - # GH11080 - if (ascending and index.is_monotonic_increasing) or ( - not ascending and index.is_monotonic_decreasing - ): - if inplace: - return - else: - return self.copy() - - indexer = nargsort( - index, kind=kind, ascending=ascending, na_position=na_position - ) - - indexer = ensure_platform_int(indexer) - new_index = self.index.take(indexer) - new_index = new_index._sort_levels_monotonic() - - new_values = self._values.take(indexer) - result = 
self._constructor(new_values, index=new_index) - - if ignore_index: - result.index = ibase.default_index(len(result)) - - if inplace: - self._update_inplace(result) - else: - return result.__finalize__(self, method="sort_index") + return super().sort_index( + axis, + level, + ascending, + inplace, + kind, + na_position, + sort_remaining, + ignore_index, + key, + ) def argsort(self, axis=0, kind="quicksort", order=None) -> "Series": """ diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index d03b2f29521b7..dd6aadf570baa 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -1,11 +1,21 @@ """ miscellaneous sorting / groupby utilities """ from collections import defaultdict -from typing import TYPE_CHECKING, Callable, DefaultDict, Iterable, List, Optional, Tuple +from typing import ( + TYPE_CHECKING, + Callable, + DefaultDict, + Iterable, + List, + Optional, + Tuple, + Union, +) import numpy as np from pandas._libs import algos, hashtable, lib from pandas._libs.hashtable import unique_label_indices +from pandas._typing import IndexKeyFunc from pandas.core.dtypes.common import ( ensure_int64, @@ -20,11 +30,64 @@ from pandas.core.construction import extract_array if TYPE_CHECKING: - from pandas.core.indexes.base import Index # noqa:F401 + from pandas.core.indexes.base import Index _INT64_MAX = np.iinfo(np.int64).max +def get_indexer_indexer( + target: "Index", + level: Union[str, int, List[str], List[int]], + ascending: bool, + kind: str, + na_position: str, + sort_remaining: bool, + key: IndexKeyFunc, +) -> Optional[np.array]: + """ + Helper method that return the indexer according to input parameters for + the sort_index method of DataFrame and Series. 
+ + Parameters + ---------- + target : Index + level : int or level name or list of ints or list of level names + ascending : bool or list of bools, default True + kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort' + na_position : {'first', 'last'}, default 'last' + sort_remaining : bool, default True + key : callable, optional + + Returns + ------- + Optional[ndarray] + The indexer for the new index. + """ + + target = ensure_key_mapped(target, key, levels=level) + target = target._sort_levels_monotonic() + + if level is not None: + _, indexer = target.sortlevel( + level, ascending=ascending, sort_remaining=sort_remaining + ) + elif isinstance(target, ABCMultiIndex): + indexer = lexsort_indexer( + target._get_codes_for_sorting(), orders=ascending, na_position=na_position, + ) + else: + # Check monotonic-ness before sort an index (GH 11080) + if (ascending and target.is_monotonic_increasing) or ( + not ascending and target.is_monotonic_decreasing + ): + return None + + indexer = nargsort( + target, kind=kind, ascending=ascending, na_position=na_position + ) + return indexer + + def get_group_index(labels, shape, sort: bool, xnull: bool): """ For the particular label_list, gets the offsets into the hypothetical list From 39430500341a66b3251d2d354d5b9771ae756a4e Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Wed, 16 Sep 2020 00:20:47 +0200 Subject: [PATCH 0790/1025] [BUG]: Implement Kahan summation for rolling().mean() to avoid numerical issues (#36348) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/window/aggregations.pyx | 85 ++++++++++++++++++---------- pandas/tests/window/test_rolling.py | 75 ++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8b18b56929acd..f398af6e4dd5e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -118,6 +118,7 @@ Other enhancements - 
:class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) +- :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) .. _whatsnew_120.api_breaking.python: diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 3ec4547d223ce..5f60b884c6ada 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -161,27 +161,42 @@ cdef inline float64_t calc_sum(int64_t minp, int64_t nobs, float64_t sum_x) nogi return result -cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: - """ add a value from the sum calc """ +cdef inline void add_sum(float64_t val, int64_t *nobs, float64_t *sum_x, + float64_t *compensation) nogil: + """ add a value from the sum calc using Kahan summation """ + + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - sum_x[0] = sum_x[0] + val + y = val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t -cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x) nogil: - """ remove a value from the sum calc """ +cdef inline void remove_sum(float64_t val, int64_t *nobs, float64_t *sum_x, + float64_t *compensation) nogil: + """ remove a value from the sum calc using Kahan summation """ + + cdef: + float64_t y, t + # Not NaN if notnan(val): nobs[0] = nobs[0] - 1 - sum_x[0] = sum_x[0] - val + y = - val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t sum_x = 0 
+ float64_t sum_x = 0, compensation_add = 0, compensation_remove = 0 int64_t s, e int64_t nobs = 0, i, j, N = len(values) ndarray[float64_t] output @@ -201,23 +216,23 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, # setup for j in range(s, e): - add_sum(values[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x, &compensation_add) else: # calculate deletes for j in range(start[i - 1], s): - remove_sum(values[j], &nobs, &sum_x) + remove_sum(values[j], &nobs, &sum_x, &compensation_remove) # calculate adds for j in range(end[i - 1], e): - add_sum(values[j], &nobs, &sum_x) + add_sum(values[j], &nobs, &sum_x, &compensation_add) output[i] = calc_sum(minp, nobs, sum_x) if not is_monotonic_bounds: for j in range(s, e): - remove_sum(values[j], &nobs, &sum_x) + remove_sum(values[j], &nobs, &sum_x, &compensation_remove) return output @@ -225,7 +240,7 @@ def roll_sum_variable(ndarray[float64_t] values, ndarray[int64_t] start, def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): cdef: - float64_t val, prev_x, sum_x = 0 + float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0 int64_t range_endpoint int64_t nobs = 0, i, N = len(values) ndarray[float64_t] output @@ -237,16 +252,16 @@ def roll_sum_fixed(ndarray[float64_t] values, ndarray[int64_t] start, with nogil: for i in range(0, range_endpoint): - add_sum(values[i], &nobs, &sum_x) + add_sum(values[i], &nobs, &sum_x, &compensation_add) output[i] = NaN for i in range(range_endpoint, N): val = values[i] - add_sum(val, &nobs, &sum_x) + add_sum(val, &nobs, &sum_x, &compensation_add) if i > win - 1: prev_x = values[i - win] - remove_sum(prev_x, &nobs, &sum_x) + remove_sum(prev_x, &nobs, &sum_x, &compensation_remove) output[i] = calc_sum(minp, nobs, sum_x) @@ -277,24 +292,34 @@ cdef inline float64_t calc_mean(int64_t minp, Py_ssize_t nobs, cdef inline void add_mean(float64_t val, Py_ssize_t *nobs, float64_t 
*sum_x, - Py_ssize_t *neg_ct) nogil: - """ add a value from the mean calc """ + Py_ssize_t *neg_ct, float64_t *compensation) nogil: + """ add a value from the mean calc using Kahan summation """ + cdef: + float64_t y, t # Not NaN if notnan(val): nobs[0] = nobs[0] + 1 - sum_x[0] = sum_x[0] + val + y = val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t if signbit(val): neg_ct[0] = neg_ct[0] + 1 cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, - Py_ssize_t *neg_ct) nogil: - """ remove a value from the mean calc """ + Py_ssize_t *neg_ct, float64_t *compensation) nogil: + """ remove a value from the mean calc using Kahan summation """ + cdef: + float64_t y, t if notnan(val): nobs[0] = nobs[0] - 1 - sum_x[0] = sum_x[0] - val + y = - val - compensation[0] + t = sum_x[0] + y + compensation[0] = t - sum_x[0] - y + sum_x[0] = t if signbit(val): neg_ct[0] = neg_ct[0] - 1 @@ -302,7 +327,7 @@ cdef inline void remove_mean(float64_t val, Py_ssize_t *nobs, float64_t *sum_x, def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp, int64_t win): cdef: - float64_t val, prev_x, sum_x = 0 + float64_t val, prev_x, sum_x = 0, compensation_add = 0, compensation_remove = 0 Py_ssize_t nobs = 0, i, neg_ct = 0, N = len(values) ndarray[float64_t] output @@ -311,16 +336,16 @@ def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, with nogil: for i in range(minp - 1): val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) output[i] = NaN for i in range(minp - 1, N): val = values[i] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) if i > win - 1: prev_x = values[i - win] - remove_mean(prev_x, &nobs, &sum_x, &neg_ct) + remove_mean(prev_x, &nobs, &sum_x, &neg_ct, &compensation_remove) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) @@ -330,7 +355,7 @@ 
def roll_mean_fixed(ndarray[float64_t] values, ndarray[int64_t] start, def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, ndarray[int64_t] end, int64_t minp): cdef: - float64_t val, sum_x = 0 + float64_t val, compensation_add = 0, compensation_remove = 0, sum_x = 0 int64_t s, e Py_ssize_t nobs = 0, i, j, neg_ct = 0, N = len(values) ndarray[float64_t] output @@ -350,26 +375,26 @@ def roll_mean_variable(ndarray[float64_t] values, ndarray[int64_t] start, # setup for j in range(s, e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) else: # calculate deletes for j in range(start[i - 1], s): val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct) + remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove) # calculate adds for j in range(end[i - 1], e): val = values[j] - add_mean(val, &nobs, &sum_x, &neg_ct) + add_mean(val, &nobs, &sum_x, &neg_ct, &compensation_add) output[i] = calc_mean(minp, nobs, neg_ct, sum_x) if not is_monotonic_bounds: for j in range(s, e): val = values[j] - remove_mean(val, &nobs, &sum_x, &neg_ct) + remove_mean(val, &nobs, &sum_x, &neg_ct, &compensation_remove) return output # ---------------------------------------------------------------------- diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 67b20fd2d6daa..88afcec0f7bf4 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -696,3 +696,78 @@ def scaled_sum(*args): expected = DataFrame(data={"X": [0.0, 0.5, 1.0, 1.5, 2.0]}, index=_index) result = df.groupby(**grouping).rolling(1).apply(scaled_sum, raw=raw, args=(2,)) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("add", [0.0, 2.0]) +def test_rolling_numerical_accuracy_kahan_mean(add): + # GH: 36031 implementing kahan summation + df = pd.DataFrame( + {"A": [3002399751580331.0 + add, -0.0, -0.0]}, + index=[ + pd.Timestamp("19700101 09:00:00"), + 
pd.Timestamp("19700101 09:00:03"), + pd.Timestamp("19700101 09:00:06"), + ], + ) + result = ( + df.resample("1s").ffill().rolling("3s", closed="left", min_periods=3).mean() + ) + dates = pd.date_range("19700101 09:00:00", periods=7, freq="S") + expected = pd.DataFrame( + { + "A": [ + np.nan, + np.nan, + np.nan, + 3002399751580330.5, + 2001599834386887.25, + 1000799917193443.625, + 0.0, + ] + }, + index=dates, + ) + tm.assert_frame_equal(result, expected) + + +def test_rolling_numerical_accuracy_kahan_sum(): + # GH: 13254 + df = pd.DataFrame([2.186, -1.647, 0.0, 0.0, 0.0, 0.0], columns=["x"]) + result = df["x"].rolling(3).sum() + expected = pd.Series([np.nan, np.nan, 0.539, -1.647, 0.0, 0.0], name="x") + tm.assert_series_equal(result, expected) + + +def test_rolling_numerical_accuracy_jump(): + # GH: 32761 + index = pd.date_range(start="2020-01-01", end="2020-01-02", freq="60s").append( + pd.DatetimeIndex(["2020-01-03"]) + ) + data = np.random.rand(len(index)) + + df = pd.DataFrame({"data": data}, index=index) + result = df.rolling("60s").mean() + tm.assert_frame_equal(result, df[["data"]]) + + +def test_rolling_numerical_accuracy_small_values(): + # GH: 10319 + s = Series( + data=[0.00012456, 0.0003, -0.0, -0.0], + index=date_range("1999-02-03", "1999-02-06"), + ) + result = s.rolling(1).mean() + tm.assert_series_equal(result, s) + + +def test_rolling_numerical_too_large_numbers(): + # GH: 11645 + dates = pd.date_range("2015-01-01", periods=10, freq="D") + ds = pd.Series(data=range(10), index=dates, dtype=np.float64) + ds[2] = -9e33 + result = ds.rolling(5).mean() + expected = pd.Series( + [np.nan, np.nan, np.nan, np.nan, -1.8e33, -1.8e33, -1.8e33, 0.0, 6.0, 7.0], + index=dates, + ) + tm.assert_series_equal(result, expected) From e4c8e3cbed21d8b88d10de41b2951a4a77ec1c4b Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Wed, 16 Sep 2020 00:32:25 +0200 Subject: [PATCH 0791/1025] DOC: Example for natural sort using key 
argument (#36356) --- environment.yml | 1 + pandas/core/generic.py | 26 ++++++++++++++++++++++++++ requirements-dev.txt | 1 + 3 files changed, 28 insertions(+) diff --git a/environment.yml b/environment.yml index 4622aac1dc6f8..badb0ba94a670 100644 --- a/environment.yml +++ b/environment.yml @@ -106,6 +106,7 @@ dependencies: - cftime # Needed for downstream xarray.CFTimeIndex test - pyreadstat # pandas.read_spss - tabulate>=0.8.3 # DataFrame.to_markdown + - natsort # DataFrame.sort_values - pip: - git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master - git+https://github.com/numpy/numpydoc diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0a8dd578bf461..814f307749d50 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4409,6 +4409,32 @@ def sort_values( 3 NaN 8 4 D 4 D 7 2 e 5 C 4 3 F + + Natural sort with the key argument, + using the `natsort ` package. + + >>> df = pd.DataFrame({ + ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], + ... "value": [10, 20, 30, 40, 50] + ... }) + >>> df + time value + 0 0hr 10 + 1 128hr 20 + 2 72hr 30 + 3 48hr 40 + 4 96hr 50 + >>> from natsort import index_natsorted + >>> df.sort_values( + ... by="time", + ... key=lambda x: np.argsort(index_natsorted(df["time"])) + ... 
) + time value + 0 0hr 10 + 3 48hr 40 + 2 72hr 30 + 4 96hr 50 + 1 128hr 20 """ raise AbstractMethodError(self) diff --git a/requirements-dev.txt b/requirements-dev.txt index cc3775de3a4ba..c53ced35d27fa 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -73,6 +73,7 @@ xarray cftime pyreadstat tabulate>=0.8.3 +natsort git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master git+https://github.com/numpy/numpydoc pyflakes>=2.2.0 \ No newline at end of file From db264c0130c2795045a24e6661b85027f47c8404 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Wed, 16 Sep 2020 00:33:47 +0200 Subject: [PATCH 0792/1025] [TST]: Groupby raised error with duplicate column names (#36389) --- pandas/tests/groupby/test_groupby.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 313b0ea2434f9..1bb40b322cd48 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2126,3 +2126,14 @@ def test_groupby_column_index_name_lost(func): df_grouped = df.groupby([1]) result = getattr(df_grouped, func)().columns tm.assert_index_equal(result, expected) + + +def test_groupby_duplicate_columns(): + # GH: 31735 + df = pd.DataFrame( + {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]} + ).astype(object) + df.columns = ["A", "B", "B"] + result = df.groupby([0, 0, 0, 0]).min() + expected = pd.DataFrame([["e", "a", 1]], columns=["A", "B", "B"]) + tm.assert_frame_equal(result, expected) From 67e3a92aadf5540ab106085be804a572ca44bc6d Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Tue, 15 Sep 2020 17:35:40 -0500 Subject: [PATCH 0793/1025] BUG: Fix MultiIndex column stacking with dupe names (#36371) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/reshape/reshape.py | 14 +++++--------- pandas/tests/frame/test_reshape.py | 13 +++++++++++++ 3 files 
changed, 19 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 8e283aec39786..8ead78a17e9c2 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -29,6 +29,7 @@ Bug fixes - Bug in :func:`read_spss` where passing a ``pathlib.Path`` as ``path`` would raise a ``TypeError`` (:issue:`33666`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) +- Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6ddf53b6493e3..18ebe14763797 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -586,19 +586,15 @@ def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass - to swaplevel: + to swaplevel. - We generally want to convert the level number into a level name, except - when columns do not have names, in which case we must leave as a level - number + If `level_num` matches a column name return the name from + position `level_num`, otherwise return `level_num`. 
""" if level_num in columns.names: return columns.names[level_num] - else: - if columns.names[level_num] is None: - return level_num - else: - return columns.names[level_num] + + return level_num this = frame.copy() diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py index d80ebaa09b6a8..b10fdbb707404 100644 --- a/pandas/tests/frame/test_reshape.py +++ b/pandas/tests/frame/test_reshape.py @@ -1302,3 +1302,16 @@ def test_unstacking_multi_index_df(): ), ) tm.assert_frame_equal(result, expected) + + +def test_stack_positional_level_duplicate_column_names(): + # https://github.com/pandas-dev/pandas/issues/36353 + columns = pd.MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"]) + df = pd.DataFrame([[1, 1, 1, 1]], columns=columns) + result = df.stack(0) + + new_columns = pd.Index(["y", "z"], name="a") + new_index = pd.MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"]) + expected = pd.DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns) + + tm.assert_frame_equal(result, expected) From 9d6ecfb5588d555439a72a42ab9cec78e188dfcf Mon Sep 17 00:00:00 2001 From: Zach Brookler <39153813+zbrookle@users.noreply.github.com> Date: Tue, 15 Sep 2020 18:40:55 -0400 Subject: [PATCH 0794/1025] DOC: Add dataframe_sql to eco system page (#36370) --- doc/source/ecosystem.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index de231e43918f8..624c0551de607 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -303,6 +303,13 @@ HTTP API, and also provides several convenient methods for parsing and analyzing fredapi makes use of pandas and returns data in a Series or DataFrame. This module requires a FRED API key that you can obtain for free on the FRED website. 
+`dataframe_sql `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``dataframe_sql`` is a Python package that translates SQL syntax directly into +operations on pandas DataFrames. This is useful when migrating from a database to +using pandas or for users more comfortable with SQL looking for a way to interface +with pandas. + .. _ecosystem.domain: From 09554ebf5dbf969d34cee5d63fd4413b93726a6a Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Tue, 15 Sep 2020 18:08:22 -0500 Subject: [PATCH 0795/1025] CLN: Clean test_arithmetic.py (#36390) --- pandas/tests/series/test_arithmetic.py | 99 +++++++++++++++----------- 1 file changed, 56 insertions(+), 43 deletions(-) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index c937e357b9dbc..a420a1f7d6bca 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -271,7 +271,6 @@ def test_comparison_flex_basic(self): tm.assert_series_equal(left.gt(right), left > right) tm.assert_series_equal(left.ge(right), left >= right) - # axis for axis in [0, None, "index"]: tm.assert_series_equal(left.eq(right, axis=axis), left == right) tm.assert_series_equal(left.ne(right, axis=axis), left != right) @@ -280,7 +279,6 @@ def test_comparison_flex_basic(self): tm.assert_series_equal(left.gt(right, axis=axis), left > right) tm.assert_series_equal(left.ge(right, axis=axis), left >= right) - # msg = "No axis named 1 for object type" for op in ["eq", "ne", "le", "le", "gt", "ge"]: with pytest.raises(ValueError, match=msg): @@ -553,32 +551,30 @@ def test_comparison_tuples(self): expected = Series([True, False]) tm.assert_series_equal(result, expected) - def test_comparison_operators_with_nas(self): + def test_comparison_operators_with_nas(self, all_compare_operators): + op = all_compare_operators ser = Series(bdate_range("1/1/2000", periods=10), dtype=object) ser[::2] = np.nan - # test that comparisons 
work - ops = ["lt", "le", "gt", "ge", "eq", "ne"] - for op in ops: - val = ser[5] + f = getattr(operator, op) - f = getattr(operator, op) - result = f(ser, val) + # test that comparisons work + val = ser[5] - expected = f(ser.dropna(), val).reindex(ser.index) + result = f(ser, val) + expected = f(ser.dropna(), val).reindex(ser.index) - if op == "ne": - expected = expected.fillna(True).astype(bool) - else: - expected = expected.fillna(False).astype(bool) + if op == "__ne__": + expected = expected.fillna(True).astype(bool) + else: + expected = expected.fillna(False).astype(bool) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result, expected) - # FIXME: dont leave commented-out - # fffffffuuuuuuuuuuuu - # result = f(val, s) - # expected = f(val, s.dropna()).reindex(s.index) - # tm.assert_series_equal(result, expected) + # FIXME: dont leave commented-out + # result = f(val, ser) + # expected = f(val, ser.dropna()).reindex(ser.index) + # tm.assert_series_equal(result, expected) def test_ne(self): ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float) @@ -586,35 +582,52 @@ def test_ne(self): assert tm.equalContents(ts.index != 5, expected) assert tm.equalContents(~(ts.index == 5), expected) - def test_comp_ops_df_compat(self): + @pytest.mark.parametrize( + "left, right", + [ + ( + pd.Series([1, 2, 3], index=list("ABC"), name="x"), + pd.Series([2, 2, 2], index=list("ABD"), name="x"), + ), + ( + pd.Series([1, 2, 3], index=list("ABC"), name="x"), + pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x"), + ), + ], + ) + def test_comp_ops_df_compat(self, left, right): # GH 1134 - s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x") - s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x") - - s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x") - s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x") - - for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]: - - msg = "Can only compare identically-labeled Series objects" - with 
pytest.raises(ValueError, match=msg): - left == right + msg = "Can only compare identically-labeled Series objects" + with pytest.raises(ValueError, match=msg): + left == right + with pytest.raises(ValueError, match=msg): + right == left - with pytest.raises(ValueError, match=msg): - left != right + with pytest.raises(ValueError, match=msg): + left != right + with pytest.raises(ValueError, match=msg): + right != left - with pytest.raises(ValueError, match=msg): - left < right + with pytest.raises(ValueError, match=msg): + left < right + with pytest.raises(ValueError, match=msg): + right < left - msg = "Can only compare identically-labeled DataFrame objects" - with pytest.raises(ValueError, match=msg): - left.to_frame() == right.to_frame() + msg = "Can only compare identically-labeled DataFrame objects" + with pytest.raises(ValueError, match=msg): + left.to_frame() == right.to_frame() + with pytest.raises(ValueError, match=msg): + right.to_frame() == left.to_frame() - with pytest.raises(ValueError, match=msg): - left.to_frame() != right.to_frame() + with pytest.raises(ValueError, match=msg): + left.to_frame() != right.to_frame() + with pytest.raises(ValueError, match=msg): + right.to_frame() != left.to_frame() - with pytest.raises(ValueError, match=msg): - left.to_frame() < right.to_frame() + with pytest.raises(ValueError, match=msg): + left.to_frame() < right.to_frame() + with pytest.raises(ValueError, match=msg): + right.to_frame() < left.to_frame() def test_compare_series_interval_keyword(self): # GH#25338 From 8a0adcbf22d797d7f3b4c9ac9c79c1ff19b35f35 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Wed, 16 Sep 2020 10:06:56 -0500 Subject: [PATCH 0796/1025] BLD/CI: fix py39 ci #36296 (#36393) --- .travis.yml | 3 --- asv_bench/asv.conf.json | 2 +- ci/build39.sh | 3 +-- ci/deps/azure-37-32bit.yaml | 2 +- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-37-locale_slow.yaml | 2 +- ci/deps/azure-37-minimum_versions.yaml | 2 +- ci/deps/azure-37-slow.yaml | 2 +- 
ci/deps/azure-38-locale.yaml | 2 +- ci/deps/azure-38-numpydev.yaml | 2 +- ci/deps/azure-macos-37.yaml | 2 +- ci/deps/azure-windows-37.yaml | 2 +- ci/deps/azure-windows-38.yaml | 2 +- ci/deps/travis-37-arm64.yaml | 2 +- ci/deps/travis-37-cov.yaml | 2 +- ci/deps/travis-37-locale.yaml | 2 +- ci/deps/travis-37.yaml | 2 +- ci/deps/travis-38.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- doc/source/whatsnew/v1.1.3.rst | 15 +++++++++++++++ environment.yml | 2 +- pandas/_libs/writers.pyx | 8 ++------ pyproject.toml | 2 +- requirements-dev.txt | 2 +- setup.py | 3 ++- 25 files changed, 40 insertions(+), 32 deletions(-) diff --git a/.travis.yml b/.travis.yml index 2e98cf47aea3e..93c8238bb5059 100644 --- a/.travis.yml +++ b/.travis.yml @@ -62,9 +62,6 @@ matrix: - arch: arm64 env: - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" - - dist: bionic - env: - - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" before_install: diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index 1863a17e3d5f7..e8e82edabbfa3 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -39,7 +39,7 @@ // followed by the pip installed packages). "matrix": { "numpy": [], - "Cython": ["0.29.16"], + "Cython": ["0.29.21"], "matplotlib": [], "sqlalchemy": [], "scipy": [], diff --git a/ci/build39.sh b/ci/build39.sh index b9c76635df99b..f2ef11d5a71f4 100755 --- a/ci/build39.sh +++ b/ci/build39.sh @@ -3,8 +3,7 @@ sudo apt-get install build-essential gcc xvfb pip install --no-deps -U pip wheel setuptools -pip install numpy python-dateutil pytz pytest pytest-xdist hypothesis -pip install cython --pre # https://github.com/cython/cython/issues/3395 +pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis python setup.py build_ext -inplace python -m pip install --no-build-isolation -e . 
diff --git a/ci/deps/azure-37-32bit.yaml b/ci/deps/azure-37-32bit.yaml index 8e0cd73a9536d..3cdd98485f281 100644 --- a/ci/deps/azure-37-32bit.yaml +++ b/ci/deps/azure-37-32bit.yaml @@ -21,6 +21,6 @@ dependencies: # see comment above - pip - pip: - - cython>=0.29.16 + - cython>=0.29.21 - numpy>=1.16.5 - pytest>=5.0.1 diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index cc996f4077cd9..64480258fe65e 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio diff --git a/ci/deps/azure-37-locale_slow.yaml b/ci/deps/azure-37-locale_slow.yaml index fbb1ea671d696..7f658fe62d268 100644 --- a/ci/deps/azure-37-locale_slow.yaml +++ b/ci/deps/azure-37-locale_slow.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml index 31f82f3304db3..afd5b07cc6654 100644 --- a/ci/deps/azure-37-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.7.1 # tools - - cython=0.29.16 + - cython=0.29.21 - pytest=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml index d17a8a2b0ed9b..13a0d442bcae7 100644 --- a/ci/deps/azure-37-slow.yaml +++ b/ci/deps/azure-37-slow.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml index bb40127b672d3..8ce58e07a8542 100644 --- a/ci/deps/azure-38-locale.yaml +++ b/ci/deps/azure-38-locale.yaml @@ -5,7 +5,7 @@ dependencies: - python=3.8.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - 
pytest>=5.0.1 - pytest-xdist>=1.21 - pytest-asyncio>=0.12.0 diff --git a/ci/deps/azure-38-numpydev.yaml b/ci/deps/azure-38-numpydev.yaml index 37592086d49e3..274be0361c2e5 100644 --- a/ci/deps/azure-38-numpydev.yaml +++ b/ci/deps/azure-38-numpydev.yaml @@ -14,7 +14,7 @@ dependencies: - pytz - pip - pip: - - cython==0.29.16 # GH#34014 + - cython==0.29.21 # GH#34014 - "git+git://github.com/dateutil/dateutil.git" - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - "--pre" diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index a5a69b9a59576..31e0ffca81424 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -31,6 +31,6 @@ dependencies: - xlwt - pip - pip: - - cython>=0.29.16 + - cython>=0.29.21 - pyreadstat - pyxlsb diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 1d15ca41c0f8e..16b4bd72683b4 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 23bede5eb26f1..449bbd05991bf 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.8.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index ea29cbef1272b..d04b1ca0bdcfc 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.13 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml index 33ee6dfffb1a3..d031dc1cc062f 100644 --- a/ci/deps/travis-37-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -6,7 
+6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index 306f74a0101e3..8a0b5b043ceca 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 26d6c2910a7cc..6a15ce1195ea9 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/travis-38.yaml b/ci/deps/travis-38.yaml index b879c0f81dab2..874c8dd96d008 100644 --- a/ci/deps/travis-38.yaml +++ b/ci/deps/travis-38.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.8.* # tools - - cython>=0.29.16 + - cython>=0.29.21 - pytest>=5.0.1 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index fde9f567cc3ec..2196c908ecf37 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -18,7 +18,7 @@ Instructions for installing from source, Python version support ---------------------- -Officially Python 3.7.1 and above, and 3.8. +Officially Python 3.7.1 and above, 3.8, and 3.9. Installing pandas ----------------- diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 8ead78a17e9c2..72526140b6eb8 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -10,6 +10,21 @@ including other versions of pandas. .. 
--------------------------------------------------------------------------- +Enhancements +~~~~~~~~~~~~ + +Added support for new Python version +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas 1.1.3 now supports Python 3.9 (:issue:`36296`). + +Development Changes +^^^^^^^^^^^^^^^^^^^ + +- The minimum version of Cython is now the most recent bug-fix version (0.29.21) (:issue:`36296`). + +.. --------------------------------------------------------------------------- + .. _whatsnew_113.regressions: Fixed regressions diff --git a/environment.yml b/environment.yml index badb0ba94a670..36bbd3d307159 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - asv # building - - cython>=0.29.16 + - cython>=0.29.21 # code checks - black=19.10b0 diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 40c39aabb7a7a..f6823c3cb0d3f 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -1,11 +1,7 @@ import cython -from cython import Py_ssize_t - -from cpython.bytes cimport PyBytes_GET_SIZE -from cpython.unicode cimport PyUnicode_GET_SIZE - import numpy as np +from cpython cimport PyBytes_GET_SIZE, PyUnicode_GET_LENGTH from numpy cimport ndarray, uint8_t ctypedef fused pandas_string: @@ -144,7 +140,7 @@ cpdef inline Py_ssize_t word_len(object val): Py_ssize_t l = 0 if isinstance(val, str): - l = PyUnicode_GET_SIZE(val) + l = PyUnicode_GET_LENGTH(val) elif isinstance(val, bytes): l = PyBytes_GET_SIZE(val) diff --git a/pyproject.toml b/pyproject.toml index f6f8081b6c464..8161e8ad752da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = [ "setuptools", "wheel", - "Cython>=0.29.16,<3", # Note: sync with setup.py + "Cython>=0.29.21,<3", # Note: sync with setup.py "numpy==1.16.5; python_version=='3.7' and platform_system!='AIX'", "numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'", "numpy==1.16.5; python_version=='3.7' and platform_system=='AIX'", diff --git a/requirements-dev.txt 
b/requirements-dev.txt index c53ced35d27fa..fb647c10f72bc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -5,7 +5,7 @@ numpy>=1.16.5 python-dateutil>=2.7.3 pytz asv -cython>=0.29.16 +cython>=0.29.21 black==19.10b0 cpplint flake8<3.8.0 diff --git a/setup.py b/setup.py index f6f0cd9aabc0e..a8dfeb0974195 100755 --- a/setup.py +++ b/setup.py @@ -34,7 +34,7 @@ def is_platform_mac(): min_numpy_ver = "1.16.5" -min_cython_ver = "0.29.16" # note: sync with pyproject.toml +min_cython_ver = "0.29.21" # note: sync with pyproject.toml try: import Cython @@ -199,6 +199,7 @@ def build_extensions(self): "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", "Programming Language :: Cython", "Topic :: Scientific/Engineering", ] From f77ada652e2073e9b600b7fd7c246dcf1f2d4b74 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 16 Sep 2020 08:08:34 -0700 Subject: [PATCH 0797/1025] CLN: Numba internal routines (#36376) --- pandas/core/groupby/generic.py | 20 ++------ pandas/core/groupby/groupby.py | 26 +++++------ pandas/core/groupby/numba_.py | 85 ++++++---------------------------- pandas/core/util/numba_.py | 42 +++++------------ pandas/core/window/numba_.py | 10 ++-- pandas/core/window/rolling.py | 11 +---- 6 files changed, 47 insertions(+), 147 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d4e673d2e538c..a931221ef3ce1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -74,12 +74,11 @@ get_groupby, group_selection_context, ) -from pandas.core.groupby.numba_ import generate_numba_func, split_for_numba from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same import pandas.core.indexes.base as ibase from pandas.core.internals import BlockManager from pandas.core.series import Series -from pandas.core.util.numba_ import NUMBA_FUNC_CACHE, maybe_use_numba +from 
pandas.core.util.numba_ import maybe_use_numba from pandas.plotting import boxplot_frame_groupby @@ -518,29 +517,16 @@ def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs): result = getattr(self, func)(*args, **kwargs) return self._transform_fast(result) - def _transform_general( - self, func, *args, engine="cython", engine_kwargs=None, **kwargs - ): + def _transform_general(self, func, *args, **kwargs): """ Transform with a non-str `func`. """ - if maybe_use_numba(engine): - numba_func, cache_key = generate_numba_func( - func, engine_kwargs, kwargs, "groupby_transform" - ) - klass = type(self._selected_obj) results = [] for name, group in self: object.__setattr__(group, "name", name) - if maybe_use_numba(engine): - values, index = split_for_numba(group) - res = numba_func(values, index, *args) - if cache_key not in NUMBA_FUNC_CACHE: - NUMBA_FUNC_CACHE[cache_key] = numba_func - else: - res = func(group, *args, **kwargs) + res = func(group, *args, **kwargs) if isinstance(res, (ABCDataFrame, ABCSeries)): res = res._values diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ceee78bfebe68..9a14323dd8c3a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1071,16 +1071,15 @@ def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False) sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() starts, ends = lib.generate_slices(sorted_labels, n_groups) - cache_key = (func, "groupby_transform") - if cache_key in NUMBA_FUNC_CACHE: - numba_transform_func = NUMBA_FUNC_CACHE[cache_key] - else: - numba_transform_func = numba_.generate_numba_transform_func( - tuple(args), kwargs, func, engine_kwargs - ) + + numba_transform_func = numba_.generate_numba_transform_func( + tuple(args), kwargs, func, engine_kwargs + ) result = numba_transform_func( sorted_data, sorted_index, starts, ends, 
len(group_keys), len(data.columns) ) + + cache_key = (func, "groupby_transform") if cache_key not in NUMBA_FUNC_CACHE: NUMBA_FUNC_CACHE[cache_key] = numba_transform_func @@ -1106,16 +1105,15 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) sorted_labels = algorithms.take_nd(labels, sorted_index, allow_fill=False) sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() starts, ends = lib.generate_slices(sorted_labels, n_groups) - cache_key = (func, "groupby_agg") - if cache_key in NUMBA_FUNC_CACHE: - numba_agg_func = NUMBA_FUNC_CACHE[cache_key] - else: - numba_agg_func = numba_.generate_numba_agg_func( - tuple(args), kwargs, func, engine_kwargs - ) + + numba_agg_func = numba_.generate_numba_agg_func( + tuple(args), kwargs, func, engine_kwargs + ) result = numba_agg_func( sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) ) + + cache_key = (func, "groupby_agg") if cache_key not in NUMBA_FUNC_CACHE: NUMBA_FUNC_CACHE[cache_key] = numba_agg_func diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index a2dfcd7bddd53..76f50f1387196 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -4,34 +4,17 @@ import numpy as np -from pandas._typing import FrameOrSeries, Scalar +from pandas._typing import Scalar from pandas.compat._optional import import_optional_dependency from pandas.core.util.numba_ import ( NUMBA_FUNC_CACHE, NumbaUtilError, - check_kwargs_and_nopython, get_jit_arguments, jit_user_function, ) -def split_for_numba(arg: FrameOrSeries) -> Tuple[np.ndarray, np.ndarray]: - """ - Split pandas object into its components as numpy arrays for numba functions. - - Parameters - ---------- - arg : Series or DataFrame - - Returns - ------- - (ndarray, ndarray) - values, index - """ - return arg.to_numpy(), arg.index.to_numpy() - - def validate_udf(func: Callable) -> None: """ Validate user defined function for ops when using Numba with groupby ops. 
@@ -67,46 +50,6 @@ def f(values, index, ...): ) -def generate_numba_func( - func: Callable, - engine_kwargs: Optional[Dict[str, bool]], - kwargs: dict, - cache_key_str: str, -) -> Tuple[Callable, Tuple[Callable, str]]: - """ - Return a JITed function and cache key for the NUMBA_FUNC_CACHE - - This _may_ be specific to groupby (as it's only used there currently). - - Parameters - ---------- - func : function - user defined function - engine_kwargs : dict or None - numba.jit arguments - kwargs : dict - kwargs for func - cache_key_str : str - string representing the second part of the cache key tuple - - Returns - ------- - (JITed function, cache key) - - Raises - ------ - NumbaUtilError - """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - check_kwargs_and_nopython(kwargs, nopython) - validate_udf(func) - cache_key = (func, cache_key_str) - numba_func = NUMBA_FUNC_CACHE.get( - cache_key, jit_user_function(func, nopython, nogil, parallel) - ) - return numba_func, cache_key - - def generate_numba_agg_func( args: Tuple, kwargs: Dict[str, Any], @@ -120,7 +63,7 @@ def generate_numba_agg_func( 2. Return a groupby agg function with the jitted function inline Configurations specified in engine_kwargs apply to both the user's - function _AND_ the rolling apply function. + function _AND_ the groupby evaluation loop. 
Parameters ---------- @@ -137,16 +80,15 @@ def generate_numba_agg_func( ------- Numba function """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - - check_kwargs_and_nopython(kwargs, nopython) + nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs) validate_udf(func) + cache_key = (func, "groupby_agg") + if cache_key in NUMBA_FUNC_CACHE: + return NUMBA_FUNC_CACHE[cache_key] numba_func = jit_user_function(func, nopython, nogil, parallel) - numba = import_optional_dependency("numba") - if parallel: loop_range = numba.prange else: @@ -175,17 +117,17 @@ def group_agg( def generate_numba_transform_func( args: Tuple, kwargs: Dict[str, Any], - func: Callable[..., Scalar], + func: Callable[..., np.ndarray], engine_kwargs: Optional[Dict[str, bool]], ) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: """ Generate a numba jitted transform function specified by values from engine_kwargs. 1. jit the user's function - 2. Return a groupby agg function with the jitted function inline + 2. Return a groupby transform function with the jitted function inline Configurations specified in engine_kwargs apply to both the user's - function _AND_ the rolling apply function. + function _AND_ the groupby evaluation loop. 
Parameters ---------- @@ -202,16 +144,15 @@ def generate_numba_transform_func( ------- Numba function """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) - - check_kwargs_and_nopython(kwargs, nopython) + nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs) validate_udf(func) + cache_key = (func, "groupby_transform") + if cache_key in NUMBA_FUNC_CACHE: + return NUMBA_FUNC_CACHE[cache_key] numba_func = jit_user_function(func, nopython, nogil, parallel) - numba = import_optional_dependency("numba") - if parallel: loop_range = numba.prange else: diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index b951cd4f0cc2a..f06dd10d0e497 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -24,37 +24,8 @@ def set_use_numba(enable: bool = False) -> None: GLOBAL_USE_NUMBA = enable -def check_kwargs_and_nopython( - kwargs: Optional[Dict] = None, nopython: Optional[bool] = None -) -> None: - """ - Validate that **kwargs and nopython=True was passed - https://github.com/numba/numba/issues/2916 - - Parameters - ---------- - kwargs : dict, default None - user passed keyword arguments to pass into the JITed function - nopython : bool, default None - nopython parameter - - Returns - ------- - None - - Raises - ------ - NumbaUtilError - """ - if kwargs and nopython: - raise NumbaUtilError( - "numba does not support kwargs with nopython=True: " - "https://github.com/numba/numba/issues/2916" - ) - - def get_jit_arguments( - engine_kwargs: Optional[Dict[str, bool]] = None + engine_kwargs: Optional[Dict[str, bool]] = None, kwargs: Optional[Dict] = None, ) -> Tuple[bool, bool, bool]: """ Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. 
@@ -63,16 +34,27 @@ def get_jit_arguments( ---------- engine_kwargs : dict, default None user passed keyword arguments for numba.JIT + kwargs : dict, default None + user passed keyword arguments to pass into the JITed function Returns ------- (bool, bool, bool) nopython, nogil, parallel + + Raises + ------ + NumbaUtilError """ if engine_kwargs is None: engine_kwargs = {} nopython = engine_kwargs.get("nopython", True) + if kwargs and nopython: + raise NumbaUtilError( + "numba does not support kwargs with nopython=True: " + "https://github.com/numba/numba/issues/2916" + ) nogil = engine_kwargs.get("nogil", False) parallel = engine_kwargs.get("parallel", False) return nopython, nogil, parallel diff --git a/pandas/core/window/numba_.py b/pandas/core/window/numba_.py index aec294c3c84c2..c4858b6e5a4ab 100644 --- a/pandas/core/window/numba_.py +++ b/pandas/core/window/numba_.py @@ -6,7 +6,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.core.util.numba_ import ( - check_kwargs_and_nopython, + NUMBA_FUNC_CACHE, get_jit_arguments, jit_user_function, ) @@ -42,14 +42,14 @@ def generate_numba_apply_func( ------- Numba function """ - nopython, nogil, parallel = get_jit_arguments(engine_kwargs) + nopython, nogil, parallel = get_jit_arguments(engine_kwargs, kwargs) - check_kwargs_and_nopython(kwargs, nopython) + cache_key = (func, "rolling_apply") + if cache_key in NUMBA_FUNC_CACHE: + return NUMBA_FUNC_CACHE[cache_key] numba_func = jit_user_function(func, nopython, nogil, parallel) - numba = import_optional_dependency("numba") - if parallel: loop_range = numba.prange else: diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 00fdf0813b027..21a7164411fb7 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1374,14 +1374,7 @@ def apply( if maybe_use_numba(engine): if raw is False: raise ValueError("raw must be `True` when using the numba engine") - cache_key = (func, "rolling_apply") - if 
cache_key in NUMBA_FUNC_CACHE: - # Return an already compiled version of roll_apply if available - apply_func = NUMBA_FUNC_CACHE[cache_key] - else: - apply_func = generate_numba_apply_func( - args, kwargs, func, engine_kwargs - ) + apply_func = generate_numba_apply_func(args, kwargs, func, engine_kwargs) center = self.center elif engine in ("cython", None): if engine_kwargs is not None: @@ -1403,7 +1396,7 @@ def apply( center=center, floor=0, name=func, - use_numba_cache=engine == "numba", + use_numba_cache=maybe_use_numba(engine), raw=raw, original_func=func, args=args, From dad2e7d107b5e2b077001c5ac75abadc1b06c6e3 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 16 Sep 2020 16:09:10 +0100 Subject: [PATCH 0798/1025] DOC: move release note for #36175 (pt1) (#36378) --- doc/source/whatsnew/v1.1.3.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 72526140b6eb8..9bb063b2b1590 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -33,6 +33,7 @@ Fixed regressions - Fixed regression in :class:`IntegerArray` unary plus and minus operations raising a ``TypeError`` (:issue:`36063`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) +- Fixed regression in :meth:`read_excel` with ``engine="odf"`` that caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`) - ..
--------------------------------------------------------------------------- From 8af68060b251c0dc7ffb6964cba646b9eaa9bfce Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 16 Sep 2020 16:09:44 +0100 Subject: [PATCH 0799/1025] DOC: move release note for #36175 (pt2) (#36379) --- doc/source/whatsnew/v1.2.0.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f398af6e4dd5e..3992e697db7e4 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -313,10 +313,9 @@ I/O - In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) - :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) - :meth:`to_csv` passes compression arguments for `'gzip'` always to `gzip.GzipFile` (:issue:`28103`) -- :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue: `35058`) +- :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) -- Bug in :meth:`read_excel` with `engine="odf"` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, and :issue:`35802`) Plotting ^^^^^^^^ From cdb8b8504252331d2f7c7dccd55608cd82da2be3 Mon Sep 17 00:00:00 2001 From: Chris Lynch Date: Wed, 16 Sep 2020 13:59:17 -0400 Subject: [PATCH 0800/1025] remove trailing commas for black update (#36399) Co-authored-by: Lynch --- pandas/tests/arrays/integer/test_arithmetic.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) 
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py index f549a7caeab1d..cf382dd5e37e0 100644 --- a/pandas/tests/arrays/integer/test_arithmetic.py +++ b/pandas/tests/arrays/integer/test_arithmetic.py @@ -279,9 +279,7 @@ def test_unary_minus_nullable_int(any_signed_nullable_int_dtype, source, target) tm.assert_extension_array_equal(result, expected) -@pytest.mark.parametrize( - "source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]], -) +@pytest.mark.parametrize("source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]]) def test_unary_plus_nullable_int(any_signed_nullable_int_dtype, source): dtype = any_signed_nullable_int_dtype expected = pd.array(source, dtype=dtype) From e58185edbe2f62fc3f3fc72c4c17d2c9937affae Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Wed, 16 Sep 2020 21:31:08 -0500 Subject: [PATCH 0801/1025] BUG: Always cast to Categorical in lexsort_indexer (#36385) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/sorting.py | 9 +-------- .../tests/frame/methods/test_sort_values.py | 20 +++++++++++++++++++ 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 9bb063b2b1590..3f8413bd492ca 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -45,6 +45,7 @@ Bug fixes - Bug in :func:`read_spss` where passing a ``pathlib.Path`` as ``path`` would raise a ``TypeError`` (:issue:`33666`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) +- Bug in :meth:`DataFrame.sort_values` raising an ``AttributeError`` when sorting on a key that casts column to categorical dtype (:issue:`36383`) - Bug in :meth:`DataFrame.stack` raising a 
``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index dd6aadf570baa..ec62192464665 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -20,7 +20,6 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, - is_categorical_dtype, is_extension_array_dtype, ) from pandas.core.dtypes.generic import ABCMultiIndex @@ -294,13 +293,7 @@ def lexsort_indexer( keys = [ensure_key_mapped(k, key) for k in keys] for k, order in zip(keys, orders): - # we are already a Categorical - if is_categorical_dtype(k): - cat = k - - # create the Categorical - else: - cat = Categorical(k, ordered=True) + cat = Categorical(k, ordered=True) if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {na_position}") diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index c60e7e3b1bdb6..0ca232ec433e7 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -691,3 +691,23 @@ def test_sort_values_key_dict_axis(self): result = df.sort_values(1, key=lambda col: -col, axis=1) expected = df.loc[:, ::-1] tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("ordered", [True, False]) + def test_sort_values_key_casts_to_categorical(self, ordered): + # https://github.com/pandas-dev/pandas/issues/36383 + categories = ["c", "b", "a"] + df = pd.DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]}) + + def sorter(key): + if key.name == "y": + return pd.Series( + pd.Categorical(key, categories=categories, ordered=ordered) + ) + return key + + result = df.sort_values(by=["x", "y"], key=sorter) + expected = pd.DataFrame( + {"x": [1, 1, 1], "y": ["c", "b", "a"]}, index=pd.Index([2, 1, 0]) + ) + + 
tm.assert_frame_equal(result, expected) From 21ef2481d7d1eaf8294624de088e5c5670223b28 Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Thu, 17 Sep 2020 04:39:40 +0200 Subject: [PATCH 0802/1025] DEPR: DataFrame.lookup (#35224) --- doc/source/user_guide/indexing.rst | 22 ++++++++---- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/frame.py | 15 +++++++- pandas/tests/frame/indexing/test_indexing.py | 36 +++++++++++++++----- 4 files changed, 58 insertions(+), 16 deletions(-) diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index 74abbc9503db0..b11baad1e3eb5 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1480,17 +1480,27 @@ default value. s.get('a') # equivalent to s['a'] s.get('x', default=-1) -The :meth:`~pandas.DataFrame.lookup` method -------------------------------------------- +.. _indexing.lookup: + +Looking up values by index/column labels +---------------------------------------- Sometimes you want to extract a set of values given a sequence of row labels -and column labels, and the ``lookup`` method allows for this and returns a -NumPy array. For instance: +and column labels; this can be achieved by ``DataFrame.melt`` combined with filtering the corresponding +rows with ``DataFrame.loc``. For instance: .. ipython:: python - dflookup = pd.DataFrame(np.random.rand(20, 4), columns = ['A', 'B', 'C', 'D']) - dflookup.lookup(list(range(0, 10, 2)), ['B', 'C', 'A', 'B', 'D']) + df = pd.DataFrame({'col': ["A", "A", "B", "B"], + 'A': [80, 23, np.nan, 22], + 'B': [80, 55, 76, 67]}) + df + melt = df.melt('col') + melt = melt.loc[melt['col'] == melt['variable'], 'value'] + melt.reset_index(drop=True) + +Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method, +which was deprecated in version 1.2.0. ..
_indexing.class: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3992e697db7e4..2c95fd95ffc7b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -210,6 +210,7 @@ Deprecations - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - Deprecated parameter ``dtype`` in :~meth:`Index.copy` on method all index classes. Use the :meth:`Index.astype` method instead for changing dtype(:issue:`35853`) - Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) +- :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bc0e55195fb3e..36dfe43bfd708 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3842,10 +3842,15 @@ def _series(self): def lookup(self, row_labels, col_labels) -> np.ndarray: """ Label-based "fancy indexing" function for DataFrame. - Given equal-length arrays of row and column labels, return an array of the values corresponding to each (row, col) pair. + .. deprecated:: 1.2.0 + DataFrame.lookup is deprecated, + use DataFrame.melt and DataFrame.loc instead. + For an example see :meth:`~pandas.DataFrame.lookup` + in the user guide. + Parameters ---------- row_labels : sequence @@ -3858,6 +3863,14 @@ def lookup(self, row_labels, col_labels) -> np.ndarray: numpy.ndarray The found values. 
""" + msg = ( + "The 'lookup' method is deprecated and will be" + "removed in a future version." + "You can use DataFrame.melt and DataFrame.loc" + "as a substitute." + ) + warnings.warn(msg, FutureWarning, stacklevel=2) + n = len(row_labels) if n != len(col_labels): raise ValueError("Row labels must have same size as column labels") diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index e4549dfb3e68d..b947be705a329 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1340,7 +1340,8 @@ def test_lookup_float(self, float_frame): df = float_frame rows = list(df.index) * len(df.columns) cols = list(df.columns) * len(df.index) - result = df.lookup(rows, cols) + with tm.assert_produces_warning(FutureWarning): + result = df.lookup(rows, cols) expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)]) tm.assert_numpy_array_equal(result, expected) @@ -1349,7 +1350,8 @@ def test_lookup_mixed(self, float_string_frame): df = float_string_frame rows = list(df.index) * len(df.columns) cols = list(df.columns) * len(df.index) - result = df.lookup(rows, cols) + with tm.assert_produces_warning(FutureWarning): + result = df.lookup(rows, cols) expected = np.array( [df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_ @@ -1365,7 +1367,8 @@ def test_lookup_bool(self): "mask_c": [False, True, False, True], } ) - df["mask"] = df.lookup(df.index, "mask_" + df["label"]) + with tm.assert_produces_warning(FutureWarning): + df["mask"] = df.lookup(df.index, "mask_" + df["label"]) exp_mask = np.array( [df.loc[r, c] for r, c in zip(df.index, "mask_" + df["label"])] @@ -1376,13 +1379,16 @@ def test_lookup_bool(self): def test_lookup_raises(self, float_frame): with pytest.raises(KeyError, match="'One or more row labels was not found'"): - float_frame.lookup(["xyz"], ["A"]) + with tm.assert_produces_warning(FutureWarning): + float_frame.lookup(["xyz"], ["A"]) with 
pytest.raises(KeyError, match="'One or more column labels was not found'"): - float_frame.lookup([float_frame.index[0]], ["xyz"]) + with tm.assert_produces_warning(FutureWarning): + float_frame.lookup([float_frame.index[0]], ["xyz"]) with pytest.raises(ValueError, match="same size"): - float_frame.lookup(["a", "b", "c"], ["a"]) + with tm.assert_produces_warning(FutureWarning): + float_frame.lookup(["a", "b", "c"], ["a"]) def test_lookup_requires_unique_axes(self): # GH#33041 raise with a helpful error message @@ -1393,14 +1399,17 @@ def test_lookup_requires_unique_axes(self): # homogeneous-dtype case with pytest.raises(ValueError, match="requires unique index and columns"): - df.lookup(rows, cols) + with tm.assert_produces_warning(FutureWarning): + df.lookup(rows, cols) with pytest.raises(ValueError, match="requires unique index and columns"): - df.T.lookup(cols, rows) + with tm.assert_produces_warning(FutureWarning): + df.T.lookup(cols, rows) # heterogeneous dtype df["B"] = 0 with pytest.raises(ValueError, match="requires unique index and columns"): - df.lookup(rows, cols) + with tm.assert_produces_warning(FutureWarning): + df.lookup(rows, cols) def test_set_value(self, float_frame): for idx in float_frame.index: @@ -2232,3 +2241,12 @@ def test_object_casting_indexing_wraps_datetimelike(): assert blk.dtype == "m8[ns]" # we got the right block val = blk.iget((0, 0)) assert isinstance(val, pd.Timedelta) + + +def test_lookup_deprecated(): + # GH18262 + df = pd.DataFrame( + {"col": ["A", "A", "B", "B"], "A": [80, 23, np.nan, 22], "B": [80, 55, 76, 67]} + ) + with tm.assert_produces_warning(FutureWarning): + df.lookup(df.index, df["col"]) From 392e46b679a497981e5f30152cfbda391540d14e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 16 Sep 2020 19:41:13 -0700 Subject: [PATCH 0803/1025] ENH/BUG: consistently cast strs to datetimelike for searchsorted (#36346) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/datetimelike.py | 3 +- 
pandas/core/indexes/datetimelike.py | 8 ----- pandas/tests/arrays/test_datetimelike.py | 41 ++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2c95fd95ffc7b..6436b2eceb33f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -119,6 +119,7 @@ Other enhancements - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) +- :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) .. _whatsnew_120.api_breaking.python: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e8b1c12687584..15cbdf882eb7b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -845,8 +845,7 @@ def _validate_searchsorted_value(self, value): if not is_list_like(value): value = self._validate_scalar(value, msg, cast_str=True) else: - # TODO: cast_str? 
we accept it for scalar - value = self._validate_listlike(value, "searchsorted") + value = self._validate_listlike(value, "searchsorted", cast_str=True) rv = self._unbox(value) return self._rebox_native(rv) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5ba5732c710f7..984ab49cbc517 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -178,14 +178,6 @@ def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): @doc(IndexOpsMixin.searchsorted, klass="Datetime-like Index") def searchsorted(self, value, side="left", sorter=None): - if isinstance(value, str): - raise TypeError( - "searchsorted requires compatible dtype or scalar, " - f"not {type(value).__name__}" - ) - if isinstance(value, Index): - value = value._data - return self._data.searchsorted(value, side=side, sorter=sorter) _can_hold_na = True diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 83b98525d3e8a..f512b168d2795 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -252,6 +252,47 @@ def test_searchsorted(self): else: assert result == 10 + @pytest.mark.parametrize("box", [None, "index", "series"]) + def test_searchsorted_castable_strings(self, arr1d, box): + if isinstance(arr1d, DatetimeArray): + tz = arr1d.tz + if ( + tz is not None + and tz is not pytz.UTC + and not isinstance(tz, pytz._FixedOffset) + ): + # If we have e.g. tzutc(), when we cast to string and parse + # back we get pytz.UTC, and then consider them different timezones + # so incorrectly raise. 
+ pytest.xfail(reason="timezone comparisons inconsistent") + + arr = arr1d + if box is None: + pass + elif box == "index": + # Test the equivalent Index.searchsorted method while we're here + arr = self.index_cls(arr) + else: + # Test the equivalent Series.searchsorted method while we're here + arr = pd.Series(arr) + + # scalar + result = arr.searchsorted(str(arr[1])) + assert result == 1 + + result = arr.searchsorted(str(arr[2]), side="right") + assert result == 3 + + result = arr.searchsorted([str(x) for x in arr[1:3]]) + expected = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + with pytest.raises(TypeError): + arr.searchsorted("foo") + + with pytest.raises(TypeError): + arr.searchsorted([str(arr[1]), "baz"]) + def test_getitem_2d(self, arr1d): # 2d slicing on a 1D array expected = type(arr1d)(arr1d._data[:, np.newaxis], dtype=arr1d.dtype) From e59ba3d9917e2a7e6f476a46d5500ba2d51c5faf Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Thu, 17 Sep 2020 03:11:00 -0500 Subject: [PATCH 0804/1025] Bump flake8 version in pre-commit-config.yaml (#36412) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fcd0ecdc9fcd2..78e28aad5865b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: black language_version: python3 - repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.7 + rev: 3.8.3 hooks: - id: flake8 language: python_venv From 0d2dcc5d0878d4833f3a053fcfb8f1f4785dd3e6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Sep 2020 09:06:47 -0700 Subject: [PATCH 0805/1025] REF: re-use validate_listlike for _convert_arr_indexer (#36415) --- pandas/core/indexes/datetimelike.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 
984ab49cbc517..f1b5aa9e98bdb 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -8,7 +8,6 @@ from pandas._libs import NaT, Timedelta, iNaT, join as libjoin, lib from pandas._libs.tslibs import BaseOffset, Resolution, Tick, timezones -from pandas._libs.tslibs.parsing import DateParseError from pandas._typing import Callable, Label from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -31,7 +30,6 @@ from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin from pandas.core.base import IndexOpsMixin import pandas.core.common as com -from pandas.core.construction import array as pd_array, extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.extension import ( @@ -586,19 +584,12 @@ def _wrap_joined_index(self, joined: np.ndarray, other): @doc(Index._convert_arr_indexer) def _convert_arr_indexer(self, keyarr): - if lib.infer_dtype(keyarr) == "string": - # Weak reasoning that indexer is a list of strings - # representing datetime or timedelta or period - try: - extension_arr = pd_array(keyarr, self.dtype) - except (ValueError, DateParseError): - # Fail to infer keyarr from self.dtype - return keyarr - - converted_arr = extract_array(extension_arr, extract_numpy=True) - else: - converted_arr = com.asarray_tuplesafe(keyarr) - return converted_arr + try: + return self._data._validate_listlike( + keyarr, "convert_arr_indexer", cast_str=True, allow_object=True + ) + except (ValueError, TypeError): + return com.asarray_tuplesafe(keyarr) class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin, Int64Index): From 9d6acc4e99021dcffd6be0ee3a1a3fb4c66a4250 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 17 Sep 2020 18:07:48 +0200 Subject: [PATCH 0806/1025] REF: use BlockManager.to_native_types in formatting code (#36417) --- pandas/core/internals/managers.py | 7 +++++++ 
pandas/io/formats/csvs.py | 10 ++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 3f446874ffd0e..865412f159ea1 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -631,6 +631,13 @@ def replace_list( bm._consolidate_inplace() return bm + def to_native_types(self, **kwargs) -> "BlockManager": + """ + Convert values to native types (strings / python objects) that are used + in formatting (repr / csv). + """ + return self.apply("to_native_types", **kwargs) + def is_consolidated(self) -> bool: """ Return True if more than one block with the same dtype diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 1bda16d126905..403eead686f89 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -334,18 +334,12 @@ def _save_body(self) -> None: self._save_chunk(start_i, end_i) def _save_chunk(self, start_i: int, end_i: int) -> None: - ncols = self.obj.shape[-1] - data = [None] * ncols - # create the data for a chunk slicer = slice(start_i, end_i) - df = self.obj.iloc[slicer] - mgr = df._mgr - res = mgr.apply("to_native_types", **self._number_format) - for i in range(len(res.items)): - data[i] = res.iget_values(i) + res = df._mgr.to_native_types(**self._number_format) + data = [res.iget_values(i) for i in range(len(res.items))] ix = self.data_index.to_native_types(slicer=slicer, **self._number_format) libwriters.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) From 84187d91c48d6ed5b48d576f26e43dcfdd092aa4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Sep 2020 09:08:29 -0700 Subject: [PATCH 0807/1025] REF: re-use _maybe_promote for _is_convertible_to_index_for_join (#36416) --- pandas/core/indexes/datetimelike.py | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py 
index f1b5aa9e98bdb..498202bea90dc 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -848,11 +848,11 @@ def join( """ See Index.join """ - if self._is_convertible_to_index_for_join(other): - try: - other = type(self)(other) - except (TypeError, ValueError): - pass + pself, pother = self._maybe_promote(other) + if pself is not self or pother is not other: + return pself.join( + pother, how=how, level=level, return_indexers=return_indexers, sort=sort + ) this, other = self._maybe_utc_convert(other) return Index.join( @@ -881,25 +881,6 @@ def _maybe_utc_convert(self, other): other = other.tz_convert("UTC") return this, other - @classmethod - def _is_convertible_to_index_for_join(cls, other: Index) -> bool: - """ - return a boolean whether I can attempt conversion to a - DatetimeIndex/TimedeltaIndex - """ - if isinstance(other, cls): - return False - elif len(other) > 0 and other.inferred_type not in ( - "floating", - "mixed-integer", - "integer", - "integer-na", - "mixed-integer-float", - "mixed", - ): - return True - return False - # -------------------------------------------------------------------- # List-Like Methods From 02505be4f078cfa4c6f44bcb6f8d80e0ee40dacb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Sep 2020 09:10:34 -0700 Subject: [PATCH 0808/1025] REF: _validate_foo pattern for IntervalArray (#36414) --- pandas/core/arrays/interval.py | 69 ++++++++++++++++++++++----------- pandas/core/indexes/interval.py | 19 ++------- 2 files changed, 51 insertions(+), 37 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 436a7dd062c4a..028472ad93e52 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -640,17 +640,10 @@ def fillna(self, value=None, method=None, limit=None): if limit is not None: raise TypeError("limit is not supported for IntervalArray.") - if not isinstance(value, Interval): - msg = ( - "'IntervalArray.fillna' only 
supports filling with a " - f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead." - ) - raise TypeError(msg) - - self._check_closed_matches(value, name="value") + value_left, value_right = self._validate_fillna_value(value) - left = self.left.fillna(value=value.left) - right = self.right.fillna(value=value.right) + left = self.left.fillna(value=value_left) + right = self.right.fillna(value=value_right) return self._shallow_copy(left, right) @property @@ -845,18 +838,7 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): fill_left = fill_right = fill_value if allow_fill: - if fill_value is None: - fill_left = fill_right = self.left._na_value - elif is_interval(fill_value): - self._check_closed_matches(fill_value, name="fill_value") - fill_left, fill_right = fill_value.left, fill_value.right - elif not is_scalar(fill_value) and notna(fill_value): - msg = ( - "'IntervalArray.fillna' only supports filling with a " - "'scalar pandas.Interval or NA'. " - f"Got a '{type(fill_value).__name__}' instead." - ) - raise ValueError(msg) + fill_left, fill_right = self._validate_fill_value(fill_value) left_take = take( self.left, indices, allow_fill=allow_fill, fill_value=fill_left @@ -867,6 +849,49 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): return self._shallow_copy(left_take, right_take) + def _validate_fill_value(self, value): + if is_interval(value): + self._check_closed_matches(value, name="fill_value") + fill_left, fill_right = value.left, value.right + elif not is_scalar(value) and notna(value): + msg = ( + "'IntervalArray.fillna' only supports filling with a " + "'scalar pandas.Interval or NA'. " + f"Got a '{type(value).__name__}' instead." 
+ ) + raise ValueError(msg) + else: + fill_left = fill_right = self.left._na_value + return fill_left, fill_right + + def _validate_fillna_value(self, value): + if not isinstance(value, Interval): + msg = ( + "'IntervalArray.fillna' only supports filling with a " + f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead." + ) + raise TypeError(msg) + + self._check_closed_matches(value, name="value") + return value.left, value.right + + def _validate_insert_value(self, value): + if isinstance(value, Interval): + if value.closed != self.closed: + raise ValueError( + "inserted item must be closed on the same side as the index" + ) + left_insert = value.left + right_insert = value.right + elif is_scalar(value) and isna(value): + # GH#18295 + left_insert = right_insert = value + else: + raise ValueError( + "can only insert Interval objects and NA into an IntervalIndex" + ) + return left_insert, right_insert + def value_counts(self, dropna=True): """ Returns a Series containing counts of each interval. 
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index ad0a7ea32a1cc..9ef584f5b7fbc 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -333,7 +333,9 @@ def from_tuples( # -------------------------------------------------------------------- @Appender(Index._shallow_copy.__doc__) - def _shallow_copy(self, values=None, name: Label = lib.no_default): + def _shallow_copy( + self, values: Optional[IntervalArray] = None, name: Label = lib.no_default + ): name = self.name if name is lib.no_default else name cache = self._cache.copy() if values is None else {} if values is None: @@ -927,20 +929,7 @@ def insert(self, loc, item): ------- IntervalIndex """ - if isinstance(item, Interval): - if item.closed != self.closed: - raise ValueError( - "inserted item must be closed on the same side as the index" - ) - left_insert = item.left - right_insert = item.right - elif is_scalar(item) and isna(item): - # GH 18295 - left_insert = right_insert = item - else: - raise ValueError( - "can only insert Interval objects and NA into an IntervalIndex" - ) + left_insert, right_insert = self._data._validate_insert_value(item) new_left = self.left.insert(loc, left_insert) new_right = self.right.insert(loc, right_insert) From 137ce63372cc8085984360ea9f5d8b46097f731a Mon Sep 17 00:00:00 2001 From: Chris Barnes Date: Thu, 17 Sep 2020 12:34:10 -0400 Subject: [PATCH 0809/1025] Update isort version in pre-commit config (#36428) Previously, a mismatch between the isort versions specified by pre-commit and by the files used on CI made it impossible to make a commit which passed both pre-commit and CI lints. 
--- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 78e28aad5865b..309e22e71a523 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: - file args: [--append-config=flake8/cython-template.cfg] - repo: https://github.com/pre-commit/mirrors-isort - rev: v4.3.21 + rev: v5.2.2 hooks: - id: isort language: python_venv From 2eeac14e42be42ed03e5659aa36defb81a7ac557 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Thu, 17 Sep 2020 11:36:46 -0500 Subject: [PATCH 0810/1025] CLN: Clean series/test_arithmetic.py (#36406) --- pandas/tests/series/test_arithmetic.py | 98 +++++++++++--------------- 1 file changed, 42 insertions(+), 56 deletions(-) diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index a420a1f7d6bca..8fad6ee1cca8b 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -260,73 +260,59 @@ def test_sub_datetimelike_align(self): class TestSeriesFlexComparison: - def test_comparison_flex_basic(self): + @pytest.mark.parametrize("axis", [0, None, "index"]) + def test_comparison_flex_basic(self, axis, all_compare_operators): + op = all_compare_operators.strip("__") left = pd.Series(np.random.randn(10)) right = pd.Series(np.random.randn(10)) + result = getattr(left, op)(right, axis=axis) + expected = getattr(operator, op)(left, right) + tm.assert_series_equal(result, expected) - tm.assert_series_equal(left.eq(right), left == right) - tm.assert_series_equal(left.ne(right), left != right) - tm.assert_series_equal(left.le(right), left < right) - tm.assert_series_equal(left.lt(right), left <= right) - tm.assert_series_equal(left.gt(right), left > right) - tm.assert_series_equal(left.ge(right), left >= right) - - for axis in [0, None, "index"]: - tm.assert_series_equal(left.eq(right, axis=axis), left == right) - 
tm.assert_series_equal(left.ne(right, axis=axis), left != right) - tm.assert_series_equal(left.le(right, axis=axis), left < right) - tm.assert_series_equal(left.lt(right, axis=axis), left <= right) - tm.assert_series_equal(left.gt(right, axis=axis), left > right) - tm.assert_series_equal(left.ge(right, axis=axis), left >= right) + def test_comparison_bad_axis(self, all_compare_operators): + op = all_compare_operators.strip("__") + left = pd.Series(np.random.randn(10)) + right = pd.Series(np.random.randn(10)) msg = "No axis named 1 for object type" - for op in ["eq", "ne", "le", "le", "gt", "ge"]: - with pytest.raises(ValueError, match=msg): - getattr(left, op)(right, axis=1) + with pytest.raises(ValueError, match=msg): + getattr(left, op)(right, axis=1) - def test_comparison_flex_alignment(self): + @pytest.mark.parametrize( + "values, op", + [ + ([False, False, True, False], "eq"), + ([True, True, False, True], "ne"), + ([False, False, True, False], "le"), + ([False, False, False, False], "lt"), + ([False, True, True, False], "ge"), + ([False, True, False, False], "gt"), + ], + ) + def test_comparison_flex_alignment(self, values, op): left = Series([1, 3, 2], index=list("abc")) right = Series([2, 2, 2], index=list("bcd")) + result = getattr(left, op)(right) + expected = pd.Series(values, index=list("abcd")) + tm.assert_series_equal(result, expected) - exp = pd.Series([False, False, True, False], index=list("abcd")) - tm.assert_series_equal(left.eq(right), exp) - - exp = pd.Series([True, True, False, True], index=list("abcd")) - tm.assert_series_equal(left.ne(right), exp) - - exp = pd.Series([False, False, True, False], index=list("abcd")) - tm.assert_series_equal(left.le(right), exp) - - exp = pd.Series([False, False, False, False], index=list("abcd")) - tm.assert_series_equal(left.lt(right), exp) - - exp = pd.Series([False, True, True, False], index=list("abcd")) - tm.assert_series_equal(left.ge(right), exp) - - exp = pd.Series([False, True, False, False], 
index=list("abcd")) - tm.assert_series_equal(left.gt(right), exp) - - def test_comparison_flex_alignment_fill(self): + @pytest.mark.parametrize( + "values, op, fill_value", + [ + ([False, False, True, True], "eq", 2), + ([True, True, False, False], "ne", 2), + ([False, False, True, True], "le", 0), + ([False, False, False, True], "lt", 0), + ([True, True, True, False], "ge", 0), + ([True, True, False, False], "gt", 0), + ], + ) + def test_comparison_flex_alignment_fill(self, values, op, fill_value): left = Series([1, 3, 2], index=list("abc")) right = Series([2, 2, 2], index=list("bcd")) - - exp = pd.Series([False, False, True, True], index=list("abcd")) - tm.assert_series_equal(left.eq(right, fill_value=2), exp) - - exp = pd.Series([True, True, False, False], index=list("abcd")) - tm.assert_series_equal(left.ne(right, fill_value=2), exp) - - exp = pd.Series([False, False, True, True], index=list("abcd")) - tm.assert_series_equal(left.le(right, fill_value=0), exp) - - exp = pd.Series([False, False, False, True], index=list("abcd")) - tm.assert_series_equal(left.lt(right, fill_value=0), exp) - - exp = pd.Series([True, True, True, False], index=list("abcd")) - tm.assert_series_equal(left.ge(right, fill_value=0), exp) - - exp = pd.Series([True, True, False, False], index=list("abcd")) - tm.assert_series_equal(left.gt(right, fill_value=0), exp) + result = getattr(left, op)(right, fill_value=fill_value) + expected = pd.Series(values, index=list("abcd")) + tm.assert_series_equal(result, expected) class TestSeriesComparison: From dc17e3a65540e736252e240824c6a398e51ec1b5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Sep 2020 09:38:34 -0700 Subject: [PATCH 0811/1025] BUG: Categorical.sort_values inplace breaking views (#36404) --- pandas/core/arrays/categorical.py | 2 +- pandas/tests/arrays/categorical/test_sorting.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 
25073282ec0f6..ba7534082d0d6 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1541,7 +1541,7 @@ def sort_values( sorted_idx = nargsort(self, ascending=ascending, na_position=na_position) if inplace: - self._codes = self._codes[sorted_idx] + self._codes[:] = self._codes[sorted_idx] else: codes = self._codes[sorted_idx] return self._from_backing_data(codes) diff --git a/pandas/tests/arrays/categorical/test_sorting.py b/pandas/tests/arrays/categorical/test_sorting.py index 2a0ef043bf9a9..9589216557cd5 100644 --- a/pandas/tests/arrays/categorical/test_sorting.py +++ b/pandas/tests/arrays/categorical/test_sorting.py @@ -66,7 +66,9 @@ def test_sort_values(self): # sort (inplace order) cat1 = cat.copy() + orig_codes = cat1._codes cat1.sort_values(inplace=True) + assert cat1._codes is orig_codes exp = np.array(["a", "b", "c", "d"], dtype=object) tm.assert_numpy_array_equal(cat1.__array__(), exp) tm.assert_index_equal(res.categories, cat.categories) From 36545cc341895d2379a1f362d41332589afcf569 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Thu, 17 Sep 2020 23:39:10 +0700 Subject: [PATCH 0812/1025] TYP: alias IndexLabel without Optional (#36401) --- pandas/_typing.py | 2 +- pandas/core/generic.py | 2 +- pandas/io/formats/csvs.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 7aef5c02e290f..16d81c0d39cbe 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -83,7 +83,7 @@ Axis = Union[str, int] Label = Optional[Hashable] -IndexLabel = Optional[Union[Label, Sequence[Label]]] +IndexLabel = Union[Label, Sequence[Label]] Level = Union[Label, int] Ordered = Optional[bool] JSONSerializable = Optional[Union[PythonScalar, List, Dict]] diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 814f307749d50..cc18b8681200f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3154,7 +3154,7 @@ def 
to_csv( columns: Optional[Sequence[Label]] = None, header: Union[bool_t, List[str]] = True, index: bool_t = True, - index_label: IndexLabel = None, + index_label: Optional[IndexLabel] = None, mode: str = "w", encoding: Optional[str] = None, compression: CompressionOptions = "infer", diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 403eead686f89..4250a08f748d7 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -42,7 +42,7 @@ def __init__( cols: Optional[Sequence[Label]] = None, header: Union[bool, Sequence[Hashable]] = True, index: bool = True, - index_label: IndexLabel = None, + index_label: Optional[IndexLabel] = None, mode: str = "w", encoding: Optional[str] = None, errors: str = "strict", @@ -100,7 +100,7 @@ def index_label(self) -> IndexLabel: return self._index_label @index_label.setter - def index_label(self, index_label: IndexLabel) -> None: + def index_label(self, index_label: Optional[IndexLabel]) -> None: if index_label is not False: if index_label is None: index_label = self._get_index_label_from_obj() From 18c0df0c1bd26c2f037724bebfb9fb8c5a3125d5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Sep 2020 09:40:48 -0700 Subject: [PATCH 0813/1025] REF: implement putmask for CI/DTI/TDI/PI (#36400) --- pandas/core/arrays/categorical.py | 5 +++++ pandas/core/indexes/base.py | 3 --- pandas/core/indexes/category.py | 11 +++++++++++ pandas/core/indexes/datetimelike.py | 13 ++++++++++++- pandas/tests/indexes/common.py | 7 ++++--- 5 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ba7534082d0d6..1cb428218296f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1171,6 +1171,11 @@ def map(self, mapper): # ------------------------------------------------------------- # Validators; ideally these can be de-duplicated + def _validate_where_value(self, value): + if is_scalar(value): + return 
self._validate_fill_value(value) + return self._validate_listlike(value) + def _validate_insert_value(self, value) -> int: code = self.categories.get_indexer([value]) if (code == -1) and not (is_scalar(value) and isna(value)): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 15944565cb254..a2f11160b2fdc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4232,9 +4232,6 @@ def putmask(self, mask, value): try: converted = self._validate_fill_value(value) np.putmask(values, mask, converted) - if is_period_dtype(self.dtype): - # .values cast to object, so we need to cast back - values = type(self)(values)._data return self._shallow_copy(values) except (ValueError, TypeError) as err: if is_object_dtype(self): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 9e4714060e23e..d73b36eff69f3 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -422,6 +422,17 @@ def where(self, cond, other=None): cat = Categorical(values, dtype=self.dtype) return type(self)._simple_new(cat, name=self.name) + def putmask(self, mask, value): + try: + code_value = self._data._validate_where_value(value) + except (TypeError, ValueError): + return self.astype(object).putmask(mask, value) + + codes = self._data._ndarray.copy() + np.putmask(codes, mask, code_value) + cat = self._data._from_backing_data(codes) + return type(self)._simple_new(cat, name=self.name) + def reindex(self, target, method=None, level=None, limit=None, tolerance=None): """ Create index with target's values (move/add/delete values as necessary) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 498202bea90dc..122977eee99fb 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -474,7 +474,18 @@ def where(self, cond, other=None): raise TypeError(f"Where requires matching dtype, not {oth}") from err result = np.where(cond, 
values, other).astype("i8") - arr = type(self._data)._simple_new(result, dtype=self.dtype) + arr = self._data._from_backing_data(result) + return type(self)._simple_new(arr, name=self.name) + + def putmask(self, mask, value): + try: + value = self._data._validate_where_value(value) + except (TypeError, ValueError): + return self.astype(object).putmask(mask, value) + + result = self._data._ndarray.copy() + np.putmask(result, mask, value) + arr = self._data._from_backing_data(result) return type(self)._simple_new(arr, name=self.name) def _summary(self, name=None) -> str: diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 11dc232af8de4..0e9e5c0b32d18 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -846,16 +846,17 @@ def test_map_str(self): def test_putmask_with_wrong_mask(self): # GH18368 index = self.create_index() + fill = index[0] msg = "putmask: mask and data must be the same size" with pytest.raises(ValueError, match=msg): - index.putmask(np.ones(len(index) + 1, np.bool_), 1) + index.putmask(np.ones(len(index) + 1, np.bool_), fill) with pytest.raises(ValueError, match=msg): - index.putmask(np.ones(len(index) - 1, np.bool_), 1) + index.putmask(np.ones(len(index) - 1, np.bool_), fill) with pytest.raises(ValueError, match=msg): - index.putmask("foo", 1) + index.putmask("foo", fill) @pytest.mark.parametrize("copy", [True, False]) @pytest.mark.parametrize("name", [None, "foo"]) From 85425fd365c63e83ea30f6f62c063c3e90d806f3 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Sep 2020 09:46:12 -0700 Subject: [PATCH 0814/1025] REF: share __getitem__ for Categorical/PandasArray/DTA/TDA/PA (#36391) --- pandas/core/arrays/_mixins.py | 26 ++++++++++++++++++++++++++ pandas/core/arrays/categorical.py | 14 ++++---------- pandas/core/arrays/datetimelike.py | 23 +++++++---------------- pandas/core/arrays/numpy_.py | 14 ++++---------- 4 files changed, 41 insertions(+), 36 deletions(-) diff --git 
a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 284dd31ffcb59..a947ab64f7380 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -2,6 +2,7 @@ import numpy as np +from pandas._libs import lib from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc @@ -30,6 +31,12 @@ def _from_backing_data(self: _T, arr: np.ndarray) -> _T: """ raise AbstractMethodError(self) + def _box_func(self, x): + """ + Wrap numpy type in our dtype.type if necessary. + """ + return x + # ------------------------------------------------------------------------ def take( @@ -168,3 +175,22 @@ def _validate_setitem_key(self, key): def _validate_setitem_value(self, value): return value + + def __getitem__(self, key): + if lib.is_integer(key): + # fast-path + result = self._ndarray[key] + if self.ndim == 1: + return self._box_func(result) + return self._from_backing_data(result) + + key = self._validate_getitem_key(key) + result = self._ndarray[key] + if lib.is_scalar(result): + return self._box_func(result) + + result = self._from_backing_data(result) + return result + + def _validate_getitem_key(self, key): + return check_array_indexer(self, key) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1cb428218296f..ef69d6565cfeb 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1887,17 +1887,11 @@ def __getitem__(self, key): """ Return an item. 
""" - if isinstance(key, (int, np.integer)): - i = self._codes[key] - return self._box_func(i) - - key = check_array_indexer(self, key) - - result = self._codes[key] - if result.ndim > 1: + result = super().__getitem__(key) + if getattr(result, "ndim", 0) > 1: + result = result._ndarray deprecate_ndim_indexing(result) - return result - return self._from_backing_data(result) + return result def _validate_setitem_value(self, value): value = extract_array(value, extract_numpy=True) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 15cbdf882eb7b..a5b91afac338d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -539,23 +539,11 @@ def __getitem__(self, key): This getitem defers to the underlying array, which by-definition can only handle list-likes, slices, and integer scalars """ - - if lib.is_integer(key): - # fast-path - result = self._ndarray[key] - if self.ndim == 1: - return self._box_func(result) - return self._from_backing_data(result) - - key = self._validate_getitem_key(key) - result = self._ndarray[key] + result = super().__getitem__(key) if lib.is_scalar(result): - return self._box_func(result) - - result = self._from_backing_data(result) + return result - freq = self._get_getitem_freq(key) - result._freq = freq + result._freq = self._get_getitem_freq(key) return result def _validate_getitem_key(self, key): @@ -572,7 +560,7 @@ def _validate_getitem_key(self, key): # this for now (would otherwise raise in check_array_indexer) pass else: - key = check_array_indexer(self, key) + key = super()._validate_getitem_key(key) return key def _get_getitem_freq(self, key): @@ -582,7 +570,10 @@ def _get_getitem_freq(self, key): is_period = is_period_dtype(self.dtype) if is_period: freq = self.freq + elif self.ndim != 1: + freq = None else: + key = self._validate_getitem_key(key) # maybe ndarray[bool] -> slice freq = None if isinstance(key, slice): if self.freq is not None and key.step is not 
None: diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 61ffa28d31ba0..afcae2c5c8b43 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -19,7 +19,6 @@ from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionOpsMixin from pandas.core.construction import extract_array -from pandas.core.indexers import check_array_indexer from pandas.core.missing import backfill_1d, pad_1d @@ -248,16 +247,11 @@ def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): # ------------------------------------------------------------------------ # Pandas ExtensionArray Interface - def __getitem__(self, item): - if isinstance(item, type(self)): - item = item._ndarray + def _validate_getitem_key(self, key): + if isinstance(key, type(self)): + key = key._ndarray - item = check_array_indexer(self, item) - - result = self._ndarray[item] - if not lib.is_scalar(item): - result = type(self)(result) - return result + return super()._validate_getitem_key(key) def _validate_setitem_value(self, value): value = extract_array(value, extract_numpy=True) From 698664d4543f9c4c24bff19a45d55c76723e4236 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Sep 2020 09:47:20 -0700 Subject: [PATCH 0815/1025] CLN: remove unnecessary _convert_index_indexer (#36394) --- pandas/core/indexes/base.py | 17 +---------------- pandas/core/indexes/category.py | 4 ---- pandas/core/indexes/numeric.py | 9 --------- pandas/tests/indexes/test_numeric.py | 8 ++++++-- 4 files changed, 7 insertions(+), 31 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a2f11160b2fdc..11490e2e0be29 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3252,7 +3252,7 @@ def _convert_listlike_indexer(self, keyarr): Return tuple-safe keys. 
""" if isinstance(keyarr, Index): - keyarr = self._convert_index_indexer(keyarr) + pass else: keyarr = self._convert_arr_indexer(keyarr) @@ -3275,21 +3275,6 @@ def _convert_arr_indexer(self, keyarr): keyarr = com.asarray_tuplesafe(keyarr) return keyarr - def _convert_index_indexer(self, keyarr): - """ - Convert an Index indexer to the appropriate dtype. - - Parameters - ---------- - keyarr : Index (or sub-class) - Indexer to convert. - - Returns - ------- - converted_keyarr : Index (or sub-class) - """ - return keyarr - def _convert_list_indexer(self, keyarr): """ Convert a list-like indexer to the appropriate dtype. diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d73b36eff69f3..c798ae0bd4e4d 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -583,10 +583,6 @@ def _convert_arr_indexer(self, keyarr): return self._shallow_copy(keyarr) - @doc(Index._convert_index_indexer) - def _convert_index_indexer(self, keyarr): - return self._shallow_copy(keyarr) - @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side, kind): if kind == "loc": diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index f6859cbc4c0a2..026f128bae4be 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -308,15 +308,6 @@ def _convert_arr_indexer(self, keyarr): return com.asarray_tuplesafe(keyarr, dtype=dtype) - @doc(Index._convert_index_indexer) - def _convert_index_indexer(self, keyarr): - # Cast the indexer to uint64 if possible so - # that the values returned from indexing are - # also uint64. 
- if keyarr.is_integer(): - return keyarr.astype(np.uint64) - return keyarr - # ---------------------------------------------------------------- @classmethod diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index 1ffdbbc9afd3f..bbd72b2ac5d60 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -631,7 +631,11 @@ def test_range_float_union_dtype(): tm.assert_index_equal(result, expected) -def test_uint_index_does_not_convert_to_float64(): +@pytest.mark.parametrize( + "box", + [list, lambda x: np.array(x, dtype=object), lambda x: pd.Index(x, dtype=object)], +) +def test_uint_index_does_not_convert_to_float64(box): # https://github.com/pandas-dev/pandas/issues/28279 # https://github.com/pandas-dev/pandas/issues/28023 series = pd.Series( @@ -646,7 +650,7 @@ def test_uint_index_does_not_convert_to_float64(): ], ) - result = series.loc[[7606741985629028552, 17876870360202815256]] + result = series.loc[box([7606741985629028552, 17876870360202815256])] expected = UInt64Index( [7606741985629028552, 17876870360202815256, 17876870360202815256], From 0d9fd5e246faab0016bda67da49e4f63868660cd Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 17 Sep 2020 17:50:59 +0100 Subject: [PATCH 0816/1025] PERF: StringArray construction (#36325) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arrays/string_.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6436b2eceb33f..6923b42d3340b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -221,7 +221,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`) +- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from 
array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 381968f9724b6..cef35f2b1137c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -199,13 +199,18 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): assert dtype == "string" result = np.asarray(scalars, dtype="object") - # convert non-na-likes to str, and nan-likes to StringDtype.na_value result = lib.ensure_string_array( result, na_value=StringDtype.na_value, copy=copy ) - return cls(result) + # Manually creating new array avoids the validation step in the __init__, so is + # faster. Refactor need for validation? + new_string_array = object.__new__(cls) + new_string_array._dtype = StringDtype() + new_string_array._ndarray = result + + return new_string_array @classmethod def _from_sequence_of_strings(cls, strings, dtype=None, copy=False): From 7e904538c299c9137a92a19416d3e96ad0206687 Mon Sep 17 00:00:00 2001 From: Zak Kohler Date: Thu, 17 Sep 2020 17:20:06 -0400 Subject: [PATCH 0817/1025] Fix typo in docstring 'handler' --> 'handle' (#36427) --- pandas/io/excel/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 46327daac2e43..45da2d7d28fab 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -50,7 +50,7 @@ If you want to pass in a path object, pandas accepts any ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. 
via builtin ``open`` function) or ``StringIO``. sheet_name : str, int, list, or None, default 0 Strings are used for sheet names. Integers are used in zero-indexed From c31407c7828bd66dbb94ba3fa9a6d17155d74c12 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Thu, 17 Sep 2020 17:03:03 -0500 Subject: [PATCH 0818/1025] ADMIN: Update stale PR action (#36382) --- .github/workflows/stale-pr.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index 0cbe4b7dd4582..a6aece34478d9 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -12,9 +12,9 @@ jobs: with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity." - skip-stale-pr-message: false + skip-stale-pr-message: true stale-pr-label: "Stale" - exempt-pr-labels: "Needs Review,Blocked" + exempt-pr-labels: "Needs Review,Blocked,Needs Discussion" days-before-stale: 30 days-before-close: -1 remove-stale-when-updated: true From c39169d681c0f70b3260fe83ea9a2ddc5771d84d Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Thu, 17 Sep 2020 18:39:41 -0400 Subject: [PATCH 0819/1025] Fix documentation for new float_precision on read_csv (#36358) --- pandas/io/parsers.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 0f1cce273a146..43ffbe6bdd66c 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -338,9 +338,12 @@ option can improve performance because there is no longer any I/O overhead. float_precision : str, optional Specifies which converter the C engine should use for floating-point - values. The options are `None` or `high` for the ordinary converter, - `legacy` for the original lower precision pandas converter, and - `round_trip` for the round-trip converter. + values. 
The options are ``None`` or 'high' for the ordinary converter, + 'legacy' for the original lower precision pandas converter, and + 'round_trip' for the round-trip converter. + + .. versionchanged:: 1.2 + storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc., if using a URL that will @@ -349,7 +352,7 @@ a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values. - .. versionadded:: 1.2.0 + .. versionadded:: 1.2 Returns ------- @@ -2290,9 +2293,10 @@ def TextParser(*args, **kwds): can be inferred, there often will be a large parsing speed-up. float_precision : str, optional Specifies which converter the C engine should use for floating-point - values. The options are None for the ordinary converter, - 'high' for the high-precision converter, and 'round_trip' for the - round-trip converter. + values. The options are `None` or `high` for the ordinary converter, + `legacy` for the original lower precision pandas converter, and + `round_trip` for the round-trip converter. + .. 
versionchanged:: 1.2 """ kwds["engine"] = "python" From 2f400b784301cf2e57e2d08f81735a77afae0142 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 17 Sep 2020 18:11:43 -0500 Subject: [PATCH 0820/1025] BLD/CI fix arm64 build #36397 (#36403) --- .travis.yml | 6 +----- ci/deps/travis-37-arm64.yaml | 1 - ci/setup_env.sh | 3 ++- pandas/conftest.py | 3 +++ pandas/tests/arithmetic/test_datetime64.py | 1 + pandas/tests/frame/test_constructors.py | 1 + pandas/tests/groupby/test_groupby_dropna.py | 1 + pandas/tests/groupby/transform/test_transform.py | 1 + pandas/tests/indexes/common.py | 1 + pandas/tests/indexes/multi/test_duplicates.py | 1 + pandas/tests/indexes/multi/test_integrity.py | 1 + pandas/tests/indexes/multi/test_setops.py | 1 + pandas/tests/indexes/period/test_indexing.py | 1 + pandas/tests/indexing/interval/test_interval.py | 1 + .../tests/indexing/multiindex/test_chaining_and_caching.py | 1 + pandas/tests/indexing/test_chaining_and_caching.py | 1 + pandas/tests/indexing/test_loc.py | 1 + pandas/tests/test_sorting.py | 1 + pandas/tests/tseries/offsets/test_offsets_properties.py | 2 ++ pandas/tests/tseries/offsets/test_ticks.py | 1 + 20 files changed, 23 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 93c8238bb5059..a38e90bbce8ba 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ matrix: - arch: arm64 env: - - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" + - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" - env: - JOB="3.7, locale" ENV_FILE="ci/deps/travis-37-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" @@ -58,10 +58,6 @@ matrix: services: - mysql - postgresql - allow_failures: - - arch: arm64 - env: - - JOB="3.7, arm64" PYTEST_WORKERS=8 
ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard)" before_install: diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index d04b1ca0bdcfc..8df6104f43a50 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -1,6 +1,5 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - python=3.7.* diff --git a/ci/setup_env.sh b/ci/setup_env.sh index aa43d8b7dd00a..961433204cfbb 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -42,8 +42,9 @@ else fi if [ "${TRAVIS_CPU_ARCH}" == "arm64" ]; then + sudo apt-get update sudo apt-get -y install xvfb - CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.2-1/Miniforge3-4.8.2-1-Linux-aarch64.sh" + CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.5-0/Miniforge3-4.8.5-0-Linux-aarch64.sh" else CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-$CONDA_OS.sh" fi diff --git a/pandas/conftest.py b/pandas/conftest.py index e79370e53ead6..604815d496f80 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -55,6 +55,9 @@ def pytest_configure(config): ) config.addinivalue_line("markers", "high_memory: mark a test as a high-memory only") config.addinivalue_line("markers", "clipboard: mark a pd.read_clipboard test") + config.addinivalue_line( + "markers", "arm_slow: mark a test as slow for arm64 architecture" + ) def pytest_addoption(parser): diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 5dfaea7c77420..0dd389ed516c7 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -749,6 +749,7 @@ class TestDatetime64Arithmetic: # ------------------------------------------------------------- # Addition/Subtraction of timedelta-like + @pytest.mark.arm_slow def test_dt64arr_add_timedeltalike_scalar( self, tz_naive_fixture, two_hours, box_with_array ): diff --git 
a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index eb334e811c5a4..63a2160e128ed 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -2588,6 +2588,7 @@ def test_to_frame_with_falsey_names(self): result = DataFrame(Series(name=0, dtype=object)).dtypes tm.assert_series_equal(result, expected) + @pytest.mark.arm_slow @pytest.mark.parametrize("dtype", [None, "uint8", "category"]) def test_constructor_range_dtype(self, dtype): expected = DataFrame({"A": [0, 1, 2, 3, 4]}, dtype=dtype or "int64") diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 66db06eeebdfb..deb73acbb158a 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -238,6 +238,7 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs): tm.assert_frame_equal(grouped, expected) +@pytest.mark.arm_slow @pytest.mark.parametrize( "datetime1, datetime2", [ diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py index c09f35526a6bf..97be039e16ebb 100644 --- a/pandas/tests/groupby/transform/test_transform.py +++ b/pandas/tests/groupby/transform/test_transform.py @@ -675,6 +675,7 @@ def test_groupby_cum_skipna(op, skipna, input, exp): tm.assert_series_equal(expected, result) +@pytest.mark.arm_slow @pytest.mark.parametrize( "op, args, targop", [ diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 0e9e5c0b32d18..b01cafc9b0d5c 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -906,6 +906,7 @@ def test_is_unique(self): index_na_dup = index_na.insert(0, np.nan) assert index_na_dup.is_unique is False + @pytest.mark.arm_slow def test_engine_reference_cycle(self): # GH27585 index = self.create_index() diff --git a/pandas/tests/indexes/multi/test_duplicates.py 
b/pandas/tests/indexes/multi/test_duplicates.py index 9add4b478da47..aa2f37dad152c 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -241,6 +241,7 @@ def test_duplicated(idx_dup, keep, expected): tm.assert_numpy_array_equal(result, expected) +@pytest.mark.arm_slow def test_duplicated_large(keep): # GH 9125 n, k = 200, 5000 diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py index c776a33717ccd..6a353fe1ad6e7 100644 --- a/pandas/tests/indexes/multi/test_integrity.py +++ b/pandas/tests/indexes/multi/test_integrity.py @@ -118,6 +118,7 @@ def test_consistency(): assert index.is_unique is False +@pytest.mark.arm_slow def test_hash_collisions(): # non-smoke test that we don't get hash collisions diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index d7427ee622977..6d4928547cad1 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -37,6 +37,7 @@ def test_intersection_base(idx, sort, klass): first.intersection([1, 2, 3], sort=sort) +@pytest.mark.arm_slow @pytest.mark.parametrize("klass", [MultiIndex, np.array, Series, list]) def test_union_base(idx, sort, klass): first = idx[::-1] diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index d2499b85ad181..f42499147cdbb 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -157,6 +157,7 @@ def test_getitem_list_periods(self): exp = ts.iloc[[1]] tm.assert_series_equal(ts[[Period("2012-01-02", freq="D")]], exp) + @pytest.mark.arm_slow def test_getitem_seconds(self): # GH#6716 didx = date_range(start="2013/01/01 09:00:00", freq="S", periods=4000) diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index 634020982b1c2..8976e87a1b75a 100644 --- 
a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -71,6 +71,7 @@ def test_non_matching(self): with pytest.raises(KeyError, match="^$"): s.loc[[-1, 3]] + @pytest.mark.arm_slow def test_large_series(self): s = Series( np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001)) diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index d3b13336e2a44..62c0171fe641f 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -49,6 +49,7 @@ def test_cache_updating(): assert result == 2 +@pytest.mark.arm_slow def test_indexer_caching(): # GH5727 # make sure that indexers are in the _internal_names_set diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 9910ef1b04b1a..66835c586e6c7 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -132,6 +132,7 @@ def test_setitem_chained_setfault(self): result = df.head() tm.assert_frame_equal(result, expected) + @pytest.mark.arm_slow def test_detect_chained_assignment(self): pd.set_option("chained_assignment", "raise") diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 9a6f30ec920cc..9b9bca77e17ec 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -785,6 +785,7 @@ def test_loc_non_unique(self): expected = DataFrame({"A": [2, 4, 5], "B": [4, 6, 7]}, index=[1, 1, 2]) tm.assert_frame_equal(result, expected) + @pytest.mark.arm_slow def test_loc_non_unique_memory_error(self): # GH 4280 diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py index 98297474243e4..deb7434694d01 100644 --- a/pandas/tests/test_sorting.py +++ b/pandas/tests/test_sorting.py @@ -60,6 +60,7 @@ def 
test_int64_overflow(self): assert left[k] == v assert len(left) == len(right) + @pytest.mark.arm_slow def test_int64_overflow_moar(self): # GH9096 diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py index ca14b202ef888..0fa9081d606b0 100644 --- a/pandas/tests/tseries/offsets/test_offsets_properties.py +++ b/pandas/tests/tseries/offsets/test_offsets_properties.py @@ -12,6 +12,7 @@ from hypothesis import assume, given, strategies as st from hypothesis.extra.dateutil import timezones as dateutil_timezones from hypothesis.extra.pytz import timezones as pytz_timezones +import pytest import pandas as pd from pandas import Timestamp @@ -84,6 +85,7 @@ # Offset-specific behaviour tests +@pytest.mark.arm_slow @given(gen_random_datetime, gen_yqm_offset) def test_on_offset_implementations(dt, offset): assume(not offset.normalize) diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index 10c239c683bc0..cc23f5f3201da 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -64,6 +64,7 @@ def test_tick_add_sub(cls, n, m): assert left - right == expected +@pytest.mark.arm_slow @pytest.mark.parametrize("cls", tick_classes) @settings(deadline=None) @example(n=2, m=3) From df425f563adc2c5eb0488d55c6d9914ca8f1eb52 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Sep 2020 16:20:29 -0700 Subject: [PATCH 0821/1025] BUG: FooIndex.insert casting datetimelike NAs incorrectly (#36374) --- pandas/core/arrays/interval.py | 4 ++-- pandas/core/dtypes/missing.py | 3 +++ pandas/core/indexes/numeric.py | 9 +++++++-- pandas/tests/indexes/interval/test_interval.py | 11 ++++++++++- pandas/tests/indexes/ranges/test_range.py | 6 +++++- pandas/tests/indexes/test_numeric.py | 8 ++++++-- 6 files changed, 33 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 
028472ad93e52..706b089e929a9 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -37,7 +37,7 @@ ABCPeriodIndex, ABCSeries, ) -from pandas.core.dtypes.missing import isna, notna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core.algorithms import take, value_counts from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs @@ -883,7 +883,7 @@ def _validate_insert_value(self, value): ) left_insert = value.left right_insert = value.right - elif is_scalar(value) and isna(value): + elif is_valid_nat_for_dtype(value, self.left.dtype): # GH#18295 left_insert = right_insert = value else: diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index d2e4974741b88..0b4aab0ac9d88 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -608,6 +608,9 @@ def is_valid_nat_for_dtype(obj, dtype: DtypeObj) -> bool: return not isinstance(obj, np.timedelta64) if dtype.kind == "m": return not isinstance(obj, np.datetime64) + if dtype.kind in ["i", "u", "f", "c"]: + # Numeric + return obj is not NaT and not isinstance(obj, (np.datetime64, np.timedelta64)) # must be PeriodDType return not isinstance(obj, (np.datetime64, np.timedelta64)) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 026f128bae4be..49a70600c09fa 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -28,7 +28,7 @@ ABCSeries, ABCUInt64Index, ) -from pandas.core.dtypes.missing import isna +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core import algorithms import pandas.core.common as com @@ -164,7 +164,12 @@ def is_all_dates(self) -> bool: def insert(self, loc: int, item): # treat NA values as nans: if is_scalar(item) and isna(item): - item = self._na_value + if is_valid_nat_for_dtype(item, self.dtype): + item = self._na_value + else: + # NaT, np.datetime64("NaT"), np.timedelta64("NaT") 
+ return self.astype(object).insert(loc, item) + return super().insert(loc, item) def _union(self, other, sort): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 42849e0bbb5c7..734c98af3d058 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -204,11 +204,20 @@ def test_insert(self, data): # GH 18295 (test missing) na_idx = IntervalIndex([np.nan], closed=data.closed) - for na in (np.nan, pd.NaT, None): + for na in [np.nan, None, pd.NA]: expected = data[:1].append(na_idx).append(data[1:]) result = data.insert(1, na) tm.assert_index_equal(result, expected) + if data.left.dtype.kind not in ["m", "M"]: + # trying to insert pd.NaT into a numeric-dtyped Index should cast/raise + msg = "can only insert Interval objects and NA into an IntervalIndex" + with pytest.raises(ValueError, match=msg): + result = data.insert(1, pd.NaT) + else: + result = data.insert(1, pd.NaT) + tm.assert_index_equal(result, expected) + def test_is_unique_interval(self, closed): """ Interval specific tests for is_unique in addition to base class tests diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 172cd4a106ac1..899c8cbc0425d 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -100,10 +100,14 @@ def test_insert(self): # GH 18295 (test missing) expected = Float64Index([0, np.nan, 1, 2, 3, 4]) - for na in (np.nan, pd.NaT, None): + for na in [np.nan, None, pd.NA]: result = RangeIndex(5).insert(1, na) tm.assert_index_equal(result, expected) + result = RangeIndex(5).insert(1, pd.NaT) + expected = pd.Index([0, pd.NaT, 1, 2, 3, 4], dtype=object) + tm.assert_index_equal(result, expected) + def test_delete(self): idx = RangeIndex(5, name="Foo") diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index bbd72b2ac5d60..e1623a14c333e 
100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -84,10 +84,14 @@ def test_index_groupby(self): expected = {ex_keys[0]: idx[[0, 5]], ex_keys[1]: idx[[1, 4]]} tm.assert_dict_equal(idx.groupby(to_groupby), expected) - def test_insert(self, nulls_fixture): + def test_insert_na(self, nulls_fixture): # GH 18295 (test missing) index = self.create_index() - expected = Float64Index([index[0], np.nan] + list(index[1:])) + + if nulls_fixture is pd.NaT: + expected = Index([index[0], pd.NaT] + list(index[1:]), dtype=object) + else: + expected = Float64Index([index[0], np.nan] + list(index[1:])) result = index.insert(1, nulls_fixture) tm.assert_index_equal(result, expected) From 4db78c334e2109214c2630421f2db512e134a3f7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 17 Sep 2020 16:21:40 -0700 Subject: [PATCH 0822/1025] REF: de-duplicate IntervalIndex compat code (#36372) --- pandas/core/indexes/base.py | 17 +++++++++++++---- pandas/core/indexes/interval.py | 34 +++------------------------------ pandas/core/indexing.py | 2 +- 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 11490e2e0be29..222ae589ea7fc 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3316,7 +3316,7 @@ def _can_reindex(self, indexer): ValueError if its a duplicate axis """ # trying to reindex on an axis with duplicates - if not self.is_unique and len(indexer): + if not self._index_as_unique and len(indexer): raise ValueError("cannot reindex from a duplicate axis") def reindex(self, target, method=None, level=None, limit=None, tolerance=None): @@ -3360,8 +3360,7 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None): if self.equals(target): indexer = None else: - # check is_overlapping for IntervalIndex compat - if self.is_unique and not getattr(self, "is_overlapping", False): + if self._index_as_unique: indexer = 
self.get_indexer( target, method=method, limit=limit, tolerance=tolerance ) @@ -4759,11 +4758,21 @@ def get_indexer_for(self, target, **kwargs): numpy.ndarray List of indices. """ - if self.is_unique: + if self._index_as_unique: return self.get_indexer(target, **kwargs) indexer, _ = self.get_indexer_non_unique(target, **kwargs) return indexer + @property + def _index_as_unique(self): + """ + Whether we should treat this as unique for the sake of + get_indexer vs get_indexer_non_unique. + + For IntervalIndex compat. + """ + return self.is_unique + def _maybe_promote(self, other: "Index"): """ When dealing with an object-dtype Index and a non-object Index, see diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 9ef584f5b7fbc..2f43787919faa 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -516,22 +516,6 @@ def is_overlapping(self) -> bool: # GH 23309 return self._engine.is_overlapping - def _can_reindex(self, indexer: np.ndarray) -> None: - """ - Check if we are allowing reindexing with this particular indexer. - - Parameters - ---------- - indexer : an integer indexer - - Raises - ------ - ValueError if its a duplicate axis - """ - # trying to reindex on an axis with duplicates - if self.is_overlapping and len(indexer): - raise ValueError("cannot reindex from an overlapping axis") - def _needs_i8_conversion(self, key) -> bool: """ Check if a given key needs i8 conversion. Conversion is necessary for @@ -839,21 +823,9 @@ def get_indexer_non_unique( return ensure_platform_int(indexer), ensure_platform_int(missing) - def get_indexer_for(self, target: AnyArrayLike, **kwargs) -> np.ndarray: - """ - Guaranteed return of an indexer even when overlapping. - - This dispatches to get_indexer or get_indexer_non_unique - as appropriate. - - Returns - ------- - numpy.ndarray - List of indices. 
- """ - if self.is_overlapping: - return self.get_indexer_non_unique(target)[0] - return self.get_indexer(target, **kwargs) + @property + def _index_as_unique(self): + return not self.is_overlapping def _convert_slice_indexer(self, key: slice, kind: str): if not (key.step is None or key.step == 1): diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 9ecad335e2c3c..5f57fe1c9a56a 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1256,7 +1256,7 @@ def _get_listlike_indexer(self, key, axis: int, raise_missing: bool = False): ) return ax[indexer], indexer - if ax.is_unique and not getattr(ax, "is_overlapping", False): + if ax._index_as_unique: indexer = ax.get_indexer_for(keyarr) keyarr = ax.reindex(keyarr)[0] else: From 07844df52c652a0f70a1b45e360c8c92d7e513ee Mon Sep 17 00:00:00 2001 From: lacrosse91 Date: Fri, 18 Sep 2020 15:55:29 +0900 Subject: [PATCH 0823/1025] remove trailing comma (#36441) --- pandas/tests/groupby/transform/test_numba.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py index fcaa5ab13599a..3a184bdd007c7 100644 --- a/pandas/tests/groupby/transform/test_numba.py +++ b/pandas/tests/groupby/transform/test_numba.py @@ -131,7 +131,7 @@ def func_1(values, index): @td.skip_if_no("numba", "0.46.0") @pytest.mark.parametrize( - "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}], + "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}] ) def test_multifunc_notimplimented(agg_func): data = DataFrame( From b379c3f7fa9f4a71f08cb11049fd3d13f1dd579f Mon Sep 17 00:00:00 2001 From: Gautham <41098605+ahgamut@users.noreply.github.com> Date: Fri, 18 Sep 2020 12:50:05 +0000 Subject: [PATCH 0824/1025] DOC: read_excel skiprows documentation matches read_csv (#36435) (#36437) * DOC: updated read_excel skiprows documentation to match read_csv (GH36435) * TST: updated read_excel test with 
skiprows as int, callable (GH36435) --- pandas/io/excel/_base.py | 8 ++++++-- pandas/tests/io/excel/test_readers.py | 27 ++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 45da2d7d28fab..604b7e12ec243 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -120,8 +120,12 @@ Values to consider as True. false_values : list, default None Values to consider as False. -skiprows : list-like - Rows to skip at the beginning (0-indexed). +skiprows : list-like, int, or callable, optional + Line numbers to skip (0-indexed) or number of lines to skip (int) at the + start of the file. If callable, the callable function will be evaluated + against the row indices, returning True if the row should be skipped and + False otherwise. An example of a valid callable argument would be ``lambda + x: x in [0, 2]``. nrows : int, default None Number of rows to parse. na_values : scalar, str, list-like, or dict, default None diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 33467be42dfd9..4bdcc5b327fa7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -894,7 +894,7 @@ def test_read_excel_bool_header_arg(self, read_ext): with pytest.raises(TypeError, match=msg): pd.read_excel("test1" + read_ext, header=arg) - def test_read_excel_skiprows_list(self, read_ext): + def test_read_excel_skiprows(self, read_ext): # GH 4903 if pd.read_excel.keywords["engine"] == "pyxlsb": pytest.xfail("Sheets containing datetimes not supported by pyxlsb") @@ -920,6 +920,31 @@ def test_read_excel_skiprows_list(self, read_ext): ) tm.assert_frame_equal(actual, expected) + # GH36435 + actual = pd.read_excel( + "testskiprows" + read_ext, + sheet_name="skiprows_list", + skiprows=lambda x: x in [0, 2], + ) + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel( + "testskiprows" + read_ext, + 
sheet_name="skiprows_list", + skiprows=3, + names=["a", "b", "c", "d"], + ) + expected = DataFrame( + [ + # [1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True], + ], + columns=["a", "b", "c", "d"], + ) + tm.assert_frame_equal(actual, expected) + def test_read_excel_nrows(self, read_ext): # GH 16645 num_rows_to_pull = 5 From 02e5c055fdcdcdab6efa7fe9fe695e31deaaa426 Mon Sep 17 00:00:00 2001 From: lacrosse91 Date: Fri, 18 Sep 2020 22:00:15 +0900 Subject: [PATCH 0825/1025] CLN: 35925 rm trailing commas (#36446) --- pandas/tests/indexes/base_class/test_indexing.py | 2 +- pandas/tests/indexes/test_common.py | 4 +--- pandas/tests/indexes/test_numeric.py | 4 +--- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/base_class/test_indexing.py b/pandas/tests/indexes/base_class/test_indexing.py index 196c0401a72be..b2fa8f31ee5ec 100644 --- a/pandas/tests/indexes/base_class/test_indexing.py +++ b/pandas/tests/indexes/base_class/test_indexing.py @@ -14,7 +14,7 @@ def test_get_slice_bounds_within(self, kind, side, expected): @pytest.mark.parametrize("kind", ["getitem", "loc", None]) @pytest.mark.parametrize("side", ["left", "right"]) @pytest.mark.parametrize( - "data, bound, expected", [(list("abcdef"), "x", 6), (list("bcdefg"), "a", 0)], + "data, bound, expected", [(list("abcdef"), "x", 6), (list("bcdefg"), "a", 0)] ) def test_get_slice_bounds_outside(self, kind, side, expected, data, bound): index = Index(data) diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index aa6b395176b06..675ae388a28a4 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -411,9 +411,7 @@ def test_sort_values_invalid_na_position(index_with_missing, na_position): pytest.xfail("missing value sorting order not defined for index type") if na_position not in ["first", 
"last"]: - with pytest.raises( - ValueError, match=f"invalid na_position: {na_position}", - ): + with pytest.raises(ValueError, match=f"invalid na_position: {na_position}"): index_with_missing.sort_values(na_position=na_position) diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py index e1623a14c333e..7fa7a571d2571 100644 --- a/pandas/tests/indexes/test_numeric.py +++ b/pandas/tests/indexes/test_numeric.py @@ -699,9 +699,7 @@ def test_get_slice_bounds_within(self, kind, side, expected): @pytest.mark.parametrize("kind", ["getitem", "loc", None]) @pytest.mark.parametrize("side", ["left", "right"]) - @pytest.mark.parametrize( - "bound, expected", [(-1, 0), (10, 6)], - ) + @pytest.mark.parametrize("bound, expected", [(-1, 0), (10, 6)]) def test_get_slice_bounds_outside(self, kind, side, expected, bound): index = Index(range(6)) result = index.get_slice_bound(bound, kind=kind, side=side) From 53864a4a63eda70f07e67b3bd1be5d6b74d4789f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Sep 2020 06:01:34 -0700 Subject: [PATCH 0826/1025] REF: collect IntervalArray methods by topic (#36438) --- pandas/core/arrays/datetimelike.py | 68 +++--- pandas/core/arrays/interval.py | 370 +++++++++++++++-------------- 2 files changed, 230 insertions(+), 208 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index a5b91afac338d..026aad5ad6eb7 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -67,6 +67,15 @@ DTScalarOrNaT = Union[DatetimeLikeScalar, NaTType] +class InvalidComparison(Exception): + """ + Raised by _validate_comparison_value to indicate to caller it should + return invalid_comparison. 
+ """ + + pass + + def _datetimelike_array_cmp(cls, op): """ Wrap comparison operations to convert Timestamp/Timedelta/Period-like to @@ -75,36 +84,6 @@ def _datetimelike_array_cmp(cls, op): opname = f"__{op.__name__}__" nat_result = opname == "__ne__" - class InvalidComparison(Exception): - pass - - def _validate_comparison_value(self, other): - if isinstance(other, str): - try: - # GH#18435 strings get a pass from tzawareness compat - other = self._scalar_from_string(other) - except ValueError: - # failed to parse as Timestamp/Timedelta/Period - raise InvalidComparison(other) - - if isinstance(other, self._recognized_scalars) or other is NaT: - other = self._scalar_type(other) - self._check_compatible_with(other) - - elif not is_list_like(other): - raise InvalidComparison(other) - - elif len(other) != len(self): - raise ValueError("Lengths must match") - - else: - try: - other = self._validate_listlike(other, opname, allow_object=True) - except TypeError as err: - raise InvalidComparison(other) from err - - return other - @unpack_zerodim_and_defer(opname) def wrapper(self, other): if self.ndim > 1 and getattr(other, "shape", None) == self.shape: @@ -112,7 +91,7 @@ def wrapper(self, other): return op(self.ravel(), other.ravel()).reshape(self.shape) try: - other = _validate_comparison_value(self, other) + other = self._validate_comparison_value(other, opname) except InvalidComparison: return invalid_comparison(self, other, op) @@ -696,6 +675,33 @@ def _from_factorized(cls, values, original): # Validation Methods # TODO: try to de-duplicate these, ensure identical behavior + def _validate_comparison_value(self, other, opname: str): + if isinstance(other, str): + try: + # GH#18435 strings get a pass from tzawareness compat + other = self._scalar_from_string(other) + except ValueError: + # failed to parse as Timestamp/Timedelta/Period + raise InvalidComparison(other) + + if isinstance(other, self._recognized_scalars) or other is NaT: + other = self._scalar_type(other) 
# type: ignore[call-arg] + self._check_compatible_with(other) + + elif not is_list_like(other): + raise InvalidComparison(other) + + elif len(other) != len(self): + raise ValueError("Lengths must match") + + else: + try: + other = self._validate_listlike(other, opname, allow_object=True) + except TypeError as err: + raise InvalidComparison(other) from err + + return other + def _validate_fill_value(self, fill_value): """ If a fill_value is passed to `take` convert it to an i8 representation, diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 706b089e929a9..ff9dd3f2a85bc 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -150,6 +150,9 @@ class IntervalArray(IntervalMixin, ExtensionArray): can_hold_na = True _na_value = _fill_value = np.nan + # --------------------------------------------------------------------- + # Constructors + def __new__( cls, data, @@ -263,32 +266,32 @@ def _from_factorized(cls, values, original): _interval_shared_docs["from_breaks"] = textwrap.dedent( """ - Construct an %(klass)s from an array of splits. + Construct an %(klass)s from an array of splits. - Parameters - ---------- - breaks : array-like (1-dimensional) - Left and right bounds for each interval. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - copy : bool, default False - Copy the data. - dtype : dtype or None, default None - If None, dtype will be inferred. + Parameters + ---------- + breaks : array-like (1-dimensional) + Left and right bounds for each interval. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + Copy the data. + dtype : dtype or None, default None + If None, dtype will be inferred. 
- Returns - ------- - %(klass)s + Returns + ------- + %(klass)s - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - %(klass)s.from_arrays : Construct from a left and right array. - %(klass)s.from_tuples : Construct from a sequence of tuples. + See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct from a left and right array. + %(klass)s.from_tuples : Construct from a sequence of tuples. - %(examples)s\ - """ + %(examples)s\ + """ ) @classmethod @@ -387,34 +390,34 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): _interval_shared_docs["from_tuples"] = textwrap.dedent( """ - Construct an %(klass)s from an array-like of tuples. + Construct an %(klass)s from an array-like of tuples. - Parameters - ---------- - data : array-like (1-dimensional) - Array of tuples. - closed : {'left', 'right', 'both', 'neither'}, default 'right' - Whether the intervals are closed on the left-side, right-side, both - or neither. - copy : bool, default False - By-default copy the data, this is compat only and ignored. - dtype : dtype or None, default None - If None, dtype will be inferred. + Parameters + ---------- + data : array-like (1-dimensional) + Array of tuples. + closed : {'left', 'right', 'both', 'neither'}, default 'right' + Whether the intervals are closed on the left-side, right-side, both + or neither. + copy : bool, default False + By-default copy the data, this is compat only and ignored. + dtype : dtype or None, default None + If None, dtype will be inferred. - Returns - ------- - %(klass)s + Returns + ------- + %(klass)s - See Also - -------- - interval_range : Function to create a fixed frequency IntervalIndex. - %(klass)s.from_arrays : Construct an %(klass)s from a left and - right array. - %(klass)s.from_breaks : Construct an %(klass)s from an array of - splits. 
+ See Also + -------- + interval_range : Function to create a fixed frequency IntervalIndex. + %(klass)s.from_arrays : Construct an %(klass)s from a left and + right array. + %(klass)s.from_breaks : Construct an %(klass)s from an array of + splits. - %(examples)s\ - """ + %(examples)s\ + """ ) @classmethod @@ -489,9 +492,38 @@ def _validate(self): msg = "left side of interval must be <= right side" raise ValueError(msg) - # --------- - # Interface - # --------- + def _shallow_copy(self, left, right): + """ + Return a new IntervalArray with the replacement attributes + + Parameters + ---------- + left : Index + Values to be used for the left-side of the intervals. + right : Index + Values to be used for the right-side of the intervals. + """ + return self._simple_new(left, right, closed=self.closed, verify_integrity=False) + + # --------------------------------------------------------------------- + # Descriptive + + @property + def dtype(self): + return IntervalDtype(self.left.dtype) + + @property + def nbytes(self) -> int: + return self.left.nbytes + self.right.nbytes + + @property + def size(self) -> int: + # Avoid materializing self.values + return self.left.size + + # --------------------------------------------------------------------- + # EA Interface + def __iter__(self): return iter(np.asarray(self)) @@ -646,10 +678,6 @@ def fillna(self, value=None, method=None, limit=None): right = self.right.fillna(value=value_right) return self._shallow_copy(left, right) - @property - def dtype(self): - return IntervalDtype(self.left.dtype) - def astype(self, dtype, copy=True): """ Cast to an ExtensionArray or NumPy array with dtype 'dtype'. 
@@ -722,19 +750,6 @@ def _concat_same_type(cls, to_concat): right = np.concatenate([interval.right for interval in to_concat]) return cls._simple_new(left, right, closed=closed, copy=False) - def _shallow_copy(self, left, right): - """ - Return a new IntervalArray with the replacement attributes - - Parameters - ---------- - left : Index - Values to be used for the left-side of the intervals. - right : Index - Values to be used for the right-side of the intervals. - """ - return self._simple_new(left, right, closed=self.closed, verify_integrity=False) - def copy(self): """ Return a copy of the array. @@ -752,15 +767,6 @@ def copy(self): def isna(self): return isna(self.left) - @property - def nbytes(self) -> int: - return self.left.nbytes + self.right.nbytes - - @property - def size(self) -> int: - # Avoid materializing self.values - return self.left.size - def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": if not len(self) or periods == 0: return self.copy() @@ -912,7 +918,8 @@ def value_counts(self, dropna=True): # TODO: implement this is a non-naive way! return value_counts(np.asarray(self), dropna=dropna) - # Formatting + # --------------------------------------------------------------------- + # Rendering Methods def _format_data(self): @@ -966,6 +973,9 @@ def _format_space(self): space = " " * (len(type(self).__name__) + 1) return f"\n{space}" + # --------------------------------------------------------------------- + # Vectorized Interval Properties/Attributes + @property def left(self): """ @@ -982,6 +992,109 @@ def right(self): """ return self._right + @property + def length(self): + """ + Return an Index with entries denoting the length of each Interval in + the IntervalArray. + """ + try: + return self.right - self.left + except TypeError as err: + # length not defined for some types, e.g. string + msg = ( + "IntervalArray contains Intervals without defined length, " + "e.g. 
Intervals with string endpoints" + ) + raise TypeError(msg) from err + + @property + def mid(self): + """ + Return the midpoint of each Interval in the IntervalArray as an Index. + """ + try: + return 0.5 * (self.left + self.right) + except TypeError: + # datetime safe version + return self.left + 0.5 * self.length + + _interval_shared_docs["overlaps"] = textwrap.dedent( + """ + Check elementwise if an Interval overlaps the values in the %(klass)s. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + other : %(klass)s + Interval to check against for an overlap. + + Returns + ------- + ndarray + Boolean array positionally indicating where an overlap occurs. + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. + + Examples + -------- + %(examples)s + >>> intervals.overlaps(pd.Interval(0.5, 1.5)) + array([ True, True, False]) + + Intervals that share closed endpoints overlap: + + >>> intervals.overlaps(pd.Interval(1, 3, closed='left')) + array([ True, True, True]) + + Intervals that only have an open endpoint in common do not overlap: + + >>> intervals.overlaps(pd.Interval(1, 2, closed='right')) + array([False, True, False]) + """ + ) + + @Appender( + _interval_shared_docs["overlaps"] + % dict( + klass="IntervalArray", + examples=textwrap.dedent( + """\ + >>> data = [(0, 1), (1, 3), (2, 4)] + >>> intervals = pd.arrays.IntervalArray.from_tuples(data) + >>> intervals + + [(0, 1], (1, 3], (2, 4]] + Length: 3, closed: right, dtype: interval[int64] + """ + ), + ) + ) + def overlaps(self, other): + if isinstance(other, (IntervalArray, ABCIntervalIndex)): + raise NotImplementedError + elif not isinstance(other, Interval): + msg = f"`other` must be Interval-like, got {type(other).__name__}" + raise TypeError(msg) + + # equality is okay if both endpoints are closed (overlap at a 
point) + op1 = le if (self.closed_left and other.closed_right) else lt + op2 = le if (other.closed_left and self.closed_right) else lt + + # overlaps is equivalent negation of two interval being disjoint: + # disjoint = (A.left > B.right) or (B.left > A.right) + # (simplifying the negation allows this to be done in less operations) + return op1(self.left, other.right) & op2(other.left, self.right) + + # --------------------------------------------------------------------- + @property def closed(self): """ @@ -1041,33 +1154,6 @@ def set_closed(self, closed): left=self.left, right=self.right, closed=closed, verify_integrity=False ) - @property - def length(self): - """ - Return an Index with entries denoting the length of each Interval in - the IntervalArray. - """ - try: - return self.right - self.left - except TypeError as err: - # length not defined for some types, e.g. string - msg = ( - "IntervalArray contains Intervals without defined length, " - "e.g. Intervals with string endpoints" - ) - raise TypeError(msg) from err - - @property - def mid(self): - """ - Return the midpoint of each Interval in the IntervalArray as an Index. 
- """ - try: - return 0.5 * (self.left + self.right) - except TypeError: - # datetime safe version - return self.left + 0.5 * self.length - _interval_shared_docs[ "is_non_overlapping_monotonic" ] = """ @@ -1102,7 +1188,9 @@ def is_non_overlapping_monotonic(self): or (self.left[:-1] >= self.right[1:]).all() ) + # --------------------------------------------------------------------- # Conversion + def __array__(self, dtype=None) -> np.ndarray: """ Return the IntervalArray's data as a numpy array of Interval @@ -1200,6 +1288,8 @@ def to_tuples(self, na_tuple=True): tuples = np.where(~self.isna(), tuples, np.nan) return tuples + # --------------------------------------------------------------------- + @Appender(_extension_array_shared_docs["repeat"] % _shared_docs_kwargs) def repeat(self, repeats, axis=None): nv.validate_repeat(tuple(), dict(axis=axis)) @@ -1262,80 +1352,6 @@ def contains(self, other): other < self.right if self.open_right else other <= self.right ) - _interval_shared_docs["overlaps"] = textwrap.dedent( - """ - Check elementwise if an Interval overlaps the values in the %(klass)s. - - Two intervals overlap if they share a common point, including closed - endpoints. Intervals that only have an open endpoint in common do not - overlap. - - .. versionadded:: 0.24.0 - - Parameters - ---------- - other : %(klass)s - Interval to check against for an overlap. - - Returns - ------- - ndarray - Boolean array positionally indicating where an overlap occurs. - - See Also - -------- - Interval.overlaps : Check whether two Interval objects overlap. 
- - Examples - -------- - %(examples)s - >>> intervals.overlaps(pd.Interval(0.5, 1.5)) - array([ True, True, False]) - - Intervals that share closed endpoints overlap: - - >>> intervals.overlaps(pd.Interval(1, 3, closed='left')) - array([ True, True, True]) - - Intervals that only have an open endpoint in common do not overlap: - - >>> intervals.overlaps(pd.Interval(1, 2, closed='right')) - array([False, True, False]) - """ - ) - - @Appender( - _interval_shared_docs["overlaps"] - % dict( - klass="IntervalArray", - examples=textwrap.dedent( - """\ - >>> data = [(0, 1), (1, 3), (2, 4)] - >>> intervals = pd.arrays.IntervalArray.from_tuples(data) - >>> intervals - - [(0, 1], (1, 3], (2, 4]] - Length: 3, closed: right, dtype: interval[int64] - """ - ), - ) - ) - def overlaps(self, other): - if isinstance(other, (IntervalArray, ABCIntervalIndex)): - raise NotImplementedError - elif not isinstance(other, Interval): - msg = f"`other` must be Interval-like, got {type(other).__name__}" - raise TypeError(msg) - - # equality is okay if both endpoints are closed (overlap at a point) - op1 = le if (self.closed_left and other.closed_right) else lt - op2 = le if (other.closed_left and self.closed_right) else lt - - # overlaps is equivalent negation of two interval being disjoint: - # disjoint = (A.left > B.right) or (B.left > A.right) - # (simplifying the negation allows this to be done in less operations) - return op1(self.left, other.right) & op2(other.left, self.right) - def maybe_convert_platform_interval(values): """ From 1841444f26d62c0a59b72760614e1e8cc4765095 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Sep 2020 06:05:09 -0700 Subject: [PATCH 0827/1025] REF: share insert between DTI/TDI/PI (#36439) --- pandas/core/indexes/datetimelike.py | 82 ++++++++++++++++------------- pandas/core/indexes/period.py | 6 +-- 2 files changed, 47 insertions(+), 41 deletions(-) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 
122977eee99fb..1ab40a76b30ff 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -575,6 +575,50 @@ def delete(self, loc): arr = type(self._data)._simple_new(new_i8s, dtype=self.dtype, freq=freq) return type(self)._simple_new(arr, name=self.name) + def insert(self, loc: int, item): + """ + Make new Index inserting new item at location + + Parameters + ---------- + loc : int + item : object + if not either a Python datetime or a numpy integer-like, returned + Index dtype will be object rather than datetime. + + Returns + ------- + new_index : Index + """ + item = self._data._validate_insert_value(item) + + freq = None + if is_period_dtype(self.dtype): + freq = self.freq + elif self.freq is not None: + # freq can be preserved on edge cases + if self.size: + if item is NaT: + pass + elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: + freq = self.freq + elif (loc == len(self)) and item - self.freq == self[-1]: + freq = self.freq + else: + # Adding a single item to an empty index may preserve freq + if self.freq.is_on_offset(item): + freq = self.freq + + arr = self._data + item = arr._unbox_scalar(item) + item = arr._rebox_native(item) + + new_values = np.concatenate([arr._ndarray[:loc], [item], arr._ndarray[loc:]]) + new_arr = self._data._from_backing_data(new_values) + new_arr._freq = freq + + return type(self)._simple_new(new_arr, name=self.name) + # -------------------------------------------------------------------- # Join/Set Methods @@ -895,45 +939,11 @@ def _maybe_utc_convert(self, other): # -------------------------------------------------------------------- # List-Like Methods + @Appender(DatetimeIndexOpsMixin.insert.__doc__) def insert(self, loc, item): - """ - Make new Index inserting new item at location - - Parameters - ---------- - loc : int - item : object - if not either a Python datetime or a numpy integer-like, returned - Index dtype will be object rather than datetime. 
- - Returns - ------- - new_index : Index - """ if isinstance(item, str): # TODO: Why are strings special? # TODO: Should we attempt _scalar_from_string? return self.astype(object).insert(loc, item) - item = self._data._validate_insert_value(item) - - freq = None - # check freq can be preserved on edge cases - if self.freq is not None: - if self.size: - if item is NaT: - pass - elif (loc == 0 or loc == -len(self)) and item + self.freq == self[0]: - freq = self.freq - elif (loc == len(self)) and item - self.freq == self[-1]: - freq = self.freq - else: - # Adding a single item to an empty index may preserve freq - if self.freq.is_on_offset(item): - freq = self.freq - - item = self._data._unbox_scalar(item) - - new_i8s = np.concatenate([self[:loc].asi8, [item], self[loc:].asi8]) - arr = type(self._data)._simple_new(new_i8s, dtype=self.dtype, freq=freq) - return type(self)._simple_new(arr, name=self.name) + return DatetimeIndexOpsMixin.insert(self, loc, item) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 42dce1bd53f22..900d3f9f1866b 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -436,11 +436,7 @@ def insert(self, loc, item): if not isinstance(item, Period) or self.freq != item.freq: return self.astype(object).insert(loc, item) - i8result = np.concatenate( - (self[:loc].asi8, np.array([item.ordinal]), self[loc:].asi8) - ) - arr = type(self._data)._simple_new(i8result, dtype=self.dtype) - return type(self)._simple_new(arr, name=self.name) + return DatetimeIndexOpsMixin.insert(self, loc, item) def join(self, other, how="left", level=None, return_indexers=False, sort=False): """ From 45519a2496bfade9b1ff89a9c925ca61ba801e3e Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 18 Sep 2020 20:33:01 +0100 Subject: [PATCH 0828/1025] CLN Upgrade pandas/core syntax (#36453) --- pandas/core/arrays/datetimes.py | 3 +-- pandas/core/arrays/sparse/array.py | 2 +- pandas/core/common.py | 9 +++------ 
pandas/core/computation/expr.py | 2 +- pandas/core/computation/pytables.py | 2 +- pandas/core/generic.py | 11 +++-------- pandas/core/groupby/base.py | 11 ++--------- pandas/core/groupby/generic.py | 2 +- pandas/core/indexes/base.py | 8 +++----- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/reshape/merge.py | 2 +- 11 files changed, 18 insertions(+), 36 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 2073f110d536f..b1f98199f9fba 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -570,8 +570,7 @@ def __iter__(self): converted = ints_to_pydatetime( data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" ) - for v in converted: - yield v + yield from converted def astype(self, dtype, copy=True): # We handle diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 853f7bb0b0d81..c88af77ea6189 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1427,7 +1427,7 @@ def sparse_arithmetic_method(self, other): # TODO: look into _wrap_result if len(self) != len(other): raise AssertionError( - (f"length mismatch: {len(self)} vs. {len(other)}") + f"length mismatch: {len(self)} vs. 
{len(other)}" ) if not isinstance(other, SparseArray): dtype = getattr(other, "dtype", None) diff --git a/pandas/core/common.py b/pandas/core/common.py index 968fb180abcd0..b860c83f89cbc 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -62,8 +62,7 @@ def flatten(l): """ for el in l: if iterable_not_string(el): - for s in flatten(el): - yield s + yield from flatten(el) else: yield el @@ -434,10 +433,8 @@ def random_state(state=None): return np.random else: raise ValueError( - ( - "random_state must be an integer, array-like, a BitGenerator, " - "a numpy RandomState, or None" - ) + "random_state must be an integer, array-like, a BitGenerator, " + "a numpy RandomState, or None" ) diff --git a/pandas/core/computation/expr.py b/pandas/core/computation/expr.py index 09fc53716dda9..8c56f02c8d3cc 100644 --- a/pandas/core/computation/expr.py +++ b/pandas/core/computation/expr.py @@ -153,7 +153,7 @@ def _preparse( the ``tokenize`` module and ``tokval`` is a string. """ assert callable(f), "f must be callable" - return tokenize.untokenize((f(x) for x in tokenize_string(source))) + return tokenize.untokenize(f(x) for x in tokenize_string(source)) def _is_type(t): diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index 8dd7c1a22d0ae..d876c655421ef 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -554,7 +554,7 @@ def __init__( else: w = _validate_where(w) where[idx] = w - _where = " & ".join((f"({w})" for w in com.flatten(where))) + _where = " & ".join(f"({w})" for w in com.flatten(where)) else: _where = where diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cc18b8681200f..0b9021b094cd7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1681,10 +1681,7 @@ def _get_label_or_level_values(self, key: str, axis: int = 0) -> np.ndarray: label_axis_name = "column" if axis == 0 else "index" raise ValueError( - ( - f"The {label_axis_name} label '{key}' " 
- f"is not unique.{multi_message}" - ) + f"The {label_axis_name} label '{key}' is not unique.{multi_message}" ) return values @@ -1725,10 +1722,8 @@ def _drop_labels_or_levels(self, keys, axis: int = 0): if invalid_keys: raise ValueError( - ( - "The following keys are not valid labels or " - f"levels for axis {axis}: {invalid_keys}" - ) + "The following keys are not valid labels or " + f"levels for axis {axis}: {invalid_keys}" ) # Compute levels and labels to drop diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 9cfd13f95ca0e..2387427d15670 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -93,15 +93,8 @@ def _gotitem(self, key, ndim, subset=None): ) series_apply_allowlist = ( - ( - common_apply_allowlist - | { - "nlargest", - "nsmallest", - "is_monotonic_increasing", - "is_monotonic_decreasing", - } - ) + common_apply_allowlist + | {"nlargest", "nsmallest", "is_monotonic_increasing", "is_monotonic_decreasing"} ) | frozenset(["dtype", "unique"]) dataframe_apply_allowlist = common_apply_allowlist | frozenset(["dtypes", "corrwith"]) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a931221ef3ce1..bbccd22f2ae85 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1212,7 +1212,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): return result else: - all_indexed_same = all_indexes_same((x.index for x in values)) + all_indexed_same = all_indexes_same(x.index for x in values) # GH3596 # provide a reduction (Frame -> Series) if groups are diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 222ae589ea7fc..525caab7564a3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -504,7 +504,7 @@ def _maybe_check_unique(self): if not self.is_unique: msg = """Index has duplicates.""" duplicates = self._format_duplicate_message() - msg += "\n{}".format(duplicates) + msg += f"\n{duplicates}" 
raise DuplicateLabelError(msg) @@ -4315,10 +4315,8 @@ def identical(self, other) -> bool: return ( self.equals(other) and all( - ( - getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables - ) + getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables ) and type(self) == type(other) ) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 1ab40a76b30ff..5aa72bb838756 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -398,7 +398,7 @@ def _partial_date_slice( if len(self) and ( (use_lhs and t1 < self[0] and t2 < self[0]) - or ((use_rhs and t1 > self[-1] and t2 > self[-1])) + or (use_rhs and t1 > self[-1] and t2 > self[-1]) ): # we are out of range raise KeyError diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d95355589fd0c..5a6518995c554 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1837,7 +1837,7 @@ def _get_single_indexer(join_key, index, sort: bool = False): def _left_join_on_index(left_ax: Index, right_ax: Index, join_keys, sort: bool = False): if len(join_keys) > 1: if not ( - (isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels) + isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels ): raise AssertionError( "If more than one join key is given then " From ea83afc584a47dbc0062b1513f0a70a8714145a1 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Fri, 18 Sep 2020 16:30:41 -0500 Subject: [PATCH 0829/1025] CI: fix gbq test #36436 (#36443) --- ci/deps/travis-37-cov.yaml | 9 ++++----- ci/deps/travis-37-locale.yaml | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml index d031dc1cc062f..7d5104a58ce83 100644 --- a/ci/deps/travis-37-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -1,6 +1,5 @@ name: pandas-dev channels: - - defaults - conda-forge dependencies: - python=3.7.* 
@@ -15,7 +14,6 @@ dependencies: # pandas dependencies - beautifulsoup4 - botocore>=1.11 - - cython>=0.29.16 - dask - fastparquet>=0.3.2 - fsspec>=0.7.4 @@ -31,16 +29,18 @@ dependencies: - odfpy - openpyxl - pandas-gbq + - google-cloud-bigquery>=1.27.2 # GH 36436 - psycopg2 - pyarrow>=0.15.0 - - pymysql + - pymysql=0.7.11 - pytables - python-snappy + - python-dateutil - pytz - s3fs>=0.4.0 - scikit-learn - scipy - - sqlalchemy + - sqlalchemy=1.3.0 - statsmodels - xarray - xlrd @@ -51,5 +51,4 @@ dependencies: - brotlipy - coverage - pandas-datareader - - python-dateutil - pyxlsb diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index 8a0b5b043ceca..cd6341e80be24 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -25,10 +25,10 @@ dependencies: - numexpr - numpy - openpyxl - - pandas-gbq=0.12.0 + - pandas-gbq + - google-cloud-bigquery>=1.27.2 # GH 36436 - pyarrow>=0.17 - psycopg2=2.7 - - pyarrow>=0.15.0 # GH #35813 - pymysql=0.7.11 - pytables - python-dateutil From a97f1e6af7777f694218d15cbb17cebf265d56aa Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Sep 2020 14:55:05 -0700 Subject: [PATCH 0830/1025] REGR: Series[numeric] comparison with str raising on numexpr path (#36440) --- doc/source/whatsnew/v1.1.3.rst | 2 +- pandas/core/indexes/base.py | 6 +++++- pandas/core/ops/array_ops.py | 5 +++++ pandas/tests/arithmetic/test_numeric.py | 20 ++++++++++++++++++++ pandas/tests/indexes/test_numpy_compat.py | 15 --------------- 5 files changed, 31 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 3f8413bd492ca..1d386fa372ce1 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -34,7 +34,7 @@ Fixed regressions - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset 
(:issue:`35747`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`,:issue:`35802`) -- +- Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`,:issue:`36377`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 525caab7564a3..9cd28974f9385 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -138,10 +138,14 @@ def cmp_method(self, other): with np.errstate(all="ignore"): result = ops.comp_method_OBJECT_ARRAY(op, self._values, other) - else: + elif is_interval_dtype(self.dtype): with np.errstate(all="ignore"): result = op(self._values, np.asarray(other)) + else: + with np.errstate(all="ignore"): + result = ops.comparison_op(self._values, np.asarray(other), op) + if is_bool_dtype(result): return result return ops.invalid_comparison(self, other, op) diff --git a/pandas/core/ops/array_ops.py b/pandas/core/ops/array_ops.py index aab10cea33632..fd5f126051c53 100644 --- a/pandas/core/ops/array_ops.py +++ b/pandas/core/ops/array_ops.py @@ -23,6 +23,7 @@ is_bool_dtype, is_integer_dtype, is_list_like, + is_numeric_v_string_like, is_object_dtype, is_scalar, ) @@ -235,6 +236,10 @@ def comparison_op(left: ArrayLike, right: Any, op) -> ArrayLike: else: res_values = np.zeros(lvalues.shape, dtype=bool) + elif is_numeric_v_string_like(lvalues, rvalues): + # GH#36377 going through the numexpr path would incorrectly raise + return invalid_comparison(lvalues, rvalues, op) + elif is_object_dtype(lvalues.dtype): res_values = comp_method_OBJECT_ARRAY(op, lvalues, rvalues) diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index ecac08ffe3ba2..139401bdf5806 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ 
b/pandas/tests/arithmetic/test_numeric.py @@ -89,6 +89,26 @@ def test_compare_invalid(self): b.name = pd.Timestamp("2000-01-01") tm.assert_series_equal(a / b, 1 / (b / a)) + def test_numeric_cmp_string_numexpr_path(self, box): + # GH#36377, GH#35700 + xbox = box if box is not pd.Index else np.ndarray + + obj = pd.Series(np.random.randn(10 ** 5)) + obj = tm.box_expected(obj, box, transpose=False) + + result = obj == "a" + + expected = pd.Series(np.zeros(10 ** 5, dtype=bool)) + expected = tm.box_expected(expected, xbox, transpose=False) + tm.assert_equal(result, expected) + + result = obj != "a" + tm.assert_equal(result, ~expected) + + msg = "Invalid comparison between dtype=float64 and str" + with pytest.raises(TypeError, match=msg): + obj < "a" + # ------------------------------------------------------------------ # Numeric dtypes Arithmetic with Datetime/Timedelta Scalar diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py index a83684464caf6..b71417b2a625d 100644 --- a/pandas/tests/indexes/test_numpy_compat.py +++ b/pandas/tests/indexes/test_numpy_compat.py @@ -114,18 +114,3 @@ def test_numpy_ufuncs_other(index, func): else: with pytest.raises(Exception): func(index) - - -def test_elementwise_comparison_warning(): - # https://github.com/pandas-dev/pandas/issues/22698#issuecomment-458968300 - # np.array([1, 2]) == 'a' returns False, and produces a - # FutureWarning that it'll be [False, False] in the future. - # We just want to ensure that comes through. - # When NumPy dev actually enforces this change, we'll need to skip - # this test. 
- idx = Index([1, 2]) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - result = idx == "a" - - expected = np.array([False, False]) - tm.assert_numpy_array_equal(result, expected) From de35b46476ed04cb531f94944aee803366d92b7b Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Fri, 18 Sep 2020 23:59:23 +0200 Subject: [PATCH 0831/1025] [DOC]: Add warning about rolling sums with large values (#36433) --- doc/source/user_guide/computation.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 151ef36be7c98..10e27606a1415 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -229,6 +229,15 @@ see the :ref:`groupby docs `. The API for window statistics is quite similar to the way one works with ``GroupBy`` objects, see the documentation :ref:`here `. +.. warning:: + + When using ``rolling()`` and an associated function the results are calculated with rolling sums. As a consequence + when having values differing with magnitude :math:`1/np.finfo(np.double).eps` this results in truncation. It must be + noted, that large values may have an impact on windows, which do not include these values. `Kahan summation + `__ is used + to compute the rolling sums to preserve accuracy as much as possible. The same holds true for ``Rolling.var()`` for + values differing with magnitude :math:`(1/np.finfo(np.double).eps)^{0.5}`. + We work with ``rolling``, ``expanding`` and ``exponentially weighted`` data through the corresponding objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expanding` and :class:`~pandas.core.window.ExponentialMovingWindow`. 
From 53dd4722b5688901c99c3a82d8ac8fbdc44c2654 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Fri, 18 Sep 2020 17:08:57 -0500 Subject: [PATCH 0832/1025] CI: Auto-label PRs for review (#36349) --- .github/PULL_REQUEST_TEMPLATE.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7c3870470f074..5e4d3b4ec38e4 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,3 +1,9 @@ +--- + +labels: "Needs Review" + +--- + - [ ] closes #xxxx - [ ] tests added / passed - [ ] passes `black pandas` From 4d9b4dadd5d4fe4507aee9f3946f55d3bdfd0cb5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Sep 2020 15:18:03 -0700 Subject: [PATCH 0833/1025] REF: _is_compatible_with_other -> _can_union_without_object_cast (#36384) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/indexes/base.py | 12 +++++-- pandas/core/indexes/datetimelike.py | 3 ++ pandas/core/indexes/numeric.py | 34 ++++++------------- pandas/tests/indexes/common.py | 18 ++++------ pandas/tests/indexes/datetimes/test_setops.py | 6 ++-- 6 files changed, 33 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6923b42d3340b..33a5b016a293f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -367,7 +367,7 @@ Other - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) - Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`) -- +- Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other 
list-like (:issue:`36384`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9cd28974f9385..962a425afbd1e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2511,7 +2511,7 @@ def _union_incompatible_dtypes(self, other, sort): other = Index(other).astype(object, copy=False) return Index.union(this, other, sort=sort).astype(object, copy=False) - def _is_compatible_with_other(self, other) -> bool: + def _can_union_without_object_cast(self, other) -> bool: """ Check whether this and the other dtype are compatible with each other. Meaning a union can be formed between them without needing to be cast @@ -2587,8 +2587,9 @@ def union(self, other, sort=None): """ self._validate_sort_keyword(sort) self._assert_can_do_setop(other) + other = ensure_index(other) - if not self._is_compatible_with_other(other): + if not self._can_union_without_object_cast(other): return self._union_incompatible_dtypes(other, sort=sort) return self._union(other, sort=sort) @@ -5657,6 +5658,13 @@ def ensure_index( return MultiIndex.from_arrays(converted) else: + if isinstance(converted, np.ndarray) and converted.dtype == np.int64: + # Check for overflows if we should actually be uint64 + # xref GH#35481 + alt = np.asarray(index_like) + if alt.dtype == np.uint64: + converted = alt + index_like = converted else: # clean_index_list does the equivalent of copying diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5aa72bb838756..e2f59ceb41db5 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -622,6 +622,9 @@ def insert(self, loc: int, item): # -------------------------------------------------------------------- # Join/Set Methods + def _can_union_without_object_cast(self, other) -> bool: + return is_dtype_equal(self.dtype, other.dtype) + def _wrap_joined_index(self, joined: np.ndarray, 
other): assert other.dtype == self.dtype, (other.dtype, self.dtype) name = get_op_result_name(self, other) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 49a70600c09fa..574c9adc31808 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -15,19 +15,14 @@ is_float, is_float_dtype, is_integer_dtype, + is_numeric_dtype, is_scalar, is_signed_integer_dtype, is_unsigned_integer_dtype, needs_i8_conversion, pandas_dtype, ) -from pandas.core.dtypes.generic import ( - ABCFloat64Index, - ABCInt64Index, - ABCRangeIndex, - ABCSeries, - ABCUInt64Index, -) +from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna from pandas.core import algorithms @@ -275,11 +270,9 @@ def _assert_safe_casting(cls, data, subarr): if not np.array_equal(data, subarr): raise TypeError("Unsafe NumPy casting, you must explicitly cast") - def _is_compatible_with_other(self, other) -> bool: - return super()._is_compatible_with_other(other) or all( - isinstance(obj, (ABCInt64Index, ABCFloat64Index, ABCRangeIndex)) - for obj in [self, other] - ) + def _can_union_without_object_cast(self, other) -> bool: + # See GH#26778, further casting may occur in NumericIndex._union + return other.dtype == "f8" or other.dtype == self.dtype Int64Index._add_numeric_methods() @@ -324,10 +317,9 @@ def _assert_safe_casting(cls, data, subarr): if not np.array_equal(data, subarr): raise TypeError("Unsafe NumPy casting, you must explicitly cast") - def _is_compatible_with_other(self, other) -> bool: - return super()._is_compatible_with_other(other) or all( - isinstance(obj, (ABCUInt64Index, ABCFloat64Index)) for obj in [self, other] - ) + def _can_union_without_object_cast(self, other) -> bool: + # See GH#26778, further casting may occur in NumericIndex._union + return other.dtype == "f8" or other.dtype == self.dtype UInt64Index._add_numeric_methods() @@ -432,13 +424,9 @@ def isin(self, values, 
level=None): self._validate_index_level(level) return algorithms.isin(np.array(self), values) - def _is_compatible_with_other(self, other) -> bool: - return super()._is_compatible_with_other(other) or all( - isinstance( - obj, (ABCInt64Index, ABCFloat64Index, ABCUInt64Index, ABCRangeIndex) - ) - for obj in [self, other] - ) + def _can_union_without_object_cast(self, other) -> bool: + # See GH#26778, further casting may occur in NumericIndex._union + return is_numeric_dtype(other.dtype) Float64Index._add_numeric_methods() diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index b01cafc9b0d5c..c40f7b1bc2120 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -5,7 +5,6 @@ import pytest from pandas._libs import iNaT -from pandas.compat.numpy import is_numpy_dev from pandas.errors import InvalidIndexError from pandas.core.dtypes.common import is_datetime64tz_dtype @@ -456,7 +455,7 @@ def test_set_ops_error_cases(self, case, method, index): with pytest.raises(TypeError, match=msg): getattr(index, method)(case) - def test_intersection_base(self, index, request): + def test_intersection_base(self, index): if isinstance(index, CategoricalIndex): return @@ -473,15 +472,6 @@ def test_intersection_base(self, index, request): # GH 10149 cases = [klass(second.values) for klass in [np.array, Series, list]] for case in cases: - # https://github.com/pandas-dev/pandas/issues/35481 - if ( - is_numpy_dev - and isinstance(case, Series) - and isinstance(index, UInt64Index) - ): - mark = pytest.mark.xfail(reason="gh-35481") - request.node.add_marker(mark) - result = first.intersection(case) assert tm.equalContents(result, second) @@ -507,7 +497,11 @@ def test_union_base(self, index): for case in cases: if not isinstance(index, CategoricalIndex): result = first.union(case) - assert tm.equalContents(result, everything) + assert tm.equalContents(result, everything), ( + result, + everything, + type(case), + ) if isinstance(index, 
MultiIndex): msg = "other must be a MultiIndex or a list of tuples" diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py index f19e78323ab23..102c8f97a8a6b 100644 --- a/pandas/tests/indexes/datetimes/test_setops.py +++ b/pandas/tests/indexes/datetimes/test_setops.py @@ -46,10 +46,8 @@ def test_union3(self, sort, box): first = everything[:5] second = everything[5:] - # GH 10149 - expected = ( - first.astype("O").union(pd.Index(second.values, dtype="O")).astype("O") - ) + # GH 10149 support listlike inputs other than Index objects + expected = first.union(second, sort=sort) case = box(second.values) result = first.union(case, sort=sort) tm.assert_index_equal(result, expected) From ef7d5688c2f3076537c2f4478971f56c14d001af Mon Sep 17 00:00:00 2001 From: Gautham <41098605+ahgamut@users.noreply.github.com> Date: Fri, 18 Sep 2020 22:20:31 +0000 Subject: [PATCH 0834/1025] CLN: Update files (as per #36450) to Python 3.7+ syntax (#36457) --- pandas/_config/display.py | 2 +- pandas/_testing.py | 3 +- pandas/_vendored/typing_extensions.py | 10 ++--- pandas/_version.py | 6 +-- pandas/io/clipboard/__init__.py | 4 +- pandas/io/formats/excel.py | 6 +-- pandas/io/html.py | 2 +- pandas/io/json/_json.py | 2 +- pandas/io/pytables.py | 6 +-- pandas/io/stata.py | 2 +- pandas/plotting/_matplotlib/core.py | 2 +- .../arrays/categorical/test_constructors.py | 2 +- .../tests/arrays/integer/test_construction.py | 2 +- pandas/tests/extension/decimal/array.py | 2 +- pandas/tests/extension/test_categorical.py | 2 +- pandas/tests/frame/test_constructors.py | 2 +- pandas/tests/groupby/test_groupby.py | 4 +- pandas/tests/groupby/test_grouping.py | 2 +- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexing/test_indexing.py | 2 +- pandas/tests/io/formats/test_format.py | 2 +- pandas/tests/io/formats/test_to_csv.py | 20 ++++----- pandas/tests/io/formats/test_to_html.py | 2 +- pandas/tests/io/formats/test_to_latex.py | 2 +- 
pandas/tests/io/json/test_ujson.py | 4 +- pandas/tests/io/parser/test_c_parser_only.py | 2 +- pandas/tests/io/parser/test_common.py | 2 +- pandas/tests/io/parser/test_encoding.py | 2 +- pandas/tests/io/parser/test_read_fwf.py | 4 +- pandas/tests/io/pytables/common.py | 2 +- pandas/tests/io/test_common.py | 6 +-- pandas/tests/io/test_html.py | 4 +- pandas/tests/io/test_sql.py | 3 +- pandas/tests/reshape/test_get_dummies.py | 2 +- pandas/tests/scalar/test_na_scalar.py | 6 +-- pandas/tests/series/test_analytics.py | 2 +- pandas/tests/series/test_constructors.py | 5 +-- pandas/tests/series/test_dtypes.py | 8 ++-- pandas/tests/series/test_io.py | 3 +- scripts/validate_rst_title_capitalization.py | 7 ++- scripts/validate_unwanted_patterns.py | 4 +- setup.py | 2 +- versioneer.py | 43 ++++++++----------- 43 files changed, 94 insertions(+), 108 deletions(-) diff --git a/pandas/_config/display.py b/pandas/_config/display.py index ef319f4447565..e4553a2107f87 100644 --- a/pandas/_config/display.py +++ b/pandas/_config/display.py @@ -22,7 +22,7 @@ def detect_console_encoding() -> str: encoding = None try: encoding = sys.stdout.encoding or sys.stdin.encoding - except (AttributeError, IOError): + except (AttributeError, OSError): pass # try again for something better diff --git a/pandas/_testing.py b/pandas/_testing.py index 9db0c3496e290..cd34bec52daef 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1960,8 +1960,7 @@ def index_subclass_makers_generator(): makeCategoricalIndex, makeMultiIndex, ] - for make_index_func in make_index_funcs: - yield make_index_func + yield from make_index_funcs def all_timeseries_index_generator(k=10): diff --git a/pandas/_vendored/typing_extensions.py b/pandas/_vendored/typing_extensions.py index 53df8da175a56..129d8998faccc 100644 --- a/pandas/_vendored/typing_extensions.py +++ b/pandas/_vendored/typing_extensions.py @@ -409,7 +409,7 @@ def __repr__(self): def __getitem__(self, parameters): item = typing._type_check( - parameters, "{} 
accepts only single type".format(self._name) + parameters, f"{self._name} accepts only single type" ) return _GenericAlias(self, (item,)) @@ -1671,7 +1671,7 @@ def __class_getitem__(cls, params): params = (params,) if not params and cls is not Tuple: raise TypeError( - "Parameter list to {}[...] cannot be empty".format(cls.__qualname__) + f"Parameter list to {cls.__qualname__}[...] cannot be empty" ) msg = "Parameters to generic types must be types." params = tuple(_type_check(p, msg) for p in params) @@ -2113,7 +2113,7 @@ def __class_getitem__(cls, params): return _AnnotatedAlias(origin, metadata) def __init_subclass__(cls, *args, **kwargs): - raise TypeError("Cannot subclass {}.Annotated".format(cls.__module__)) + raise TypeError(f"Cannot subclass {cls.__module__}.Annotated") def _strip_annotations(t): """Strips the annotations from a given type. @@ -2195,7 +2195,7 @@ def _tree_repr(self, tree): else: tp_repr = origin[0]._tree_repr(origin) metadata_reprs = ", ".join(repr(arg) for arg in metadata) - return "%s[%s, %s]" % (cls, tp_repr, metadata_reprs) + return f"{cls}[{tp_repr}, {metadata_reprs}]" def _subs_tree(self, tvars=None, args=None): # noqa if self is Annotated: @@ -2382,7 +2382,7 @@ def TypeAlias(self, parameters): It's invalid when used anywhere except as in the example above. """ - raise TypeError("{} is not subscriptable".format(self)) + raise TypeError(f"{self} is not subscriptable") elif sys.version_info[:2] >= (3, 7): diff --git a/pandas/_version.py b/pandas/_version.py index 66e756a4744c8..b3fa8530d09eb 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -74,7 +74,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): stderr=(subprocess.PIPE if hide_stderr else None), ) break - except EnvironmentError: + except OSError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue @@ -121,7 +121,7 @@ def git_get_keywords(versionfile_abs): # _version.py. 
keywords = {} try: - f = open(versionfile_abs, "r") + f = open(versionfile_abs) for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) @@ -132,7 +132,7 @@ def git_get_keywords(versionfile_abs): if mo: keywords["full"] = mo.group(1) f.close() - except EnvironmentError: + except OSError: pass return keywords diff --git a/pandas/io/clipboard/__init__.py b/pandas/io/clipboard/__init__.py index d16955a98b62f..a8020f4bb4e4f 100644 --- a/pandas/io/clipboard/__init__.py +++ b/pandas/io/clipboard/__init__.py @@ -274,7 +274,7 @@ def copy_dev_clipboard(text): fo.write(text) def paste_dev_clipboard() -> str: - with open("/dev/clipboard", "rt") as fo: + with open("/dev/clipboard") as fo: content = fo.read() return content @@ -521,7 +521,7 @@ def determine_clipboard(): return init_windows_clipboard() if platform.system() == "Linux": - with open("/proc/version", "r") as f: + with open("/proc/version") as f: if "Microsoft" in f.read(): return init_wsl_clipboard() diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index bf4586a4b5b96..cc7b6b0bfea97 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -587,8 +587,7 @@ def _format_regular_rows(self): else: coloffset = 0 - for cell in self._generate_body(coloffset): - yield cell + yield from self._generate_body(coloffset) def _format_hierarchical_rows(self): has_aliases = isinstance(self.header, (tuple, list, np.ndarray, ABCIndex)) @@ -664,8 +663,7 @@ def _format_hierarchical_rows(self): ) gcolidx += 1 - for cell in self._generate_body(gcolidx): - yield cell + yield from self._generate_body(gcolidx) def _generate_body(self, coloffset: int): if self.styler is None: diff --git a/pandas/io/html.py b/pandas/io/html.py index 40fde224a7ae9..9a91b16e52723 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -719,7 +719,7 @@ def _build_doc(self): r = r.getroot() except AttributeError: pass - except (UnicodeDecodeError, IOError) as e: + except 
(UnicodeDecodeError, OSError) as e: # if the input is a blob of html goop if not is_url(self.io): r = fromstring(self.io, parser=parser) diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index c3977f89ac42f..a0ceb18c8bd20 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -821,7 +821,7 @@ def close(self): if self.should_close: try: self.open_stream.close() - except (IOError, AttributeError): + except (OSError, AttributeError): pass for file_handle in self.file_handles: file_handle.close() diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index e850a101a0a63..5e5a89d96f0e5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -364,7 +364,7 @@ def read_hdf( if isinstance(path_or_buf, HDFStore): if not path_or_buf.is_open: - raise IOError("The HDFStore must be open for reading.") + raise OSError("The HDFStore must be open for reading.") store = path_or_buf auto_close = False @@ -693,7 +693,7 @@ def open(self, mode: str = "a", **kwargs): try: self._handle = tables.open_file(self._path, self._mode, **kwargs) - except IOError as err: # pragma: no cover + except OSError as err: # pragma: no cover if "can not be written" in str(err): print(f"Opening {self._path} in read-only mode") self._handle = tables.open_file(self._path, "r", **kwargs) @@ -724,7 +724,7 @@ def open(self, mode: str = "a", **kwargs): # trying to read from a non-existent file causes an error which # is not part of IOError, make it one if self._mode == "r" and "Unable to open/create file" in str(err): - raise IOError(str(err)) from err + raise OSError(str(err)) from err raise def close(self): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index df5f6c3d53d30..a8af84e42918d 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1077,7 +1077,7 @@ def close(self) -> None: """ close the handle if its open """ try: self.path_or_buf.close() - except IOError: + except OSError: pass def _set_encoding(self) -> None: diff --git 
a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 602b42022f561..0c64ea824996f 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -342,7 +342,7 @@ def _setup_subplots(self): valid_log = {False, True, "sym", None} input_log = {self.logx, self.logy, self.loglog} if input_log - valid_log: - invalid_log = next(iter((input_log - valid_log))) + invalid_log = next(iter(input_log - valid_log)) raise ValueError( f"Boolean, None and 'sym' are valid options, '{invalid_log}' is given." ) diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 89fbfbd5b8324..e200f13652a84 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -277,7 +277,7 @@ def test_constructor_with_generator(self): # returned a scalar for a generator exp = Categorical([0, 1, 2]) - cat = Categorical((x for x in [0, 1, 2])) + cat = Categorical(x for x in [0, 1, 2]) tm.assert_categorical_equal(cat, exp) cat = Categorical(range(3)) tm.assert_categorical_equal(cat, exp) diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py index 1893c4554bfbf..e0a4877da6c7e 100644 --- a/pandas/tests/arrays/integer/test_construction.py +++ b/pandas/tests/arrays/integer/test_construction.py @@ -29,7 +29,7 @@ def test_from_dtype_from_float(data): # from int / array expected = pd.Series(data).dropna().reset_index(drop=True) - dropped = np.array(data.dropna()).astype(np.dtype((dtype.type))) + dropped = np.array(data.dropna()).astype(np.dtype(dtype.type)) result = pd.Series(dropped, dtype=str(dtype)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 2fbeec8dd8378..9147360e71c73 100644 --- a/pandas/tests/extension/decimal/array.py +++ 
b/pandas/tests/extension/decimal/array.py @@ -167,7 +167,7 @@ def _na_value(self): def _formatter(self, boxed=False): if boxed: - return "Decimal: {0}".format + return "Decimal: {}".format return repr @classmethod diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index f7b572a70073a..7d03dadb20dd9 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -137,7 +137,7 @@ def test_combine_add(self, data_repeated): s2 = pd.Series(orig_data2) result = s1.combine(s2, lambda x1, x2: x1 + x2) expected = pd.Series( - ([a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]) + [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))] ) self.assert_series_equal(result, expected) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 63a2160e128ed..b5e211895672a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -71,7 +71,7 @@ def test_series_with_name_not_matching_column(self): lambda: DataFrame({}), lambda: DataFrame(()), lambda: DataFrame([]), - lambda: DataFrame((_ for _ in [])), + lambda: DataFrame(_ for _ in []), lambda: DataFrame(range(0)), lambda: DataFrame(data=None), lambda: DataFrame(data={}), diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 1bb40b322cd48..6783fc5b66433 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -249,8 +249,8 @@ def test_len(): # issue 11016 df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3])) - assert len(df.groupby(("a"))) == 0 - assert len(df.groupby(("b"))) == 3 + assert len(df.groupby("a")) == 0 + assert len(df.groupby("b")) == 3 assert len(df.groupby(["a", "b"])) == 3 diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 40b4ce46e550b..18ef95c05f291 100644 --- 
a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -739,7 +739,7 @@ def test_get_group(self): with pytest.raises(ValueError, match=msg): g.get_group("foo") with pytest.raises(ValueError, match=msg): - g.get_group(("foo")) + g.get_group("foo") msg = "must supply a same-length tuple to get_group with multiple grouping keys" with pytest.raises(ValueError, match=msg): g.get_group(("foo", "bar", "baz")) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7720db9d98ebf..f811bd579aaaa 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1360,7 +1360,7 @@ def test_get_indexer_strings_raises(self): def test_get_indexer_numeric_index_boolean_target(self, idx_class): # GH 16877 - numeric_index = idx_class(RangeIndex((4))) + numeric_index = idx_class(RangeIndex(4)) result = numeric_index.get_indexer([True, False, True]) expected = np.array([-1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index ca8a3ddc95575..0cc61cd7df389 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -745,7 +745,7 @@ def run_tests(df, rhs, right): # make frames multi-type & re-run tests for frame in [df, rhs, right]: frame["joe"] = frame["joe"].astype("float64") - frame["jolie"] = frame["jolie"].map("@{0}".format) + frame["jolie"] = frame["jolie"].map("@{}".format) run_tests(df, rhs, right) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index a7ee10e198a52..bc1622a61a19d 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -648,7 +648,7 @@ def test_to_string_unicode_columns(self, float_frame): assert isinstance(result, str) def test_to_string_utf8_columns(self): - n = "\u05d0".encode("utf-8") + n = "\u05d0".encode() with 
option_context("display.max_rows", 1): df = DataFrame([1, 2], columns=[n]) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index c40935b2cc5dd..e2ceb95d77053 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -26,7 +26,7 @@ def test_to_csv_with_single_column(self): """ with tm.ensure_clean("test.csv") as path: df1.to_csv(path, header=None, index=None) - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected1 df2 = DataFrame([1, None]) @@ -36,7 +36,7 @@ def test_to_csv_with_single_column(self): """ with tm.ensure_clean("test.csv") as path: df2.to_csv(path, header=None, index=None) - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected2 def test_to_csv_defualt_encoding(self): @@ -58,7 +58,7 @@ def test_to_csv_quotechar(self): with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1) # 1=QUOTE_ALL - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected expected = """\ @@ -69,7 +69,7 @@ def test_to_csv_quotechar(self): with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1, quotechar="$") - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected with tm.ensure_clean("test.csv") as path: @@ -86,7 +86,7 @@ def test_to_csv_doublequote(self): with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected from _csv import Error @@ -105,7 +105,7 @@ def test_to_csv_escapechar(self): with tm.ensure_clean("test.csv") as path: # QUOTE_ALL df.to_csv(path, quoting=1, doublequote=False, escapechar="\\") - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected df = DataFrame({"col": ["a,a", ",bb,"]}) @@ -117,7 +117,7 @@ def test_to_csv_escapechar(self): with tm.ensure_clean("test.csv") as path: df.to_csv(path, quoting=3, 
escapechar="\\") # QUOTE_NONE - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected def test_csv_to_string(self): @@ -342,7 +342,7 @@ def test_to_csv_string_array_ascii(self): """ with tm.ensure_clean("str_test.csv") as path: df.to_csv(path, encoding="ascii") - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected_ascii def test_to_csv_string_array_utf8(self): @@ -356,7 +356,7 @@ def test_to_csv_string_array_utf8(self): """ with tm.ensure_clean("unicode_test.csv") as path: df.to_csv(path, encoding="utf-8") - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected_utf8 def test_to_csv_string_with_lf(self): @@ -467,7 +467,7 @@ def test_to_csv_write_to_open_file(self): with open(path, "w") as f: f.write("manual header\n") df.to_csv(f, header=None, index=None) - with open(path, "r") as f: + with open(path) as f: assert f.read() == expected def test_to_csv_write_to_open_file_with_newline_py3(self): diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py index e85fd398964d0..7acdbfd462874 100644 --- a/pandas/tests/io/formats/test_to_html.py +++ b/pandas/tests/io/formats/test_to_html.py @@ -137,7 +137,7 @@ def test_to_html_encoding(float_frame, tmp_path): # GH 28663 path = tmp_path / "test.html" float_frame.to_html(path, encoding="gbk") - with open(str(path), "r", encoding="gbk") as f: + with open(str(path), encoding="gbk") as f: assert float_frame.to_html() == f.read() diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index a98644250b328..a93ab6f9cc7aa 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -21,7 +21,7 @@ def test_to_latex_filename(self, float_frame): with tm.ensure_clean("test.tex") as path: float_frame.to_latex(path) - with open(path, "r") as f: + with open(path) as f: assert float_frame.to_latex() == f.read() # test with utf-8 and encoding option 
(GH 7061) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index e2007e07c572a..086c0b7ba08b2 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -591,14 +591,14 @@ def test_decode_number_with_32bit_sign_bit(self, val): def test_encode_big_escape(self): # Make sure no Exception is raised. for _ in range(10): - base = "\u00e5".encode("utf-8") + base = "\u00e5".encode() escape_input = base * 1024 * 1024 * 2 ujson.encode(escape_input) def test_decode_big_escape(self): # Make sure no Exception is raised. for _ in range(10): - base = "\u00e5".encode("utf-8") + base = "\u00e5".encode() quote = b'"' escape_input = quote + (base * 1024 * 1024 * 2) + quote diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 7c58afe867440..ae63b6af3a8b6 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -577,7 +577,7 @@ def test_file_handles_mmap(c_parser_only, csv1): # Don't close user provided file handles. 
parser = c_parser_only - with open(csv1, "r") as f: + with open(csv1) as f: m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) parser.read_csv(m) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1d8d5a29686a4..49358fe2ecfe4 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1726,7 +1726,7 @@ def test_iteration_open_handle(all_parsers): with open(path, "w") as f: f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG") - with open(path, "r") as f: + with open(path) as f: for line in f: if "CCC" in line: break diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index de7b3bed034c7..f23b498c7388a 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -27,7 +27,7 @@ def test_bytes_io_input(all_parsers): def test_read_csv_unicode(all_parsers): parser = all_parsers - data = BytesIO("\u0141aski, Jan;1".encode("utf-8")) + data = BytesIO("\u0141aski, Jan;1".encode()) result = parser.read_csv(data, sep=";", encoding="utf-8", header=None) expected = DataFrame([["\u0141aski, Jan", 1]]) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index e982667f06f31..127d0dc4c9829 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -173,9 +173,7 @@ def test_read_csv_compat(): def test_bytes_io_input(): - result = read_fwf( - BytesIO("שלום\nשלום".encode("utf8")), widths=[2, 2], encoding="utf8" - ) + result = read_fwf(BytesIO("שלום\nשלום".encode()), widths=[2, 2], encoding="utf8") expected = DataFrame([["של", "ום"]], columns=["של", "ום"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/pytables/common.py b/pandas/tests/io/pytables/common.py index aad18890de3ad..7e7a76e287d32 100644 --- a/pandas/tests/io/pytables/common.py +++ b/pandas/tests/io/pytables/common.py @@ -25,7 +25,7 @@ def safe_close(store): 
try: if store is not None: store.close() - except IOError: + except OSError: pass diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 85a12a13d19fb..ede8d61490778 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -339,7 +339,7 @@ def test_constructor_bad_file(self, mmap_file): with pytest.raises(err, match=msg): icom._MMapWrapper(non_file) - target = open(mmap_file, "r") + target = open(mmap_file) target.close() msg = "I/O operation on closed file" @@ -347,7 +347,7 @@ def test_constructor_bad_file(self, mmap_file): icom._MMapWrapper(target) def test_get_attr(self, mmap_file): - with open(mmap_file, "r") as target: + with open(mmap_file) as target: wrapper = icom._MMapWrapper(target) attrs = dir(wrapper.mmap) @@ -360,7 +360,7 @@ def test_get_attr(self, mmap_file): assert not hasattr(wrapper, "foo") def test_next(self, mmap_file): - with open(mmap_file, "r") as target: + with open(mmap_file) as target: wrapper = icom._MMapWrapper(target) lines = target.readlines() diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 2c93dbb5b6b83..59034e9f3d807 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -114,7 +114,7 @@ def test_to_html_compat(self): c_idx_names=False, r_idx_names=False, ) - .applymap("{0:.3f}".format) + .applymap("{:.3f}".format) .astype(float) ) out = df.to_html() @@ -616,7 +616,7 @@ def try_remove_ws(x): @pytest.mark.slow def test_gold_canyon(self): gc = "Gold Canyon" - with open(self.banklist_data, "r") as f: + with open(self.banklist_data) as f: raw_text = f.read() assert gc in raw_text diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 1edcc937f72c3..32a15e6201037 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -281,7 +281,6 @@ def _get_exec(self): @pytest.fixture(params=[("io", "data", "csv", "iris.csv")]) def load_iris_data(self, datapath, request): - import io iris_csv_file = 
datapath(*request.param) @@ -291,7 +290,7 @@ def load_iris_data(self, datapath, request): self.drop_table("iris") self._get_exec().execute(SQL_STRINGS["create_iris"][self.flavor]) - with io.open(iris_csv_file, mode="r", newline=None) as iris_csv: + with open(iris_csv_file, mode="r", newline=None) as iris_csv: r = csv.reader(iris_csv) next(r) # skip header row ins = SQL_STRINGS["insert_iris"][self.flavor] diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index ce13762ea8f86..82e0e52c089a2 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -386,7 +386,7 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): "get_dummies_kwargs,expected", [ ( - {"data": DataFrame(({"ä": ["a"]}))}, + {"data": DataFrame({"ä": ["a"]})}, DataFrame({"ä_a": [1]}, dtype=np.uint8), ), ( diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 0a7dfbee4e672..5c4d7e191d1bb 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -28,9 +28,9 @@ def test_format(): assert format(NA, ">10") == " " assert format(NA, "xxx") == "" # NA is flexible, accept any format spec - assert "{}".format(NA) == "" - assert "{:>10}".format(NA) == " " - assert "{:xxx}".format(NA) == "" + assert f"{NA}" == "" + assert f"{NA:>10}" == " " + assert f"{NA:xxx}" == "" def test_truthiness(): diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index e39083b709f38..6ba55ce3c74b9 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -180,7 +180,7 @@ def test_td64_summation_overflow(self): # mean result = (s - s.min()).mean() - expected = pd.Timedelta((pd.TimedeltaIndex((s - s.min())).asi8 / len(s)).sum()) + expected = pd.Timedelta((pd.TimedeltaIndex(s - s.min()).asi8 / len(s)).sum()) # the computation is converted to float so # might be some loss of 
precision diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 8ac0a55e63cd1..1b5fddaf14335 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -49,7 +49,7 @@ class TestSeriesConstructors: (lambda: Series({}), True), (lambda: Series(()), False), # creates a RangeIndex (lambda: Series([]), False), # creates a RangeIndex - (lambda: Series((_ for _ in [])), False), # creates a RangeIndex + (lambda: Series(_ for _ in []), False), # creates a RangeIndex (lambda: Series(data=None), True), (lambda: Series(data={}), True), (lambda: Series(data=()), False), # creates a RangeIndex @@ -222,8 +222,7 @@ def test_constructor_iterable(self): # GH 21987 class Iter: def __iter__(self): - for i in range(10): - yield i + yield from range(10) expected = Series(list(range(10)), dtype="int64") result = Series(Iter(), dtype="int64") diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index bcc0b18134dad..ae89e16ca7667 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -137,13 +137,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) s = ts.astype(str) - expected = Series([str("2010-01-04")]) + expected = Series(["2010-01-04"]) tm.assert_series_equal(s, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) s = ts.astype(str) - expected = Series([str("2010-01-04 00:00:00-05:00")]) + expected = Series(["2010-01-04 00:00:00-05:00"]) tm.assert_series_equal(s, expected) def test_astype_str_cast_td64(self): @@ -152,7 +152,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="d")]) ser = td.astype(str) - expected = Series([str("1 days")]) + expected = Series(["1 days"]) tm.assert_series_equal(ser, expected) def test_astype_unicode(self): @@ -167,7 +167,7 @@ def test_astype_unicode(self): former_encoding = None if sys.getdefaultencoding() == 
"utf-8": - test_series.append(Series(["野菜食べないとやばい".encode("utf-8")])) + test_series.append(Series(["野菜食べないとやばい".encode()])) for s in test_series: res = s.astype("unicode") diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py index 708118e950686..b12ebd58e6a7b 100644 --- a/pandas/tests/series/test_io.py +++ b/pandas/tests/series/test_io.py @@ -66,12 +66,11 @@ def test_from_csv(self, datetime_series, string_series): tm.assert_series_equal(check_series, series) def test_to_csv(self, datetime_series): - import io with tm.ensure_clean() as path: datetime_series.to_csv(path, header=False) - with io.open(path, newline=None) as f: + with open(path, newline=None) as f: lines = f.readlines() assert lines[1] != "\n" diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index 62ec6b9ef07af..b654e27737359 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -212,7 +212,7 @@ def find_titles(rst_file: str) -> Iterable[Tuple[str, int]]: The corresponding line number of the heading. 
""" - with open(rst_file, "r") as fd: + with open(rst_file) as fd: previous_line = "" for i, line in enumerate(fd): line = line[:-1] @@ -250,10 +250,9 @@ def find_rst_files(source_paths: List[str]) -> Iterable[str]: elif directory_address.endswith(".rst"): yield directory_address else: - for filename in glob.glob( + yield from glob.glob( pathname=f"{directory_address}/**/*.rst", recursive=True - ): - yield filename + ) def main(source_paths: List[str], output_format: str) -> int: diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 4a0e859535215..b6ffab1482bbc 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -447,7 +447,7 @@ def main( if os.path.isfile(source_path): file_path = source_path - with open(file_path, "r") as file_obj: + with open(file_path) as file_obj: for line_number, msg in function(file_obj): is_failed = True print( @@ -466,7 +466,7 @@ def main( continue file_path = os.path.join(subdir, file_name) - with open(file_path, "r") as file_obj: + with open(file_path) as file_obj: for line_number, msg in function(file_obj): is_failed = True print( diff --git a/setup.py b/setup.py index a8dfeb0974195..8f447d5c38169 100755 --- a/setup.py +++ b/setup.py @@ -99,7 +99,7 @@ def render_templates(cls, pxifiles): # if .pxi.in is not updated, no need to output .pxi continue - with open(pxifile, "r") as f: + with open(pxifile) as f: tmpl = f.read() pyxcontent = tempita.sub(tmpl) diff --git a/versioneer.py b/versioneer.py index 5882349f65f0b..65c9523ba5573 100644 --- a/versioneer.py +++ b/versioneer.py @@ -349,7 +349,7 @@ import sys -class VersioneerConfig(object): +class VersioneerConfig: pass @@ -398,7 +398,7 @@ def get_config_from_root(root): # the top of versioneer.py for instructions on writing your setup.cfg . 
setup_cfg = os.path.join(root, "setup.cfg") parser = configparser.SafeConfigParser() - with open(setup_cfg, "r") as f: + with open(setup_cfg) as f: parser.readfp(f) VCS = parser.get("versioneer", "VCS") # mandatory @@ -451,7 +451,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): stderr=(subprocess.PIPE if hide_stderr else None), ) break - except EnvironmentError: + except OSError: e = sys.exc_info()[1] if e.errno == errno.ENOENT: continue @@ -461,7 +461,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False): return None else: if verbose: - print("unable to find command, tried %s" % (commands,)) + print(f"unable to find command, tried {commands}") return None stdout = p.communicate()[0].strip().decode() @@ -946,7 +946,7 @@ def git_get_keywords(versionfile_abs): # _version.py. keywords = {} try: - f = open(versionfile_abs, "r") + f = open(versionfile_abs) for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) @@ -957,7 +957,7 @@ def git_get_keywords(versionfile_abs): if mo: keywords["full"] = mo.group(1) f.close() - except EnvironmentError: + except OSError: pass return keywords @@ -1072,9 +1072,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): if verbose: fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) - pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % ( - full_tag, - tag_prefix, + pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( + full_tag, tag_prefix, ) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix) :] @@ -1111,13 +1110,13 @@ def do_vcs_install(manifest_in, versionfile_source, ipy): files.append(versioneer_file) present = False try: - f = open(".gitattributes", "r") + f = open(".gitattributes") for line in f.readlines(): if line.strip().startswith(versionfile_source): if "export-subst" in line.strip().split()[1:]: present = True f.close() - 
except EnvironmentError: + except OSError: pass if not present: f = open(".gitattributes", "a+") @@ -1171,7 +1170,7 @@ def versions_from_file(filename): try: with open(filename) as f: contents = f.read() - except EnvironmentError: + except OSError: raise NotThisMethod("unable to read _version.py") mo = re.search( r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S @@ -1187,7 +1186,7 @@ def write_to_version_file(filename, versions): with open(filename, "w") as f: f.write(SHORT_VERSION_PY % contents) - print("set %s to '%s'" % (filename, versions["version"])) + print("set {} to '{}'".format(filename, versions["version"])) def plus_or_dot(pieces): @@ -1399,7 +1398,7 @@ def get_versions(verbose=False): try: ver = versions_from_file(versionfile_abs) if verbose: - print("got version from file %s %s" % (versionfile_abs, ver)) + print(f"got version from file {versionfile_abs} {ver}") return ver except NotThisMethod: pass @@ -1619,11 +1618,7 @@ def do_setup(): root = get_root() try: cfg = get_config_from_root(root) - except ( - EnvironmentError, - configparser.NoSectionError, - configparser.NoOptionError, - ) as e: + except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e: if isinstance(e, (EnvironmentError, configparser.NoSectionError)): print("Adding sample versioneer config to setup.cfg", file=sys.stderr) with open(os.path.join(root, "setup.cfg"), "a") as f: @@ -1648,9 +1643,9 @@ def do_setup(): ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py") if os.path.exists(ipy): try: - with open(ipy, "r") as f: + with open(ipy) as f: old = f.read() - except EnvironmentError: + except OSError: old = "" if INIT_PY_SNIPPET not in old: print(" appending to %s" % ipy) @@ -1669,12 +1664,12 @@ def do_setup(): manifest_in = os.path.join(root, "MANIFEST.in") simple_includes = set() try: - with open(manifest_in, "r") as f: + with open(manifest_in) as f: for line in f: if line.startswith("include "): for include in 
line.split()[1:]: simple_includes.add(include) - except EnvironmentError: + except OSError: pass # That doesn't cover everything MANIFEST.in can do # (https://docs.python.org/2/distutils/sourcedist.html#commands), so @@ -1707,7 +1702,7 @@ def scan_setup_py(): found = set() setters = False errors = 0 - with open("setup.py", "r") as f: + with open("setup.py") as f: for line in f.readlines(): if "import versioneer" in line: found.add("import") From 06c17a772a9e25cef66770db04c4bc5c56a328b8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Fri, 18 Sep 2020 17:50:17 -0500 Subject: [PATCH 0835/1025] CI: Revert PR template (#36460) --- .github/PULL_REQUEST_TEMPLATE.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 5e4d3b4ec38e4..7c3870470f074 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,9 +1,3 @@ ---- - -labels: "Needs Review" - ---- - - [ ] closes #xxxx - [ ] tests added / passed - [ ] passes `black pandas` From 56cc3feaf347c6402dc8d4827d3835205dd2e085 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 18 Sep 2020 20:56:47 -0400 Subject: [PATCH 0836/1025] BUG: Concat typing (#36409) --- pandas/core/reshape/concat.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index dd4bcf77641ef..a07c7b49ac55b 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -31,6 +31,7 @@ if TYPE_CHECKING: from pandas import DataFrame + from pandas.core.generic import NDFrame # --------------------------------------------------------------------- # Concatenate DataFrame objects @@ -54,7 +55,7 @@ def concat( @overload def concat( - objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], + objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]], 
axis=0, join: str = "outer", ignore_index: bool = False, @@ -69,7 +70,7 @@ def concat( def concat( - objs: Union[Iterable[FrameOrSeries], Mapping[Label, FrameOrSeries]], + objs: Union[Iterable["NDFrame"], Mapping[Label, "NDFrame"]], axis=0, join="outer", ignore_index: bool = False, From 549acccdcf34f46308fd3681e7e021f6fff47349 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 18 Sep 2020 17:58:25 -0700 Subject: [PATCH 0837/1025] REF: MultiIndex._validate_insert_value, IntervaArray._validate_setitem_value (#36461) --- pandas/core/arrays/interval.py | 68 ++++++++++++++++++---------------- pandas/core/indexes/multi.py | 16 +++++--- 2 files changed, 46 insertions(+), 38 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index ff9dd3f2a85bc..f9f68004bcc23 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -547,38 +547,7 @@ def __getitem__(self, value): return self._shallow_copy(left, right) def __setitem__(self, key, value): - # na value: need special casing to set directly on numpy arrays - needs_float_conversion = False - if is_scalar(value) and isna(value): - if is_integer_dtype(self.dtype.subtype): - # can't set NaN on a numpy integer array - needs_float_conversion = True - elif is_datetime64_any_dtype(self.dtype.subtype): - # need proper NaT to set directly on the numpy array - value = np.datetime64("NaT") - elif is_timedelta64_dtype(self.dtype.subtype): - # need proper NaT to set directly on the numpy array - value = np.timedelta64("NaT") - value_left, value_right = value, value - - # scalar interval - elif is_interval_dtype(value) or isinstance(value, Interval): - self._check_closed_matches(value, name="value") - value_left, value_right = value.left, value.right - - else: - # list-like of intervals - try: - array = IntervalArray(value) - value_left, value_right = array.left, array.right - except TypeError as err: - # wrong type: not interval or NA - msg = f"'value' should be an interval 
type, got {type(value)} instead." - raise TypeError(msg) from err - - if needs_float_conversion: - raise ValueError("Cannot set float NaN to integer-backed IntervalArray") - + value_left, value_right = self._validate_setitem_value(value) key = check_array_indexer(self, key) # Need to ensure that left and right are updated atomically, so we're @@ -898,6 +867,41 @@ def _validate_insert_value(self, value): ) return left_insert, right_insert + def _validate_setitem_value(self, value): + needs_float_conversion = False + + if is_scalar(value) and isna(value): + # na value: need special casing to set directly on numpy arrays + if is_integer_dtype(self.dtype.subtype): + # can't set NaN on a numpy integer array + needs_float_conversion = True + elif is_datetime64_any_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.datetime64("NaT") + elif is_timedelta64_dtype(self.dtype.subtype): + # need proper NaT to set directly on the numpy array + value = np.timedelta64("NaT") + value_left, value_right = value, value + + elif is_interval_dtype(value) or isinstance(value, Interval): + # scalar interval + self._check_closed_matches(value, name="value") + value_left, value_right = value.left, value.right + + else: + try: + # list-like of intervals + array = IntervalArray(value) + value_left, value_right = array.left, array.right + except TypeError as err: + # wrong type: not interval or NA + msg = f"'value' should be an interval type, got {type(value)} instead." + raise TypeError(msg) from err + + if needs_float_conversion: + raise ValueError("Cannot set float NaN to integer-backed IntervalArray") + return value_left, value_right + def value_counts(self, dropna=True): """ Returns a Series containing counts of each interval. 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a21a54e4a9be3..cd3e384837280 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3596,6 +3596,15 @@ def astype(self, dtype, copy=True): return self._shallow_copy() return self + def _validate_insert_value(self, item): + if not isinstance(item, tuple): + # Pad the key with empty strings if lower levels of the key + # aren't specified: + item = (item,) + ("",) * (self.nlevels - 1) + elif len(item) != self.nlevels: + raise ValueError("Item must have length equal to number of levels.") + return item + def insert(self, loc: int, item): """ Make new MultiIndex inserting new item at location @@ -3610,12 +3619,7 @@ def insert(self, loc: int, item): ------- new_index : Index """ - # Pad the key with empty strings if lower levels of the key - # aren't specified: - if not isinstance(item, tuple): - item = (item,) + ("",) * (self.nlevels - 1) - elif len(item) != self.nlevels: - raise ValueError("Item must have length equal to number of levels.") + item = self._validate_insert_value(item) new_levels = [] new_codes = [] From fe71a12b44e60553e9040f348bf6354597660c64 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 19 Sep 2020 04:13:30 +0200 Subject: [PATCH 0838/1025] DEPR: Index.to_native_types (#36418) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/base.py | 8 +++++ pandas/io/formats/csvs.py | 6 ++-- .../tests/indexes/datetimes/test_formats.py | 34 +++++++++++++------ pandas/tests/indexes/interval/test_formats.py | 2 +- pandas/tests/indexes/period/test_formats.py | 16 +++------ 6 files changed, 41 insertions(+), 26 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 33a5b016a293f..1286577748afa 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -212,6 +212,7 @@ Deprecations - Deprecated parameter ``dtype`` in :~meth:`Index.copy` on method all index classes. 
Use the :meth:`Index.astype` method instead for changing dtype(:issue:`35853`) - Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) - :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) +- The :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 962a425afbd1e..5f2b901844dad 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1021,6 +1021,8 @@ def to_native_types(self, slicer=None, **kwargs): """ Format specified values of `self` and return them. + .. deprecated:: 1.2.0 + Parameters ---------- slicer : int, array-like @@ -1042,6 +1044,12 @@ def to_native_types(self, slicer=None, **kwargs): numpy.ndarray Formatted values. """ + warnings.warn( + "The 'to_native_types' method is deprecated and will be removed in " + "a future version. 
Use 'astype(str)' instead.", + FutureWarning, + stacklevel=2, + ) values = self if slicer is not None: values = values[slicer] diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 4250a08f748d7..d0e9163fc5f11 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -154,7 +154,7 @@ def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: if cols is not None: if isinstance(cols, ABCIndexClass): - cols = cols.to_native_types(**self._number_format) + cols = cols._format_native_types(**self._number_format) else: cols = list(cols) self.obj = self.obj.loc[:, cols] @@ -163,7 +163,7 @@ def _refine_cols(self, cols: Optional[Sequence[Label]]) -> Sequence[Label]: # and make sure sure cols is just a list of labels cols = self.obj.columns if isinstance(cols, ABCIndexClass): - return cols.to_native_types(**self._number_format) + return cols._format_native_types(**self._number_format) else: assert isinstance(cols, Sequence) return list(cols) @@ -341,5 +341,5 @@ def _save_chunk(self, start_i: int, end_i: int) -> None: res = df._mgr.to_native_types(**self._number_format) data = [res.iget_values(i) for i in range(len(res.items))] - ix = self.data_index.to_native_types(slicer=slicer, **self._number_format) + ix = self.data_index[slicer]._format_native_types(**self._number_format) libwriters.write_csv_rows(data, ix, self.nlevels, self.cols, self.writer) diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py index f34019e06fd5f..a98a96b436107 100644 --- a/pandas/tests/indexes/datetimes/test_formats.py +++ b/pandas/tests/indexes/datetimes/test_formats.py @@ -10,41 +10,53 @@ import pandas._testing as tm -def test_to_native_types(): +def test_to_native_types_method_deprecated(): index = pd.date_range(freq="1D", periods=3, start="2017-01-01") - - # First, with no arguments. 
expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object) - result = index.to_native_types() - tm.assert_numpy_array_equal(result, expected) + with tm.assert_produces_warning(FutureWarning): + result = index.to_native_types() - # No NaN values, so na_rep has no effect - result = index.to_native_types(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure slicing works expected = np.array(["2017-01-01", "2017-01-03"], dtype=object) - result = index.to_native_types([0, 2]) + with tm.assert_produces_warning(FutureWarning): + result = index.to_native_types([0, 2]) + + tm.assert_numpy_array_equal(result, expected) + + +def test_to_native_types(): + index = pd.date_range(freq="1D", periods=3, start="2017-01-01") + + # First, with no arguments. + expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object) + + result = index._format_native_types() + tm.assert_numpy_array_equal(result, expected) + + # No NaN values, so na_rep has no effect + result = index._format_native_types(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure date formatting works expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype=object) - result = index.to_native_types(date_format="%m-%Y-%d") + result = index._format_native_types(date_format="%m-%Y-%d") tm.assert_numpy_array_equal(result, expected) # NULL object handling should work index = DatetimeIndex(["2017-01-01", pd.NaT, "2017-01-03"]) expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) - result = index.to_native_types() + result = index._format_native_types() tm.assert_numpy_array_equal(result, expected) expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) - result = index.to_native_types(na_rep="pandas") + result = index._format_native_types(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py 
index 7acf5c1e0906c..0e8d7d1ba5aba 100644 --- a/pandas/tests/indexes/interval/test_formats.py +++ b/pandas/tests/indexes/interval/test_formats.py @@ -73,6 +73,6 @@ def test_repr_missing(self, constructor, expected): def test_to_native_types(self, tuples, closed, expected_data): # GH 28210 index = IntervalIndex.from_tuples(tuples, closed=closed) - result = index.to_native_types() + result = index._format_native_types() expected = np.array(expected_data) tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py index 5db373a9f07ae..150a797169c14 100644 --- a/pandas/tests/indexes/period/test_formats.py +++ b/pandas/tests/indexes/period/test_formats.py @@ -12,35 +12,29 @@ def test_to_native_types(): # First, with no arguments. expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype="=U10") - result = index.to_native_types() + result = index._format_native_types() tm.assert_numpy_array_equal(result, expected) # No NaN values, so na_rep has no effect - result = index.to_native_types(na_rep="pandas") - tm.assert_numpy_array_equal(result, expected) - - # Make sure slicing works - expected = np.array(["2017-01-01", "2017-01-03"], dtype="=U10") - - result = index.to_native_types([0, 2]) + result = index._format_native_types(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) # Make sure date formatting works expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype="=U10") - result = index.to_native_types(date_format="%m-%Y-%d") + result = index._format_native_types(date_format="%m-%Y-%d") tm.assert_numpy_array_equal(result, expected) # NULL object handling should work index = PeriodIndex(["2017-01-01", pd.NaT, "2017-01-03"], freq="D") expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object) - result = index.to_native_types() + result = index._format_native_types() tm.assert_numpy_array_equal(result, expected) expected = 
np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object) - result = index.to_native_types(na_rep="pandas") + result = index._format_native_types(na_rep="pandas") tm.assert_numpy_array_equal(result, expected) From b4ab8d52742360f1d332a9194a3bfd02243a94a6 Mon Sep 17 00:00:00 2001 From: Hans Date: Sat, 19 Sep 2020 04:14:25 +0200 Subject: [PATCH 0839/1025] BUG: fix isin with nans and large arrays (#36266) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/algorithms.py | 7 ++++++- pandas/tests/test_algos.py | 18 +++++++++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 1d386fa372ce1..7d658215d7b76 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -47,6 +47,7 @@ Bug fixes - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) - Bug in :meth:`DataFrame.sort_values` raising an ``AttributeError`` when sorting on a key that casts column to categorical dtype (:issue:`36383`) - Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) +- Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 2ce3f2d9a7bfa..50d1810fee30d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -440,7 +440,12 @@ def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: # GH16012 # Ensure np.in1d doesn't get object types or it *may* throw an exception if len(comps) > 1_000_000 and not is_object_dtype(comps): - f = np.in1d + # If the values include nan we need to check for nan explicitly + # since np.nan is not equal to np.nan + if np.isnan(values).any(): + f = lambda c, v: np.logical_or(np.in1d(c, v), np.isnan(c)) + else: + f = np.in1d elif is_integer_dtype(comps): try: values = values.astype("int64", copy=False) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index a2c2ae22a0b62..6102f43f4db6a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -801,7 +801,6 @@ def test_i8(self): tm.assert_numpy_array_equal(result, expected) def test_large(self): - s = pd.date_range("20000101", periods=2000000, freq="s").values result = algos.isin(s, s[0:2]) expected = np.zeros(len(s), dtype=bool) @@ -841,6 +840,23 @@ def test_same_nan_is_in(self): result = algos.isin(comps, values) tm.assert_numpy_array_equal(expected, result) + def test_same_nan_is_in_large(self): + # https://github.com/pandas-dev/pandas/issues/22205 + s = np.tile(1.0, 1_000_001) + s[0] = np.nan + result = algos.isin(s, [np.nan, 1]) + expected = np.ones(len(s), dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + def test_same_nan_is_in_large_series(self): + # https://github.com/pandas-dev/pandas/issues/22205 + s = np.tile(1.0, 1_000_001) + series = pd.Series(s) + s[0] = np.nan + result = series.isin([np.nan, 1]) + expected = pd.Series(np.ones(len(s), dtype=bool)) + tm.assert_series_equal(result, expected) + def test_same_object_is_in(self): # GH 22160 # there could be special treatment 
for nans From e49616b8e8f941392abe3777fd75aa797e3d8aa4 Mon Sep 17 00:00:00 2001 From: sm1899 <42005691+sm1899@users.noreply.github.com> Date: Sat, 19 Sep 2020 13:06:12 +0530 Subject: [PATCH 0840/1025] Remove unnecessary trailing commas (#36463) * removed trailing comma * removed trailing commas * doc/make.py * doc/make.py * Update make.py --- doc/make.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/make.py b/doc/make.py index db729853e5834..94fbfa9382d81 100755 --- a/doc/make.py +++ b/doc/make.py @@ -291,7 +291,7 @@ def main(): joined = ", ".join(cmds) argparser.add_argument( - "command", nargs="?", default="html", help=f"command to run: {joined}", + "command", nargs="?", default="html", help=f"command to run: {joined}" ) argparser.add_argument( "--num-jobs", type=int, default=0, help="number of jobs used by sphinx-build" From 0e12eaaa2f4bfadb7aa8760b2049025afa48c2f8 Mon Sep 17 00:00:00 2001 From: attack68 <24256554+attack68@users.noreply.github.com> Date: Sat, 19 Sep 2020 17:21:56 +0200 Subject: [PATCH 0841/1025] PERF: styler uuid control and security (#36345) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/formats/style.py | 16 +++++++++++++--- pandas/tests/io/formats/test_style.py | 20 ++++++++++++++++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 1286577748afa..5882b74aa8b05 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -226,6 +226,7 @@ Performance improvements - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) +- ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index b5f0bc0a832c2..1df37da3da8d0 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -18,7 +18,7 @@ Tuple, Union, ) -from uuid import uuid1 +from uuid import uuid4 import numpy as np @@ -89,6 +89,12 @@ class Styler: .. versionadded:: 1.0.0 + uuid_len : int, default 5 + If ``uuid`` is not specified, the length of the ``uuid`` to randomly generate + expressed in hex characters, in range [0, 32]. + + .. versionadded:: 1.2.0 + Attributes ---------- env : Jinja2 jinja2.Environment @@ -144,6 +150,7 @@ def __init__( table_attributes: Optional[str] = None, cell_ids: bool = True, na_rep: Optional[str] = None, + uuid_len: int = 5, ): self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list) self._todo: List[Tuple[Callable, Tuple, Dict]] = [] @@ -159,7 +166,10 @@ def __init__( self.index = data.index self.columns = data.columns - self.uuid = uuid + if not isinstance(uuid_len, int) or not uuid_len >= 0: + raise TypeError("``uuid_len`` must be an integer in range [0, 32].") + self.uuid_len = min(32, uuid_len) + self.uuid = (uuid or uuid4().hex[: self.uuid_len]) + "_" self.table_styles = table_styles self.caption = caption if precision is None: @@ -248,7 +258,7 @@ def _translate(self): precision = self.precision hidden_index = self.hidden_index hidden_columns = self.hidden_columns - uuid = self.uuid or str(uuid1()).replace("-", "_") + uuid = self.uuid ROW_HEADING_CLASS = "row_heading" COL_HEADING_CLASS = "col_heading" INDEX_NAME_CLASS = "index_name" diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index e7583e1ce2ce2..8d66a16fc2b7a 100644 --- a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -1718,6 +1718,26 @@ def test_colspan_w3(self): s = Styler(df, uuid="_", cell_ids=False) assert '' in s.render() + 
@pytest.mark.parametrize("len_", [1, 5, 32, 33, 100]) + def test_uuid_len(self, len_): + # GH 36345 + df = pd.DataFrame(data=[["A"]]) + s = Styler(df, uuid_len=len_, cell_ids=False).render() + strt = s.find('id="T_') + end = s[strt + 6 :].find('"') + if len_ > 32: + assert end == 32 + 1 + else: + assert end == len_ + 1 + + @pytest.mark.parametrize("len_", [-2, "bad", None]) + def test_uuid_len_raises(self, len_): + # GH 36345 + df = pd.DataFrame(data=[["A"]]) + msg = "``uuid_len`` must be an integer in range \\[0, 32\\]." + with pytest.raises(TypeError, match=msg): + Styler(df, uuid_len=len_, cell_ids=False).render() + @td.skip_if_no_mpl class TestStylerMatplotlibDep: From 77194301819fa02ad9fb2ae10ac5b043b12be9c4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Sep 2020 08:34:44 -0700 Subject: [PATCH 0842/1025] Align cython and python reduction code paths (#36459) --- pandas/_libs/reduction.pyx | 17 ++++++++++++----- pandas/core/groupby/generic.py | 15 +++++++++++---- pandas/core/groupby/ops.py | 23 +++++++++++------------ 3 files changed, 34 insertions(+), 21 deletions(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 8161b5c5c2b11..3a0fda5aed620 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -16,12 +16,12 @@ from pandas._libs cimport util from pandas._libs.lib import is_scalar, maybe_convert_objects -cdef _check_result_array(object obj, Py_ssize_t cnt): +cpdef check_result_array(object obj, Py_ssize_t cnt): if (util.is_array(obj) or (isinstance(obj, list) and len(obj) == cnt) or getattr(obj, 'shape', None) == (cnt,)): - raise ValueError('Function does not reduce') + raise ValueError('Must produce aggregated value') cdef class _BaseGrouper: @@ -74,12 +74,14 @@ cdef class _BaseGrouper: cached_ityp._engine.clear_mapping() cached_ityp._cache.clear() # e.g. 
inferred_freq must go res = self.f(cached_typ) - res = _extract_result(res) + res = extract_result(res) if not initialized: # On the first pass, we check the output shape to see # if this looks like a reduction. initialized = True - _check_result_array(res, len(self.dummy_arr)) + # In all tests other than test_series_grouper and + # test_series_bin_grouper, we have len(self.dummy_arr) == 0 + check_result_array(res, len(self.dummy_arr)) return res, initialized @@ -278,9 +280,14 @@ cdef class SeriesGrouper(_BaseGrouper): return result, counts -cdef inline _extract_result(object res, bint squeeze=True): +cpdef inline extract_result(object res, bint squeeze=True): """ extract the result object, it might be a 0-dim ndarray or a len-1 0-dim, or a scalar """ + if hasattr(res, "_values"): + # Preserve EA + res = res._values + if squeeze and res.ndim == 1 and len(res) == 1: + res = res[0] if hasattr(res, 'values') and util.is_array(res.values): res = res.values if util.is_array(res): diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bbccd22f2ae85..0705261d0c516 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -29,7 +29,7 @@ import numpy as np -from pandas._libs import lib +from pandas._libs import lib, reduction as libreduction from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion from pandas.util._decorators import Appender, Substitution, doc @@ -471,12 +471,19 @@ def _get_index() -> Index: def _aggregate_named(self, func, *args, **kwargs): result = {} + initialized = False for name, group in self: - group.name = name + # Each step of this loop corresponds to + # libreduction._BaseGrouper._apply_to_group + group.name = name # NB: libreduction does not pin name + output = func(group, *args, **kwargs) - if isinstance(output, (Series, Index, np.ndarray)): - raise ValueError("Must produce aggregated value") + output = libreduction.extract_result(output) + if not initialized: + # We only do 
this validation on the first iteration + libreduction.check_result_array(output, 0) + initialized = True result[name] = output return result diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index e9525f03368fa..b3f91d4623c84 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -623,7 +623,7 @@ def agg_series(self, obj: Series, func: F): try: return self._aggregate_series_fast(obj, func) except ValueError as err: - if "Function does not reduce" in str(err): + if "Must produce aggregated value" in str(err): # raised in libreduction pass else: @@ -653,27 +653,26 @@ def _aggregate_series_pure_python(self, obj: Series, func: F): group_index, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) - result = None + result = np.empty(ngroups, dtype="O") + initialized = False splitter = get_splitter(obj, group_index, ngroups, axis=0) for label, group in splitter: + + # Each step of this loop corresponds to + # libreduction._BaseGrouper._apply_to_group res = func(group) + res = libreduction.extract_result(res) - if result is None: - if isinstance(res, (Series, Index, np.ndarray)): - if len(res) == 1: - # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) - # FIXME: are we potentially losing important res.index info? - res = res.item() - else: - raise ValueError("Function does not reduce") - result = np.empty(ngroups, dtype="O") + if not initialized: + # We only do this validation on the first iteration + libreduction.check_result_array(res, 0) + initialized = True counts[label] = group.shape[0] result[label] = res - assert result is not None result = lib.maybe_convert_objects(result, try_float=0) # TODO: maybe_cast_to_extension_array? 
From 202737338fa7122dced8004fd5a8d5a49bc3d8f8 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 19 Sep 2020 14:52:24 -0500 Subject: [PATCH 0843/1025] Turn on stale GitHub action (#36476) --- .github/workflows/stale-pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index a6aece34478d9..e3b8d9336a5a6 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -18,4 +18,4 @@ jobs: days-before-stale: 30 days-before-close: -1 remove-stale-when-updated: true - debug-only: true + debug-only: false From 40e1ff0d2ccb776e74243530c35a7e49e6226fcf Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 19 Sep 2020 20:54:16 +0100 Subject: [PATCH 0844/1025] PERF: construct DataFrame with string array and dtype=str (#36432) --- asv_bench/benchmarks/strings.py | 17 ++++++++++++----- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/internals/construction.py | 20 +++++++++++--------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index 2023858181baa..d8b35abb94b9d 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -13,13 +13,20 @@ class Construction: param_names = ["dtype"] def setup(self, dtype): - self.data = tm.rands_array(nchars=10 ** 5, size=10) + self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) + self.frame_arr = self.series_arr.reshape((50_000, 2)).copy() - def time_construction(self, dtype): - Series(self.data, dtype=dtype) + def time_series_construction(self, dtype): + Series(self.series_arr, dtype=dtype) - def peakmem_construction(self, dtype): - Series(self.data, dtype=dtype) + def peakmem_series_construction(self, dtype): + Series(self.series_arr, dtype=dtype) + + def time_frame_construction(self, dtype): + DataFrame(self.frame_arr, dtype=dtype) + + def peakmem_frame_construction(self, 
dtype): + DataFrame(self.frame_arr, dtype=dtype) class Methods: diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 5882b74aa8b05..35d813962022d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -222,7 +222,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`) +- Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2d4163e0dee89..d19a0dd8f29e3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -13,6 +13,7 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, + construct_1d_ndarray_preserving_na, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -189,15 +190,16 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): # the dtypes will be coerced to a single dtype values = _prep_ndarray(values, copy=copy) - if dtype is not None: - if not is_dtype_equal(values.dtype, dtype): - try: - values = values.astype(dtype) - except Exception as orig: - # e.g. 
ValueError when trying to cast object dtype to float64 - raise ValueError( - f"failed to cast to '{dtype}' (Exception was: {orig})" - ) from orig + if dtype is not None and not is_dtype_equal(values.dtype, dtype): + try: + values = construct_1d_ndarray_preserving_na( + values.ravel(), dtype=dtype, copy=False + ).reshape(values.shape) + except Exception as orig: + # e.g. ValueError when trying to cast object dtype to float64 + raise ValueError( + f"failed to cast to '{dtype}' (Exception was: {orig})" + ) from orig # _prep_ndarray ensures that values.ndim == 2 at this point index, columns = _get_axes( From a6a8e0e7c83e8c386cabcec68b5446cb17f41a98 Mon Sep 17 00:00:00 2001 From: hardikpnsp Date: Sun, 20 Sep 2020 01:25:00 +0530 Subject: [PATCH 0845/1025] ASV: added benchamark tests for DataFrame.to_numpy() and .values (#36452) --- asv_bench/benchmarks/frame_methods.py | 40 +++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 44f71b392c0eb..70d90ded84545 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -219,6 +219,46 @@ def time_to_html_mixed(self): self.df2.to_html() +class ToNumpy: + def setup(self): + N = 10000 + M = 10 + self.df_tall = DataFrame(np.random.randn(N, M)) + self.df_wide = DataFrame(np.random.randn(M, N)) + self.df_mixed_tall = self.df_tall.copy() + self.df_mixed_tall["foo"] = "bar" + self.df_mixed_tall[0] = period_range("2000", periods=N) + self.df_mixed_tall[1] = range(N) + self.df_mixed_wide = self.df_wide.copy() + self.df_mixed_wide["foo"] = "bar" + self.df_mixed_wide[0] = period_range("2000", periods=M) + self.df_mixed_wide[1] = range(M) + + def time_to_numpy_tall(self): + self.df_tall.to_numpy() + + def time_to_numpy_wide(self): + self.df_wide.to_numpy() + + def time_to_numpy_mixed_tall(self): + self.df_mixed_tall.to_numpy() + + def time_to_numpy_mixed_wide(self): + self.df_mixed_wide.to_numpy() + 
+ def time_values_tall(self): + self.df_tall.values + + def time_values_wide(self): + self.df_wide.values + + def time_values_mixed_tall(self): + self.df_mixed_tall.values + + def time_values_mixed_wide(self): + self.df_mixed_wide.values + + class Repr: def setup(self): nrows = 10000 From a802d13818954fd8b8a57693ec6be65eb87bbdba Mon Sep 17 00:00:00 2001 From: Alex Lim Date: Sat, 19 Sep 2020 15:56:05 -0400 Subject: [PATCH 0846/1025] BUG: get_indexer returned dtype (#36431) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/hashtable.pxd | 2 +- pandas/_libs/hashtable_class_helper.pxi.in | 14 +++++++------- pandas/_libs/index.pyx | 6 +++--- pandas/tests/base/test_misc.py | 2 +- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 35d813962022d..7724acdd8602c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -295,6 +295,7 @@ Indexing - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) +- Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. 
(:issue:`36359`) - Missing diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 0499eabf708af..2650bea921b3f 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -1,7 +1,7 @@ from pandas._libs.khash cimport ( kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t, kh_str_t, uint64_t, int64_t, float64_t) -from numpy cimport ndarray +from numpy cimport ndarray, intp_t # prototypes for sharing diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 5e4da96d57e42..da91fa69b0dec 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -347,7 +347,7 @@ cdef class {{name}}HashTable(HashTable): int ret = 0 {{dtype}}_t val khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) + intp_t[:] locs = np.empty(n, dtype=np.intp) with nogil: for i in range(n): @@ -551,7 +551,7 @@ cdef class {{name}}HashTable(HashTable): def get_labels_groupby(self, const {{dtype}}_t[:] values): cdef: Py_ssize_t i, n = len(values) - int64_t[:] labels + intp_t[:] labels Py_ssize_t idx, count = 0 int ret = 0 {{dtype}}_t val @@ -559,7 +559,7 @@ cdef class {{name}}HashTable(HashTable): {{name}}Vector uniques = {{name}}Vector() {{name}}VectorData *ud - labels = np.empty(n, dtype=np.int64) + labels = np.empty(n, dtype=np.intp) ud = uniques.data with nogil: @@ -648,8 +648,8 @@ cdef class StringHashTable(HashTable): def get_indexer(self, ndarray[object] values): cdef: Py_ssize_t i, n = len(values) - ndarray[int64_t] labels = np.empty(n, dtype=np.int64) - int64_t *resbuf = labels.data + ndarray[intp_t] labels = np.empty(n, dtype=np.intp) + intp_t *resbuf = labels.data khiter_t k kh_str_t *table = self.table const char *v @@ -680,7 +680,7 @@ cdef class StringHashTable(HashTable): object val const char *v khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) + intp_t[:] locs = np.empty(n, dtype=np.intp) # these by-definition *must* be strings vecs = malloc(n * 
sizeof(char *)) @@ -986,7 +986,7 @@ cdef class PyObjectHashTable(HashTable): int ret = 0 object val khiter_t k - int64_t[:] locs = np.empty(n, dtype=np.int64) + intp_t[:] locs = np.empty(n, dtype=np.intp) for i in range(n): val = values[i] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 8155e7e6c074a..e31c3739f456d 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -266,7 +266,7 @@ cdef class IndexEngine: """ cdef: ndarray values, x - ndarray[int64_t] result, missing + ndarray[intp_t] result, missing set stargets, remaining_stargets dict d = {} object val @@ -283,8 +283,8 @@ cdef class IndexEngine: else: n_alloc = n - result = np.empty(n_alloc, dtype=np.int64) - missing = np.empty(n_t, dtype=np.int64) + result = np.empty(n_alloc, dtype=np.intp) + missing = np.empty(n_t, dtype=np.intp) # map each starget to its position in the index if stargets and len(stargets) < 5 and self.is_monotonic_increasing: diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py index 9523fba953ad0..b8468a5acf277 100644 --- a/pandas/tests/base/test_misc.py +++ b/pandas/tests/base/test_misc.py @@ -201,4 +201,4 @@ def test_get_indexer_non_unique_dtype_mismatch(): # GH 25459 indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0])) tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes) - tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing) + tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing) From dcafd27cba1f735ec499b47f2fef02677014280b Mon Sep 17 00:00:00 2001 From: Ali McMaster Date: Sat, 19 Sep 2020 20:59:11 +0100 Subject: [PATCH 0847/1025] Use https for network checks (#36480) --- pandas/_testing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index cd34bec52daef..3e3ba480ebfeb 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -2403,7 +2403,7 @@ def can_connect(url, error_classes=None): 
@optional_args def network( t, - url="http://www.google.com", + url="https://www.google.com", raise_on_error=_RAISE_NETWORK_ERROR_DEFAULT, check_before_test=False, error_classes=None, @@ -2427,7 +2427,7 @@ def network( The test requiring network connectivity. url : path The url to test via ``pandas.io.common.urlopen`` to check - for connectivity. Defaults to 'http://www.google.com'. + for connectivity. Defaults to 'https://www.google.com'. raise_on_error : bool If True, never catches errors. check_before_test : bool @@ -2471,7 +2471,7 @@ def network( You can specify alternative URLs:: - >>> @network("http://www.yahoo.com") + >>> @network("https://www.yahoo.com") ... def test_something_with_yahoo(): ... raise IOError("Failure Message") >>> test_something_with_yahoo() From b7eb2f5b3f8fc6b5ca922cdc991a452d7b1beaad Mon Sep 17 00:00:00 2001 From: Asish Mahapatra Date: Sat, 19 Sep 2020 16:14:05 -0400 Subject: [PATCH 0848/1025] BUG: Python Parser skipping over items if BOM present in first element of header (#36365) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/parsers.py | 10 ++++------ pandas/tests/io/parser/test_common.py | 10 ++++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7724acdd8602c..9c2e5427fcd3f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -321,6 +321,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) +- Bug in :meth:`read_csv` with `engine='python'` truncating data if multiple items present in first row and first 
element started with BOM (:issue:`36343`) Plotting ^^^^^^^^ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 43ffbe6bdd66c..bc622ab8c1f18 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2886,14 +2886,12 @@ def _check_for_bom(self, first_row): # quotation mark. if len(first_row_bom) > end + 1: new_row += first_row_bom[end + 1 :] - return [new_row] + first_row[1:] - elif len(first_row_bom) > 1: - return [first_row_bom[1:]] else: - # First row is just the BOM, so we - # return an empty string. - return [""] + + # No quotation so just remove BOM from first element + new_row = first_row_bom[1:] + return [new_row] + first_row[1:] def _is_line_empty(self, line): """ diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 49358fe2ecfe4..6bbc9bc9e1788 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2128,6 +2128,16 @@ def test_first_row_bom(all_parsers): tm.assert_frame_equal(result, expected) +def test_first_row_bom_unquoted(all_parsers): + # see gh-36343 + parser = all_parsers + data = """\ufeffHead1 Head2 Head3""" + + result = parser.read_csv(StringIO(data), delimiter="\t") + expected = DataFrame(columns=["Head1", "Head2", "Head3"]) + tm.assert_frame_equal(result, expected) + + def test_integer_precision(all_parsers): # Gh 7072 s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765 From 1601462f381315e8566f55c247eb3fd5a5920dfa Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Sat, 19 Sep 2020 16:24:18 -0400 Subject: [PATCH 0849/1025] PERF: pd.to_datetime, unit='s' much slower for float64 than for int64 (#35027) --- asv_bench/benchmarks/timeseries.py | 23 +++++++++++++ doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/tslib.pyx | 46 ++++++++++++++++---------- pandas/_libs/tslibs/conversion.pxd | 1 + pandas/tests/io/sas/data/datetime.csv | 4 +-- 
pandas/tests/tools/test_to_datetime.py | 8 +++-- 6 files changed, 61 insertions(+), 22 deletions(-) diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py index b494dbd8a38fa..27c904dda5b45 100644 --- a/asv_bench/benchmarks/timeseries.py +++ b/asv_bench/benchmarks/timeseries.py @@ -263,6 +263,29 @@ def time_lookup_and_cleanup(self): self.ts.index._cleanup() +class ToDatetimeFromIntsFloats: + def setup(self): + self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64") + self.ts_sec_float = self.ts_sec.astype("float64") + + self.ts_nanosec = 1_000_000 * self.ts_sec + self.ts_nanosec_float = self.ts_nanosec.astype("float64") + + # speed of int64 and float64 paths should be comparable + + def time_nanosec_int64(self): + to_datetime(self.ts_nanosec, unit="ns") + + def time_nanosec_float64(self): + to_datetime(self.ts_nanosec_float, unit="ns") + + def time_sec_int64(self): + to_datetime(self.ts_sec, unit="s") + + def time_sec_float64(self): + to_datetime(self.ts_sec_float, unit="s") + + class ToDatetimeYYYYMMDD: def setup(self): rng = date_range(start="1/1/2000", periods=10000, freq="D") diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 9c2e5427fcd3f..3402d73499d99 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -227,6 +227,7 @@ Performance improvements - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) +- Performance improvement in :meth:`pd.to_datetime` with non-`ns` time unit for `float` `dtype` columns (:issue:`20445`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index e4128af62d06d..b1b38505b9476 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -41,6 +41,7 @@ from pandas._libs.tslibs.conversion cimport ( cast_from_unit, convert_datetime_to_tsobject, get_datetime64_nanos, + precision_from_unit, ) from pandas._libs.tslibs.nattype cimport ( NPY_NAT, @@ -205,6 +206,7 @@ def array_with_unit_to_datetime( cdef: Py_ssize_t i, j, n=len(values) int64_t m + int prec = 0 ndarray[float64_t] fvalues bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' @@ -217,38 +219,48 @@ def array_with_unit_to_datetime( assert is_ignore or is_coerce or is_raise - if unit == 'ns': - if issubclass(values.dtype.type, np.integer): - result = values.astype('M8[ns]') + if unit == "ns": + if issubclass(values.dtype.type, (np.integer, np.float_)): + result = values.astype("M8[ns]", copy=False) else: result, tz = array_to_datetime(values.astype(object), errors=errors) return result, tz - m = cast_from_unit(None, unit) + m, p = precision_from_unit(unit) if is_raise: - - # try a quick conversion to i8 + # try a quick conversion to i8/f8 # if we have nulls that are not type-compat # then need to iterate - if values.dtype.kind == "i": - # Note: this condition makes the casting="same_kind" redundant - iresult = values.astype('i8', casting='same_kind', copy=False) - # fill by comparing to NPY_NAT constant + + if values.dtype.kind == "i" or values.dtype.kind == "f": + iresult = values.astype("i8", copy=False) + # fill missing values by comparing to NPY_NAT mask = iresult == NPY_NAT iresult[mask] = 0 - fvalues = iresult.astype('f8') * m + fvalues = iresult.astype("f8") * m need_to_iterate = False - # check the bounds if not need_to_iterate: - - if ((fvalues < Timestamp.min.value).any() - or (fvalues > Timestamp.max.value).any()): + # check the bounds + if (fvalues < Timestamp.min.value).any() or ( + 
(fvalues > Timestamp.max.value).any() + ): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - result = (iresult * m).astype('M8[ns]') - iresult = result.view('i8') + + if values.dtype.kind == "i": + result = (iresult * m).astype("M8[ns]") + + elif values.dtype.kind == "f": + fresult = (values * m).astype("f8") + fresult[mask] = 0 + if prec: + fresult = round(fresult, prec) + result = fresult.astype("M8[ns]", copy=False) + + iresult = result.view("i8") iresult[mask] = NPY_NAT + return result, tz result = np.empty(n, dtype='M8[ns]') diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 73772e5ab4577..56f5481b7e781 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -24,5 +24,6 @@ cdef int64_t get_datetime64_nanos(object val) except? -1 cpdef datetime localize_pydatetime(datetime dt, object tz) cdef int64_t cast_from_unit(object ts, str unit) except? -1 +cpdef (int64_t, int) precision_from_unit(str unit) cdef int64_t normalize_i8_stamp(int64_t local_val) nogil diff --git a/pandas/tests/io/sas/data/datetime.csv b/pandas/tests/io/sas/data/datetime.csv index 6126f6d04eaf0..f0d82f7fc494e 100644 --- a/pandas/tests/io/sas/data/datetime.csv +++ b/pandas/tests/io/sas/data/datetime.csv @@ -1,5 +1,5 @@ Date1,Date2,DateTime,DateTimeHi,Taiw -1677-09-22,1677-09-22,1677-09-21 00:12:44,1677-09-21 00:12:43.145226,1912-01-01 +1677-09-22,1677-09-22,1677-09-21 00:12:44,1677-09-21 00:12:43.145225,1912-01-01 1960-01-01,1960-01-01,1960-01-01 00:00:00,1960-01-01 00:00:00.000000,1960-01-01 2016-02-29,2016-02-29,2016-02-29 23:59:59,2016-02-29 23:59:59.123456,2016-02-29 -2262-04-11,2262-04-11,2262-04-11 23:47:16,2262-04-11 23:47:16.854774,2262-04-11 +2262-04-11,2262-04-11,2262-04-11 23:47:16,2262-04-11 23:47:16.854775,2262-04-11 diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index d2049892705ea..819474e1f32e7 100644 --- 
a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -1217,10 +1217,10 @@ def test_unit_mixed(self, cache): @pytest.mark.parametrize("cache", [True, False]) def test_unit_rounding(self, cache): - # GH 14156: argument will incur floating point errors but no - # premature rounding + # GH 14156 & GH 20445: argument will incur floating point errors + # but no premature rounding result = pd.to_datetime(1434743731.8770001, unit="s", cache=cache) - expected = pd.Timestamp("2015-06-19 19:55:31.877000093") + expected = pd.Timestamp("2015-06-19 19:55:31.877000192") assert result == expected @pytest.mark.parametrize("cache", [True, False]) @@ -1454,6 +1454,8 @@ def test_to_datetime_unit(self): ] + [NaT] ) + # GH20455 argument will incur floating point errors but no premature rounding + result = result.round("ms") tm.assert_series_equal(result, expected) s = pd.concat( From 59c6867f5f2380fead9808f9acd81012893a58ef Mon Sep 17 00:00:00 2001 From: Avinash Pancham <44933366+avinashpancham@users.noreply.github.com> Date: Sat, 19 Sep 2020 22:29:43 +0200 Subject: [PATCH 0850/1025] BUG: Enable Series.equals to compare numpy arrays to scalars (#36161) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/lib.pyx | 11 ++++++++++- pandas/tests/dtypes/test_missing.py | 15 +++++++++++++++ pandas/tests/series/methods/test_equals.py | 20 ++++++++++++++++---- 4 files changed, 42 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3402d73499d99..3ea3efb9d041f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -270,6 +270,7 @@ Numeric ^^^^^^^ - Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) - Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`) +- Bug in :meth:`Series.equals` where a ``ValueError`` was raised when numpy arrays were compared to scalars (:issue:`35267`) - 
Conversion diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cc63df90a9a9f..a57cf3b523985 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -591,7 +591,16 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: if "tz-naive and tz-aware" in str(err): return False raise - + except ValueError: + # Avoid raising ValueError when comparing Numpy arrays to other types + if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y): + # Only compare scalars to scalars and non-scalars to non-scalars + return False + elif (not (cnp.PyArray_IsPythonScalar(x) or cnp.PyArray_IsPythonScalar(y)) + and not (isinstance(x, type(y)) or isinstance(y, type(x)))): + # Check if non-scalars have the same type + return False + raise return True diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py index a642b23379c6f..046b82ef3131a 100644 --- a/pandas/tests/dtypes/test_missing.py +++ b/pandas/tests/dtypes/test_missing.py @@ -1,3 +1,4 @@ +from contextlib import nullcontext from datetime import datetime from decimal import Decimal @@ -383,6 +384,20 @@ def test_array_equivalent(dtype_equal): assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan])) +@pytest.mark.parametrize( + "val", [1, 1.1, 1 + 1j, True, "abc", [1, 2], (1, 2), {1, 2}, {"a": 1}, None] +) +def test_array_equivalent_series(val): + arr = np.array([1, 2]) + cm = ( + tm.assert_produces_warning(FutureWarning, check_stacklevel=False) + if isinstance(val, str) + else nullcontext() + ) + with cm: + assert not array_equivalent(Series([arr, arr]), Series([arr, val])) + + def test_array_equivalent_different_dtype_but_equal(): # Unclear if this is exposed anywhere in the public-facing API assert array_equivalent(np.array([1, 2]), np.array([1.0, 2.0])) diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py index 600154adfcda3..cf55482fefe22 100644 --- 
a/pandas/tests/series/methods/test_equals.py +++ b/pandas/tests/series/methods/test_equals.py @@ -1,7 +1,10 @@ +from contextlib import nullcontext + import numpy as np import pytest from pandas import MultiIndex, Series +import pandas._testing as tm @pytest.mark.parametrize( @@ -24,16 +27,25 @@ def test_equals(arr, idx): assert not s1.equals(s2) -def test_equals_list_array(): +@pytest.mark.parametrize( + "val", [1, 1.1, 1 + 1j, True, "abc", [1, 2], (1, 2), {1, 2}, {"a": 1}, None] +) +def test_equals_list_array(val): # GH20676 Verify equals operator for list of Numpy arrays arr = np.array([1, 2]) s1 = Series([arr, arr]) s2 = s1.copy() assert s1.equals(s2) - # TODO: Series equals should also work between single value and list - # s1[1] = 9 - # assert not s1.equals(s2) + s1[1] = val + + cm = ( + tm.assert_produces_warning(FutureWarning, check_stacklevel=False) + if isinstance(val, str) + else nullcontext() + ) + with cm: + assert not s1.equals(s2) def test_equals_false_negative(): From d1bba43d91843d3de8e006e20f287d6160d23274 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Sun, 20 Sep 2020 00:03:59 +0200 Subject: [PATCH 0851/1025] [BUG]: Rolling.sum() calculated wrong values when axis is one and dtypes are mixed (#36458) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/window/rolling.py | 8 ++++- pandas/tests/window/test_rolling.py | 48 +++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3ea3efb9d041f..cd53526cd4c79 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -344,7 +344,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.tshift` failing to raise ``ValueError`` when a frequency cannot be inferred for the index of a group (:issue:`35937`) - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, 
``shift`` (:issue:`29764`) - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) -- +- Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 21a7164411fb7..06c3ad23f904f 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -243,7 +243,13 @@ def _create_data(self, obj: FrameOrSeries) -> FrameOrSeries: if self.on is not None and not isinstance(self.on, Index): if obj.ndim == 2: obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) - + if self.axis == 1: + # GH: 20649 in case of mixed dtype and axis=1 we have to convert everything + # to float to calculate the complete row at once. We exclude all non-numeric + # dtypes. + obj = obj.select_dtypes(include=["integer", "float"], exclude=["timedelta"]) + obj = obj.astype("float64", copy=False) + obj._mgr = obj._mgr.consolidate() return obj def _gotitem(self, key, ndim, subset=None): diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 88afcec0f7bf4..4dfa0287bbb03 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -771,3 +771,51 @@ def test_rolling_numerical_too_large_numbers(): index=dates, ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + ("func", "value"), + [("sum", 2.0), ("max", 1.0), ("min", 1.0), ("mean", 1.0), ("median", 1.0)], +) +def test_rolling_mixed_dtypes_axis_1(func, value): + # GH: 20649 + df = pd.DataFrame(1, index=[1, 2], columns=["a", "b", "c"]) + df["c"] = 1.0 + result = getattr(df.rolling(window=2, min_periods=1, axis=1), func)() + expected = pd.DataFrame( + {"a": [1.0, 1.0], "b": [value, value], "c": [value, value]}, index=[1, 2] + ) + tm.assert_frame_equal(result, expected) + + 
+def test_rolling_axis_one_with_nan(): + # GH: 35596 + df = pd.DataFrame( + [ + [0, 1, 2, 4, np.nan, np.nan, np.nan], + [0, 1, 2, np.nan, np.nan, np.nan, np.nan], + [0, 2, 2, np.nan, 2, np.nan, 1], + ] + ) + result = df.rolling(window=7, min_periods=1, axis="columns").sum() + expected = pd.DataFrame( + [ + [0.0, 1.0, 3.0, 7.0, 7.0, 7.0, 7.0], + [0.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0], + [0.0, 2.0, 4.0, 4.0, 6.0, 6.0, 7.0], + ] + ) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "value", + ["test", pd.to_datetime("2019-12-31"), pd.to_timedelta("1 days 06:05:01.00003")], +) +def test_rolling_axis_1_non_numeric_dtypes(value): + # GH: 20649 + df = pd.DataFrame({"a": [1, 2]}) + df["b"] = value + result = df.rolling(window=2, min_periods=1, axis=1).sum() + expected = pd.DataFrame({"a": [1.0, 2.0]}) + tm.assert_frame_equal(result, expected) From 17ba0fc2d6652ce0dda121cd38d664f5e510df74 Mon Sep 17 00:00:00 2001 From: jeschwar <36767735+jeschwar@users.noreply.github.com> Date: Sat, 19 Sep 2020 16:05:11 -0600 Subject: [PATCH 0852/1025] BUG: fix duplicate entries in LaTeX List of Tables when using longtable environments (#36297) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/formats/latex.py | 22 +++++++++++++--- pandas/tests/io/formats/test_to_latex.py | 33 ++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index cd53526cd4c79..18940b574b517 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -323,6 +323,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects 
(:issue:`26237`, :issue:`29054`, and :issue:`29570`) +- Bug in :func:`LongTableBuilder.middle_separator` was duplicating LaTeX longtable entires in the List of Tables of a LaTeX document (:issue:`34360`) - Bug in :meth:`read_csv` with `engine='python'` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) Plotting diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index 8080d953da308..eb35fff3a4f8e 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -431,13 +431,18 @@ class LongTableBuilder(GenericTableBuilder): >>> from pandas.io.formats import format as fmt >>> df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) >>> formatter = fmt.DataFrameFormatter(df) - >>> builder = LongTableBuilder(formatter, caption='caption', label='lab', - ... column_format='lrl') + >>> builder = LongTableBuilder(formatter, caption='a long table', + ... label='tab:long', column_format='lrl') >>> table = builder.get_result() >>> print(table) \\begin{longtable}{lrl} - \\caption{caption} - \\label{lab}\\\\ + \\caption{a long table} + \\label{tab:long}\\\\ + \\toprule + {} & a & b \\\\ + \\midrule + \\endfirsthead + \\caption[]{a long table} \\\\ \\toprule {} & a & b \\\\ \\midrule @@ -476,7 +481,16 @@ def _caption_and_label(self) -> str: @property def middle_separator(self) -> str: iterator = self._create_row_iterator(over="header") + + # the content between \endfirsthead and \endhead commands + # mitigates repeated List of Tables entries in the final LaTeX + # document when dealing with longtable environments; GH #34360 elements = [ + "\\midrule", + "\\endfirsthead", + f"\\caption[]{{{self.caption}}} \\\\" if self.caption else "", + self.top_separator, + self.header, "\\midrule", "\\endhead", "\\midrule", diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index a93ab6f9cc7aa..8df8796d236a5 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ 
b/pandas/tests/io/formats/test_to_latex.py @@ -411,6 +411,11 @@ def test_to_latex_longtable(self): df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) withindex_result = df.to_latex(longtable=True) withindex_expected = r"""\begin{longtable}{lrl} +\toprule +{} & a & b \\ +\midrule +\endfirsthead + \toprule {} & a & b \\ \midrule @@ -430,6 +435,11 @@ def test_to_latex_longtable(self): withoutindex_result = df.to_latex(index=False, longtable=True) withoutindex_expected = r"""\begin{longtable}{rl} +\toprule + a & b \\ +\midrule +\endfirsthead + \toprule a & b \\ \midrule @@ -525,6 +535,9 @@ def test_to_latex_longtable_caption_label(self): df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + # test when no caption and no label is provided + # is performed by test_to_latex_longtable() + # test when only the caption is provided result_c = df.to_latex(longtable=True, caption=the_caption) @@ -533,6 +546,11 @@ def test_to_latex_longtable_caption_label(self): \toprule {} & a & b \\ \midrule +\endfirsthead +\caption[]{a table in a \texttt{longtable} environment} \\ +\toprule +{} & a & b \\ +\midrule \endhead \midrule \multicolumn{3}{r}{{Continued on next page}} \\ @@ -552,6 +570,11 @@ def test_to_latex_longtable_caption_label(self): expected_l = r"""\begin{longtable}{lrl} \label{tab:longtable}\\ +\toprule +{} & a & b \\ +\midrule +\endfirsthead + \toprule {} & a & b \\ \midrule @@ -578,6 +601,11 @@ def test_to_latex_longtable_caption_label(self): \toprule {} & a & b \\ \midrule +\endfirsthead +\caption[]{a table in a \texttt{longtable} environment} \\ +\toprule +{} & a & b \\ +\midrule \endhead \midrule \multicolumn{3}{r}{{Continued on next page}} \\ @@ -623,6 +651,11 @@ def test_to_latex_longtable_position(self): result_p = df.to_latex(longtable=True, position=the_position) expected_p = r"""\begin{longtable}[t]{lrl} +\toprule +{} & a & b \\ +\midrule +\endfirsthead + \toprule {} & a & b \\ \midrule From 3e30b9e0a03c2f4303ad5c200aa07fb22619746a Mon Sep 17 00:00:00 2001 From: Maxim 
Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Sun, 20 Sep 2020 05:05:37 +0700 Subject: [PATCH 0853/1025] REF: pandas/io/formats/format.py (#36434) --- pandas/io/formats/format.py | 623 +++++++++++++++++++++--------------- pandas/io/formats/html.py | 36 +-- 2 files changed, 376 insertions(+), 283 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 4a36dd7bc6de4..75e5eedfc148d 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -10,6 +10,7 @@ from functools import partial from io import StringIO import math +from operator import itemgetter import re from shutil import get_terminal_size from typing import ( @@ -67,6 +68,7 @@ from pandas.core.indexes.api import Index, MultiIndex, PeriodIndex, ensure_index from pandas.core.indexes.datetimes import DatetimeIndex from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.core.reshape.concat import concat from pandas.io.common import stringify_path from pandas.io.formats.printing import adjoin, justify, pprint_thing @@ -261,17 +263,15 @@ def __init__( self._chk_truncate() def _chk_truncate(self) -> None: - from pandas.core.reshape.concat import concat - self.tr_row_num: Optional[int] min_rows = self.min_rows max_rows = self.max_rows # truncation determined by max_rows, actual truncated number of rows # used below by min_rows - truncate_v = max_rows and (len(self.series) > max_rows) + is_truncated_vertically = max_rows and (len(self.series) > max_rows) series = self.series - if truncate_v: + if is_truncated_vertically: max_rows = cast(int, max_rows) if min_rows: # if min_rows is set (not None or 0), set max_rows to minimum @@ -287,7 +287,7 @@ def _chk_truncate(self) -> None: else: self.tr_row_num = None self.tr_series = series - self.truncate_v = truncate_v + self.is_truncated_vertically = is_truncated_vertically def _get_footer(self) -> str: name = self.series.name @@ -306,7 +306,9 @@ def _get_footer(self) -> str: series_name = 
pprint_thing(name, escape_chars=("\t", "\r", "\n")) footer += f"Name: {series_name}" - if self.length is True or (self.length == "truncate" and self.truncate_v): + if self.length is True or ( + self.length == "truncate" and self.is_truncated_vertically + ): if footer: footer += ", " footer += f"Length: {len(self.series)}" @@ -358,7 +360,7 @@ def to_string(self) -> str: fmt_index, have_header = self._get_formatted_index() fmt_values = self._get_formatted_values() - if self.truncate_v: + if self.is_truncated_vertically: n_header_rows = 0 row_num = self.tr_row_num row_num = cast(int, row_num) @@ -451,9 +453,13 @@ def get_adjustment() -> TextAdjustment: class TableFormatter: show_dimensions: Union[bool, str] - is_truncated: bool formatters: FormattersType columns: Index + _is_truncated: bool + + @property + def is_truncated(self) -> bool: + return self._is_truncated @property def should_show_dimensions(self) -> bool: @@ -537,8 +543,6 @@ class DataFrameFormatter(TableFormatter): __doc__ = __doc__ if __doc__ else "" __doc__ += common_docstring + return_docstring - col_space: ColspaceType - def __init__( self, frame: "DataFrame", @@ -565,315 +569,409 @@ def __init__( ): self.frame = frame self.show_index_names = index_names + self.sparsify = self._initialize_sparsify(sparsify) + self.float_format = float_format + self.formatters = self._initialize_formatters(formatters) + self.na_rep = na_rep + self.decimal = decimal + self.col_space = self._initialize_colspace(col_space) + self.header = header + self.index = index + self.line_width = line_width + self.max_rows = max_rows + self.min_rows = min_rows + self.max_cols = max_cols + self.show_dimensions = show_dimensions + self.table_id = table_id + self.render_links = render_links + self.justify = self._initialize_justify(justify) + self.bold_rows = bold_rows + self.escape = escape + self.columns = self._initialize_columns(columns) - if sparsify is None: - sparsify = get_option("display.multi_sparse") + self.max_cols_fitted = 
self._calc_max_cols_fitted() + self.max_rows_fitted = self._calc_max_rows_fitted() - self.sparsify = sparsify + self._truncate() + self.adj = get_adjustment() - self.float_format = float_format + def _initialize_sparsify(self, sparsify: Optional[bool]) -> bool: + if sparsify is None: + return get_option("display.multi_sparse") + return sparsify + + def _initialize_formatters( + self, formatters: Optional[FormattersType] + ) -> FormattersType: if formatters is None: - self.formatters = {} - elif len(frame.columns) == len(formatters) or isinstance(formatters, dict): - self.formatters = formatters + return {} + elif len(self.frame.columns) == len(formatters) or isinstance(formatters, dict): + return formatters else: raise ValueError( f"Formatters length({len(formatters)}) should match " - f"DataFrame number of columns({len(frame.columns)})" + f"DataFrame number of columns({len(self.frame.columns)})" ) - self.na_rep = na_rep - self.decimal = decimal + + def _initialize_justify(self, justify: Optional[str]) -> str: + if justify is None: + return get_option("display.colheader_justify") + else: + return justify + + def _initialize_columns(self, columns: Optional[Sequence[str]]) -> Index: + if columns is not None: + cols = ensure_index(columns) + self.frame = self.frame[cols] + return cols + else: + return self.frame.columns + + def _initialize_colspace( + self, col_space: Optional[ColspaceArgType] + ) -> ColspaceType: + result: ColspaceType + if col_space is None: - self.col_space = {} + result = {} elif isinstance(col_space, (int, str)): - self.col_space = {"": col_space} - self.col_space.update({column: col_space for column in self.frame.columns}) + result = {"": col_space} + result.update({column: col_space for column in self.frame.columns}) elif isinstance(col_space, Mapping): for column in col_space.keys(): if column not in self.frame.columns and column != "": raise ValueError( f"Col_space is defined for an unknown column: {column}" ) - self.col_space = col_space + 
result = col_space else: - if len(frame.columns) != len(col_space): + if len(self.frame.columns) != len(col_space): raise ValueError( f"Col_space length({len(col_space)}) should match " - f"DataFrame number of columns({len(frame.columns)})" + f"DataFrame number of columns({len(self.frame.columns)})" ) - self.col_space = dict(zip(self.frame.columns, col_space)) + result = dict(zip(self.frame.columns, col_space)) + return result - self.header = header - self.index = index - self.line_width = line_width - self.max_rows = max_rows - self.min_rows = min_rows - self.max_cols = max_cols - self.max_rows_displayed = min(max_rows or len(self.frame), len(self.frame)) - self.show_dimensions = show_dimensions - self.table_id = table_id - self.render_links = render_links + @property + def max_rows_displayed(self) -> int: + return min(self.max_rows or len(self.frame), len(self.frame)) - if justify is None: - self.justify = get_option("display.colheader_justify") + def _calc_max_cols_fitted(self) -> Optional[int]: + """Number of columns fitting the screen.""" + if not self._is_in_terminal(): + return self.max_cols + + width, _ = get_terminal_size() + if self._is_screen_narrow(width): + return width else: - self.justify = justify + return self.max_cols - self.bold_rows = bold_rows - self.escape = escape + def _calc_max_rows_fitted(self) -> Optional[int]: + """Number of rows with data fitting the screen.""" + if not self._is_in_terminal(): + return self.max_rows - if columns is not None: - self.columns = ensure_index(columns) - self.frame = self.frame[self.columns] + _, height = get_terminal_size() + if self.max_rows == 0: + # rows available to fill with actual data + return height - self._get_number_of_auxillary_rows() + + max_rows: Optional[int] + if self._is_screen_short(height): + max_rows = height else: - self.columns = frame.columns + max_rows = self.max_rows - self._chk_truncate() - self.adj = get_adjustment() + if max_rows: + if (len(self.frame) > max_rows) and 
self.min_rows: + # if truncated, set max_rows showed to min_rows + max_rows = min(self.min_rows, max_rows) + return max_rows - def _chk_truncate(self) -> None: + def _is_in_terminal(self) -> bool: + """Check if the output is to be shown in terminal.""" + return bool(self.max_cols == 0 or self.max_rows == 0) + + def _is_screen_narrow(self, max_width) -> bool: + return bool(self.max_cols == 0 and len(self.frame.columns) > max_width) + + def _is_screen_short(self, max_height) -> bool: + return bool(self.max_rows == 0 and len(self.frame) > max_height) + + def _get_number_of_auxillary_rows(self) -> int: + """Get number of rows occupied by prompt, dots and dimension info.""" + dot_row = 1 + prompt_row = 1 + num_rows = dot_row + prompt_row + + if self.show_dimensions: + num_rows += len(self._dimensions_info.splitlines()) + + if self.header: + num_rows += 1 + + return num_rows + + @property + def is_truncated_horizontally(self) -> bool: + return bool(self.max_cols_fitted and (len(self.columns) > self.max_cols_fitted)) + + @property + def is_truncated_vertically(self) -> bool: + return bool(self.max_rows_fitted and (len(self.frame) > self.max_rows_fitted)) + + @property + def is_truncated(self) -> bool: + return bool(self.is_truncated_horizontally or self.is_truncated_vertically) + + def _truncate(self) -> None: """ - Checks whether the frame should be truncated. If so, slices - the frame up. + Check whether the frame should be truncated. If so, slice the frame up. 
""" - from pandas.core.reshape.concat import concat + self.tr_frame = self.frame.copy() - # Cut the data to the information actually printed - max_cols = self.max_cols - max_rows = self.max_rows - self.max_rows_adj: Optional[int] - max_rows_adj: Optional[int] - - if max_cols == 0 or max_rows == 0: # assume we are in the terminal - (w, h) = get_terminal_size() - self.w = w - self.h = h - if self.max_rows == 0: - dot_row = 1 - prompt_row = 1 - if self.show_dimensions: - show_dimension_rows = 3 - # assume we only get here if self.header is boolean. - # i.e. not to_latex() where self.header may be List[str] - self.header = cast(bool, self.header) - n_add_rows = self.header + dot_row + show_dimension_rows + prompt_row - # rows available to fill with actual data - max_rows_adj = self.h - n_add_rows - self.max_rows_adj = max_rows_adj - - # Format only rows and columns that could potentially fit the - # screen - if max_cols == 0 and len(self.frame.columns) > w: - max_cols = w - if max_rows == 0 and len(self.frame) > h: - max_rows = h - - if not hasattr(self, "max_rows_adj"): - if max_rows: - if (len(self.frame) > max_rows) and self.min_rows: - # if truncated, set max_rows showed to min_rows - max_rows = min(self.min_rows, max_rows) - self.max_rows_adj = max_rows - if not hasattr(self, "max_cols_adj"): - self.max_cols_adj = max_cols - - max_cols_adj = self.max_cols_adj - max_rows_adj = self.max_rows_adj - - truncate_h = max_cols_adj and (len(self.columns) > max_cols_adj) - truncate_v = max_rows_adj and (len(self.frame) > max_rows_adj) - - frame = self.frame - if truncate_h: - # cast here since if truncate_h is True, max_cols_adj is not None - max_cols_adj = cast(int, max_cols_adj) - if max_cols_adj == 0: - col_num = len(frame.columns) - elif max_cols_adj == 1: - max_cols = cast(int, max_cols) - frame = frame.iloc[:, :max_cols] - col_num = max_cols - else: - col_num = max_cols_adj // 2 - frame = concat( - (frame.iloc[:, :col_num], frame.iloc[:, -col_num:]), axis=1 - ) - # 
truncate formatter - if isinstance(self.formatters, (list, tuple)): - truncate_fmt = self.formatters - self.formatters = [ - *truncate_fmt[:col_num], - *truncate_fmt[-col_num:], - ] - self.tr_col_num = col_num - if truncate_v: - # cast here since if truncate_v is True, max_rows_adj is not None - max_rows_adj = cast(int, max_rows_adj) - if max_rows_adj == 1: - row_num = max_rows - frame = frame.iloc[:max_rows, :] - else: - row_num = max_rows_adj // 2 - frame = concat((frame.iloc[:row_num, :], frame.iloc[-row_num:, :])) - self.tr_row_num = row_num - else: - self.tr_row_num = None + if self.is_truncated_horizontally: + self._truncate_horizontally() - self.tr_frame = frame - self.truncate_h = truncate_h - self.truncate_v = truncate_v - self.is_truncated = bool(self.truncate_h or self.truncate_v) + if self.is_truncated_vertically: + self._truncate_vertically() - def _to_str_columns(self) -> List[List[str]]: - """ - Render a DataFrame to a list of columns (as lists of strings). + def _truncate_horizontally(self) -> None: + """Remove columns, which are not to be displayed and adjust formatters. 
+ + Attributes affected: + - tr_frame + - formatters + - tr_col_num """ - # this method is not used by to_html where self.col_space - # could be a string so safe to cast - col_space = {k: cast(int, v) for k, v in self.col_space.items()} + assert self.max_cols_fitted is not None + col_num = self.max_cols_fitted // 2 + if col_num >= 1: + cols_to_keep = [ + x + for x in range(self.frame.shape[1]) + if x < col_num or x >= len(self.frame.columns) - col_num + ] + self.tr_frame = self.tr_frame.iloc[:, cols_to_keep] - frame = self.tr_frame - # may include levels names also + # truncate formatter + if isinstance(self.formatters, (list, tuple)): + slicer = itemgetter(*cols_to_keep) + self.formatters = slicer(self.formatters) + else: + col_num = cast(int, self.max_cols) + self.tr_frame = self.tr_frame.iloc[:, :col_num] + self.tr_col_num = col_num - str_index = self._get_formatted_index(frame) + def _truncate_vertically(self) -> None: + """Remove rows, which are not to be displayed. + + Attributes affected: + - tr_frame + - tr_row_num + """ + assert self.max_rows_fitted is not None + row_num = self.max_rows_fitted // 2 + if row_num >= 1: + rows_to_keep = [ + x + for x in range(self.frame.shape[0]) + if x < row_num or x >= len(self.frame) - row_num + ] + self.tr_frame = self.tr_frame.iloc[rows_to_keep, :] + else: + row_num = cast(int, self.max_rows) + self.tr_frame = self.tr_frame.iloc[:row_num, :] + self.tr_row_num = row_num + + def _get_strcols_without_index(self) -> List[List[str]]: + strcols: List[List[str]] = [] if not is_list_like(self.header) and not self.header: - stringified = [] - for i, c in enumerate(frame): + for i, c in enumerate(self.tr_frame): fmt_values = self._format_col(i) fmt_values = _make_fixed_width( - fmt_values, self.justify, minimum=col_space.get(c, 0), adj=self.adj + strings=fmt_values, + justify=self.justify, + minimum=int(self.col_space.get(c, 0)), + adj=self.adj, + ) + strcols.append(fmt_values) + return strcols + + if is_list_like(self.header): + 
# cast here since can't be bool if is_list_like + self.header = cast(List[str], self.header) + if len(self.header) != len(self.columns): + raise ValueError( + f"Writing {len(self.columns)} cols " + f"but got {len(self.header)} aliases" ) - stringified.append(fmt_values) + str_columns = [[label] for label in self.header] else: - if is_list_like(self.header): - # cast here since can't be bool if is_list_like - self.header = cast(List[str], self.header) - if len(self.header) != len(self.columns): - raise ValueError( - f"Writing {len(self.columns)} cols " - f"but got {len(self.header)} aliases" - ) - str_columns = [[label] for label in self.header] - else: - str_columns = self._get_formatted_column_labels(frame) + str_columns = self._get_formatted_column_labels(self.tr_frame) - if self.show_row_idx_names: - for x in str_columns: - x.append("") + if self.show_row_idx_names: + for x in str_columns: + x.append("") - stringified = [] - for i, c in enumerate(frame): - cheader = str_columns[i] - header_colwidth = max( - col_space.get(c, 0), *(self.adj.len(x) for x in cheader) - ) - fmt_values = self._format_col(i) - fmt_values = _make_fixed_width( - fmt_values, self.justify, minimum=header_colwidth, adj=self.adj - ) + for i, c in enumerate(self.tr_frame): + cheader = str_columns[i] + header_colwidth = max( + int(self.col_space.get(c, 0)), *(self.adj.len(x) for x in cheader) + ) + fmt_values = self._format_col(i) + fmt_values = _make_fixed_width( + fmt_values, self.justify, minimum=header_colwidth, adj=self.adj + ) + + max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth) + cheader = self.adj.justify(cheader, max_len, mode=self.justify) + strcols.append(cheader + fmt_values) + + return strcols - max_len = max(max(self.adj.len(x) for x in fmt_values), header_colwidth) - cheader = self.adj.justify(cheader, max_len, mode=self.justify) - stringified.append(cheader + fmt_values) + def _get_strcols(self) -> List[List[str]]: + strcols = 
self._get_strcols_without_index() - strcols = stringified + str_index = self._get_formatted_index(self.tr_frame) if self.index: strcols.insert(0, str_index) - # Add ... to signal truncated - truncate_h = self.truncate_h - truncate_v = self.truncate_v + return strcols - if truncate_h: - col_num = self.tr_col_num - strcols.insert(self.tr_col_num + 1, [" ..."] * (len(str_index))) - if truncate_v: - n_header_rows = len(str_index) - len(frame) - row_num = self.tr_row_num - # cast here since if truncate_v is True, self.tr_row_num is not None - row_num = cast(int, row_num) - for ix, col in enumerate(strcols): - # infer from above row - cwidth = self.adj.len(strcols[ix][row_num]) + def _to_str_columns(self) -> List[List[str]]: + """ + Render a DataFrame to a list of columns (as lists of strings). + """ + strcols = self._get_strcols() + + if self.is_truncated: + strcols = self._insert_dot_separators(strcols) + + return strcols + + def _insert_dot_separators(self, strcols: List[List[str]]) -> List[List[str]]: + str_index = self._get_formatted_index(self.tr_frame) + index_length = len(str_index) + + if self.is_truncated_horizontally: + strcols = self._insert_dot_separator_horizontal(strcols, index_length) + + if self.is_truncated_vertically: + strcols = self._insert_dot_separator_vertical(strcols, index_length) + + return strcols + + def _insert_dot_separator_horizontal( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + strcols.insert(self.tr_col_num + 1, [" ..."] * index_length) + return strcols + + def _insert_dot_separator_vertical( + self, strcols: List[List[str]], index_length: int + ) -> List[List[str]]: + n_header_rows = index_length - len(self.tr_frame) + row_num = self.tr_row_num + for ix, col in enumerate(strcols): + cwidth = self.adj.len(col[row_num]) + + if self.is_truncated_horizontally: + is_dot_col = ix == self.tr_col_num + 1 + else: is_dot_col = False - if truncate_h: - is_dot_col = ix == col_num + 1 - if cwidth > 3 or is_dot_col: - 
my_str = "..." - else: - my_str = ".." - if ix == 0: - dot_mode = "left" - elif is_dot_col: - cwidth = 4 - dot_mode = "right" - else: - dot_mode = "right" - dot_str = self.adj.justify([my_str], cwidth, mode=dot_mode)[0] - strcols[ix].insert(row_num + n_header_rows, dot_str) + if cwidth > 3 or is_dot_col: + dots = "..." + else: + dots = ".." + + if ix == 0: + dot_mode = "left" + elif is_dot_col: + cwidth = 4 + dot_mode = "right" + else: + dot_mode = "right" + + dot_str = self.adj.justify([dots], cwidth, mode=dot_mode)[0] + col.insert(row_num + n_header_rows, dot_str) return strcols def write_result(self, buf: IO[str]) -> None: """ Render a DataFrame to a console-friendly tabular output. """ - from pandas import Series + text = self._get_string_representation() + + buf.writelines(text) + + if self.should_show_dimensions: + buf.write(self._dimensions_info) - frame = self.frame + @property + def _dimensions_info(self) -> str: + return f"\n\n[{len(self.frame)} rows x {len(self.frame.columns)} columns]" - if len(frame.columns) == 0 or len(frame.index) == 0: + def _get_string_representation(self) -> str: + if self.frame.empty: info_line = ( f"Empty {type(self.frame).__name__}\n" - f"Columns: {pprint_thing(frame.columns)}\n" - f"Index: {pprint_thing(frame.index)}" + f"Columns: {pprint_thing(self.frame.columns)}\n" + f"Index: {pprint_thing(self.frame.index)}" ) - text = info_line - else: + return info_line - strcols = self._to_str_columns() - if self.line_width is None: # no need to wrap around just print - # the whole frame - text = self.adj.adjoin(1, *strcols) - elif ( - not isinstance(self.max_cols, int) or self.max_cols > 0 - ): # need to wrap around - text = self._join_multiline(*strcols) - else: # max_cols == 0. 
Try to fit frame to terminal - lines = self.adj.adjoin(1, *strcols).split("\n") - max_len = Series(lines).str.len().max() - # plus truncate dot col - dif = max_len - self.w - # '+ 1' to avoid too wide repr (GH PR #17023) - adj_dif = dif + 1 - col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) - n_cols = len(col_lens) - counter = 0 - while adj_dif > 0 and n_cols > 1: - counter += 1 - mid = int(round(n_cols / 2.0)) - mid_ix = col_lens.index[mid] - col_len = col_lens[mid_ix] - # adjoin adds one - adj_dif -= col_len + 1 - col_lens = col_lens.drop(mid_ix) - n_cols = len(col_lens) - # subtract index column - max_cols_adj = n_cols - self.index - # GH-21180. Ensure that we print at least two. - max_cols_adj = max(max_cols_adj, 2) - self.max_cols_adj = max_cols_adj - - # Call again _chk_truncate to cut frame appropriately - # and then generate string representation - self._chk_truncate() - strcols = self._to_str_columns() - text = self.adj.adjoin(1, *strcols) - buf.writelines(text) + strcols = self._to_str_columns() - if self.should_show_dimensions: - buf.write(f"\n\n[{len(frame)} rows x {len(frame.columns)} columns]") + if self.line_width is None: + # no need to wrap around just print the whole frame + return self.adj.adjoin(1, *strcols) + + if self.max_cols is None or self.max_cols > 0: + # need to wrap around + return self._join_multiline(*strcols) + + # max_cols == 0. 
Try to fit frame to terminal + return self._fit_strcols_to_terminal_width(strcols) + + def _fit_strcols_to_terminal_width(self, strcols) -> str: + from pandas import Series + + lines = self.adj.adjoin(1, *strcols).split("\n") + max_len = Series(lines).str.len().max() + # plus truncate dot col + width, _ = get_terminal_size() + dif = max_len - width + # '+ 1' to avoid too wide repr (GH PR #17023) + adj_dif = dif + 1 + col_lens = Series([Series(ele).apply(len).max() for ele in strcols]) + n_cols = len(col_lens) + counter = 0 + while adj_dif > 0 and n_cols > 1: + counter += 1 + mid = int(round(n_cols / 2.0)) + mid_ix = col_lens.index[mid] + col_len = col_lens[mid_ix] + # adjoin adds one + adj_dif -= col_len + 1 + col_lens = col_lens.drop(mid_ix) + n_cols = len(col_lens) + + # subtract index column + max_cols_fitted = n_cols - self.index + # GH-21180. Ensure that we print at least two. + max_cols_fitted = max(max_cols_fitted, 2) + self.max_cols_fitted = max_cols_fitted + + # Call again _truncate to cut frame appropriately + # and then generate string representation + self._truncate() + strcols = self._to_str_columns() + return self.adj.adjoin(1, *strcols) def _join_multiline(self, *args) -> str: lwidth = self.line_width @@ -892,26 +990,25 @@ def _join_multiline(self, *args) -> str: col_bins = _binify(col_widths, lwidth) nbins = len(col_bins) - if self.truncate_v: - # cast here since if truncate_v is True, max_rows_adj is not None - self.max_rows_adj = cast(int, self.max_rows_adj) - nrows = self.max_rows_adj + 1 + if self.is_truncated_vertically: + assert self.max_rows_fitted is not None + nrows = self.max_rows_fitted + 1 else: nrows = len(self.frame) str_lst = [] - st = 0 - for i, ed in enumerate(col_bins): - row = strcols[st:ed] + start = 0 + for i, end in enumerate(col_bins): + row = strcols[start:end] if self.index: row.insert(0, idx) if nbins > 1: - if ed <= len(strcols) and i < nbins - 1: + if end <= len(strcols) and i < nbins - 1: row.append([" \\"] + [" "] * 
(nrows - 1)) else: row.append([" "] * nrows) str_lst.append(self.adj.adjoin(adjoin_width, *row)) - st = ed + start = end return "\n\n".join(str_lst) def to_string( diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index c89189f1e679a..c8eb89afdd849 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -85,10 +85,8 @@ def row_levels(self) -> int: def _get_columns_formatted_values(self) -> Iterable: return self.columns - # https://github.com/python/mypy/issues/1237 - # error: Signature of "is_truncated" incompatible with supertype "TableFormatter" @property - def is_truncated(self) -> bool: # type: ignore[override] + def is_truncated(self) -> bool: return self.fmt.is_truncated @property @@ -236,7 +234,7 @@ def _write_table(self, indent: int = 0) -> None: self.write("
    l0
    ", indent) def _write_col_header(self, indent: int) -> None: - truncate_h = self.fmt.truncate_h + is_truncated_horizontally = self.fmt.is_truncated_horizontally if isinstance(self.columns, MultiIndex): template = 'colspan="{span:d}" halign="left"' @@ -249,7 +247,7 @@ def _write_col_header(self, indent: int) -> None: level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 for lnum, (records, values) in enumerate(zip(level_lengths, levels)): - if truncate_h: + if is_truncated_horizontally: # modify the header lines ins_col = self.fmt.tr_col_num if self.fmt.sparsify: @@ -346,16 +344,16 @@ def _write_col_header(self, indent: int) -> None: row.extend(self._get_columns_formatted_values()) align = self.fmt.justify - if truncate_h: + if is_truncated_horizontally: ins_col = self.row_levels + self.fmt.tr_col_num row.insert(ins_col, "...") self.write_tr(row, indent, self.indent_delta, header=True, align=align) def _write_row_header(self, indent: int) -> None: - truncate_h = self.fmt.truncate_h + is_truncated_horizontally = self.fmt.is_truncated_horizontally row = [x if x is not None else "" for x in self.frame.index.names] + [""] * ( - self.ncols + (1 if truncate_h else 0) + self.ncols + (1 if is_truncated_horizontally else 0) ) self.write_tr(row, indent, self.indent_delta, header=True) @@ -390,8 +388,8 @@ def _write_body(self, indent: int) -> None: def _write_regular_rows( self, fmt_values: Mapping[int, List[str]], indent: int ) -> None: - truncate_h = self.fmt.truncate_h - truncate_v = self.fmt.truncate_v + is_truncated_horizontally = self.fmt.is_truncated_horizontally + is_truncated_vertically = self.fmt.is_truncated_vertically nrows = len(self.fmt.tr_frame) @@ -405,7 +403,7 @@ def _write_regular_rows( row: List[str] = [] for i in range(nrows): - if truncate_v and i == (self.fmt.tr_row_num): + if is_truncated_vertically and i == (self.fmt.tr_row_num): str_sep_row = ["..."] * len(row) self.write_tr( str_sep_row, @@ -426,7 +424,7 @@ def 
_write_regular_rows( row.append("") row.extend(fmt_values[j][i] for j in range(self.ncols)) - if truncate_h: + if is_truncated_horizontally: dot_col_ix = self.fmt.tr_col_num + self.row_levels row.insert(dot_col_ix, "...") self.write_tr( @@ -438,8 +436,8 @@ def _write_hierarchical_rows( ) -> None: template = 'rowspan="{span}" valign="top"' - truncate_h = self.fmt.truncate_h - truncate_v = self.fmt.truncate_v + is_truncated_horizontally = self.fmt.is_truncated_horizontally + is_truncated_vertically = self.fmt.is_truncated_vertically frame = self.fmt.tr_frame nrows = len(frame) @@ -454,12 +452,10 @@ def _write_hierarchical_rows( level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 - if truncate_v: + if is_truncated_vertically: # Insert ... row and adjust idx_values and # level_lengths to take this into account. ins_row = self.fmt.tr_row_num - # cast here since if truncate_v is True, self.fmt.tr_row_num is not None - ins_row = cast(int, ins_row) inserted = False for lnum, records in enumerate(level_lengths): rec_new = {} @@ -520,7 +516,7 @@ def _write_hierarchical_rows( row.append(v) row.extend(fmt_values[j][i] for j in range(self.ncols)) - if truncate_h: + if is_truncated_horizontally: row.insert( self.row_levels - sparse_offset + self.fmt.tr_col_num, "..." 
) @@ -534,7 +530,7 @@ def _write_hierarchical_rows( else: row = [] for i in range(len(frame)): - if truncate_v and i == (self.fmt.tr_row_num): + if is_truncated_vertically and i == (self.fmt.tr_row_num): str_sep_row = ["..."] * len(row) self.write_tr( str_sep_row, @@ -550,7 +546,7 @@ def _write_hierarchical_rows( row = [] row.extend(idx_values[i]) row.extend(fmt_values[j][i] for j in range(self.ncols)) - if truncate_h: + if is_truncated_horizontally: row.insert(self.row_levels + self.fmt.tr_col_num, "...") self.write_tr( row, From 0a41156dfaaf75997e44e03e8239b51d593e2cdc Mon Sep 17 00:00:00 2001 From: Yanxian Lin Date: Sat, 19 Sep 2020 15:07:24 -0700 Subject: [PATCH 0854/1025] TST: #31922 assert no segmentation fault with numpy.array.__contains__ (#36283) --- pandas/tests/scalar/test_na_scalar.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 5c4d7e191d1bb..10d366fe485da 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -305,3 +305,11 @@ def test_pickle_roundtrip_containers(as_frame, values, dtype): s = s.to_frame(name="A") result = tm.round_trip_pickle(s) tm.assert_equal(result, s) + + +@pytest.mark.parametrize("array", [np.array(["a"], dtype=object), ["a"]]) +def test_array_contains_na(array): + # GH 31922 + msg = "boolean value of NA is ambiguous" + with pytest.raises(TypeError, match=msg): + NA in array From a4f51e5862deb93ea997896c79a99f6dc21956bd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Sep 2020 15:53:29 -0700 Subject: [PATCH 0855/1025] REF: de-duplicate IntervalArray._validate_foo (#36483) --- pandas/core/arrays/interval.py | 62 +++++++++---------- pandas/tests/arrays/interval/test_interval.py | 4 ++ .../tests/indexes/interval/test_interval.py | 7 ++- 3 files changed, 39 insertions(+), 34 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 
f9f68004bcc23..ebabc7edcbf43 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -20,7 +20,6 @@ is_datetime64_any_dtype, is_float_dtype, is_integer_dtype, - is_interval, is_interval_dtype, is_list_like, is_object_dtype, @@ -813,7 +812,9 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): fill_left = fill_right = fill_value if allow_fill: - fill_left, fill_right = self._validate_fill_value(fill_value) + if (np.asarray(indices) == -1).any(): + # We have excel tests that pass fill_value=True, xref GH#36466 + fill_left, fill_right = self._validate_fill_value(fill_value) left_take = take( self.left, indices, allow_fill=allow_fill, fill_value=fill_left @@ -824,20 +825,33 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): return self._shallow_copy(left_take, right_take) - def _validate_fill_value(self, value): - if is_interval(value): - self._check_closed_matches(value, name="fill_value") - fill_left, fill_right = value.left, value.right - elif not is_scalar(value) and notna(value): - msg = ( - "'IntervalArray.fillna' only supports filling with a " - "'scalar pandas.Interval or NA'. " - f"Got a '{type(value).__name__}' instead." - ) - raise ValueError(msg) + def _validate_listlike(self, value): + # list-like of intervals + try: + array = IntervalArray(value) + # TODO: self._check_closed_matches(array, name="value") + value_left, value_right = array.left, array.right + except TypeError as err: + # wrong type: not interval or NA + msg = f"'value' should be an interval type, got {type(value)} instead." 
+ raise TypeError(msg) from err + return value_left, value_right + + def _validate_scalar(self, value): + if isinstance(value, Interval): + self._check_closed_matches(value, name="value") + left, right = value.left, value.right + elif is_valid_nat_for_dtype(value, self.left.dtype): + # GH#18295 + left = right = value else: - fill_left = fill_right = self.left._na_value - return fill_left, fill_right + raise ValueError( + "can only insert Interval objects and NA into an IntervalArray" + ) + return left, right + + def _validate_fill_value(self, value): + return self._validate_scalar(value) def _validate_fillna_value(self, value): if not isinstance(value, Interval): @@ -851,26 +865,12 @@ def _validate_fillna_value(self, value): return value.left, value.right def _validate_insert_value(self, value): - if isinstance(value, Interval): - if value.closed != self.closed: - raise ValueError( - "inserted item must be closed on the same side as the index" - ) - left_insert = value.left - right_insert = value.right - elif is_valid_nat_for_dtype(value, self.left.dtype): - # GH#18295 - left_insert = right_insert = value - else: - raise ValueError( - "can only insert Interval objects and NA into an IntervalIndex" - ) - return left_insert, right_insert + return self._validate_scalar(value) def _validate_setitem_value(self, value): needs_float_conversion = False - if is_scalar(value) and isna(value): + if is_valid_nat_for_dtype(value, self.left.dtype): # na value: need special casing to set directly on numpy arrays if is_integer_dtype(self.dtype.subtype): # can't set NaN on a numpy integer array diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py index 0176755b54dd1..e5ccb51ce36f5 100644 --- a/pandas/tests/arrays/interval/test_interval.py +++ b/pandas/tests/arrays/interval/test_interval.py @@ -105,6 +105,10 @@ def test_set_na(self, left_right_dtypes): left, right = left_right_dtypes result = IntervalArray.from_arrays(left, 
right) + if result.dtype.subtype.kind not in ["m", "M"]: + msg = "'value' should be an interval type, got <.*NaTType'> instead." + with pytest.raises(TypeError, match=msg): + result[0] = pd.NaT if result.dtype.subtype.kind in ["i", "u"]: msg = "Cannot set float NaN to integer-backed IntervalArray" with pytest.raises(ValueError, match=msg): diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index 734c98af3d058..b81f0f27e60ad 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -191,13 +191,14 @@ def test_insert(self, data): tm.assert_index_equal(result, expected) # invalid type - msg = "can only insert Interval objects and NA into an IntervalIndex" + msg = "can only insert Interval objects and NA into an IntervalArray" with pytest.raises(ValueError, match=msg): data.insert(1, "foo") # invalid closed - msg = "inserted item must be closed on the same side as the index" + msg = "'value.closed' is 'left', expected 'right'." for closed in {"left", "right", "both", "neither"} - {item.closed}: + msg = f"'value.closed' is '{closed}', expected '{item.closed}'." 
with pytest.raises(ValueError, match=msg): bad_item = Interval(item.left, item.right, closed=closed) data.insert(1, bad_item) @@ -211,7 +212,7 @@ def test_insert(self, data): if data.left.dtype.kind not in ["m", "M"]: # trying to insert pd.NaT into a numeric-dtyped Index should cast/raise - msg = "can only insert Interval objects and NA into an IntervalIndex" + msg = "can only insert Interval objects and NA into an IntervalArray" with pytest.raises(ValueError, match=msg): result = data.insert(1, pd.NaT) else: From 432df3775000d6a827304f4434af513063056658 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 19 Sep 2020 19:13:12 -0700 Subject: [PATCH 0856/1025] TYP: core.missing; PERF for needs_i8_conversion (#36485) --- pandas/core/arrays/datetimelike.py | 14 ++------------ pandas/core/dtypes/common.py | 4 ++++ pandas/core/missing.py | 31 +++++++++++++++--------------- 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 026aad5ad6eb7..45cabe8f0b498 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1005,19 +1005,9 @@ def fillna(self, value=None, method=None, limit=None): else: func = missing.backfill_1d - values = self._ndarray - if not is_period_dtype(self.dtype): - # For PeriodArray self._ndarray is i8, which gets copied - # by `func`. Otherwise we need to make a copy manually - # to avoid modifying `self` in-place. 
- values = values.copy() - + values = self.copy() new_values = func(values, limit=limit, mask=mask) - if is_datetime64tz_dtype(self.dtype): - # we need to pass int64 values to the constructor to avoid - # re-localizing incorrectly - new_values = new_values.view("i8") - new_values = type(self)(new_values, dtype=self.dtype) + new_values = self._from_backing_data(new_values) else: # fill with value new_values = self.copy() diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 5987fdabf78bb..acbdbfd7707e3 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1215,6 +1215,10 @@ def needs_i8_conversion(arr_or_dtype) -> bool: """ if arr_or_dtype is None: return False + if isinstance(arr_or_dtype, (np.dtype, ExtensionDtype)): + # fastpath + dtype = arr_or_dtype + return dtype.kind in ["m", "M"] or dtype.type is Period return ( is_datetime_or_timedelta_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index be66b19d10064..9b96c8f01153b 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -7,17 +7,15 @@ import numpy as np from pandas._libs import algos, lib +from pandas._typing import DtypeObj from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import infer_dtype_from_array from pandas.core.dtypes.common import ( ensure_float64, - is_datetime64_dtype, - is_datetime64tz_dtype, is_integer_dtype, is_numeric_v_string_like, is_scalar, - is_timedelta64_dtype, needs_i8_conversion, ) from pandas.core.dtypes.missing import isna @@ -72,7 +70,7 @@ def mask_missing(arr, values_to_mask): return mask -def clean_fill_method(method, allow_nearest=False): +def clean_fill_method(method, allow_nearest: bool = False): # asfreq is compat for resampling if method in [None, "asfreq"]: return None @@ -543,7 +541,12 @@ def _cubicspline_interpolate(xi, yi, x, axis=0, bc_type="not-a-knot", extrapolat def interpolate_2d( - 
values, method="pad", axis=0, limit=None, fill_value=None, dtype=None + values, + method="pad", + axis=0, + limit=None, + fill_value=None, + dtype: Optional[DtypeObj] = None, ): """ Perform an actual interpolation of values, values will be make 2-d if @@ -584,18 +587,14 @@ def interpolate_2d( return values -def _cast_values_for_fillna(values, dtype): +def _cast_values_for_fillna(values, dtype: DtypeObj): """ Cast values to a dtype that algos.pad and algos.backfill can handle. """ # TODO: for int-dtypes we make a copy, but for everything else this # alters the values in-place. Is this intentional? - if ( - is_datetime64_dtype(dtype) - or is_datetime64tz_dtype(dtype) - or is_timedelta64_dtype(dtype) - ): + if needs_i8_conversion(dtype): values = values.view(np.int64) elif is_integer_dtype(values): @@ -605,7 +604,7 @@ def _cast_values_for_fillna(values, dtype): return values -def _fillna_prep(values, mask=None, dtype=None): +def _fillna_prep(values, mask=None, dtype: Optional[DtypeObj] = None): # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d if dtype is None: dtype = values.dtype @@ -620,19 +619,19 @@ def _fillna_prep(values, mask=None, dtype=None): return values, mask -def pad_1d(values, limit=None, mask=None, dtype=None): +def pad_1d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): values, mask = _fillna_prep(values, mask, dtype) algos.pad_inplace(values, mask, limit=limit) return values -def backfill_1d(values, limit=None, mask=None, dtype=None): +def backfill_1d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): values, mask = _fillna_prep(values, mask, dtype) algos.backfill_inplace(values, mask, limit=limit) return values -def pad_2d(values, limit=None, mask=None, dtype=None): +def pad_2d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): values, mask = _fillna_prep(values, mask, dtype) if np.all(values.shape): @@ -643,7 +642,7 @@ def pad_2d(values, limit=None, mask=None, dtype=None): return values -def 
backfill_2d(values, limit=None, mask=None, dtype=None): +def backfill_2d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): values, mask = _fillna_prep(values, mask, dtype) if np.all(values.shape): From efed1de595cc0ee3c0a8f755a1158704517b20a0 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 19 Sep 2020 21:13:54 -0500 Subject: [PATCH 0857/1025] Don't unlabel stale PR on update (#36487) --- .github/workflows/stale-pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index e3b8d9336a5a6..e77bf2b81fc86 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -17,5 +17,5 @@ jobs: exempt-pr-labels: "Needs Review,Blocked,Needs Discussion" days-before-stale: 30 days-before-close: -1 - remove-stale-when-updated: true + remove-stale-when-updated: false debug-only: false From 14453b35e7e21241fa0cc4e64846ec8557b8f8f2 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 21 Sep 2020 01:47:18 -0500 Subject: [PATCH 0858/1025] BUG: Fix astype from float32 to string (#36464) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/_libs/lib.pyx | 3 ++- pandas/core/arrays/string_.py | 3 +-- pandas/tests/arrays/string_/test_string.py | 9 +++++++++ pandas/tests/series/methods/test_astype.py | 9 +++++++++ 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 7d658215d7b76..72937141c2870 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -47,6 +47,7 @@ Bug fixes - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) - Bug in :meth:`DataFrame.sort_values` raising an ``AttributeError`` when sorting on a key that casts column to categorical dtype (:issue:`36383`) - Bug in 
:meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) +- Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`) - Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index a57cf3b523985..61a9634b00211 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -659,11 +659,12 @@ cpdef ndarray[object] ensure_string_array( Py_ssize_t i = 0, n = len(arr) result = np.asarray(arr, dtype="object") + if copy and result is arr: result = result.copy() for i in range(n): - val = result[i] + val = arr[i] if isinstance(val, str): continue diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index cef35f2b1137c..cb1144c18e49c 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -198,10 +198,9 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" - result = np.asarray(scalars, dtype="object") # convert non-na-likes to str, and nan-likes to StringDtype.na_value result = lib.ensure_string_array( - result, na_value=StringDtype.na_value, copy=copy + scalars, na_value=StringDtype.na_value, copy=copy ) # Manually creating new array avoids the validation step in the __init__, so is diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index efd5d29ae0717..56a8e21edd004 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -336,3 +336,12 @@ def test_memory_usage(): series = pd.Series(["a", "b", "c"], dtype="string") assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True) + + 
+@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +def test_astype_from_float_dtype(dtype): + # https://github.com/pandas-dev/pandas/issues/36451 + s = pd.Series([0.1], dtype=dtype) + result = s.astype("string") + expected = pd.Series(["0.1"], dtype="string") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index b9d90a9fc63dd..7449d8d65ef96 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from pandas import Interval, Series, Timestamp, date_range @@ -46,3 +47,11 @@ def test_astype_ignores_errors_for_extension_dtypes(self, values, errors): msg = "(Cannot cast)|(could not convert)" with pytest.raises((ValueError, TypeError), match=msg): values.astype(float, errors=errors) + + @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) + def test_astype_from_float_to_str(self, dtype): + # https://github.com/pandas-dev/pandas/issues/36451 + s = Series([0.1], dtype=dtype) + result = s.astype(str) + expected = Series(["0.1"]) + tm.assert_series_equal(result, expected) From 74825b4da07dff0eba23992bdc934b09f0c7db82 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 21 Sep 2020 05:28:23 -0700 Subject: [PATCH 0859/1025] CI: troubleshoot segfault (#36511) --- pandas/tests/scalar/test_na_scalar.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py index 10d366fe485da..5c4d7e191d1bb 100644 --- a/pandas/tests/scalar/test_na_scalar.py +++ b/pandas/tests/scalar/test_na_scalar.py @@ -305,11 +305,3 @@ def test_pickle_roundtrip_containers(as_frame, values, dtype): s = s.to_frame(name="A") result = tm.round_trip_pickle(s) tm.assert_equal(result, s) - - -@pytest.mark.parametrize("array", [np.array(["a"], dtype=object), ["a"]]) -def test_array_contains_na(array): - # 
GH 31922 - msg = "boolean value of NA is ambiguous" - with pytest.raises(TypeError, match=msg): - NA in array From b1e56314ad33d893f0258c94484aca25465b47f5 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 21 Sep 2020 17:25:38 +0100 Subject: [PATCH 0860/1025] TST: remove xfails with strict=False (#36524) --- pandas/tests/io/parser/test_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 6bbc9bc9e1788..08eab69900400 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1138,7 +1138,6 @@ def test_parse_integers_above_fp_precision(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_chunks_have_consistent_numerical_type(all_parsers): parser = all_parsers integers = [str(i) for i in range(499999)] @@ -1152,7 +1151,6 @@ def test_chunks_have_consistent_numerical_type(all_parsers): assert result.a.dtype == float -@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False) def test_warn_if_chunks_have_mismatched_type(all_parsers): warning_type = None parser = all_parsers From 60e523e12e661cf816c5345ce6c24a538e2c1abf Mon Sep 17 00:00:00 2001 From: Sharon Woo Date: Tue, 22 Sep 2020 05:11:46 +0800 Subject: [PATCH 0861/1025] Made change to Usage statement in issue 36494 (#36496) --- scripts/generate_pip_deps_from_conda.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index b0a06416ce443..c417f58f6bf1b 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -6,11 +6,11 @@ Usage: Generate `requirements-dev.txt` - $ ./conda_to_pip + $ python scripts/generate_pip_deps_from_conda.py Compare and fail (exit status != 0) if `requirements-dev.txt` has not been generated with this script: - $ ./conda_to_pip 
--compare + $ python scripts/generate_pip_deps_from_conda.py --compare """ import argparse import os From f20ff42f23bb00a93172b87c5ec76d9679194484 Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Mon, 21 Sep 2020 23:24:08 +0200 Subject: [PATCH 0862/1025] remove not existing argument (#36533) --- pandas/core/algorithms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 50d1810fee30d..edacacd3e26bd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -278,7 +278,6 @@ def _check_object_for_strings(values) -> str: Parameters ---------- values : ndarray - ndtype : str Returns ------- From 99938a2d746ab88c268bc787d3bef11f4b08e7e3 Mon Sep 17 00:00:00 2001 From: Irv Lustig Date: Mon, 21 Sep 2020 17:42:48 -0400 Subject: [PATCH 0863/1025] BUG: Fix issue in preserving index name on empty DataFrame (#36532) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/frame.py | 3 ++- pandas/tests/indexing/test_partial.py | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 72937141c2870..e3a96c69918db 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -35,6 +35,7 @@ Fixed regressions - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`,:issue:`35802`) - Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`,:issue:`36377`) +- Fixed regression when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 36dfe43bfd708..69b12bcff967f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3190,7 +3190,8 @@ def _ensure_valid_index(self, value): # GH31368 preserve name of index index_copy = value.index.copy() - index_copy.name = self.index.name + if self.index.name is not None: + index_copy.name = self.index.name self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 7afbbc2b9ab2b..72bc13e67c040 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -672,3 +672,11 @@ def test_index_name_empty(self): ) tm.assert_frame_equal(df, expected) + + # GH 36527 + df = pd.DataFrame() + series = pd.Series(1.23, index=pd.RangeIndex(4, name="series_index")) + df["series"] = series + expected = pd.DataFrame( + {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index") + ) From 86fe456350d2a08f5141f798581adf6d8a6a0246 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 21 Sep 2020 22:47:14 +0100 Subject: [PATCH 0864/1025] ENH: Optimize nrows in read_excel (#35974) --- asv_bench/benchmarks/io/excel.py | 6 +++++- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/io/excel/_base.py | 30 +++++++++++++++++++++++---- pandas/io/excel/_odfreader.py | 13 ++++++++++-- pandas/io/excel/_openpyxl.py | 9 +++++++- pandas/io/excel/_pyxlsb.py | 11 ++++++++-- pandas/io/excel/_xlrd.py | 21 +++++++++++++++---- pandas/tests/io/excel/test_readers.py | 16 ++++++++++++++ 8 files changed, 93 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 80af2cff41769..1eaccb9f2d897 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 2000 + N = 20000 C = 5 df = 
DataFrame( np.random.randn(N, C), @@ -69,5 +69,9 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) + def time_read_excel_nrows(self, engine): + fname = self.fname_odf if engine == "odf" else self.fname_excel + read_excel(fname, engine=engine, nrows=1) + from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 18940b574b517..19a563be0a568 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -224,6 +224,7 @@ Performance improvements - Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) +- Performance improvement in `read_excel` for when ``nrows`` is much smaller than the length of the file (:issue:`33281`). 
- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 604b7e12ec243..667f37f47e188 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,12 +3,12 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Any, Mapping, Union +from typing import Any, List, Mapping, Optional, Union from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES -from pandas._typing import StorageOptions +from pandas._typing import Scalar, StorageOptions from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -398,7 +398,14 @@ def get_sheet_by_index(self, index): pass @abc.abstractmethod - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: pass def parse( @@ -454,7 +461,22 @@ def parse( else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) - data = self.get_sheet_data(sheet, convert_float) + if isinstance(header, int): + header_nrows = header + elif header is None: + header_nrows = 0 + else: + header_nrows = max(header) + if isinstance(skiprows, int): + skiprows_nrows = skiprows + elif skiprows is None: + skiprows_nrows = 0 + else: + skiprows_nrows = len(skiprows) + + data = self.get_sheet_data( + sheet, convert_float, header_nrows, skiprows_nrows, nrows + ) usecols = maybe_convert_usecols(usecols) if not data: diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 
4f9f8a29c0010..07d2f9a593b96 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,4 +1,4 @@ -from typing import List, cast +from typing import List, Optional, cast import numpy as np @@ -71,7 +71,14 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists """ @@ -87,6 +94,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: table: List[List[Scalar]] = [] + if isinstance(nrows, int): + sheet_rows = sheet_rows[: header_nrows + skiprows_nrows + nrows + 1] for i, sheet_row in enumerate(sheet_rows): sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index a5cadf4d93389..bc7b168eeaaa2 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -508,7 +508,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: data: List[List[Scalar]] = [] for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index ac94f4dd3df74..cf3dcebdff6eb 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,4 +1,4 @@ -from typing import List +from typing import List, Optional from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency @@ -68,7 
+68,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.v - def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: return [ [self._convert_cell(c, convert_float) for c in r] for r in sheet.rows(sparse=False) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index dfd5dde0329ae..e5d0d66f9570a 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,8 +1,9 @@ from datetime import time +from typing import List, Optional import numpy as np -from pandas._typing import StorageOptions +from pandas._typing import Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import BaseExcelReader @@ -49,7 +50,14 @@ def get_sheet_by_name(self, name): def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) - def get_sheet_data(self, sheet, convert_float): + def get_sheet_data( + self, + sheet, + convert_float: bool, + header_nrows: int, + skiprows_nrows: int, + nrows: Optional[int], + ) -> List[List[Scalar]]: from xlrd import ( XL_CELL_BOOLEAN, XL_CELL_DATE, @@ -98,9 +106,14 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data = [] + data: List[List[Scalar]] = [] - for i in range(sheet.nrows): + sheet_nrows = sheet.nrows + + if isinstance(nrows, int): + sheet_nrows = min(header_nrows + skiprows_nrows + nrows + 1, sheet_nrows) + + for i in range(sheet_nrows): row = [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 4bdcc5b327fa7..4fb1ef8fa0c15 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1195,5 +1195,21 @@ def test_read_datetime_multiindex(self, 
engine, read_ext): ], ) expected = pd.DataFrame([], columns=expected_column_index) + tm.assert_frame_equal(expected, actual) + @pytest.mark.parametrize( + "header, skiprows", [(1, 2), (0, 3), (1, [0, 1]), ([2], 1)] + ) + @td.check_file_leaks + def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows): + # GH 32727 + data = pd.read_excel("test1" + read_ext, engine=engine) + expected = ( + DataFrame(data.iloc[3:6]) + .reset_index(drop=True) + .rename(columns=data.iloc[2].rename(None)) + ) + actual = pd.read_excel( + "test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3 + ) tm.assert_frame_equal(expected, actual) From 0dc4d207d48d8c8d3bf61265252cd4e535b8d23c Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Mon, 21 Sep 2020 17:59:46 -0400 Subject: [PATCH 0865/1025] CLN: Unify Series case in _wrap_applied_output (#36504) --- pandas/core/groupby/generic.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0705261d0c516..b9cc2c19c224b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1190,14 +1190,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): key_index = self.grouper.result_index if self.as_index else None - if isinstance(first_not_none, Series): - # this is to silence a DeprecationWarning - # TODO: Remove when default dtype of empty Series is object - kwargs = first_not_none._construct_axes_dict() - backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs) - - values = [x if (x is not None) else backup for x in values] - if isinstance(first_not_none, (np.ndarray, Index)): # GH#1738: values is list of arrays of unequal lengths # fall through to the outer else clause @@ -1217,8 +1209,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): result = DataFrame(values, index=key_index, 
columns=[self._selection]) self._insert_inaxis_grouper_inplace(result) return result - else: + # this is to silence a DeprecationWarning + # TODO: Remove when default dtype of empty Series is object + kwargs = first_not_none._construct_axes_dict() + backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs) + values = [x if (x is not None) else backup for x in values] + all_indexed_same = all_indexes_same(x.index for x in values) # GH3596 From 0b79b28bbb12e978f572128682195cc2b5188e80 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 21 Sep 2020 16:53:15 -0700 Subject: [PATCH 0866/1025] REF: share fillna (#36488) --- pandas/core/arrays/_mixins.py | 33 +++++++++++++++++++++++ pandas/core/arrays/datetimelike.py | 42 +----------------------------- pandas/core/arrays/numpy_.py | 34 +----------------------- pandas/core/missing.py | 11 +++++--- 4 files changed, 42 insertions(+), 78 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index a947ab64f7380..808d598558c83 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -6,7 +6,11 @@ from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import cache_readonly, doc +from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.inference import is_array_like + +from pandas.core import missing from pandas.core.algorithms import take, unique from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray @@ -194,3 +198,32 @@ def __getitem__(self, key): def _validate_getitem_key(self, key): return check_array_indexer(self, key) + + @doc(ExtensionArray.fillna) + def fillna(self: _T, value=None, method=None, limit=None) -> _T: + value, method = validate_fillna_kwargs(value, method) + + mask = self.isna() + + # TODO: share this with EA base class implementation + if is_array_like(value): + if len(value) != len(self): + 
raise ValueError( + f"Length of 'value' does not match. Got ({len(value)}) " + f" expected {len(self)}" + ) + value = value[mask] + + if mask.any(): + if method is not None: + func = missing.get_fill_func(method) + new_values = func(self._ndarray.copy(), limit=limit, mask=mask) + # TODO: PandasArray didnt used to copy, need tests for this + new_values = self._from_backing_data(new_values) + else: + # fill with value + new_values = self.copy() + new_values[mask] = value + else: + new_values = self.copy() + return new_values diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 45cabe8f0b498..7051507f9a90e 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -28,7 +28,6 @@ from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, NullFrequencyError, PerformanceWarning from pandas.util._decorators import Appender, Substitution, cache_readonly -from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -48,11 +47,9 @@ is_unsigned_integer_dtype, pandas_dtype, ) -from pandas.core.dtypes.generic import ABCSeries -from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna -from pandas.core import missing, nanops, ops +from pandas.core import nanops, ops from pandas.core.algorithms import checked_add_with_arr, unique1d, value_counts from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionOpsMixin @@ -979,43 +976,6 @@ def _maybe_mask_results(self, result, fill_value=iNaT, convert=None): result[self._isnan] = fill_value return result - def fillna(self, value=None, method=None, limit=None): - # TODO(GH-20300): remove this - # Just overriding to ensure that we avoid an astype(object). - # Either 20300 or a `_values_for_fillna` would avoid this duplication. 
- if isinstance(value, ABCSeries): - value = value.array - - value, method = validate_fillna_kwargs(value, method) - - mask = self.isna() - - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. Got ({len(value)}) " - f" expected {len(self)}" - ) - value = value[mask] - - if mask.any(): - if method is not None: - if method == "pad": - func = missing.pad_1d - else: - func = missing.backfill_1d - - values = self.copy() - new_values = func(values, limit=limit, mask=mask) - new_values = self._from_backing_data(new_values) - else: - # fill with value - new_values = self.copy() - new_values[mask] = value - else: - new_values = self.copy() - return new_values - # ------------------------------------------------------------------ # Frequency Properties/Methods diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index afcae2c5c8b43..61076132b24cd 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -1,5 +1,5 @@ import numbers -from typing import Optional, Tuple, Type, Union +from typing import Tuple, Type, Union import numpy as np from numpy.lib.mixins import NDArrayOperatorsMixin @@ -7,10 +7,8 @@ from pandas._libs import lib from pandas._typing import Scalar from pandas.compat.numpy import function as nv -from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype -from pandas.core.dtypes.inference import is_array_like from pandas.core.dtypes.missing import isna from pandas import compat @@ -19,7 +17,6 @@ from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionOpsMixin from pandas.core.construction import extract_array -from pandas.core.missing import backfill_1d, pad_1d class PandasDtype(ExtensionDtype): @@ -263,35 +260,6 @@ def _validate_setitem_value(self, value): def isna(self) -> np.ndarray: return isna(self._ndarray) - def fillna( - self, value=None, 
method: Optional[str] = None, limit: Optional[int] = None - ) -> "PandasArray": - # TODO(_values_for_fillna): remove this - value, method = validate_fillna_kwargs(value, method) - - mask = self.isna() - - if is_array_like(value): - if len(value) != len(self): - raise ValueError( - f"Length of 'value' does not match. Got ({len(value)}) " - f" expected {len(self)}" - ) - value = value[mask] - - if mask.any(): - if method is not None: - func = pad_1d if method == "pad" else backfill_1d - new_values = func(self._ndarray, limit=limit, mask=mask) - new_values = self._from_sequence(new_values, dtype=self.dtype) - else: - # fill with value - new_values = self.copy() - new_values[mask] = value - else: - new_values = self.copy() - return new_values - def _validate_fill_value(self, fill_value): if fill_value is None: # Primarily for subclasses diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 9b96c8f01153b..edcdf2f54bc4c 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -587,7 +587,7 @@ def interpolate_2d( return values -def _cast_values_for_fillna(values, dtype: DtypeObj): +def _cast_values_for_fillna(values, dtype: DtypeObj, has_mask: bool): """ Cast values to a dtype that algos.pad and algos.backfill can handle. 
""" @@ -597,8 +597,10 @@ def _cast_values_for_fillna(values, dtype: DtypeObj): if needs_i8_conversion(dtype): values = values.view(np.int64) - elif is_integer_dtype(values): + elif is_integer_dtype(values) and not has_mask: # NB: this check needs to come after the datetime64 check above + # has_mask check to avoid casting i8 values that have already + # been cast from PeriodDtype values = ensure_float64(values) return values @@ -609,11 +611,12 @@ def _fillna_prep(values, mask=None, dtype: Optional[DtypeObj] = None): if dtype is None: dtype = values.dtype - if mask is None: + has_mask = mask is not None + if not has_mask: # This needs to occur before datetime/timedeltas are cast to int64 mask = isna(values) - values = _cast_values_for_fillna(values, dtype) + values = _cast_values_for_fillna(values, dtype, has_mask) mask = mask.view(np.uint8) return values, mask From 56a19f5c201902be06294133ea4e47be1dfed00d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 21 Sep 2020 19:58:27 -0700 Subject: [PATCH 0867/1025] Revert "ENH: Optimize nrows in read_excel (#35974)" (#36537) This reverts commit e975f3def1ff430d5801fbe241c52d7206c79956. 
--- asv_bench/benchmarks/io/excel.py | 6 +----- doc/source/whatsnew/v1.2.0.rst | 1 - pandas/io/excel/_base.py | 30 ++++----------------------- pandas/io/excel/_odfreader.py | 13 ++---------- pandas/io/excel/_openpyxl.py | 9 +------- pandas/io/excel/_pyxlsb.py | 11 ++-------- pandas/io/excel/_xlrd.py | 21 ++++--------------- pandas/tests/io/excel/test_readers.py | 16 -------------- 8 files changed, 14 insertions(+), 93 deletions(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 1eaccb9f2d897..80af2cff41769 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -11,7 +11,7 @@ def _generate_dataframe(): - N = 20000 + N = 2000 C = 5 df = DataFrame( np.random.randn(N, C), @@ -69,9 +69,5 @@ def time_read_excel(self, engine): fname = self.fname_odf if engine == "odf" else self.fname_excel read_excel(fname, engine=engine) - def time_read_excel_nrows(self, engine): - fname = self.fname_odf if engine == "odf" else self.fname_excel - read_excel(fname, engine=engine, nrows=1) - from ..pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 19a563be0a568..18940b574b517 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -224,7 +224,6 @@ Performance improvements - Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) -- Performance improvement in `read_excel` for when ``nrows`` is much smaller than the length of the file (:issue:`33281`). 
- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 667f37f47e188..604b7e12ec243 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,12 +3,12 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Any, List, Mapping, Optional, Union +from typing import Any, Mapping, Union from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES -from pandas._typing import Scalar, StorageOptions +from pandas._typing import StorageOptions from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -398,14 +398,7 @@ def get_sheet_by_index(self, index): pass @abc.abstractmethod - def get_sheet_data( - self, - sheet, - convert_float: bool, - header_nrows: int, - skiprows_nrows: int, - nrows: Optional[int], - ) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float): pass def parse( @@ -461,22 +454,7 @@ def parse( else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) - if isinstance(header, int): - header_nrows = header - elif header is None: - header_nrows = 0 - else: - header_nrows = max(header) - if isinstance(skiprows, int): - skiprows_nrows = skiprows - elif skiprows is None: - skiprows_nrows = 0 - else: - skiprows_nrows = len(skiprows) - - data = self.get_sheet_data( - sheet, convert_float, header_nrows, skiprows_nrows, nrows - ) + data = self.get_sheet_data(sheet, convert_float) usecols = maybe_convert_usecols(usecols) if not data: diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 
07d2f9a593b96..4f9f8a29c0010 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -1,4 +1,4 @@ -from typing import List, Optional, cast +from typing import List, cast import numpy as np @@ -71,14 +71,7 @@ def get_sheet_by_name(self, name: str): raise ValueError(f"sheet {name} not found") - def get_sheet_data( - self, - sheet, - convert_float: bool, - header_nrows: int, - skiprows_nrows: int, - nrows: Optional[int], - ) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: """ Parse an ODF Table into a list of lists """ @@ -94,8 +87,6 @@ def get_sheet_data( table: List[List[Scalar]] = [] - if isinstance(nrows, int): - sheet_rows = sheet_rows[: header_nrows + skiprows_nrows + nrows + 1] for i, sheet_row in enumerate(sheet_rows): sheet_cells = [x for x in sheet_row.childNodes if x.qname in cell_names] empty_cells = 0 diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index bc7b168eeaaa2..a5cadf4d93389 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -508,14 +508,7 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar: return cell.value - def get_sheet_data( - self, - sheet, - convert_float: bool, - header_nrows: int, - skiprows_nrows: int, - nrows: Optional[int], - ) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: data: List[List[Scalar]] = [] for row in sheet.rows: data.append([self._convert_cell(cell, convert_float) for cell in row]) diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index cf3dcebdff6eb..ac94f4dd3df74 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency @@ -68,14 +68,7 @@ def _convert_cell(self, cell, convert_float: 
bool) -> Scalar: return cell.v - def get_sheet_data( - self, - sheet, - convert_float: bool, - header_nrows: int, - skiprows_nrows: int, - nrows: Optional[int], - ) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: return [ [self._convert_cell(c, convert_float) for c in r] for r in sheet.rows(sparse=False) diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index e5d0d66f9570a..dfd5dde0329ae 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -1,9 +1,8 @@ from datetime import time -from typing import List, Optional import numpy as np -from pandas._typing import Scalar, StorageOptions +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import BaseExcelReader @@ -50,14 +49,7 @@ def get_sheet_by_name(self, name): def get_sheet_by_index(self, index): return self.book.sheet_by_index(index) - def get_sheet_data( - self, - sheet, - convert_float: bool, - header_nrows: int, - skiprows_nrows: int, - nrows: Optional[int], - ) -> List[List[Scalar]]: + def get_sheet_data(self, sheet, convert_float): from xlrd import ( XL_CELL_BOOLEAN, XL_CELL_DATE, @@ -106,14 +98,9 @@ def _parse_cell(cell_contents, cell_typ): cell_contents = val return cell_contents - data: List[List[Scalar]] = [] + data = [] - sheet_nrows = sheet.nrows - - if isinstance(nrows, int): - sheet_nrows = min(header_nrows + skiprows_nrows + nrows + 1, sheet_nrows) - - for i in range(sheet_nrows): + for i in range(sheet.nrows): row = [ _parse_cell(value, typ) for value, typ in zip(sheet.row_values(i), sheet.row_types(i)) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 4fb1ef8fa0c15..4bdcc5b327fa7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -1195,21 +1195,5 @@ def test_read_datetime_multiindex(self, engine, read_ext): ], ) expected = pd.DataFrame([], 
columns=expected_column_index) - tm.assert_frame_equal(expected, actual) - @pytest.mark.parametrize( - "header, skiprows", [(1, 2), (0, 3), (1, [0, 1]), ([2], 1)] - ) - @td.check_file_leaks - def test_header_skiprows_nrows(self, engine, read_ext, header, skiprows): - # GH 32727 - data = pd.read_excel("test1" + read_ext, engine=engine) - expected = ( - DataFrame(data.iloc[3:6]) - .reset_index(drop=True) - .rename(columns=data.iloc[2].rename(None)) - ) - actual = pd.read_excel( - "test1" + read_ext, engine=engine, header=header, skiprows=skiprows, nrows=3 - ) tm.assert_frame_equal(expected, actual) From 0ab8a6bc73b9f9134a9609d9ec38e699b494139b Mon Sep 17 00:00:00 2001 From: Jacob Peacock Date: Tue, 22 Sep 2020 03:22:11 -0400 Subject: [PATCH 0868/1025] Link to new location for scipy.window documentation (#36540) * Update link to scipy documentation This documentation has moved on Scipy. Update the link accordingly. * Update computation.rst --- doc/source/user_guide/computation.rst | 2 +- pandas/core/window/rolling.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index 10e27606a1415..e7edda90610b5 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -433,7 +433,7 @@ The following methods are available: The weights used in the window are specified by the ``win_type`` keyword. The list of recognized types are the `scipy.signal window functions -`__: +`__: * ``boxcar`` * ``triang`` diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 06c3ad23f904f..335fc3db5cd86 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -940,7 +940,7 @@ class Window(_Window): If ``win_type=None`` all points are evenly weighted. To learn more about different window types see `scipy.signal window functions - `__. + `__. Certain window types require additional parameters to be passed. 
Please see the third example below on how to add the additional parameters. From e845a678ef98e5e58e15cec1cb21a344d13b442c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 22 Sep 2020 14:49:42 +0200 Subject: [PATCH 0869/1025] TST: add missing assert (#36546) --- pandas/tests/indexing/test_partial.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 72bc13e67c040..337ec683ee745 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -680,3 +680,4 @@ def test_index_name_empty(self): expected = pd.DataFrame( {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="series_index") ) + tm.assert_frame_equal(df, expected) From 2a408d47b404020efbbf07a2994b4a5ea70217d7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 22 Sep 2020 14:37:58 +0100 Subject: [PATCH 0870/1025] CI: Update version of 'black' (#36493) --- .pre-commit-config.yaml | 2 +- asv_bench/benchmarks/arithmetic.py | 2 +- doc/make.py | 2 +- doc/source/conf.py | 2 +- doc/source/development/contributing.rst | 2 +- environment.yml | 2 +- pandas/_vendored/typing_extensions.py | 3 +-- pandas/core/aggregation.py | 2 +- pandas/core/algorithms.py | 9 ++++---- pandas/core/array_algos/replace.py | 2 +- pandas/core/frame.py | 10 ++++---- pandas/core/series.py | 10 +++----- pandas/core/sorting.py | 2 +- pandas/core/util/numba_.py | 2 +- pandas/io/formats/format.py | 4 ---- pandas/io/formats/latex.py | 4 ++-- pandas/tests/arrays/sparse/test_array.py | 16 ++++--------- pandas/tests/frame/test_analytics.py | 16 ++++++------- pandas/tests/io/test_gcs.py | 4 +--- pandas/tests/io/test_parquet.py | 2 +- .../scalar/timestamp/test_constructors.py | 23 +++++++++++-------- pandas/tests/series/test_operators.py | 4 +--- requirements-dev.txt | 2 +- scripts/tests/test_validate_docstrings.py | 3 +-- versioneer.py | 2 +- 25 files changed, 56 insertions(+), 76 deletions(-) diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 309e22e71a523..dd5323960ed20 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/python/black - rev: 19.10b0 + rev: 20.8b1 hooks: - id: black language_version: python3 diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 3ef6ab6209ea7..5a3febdcf75e7 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -125,7 +125,7 @@ def setup(self, op): arr1 = np.random.randn(n_rows, int(n_cols / 2)).astype("f8") arr2 = np.random.randn(n_rows, int(n_cols / 2)).astype("f4") df = pd.concat( - [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True, + [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True ) # should already be the case, but just to be sure df._consolidate_inplace() diff --git a/doc/make.py b/doc/make.py index 94fbfa9382d81..40ce9ea3bbcd2 100755 --- a/doc/make.py +++ b/doc/make.py @@ -286,7 +286,7 @@ def main(): joined = ",".join(cmds) argparser = argparse.ArgumentParser( - description="pandas documentation builder", epilog=f"Commands: {joined}", + description="pandas documentation builder", epilog=f"Commands: {joined}" ) joined = ", ".join(cmds) diff --git a/doc/source/conf.py b/doc/source/conf.py index ee0d4ca3f2a24..04540f7e6ec95 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -308,7 +308,7 @@ for method in methods: # ... 
and each of its public methods - moved_api_pages.append((f"{old}.{method}", f"{new}.{method}",)) + moved_api_pages.append((f"{old}.{method}", f"{new}.{method}")) if pattern is None: html_additional_pages = { diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index e5c6f77eea3ef..8558774955a40 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -720,7 +720,7 @@ submitting code to run the check yourself:: to auto-format your code. Additionally, many editors have plugins that will apply ``black`` as you edit files. -You should use a ``black`` version >= 19.10b0 as previous versions are not compatible +You should use a ``black`` version 20.8b1 as previous versions are not compatible with the pandas codebase. If you wish to run these checks automatically, we encourage you to use diff --git a/environment.yml b/environment.yml index 36bbd3d307159..ffd319b006ff2 100644 --- a/environment.yml +++ b/environment.yml @@ -15,7 +15,7 @@ dependencies: - cython>=0.29.21 # code checks - - black=19.10b0 + - black=20.8b1 - cpplint - flake8<3.8.0 # temporary pin, GH#34150 - flake8-comprehensions>=3.1.0 # used by flake8, linting of unnecessary comprehensions diff --git a/pandas/_vendored/typing_extensions.py b/pandas/_vendored/typing_extensions.py index 129d8998faccc..6efbbe9302952 100644 --- a/pandas/_vendored/typing_extensions.py +++ b/pandas/_vendored/typing_extensions.py @@ -2116,8 +2116,7 @@ def __init_subclass__(cls, *args, **kwargs): raise TypeError(f"Cannot subclass {cls.__module__}.Annotated") def _strip_annotations(t): - """Strips the annotations from a given type. 
- """ + """Strips the annotations from a given type.""" if isinstance(t, _AnnotatedAlias): return _strip_annotations(t.__origin__) if isinstance(t, typing._GenericAlias): diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index c123156495924..541c617f7f618 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -387,7 +387,7 @@ def validate_func_kwargs( def transform( - obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs, + obj: FrameOrSeries, func: AggFuncType, axis: Axis, *args, **kwargs ) -> FrameOrSeries: """ Transform a DataFrame or Series diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index edacacd3e26bd..ba08d26fbc24f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1022,11 +1022,10 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: to_raise = ( - ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() - or ( - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] - ).any() - ) + (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1] + ).any() or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() if to_raise: raise OverflowError("Overflow in int64 addition") diff --git a/pandas/core/array_algos/replace.py b/pandas/core/array_algos/replace.py index 09f9aefd64096..9eaa265adab2b 100644 --- a/pandas/core/array_algos/replace.py +++ b/pandas/core/array_algos/replace.py @@ -17,7 +17,7 @@ def compare_or_regex_search( - a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike, + a: ArrayLike, b: Union[Scalar, Pattern], regex: bool, mask: ArrayLike ) -> Union[ArrayLike, bool]: """ Compare two array_like inputs of the same shape or two scalar values diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 69b12bcff967f..5e06a8d16372a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ 
-449,9 +449,7 @@ def __init__( if isinstance(data, BlockManager): if index is None and columns is None and dtype is None and copy is False: # GH#33357 fastpath - NDFrame.__init__( - self, data, - ) + NDFrame.__init__(self, data) return mgr = self._init_mgr( @@ -5748,7 +5746,7 @@ def nsmallest(self, n, columns, keep="first") -> DataFrame: population GDP alpha-2 Tuvalu 11300 38 TV Anguilla 11300 311 AI - Iceland 337000 17036 IS + Iceland 337000 17036 IS When using ``keep='last'``, ties are resolved in reverse order: @@ -7143,7 +7141,7 @@ def unstack(self, level=-1, fill_value=None): return unstack(self, level, fill_value) - @Appender(_shared_docs["melt"] % dict(caller="df.melt(", other="melt",)) + @Appender(_shared_docs["melt"] % dict(caller="df.melt(", other="melt")) def melt( self, id_vars=None, @@ -8625,7 +8623,7 @@ def blk_func(values): # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce res = df._mgr.reduce(blk_func) - out = df._constructor(res,).iloc[0].rename(None) + out = df._constructor(res).iloc[0].rename(None) if out_dtype is not None: out = out.astype(out_dtype) if axis == 0 and is_object_dtype(out.dtype): diff --git a/pandas/core/series.py b/pandas/core/series.py index 48fae9a0a91cd..0984e86a23592 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -198,7 +198,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): # Constructors def __init__( - self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False, + self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False ): if ( @@ -208,9 +208,7 @@ def __init__( and copy is False ): # GH#33357 called with just the SingleBlockManager - NDFrame.__init__( - self, data, - ) + NDFrame.__init__(self, data) self.name = name return @@ -329,9 +327,7 @@ def __init__( data = SingleBlockManager.from_array(data, index) - generic.NDFrame.__init__( - self, data, - ) + generic.NDFrame.__init__(self, data) 
self.name = name self._set_axis(0, index, fastpath=True) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index ec62192464665..1fec2bbbf5fdc 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -72,7 +72,7 @@ def get_indexer_indexer( ) elif isinstance(target, ABCMultiIndex): indexer = lexsort_indexer( - target._get_codes_for_sorting(), orders=ascending, na_position=na_position, + target._get_codes_for_sorting(), orders=ascending, na_position=na_position ) else: # Check monotonic-ness before sort an index (GH 11080) diff --git a/pandas/core/util/numba_.py b/pandas/core/util/numba_.py index f06dd10d0e497..1dd005c1602a5 100644 --- a/pandas/core/util/numba_.py +++ b/pandas/core/util/numba_.py @@ -25,7 +25,7 @@ def set_use_numba(enable: bool = False) -> None: def get_jit_arguments( - engine_kwargs: Optional[Dict[str, bool]] = None, kwargs: Optional[Dict] = None, + engine_kwargs: Optional[Dict[str, bool]] = None, kwargs: Optional[Dict] = None ) -> Tuple[bool, bool, bool]: """ Return arguments to pass to numba.JIT, falling back on pandas default JIT settings. 
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 75e5eedfc148d..8e1deb21bf8ea 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1384,10 +1384,6 @@ def _format(x): class FloatArrayFormatter(GenericArrayFormatter): - """ - - """ - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index eb35fff3a4f8e..170df193bef00 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -41,8 +41,8 @@ def __init__( self.multirow = multirow self.clinebuf: List[List[int]] = [] self.strcols = self._get_strcols() - self.strrows: List[List[str]] = ( - list(zip(*self.strcols)) # type: ignore[arg-type] + self.strrows: List[List[str]] = list( + zip(*self.strcols) # type: ignore[arg-type] ) def get_strrow(self, row_num: int) -> str: diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index ece9367cea7fe..f18117cfd3d1f 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -193,9 +193,7 @@ def test_constructor_inferred_fill_value(self, data, fill_value): assert result == fill_value @pytest.mark.parametrize("format", ["coo", "csc", "csr"]) - @pytest.mark.parametrize( - "size", [0, 10], - ) + @pytest.mark.parametrize("size", [0, 10]) @td.skip_if_no_scipy def test_from_spmatrix(self, size, format): import scipy.sparse @@ -693,17 +691,13 @@ def test_getslice_tuple(self): dense = np.array([np.nan, 0, 3, 4, 0, 5, np.nan, np.nan, 0]) sparse = SparseArray(dense) - res = sparse[ - 4:, - ] # noqa: E231 - exp = SparseArray(dense[4:,]) # noqa: E231 + res = sparse[(slice(4, None),)] + exp = SparseArray(dense[4:]) tm.assert_sp_array_equal(res, exp) sparse = SparseArray(dense, fill_value=0) - res = sparse[ - 4:, - ] # noqa: E231 - exp = SparseArray(dense[4:,], fill_value=0) # noqa: E231 + res = sparse[(slice(4, None),)] + exp = SparseArray(dense[4:], 
fill_value=0) tm.assert_sp_array_equal(res, exp) msg = "too many indices for array" diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index f21b1d3dfe487..4324b03ed13d6 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1060,14 +1060,14 @@ def test_any_all_bool_only(self): (np.any, {"A": pd.Series([0.0, 1.0], dtype="float")}, True), (np.all, {"A": pd.Series([0, 1], dtype=int)}, False), (np.any, {"A": pd.Series([0, 1], dtype=int)}, True), - pytest.param(np.all, {"A": pd.Series([0, 1], dtype="M8[ns]")}, False,), - pytest.param(np.any, {"A": pd.Series([0, 1], dtype="M8[ns]")}, True,), - pytest.param(np.all, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True,), - pytest.param(np.any, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True,), - pytest.param(np.all, {"A": pd.Series([0, 1], dtype="m8[ns]")}, False,), - pytest.param(np.any, {"A": pd.Series([0, 1], dtype="m8[ns]")}, True,), - pytest.param(np.all, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True,), - pytest.param(np.any, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True,), + pytest.param(np.all, {"A": pd.Series([0, 1], dtype="M8[ns]")}, False), + pytest.param(np.any, {"A": pd.Series([0, 1], dtype="M8[ns]")}, True), + pytest.param(np.all, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True), + pytest.param(np.any, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True), + pytest.param(np.all, {"A": pd.Series([0, 1], dtype="m8[ns]")}, False), + pytest.param(np.any, {"A": pd.Series([0, 1], dtype="m8[ns]")}, True), + pytest.param(np.all, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True), + pytest.param(np.any, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True), (np.all, {"A": pd.Series([0, 1], dtype="category")}, False), (np.any, {"A": pd.Series([0, 1], dtype="category")}, True), (np.all, {"A": pd.Series([1, 2], dtype="category")}, True), diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 18b5743a3375a..9d179d983ceeb 100644 --- 
a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -108,9 +108,7 @@ def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding) compression_only = "gz" compression["method"] = "infer" path_gcs += f".{compression_only}" - df.to_csv( - path_gcs, compression=compression, encoding=encoding, - ) + df.to_csv(path_gcs, compression=compression, encoding=encoding) assert gcs_buffer.getvalue() == buffer.getvalue() read_df = read_csv(path_gcs, index_col=0, compression="infer", encoding=encoding) tm.assert_frame_equal(df, read_df) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 35a400cba8671..a5033c51bce81 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -572,7 +572,7 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so): pytest.param( ["A"], marks=pytest.mark.xfail( - PY38, reason="Getting back empty DataFrame", raises=AssertionError, + PY38, reason="Getting back empty DataFrame", raises=AssertionError ), ), [], diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index 316a299ba1cbb..d1c3ad508d877 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -259,17 +259,20 @@ def test_constructor_keyword(self): Timestamp("20151112") ) - assert repr( - Timestamp( - year=2015, - month=11, - day=12, - hour=1, - minute=2, - second=3, - microsecond=999999, + assert ( + repr( + Timestamp( + year=2015, + month=11, + day=12, + hour=1, + minute=2, + second=3, + microsecond=999999, + ) ) - ) == repr(Timestamp("2015-11-12 01:02:03.999999")) + == repr(Timestamp("2015-11-12 01:02:03.999999")) + ) def test_constructor_fromordinal(self): base = datetime(2000, 1, 1) diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py index aee947e738525..a796023c75b78 100644 --- a/pandas/tests/series/test_operators.py +++ 
b/pandas/tests/series/test_operators.py @@ -554,9 +554,7 @@ def test_unary_minus_nullable_int( expected = pd.Series(target, dtype=dtype) tm.assert_series_equal(result, expected) - @pytest.mark.parametrize( - "source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]], - ) + @pytest.mark.parametrize("source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]]) def test_unary_plus_nullable_int(self, any_signed_nullable_int_dtype, source): dtype = any_signed_nullable_int_dtype expected = pd.Series(source, dtype=dtype) diff --git a/requirements-dev.txt b/requirements-dev.txt index fb647c10f72bc..4f93ce9017f91 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -6,7 +6,7 @@ python-dateutil>=2.7.3 pytz asv cython>=0.29.21 -black==19.10b0 +black==20.8b1 cpplint flake8<3.8.0 flake8-comprehensions>=3.1.0 diff --git a/scripts/tests/test_validate_docstrings.py b/scripts/tests/test_validate_docstrings.py index b11de0c4ad860..74819db7b878c 100644 --- a/scripts/tests/test_validate_docstrings.py +++ b/scripts/tests/test_validate_docstrings.py @@ -6,8 +6,7 @@ class BadDocstrings: - """Everything here has a bad docstring - """ + """Everything here has a bad docstring""" def private_classes(self): """ diff --git a/versioneer.py b/versioneer.py index 65c9523ba5573..171156c2c5315 100644 --- a/versioneer.py +++ b/versioneer.py @@ -1073,7 +1073,7 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): fmt = "tag '%s' doesn't start with prefix '%s'" print(fmt % (full_tag, tag_prefix)) pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format( - full_tag, tag_prefix, + full_tag, tag_prefix ) return pieces pieces["closest-tag"] = full_tag[len(tag_prefix) :] From 2cbb294b2e931f57b2eeffa88f70500fc6f66063 Mon Sep 17 00:00:00 2001 From: Tomasz Sakrejda Date: Tue, 22 Sep 2020 06:41:24 -0700 Subject: [PATCH 0871/1025] TST: base test for ExtensionArray.astype to its own type + copy keyword (#35116) Co-authored-by: Joris Van den Bossche --- doc/source/whatsnew/v1.2.0.rst | 1 + 
pandas/core/arrays/base.py | 5 +++++ pandas/core/arrays/boolean.py | 5 ++++- pandas/core/arrays/period.py | 7 ++++++- pandas/core/arrays/sparse/array.py | 5 +++++ pandas/tests/extension/base/casting.py | 9 +++++++++ pandas/tests/extension/decimal/array.py | 7 +++++-- pandas/tests/extension/test_numpy.py | 16 ++-------------- 8 files changed, 37 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 18940b574b517..6a5b4b3b9ff16 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -366,6 +366,7 @@ ExtensionArray ^^^^^^^^^^^^^^ - Fixed Bug where :class:`DataFrame` column set to scalar extension type via a dict instantion was considered an object type rather than the extension type (:issue:`35965`) +- Fixed bug where ``astype()`` with equal dtype and ``copy=False`` would return a new object (:issue:`28488`) - diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e93cdb608dffb..eae401f9744f0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -457,6 +457,11 @@ def astype(self, dtype, copy=True): from pandas.core.arrays.string_ import StringDtype dtype = pandas_dtype(dtype) + if is_dtype_equal(dtype, self.dtype): + if not copy: + return self + elif copy: + return self.copy() if isinstance(dtype, StringDtype): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index bd4bdc5ecb46f..3bd36209b3c71 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -375,7 +375,10 @@ def astype(self, dtype, copy: bool = True) -> ArrayLike: if isinstance(dtype, BooleanDtype): values, mask = coerce_to_array(self, copy=copy) - return BooleanArray(values, mask, copy=False) + if not copy: + return self + else: + return BooleanArray(values, mask, copy=False) elif isinstance(dtype, StringDtype): return
dtype.construct_array_type()._from_sequence(self, copy=False) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 44c0455018a42..372ef7df9dc3a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -33,6 +33,7 @@ TD64NS_DTYPE, ensure_object, is_datetime64_dtype, + is_dtype_equal, is_float_dtype, is_period_dtype, pandas_dtype, @@ -582,7 +583,11 @@ def astype(self, dtype, copy: bool = True): # We handle Period[T] -> Period[U] # Our parent handles everything else. dtype = pandas_dtype(dtype) - + if is_dtype_equal(dtype, self._dtype): + if not copy: + return self + elif copy: + return self.copy() if is_period_dtype(dtype): return self.asfreq(dtype.freq) return super().astype(dtype, copy=copy) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c88af77ea6189..528d78a5414ea 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1063,6 +1063,11 @@ def astype(self, dtype=None, copy=True): IntIndex Indices: array([2, 3], dtype=int32) """ + if is_dtype_equal(dtype, self._dtype): + if not copy: + return self + elif copy: + return self.copy() dtype = self.dtype.update_dtype(dtype) subtype = dtype._subtype_with_str # TODO copy=False is broken for astype_nansafe with int -> float, so cannot diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index 3aaf040a4279b..039b42210224e 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas as pd from pandas.core.internals import ObjectBlock @@ -56,3 +57,11 @@ def test_astype_empty_dataframe(self, dtype): df = pd.DataFrame() result = df.astype(dtype) self.assert_frame_equal(result, df) + + @pytest.mark.parametrize("copy", [True, False]) + def test_astype_own_type(self, data, copy): + # ensure that astype returns the original object for equal dtype and copy=False + # 
https://github.com/pandas-dev/pandas/issues/28488 + result = data.astype(data.dtype, copy=copy) + assert (result is data) is (not copy) + self.assert_extension_array_equal(result, data) diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 9147360e71c73..2895f33d5c887 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -7,7 +7,7 @@ import numpy as np from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.common import is_dtype_equal, pandas_dtype import pandas as pd from pandas.api.extensions import no_default, register_extension_dtype @@ -131,9 +131,12 @@ def copy(self): return type(self)(self._data.copy()) def astype(self, dtype, copy=True): + if is_dtype_equal(dtype, self._dtype): + if not copy: + return self dtype = pandas_dtype(dtype) if isinstance(dtype, type(self.dtype)): - return type(self)(self._data, context=dtype.context) + return type(self)(self._data, copy=copy, context=dtype.context) return super().astype(dtype, copy=copy) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index bbfaacae1b444..c4afcd7a536df 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -177,7 +177,7 @@ def test_take_series(self, data): def test_loc_iloc_frame_single_dtype(self, data, request): npdtype = data.dtype.numpy_dtype - if npdtype == object or npdtype == np.float64: + if npdtype == object: # GH#33125 mark = pytest.mark.xfail( reason="GH#33125 astype doesn't recognize data.dtype" @@ -191,14 +191,6 @@ class TestGroupby(BaseNumPyTests, base.BaseGroupbyTests): def test_groupby_extension_apply( self, data_for_grouping, groupby_apply_op, request ): - # ValueError: Names should be list-like for a MultiIndex - a = "a" - is_identity = groupby_apply_op(a) is a - if data_for_grouping.dtype.numpy_dtype == np.float64 and is_identity: - 
mark = pytest.mark.xfail( - reason="GH#33125 astype doesn't recognize data.dtype" - ) - request.node.add_marker(mark) super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) @@ -306,11 +298,7 @@ def test_arith_series_with_array(self, data, all_arithmetic_operators): class TestPrinting(BaseNumPyTests, base.BasePrintingTests): - @pytest.mark.xfail( - reason="GH#33125 PandasArray.astype does not recognize PandasDtype" - ) - def test_series_repr(self, data): - super().test_series_repr(data) + pass @skip_nested From 84026d503f64966071776d460aa50593bea93992 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Tue, 22 Sep 2020 15:31:07 +0100 Subject: [PATCH 0872/1025] CI: add pre-commit action, include pyupgrade (#36471) --- .github/workflows/pre-commit.yml | 14 ++++++++++++++ .pre-commit-config.yaml | 14 +++++++------- doc/source/development/contributing.rst | 11 +++++++++++ doc/sphinxext/announce.py | 1 - environment.yml | 2 ++ requirements-dev.txt | 2 ++ 6 files changed, 36 insertions(+), 8 deletions(-) create mode 100644 .github/workflows/pre-commit.yml diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 0000000000000..723347913ac38 --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,14 @@ +name: pre-commit + +on: + pull_request: + push: + branches: [master] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-python@v2 + - uses: pre-commit/action@v2.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dd5323960ed20..6319629d57512 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,30 +3,30 @@ repos: rev: 20.8b1 hooks: - id: black - language_version: python3 - repo: https://gitlab.com/pycqa/flake8 rev: 3.8.3 hooks: - id: flake8 - language: python_venv additional_dependencies: [flake8-comprehensions>=3.1.0] - id: flake8 name: flake8-pyx - language: python_venv files: \.(pyx|pxd)$ types: - 
file args: [--append-config=flake8/cython.cfg] - id: flake8 name: flake8-pxd - language: python_venv files: \.pxi\.in$ types: - file args: [--append-config=flake8/cython-template.cfg] -- repo: https://github.com/pre-commit/mirrors-isort - rev: v5.2.2 +- repo: https://github.com/PyCQA/isort + rev: 5.2.2 hooks: - id: isort - language: python_venv exclude: ^pandas/__init__\.py$|^pandas/core/api\.py$ +- repo: https://github.com/asottile/pyupgrade + rev: v2.7.2 + hooks: + - id: pyupgrade + args: [--py37-plus] diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index 8558774955a40..bb13fbed09677 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -634,6 +634,10 @@ do not make sudden changes to the code that could have the potential to break a lot of user code as a result, that is, we need it to be as *backwards compatible* as possible to avoid mass breakages. +In addition to ``./ci/code_checks.sh``, some extra checks are run by +``pre-commit`` - see :ref:`here ` for how to +run them. + Additional standards are outlined on the :ref:`pandas code style guide ` Optional dependencies @@ -826,6 +830,13 @@ remain up-to-date with our code checks as they change. Note that if needed, you can skip these checks with ``git commit --no-verify``. +If you don't want to use ``pre-commit`` as part of your workflow, you can still use it +to run its checks by running:: + + pre-commit run --files + +without having to have done ``pre-commit install`` beforehand. 
+ Backwards compatibility ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index 9c175e4e58b45..2ec0b515ea95c 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- encoding:utf-8 -*- """ Script to generate contributor and pull request lists diff --git a/environment.yml b/environment.yml index ffd319b006ff2..7f6ce8cb9fa3b 100644 --- a/environment.yml +++ b/environment.yml @@ -22,7 +22,9 @@ dependencies: - flake8-rst>=0.6.0,<=0.7.0 # linting of code blocks in rst files - isort>=5.2.1 # check that imports are in the right order - mypy=0.782 + - pre-commit - pycodestyle # used by flake8 + - pyupgrade # documentation - gitpython # obtain contributors from git for whatsnew diff --git a/requirements-dev.txt b/requirements-dev.txt index 4f93ce9017f91..690a3368c7aca 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -13,7 +13,9 @@ flake8-comprehensions>=3.1.0 flake8-rst>=0.6.0,<=0.7.0 isort>=5.2.1 mypy==0.782 +pre-commit pycodestyle +pyupgrade gitpython gitdb sphinx From 1e63215e754575dd2c0d9990deb7481f1a78346e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Sep 2020 07:54:31 -0700 Subject: [PATCH 0873/1025] validate fill_value in IntervalArray.take unconditionally (#36538) --- pandas/core/arrays/interval.py | 4 +--- pandas/io/formats/excel.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index ebabc7edcbf43..1011381f235ca 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -812,9 +812,7 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): fill_left = fill_right = fill_value if allow_fill: - if (np.asarray(indices) == -1).any(): - # We have excel tests that pass fill_value=True, xref GH#36466 - fill_left, fill_right = self._validate_fill_value(fill_value) + fill_left, fill_right = 
self._validate_fill_value(fill_value) left_take = take( self.left, indices, allow_fill=allow_fill, fill_value=fill_left diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index cc7b6b0bfea97..0140804e8c7b5 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -629,7 +629,9 @@ def _format_hierarchical_rows(self): ): values = levels.take( - level_codes, allow_fill=levels._can_hold_na, fill_value=True + level_codes, + allow_fill=levels._can_hold_na, + fill_value=levels._na_value, ) for i in spans: From 983ebdb2e3be3067216fb098eb4b93b8ff292f44 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 22 Sep 2020 17:18:53 +0100 Subject: [PATCH 0874/1025] CI: fix failing pre-commit (#36549) --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 8f447d5c38169..8e25705c1f4c3 100755 --- a/setup.py +++ b/setup.py @@ -387,8 +387,7 @@ def build_extension(self, ext): class DummyBuildSrc(Command): - """ numpy's build_src command interferes with Cython's build_ext. 
- """ + """numpy's build_src command interferes with Cython's build_ext.""" user_options = [] From b906c39cf7b22721740f548d9462beaa0ede750b Mon Sep 17 00:00:00 2001 From: nrebena Date: Wed, 23 Sep 2020 00:08:21 +0200 Subject: [PATCH 0875/1025] Regr/period range large value/issue 36430 (#36535) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/_libs/tslibs/period.pyx | 3 ++- pandas/tests/scalar/period/test_period.py | 7 +++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index e3a96c69918db..e3b0f59c3edcc 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -36,6 +36,7 @@ Fixed regressions - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`,:issue:`35802`) - Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`,:issue:`36377`) - Fixed regression when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) +- Fixed regression in :class:`Period` incorrect value for ordinal over the maximum timestamp (:issue:`36430`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 86b6533f5caf5..27402c8d255b6 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -861,6 +861,7 @@ cdef int64_t get_time_nanos(int freq, int64_t unix_date, int64_t ordinal) nogil: """ cdef: int64_t sub, factor + int64_t nanos_in_day = 24 * 3600 * 10**9 freq = get_freq_group(freq) @@ -886,7 +887,7 @@ cdef int64_t get_time_nanos(int freq, int64_t unix_date, int64_t ordinal) nogil: # We must have freq == FR_HR factor = 10**9 * 3600 - sub = ordinal - unix_date * 24 * 3600 * 10**9 / factor + sub = ordinal - unix_date * (nanos_in_day / factor) return sub * factor diff --git a/pandas/tests/scalar/period/test_period.py b/pandas/tests/scalar/period/test_period.py index dcef0615121c1..795021a260028 100644 --- a/pandas/tests/scalar/period/test_period.py +++ b/pandas/tests/scalar/period/test_period.py @@ -486,6 +486,13 @@ def test_period_cons_combined(self): with pytest.raises(ValueError, match=msg): Period("2011-01", freq="1D1W") + @pytest.mark.parametrize("hour", range(24)) + def test_period_large_ordinal(self, hour): + # Issue #36430 + # Integer overflow for Period over the maximum timestamp + p = pd.Period(ordinal=2562048 + hour, freq="1H") + assert p.hour == hour + class TestPeriodMethods: def test_round_trip(self): From 158d40b057121f0bef16630772d88e1ff2880960 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 22 Sep 2020 18:16:04 -0400 Subject: [PATCH 0876/1025] TYP: exclusions in BaseGroupBy (#36559) --- pandas/core/groupby/groupby.py | 7 ++++--- pandas/core/groupby/grouper.py | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9a14323dd8c3a..f1a61f433fc51 100644 --- a/pandas/core/groupby/groupby.py +++ 
b/pandas/core/groupby/groupby.py @@ -24,6 +24,7 @@ class providing the base-class of operations. Mapping, Optional, Sequence, + Set, Tuple, Type, TypeVar, @@ -36,7 +37,7 @@ class providing the base-class of operations. from pandas._libs import Timestamp, lib import pandas._libs.groupby as libgroupby -from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Scalar +from pandas._typing import F, FrameOrSeries, FrameOrSeriesUnion, Label, Scalar from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, cache_readonly, doc @@ -488,7 +489,7 @@ def __init__( axis: int = 0, level=None, grouper: Optional["ops.BaseGrouper"] = None, - exclusions=None, + exclusions: Optional[Set[Label]] = None, selection=None, as_index: bool = True, sort: bool = True, @@ -537,7 +538,7 @@ def __init__( self.obj = obj self.axis = obj._get_axis_number(axis) self.grouper = grouper - self.exclusions = set(exclusions) if exclusions else set() + self.exclusions = exclusions or set() def __len__(self) -> int: return len(self.groups) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 59ea7781025c4..6263d5337f42f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -2,12 +2,12 @@ Provide user facing operators for doing the split part of the split-apply-combine paradigm. 
""" -from typing import Dict, Hashable, List, Optional, Tuple +from typing import Dict, Hashable, List, Optional, Set, Tuple import warnings import numpy as np -from pandas._typing import FrameOrSeries +from pandas._typing import FrameOrSeries, Label from pandas.errors import InvalidIndexError from pandas.util._decorators import cache_readonly @@ -614,7 +614,7 @@ def get_grouper( mutated: bool = False, validate: bool = True, dropna: bool = True, -) -> Tuple["ops.BaseGrouper", List[Hashable], FrameOrSeries]: +) -> Tuple["ops.BaseGrouper", Set[Label], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. @@ -690,13 +690,13 @@ def get_grouper( if isinstance(key, Grouper): binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: - return grouper, [], obj + return grouper, set(), obj else: - return grouper, [key.key], obj + return grouper, {key.key}, obj # already have a BaseGrouper, just return it elif isinstance(key, ops.BaseGrouper): - return key, [], obj + return key, set(), obj if not isinstance(key, list): keys = [key] @@ -739,7 +739,7 @@ def get_grouper( levels = [level] * len(keys) groupings: List[Grouping] = [] - exclusions: List[Hashable] = [] + exclusions: Set[Label] = set() # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: @@ -769,21 +769,21 @@ def is_in_obj(gpr) -> bool: if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name - exclusions.append(name) + exclusions.add(name) elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] - exclusions.append(name) + exclusions.add(name) elif obj._is_level_reference(gpr, axis=axis): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions - exclusions.append(gpr.key) + 
exclusions.add(gpr.key) in_axis, name = False, None else: in_axis, name = False, None From af162a6d102ed908fa173ed3bc50f01ebc2f26f6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Sep 2020 15:17:15 -0700 Subject: [PATCH 0877/1025] REF: de-duplicate Categorical validators (#36558) --- pandas/core/arrays/categorical.py | 10 ++-------- pandas/tests/indexes/categorical/test_category.py | 13 ++++++++----- pandas/tests/indexing/test_categorical.py | 5 +++-- 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ef69d6565cfeb..e984f2c26b916 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1177,13 +1177,7 @@ def _validate_where_value(self, value): return self._validate_listlike(value) def _validate_insert_value(self, value) -> int: - code = self.categories.get_indexer([value]) - if (code == -1) and not (is_scalar(value) and isna(value)): - raise TypeError( - "cannot insert an item into a CategoricalIndex " - "that is not already an existing category" - ) - return code[0] + return self._validate_fill_value(value) def _validate_searchsorted_value(self, value): # searchsorted is very performance sensitive. 
By converting codes @@ -1213,7 +1207,7 @@ def _validate_fill_value(self, fill_value): ValueError """ - if isna(fill_value): + if is_valid_nat_for_dtype(fill_value, self.categories.dtype): fill_value = -1 elif fill_value in self.categories: fill_value = self._unbox_scalar(fill_value) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index a3a06338a0277..81b31e3ea180c 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -171,11 +171,8 @@ def test_insert(self): tm.assert_index_equal(result, expected, exact=True) # invalid - msg = ( - "cannot insert an item into a CategoricalIndex that is not " - "already an existing category" - ) - with pytest.raises(TypeError, match=msg): + msg = "'fill_value=d' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): ci.insert(0, "d") # GH 18295 (test missing) @@ -184,6 +181,12 @@ def test_insert(self): result = CategoricalIndex(list("aabcb")).insert(1, na) tm.assert_index_equal(result, expected) + def test_insert_na_mismatched_dtype(self): + ci = pd.CategoricalIndex([0, 1, 1]) + msg = "'fill_value=NaT' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): + ci.insert(0, pd.NaT) + def test_delete(self): ci = self.create_index() diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 98edb56260b01..9f3ee81fac2eb 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -76,9 +76,10 @@ def test_loc_scalar(self): "cannot insert an item into a CategoricalIndex that is not " "already an existing category" ) - with pytest.raises(TypeError, match=msg): + msg = "'fill_value=d' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): df.loc["d", "A"] = 10 - with pytest.raises(TypeError, match=msg): + 
with pytest.raises(ValueError, match=msg): df.loc["d", "C"] = 10 with pytest.raises(KeyError, match="^1$"): From a020377972303156e4e7feed723ad3ba91687caa Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 22 Sep 2020 17:18:06 -0500 Subject: [PATCH 0878/1025] Call finalize in Series.dt (#36554) xref #28283 --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/accessors.py | 12 +++++++++--- pandas/tests/generic/test_finalize.py | 4 ---- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6a5b4b3b9ff16..7280ccc633f17 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -374,6 +374,7 @@ Other ^^^^^ - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) +- Fixed metadata propagation in the :class:`Series.dt` accessor (:issue:`28283`) - Bug in :meth:`Series.transform` would give incorrect results or raise when the argument ``func`` was dictionary (:issue:`35811`) - Bug in :meth:`Index.union` behaving differently depending on whether operand is a :class:`Index` or other list-like (:issue:`36384`) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 881d5ce1fbaab..aa2c04e48eb81 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -78,7 +78,7 @@ def _delegate_property_get(self, name): else: index = self._parent.index # return the result as a Series, which is by definition a copy - result = Series(result, index=index, name=self.name) + result = Series(result, index=index, name=self.name).__finalize__(self._parent) # setting this object will show a SettingWithCopyWarning/Error result._is_copy = ( @@ -106,7 +106,9 @@ def 
_delegate_method(self, name, *args, **kwargs): if not is_list_like(result): return result - result = Series(result, index=self._parent.index, name=self.name) + result = Series(result, index=self._parent.index, name=self.name).__finalize__( + self._parent + ) # setting this object will show a SettingWithCopyWarning/Error result._is_copy = ( @@ -371,7 +373,11 @@ def components(self): 3 0 0 0 3 0 0 0 4 0 0 0 4 0 0 0 """ - return self._get_values().components.set_index(self._parent.index) + return ( + self._get_values() + .components.set_index(self._parent.index) + .__finalize__(self._parent) + ) @property def freq(self): diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py index 8898619e374ab..6692102bc9008 100644 --- a/pandas/tests/generic/test_finalize.py +++ b/pandas/tests/generic/test_finalize.py @@ -678,7 +678,6 @@ def test_string_method(method): ], ids=idfn, ) -@not_implemented_mark def test_datetime_method(method): s = pd.Series(pd.date_range("2000", periods=4)) s.attrs = {"a": 1} @@ -714,7 +713,6 @@ def test_datetime_method(method): "days_in_month", ], ) -@not_implemented_mark def test_datetime_property(attr): s = pd.Series(pd.date_range("2000", periods=4)) s.attrs = {"a": 1} @@ -725,7 +723,6 @@ def test_datetime_property(attr): @pytest.mark.parametrize( "attr", ["days", "seconds", "microseconds", "nanoseconds", "components"] ) -@not_implemented_mark def test_timedelta_property(attr): s = pd.Series(pd.timedelta_range("2000", periods=4)) s.attrs = {"a": 1} @@ -734,7 +731,6 @@ def test_timedelta_property(attr): @pytest.mark.parametrize("method", [operator.methodcaller("total_seconds")]) -@not_implemented_mark def test_timedelta_methods(method): s = pd.Series(pd.timedelta_range("2000", periods=4)) s.attrs = {"a": 1} From 8bd36a2a16588f4c0cea330209e9ef86c457726e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 22 Sep 2020 15:19:58 -0700 Subject: [PATCH 0879/1025] REF: Categorical.fillna match patterns in other methods 
(#36530) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arrays/categorical.py | 41 ++++++------------- pandas/tests/frame/test_missing.py | 3 +- .../tests/indexes/categorical/test_fillna.py | 28 ++++++++++++- pandas/tests/series/methods/test_fillna.py | 3 +- 5 files changed, 44 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7280ccc633f17..ed48bf0675034 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -238,7 +238,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - +- :meth:`Categorical.fillna` will always return a copy, will validate a passed fill value regardless of whether there are any NAs to fill, and will disallow a ``NaT`` as a fill value for numeric categories (:issue:`36530`) - - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e984f2c26b916..32ef37d44ad1b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -37,7 +37,6 @@ ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -from pandas.core.dtypes.inference import is_hashable from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import ops @@ -1630,6 +1629,7 @@ def fillna(self, value=None, method=None, limit=None): value, method = validate_fillna_kwargs( value, method, validate_scalar_dict_value=False ) + value = extract_array(value, extract_numpy=True) if value is None: value = np.nan @@ -1638,10 +1638,8 @@ def fillna(self, value=None, method=None, limit=None): "specifying a limit for fillna has not been implemented yet" ) - codes = self._codes - - # pad / bfill if method is not None: + # pad / bfill # TODO: dispatch when self.categories is EA-dtype values = np.asarray(self).reshape(-1, len(self)) @@ -1651,40 +1649,25 @@ def fillna(self, value=None, method=None, limit=None): codes = _get_codes_for_values(values, self.categories) 
else: + # We copy even if there is nothing to fill + codes = self._ndarray.copy() + mask = self.isna() - # If value is a dict or a Series (a dict value has already - # been converted to a Series) - if isinstance(value, (np.ndarray, Categorical, ABCSeries)): + if isinstance(value, (np.ndarray, Categorical)): # We get ndarray or Categorical if called via Series.fillna, # where it will unwrap another aligned Series before getting here - mask = ~algorithms.isin(value, self.categories) - if not isna(value[mask]).all(): + not_categories = ~algorithms.isin(value, self.categories) + if not isna(value[not_categories]).all(): + # All entries in `value` must either be a category or NA raise ValueError("fill value must be in categories") values_codes = _get_codes_for_values(value, self.categories) - indexer = np.where(codes == -1) - codes = codes.copy() - codes[indexer] = values_codes[indexer] - - # If value is not a dict or Series it should be a scalar - elif is_hashable(value): - if not isna(value) and value not in self.categories: - raise ValueError("fill value must be in categories") - - mask = codes == -1 - if mask.any(): - codes = codes.copy() - if isna(value): - codes[mask] = -1 - else: - codes[mask] = self._unbox_scalar(value) + codes[mask] = values_codes[mask] else: - raise TypeError( - f"'value' parameter must be a scalar, dict " - f"or Series, but you passed a {type(value).__name__}" - ) + new_code = self._validate_fill_value(value) + codes[mask] = new_code return self._from_backing_data(codes) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index b4f91590e09d1..5d3f8e3a2f7c1 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -362,7 +362,8 @@ def test_na_actions_categorical(self): res = df.fillna(value={"cats": 3, "vals": "b"}) tm.assert_frame_equal(res, df_exp_fill) - with pytest.raises(ValueError, match=("fill value must be in categories")): + msg = "'fill_value=4' is not present in this 
Categorical's categories" + with pytest.raises(ValueError, match=msg): df.fillna(value={"cats": 4, "vals": "c"}) res = df.fillna(method="pad") diff --git a/pandas/tests/indexes/categorical/test_fillna.py b/pandas/tests/indexes/categorical/test_fillna.py index 0d878249d3800..f6a6747166011 100644 --- a/pandas/tests/indexes/categorical/test_fillna.py +++ b/pandas/tests/indexes/categorical/test_fillna.py @@ -14,6 +14,32 @@ def test_fillna_categorical(self): tm.assert_index_equal(idx.fillna(1.0), exp) # fill by value not in categories raises ValueError - msg = "fill value must be in categories" + msg = "'fill_value=2.0' is not present in this Categorical's categories" with pytest.raises(ValueError, match=msg): idx.fillna(2.0) + + def test_fillna_copies_with_no_nas(self): + # Nothing to fill, should still get a copy + ci = CategoricalIndex([0, 1, 1]) + cat = ci._data + result = ci.fillna(0) + assert result._values._ndarray is not cat._ndarray + assert result._values._ndarray.base is None + + # Same check directly on the Categorical object + result = cat.fillna(0) + assert result._ndarray is not cat._ndarray + assert result._ndarray.base is None + + def test_fillna_validates_with_no_nas(self): + # We validate the fill value even if fillna is a no-op + ci = CategoricalIndex([2, 3, 3]) + cat = ci._data + + msg = "'fill_value=False' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): + ci.fillna(False) + + # Same check directly on the Categorical + with pytest.raises(ValueError, match=msg): + cat.fillna(False) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 80b8271e16e7a..b6a6f4e8200d4 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -125,7 +125,8 @@ def test_fillna_categorical_raises(self): data = ["a", np.nan, "b", np.nan, np.nan] ser = Series(Categorical(data, categories=["a", "b"])) - with 
pytest.raises(ValueError, match="fill value must be in categories"): + msg = "'fill_value=d' is not present in this Categorical's categories" + with pytest.raises(ValueError, match=msg): ser.fillna("d") with pytest.raises(ValueError, match="fill value must be in categories"): From 9d6010248a8661cb981688c2c8fcc45a9f39f9de Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Wed, 23 Sep 2020 05:34:29 +0700 Subject: [PATCH 0880/1025] REF: test_to_latex (#36528) --- pandas/tests/io/formats/test_to_latex.py | 1619 ++++++++++++---------- 1 file changed, 903 insertions(+), 716 deletions(-) diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 8df8796d236a5..7a0d305758802 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -1,5 +1,6 @@ import codecs from datetime import datetime +from textwrap import dedent import pytest @@ -16,14 +17,82 @@ ) +def _dedent(string): + """Dedent without new line in the beginning. + + Built-in textwrap.dedent would keep new line character in the beginning + of multi-line string starting from the new line. + This version drops the leading new line character. 
+ """ + return dedent(string).lstrip() + + class TestToLatex: - def test_to_latex_filename(self, float_frame): + @pytest.fixture + def df_short(self): + """Short dataframe for testing table/tabular/longtable LaTeX env.""" + return DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + + @pytest.fixture + def caption_table(self): + """Caption for table/tabular LaTeX environment.""" + return "a table in a \\texttt{table/tabular} environment" + + @pytest.fixture + def label_table(self): + """Label for table/tabular LaTeX environment.""" + return "tab:table_tabular" + + @pytest.fixture + def caption_longtable(self): + """Caption for longtable LaTeX environment.""" + return "a table in a \\texttt{longtable} environment" + + @pytest.fixture + def label_longtable(self): + """Label for longtable LaTeX environment.""" + return "tab:longtable" + + @pytest.fixture + def multiindex_frame(self): + """Multiindex dataframe for testing multirow LaTeX macros.""" + yield DataFrame.from_dict( + { + ("c1", 0): pd.Series({x: x for x in range(4)}), + ("c1", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c2", 0): pd.Series({x: x for x in range(4)}), + ("c2", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c3", 0): pd.Series({x: x for x in range(4)}), + } + ).T + + @pytest.fixture + def multicolumn_frame(self): + """Multicolumn dataframe for testing multicolumn LaTeX macros.""" + yield pd.DataFrame( + { + ("c1", 0): {x: x for x in range(5)}, + ("c1", 1): {x: x + 5 for x in range(5)}, + ("c2", 0): {x: x for x in range(5)}, + ("c2", 1): {x: x + 5 for x in range(5)}, + ("c3", 0): {x: x for x in range(5)}, + } + ) + + @pytest.fixture + def df_with_symbols(self): + """Dataframe with special characters for testing chars escaping.""" + a = "a" + b = "b" + yield DataFrame({"co$e^x$": {a: "a", b: "b"}, "co^l1": {a: "a", b: "b"}}) + + def test_to_latex_to_file(self, float_frame): with tm.ensure_clean("test.tex") as path: float_frame.to_latex(path) - with open(path) as f: assert float_frame.to_latex() == 
f.read() + def test_to_latex_to_file_utf8_with_encoding(self): # test with utf-8 and encoding option (GH 7061) df = DataFrame([["au\xdfgangen"]]) with tm.ensure_clean("test.tex") as path: @@ -31,42 +100,47 @@ def test_to_latex_filename(self, float_frame): with codecs.open(path, "r", encoding="utf-8") as f: assert df.to_latex() == f.read() + def test_to_latex_to_file_utf8_without_encoding(self): # test with utf-8 without encoding option + df = DataFrame([["au\xdfgangen"]]) with tm.ensure_clean("test.tex") as path: df.to_latex(path) with codecs.open(path, "r", encoding="utf-8") as f: assert df.to_latex() == f.read() - def test_to_latex(self, float_frame): - # it works! - float_frame.to_latex() - + def test_to_latex_tabular_with_index(self): df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - withindex_result = df.to_latex() - withindex_expected = r"""\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -""" - - assert withindex_result == withindex_expected - - withoutindex_result = df.to_latex(index=False) - withoutindex_expected = r"""\begin{tabular}{rl} -\toprule - a & b \\ -\midrule - 1 & b1 \\ - 2 & b2 \\ -\bottomrule -\end{tabular} -""" + result = df.to_latex() + expected = _dedent( + r""" + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected - assert withoutindex_result == withoutindex_expected + def test_to_latex_tabular_without_index(self): + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(index=False) + expected = _dedent( + r""" + \begin{tabular}{rl} + \toprule + a & b \\ + \midrule + 1 & b1 \\ + 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected @pytest.mark.parametrize( "bad_column_format", @@ -78,45 +152,55 @@ def test_to_latex_bad_column_format(self, bad_column_format): with pytest.raises(ValueError, match=msg): 
df.to_latex(column_format=bad_column_format) - def test_to_latex_format(self, float_frame): + def test_to_latex_column_format(self, float_frame): # GH Bug #9402 - float_frame.to_latex(column_format="ccc") + float_frame.to_latex(column_format="lcr") df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - withindex_result = df.to_latex(column_format="ccc") - withindex_expected = r"""\begin{tabular}{ccc} -\toprule -{} & a & b \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -""" - - assert withindex_result == withindex_expected + result = df.to_latex(column_format="lcr") + expected = _dedent( + r""" + \begin{tabular}{lcr} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected - def test_to_latex_empty(self): + def test_to_latex_empty_tabular(self): df = DataFrame() result = df.to_latex() - expected = r"""\begin{tabular}{l} -\toprule -Empty DataFrame -Columns: Index([], dtype='object') -Index: Index([], dtype='object') \\ -\bottomrule -\end{tabular} -""" + expected = _dedent( + r""" + \begin{tabular}{l} + \toprule + Empty DataFrame + Columns: Index([], dtype='object') + Index: Index([], dtype='object') \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected + def test_to_latex_empty_longtable(self): + df = DataFrame() result = df.to_latex(longtable=True) - expected = r"""\begin{longtable}{l} -\toprule -Empty DataFrame -Columns: Index([], dtype='object') -Index: Index([], dtype='object') \\ -\end{longtable} -""" + expected = _dedent( + r""" + \begin{longtable}{l} + \toprule + Empty DataFrame + Columns: Index([], dtype='object') + Index: Index([], dtype='object') \\ + \end{longtable} + """ + ) assert result == expected def test_to_latex_with_formatters(self): @@ -142,119 +226,134 @@ def test_to_latex_with_formatters(self): } result = df.to_latex(formatters=dict(formatters)) - expected = r"""\begin{tabular}{llrrl} -\toprule -{} & datetime64 & float & int & 
object \\ -\midrule -index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ -index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ -index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ -\bottomrule -\end{tabular} -""" + expected = _dedent( + r""" + \begin{tabular}{llrrl} + \toprule + {} & datetime64 & float & int & object \\ + \midrule + index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ + index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ + index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected - def test_to_latex_multiindex(self): + def test_to_latex_multiindex_column_tabular(self): df = DataFrame({("x", "y"): ["a"]}) result = df.to_latex() - expected = r"""\begin{tabular}{ll} -\toprule -{} & x \\ -{} & y \\ -\midrule -0 & a \\ -\bottomrule -\end{tabular} -""" - + expected = _dedent( + r""" + \begin{tabular}{ll} + \toprule + {} & x \\ + {} & y \\ + \midrule + 0 & a \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected + def test_to_latex_multiindex_small_tabular(self): + df = DataFrame({("x", "y"): ["a"]}) result = df.T.to_latex() - expected = r"""\begin{tabular}{lll} -\toprule - & & 0 \\ -\midrule -x & y & a \\ -\bottomrule -\end{tabular} -""" - + expected = _dedent( + r""" + \begin{tabular}{lll} + \toprule + & & 0 \\ + \midrule + x & y & a \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected - df = DataFrame.from_dict( - { - ("c1", 0): pd.Series({x: x for x in range(4)}), - ("c1", 1): pd.Series({x: x + 4 for x in range(4)}), - ("c2", 0): pd.Series({x: x for x in range(4)}), - ("c2", 1): pd.Series({x: x + 4 for x in range(4)}), - ("c3", 0): pd.Series({x: x for x in range(4)}), - } - ).T - result = df.to_latex() - expected = r"""\begin{tabular}{llrrrr} -\toprule - & & 0 & 1 & 2 & 3 \\ -\midrule -c1 & 0 & 0 & 1 & 2 & 3 \\ - & 1 & 4 & 5 & 6 & 7 \\ -c2 & 0 & 0 & 1 & 2 & 3 \\ - & 1 & 4 & 5 & 6 & 7 \\ -c3 & 0 & 0 & 1 & 2 & 3 \\ -\bottomrule -\end{tabular} -""" - + def test_to_latex_multiindex_tabular(self, 
multiindex_frame): + result = multiindex_frame.to_latex() + expected = _dedent( + r""" + \begin{tabular}{llrrrr} + \toprule + & & 0 & 1 & 2 & 3 \\ + \midrule + c1 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ + c2 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ + c3 & 0 & 0 & 1 & 2 & 3 \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected + def test_to_latex_multicolumn_tabular(self, multiindex_frame): # GH 14184 - df = df.T + df = multiindex_frame.T df.columns.names = ["a", "b"] result = df.to_latex() - expected = r"""\begin{tabular}{lrrrrr} -\toprule -a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ -b & 0 & 1 & 0 & 1 & 0 \\ -\midrule -0 & 0 & 4 & 0 & 4 & 0 \\ -1 & 1 & 5 & 1 & 5 & 1 \\ -2 & 2 & 6 & 2 & 6 & 2 \\ -3 & 3 & 7 & 3 & 7 & 3 \\ -\bottomrule -\end{tabular} -""" + expected = _dedent( + r""" + \begin{tabular}{lrrrrr} + \toprule + a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ + b & 0 & 1 & 0 & 1 & 0 \\ + \midrule + 0 & 0 & 4 & 0 & 4 & 0 \\ + 1 & 1 & 5 & 1 & 5 & 1 \\ + 2 & 2 & 6 & 2 & 6 & 2 \\ + 3 & 3 & 7 & 3 & 7 & 3 \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected + def test_to_latex_index_has_name_tabular(self): # GH 10660 df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) result = df.set_index(["a", "b"]).to_latex() - expected = r"""\begin{tabular}{llr} -\toprule - & & c \\ -a & b & \\ -\midrule -0 & a & 1 \\ - & b & 2 \\ -1 & a & 3 \\ - & b & 4 \\ -\bottomrule -\end{tabular} -""" - + expected = _dedent( + r""" + \begin{tabular}{llr} + \toprule + & & c \\ + a & b & \\ + \midrule + 0 & a & 1 \\ + & b & 2 \\ + 1 & a & 3 \\ + & b & 4 \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected + def test_to_latex_groupby_tabular(self): + # GH 10660 + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) result = df.groupby("a").describe().to_latex() - expected = r"""\begin{tabular}{lrrrrrrrr} -\toprule -{} & \multicolumn{8}{l}{c} \\ -{} & 
count & mean & std & min & 25\% & 50\% & 75\% & max \\ -a & & & & & & & & \\ -\midrule -0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\ -1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\ -\bottomrule -\end{tabular} -""" - + expected = _dedent( + r""" + \begin{tabular}{lrrrrrrrr} + \toprule + {} & \multicolumn{8}{l}{c} \\ + {} & count & mean & std & min & 25\% & 50\% & 75\% & max \\ + a & & & & & & & & \\ + \midrule + 0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\ + 1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected def test_to_latex_multiindex_dupe_level(self): @@ -269,568 +368,635 @@ def test_to_latex_multiindex_dupe_level(self): index=pd.MultiIndex.from_tuples([("A", "c"), ("B", "c")]), columns=["col"] ) result = df.to_latex() - expected = r"""\begin{tabular}{lll} -\toprule - & & col \\ -\midrule -A & c & NaN \\ -B & c & NaN \\ -\bottomrule -\end{tabular} -""" + expected = _dedent( + r""" + \begin{tabular}{lll} + \toprule + & & col \\ + \midrule + A & c & NaN \\ + B & c & NaN \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected - def test_to_latex_multicolumnrow(self): - df = pd.DataFrame( - { - ("c1", 0): {x: x for x in range(5)}, - ("c1", 1): {x: x + 5 for x in range(5)}, - ("c2", 0): {x: x for x in range(5)}, - ("c2", 1): {x: x + 5 for x in range(5)}, - ("c3", 0): {x: x for x in range(5)}, - } + def test_to_latex_multicolumn_default(self, multicolumn_frame): + result = multicolumn_frame.to_latex() + expected = _dedent( + r""" + \begin{tabular}{lrrrrr} + \toprule + {} & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ + {} & 0 & 1 & 0 & 1 & 0 \\ + \midrule + 0 & 0 & 5 & 0 & 5 & 0 \\ + 1 & 1 & 6 & 1 & 6 & 1 \\ + 2 & 2 & 7 & 2 & 7 & 2 \\ + 3 & 3 & 8 & 3 & 8 & 3 \\ + 4 & 4 & 9 & 4 & 9 & 4 \\ + \bottomrule + \end{tabular} + """ ) - result = df.to_latex() - expected = r"""\begin{tabular}{lrrrrr} -\toprule -{} & \multicolumn{2}{l}{c1} & 
\multicolumn{2}{l}{c2} & c3 \\ -{} & 0 & 1 & 0 & 1 & 0 \\ -\midrule -0 & 0 & 5 & 0 & 5 & 0 \\ -1 & 1 & 6 & 1 & 6 & 1 \\ -2 & 2 & 7 & 2 & 7 & 2 \\ -3 & 3 & 8 & 3 & 8 & 3 \\ -4 & 4 & 9 & 4 & 9 & 4 \\ -\bottomrule -\end{tabular} -""" assert result == expected - result = df.to_latex(multicolumn=False) - expected = r"""\begin{tabular}{lrrrrr} -\toprule -{} & c1 & & c2 & & c3 \\ -{} & 0 & 1 & 0 & 1 & 0 \\ -\midrule -0 & 0 & 5 & 0 & 5 & 0 \\ -1 & 1 & 6 & 1 & 6 & 1 \\ -2 & 2 & 7 & 2 & 7 & 2 \\ -3 & 3 & 8 & 3 & 8 & 3 \\ -4 & 4 & 9 & 4 & 9 & 4 \\ -\bottomrule -\end{tabular} -""" + def test_to_latex_multicolumn_false(self, multicolumn_frame): + result = multicolumn_frame.to_latex(multicolumn=False) + expected = _dedent( + r""" + \begin{tabular}{lrrrrr} + \toprule + {} & c1 & & c2 & & c3 \\ + {} & 0 & 1 & 0 & 1 & 0 \\ + \midrule + 0 & 0 & 5 & 0 & 5 & 0 \\ + 1 & 1 & 6 & 1 & 6 & 1 \\ + 2 & 2 & 7 & 2 & 7 & 2 \\ + 3 & 3 & 8 & 3 & 8 & 3 \\ + 4 & 4 & 9 & 4 & 9 & 4 \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected - result = df.T.to_latex(multirow=True) - expected = r"""\begin{tabular}{llrrrrr} -\toprule - & & 0 & 1 & 2 & 3 & 4 \\ -\midrule -\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ - & 1 & 5 & 6 & 7 & 8 & 9 \\ -\cline{1-7} -\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ - & 1 & 5 & 6 & 7 & 8 & 9 \\ -\cline{1-7} -c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ -\bottomrule -\end{tabular} -""" + def test_to_latex_multirow_true(self, multicolumn_frame): + result = multicolumn_frame.T.to_latex(multirow=True) + expected = _dedent( + r""" + \begin{tabular}{llrrrrr} + \toprule + & & 0 & 1 & 2 & 3 & 4 \\ + \midrule + \multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ + \cline{1-7} + \multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ + \cline{1-7} + c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected - df.index = df.T.index - result = df.T.to_latex(multirow=True, multicolumn=True, 
multicolumn_format="c") - expected = r"""\begin{tabular}{llrrrrr} -\toprule - & & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\ - & & 0 & 1 & 0 & 1 & 0 \\ -\midrule -\multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ - & 1 & 5 & 6 & 7 & 8 & 9 \\ -\cline{1-7} -\multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ - & 1 & 5 & 6 & 7 & 8 & 9 \\ -\cline{1-7} -c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ -\bottomrule -\end{tabular} -""" + def test_to_latex_multicolumnrow_with_multicol_format(self, multicolumn_frame): + multicolumn_frame.index = multicolumn_frame.T.index + result = multicolumn_frame.T.to_latex( + multirow=True, + multicolumn=True, + multicolumn_format="c", + ) + expected = _dedent( + r""" + \begin{tabular}{llrrrrr} + \toprule + & & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\ + & & 0 & 1 & 0 & 1 & 0 \\ + \midrule + \multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ + \cline{1-7} + \multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ + \cline{1-7} + c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ + \bottomrule + \end{tabular} + """ + ) assert result == expected - def test_to_latex_escape(self): - a = "a" - b = "b" - - test_dict = {"co$e^x$": {a: "a", b: "b"}, "co^l1": {a: "a", b: "b"}} - - unescaped_result = DataFrame(test_dict).to_latex(escape=False) - escaped_result = DataFrame(test_dict).to_latex() # default: escape=True - - unescaped_expected = r"""\begin{tabular}{lll} -\toprule -{} & co$e^x$ & co^l1 \\ -\midrule -a & a & a \\ -b & b & b \\ -\bottomrule -\end{tabular} -""" - - escaped_expected = r"""\begin{tabular}{lll} -\toprule -{} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\ -\midrule -a & a & a \\ -b & b & b \\ -\bottomrule -\end{tabular} -""" + def test_to_latex_escape_false(self, df_with_symbols): + result = df_with_symbols.to_latex(escape=False) + expected = _dedent( + r""" + \begin{tabular}{lll} + \toprule + {} & co$e^x$ & co^l1 \\ + \midrule + a & a & a \\ + b & b & b \\ + \bottomrule + \end{tabular} + """ + 
) + assert result == expected - assert unescaped_result == unescaped_expected - assert escaped_result == escaped_expected + def test_to_latex_escape_default(self, df_with_symbols): + result = df_with_symbols.to_latex() # default: escape=True + expected = _dedent( + r""" + \begin{tabular}{lll} + \toprule + {} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\ + \midrule + a & a & a \\ + b & b & b \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected def test_to_latex_special_escape(self): df = DataFrame([r"a\b\c", r"^a^b^c", r"~a~b~c"]) + result = df.to_latex() + expected = _dedent( + r""" + \begin{tabular}{ll} + \toprule + {} & 0 \\ + \midrule + 0 & a\textbackslash b\textbackslash c \\ + 1 & \textasciicircum a\textasciicircum b\textasciicircum c \\ + 2 & \textasciitilde a\textasciitilde b\textasciitilde c \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected - escaped_result = df.to_latex() - escaped_expected = r"""\begin{tabular}{ll} -\toprule -{} & 0 \\ -\midrule -0 & a\textbackslash b\textbackslash c \\ -1 & \textasciicircum a\textasciicircum b\textasciicircum c \\ -2 & \textasciitilde a\textasciitilde b\textasciitilde c \\ -\bottomrule -\end{tabular} -""" - assert escaped_result == escaped_expected - - def test_to_latex_longtable(self): - + def test_to_latex_longtable_with_index(self): df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - withindex_result = df.to_latex(longtable=True) - withindex_expected = r"""\begin{longtable}{lrl} -\toprule -{} & a & b \\ -\midrule -\endfirsthead - -\toprule -{} & a & b \\ -\midrule -\endhead -\midrule -\multicolumn{3}{r}{{Continued on next page}} \\ -\midrule -\endfoot - -\bottomrule -\endlastfoot -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\end{longtable} -""" - assert withindex_result == withindex_expected - - withoutindex_result = df.to_latex(index=False, longtable=True) - withoutindex_expected = r"""\begin{longtable}{rl} -\toprule - a & b \\ -\midrule -\endfirsthead - -\toprule - a & b \\ -\midrule 
-\endhead -\midrule -\multicolumn{2}{r}{{Continued on next page}} \\ -\midrule -\endfoot - -\bottomrule -\endlastfoot - 1 & b1 \\ - 2 & b2 \\ -\end{longtable} -""" - - assert withoutindex_result == withoutindex_expected - - df = DataFrame({"a": [1, 2]}) - with1column_result = df.to_latex(index=False, longtable=True) - assert r"\multicolumn{1}" in with1column_result - - df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}) - with3columns_result = df.to_latex(index=False, longtable=True) - assert r"\multicolumn{3}" in with3columns_result - - def test_to_latex_caption_label(self): - # GH 25436 - the_caption = "a table in a \\texttt{table/tabular} environment" - the_label = "tab:table_tabular" + result = df.to_latex(longtable=True) + expected = _dedent( + r""" + \begin{longtable}{lrl} + \toprule + {} & a & b \\ + \midrule + \endfirsthead + + \toprule + {} & a & b \\ + \midrule + \endhead + \midrule + \multicolumn{3}{r}{{Continued on next page}} \\ + \midrule + \endfoot + + \bottomrule + \endlastfoot + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \end{longtable} + """ + ) + assert result == expected + def test_to_latex_longtable_without_index(self): df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(index=False, longtable=True) + expected = _dedent( + r""" + \begin{longtable}{rl} + \toprule + a & b \\ + \midrule + \endfirsthead + + \toprule + a & b \\ + \midrule + \endhead + \midrule + \multicolumn{2}{r}{{Continued on next page}} \\ + \midrule + \endfoot + + \bottomrule + \endlastfoot + 1 & b1 \\ + 2 & b2 \\ + \end{longtable} + """ + ) + assert result == expected - # test when only the caption is provided - result_c = df.to_latex(caption=the_caption) - - expected_c = r"""\begin{table} -\centering -\caption{a table in a \texttt{table/tabular} environment} -\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -\end{table} -""" - assert result_c == expected_c - - # test when only the label is provided - 
result_l = df.to_latex(label=the_label) - - expected_l = r"""\begin{table} -\centering -\label{tab:table_tabular} -\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -\end{table} -""" - assert result_l == expected_l - - # test when the caption and the label are provided - result_cl = df.to_latex(caption=the_caption, label=the_label) + @pytest.mark.parametrize( + "df, expected_number", + [ + (DataFrame({"a": [1, 2]}), 1), + (DataFrame({"a": [1, 2], "b": [3, 4]}), 2), + (DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}), 3), + ], + ) + def test_to_latex_longtable_continued_on_next_page(self, df, expected_number): + result = df.to_latex(index=False, longtable=True) + assert fr"\multicolumn{{{expected_number}}}" in result - expected_cl = r"""\begin{table} -\centering -\caption{a table in a \texttt{table/tabular} environment} -\label{tab:table_tabular} -\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -\end{table} -""" - assert result_cl == expected_cl + def test_to_latex_caption_only(self, df_short, caption_table): + # GH 25436 + result = df_short.to_latex(caption=caption_table) + expected = _dedent( + r""" + \begin{table} + \centering + \caption{a table in a \texttt{table/tabular} environment} + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + \end{table} + """ + ) + assert result == expected - def test_to_latex_longtable_caption_label(self): + def test_to_latex_label_only(self, df_short, label_table): # GH 25436 - the_caption = "a table in a \\texttt{longtable} environment" - the_label = "tab:longtable" + result = df_short.to_latex(label=label_table) + expected = _dedent( + r""" + \begin{table} + \centering + \label{tab:table_tabular} + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + 
\end{table} + """ + ) + assert result == expected - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + def test_to_latex_caption_and_label(self, df_short, caption_table, label_table): + # GH 25436 + result = df_short.to_latex(caption=caption_table, label=label_table) + expected = _dedent( + r""" + \begin{table} + \centering + \caption{a table in a \texttt{table/tabular} environment} + \label{tab:table_tabular} + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + \end{table} + """ + ) + assert result == expected + def test_to_latex_longtable_caption_only(self, df_short, caption_longtable): + # GH 25436 # test when no caption and no label is provided # is performed by test_to_latex_longtable() + result = df_short.to_latex(longtable=True, caption=caption_longtable) + expected = _dedent( + r""" + \begin{longtable}{lrl} + \caption{a table in a \texttt{longtable} environment}\\ + \toprule + {} & a & b \\ + \midrule + \endfirsthead + \caption[]{a table in a \texttt{longtable} environment} \\ + \toprule + {} & a & b \\ + \midrule + \endhead + \midrule + \multicolumn{3}{r}{{Continued on next page}} \\ + \midrule + \endfoot + + \bottomrule + \endlastfoot + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \end{longtable} + """ + ) + assert result == expected - # test when only the caption is provided - result_c = df.to_latex(longtable=True, caption=the_caption) - - expected_c = r"""\begin{longtable}{lrl} -\caption{a table in a \texttt{longtable} environment}\\ -\toprule -{} & a & b \\ -\midrule -\endfirsthead -\caption[]{a table in a \texttt{longtable} environment} \\ -\toprule -{} & a & b \\ -\midrule -\endhead -\midrule -\multicolumn{3}{r}{{Continued on next page}} \\ -\midrule -\endfoot - -\bottomrule -\endlastfoot -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\end{longtable} -""" - assert result_c == expected_c - - # test when only the label is provided - result_l = df.to_latex(longtable=True, label=the_label) - - expected_l = 
r"""\begin{longtable}{lrl} -\label{tab:longtable}\\ -\toprule -{} & a & b \\ -\midrule -\endfirsthead - -\toprule -{} & a & b \\ -\midrule -\endhead -\midrule -\multicolumn{3}{r}{{Continued on next page}} \\ -\midrule -\endfoot - -\bottomrule -\endlastfoot -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\end{longtable} -""" - assert result_l == expected_l - - # test when the caption and the label are provided - result_cl = df.to_latex(longtable=True, caption=the_caption, label=the_label) - - expected_cl = r"""\begin{longtable}{lrl} -\caption{a table in a \texttt{longtable} environment} -\label{tab:longtable}\\ -\toprule -{} & a & b \\ -\midrule -\endfirsthead -\caption[]{a table in a \texttt{longtable} environment} \\ -\toprule -{} & a & b \\ -\midrule -\endhead -\midrule -\multicolumn{3}{r}{{Continued on next page}} \\ -\midrule -\endfoot + def test_to_latex_longtable_label_only(self, df_short, label_longtable): + # GH 25436 + result = df_short.to_latex(longtable=True, label=label_longtable) + expected = _dedent( + r""" + \begin{longtable}{lrl} + \label{tab:longtable}\\ + \toprule + {} & a & b \\ + \midrule + \endfirsthead + + \toprule + {} & a & b \\ + \midrule + \endhead + \midrule + \multicolumn{3}{r}{{Continued on next page}} \\ + \midrule + \endfoot + + \bottomrule + \endlastfoot + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \end{longtable} + """ + ) + assert result == expected -\bottomrule -\endlastfoot -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\end{longtable} -""" - assert result_cl == expected_cl + def test_to_latex_longtable_caption_and_label( + self, + df_short, + caption_longtable, + label_longtable, + ): + # GH 25436 + result = df_short.to_latex( + longtable=True, + caption=caption_longtable, + label=label_longtable, + ) + expected = _dedent( + r""" + \begin{longtable}{lrl} + \caption{a table in a \texttt{longtable} environment} + \label{tab:longtable}\\ + \toprule + {} & a & b \\ + \midrule + \endfirsthead + \caption[]{a table in a \texttt{longtable} environment} \\ + \toprule + {} & a & b 
\\ + \midrule + \endhead + \midrule + \multicolumn{3}{r}{{Continued on next page}} \\ + \midrule + \endfoot + + \bottomrule + \endlastfoot + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \end{longtable} + """ + ) + assert result == expected def test_to_latex_position(self): the_position = "h" - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - - # test when only the position is provided - result_p = df.to_latex(position=the_position) - - expected_p = r"""\begin{table}[h] -\centering -\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -\end{table} -""" - assert result_p == expected_p + result = df.to_latex(position=the_position) + expected = _dedent( + r""" + \begin{table}[h] + \centering + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + \end{table} + """ + ) + assert result == expected def test_to_latex_longtable_position(self): the_position = "t" - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - - # test when only the position is provided - result_p = df.to_latex(longtable=True, position=the_position) - - expected_p = r"""\begin{longtable}[t]{lrl} -\toprule -{} & a & b \\ -\midrule -\endfirsthead - -\toprule -{} & a & b \\ -\midrule -\endhead -\midrule -\multicolumn{3}{r}{{Continued on next page}} \\ -\midrule -\endfoot - -\bottomrule -\endlastfoot -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\end{longtable} -""" - assert result_p == expected_p + result = df.to_latex(longtable=True, position=the_position) + expected = _dedent( + r""" + \begin{longtable}[t]{lrl} + \toprule + {} & a & b \\ + \midrule + \endfirsthead + + \toprule + {} & a & b \\ + \midrule + \endhead + \midrule + \multicolumn{3}{r}{{Continued on next page}} \\ + \midrule + \endfoot + + \bottomrule + \endlastfoot + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \end{longtable} + """ + ) + assert result == expected def test_to_latex_escape_special_chars(self): special_characters = ["&", "%", "$", "#", 
"_", "{", "}", "~", "^", "\\"] df = DataFrame(data=special_characters) - observed = df.to_latex() - expected = r"""\begin{tabular}{ll} -\toprule -{} & 0 \\ -\midrule -0 & \& \\ -1 & \% \\ -2 & \$ \\ -3 & \# \\ -4 & \_ \\ -5 & \{ \\ -6 & \} \\ -7 & \textasciitilde \\ -8 & \textasciicircum \\ -9 & \textbackslash \\ -\bottomrule -\end{tabular} -""" - - assert observed == expected + result = df.to_latex() + expected = _dedent( + r""" + \begin{tabular}{ll} + \toprule + {} & 0 \\ + \midrule + 0 & \& \\ + 1 & \% \\ + 2 & \$ \\ + 3 & \# \\ + 4 & \_ \\ + 5 & \{ \\ + 6 & \} \\ + 7 & \textasciitilde \\ + 8 & \textasciicircum \\ + 9 & \textbackslash \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected - def test_to_latex_no_header(self): + def test_to_latex_no_header_with_index(self): # GH 7124 df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - withindex_result = df.to_latex(header=False) - withindex_expected = r"""\begin{tabular}{lrl} -\toprule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -""" - - assert withindex_result == withindex_expected - - withoutindex_result = df.to_latex(index=False, header=False) - withoutindex_expected = r"""\begin{tabular}{rl} -\toprule -1 & b1 \\ -2 & b2 \\ -\bottomrule -\end{tabular} -""" - - assert withoutindex_result == withoutindex_expected + result = df.to_latex(header=False) + expected = _dedent( + r""" + \begin{tabular}{lrl} + \toprule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected - def test_to_latex_specified_header(self): + def test_to_latex_no_header_without_index(self): # GH 7124 df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - withindex_result = df.to_latex(header=["AA", "BB"]) - withindex_expected = r"""\begin{tabular}{lrl} -\toprule -{} & AA & BB \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -""" - - assert withindex_result == withindex_expected - - withoutindex_result = df.to_latex(header=["AA", "BB"], index=False) - 
withoutindex_expected = r"""\begin{tabular}{rl} -\toprule -AA & BB \\ -\midrule - 1 & b1 \\ - 2 & b2 \\ -\bottomrule -\end{tabular} -""" + result = df.to_latex(index=False, header=False) + expected = _dedent( + r""" + \begin{tabular}{rl} + \toprule + 1 & b1 \\ + 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected - assert withoutindex_result == withoutindex_expected + def test_to_latex_specified_header_with_index(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(header=["AA", "BB"]) + expected = _dedent( + r""" + \begin{tabular}{lrl} + \toprule + {} & AA & BB \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected - withoutescape_result = df.to_latex(header=["$A$", "$B$"], escape=False) - withoutescape_expected = r"""\begin{tabular}{lrl} -\toprule -{} & $A$ & $B$ \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule -\end{tabular} -""" + def test_to_latex_specified_header_without_index(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(header=["AA", "BB"], index=False) + expected = _dedent( + r""" + \begin{tabular}{rl} + \toprule + AA & BB \\ + \midrule + 1 & b1 \\ + 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected - assert withoutescape_result == withoutescape_expected + def test_to_latex_specified_header_special_chars_without_escape(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(header=["$A$", "$B$"], escape=False) + expected = _dedent( + r""" + \begin{tabular}{lrl} + \toprule + {} & $A$ & $B$ \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected + def test_to_latex_number_of_items_in_header_missmatch_raises(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) msg = "Writing 2 cols but got 1 aliases" with pytest.raises(ValueError, match=msg): 
df.to_latex(header=["A"]) - def test_to_latex_decimal(self, float_frame): + def test_to_latex_decimal(self): # GH 12031 - float_frame.to_latex() - df = DataFrame({"a": [1.0, 2.1], "b": ["b1", "b2"]}) - withindex_result = df.to_latex(decimal=",") - - withindex_expected = r"""\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -0 & 1,0 & b1 \\ -1 & 2,1 & b2 \\ -\bottomrule -\end{tabular} -""" - - assert withindex_result == withindex_expected + result = df.to_latex(decimal=",") + expected = _dedent( + r""" + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1,0 & b1 \\ + 1 & 2,1 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected def test_to_latex_series(self): s = Series(["a", "b", "c"]) - withindex_result = s.to_latex() - withindex_expected = r"""\begin{tabular}{ll} -\toprule -{} & 0 \\ -\midrule -0 & a \\ -1 & b \\ -2 & c \\ -\bottomrule -\end{tabular} -""" - assert withindex_result == withindex_expected + result = s.to_latex() + expected = _dedent( + r""" + \begin{tabular}{ll} + \toprule + {} & 0 \\ + \midrule + 0 & a \\ + 1 & b \\ + 2 & c \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected def test_to_latex_bold_rows(self): # GH 16707 df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - observed = df.to_latex(bold_rows=True) - expected = r"""\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -\textbf{0} & 1 & b1 \\ -\textbf{1} & 2 & b2 \\ -\bottomrule -\end{tabular} -""" - assert observed == expected + result = df.to_latex(bold_rows=True) + expected = _dedent( + r""" + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + \textbf{0} & 1 & b1 \\ + \textbf{1} & 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected def test_to_latex_no_bold_rows(self): # GH 16707 df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - observed = df.to_latex(bold_rows=False) - expected = r"""\begin{tabular}{lrl} -\toprule -{} & a & b \\ -\midrule -0 & 1 & b1 \\ -1 & 2 & b2 \\ -\bottomrule 
-\end{tabular} -""" - assert observed == expected + result = df.to_latex(bold_rows=False) + expected = _dedent( + r""" + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected @pytest.mark.parametrize("name0", [None, "named0"]) @pytest.mark.parametrize("name1", [None, "named1"]) @@ -875,13 +1041,16 @@ def test_to_latex_multiindex_nans(self, one_row): if one_row: df = df.iloc[[0]] observed = df.set_index(["a", "b"]).to_latex() - expected = r"""\begin{tabular}{llr} -\toprule - & & c \\ -a & b & \\ -\midrule -NaN & 2 & 4 \\ -""" + expected = _dedent( + r""" + \begin{tabular}{llr} + \toprule + & & c \\ + a & b & \\ + \midrule + NaN & 2 & 4 \\ + """ + ) if not one_row: expected += r"""1.0 & 3 & 5 \\ """ @@ -893,93 +1062,111 @@ def test_to_latex_multiindex_nans(self, one_row): def test_to_latex_non_string_index(self): # GH 19981 observed = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]).to_latex() - expected = r"""\begin{tabular}{llr} -\toprule - & & 2 \\ -0 & 1 & \\ -\midrule -1 & 2 & 3 \\ - & 2 & 3 \\ -\bottomrule -\end{tabular} -""" + expected = _dedent( + r""" + \begin{tabular}{llr} + \toprule + & & 2 \\ + 0 & 1 & \\ + \midrule + 1 & 2 & 3 \\ + & 2 & 3 \\ + \bottomrule + \end{tabular} + """ + ) assert observed == expected def test_to_latex_midrule_location(self): # GH 18326 df = pd.DataFrame({"a": [1, 2]}) df.index.name = "foo" - observed = df.to_latex(index_names=False) - expected = r"""\begin{tabular}{lr} -\toprule -{} & a \\ -\midrule -0 & 1 \\ -1 & 2 \\ -\bottomrule -\end{tabular} -""" - - assert observed == expected + result = df.to_latex(index_names=False) + expected = _dedent( + r""" + \begin{tabular}{lr} + \toprule + {} & a \\ + \midrule + 0 & 1 \\ + 1 & 2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected def test_to_latex_multiindex_empty_name(self): # GH 18669 mi = pd.MultiIndex.from_product([[1, 2]], names=[""]) df = pd.DataFrame(-1, 
index=mi, columns=range(4)) observed = df.to_latex() - expected = r"""\begin{tabular}{lrrrr} -\toprule - & 0 & 1 & 2 & 3 \\ -{} & & & & \\ -\midrule -1 & -1 & -1 & -1 & -1 \\ -2 & -1 & -1 & -1 & -1 \\ -\bottomrule -\end{tabular} -""" + expected = _dedent( + r""" + \begin{tabular}{lrrrr} + \toprule + & 0 & 1 & 2 & 3 \\ + {} & & & & \\ + \midrule + 1 & -1 & -1 & -1 & -1 \\ + 2 & -1 & -1 & -1 & -1 \\ + \bottomrule + \end{tabular} + """ + ) assert observed == expected - def test_to_latex_float_format_no_fixed_width(self): - + def test_to_latex_float_format_no_fixed_width_3decimals(self): # GH 21625 df = DataFrame({"x": [0.19999]}) - expected = r"""\begin{tabular}{lr} -\toprule -{} & x \\ -\midrule -0 & 0.200 \\ -\bottomrule -\end{tabular} -""" - assert df.to_latex(float_format="%.3f") == expected + result = df.to_latex(float_format="%.3f") + expected = _dedent( + r""" + \begin{tabular}{lr} + \toprule + {} & x \\ + \midrule + 0 & 0.200 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected + def test_to_latex_float_format_no_fixed_width_integer(self): # GH 22270 df = DataFrame({"x": [100.0]}) - expected = r"""\begin{tabular}{lr} -\toprule -{} & x \\ -\midrule -0 & 100 \\ -\bottomrule -\end{tabular} -""" - assert df.to_latex(float_format="%.0f") == expected + result = df.to_latex(float_format="%.0f") + expected = _dedent( + r""" + \begin{tabular}{lr} + \toprule + {} & x \\ + \midrule + 0 & 100 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected def test_to_latex_multindex_header(self): # GH 16718 - df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}).set_index( - ["a", "b"] - ) + df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}) + df = df.set_index(["a", "b"]) observed = df.to_latex(header=["r1", "r2"]) - expected = r"""\begin{tabular}{llrr} -\toprule - & & r1 & r2 \\ -a & b & & \\ -\midrule -0 & 1 & 2 & 3 \\ -\bottomrule -\end{tabular} -""" + expected = _dedent( + r""" + \begin{tabular}{llrr} + \toprule + & & r1 & r2 
\\ + a & b & & \\ + \midrule + 0 & 1 & 2 & 3 \\ + \bottomrule + \end{tabular} + """ + ) assert observed == expected From 4b26d70ca7688488e78a5bd43957cf01f3ec6cd0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Tue, 22 Sep 2020 18:45:23 -0400 Subject: [PATCH 0881/1025] CLN: Break up wrap applied output (#36536) --- pandas/core/groupby/generic.py | 125 ++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 56 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b9cc2c19c224b..29f13107f750a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1210,64 +1210,77 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): self._insert_inaxis_grouper_inplace(result) return result else: - # this is to silence a DeprecationWarning - # TODO: Remove when default dtype of empty Series is object - kwargs = first_not_none._construct_axes_dict() - backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs) - values = [x if (x is not None) else backup for x in values] - - all_indexed_same = all_indexes_same(x.index for x in values) - - # GH3596 - # provide a reduction (Frame -> Series) if groups are - # unique - if self.squeeze: - applied_index = self._selected_obj._get_axis(self.axis) - singular_series = len(values) == 1 and applied_index.nlevels == 1 - - # assign the name to this series - if singular_series: - values[0].name = keys[0] - - # GH2893 - # we have series in the values array, we want to - # produce a series: - # if any of the sub-series are not indexed the same - # OR we don't have a multi-index and we have only a - # single values - return self._concat_objects( - keys, values, not_indexed_same=not_indexed_same - ) + # values are Series + return self._wrap_applied_output_series( + keys, values, not_indexed_same, first_not_none, key_index + ) - # still a series - # path added as of GH 5545 - elif 
all_indexed_same: - from pandas.core.reshape.concat import concat - - return concat(values) - - if not all_indexed_same: - # GH 8467 - return self._concat_objects(keys, values, not_indexed_same=True) - - # Combine values - # vstack+constructor is faster than concat and handles MI-columns - stacked_values = np.vstack([np.asarray(v) for v in values]) - - if self.axis == 0: - index = key_index - columns = first_not_none.index.copy() - if columns.name is None: - # GH6124 - propagate name of Series when it's consistent - names = {v.name for v in values} - if len(names) == 1: - columns.name = list(names)[0] - else: - index = first_not_none.index - columns = key_index - stacked_values = stacked_values.T + def _wrap_applied_output_series( + self, + keys, + values: List[Series], + not_indexed_same: bool, + first_not_none, + key_index, + ) -> FrameOrSeriesUnion: + # this is to silence a DeprecationWarning + # TODO: Remove when default dtype of empty Series is object + kwargs = first_not_none._construct_axes_dict() + backup = create_series_with_explicit_dtype(dtype_if_empty=object, **kwargs) + values = [x if (x is not None) else backup for x in values] + + all_indexed_same = all_indexes_same(x.index for x in values) + + # GH3596 + # provide a reduction (Frame -> Series) if groups are + # unique + if self.squeeze: + applied_index = self._selected_obj._get_axis(self.axis) + singular_series = len(values) == 1 and applied_index.nlevels == 1 + + # assign the name to this series + if singular_series: + values[0].name = keys[0] + + # GH2893 + # we have series in the values array, we want to + # produce a series: + # if any of the sub-series are not indexed the same + # OR we don't have a multi-index and we have only a + # single values + return self._concat_objects( + keys, values, not_indexed_same=not_indexed_same + ) + + # still a series + # path added as of GH 5545 + elif all_indexed_same: + from pandas.core.reshape.concat import concat + + return concat(values) + + if not 
all_indexed_same: + # GH 8467 + return self._concat_objects(keys, values, not_indexed_same=True) + + # Combine values + # vstack+constructor is faster than concat and handles MI-columns + stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = first_not_none.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] + else: + index = first_not_none.index + columns = key_index + stacked_values = stacked_values.T - result = self.obj._constructor(stacked_values, index=index, columns=columns) + result = self.obj._constructor(stacked_values, index=index, columns=columns) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here From 9bb5b5ea6ec3e92d49e9080ec62108b9b2268bf6 Mon Sep 17 00:00:00 2001 From: junk Date: Wed, 23 Sep 2020 08:21:37 +0900 Subject: [PATCH 0882/1025] TST: check inequality by comparing categorical with NaN ( #28384 ) (#36520) --- .../tests/arrays/categorical/test_missing.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 5309b8827e3f0..21bea9356dcf0 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -148,3 +148,24 @@ def test_use_inf_as_na_outside_context(self, values, expected): result = pd.isna(DataFrame(cat)) expected = DataFrame(expected) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "a1, a2, categories", + [ + (["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]), + ([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]), + ], + ) + def test_compare_categorical_with_missing(self, a1, a2, categories): + # GH 28384 + cat_type = CategoricalDtype(categories) + + # != + result = Series(a1, dtype=cat_type) 
!= Series(a2, dtype=cat_type) + expected = Series(a1) != Series(a2) + tm.assert_series_equal(result, expected) + + # == + result = Series(a1, dtype=cat_type) == Series(a2, dtype=cat_type) + expected = Series(a1) == Series(a2) + tm.assert_series_equal(result, expected) From 58bf76423b4bffd2106d9f28aedda2464bc75390 Mon Sep 17 00:00:00 2001 From: samilAyoub <61546990+samilAyoub@users.noreply.github.com> Date: Wed, 23 Sep 2020 00:24:01 +0100 Subject: [PATCH 0883/1025] =?UTF-8?q?add=20a=20test=20for=20loc=20method;?= =?UTF-8?q?=20check=20if=20a=20warning=20raise=20when=20replacing=20a=20?= =?UTF-8?q?=E2=80=A6=20(#36486)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pandas/tests/indexing/test_chaining_and_caching.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 66835c586e6c7..1254f1f217a2e 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -335,12 +335,14 @@ def test_setting_with_copy_bug(self): # this should not raise df2["y"] = ["g", "h", "i"] - def test_detect_chained_assignment_warnings(self): + def test_detect_chained_assignment_warnings_errors(self): + df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) with option_context("chained_assignment", "warn"): - df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) - with tm.assert_produces_warning(com.SettingWithCopyWarning): df.loc[0]["A"] = 111 + with option_context("chained_assignment", "raise"): + with pytest.raises(com.SettingWithCopyError): + df.loc[0]["A"] = 111 def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): # xref gh-13017. 
From 69c23dc8648c0b62cd5b77e8f0203860fd999c82 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 23 Sep 2020 12:16:21 +0100 Subject: [PATCH 0884/1025] DOC: a few sphinx fixes in release notes (#36523) --- doc/source/whatsnew/v1.1.3.rst | 4 ++-- doc/source/whatsnew/v1.2.0.rst | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index e3b0f59c3edcc..c1effad34ab93 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -33,8 +33,8 @@ Fixed regressions - Fixed regression in :class:`IntegerArray` unary plus and minus operations raising a ``TypeError`` (:issue:`36063`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) -- Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`,:issue:`35802`) -- Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`,:issue:`36377`) +- Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`) +- Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`, :issue:`36377`) - Fixed regression when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) - Fixed regression in :class:`Period` incorrect value for ordinal over the maximum timestamp (:issue:`36430`) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ed48bf0675034..0067632b3b460 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -96,7 
+96,7 @@ For example: buffer = io.BytesIO() data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip") -:.. _whatsnew_read_csv_table_precision_default: +.. _whatsnew_120.read_csv_table_precision_default: Change in default floating precision for ``read_csv`` and ``read_table`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -209,7 +209,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) -- Deprecated parameter ``dtype`` in :~meth:`Index.copy` on method all index classes. Use the :meth:`Index.astype` method instead for changing dtype(:issue:`35853`) +- Deprecated parameter ``dtype`` in :meth:`~Index.copy` on method all index classes. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) - Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) - :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) - The :meth:`Index.to_native_types` is deprecated. 
Use ``.astype(str)`` instead (:issue:`28867`) @@ -249,7 +249,7 @@ Datetimelike - Bug in :class:`DateOffset` where attributes reconstructed from pickle files differ from original objects when input values exceed normal ranges (e.g months=12) (:issue:`34511`) - Bug in :meth:`DatetimeIndex.get_slice_bound` where ``datetime.date`` objects were not accepted or naive :class:`Timestamp` with a tz-aware :class:`DatetimeIndex` (:issue:`35690`) - Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) -- Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`,:issue:`36254`) +- Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`, :issue:`36254`) - Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` setitem casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) - From a9e0216f6ad46cb9edb02dee2d033ee82de017d2 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Wed, 23 Sep 2020 07:20:12 -0400 Subject: [PATCH 0885/1025] TST: DataFrame.to_parquet accepts pathlib.Path with partition_cols defined (#36491) --- pandas/tests/io/test_parquet.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index a5033c51bce81..b7c8ca7e0c49f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -3,6 +3,7 @@ from distutils.version import LooseVersion from io import BytesIO 
import os +import pathlib from warnings import catch_warnings import numpy as np @@ -663,6 +664,20 @@ def test_partition_cols_string(self, pa, df_full): assert len(dataset.partitions.partition_names) == 1 assert dataset.partitions.partition_names == set(partition_cols_list) + @pytest.mark.parametrize( + "path_type", [lambda path: path, lambda path: pathlib.Path(path)] + ) + def test_partition_cols_pathlib(self, pa, df_compat, path_type): + # GH 35902 + + partition_cols = "B" + partition_cols_list = [partition_cols] + df = df_compat + + with tm.ensure_clean_dir() as path_str: + path = path_type(path_str) + df.to_parquet(path, partition_cols=partition_cols_list) + def test_empty_dataframe(self, pa): # GH #27339 df = pd.DataFrame() From 98e8f955b954bec476cf888e3a569c65bcb7d99d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 23 Sep 2020 06:14:20 -0700 Subject: [PATCH 0886/1025] REF: share _reduce (#36561) --- pandas/core/arrays/_mixins.py | 8 ++++++++ pandas/core/arrays/categorical.py | 6 ------ pandas/core/arrays/datetimelike.py | 7 ------- pandas/core/arrays/numpy_.py | 8 -------- pandas/tests/arrays/categorical/test_operators.py | 4 ++-- pandas/tests/arrays/test_datetimelike.py | 3 ++- pandas/tests/reductions/test_reductions.py | 2 ++ 7 files changed, 14 insertions(+), 24 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 808d598558c83..2bf530eb2bad4 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -227,3 +227,11 @@ def fillna(self: _T, value=None, method=None, limit=None) -> _T: else: new_values = self.copy() return new_values + + def _reduce(self, name: str, skipna: bool = True, **kwargs): + meth = getattr(self, name, None) + if meth: + return meth(skipna=skipna, **kwargs) + else: + msg = f"'{type(self).__name__}' does not implement reduction '{name}'" + raise TypeError(msg) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 
32ef37d44ad1b..d2f88b353e1c1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1963,12 +1963,6 @@ def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: # ------------------------------------------------------------------ # Reductions - def _reduce(self, name: str, skipna: bool = True, **kwargs): - func = getattr(self, name, None) - if func is None: - raise TypeError(f"Categorical cannot perform the operation {name}") - return func(skipna=skipna, **kwargs) - @deprecate_kwarg(old_arg_name="numeric_only", new_arg_name="skipna") def min(self, skipna=True, **kwargs): """ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 7051507f9a90e..6752a98345b6a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1453,13 +1453,6 @@ def __isub__(self, other): # -------------------------------------------------------------- # Reductions - def _reduce(self, name: str, skipna: bool = True, **kwargs): - op = getattr(self, name, None) - if op: - return op(skipna=skipna, **kwargs) - else: - return super()._reduce(name, skipna, **kwargs) - def min(self, axis=None, skipna=True, *args, **kwargs): """ Return the minimum value of the Array or minimum along diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 61076132b24cd..f65b130b396da 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -272,14 +272,6 @@ def _values_for_factorize(self) -> Tuple[np.ndarray, int]: # ------------------------------------------------------------------------ # Reductions - def _reduce(self, name, skipna=True, **kwargs): - meth = getattr(self, name, None) - if meth: - return meth(skipna=skipna, **kwargs) - else: - msg = f"'{type(self).__name__}' does not implement reduction '{name}'" - raise TypeError(msg) - def any(self, axis=None, out=None, keepdims=False, skipna=True): nv.validate_any((), dict(out=out, keepdims=keepdims)) return 
nanops.nanany(self._ndarray, axis=axis, skipna=skipna) diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 9d118f1ed8753..34194738bf4ab 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -353,7 +353,7 @@ def test_numeric_like_ops(self): # min/max) s = df["value_group"] for op in ["kurt", "skew", "var", "std", "mean", "sum", "median"]: - msg = f"Categorical cannot perform the operation {op}" + msg = f"'Categorical' does not implement reduction '{op}'" with pytest.raises(TypeError, match=msg): getattr(s, op)(numeric_only=False) @@ -362,7 +362,7 @@ def test_numeric_like_ops(self): # numpy ops s = Series(Categorical([1, 2, 3, 4])) with pytest.raises( - TypeError, match="Categorical cannot perform the operation sum" + TypeError, match="'Categorical' does not implement reduction 'sum'" ): np.sum(s) diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index f512b168d2795..3f5ab5baa7d69 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -205,7 +205,8 @@ def test_reduce_invalid(self): data = np.arange(10, dtype="i8") * 24 * 3600 * 10 ** 9 arr = self.array_cls(data, freq="D") - with pytest.raises(TypeError, match="cannot perform"): + msg = f"'{type(arr).__name__}' does not implement reduction 'not a method'" + with pytest.raises(TypeError, match=msg): arr._reduce("not a method") @pytest.mark.parametrize("method", ["pad", "backfill"]) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index bbf2d9f1f0784..db7cd54d23a2b 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -351,6 +351,7 @@ def test_invalid_td64_reductions(self, opname): [ f"reduction operation '{opname}' not allowed for this dtype", rf"cannot perform {opname} with type 
timedelta64\[ns\]", + f"'TimedeltaArray' does not implement reduction '{opname}'", ] ) @@ -695,6 +696,7 @@ def test_ops_consistency_on_empty(self, method): [ "operation 'var' not allowed", r"cannot perform var with type timedelta64\[ns\]", + "'TimedeltaArray' does not implement reduction 'var'", ] ) with pytest.raises(TypeError, match=msg): From 3a65dc57e002773a8403d0c56ee383732c82569d Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Thu, 24 Sep 2020 03:12:34 +0200 Subject: [PATCH 0887/1025] CLN: clean up blocks.py (#36534) --- pandas/core/internals/blocks.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index eb5b887c8b0cb..f18bc4d0bcf85 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -175,7 +175,7 @@ def _holder(self): @property def _consolidate_key(self): - return (self._can_consolidate, self.dtype.name) + return self._can_consolidate, self.dtype.name @property def is_view(self) -> bool: @@ -1363,6 +1363,7 @@ def where( errors : str, {'raise', 'ignore'}, default 'raise' - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. 
On error return original object + try_cast: bool, default False axis : int, default 0 Returns @@ -1633,8 +1634,8 @@ def __init__(self, values, placement, ndim=None): def shape(self): # TODO(EA2D): override unnecessary with 2D EAs if self.ndim == 1: - return ((len(self.values)),) - return (len(self.mgr_locs), len(self.values)) + return (len(self.values),) + return len(self.mgr_locs), len(self.values) def iget(self, col): From d4e07a834311b2d5f275b4262985fb91844b6c6b Mon Sep 17 00:00:00 2001 From: parkdj1 <59840783+parkdj1@users.noreply.github.com> Date: Thu, 24 Sep 2020 10:34:43 +0900 Subject: [PATCH 0888/1025] #34640: CLN: remove 'private_key' and 'verbose' from gbq (#34654) --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/io/gbq.py | 6 ------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 0067632b3b460..7ba64f57be136 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -325,6 +325,7 @@ I/O - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) - Bug in :func:`LongTableBuilder.middle_separator` was duplicating LaTeX longtable entires in the List of Tables of a LaTeX document (:issue:`34360`) - Bug in :meth:`read_csv` with `engine='python'` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) +- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in `pandas-gbq` (:issue:`34654` :issue:`30200`) Plotting ^^^^^^^^ @@ -372,6 +373,7 @@ ExtensionArray Other ^^^^^ + - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` incorrectly raising ``AssertionError`` instead of ``ValueError`` when invalid parameter combinations are passed (:issue:`36045`) - Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` with numeric values and string ``to_replace`` (:issue:`34789`) - 
Fixed metadata propagation in the :class:`Series.dt` accessor (:issue:`28283`) diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index 3d0792357297f..afe1234f9fa96 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -31,8 +31,6 @@ def read_gbq( credentials=None, use_bqstorage_api: Optional[bool] = None, max_results: Optional[int] = None, - private_key=None, - verbose=None, progress_bar_type: Optional[str] = None, ) -> "DataFrame": """ @@ -208,8 +206,6 @@ def to_gbq( location: Optional[str] = None, progress_bar: bool = True, credentials=None, - verbose=None, - private_key=None, ) -> None: pandas_gbq = _try_import() pandas_gbq.to_gbq( @@ -224,6 +220,4 @@ def to_gbq( location=location, progress_bar=progress_bar, credentials=credentials, - verbose=verbose, - private_key=private_key, ) From af0f7097992a3e94b477e800558cba4789ea635d Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Thu, 24 Sep 2020 08:44:53 +0200 Subject: [PATCH 0889/1025] CLN: clean up pandas core arrays (#36569) --- pandas/core/arrays/sparse/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 528d78a5414ea..7dbb6e7e47b23 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -452,7 +452,7 @@ def from_spmatrix(cls, data): return cls._simple_new(arr, index, dtype) - def __array__(self, dtype=None, copy=True) -> np.ndarray: + def __array__(self, dtype=None) -> np.ndarray: fill_value = self.fill_value if self.sp_index.ngaps == 0: @@ -1515,7 +1515,7 @@ def _formatter(self, boxed=False): SparseArray._add_unary_ops() -def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None, copy=False): +def make_sparse(arr: np.ndarray, kind="block", fill_value=None, dtype=None): """ Convert ndarray to sparse format From cb984d8631cc70110fffcff3370a96b90356502f Mon Sep 17 00:00:00 2001 From: Erfan Nariman 
<34067903+erfannariman@users.noreply.github.com> Date: Thu, 24 Sep 2020 08:49:08 +0200 Subject: [PATCH 0890/1025] DOC: Add note to docstring DataFrame.compare about identical labels (#35492) * [IMP] - #35491 - added note docstring * [FIX] - #35491 - remove trailing whitespace * [IMP] - #35491 - added explanation about shape * [FIX] - #35491 - labelS * [FIX] - 35491 - removed add * added Raises section * Fix raise section * Removed trailing whitespace * Adjustments after review * Removed trailing whitespace --- pandas/core/frame.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5e06a8d16372a..ef30d989dfbd2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5891,14 +5891,23 @@ def _construct_result(self, result) -> DataFrame: The resulting index will be a MultiIndex with 'self' and 'other' stacked alternately at the inner level. +Raises +------ +ValueError + When the two DataFrames don't have identical labels or shape. + See Also -------- Series.compare : Compare with another Series and show differences. +DataFrame.equals : Test whether two objects contain the same elements. Notes ----- Matching NaNs will not appear as a difference. +Can only compare identically-labeled +(i.e. 
same shape, identical row and column labels) DataFrames + Examples -------- >>> df = pd.DataFrame( From 62f4f7259d2b5c261832639fdb7184b9138a49c3 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 24 Sep 2020 09:58:53 -0500 Subject: [PATCH 0891/1025] CI/CLN: update travis (#36514) --- .travis.yml | 22 +++++++++++----------- ci/build39.sh | 1 - ci/setup_env.sh | 12 +----------- 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/.travis.yml b/.travis.yml index a38e90bbce8ba..81cd461dd2c87 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,15 @@ language: python python: 3.7 +addons: + apt: + update: true + packages: + - xvfb + +services: + - xvfb + # To turn off cached cython files and compiler cache # set NOCACHE-true # To delete caches go to https://travis-ci.org/OWNER/REPOSITORY/caches or run @@ -10,11 +19,9 @@ cache: ccache: true directories: - $HOME/.cache # cython cache - - $HOME/.ccache # compiler cache env: global: - # Variable for test workers - PYTEST_WORKERS="auto" # create a github personal access token # cd pandas-dev/pandas @@ -22,18 +29,17 @@ env: - secure: "EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA=" git: - # for cloning depth: false matrix: fast_finish: true include: - # In allowed failures - dist: bionic python: 3.9-dev env: - JOB="3.9-dev" PATTERN="(not slow and not network and not clipboard)" + - env: - JOB="3.8" ENV_FILE="ci/deps/travis-38.yaml" PATTERN="(not slow and not network and not clipboard)" @@ -42,7 +48,7 @@ matrix: - arch: arm64 env: - - JOB="3.7, arm64" PYTEST_WORKERS=8 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" + - JOB="3.7, arm64" PYTEST_WORKERS=1 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" - env: - JOB="3.7, locale" 
ENV_FILE="ci/deps/travis-37-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" @@ -71,12 +77,6 @@ before_install: - uname -a - git --version - ./ci/check_git_tags.sh - # Because travis runs on Google Cloud and has a /etc/boto.cfg, - # it breaks moto import, see: - # https://github.com/spulec/moto/issues/1771 - # https://github.com/boto/boto/issues/3741 - # This overrides travis and tells it to look nowhere. - - export BOTO_CONFIG=/dev/null install: - echo "install start" diff --git a/ci/build39.sh b/ci/build39.sh index f2ef11d5a71f4..faef2be03c2bb 100755 --- a/ci/build39.sh +++ b/ci/build39.sh @@ -1,7 +1,6 @@ #!/bin/bash -e # Special build for python3.9 until numpy puts its own wheels up -sudo apt-get install build-essential gcc xvfb pip install --no-deps -U pip wheel setuptools pip install cython numpy python-dateutil pytz pytest pytest-xdist hypothesis diff --git a/ci/setup_env.sh b/ci/setup_env.sh index 961433204cfbb..247f809c5fe63 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -42,9 +42,7 @@ else fi if [ "${TRAVIS_CPU_ARCH}" == "arm64" ]; then - sudo apt-get update - sudo apt-get -y install xvfb - CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.5-0/Miniforge3-4.8.5-0-Linux-aarch64.sh" + CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.5-1/Miniforge3-4.8.5-1-Linux-aarch64.sh" else CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-$CONDA_OS.sh" fi @@ -100,8 +98,6 @@ echo "conda list (root environment)" conda list # Clean up any left-over from a previous build -# (note workaround for https://github.com/conda/conda/issues/2679: -# `conda env remove` issue) conda remove --all -q -y -n pandas-dev echo @@ -142,12 +138,6 @@ conda list pandas echo "[Build extensions]" python setup.py build_ext -q -i -j2 -# TODO: Some of our environments end up with old versions of pip (10.x) -# Adding a new enough version of pip to the 
requirements explodes the -# solve time. Just using pip to update itself. -# - py35_macos -# - py35_compat -# - py36_32bit echo "[Updating pip]" python -m pip install --no-deps -U pip wheel setuptools From 57b558c69a0efd65beaeac088994fba0b39ed0f0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 24 Sep 2020 12:57:38 -0700 Subject: [PATCH 0892/1025] DEPR: string indexing along index for datetimes (#36179) --- doc/source/user_guide/timeseries.rst | 12 ++++++++++++ doc/source/whatsnew/v0.11.0.rst | 1 + doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexing.py | 11 ++++++++++- .../indexes/datetimes/test_partial_slicing.py | 4 +++- pandas/tests/series/indexing/test_datetime.py | 15 +++++++++------ 6 files changed, 36 insertions(+), 8 deletions(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 32f0cac3f81e2..868bf5a1672ff 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -579,7 +579,12 @@ This type of slicing will work on a ``DataFrame`` with a ``DatetimeIndex`` as we partial string selection is a form of label slicing, the endpoints **will be** included. This would include matching times on an included date: +.. warning:: + + Indexing ``DataFrame`` rows with strings is deprecated in pandas 1.2.0 and will be removed in a future version. Use ``frame.loc[dtstring]`` instead. + .. ipython:: python + :okwarning: dft = pd.DataFrame(np.random.randn(100000, 1), columns=['A'], index=pd.date_range('20130101', periods=100000, freq='T')) @@ -590,24 +595,28 @@ This starts on the very first time in the month, and includes the last date and time for the month: .. ipython:: python + :okwarning: dft['2013-1':'2013-2'] This specifies a stop time **that includes all of the times on the last day**: .. ipython:: python + :okwarning: dft['2013-1':'2013-2-28'] This specifies an **exact** stop time (and is not the same as the above): .. 
ipython:: python + :okwarning: dft['2013-1':'2013-2-28 00:00:00'] We are stopping on the included end-point as it is part of the index: .. ipython:: python + :okwarning: dft['2013-1-15':'2013-1-15 12:30:00'] @@ -631,6 +640,7 @@ We are stopping on the included end-point as it is part of the index: Slicing with string indexing also honors UTC offset. .. ipython:: python + :okwarning: df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) df @@ -681,6 +691,7 @@ If index resolution is second, then the minute-accurate timestamp gives a If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``[]`` as well. .. ipython:: python + :okwarning: dft_minute = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, index=series_minute.index) @@ -2027,6 +2038,7 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI Passing a string representing a lower frequency than ``PeriodIndex`` returns partial sliced data. .. ipython:: python + :okwarning: ps['2011'] diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index 6c13a125a4e54..c0bc74c9ff036 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -367,6 +367,7 @@ Enhancements - You can now select with a string from a DataFrame with a datelike index, in a similar way to a Series (:issue:`3070`) .. 
ipython:: python + :okwarning: idx = pd.date_range("2001-10-1", periods=5, freq='M') ts = pd.Series(np.random.rand(len(idx)), index=idx) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 7ba64f57be136..782e7fe16a2dc 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -213,6 +213,7 @@ Deprecations - Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) - :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) - The :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`) +- Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 5f57fe1c9a56a..8aef150078e5b 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1,4 +1,5 @@ from typing import TYPE_CHECKING, Hashable, List, Tuple, Union +import warnings import numpy as np @@ -2191,7 +2192,15 @@ def convert_to_index_sliceable(obj: "DataFrame", key): # slice here via partial string indexing if idx._supports_partial_string_indexing: try: - return idx._get_string_slice(key) + res = idx._get_string_slice(key) + warnings.warn( + "Indexing on datetimelike rows with `frame[string]` is " + "deprecated and will be removed in a future version. 
" + "Use `frame.loc[string]` instead.", + FutureWarning, + stacklevel=3, + ) + return res except (KeyError, ValueError, NotImplementedError): return None diff --git a/pandas/tests/indexes/datetimes/test_partial_slicing.py b/pandas/tests/indexes/datetimes/test_partial_slicing.py index 635470b930252..57dc46e1fb415 100644 --- a/pandas/tests/indexes/datetimes/test_partial_slicing.py +++ b/pandas/tests/indexes/datetimes/test_partial_slicing.py @@ -228,7 +228,9 @@ def test_partial_slicing_dataframe(self): tm.assert_series_equal(result, expected) # Frame should return slice as well - result = df[ts_string] + with tm.assert_produces_warning(FutureWarning): + # GH#36179 deprecated this indexing + result = df[ts_string] expected = df[theslice] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index 088f8681feb99..b7fbed2b325b3 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -1,3 +1,6 @@ +""" +Also test support for datetime64[ns] in Series / DataFrame +""" from datetime import datetime, timedelta import re @@ -11,10 +14,6 @@ from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range import pandas._testing as tm -""" -Also test support for datetime64[ns] in Series / DataFrame -""" - def test_fancy_getitem(): dti = date_range( @@ -605,7 +604,9 @@ def test_indexing(): expected.name = "A" df = DataFrame(dict(A=ts)) - result = df["2001"]["A"] + with tm.assert_produces_warning(FutureWarning): + # GH#36179 string indexing on rows for DataFrame deprecated + result = df["2001"]["A"] tm.assert_series_equal(expected, result) # setting @@ -615,7 +616,9 @@ def test_indexing(): df.loc["2001", "A"] = 1 - result = df["2001"]["A"] + with tm.assert_produces_warning(FutureWarning): + # GH#36179 string indexing on rows for DataFrame deprecated + result = df["2001"]["A"] tm.assert_series_equal(expected, result) # 
GH3546 (not including times on the last day) From 8726b6e163f75e1867f358cceceb8b01ac299e1d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 24 Sep 2020 15:01:34 -0700 Subject: [PATCH 0893/1025] CLN: de-duplicate _local_timestamps (#36609) --- pandas/core/arrays/datetimes.py | 31 ++++++++----------------------- pandas/core/indexes/datetimes.py | 12 ++---------- 2 files changed, 10 insertions(+), 33 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b1f98199f9fba..6b051f1f73467 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -76,9 +76,7 @@ def tz_to_dtype(tz): def _field_accessor(name, field, docstring=None): def f(self): - values = self.asi8 - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() + values = self._local_timestamps() if field in self._bool_ops: if field.endswith(("start", "end")): @@ -731,6 +729,8 @@ def _local_timestamps(self): This is used to calculate time-of-day information as if the timestamps were timezone-naive. 
""" + if self.tz is None or timezones.is_utc(self.tz): + return self.asi8 return tzconversion.tz_convert_from_utc(self.asi8, self.tz) def tz_convert(self, tz): @@ -1167,10 +1167,7 @@ def month_name(self, locale=None): >>> idx.month_name() Index(['January', 'February', 'March'], dtype='object') """ - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() - else: - values = self.asi8 + values = self._local_timestamps() result = fields.get_date_name_field(values, "month_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) @@ -1200,10 +1197,7 @@ def day_name(self, locale=None): >>> idx.day_name() Index(['Monday', 'Tuesday', 'Wednesday'], dtype='object') """ - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() - else: - values = self.asi8 + values = self._local_timestamps() result = fields.get_date_name_field(values, "day_name", locale=locale) result = self._maybe_mask_results(result, fill_value=None) @@ -1217,10 +1211,7 @@ def time(self): # If the Timestamps have a timezone that is not UTC, # convert them into their i8 representation while # keeping their timezone and not using UTC - if self.tz is not None and not timezones.is_utc(self.tz): - timestamps = self._local_timestamps() - else: - timestamps = self.asi8 + timestamps = self._local_timestamps() return ints_to_pydatetime(timestamps, box="time") @@ -1241,10 +1232,7 @@ def date(self): # If the Timestamps have a timezone that is not UTC, # convert them into their i8 representation while # keeping their timezone and not using UTC - if self.tz is not None and not timezones.is_utc(self.tz): - timestamps = self._local_timestamps() - else: - timestamps = self.asi8 + timestamps = self._local_timestamps() return ints_to_pydatetime(timestamps, box="date") @@ -1283,10 +1271,7 @@ def isocalendar(self): """ from pandas import DataFrame - if self.tz is not None and not timezones.is_utc(self.tz): - values = 
self._local_timestamps() - else: - values = self.asi8 + values = self._local_timestamps() sarray = fields.build_isocalendar_sarray(values) iso_calendar_df = DataFrame( sarray, columns=["year", "week", "day"], dtype="UInt32" diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 2d166773dda2c..016544d823ae3 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -6,13 +6,7 @@ import numpy as np from pandas._libs import NaT, Period, Timestamp, index as libindex, lib -from pandas._libs.tslibs import ( - Resolution, - ints_to_pydatetime, - parsing, - timezones, - to_offset, -) +from pandas._libs.tslibs import Resolution, ints_to_pydatetime, parsing, to_offset from pandas._libs.tslibs.offsets import prefix_mapping from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError @@ -395,9 +389,7 @@ def _get_time_micros(self): ------- ndarray[int64_t] """ - values = self.asi8 - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._data._local_timestamps() + values = self._data._local_timestamps() nanos = values % (24 * 3600 * 1_000_000_000) micros = nanos // 1000 From e54f8ca8dee29848b28f19e3714031894a1c6593 Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Thu, 24 Sep 2020 17:22:56 -0500 Subject: [PATCH 0894/1025] TST: 32bit dtype compat #36579 (#36584) --- pandas/tests/indexes/period/test_indexing.py | 2 +- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/test_algos.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index f42499147cdbb..85a01f1c5278c 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -450,7 +450,7 @@ def test_get_indexer_non_unique(self): result = idx1.get_indexer_non_unique(idx2) expected_indexer = np.array([1, 0, 2, -1, -1], dtype=np.intp) - expected_missing = 
np.array([2, 3], dtype=np.int64) + expected_missing = np.array([2, 3], dtype=np.intp) tm.assert_numpy_array_equal(result[0], expected_indexer) tm.assert_numpy_array_equal(result[1], expected_missing) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index f811bd579aaaa..7cafdb61fcb31 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2607,7 +2607,7 @@ def construct(dtype): ex1 = np.array([0, 3, 1, 4, 2, 5] * 2, dtype=np.intp) ex2 = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(result[0], ex1) - tm.assert_numpy_array_equal(result[1], ex2.astype(np.int64)) + tm.assert_numpy_array_equal(result[1], ex2) else: no_matches = np.array([-1] * 6, dtype=np.intp) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6102f43f4db6a..28ceaa61c558f 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1545,7 +1545,7 @@ def test_lookup_nan(self, writable): xs.setflags(write=writable) m = ht.Float64HashTable() m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp)) def test_add_signed_zeros(self): # GH 21866 inconsistent hash-function for float64 @@ -1578,7 +1578,7 @@ def test_lookup_overflow(self, writable): xs.setflags(write=writable) m = ht.UInt64HashTable() m.map_locations(xs) - tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.int64)) + tm.assert_numpy_array_equal(m.lookup(xs), np.arange(len(xs), dtype=np.intp)) def test_get_unique(self): s = Series([1, 2, 2 ** 63, 2 ** 63], dtype=np.uint64) From 4e123919f9237b5541d80a44c4b0bc46d0d75a56 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 24 Sep 2020 19:36:59 -0400 Subject: [PATCH 0895/1025] CLN: Avoid importing Series in core.aggregation (#36612) --- pandas/core/aggregation.py | 11 +++++++---- 
pandas/core/series.py | 3 +-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 541c617f7f618..c813b65d3cbb7 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -6,6 +6,7 @@ from collections import defaultdict from functools import partial from typing import ( + TYPE_CHECKING, Any, Callable, DefaultDict, @@ -26,7 +27,9 @@ from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.indexes.api import Index -from pandas.core.series import Series + +if TYPE_CHECKING: + from pandas.core.series import Series def reconstruct_func( @@ -281,7 +284,7 @@ def relabel_result( func: Dict[str, List[Union[Callable, str]]], columns: Iterable[Label], order: Iterable[int], -) -> Dict[Label, Series]: +) -> Dict[Label, "Series"]: """ Internal function to reorder result if relabelling is True for dataframe.agg, and return the reordered result in dict. @@ -308,10 +311,10 @@ def relabel_result( reordered_indexes = [ pair[0] for pair in sorted(zip(columns, order), key=lambda t: t[1]) ] - reordered_result_in_dict: Dict[Label, Series] = {} + reordered_result_in_dict: Dict[Label, "Series"] = {} idx = 0 - reorder_mask = not isinstance(result, Series) and len(result.columns) > 1 + reorder_mask = not isinstance(result, ABCSeries) and len(result.columns) > 1 for col, fun in func.items(): s = result[col].dropna() diff --git a/pandas/core/series.py b/pandas/core/series.py index 0984e86a23592..41c3e8fa9d246 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -70,6 +70,7 @@ import pandas as pd from pandas.core import algorithms, base, generic, nanops, ops from pandas.core.accessor import CachedAccessor +from pandas.core.aggregation import transform from pandas.core.arrays import ExtensionArray from pandas.core.arrays.categorical import CategoricalAccessor from pandas.core.arrays.sparse import SparseAccessor @@ -4042,8 +4043,6 @@ def aggregate(self, 
func=None, axis=0, *args, **kwargs): def transform( self, func: AggFuncType, axis: Axis = 0, *args, **kwargs ) -> FrameOrSeriesUnion: - from pandas.core.aggregation import transform - return transform(self, func, axis, *args, **kwargs) def apply(self, func, convert_dtype=True, args=(), **kwds): From 4ed43b490776b6d9507261020818aace13e88306 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Fri, 25 Sep 2020 00:42:18 +0100 Subject: [PATCH 0896/1025] REGR: DataFrame.apply() with raw option and func returning string (#36610) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/apply.py | 18 +++++++++++++++++- pandas/tests/frame/apply/test_frame_apply.py | 8 ++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index c1effad34ab93..34595ea4ec50f 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -35,6 +35,7 @@ Fixed regressions - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`) - Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`, :issue:`36377`) +- Fixed regression in :meth:`DataFrame.apply` with ``raw=True`` and user-function returning string (:issue:`35940`) - Fixed regression when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) - Fixed regression in :class:`Period` incorrect value for ordinal over the maximum timestamp (:issue:`36430`) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index bbf832f33065b..002e260742dc5 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -216,7 +216,23 @@ def apply_empty_result(self): def apply_raw(self): """ apply to the values as 
a numpy array """ - result = np.apply_along_axis(self.f, self.axis, self.values) + + def wrap_function(func): + """ + Wrap user supplied function to work around numpy issue. + + see https://github.com/numpy/numpy/issues/8352 + """ + + def wrapper(*args, **kwargs): + result = func(*args, **kwargs) + if isinstance(result, str): + result = np.array(result, dtype=object) + return result + + return wrapper + + result = np.apply_along_axis(wrap_function(self.f), self.axis, self.values) # TODO: mixed type case if result.ndim == 2: diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index e25b681c8c7c3..3f859bb4ee39e 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1545,3 +1545,11 @@ def test_apply_no_suffix_index(): ) tm.assert_frame_equal(result, expected) + + +def test_apply_raw_returns_string(): + # https://github.com/pandas-dev/pandas/issues/35940 + df = pd.DataFrame({"A": ["aa", "bbb"]}) + result = df.apply(lambda x: x[0], axis=1, raw=True) + expected = pd.Series(["aa", "bbb"]) + tm.assert_series_equal(result, expected) From b0fc2f35537552389d088a8d2d417ebb770968ba Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Fri, 25 Sep 2020 01:47:57 +0200 Subject: [PATCH 0897/1025] Fix regression when adding timeldeta_range to timestamp (#36582) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/arrays/timedeltas.py | 2 +- pandas/tests/arithmetic/test_timedelta64.py | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 34595ea4ec50f..da94e98bc78dd 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -38,6 +38,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.apply` with ``raw=True`` and user-function returning string (:issue:`35940`) - Fixed regression when setting empty 
:class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) - Fixed regression in :class:`Period` incorrect value for ordinal over the maximum timestamp (:issue:`36430`) +- Fixed regression when adding a :meth:`timedelta_range` to a :class:``Timestamp`` raised an ``ValueError`` (:issue:`35897`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 3eaf428bc64b2..4526fb9c8623c 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -450,7 +450,7 @@ def _add_datetimelike_scalar(self, other): result = checked_add_with_arr(i8, other.value, arr_mask=self._isnan) result = self._maybe_mask_results(result) dtype = DatetimeTZDtype(tz=other.tz) if other.tz else DT64NS_DTYPE - return DatetimeArray(result, dtype=dtype, freq=self.freq) + return DatetimeArray._simple_new(result, dtype=dtype, freq=self.freq) def _addsub_object_array(self, other, op): # Add or subtract Array-like of objects diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 64d3d5b6d684d..dd9b6269ce5bf 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -2136,3 +2136,20 @@ def test_td64arr_pow_invalid(self, scalar_td, box_with_array): with pytest.raises(TypeError, match=pattern): td1 ** scalar_td + + +def test_add_timestamp_to_timedelta(): + # GH: 35897 + timestamp = pd.Timestamp.now() + result = timestamp + pd.timedelta_range("0s", "1s", periods=31) + expected = pd.DatetimeIndex( + [ + timestamp + + ( + pd.to_timedelta("0.033333333s") * i + + pd.to_timedelta("0.000000001s") * divmod(i, 3)[0] + ) + for i in range(31) + ] + ) + tm.assert_index_equal(result, expected) From 43f0229bc69445debf5a45ff6dbc6331dfbcac56 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Fri, 25 Sep 
2020 06:50:31 +0700 Subject: [PATCH 0898/1025] REF: refactor/cleanup CSSResolver (#36581) --- pandas/io/formats/css.py | 179 +++++++++++++++++++++---------------- pandas/io/formats/excel.py | 7 +- 2 files changed, 106 insertions(+), 80 deletions(-) diff --git a/pandas/io/formats/css.py b/pandas/io/formats/css.py index 2e9ee192a1182..8abe13db370ca 100644 --- a/pandas/io/formats/css.py +++ b/pandas/io/formats/css.py @@ -3,7 +3,7 @@ """ import re -from typing import Optional +from typing import Dict, Optional import warnings @@ -12,8 +12,6 @@ class CSSWarning(UserWarning): This CSS syntax cannot currently be parsed. """ - pass - def _side_expander(prop_fmt: str): def expand(self, prop, value: str): @@ -34,7 +32,64 @@ class CSSResolver: A callable for parsing and resolving CSS to atomic properties. """ - def __call__(self, declarations_str, inherited=None): + UNIT_RATIOS = { + "rem": ("pt", 12), + "ex": ("em", 0.5), + # 'ch': + "px": ("pt", 0.75), + "pc": ("pt", 12), + "in": ("pt", 72), + "cm": ("in", 1 / 2.54), + "mm": ("in", 1 / 25.4), + "q": ("mm", 0.25), + "!!default": ("em", 0), + } + + FONT_SIZE_RATIOS = UNIT_RATIOS.copy() + FONT_SIZE_RATIOS.update( + { + "%": ("em", 0.01), + "xx-small": ("rem", 0.5), + "x-small": ("rem", 0.625), + "small": ("rem", 0.8), + "medium": ("rem", 1), + "large": ("rem", 1.125), + "x-large": ("rem", 1.5), + "xx-large": ("rem", 2), + "smaller": ("em", 1 / 1.2), + "larger": ("em", 1.2), + "!!default": ("em", 1), + } + ) + + MARGIN_RATIOS = UNIT_RATIOS.copy() + MARGIN_RATIOS.update({"none": ("pt", 0)}) + + BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy() + BORDER_WIDTH_RATIOS.update( + { + "none": ("pt", 0), + "thick": ("px", 4), + "medium": ("px", 2), + "thin": ("px", 1), + # Default: medium only if solid + } + ) + + SIDE_SHORTHANDS = { + 1: [0, 0, 0, 0], + 2: [0, 1, 0, 1], + 3: [0, 1, 2, 1], + 4: [0, 1, 2, 3], + } + + SIDES = ("top", "right", "bottom", "left") + + def __call__( + self, + declarations_str: str, + inherited: Optional[Dict[str, 
str]] = None, + ) -> Dict[str, str]: """ The given declarations to atomic properties. @@ -76,100 +131,78 @@ def __call__(self, declarations_str, inherited=None): if inherited is None: inherited = {} + props = self._update_initial(props, inherited) + props = self._update_font_size(props, inherited) + return self._update_other_units(props) + + def _update_initial( + self, + props: Dict[str, str], + inherited: Dict[str, str], + ) -> Dict[str, str]: # 1. resolve inherited, initial for prop, val in inherited.items(): if prop not in props: props[prop] = val - for prop, val in list(props.items()): + new_props = props.copy() + for prop, val in props.items(): if val == "inherit": val = inherited.get(prop, "initial") - if val == "initial": - val = None - if val is None: + if val in ("initial", None): # we do not define a complete initial stylesheet - del props[prop] + del new_props[prop] else: - props[prop] = val - + new_props[prop] = val + return new_props + + def _update_font_size( + self, + props: Dict[str, str], + inherited: Dict[str, str], + ) -> Dict[str, str]: # 2. 
resolve relative font size - font_size: Optional[float] if props.get("font-size"): - if "font-size" in inherited: - em_pt = inherited["font-size"] - assert em_pt[-2:] == "pt" - em_pt = float(em_pt[:-2]) - else: - em_pt = None props["font-size"] = self.size_to_pt( - props["font-size"], em_pt, conversions=self.FONT_SIZE_RATIOS + props["font-size"], + self._get_font_size(inherited), + conversions=self.FONT_SIZE_RATIOS, ) + return props - font_size = float(props["font-size"][:-2]) - else: - font_size = None + def _get_font_size(self, props: Dict[str, str]) -> Optional[float]: + if props.get("font-size"): + font_size_string = props["font-size"] + return self._get_float_font_size_from_pt(font_size_string) + return None + + def _get_float_font_size_from_pt(self, font_size_string: str) -> float: + assert font_size_string.endswith("pt") + return float(font_size_string.rstrip("pt")) + def _update_other_units(self, props: Dict[str, str]) -> Dict[str, str]: + font_size = self._get_font_size(props) # 3. 
TODO: resolve other font-relative units for side in self.SIDES: prop = f"border-{side}-width" if prop in props: props[prop] = self.size_to_pt( - props[prop], em_pt=font_size, conversions=self.BORDER_WIDTH_RATIOS + props[prop], + em_pt=font_size, + conversions=self.BORDER_WIDTH_RATIOS, ) + for prop in [f"margin-{side}", f"padding-{side}"]: if prop in props: # TODO: support % props[prop] = self.size_to_pt( - props[prop], em_pt=font_size, conversions=self.MARGIN_RATIOS + props[prop], + em_pt=font_size, + conversions=self.MARGIN_RATIOS, ) - return props - UNIT_RATIOS = { - "rem": ("pt", 12), - "ex": ("em", 0.5), - # 'ch': - "px": ("pt", 0.75), - "pc": ("pt", 12), - "in": ("pt", 72), - "cm": ("in", 1 / 2.54), - "mm": ("in", 1 / 25.4), - "q": ("mm", 0.25), - "!!default": ("em", 0), - } - - FONT_SIZE_RATIOS = UNIT_RATIOS.copy() - FONT_SIZE_RATIOS.update( - { - "%": ("em", 0.01), - "xx-small": ("rem", 0.5), - "x-small": ("rem", 0.625), - "small": ("rem", 0.8), - "medium": ("rem", 1), - "large": ("rem", 1.125), - "x-large": ("rem", 1.5), - "xx-large": ("rem", 2), - "smaller": ("em", 1 / 1.2), - "larger": ("em", 1.2), - "!!default": ("em", 1), - } - ) - - MARGIN_RATIOS = UNIT_RATIOS.copy() - MARGIN_RATIOS.update({"none": ("pt", 0)}) - - BORDER_WIDTH_RATIOS = UNIT_RATIOS.copy() - BORDER_WIDTH_RATIOS.update( - { - "none": ("pt", 0), - "thick": ("px", 4), - "medium": ("px", 2), - "thin": ("px", 1), - # Default: medium only if solid - } - ) - def size_to_pt(self, in_val, em_pt=None, conversions=UNIT_RATIOS): def _error(): warnings.warn(f"Unhandled size: {repr(in_val)}", CSSWarning) @@ -222,14 +255,6 @@ def atomize(self, declarations): for prop, value in expand(prop, value): yield prop, value - SIDE_SHORTHANDS = { - 1: [0, 0, 0, 0], - 2: [0, 1, 0, 1], - 3: [0, 1, 2, 1], - 4: [0, 1, 2, 3], - } - SIDES = ("top", "right", "bottom", "left") - expand_border_color = _side_expander("border-{:s}-color") expand_border_style = _side_expander("border-{:s}-style") expand_border_width = 
_side_expander("border-{:s}-width") diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 0140804e8c7b5..2fccb4f3e9258 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -62,12 +62,13 @@ class CSSToExcelConverter: # and __call__ make use of instance attributes. We leave them as # instancemethods so that users can easily experiment with extensions # without monkey-patching. + inherited: Optional[Dict[str, str]] def __init__(self, inherited: Optional[str] = None): if inherited is not None: - inherited = self.compute_css(inherited) - - self.inherited = inherited + self.inherited = self.compute_css(inherited) + else: + self.inherited = None compute_css = CSSResolver() From 6ef1669936e95310082cd0b7dfeba53212efd18b Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Fri, 25 Sep 2020 06:57:03 +0700 Subject: [PATCH 0899/1025] REF: refactor/cleanup of CSSToExcelConverter (#36576) --- pandas/io/formats/excel.py | 329 +++++++++++++++++++++++-------------- 1 file changed, 203 insertions(+), 126 deletions(-) diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 2fccb4f3e9258..4cd19800d4e26 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -5,7 +5,7 @@ from functools import reduce import itertools import re -from typing import Callable, Dict, Optional, Sequence, Union +from typing import Callable, Dict, Mapping, Optional, Sequence, Union import warnings import numpy as np @@ -58,6 +58,68 @@ class CSSToExcelConverter: CSS processed by :meth:`__call__`. 
""" + NAMED_COLORS = { + "maroon": "800000", + "brown": "A52A2A", + "red": "FF0000", + "pink": "FFC0CB", + "orange": "FFA500", + "yellow": "FFFF00", + "olive": "808000", + "green": "008000", + "purple": "800080", + "fuchsia": "FF00FF", + "lime": "00FF00", + "teal": "008080", + "aqua": "00FFFF", + "blue": "0000FF", + "navy": "000080", + "black": "000000", + "gray": "808080", + "grey": "808080", + "silver": "C0C0C0", + "white": "FFFFFF", + } + + VERTICAL_MAP = { + "top": "top", + "text-top": "top", + "middle": "center", + "baseline": "bottom", + "bottom": "bottom", + "text-bottom": "bottom", + # OpenXML also has 'justify', 'distributed' + } + + BOLD_MAP = { + "bold": True, + "bolder": True, + "600": True, + "700": True, + "800": True, + "900": True, + "normal": False, + "lighter": False, + "100": False, + "200": False, + "300": False, + "400": False, + "500": False, + } + + ITALIC_MAP = { + "normal": False, + "italic": True, + "oblique": True, + } + + FAMILY_MAP = { + "serif": 1, # roman + "sans-serif": 2, # swiss + "cursive": 4, # script + "fantasy": 5, # decorative + } + # NB: Most of the methods here could be classmethods, as only __init__ # and __call__ make use of instance attributes. 
We leave them as # instancemethods so that users can easily experiment with extensions @@ -92,7 +154,7 @@ def __call__(self, declarations_str: str) -> Dict[str, Dict[str, str]]: properties = self.compute_css(declarations_str, self.inherited) return self.build_xlstyle(properties) - def build_xlstyle(self, props: Dict[str, str]) -> Dict[str, Dict[str, str]]: + def build_xlstyle(self, props: Mapping[str, str]) -> Dict[str, Dict[str, str]]: out = { "alignment": self.build_alignment(props), "border": self.build_border(props), @@ -116,29 +178,30 @@ def remove_none(d: Dict[str, str]) -> None: remove_none(out) return out - VERTICAL_MAP = { - "top": "top", - "text-top": "top", - "middle": "center", - "baseline": "bottom", - "bottom": "bottom", - "text-bottom": "bottom", - # OpenXML also has 'justify', 'distributed' - } - - def build_alignment(self, props) -> Dict[str, Optional[Union[bool, str]]]: + def build_alignment( + self, props: Mapping[str, str] + ) -> Dict[str, Optional[Union[bool, str]]]: # TODO: text-indent, padding-left -> alignment.indent return { "horizontal": props.get("text-align"), - "vertical": self.VERTICAL_MAP.get(props.get("vertical-align")), - "wrap_text": ( - None - if props.get("white-space") is None - else props["white-space"] not in ("nowrap", "pre", "pre-line") - ), + "vertical": self._get_vertical_alignment(props), + "wrap_text": self._get_is_wrap_text(props), } - def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: + def _get_vertical_alignment(self, props: Mapping[str, str]) -> Optional[str]: + vertical_align = props.get("vertical-align") + if vertical_align: + return self.VERTICAL_MAP.get(vertical_align) + return None + + def _get_is_wrap_text(self, props: Mapping[str, str]) -> Optional[bool]: + if props.get("white-space") is None: + return None + return bool(props["white-space"] not in ("nowrap", "pre", "pre-line")) + + def build_border( + self, props: Mapping[str, str] + ) -> Dict[str, Dict[str, Optional[str]]]: return { side: { 
"style": self._border_style( @@ -150,7 +213,7 @@ def build_border(self, props: Dict) -> Dict[str, Dict[str, str]]: for side in ["top", "right", "bottom", "left"] } - def _border_style(self, style: Optional[str], width): + def _border_style(self, style: Optional[str], width: Optional[str]): # convert styles and widths to openxml, one of: # 'dashDot' # 'dashDotDot' @@ -170,26 +233,16 @@ def _border_style(self, style: Optional[str], width): if style == "none" or style == "hidden": return None - if width is None: - width = "2pt" - width = float(width[:-2]) - if width < 1e-5: + width_name = self._get_width_name(width) + if width_name is None: return None - elif width < 1.3: - width_name = "thin" - elif width < 2.8: - width_name = "medium" - else: - width_name = "thick" - if style in (None, "groove", "ridge", "inset", "outset"): + if style in (None, "groove", "ridge", "inset", "outset", "solid"): # not handled - style = "solid" + return width_name if style == "double": return "double" - if style == "solid": - return width_name if style == "dotted": if width_name in ("hair", "thin"): return "dotted" @@ -199,36 +252,89 @@ def _border_style(self, style: Optional[str], width): return "dashed" return "mediumDashed" - def build_fill(self, props: Dict[str, str]): + def _get_width_name(self, width_input: Optional[str]) -> Optional[str]: + width = self._width_to_float(width_input) + if width < 1e-5: + return None + elif width < 1.3: + return "thin" + elif width < 2.8: + return "medium" + return "thick" + + def _width_to_float(self, width: Optional[str]) -> float: + if width is None: + width = "2pt" + return self._pt_to_float(width) + + def _pt_to_float(self, pt_string: str) -> float: + assert pt_string.endswith("pt") + return float(pt_string.rstrip("pt")) + + def build_fill(self, props: Mapping[str, str]): # TODO: perhaps allow for special properties # -excel-pattern-bgcolor and -excel-pattern-type fill_color = props.get("background-color") if fill_color not in (None, 
"transparent", "none"): return {"fgColor": self.color_to_excel(fill_color), "patternType": "solid"} - BOLD_MAP = { - "bold": True, - "bolder": True, - "600": True, - "700": True, - "800": True, - "900": True, - "normal": False, - "lighter": False, - "100": False, - "200": False, - "300": False, - "400": False, - "500": False, - } - ITALIC_MAP = {"normal": False, "italic": True, "oblique": True} + def build_number_format(self, props: Mapping[str, str]) -> Dict[str, Optional[str]]: + return {"format_code": props.get("number-format")} - def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: - size = props.get("font-size") - if size is not None: - assert size.endswith("pt") - size = float(size[:-2]) + def build_font( + self, props: Mapping[str, str] + ) -> Dict[str, Optional[Union[bool, int, float, str]]]: + font_names = self._get_font_names(props) + decoration = self._get_decoration(props) + return { + "name": font_names[0] if font_names else None, + "family": self._select_font_family(font_names), + "size": self._get_font_size(props), + "bold": self._get_is_bold(props), + "italic": self._get_is_italic(props), + "underline": ("single" if "underline" in decoration else None), + "strike": ("line-through" in decoration) or None, + "color": self.color_to_excel(props.get("color")), + # shadow if nonzero digit before shadow color + "shadow": self._get_shadow(props), + # FIXME: dont leave commented-out + # 'vertAlign':, + # 'charset': , + # 'scheme': , + # 'outline': , + # 'condense': , + } + + def _get_is_bold(self, props: Mapping[str, str]) -> Optional[bool]: + weight = props.get("font-weight") + if weight: + return self.BOLD_MAP.get(weight) + return None + + def _get_is_italic(self, props: Mapping[str, str]) -> Optional[bool]: + font_style = props.get("font-style") + if font_style: + return self.ITALIC_MAP.get(font_style) + return None + + def _get_decoration(self, props: Mapping[str, str]) -> Sequence[str]: + decoration = props.get("text-decoration") + 
if decoration is not None: + return decoration.split() + else: + return () + + def _get_underline(self, decoration: Sequence[str]) -> Optional[str]: + if "underline" in decoration: + return "single" + return None + + def _get_shadow(self, props: Mapping[str, str]) -> Optional[bool]: + if "text-shadow" in props: + return bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) + return None + def _get_font_names(self, props: Mapping[str, str]) -> Sequence[str]: font_names_tmp = re.findall( r"""(?x) ( @@ -241,6 +347,7 @@ def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: """, props.get("font-family", ""), ) + font_names = [] for name in font_names_tmp: if name[:1] == '"': @@ -251,88 +358,58 @@ def build_font(self, props) -> Dict[str, Optional[Union[bool, int, str]]]: name = name.strip() if name: font_names.append(name) + return font_names + + def _get_font_size(self, props: Mapping[str, str]) -> Optional[float]: + size = props.get("font-size") + if size is None: + return size + return self._pt_to_float(size) + def _select_font_family(self, font_names) -> Optional[int]: family = None for name in font_names: - if name == "serif": - family = 1 # roman - break - elif name == "sans-serif": - family = 2 # swiss - break - elif name == "cursive": - family = 4 # script - break - elif name == "fantasy": - family = 5 # decorative + family = self.FAMILY_MAP.get(name) + if family: break - decoration = props.get("text-decoration") - if decoration is not None: - decoration = decoration.split() - else: - decoration = () - - return { - "name": font_names[0] if font_names else None, - "family": family, - "size": size, - "bold": self.BOLD_MAP.get(props.get("font-weight")), - "italic": self.ITALIC_MAP.get(props.get("font-style")), - "underline": ("single" if "underline" in decoration else None), - "strike": ("line-through" in decoration) or None, - "color": self.color_to_excel(props.get("color")), - # shadow if nonzero digit before shadow color - "shadow": ( - 
bool(re.search("^[^#(]*[1-9]", props["text-shadow"])) - if "text-shadow" in props - else None - ), - # FIXME: dont leave commented-out - # 'vertAlign':, - # 'charset': , - # 'scheme': , - # 'outline': , - # 'condense': , - } - - NAMED_COLORS = { - "maroon": "800000", - "brown": "A52A2A", - "red": "FF0000", - "pink": "FFC0CB", - "orange": "FFA500", - "yellow": "FFFF00", - "olive": "808000", - "green": "008000", - "purple": "800080", - "fuchsia": "FF00FF", - "lime": "00FF00", - "teal": "008080", - "aqua": "00FFFF", - "blue": "0000FF", - "navy": "000080", - "black": "000000", - "gray": "808080", - "grey": "808080", - "silver": "C0C0C0", - "white": "FFFFFF", - } + return family - def color_to_excel(self, val: Optional[str]): + def color_to_excel(self, val: Optional[str]) -> Optional[str]: if val is None: return None - if val.startswith("#") and len(val) == 7: - return val[1:].upper() - if val.startswith("#") and len(val) == 4: - return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper() + + if self._is_hex_color(val): + return self._convert_hex_to_excel(val) + try: return self.NAMED_COLORS[val] except KeyError: warnings.warn(f"Unhandled color format: {repr(val)}", CSSWarning) + return None - def build_number_format(self, props: Dict) -> Dict[str, Optional[str]]: - return {"format_code": props.get("number-format")} + def _is_hex_color(self, color_string: str) -> bool: + return bool(color_string.startswith("#")) + + def _convert_hex_to_excel(self, color_string: str) -> str: + code = color_string.lstrip("#") + if self._is_shorthand_color(color_string): + return (code[0] * 2 + code[1] * 2 + code[2] * 2).upper() + else: + return code.upper() + + def _is_shorthand_color(self, color_string: str) -> bool: + """Check if color code is shorthand. + + #FFF is a shorthand as opposed to full #FFFFFF. 
+ """ + code = color_string.lstrip("#") + if len(code) == 3: + return True + elif len(code) == 6: + return False + else: + raise ValueError(f"Unexpected color {color_string}") class ExcelFormatter: From e02d2c268b4726f5e5110b396e3ecfba0895109e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 24 Sep 2020 17:31:57 -0700 Subject: [PATCH 0900/1025] BUG: alignment changing index on input series (#36503) * BUG: alignment changing index on input series * whatsnew * remove deep=False --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/generic.py | 8 ++++++++ pandas/tests/series/test_arithmetic.py | 13 +++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 782e7fe16a2dc..ed9aadfb39e43 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -272,6 +272,7 @@ Numeric - Bug in :func:`to_numeric` where float precision was incorrect (:issue:`31364`) - Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`) - Bug in :meth:`Series.equals` where a ``ValueError`` was raised when numpy arrays were compared to scalars (:issue:`35267`) +- Bug in :class:`Series` where two :class:`Series` each have a :class:`DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) - Conversion diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0b9021b094cd7..a8b48f875c825 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8748,6 +8748,10 @@ def _align_frame( if is_datetime64tz_dtype(left.index.dtype): if left.index.tz != right.index.tz: if join_index is not None: + # GH#33671 ensure we don't change the index on + # our original Series (NB: by default deep=False) + left = left.copy() + right = right.copy() left.index = join_index right.index = join_index @@ -8835,6 +8839,10 @@ def _align_series( if 
is_datetime64tz_dtype(left.index.dtype): if left.index.tz != right.index.tz: if join_index is not None: + # GH#33671 ensure we don't change the index on + # our original Series (NB: by default deep=False) + left = left.copy() + right = right.copy() left.index = join_index right.index = join_index diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index 8fad6ee1cca8b..f30246ff12fac 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -254,6 +254,19 @@ def test_sub_datetimelike_align(self): result = (dt2.to_frame() - dt.to_frame())[0] tm.assert_series_equal(result, expected) + def test_alignment_doesnt_change_tz(self): + # GH#33671 + dti = pd.date_range("2016-01-01", periods=10, tz="CET") + dti_utc = dti.tz_convert("UTC") + ser = pd.Series(10, index=dti) + ser_utc = pd.Series(10, index=dti_utc) + + # we don't care about the result, just that original indexes are unchanged + ser * ser_utc + + assert ser.index is dti + assert ser_utc.index is dti_utc + # ------------------------------------------------------------------ # Comparisons From 96a9f42ac66fae29c95164983e71b1461dc70370 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Thu, 24 Sep 2020 19:48:26 -0500 Subject: [PATCH 0901/1025] CI: Add rst backtick checker (#36591) --- .pre-commit-config.yaml | 35 ++++++++++++++++++++++++++++++++++ doc/source/whatsnew/v1.2.0.rst | 22 ++++++++++----------- doc/sphinxext/README.rst | 2 +- 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6319629d57512..d01956bb79e11 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,3 +30,38 @@ repos: hooks: - id: pyupgrade args: [--py37-plus] +- repo: https://github.com/pre-commit/pygrep-hooks + rev: v1.6.0 + hooks: + - id: rst-backticks + # these exclusions should be removed and the files fixed + exclude: (?x)( + 
text\.rst| + timeseries\.rst| + visualization\.rst| + missing_data\.rst| + options\.rst| + reshaping\.rst| + scale\.rst| + merging\.rst| + cookbook\.rst| + enhancingperf\.rst| + groupby\.rst| + io\.rst| + overview\.rst| + panel\.rst| + plotting\.rst| + 10min\.rst| + basics\.rst| + categorical\.rst| + contributing\.rst| + contributing_docstring\.rst| + extending\.rst| + ecosystem\.rst| + comparison_with_sql\.rst| + install\.rst| + calculate_statistics\.rst| + combine_dataframes\.rst| + v0\.| + v1\.0\.| + v1\.1\.[012]) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ed9aadfb39e43..2a8b6fe3ade6a 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -117,7 +117,7 @@ Other enhancements - :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) -- `Styler` now allows direct CSS class name addition to individual data cells (:issue:`36159`) +- ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) @@ -223,12 +223,12 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvements when creating DataFrame or Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`) +- Performance 
improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) -- Performance improvement in :meth:`pd.to_datetime` with non-`ns` time unit for `float` `dtype` columns (:issue:`20445`) +- Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) .. --------------------------------------------------------------------------- @@ -263,7 +263,7 @@ Timedelta Timezones ^^^^^^^^^ -- Bug in :func:`date_range` was raising AmbiguousTimeError for valid input with `ambiguous=False` (:issue:`35297`) +- Bug in :func:`date_range` was raising AmbiguousTimeError for valid input with ``ambiguous=False`` (:issue:`35297`) - @@ -305,13 +305,13 @@ Indexing Missing ^^^^^^^ -- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for `dropna=False` (:issue:`35014`) +- Bug in :meth:`SeriesGroupBy.transform` now correctly handles missing values for ``dropna=False`` (:issue:`35014`) - MultiIndex ^^^^^^^^^^ -- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message `Expected label or tuple of labels` (:issue:`35301`) +- Bug in :meth:`DataFrame.xs` when used with :class:`IndexSlice` raises ``TypeError`` with message ``"Expected label or tuple of labels"`` (:issue:`35301`) - I/O @@ -319,15 +319,15 @@ I/O - :func:`read_sas` no longer leaks resources on failure (:issue:`35566`) - Bug in :meth:`to_csv` caused a 
``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`) -- In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`) +- In :meth:`read_csv` ``float_precision='round_trip'`` now handles ``decimal`` and ``thousands`` parameters (:issue:`35365`) - :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`) -- :meth:`to_csv` passes compression arguments for `'gzip'` always to `gzip.GzipFile` (:issue:`28103`) +- :meth:`to_csv` passes compression arguments for ``'gzip'`` always to ``gzip.GzipFile`` (:issue:`28103`) - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`) -- :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) +- :meth:`to_csv` and :meth:`read_csv` did not honor ``compression`` and ``encoding`` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) - Bug in :func:`LongTableBuilder.middle_separator` was duplicating LaTeX longtable entires in the List of Tables of a LaTeX document (:issue:`34360`) -- Bug in :meth:`read_csv` with `engine='python'` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) -- Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in `pandas-gbq` (:issue:`34654` :issue:`30200`) +- Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) +- Removed ``private_key`` and ``verbose`` 
from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`) Plotting ^^^^^^^^ diff --git a/doc/sphinxext/README.rst b/doc/sphinxext/README.rst index 2be5372bc0216..8f0f4a8b2636d 100644 --- a/doc/sphinxext/README.rst +++ b/doc/sphinxext/README.rst @@ -7,7 +7,7 @@ pandas documentation. These copies originate from other projects: - ``numpydoc`` - Numpy's Sphinx extensions: this can be found at its own repository: https://github.com/numpy/numpydoc - ``ipython_directive`` and ``ipython_console_highlighting`` in the folder - `ipython_sphinxext` - Sphinx extensions from IPython: these are included + ``ipython_sphinxext`` - Sphinx extensions from IPython: these are included in IPython: https://github.com/ipython/ipython/tree/master/IPython/sphinxext .. note:: From ded9910b8ea2e095fe3afb036a91f38b6ca37bb9 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Thu, 24 Sep 2020 20:52:58 -0500 Subject: [PATCH 0902/1025] BUG: Fix unordered cut with Series labels (#36613) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/reshape/tile.py | 2 +- pandas/tests/reshape/test_cut.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index da94e98bc78dd..b382da2db01a4 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -53,6 +53,7 @@ Bug fixes - Bug in :meth:`DataFrame.stack` raising a ``ValueError`` when stacking :class:`MultiIndex` columns based on position when the levels had duplicate names (:issue:`36353`) - Bug in :meth:`Series.astype` showing too much precision when casting from ``np.float32`` to string dtype (:issue:`36451`) - Bug in :meth:`Series.isin` and :meth:`DataFrame.isin` when using ``NaN`` and a row length above 1,000,000 (:issue:`22205`) +- Bug in :func:`cut` raising a ``ValueError`` when passed a :class:`Series` of labels with ``ordered=False`` (:issue:`36603`) 
.. --------------------------------------------------------------------------- diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 077ad057f6e1d..4c5347bd16e8b 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -379,7 +379,7 @@ def _bins_to_cuts( duplicates: str = "raise", ordered: bool = True, ): - if not ordered and not labels: + if not ordered and labels is None: raise ValueError("'labels' must be provided if 'ordered = False'") if duplicates not in ["raise", "drop"]: diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 60c80a8abdba6..4d2195da85a13 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -664,3 +664,13 @@ def test_cut_unordered_with_missing_labels_raises_error(): msg = "'labels' must be provided if 'ordered = False'" with pytest.raises(ValueError, match=msg): cut([0.5, 3], bins=[0, 1, 2], ordered=False) + + +def test_cut_unordered_with_series_labels(): + # https://github.com/pandas-dev/pandas/issues/36603 + s = pd.Series([1, 2, 3, 4, 5]) + bins = pd.Series([0, 2, 4, 6]) + labels = pd.Series(["a", "b", "c"]) + result = pd.cut(s, bins=bins, labels=labels, ordered=False) + expected = pd.Series(["a", "a", "b", "b", "c"], dtype="category") + tm.assert_series_equal(result, expected) From afad571d5d9ff0825c54aa242f4ccadce02616c0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Sep 2020 01:51:50 -0700 Subject: [PATCH 0903/1025] Revert "Fix regression when adding timeldeta_range to timestamp (#36582)" (#36616) This reverts commit dee2c5571887b6ce26071440d288f59f141db6ad. 
--- doc/source/whatsnew/v1.1.3.rst | 1 - pandas/core/arrays/timedeltas.py | 2 +- pandas/tests/arithmetic/test_timedelta64.py | 17 ----------------- 3 files changed, 1 insertion(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index b382da2db01a4..c63a78c76572f 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -38,7 +38,6 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.apply` with ``raw=True`` and user-function returning string (:issue:`35940`) - Fixed regression when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) - Fixed regression in :class:`Period` incorrect value for ordinal over the maximum timestamp (:issue:`36430`) -- Fixed regression when adding a :meth:`timedelta_range` to a :class:``Timestamp`` raised an ``ValueError`` (:issue:`35897`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 4526fb9c8623c..3eaf428bc64b2 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -450,7 +450,7 @@ def _add_datetimelike_scalar(self, other): result = checked_add_with_arr(i8, other.value, arr_mask=self._isnan) result = self._maybe_mask_results(result) dtype = DatetimeTZDtype(tz=other.tz) if other.tz else DT64NS_DTYPE - return DatetimeArray._simple_new(result, dtype=dtype, freq=self.freq) + return DatetimeArray(result, dtype=dtype, freq=self.freq) def _addsub_object_array(self, other, op): # Add or subtract Array-like of objects diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index dd9b6269ce5bf..64d3d5b6d684d 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -2136,20 +2136,3 @@ def test_td64arr_pow_invalid(self, scalar_td, box_with_array): with 
pytest.raises(TypeError, match=pattern): td1 ** scalar_td - - -def test_add_timestamp_to_timedelta(): - # GH: 35897 - timestamp = pd.Timestamp.now() - result = timestamp + pd.timedelta_range("0s", "1s", periods=31) - expected = pd.DatetimeIndex( - [ - timestamp - + ( - pd.to_timedelta("0.033333333s") * i - + pd.to_timedelta("0.000000001s") * divmod(i, 3)[0] - ) - for i in range(31) - ] - ) - tm.assert_index_equal(result, expected) From bbb800f358d0a56ede2a143c5aafafd719d3264b Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Sep 2020 17:01:44 -0700 Subject: [PATCH 0904/1025] fix test test warnings (#36640) --- pandas/tests/io/excel/test_writers.py | 3 +-- .../series/apply/test_series_transform.py | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index e3ee53b63e102..0e27b87da9f3e 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -1286,10 +1286,9 @@ def test_merged_cell_custom_objects(self, merge_cells, path): expected.to_excel(path) result = pd.read_excel(path, header=[0, 1], index_col=0, convert_float=False) # need to convert PeriodIndexes to standard Indexes for assert equal - expected.columns.set_levels( + expected.columns = expected.columns.set_levels( [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], level=[0, 1], - inplace=True, ) expected.index = expected.index.astype(np.float64) tm.assert_frame_equal(expected, result) diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py index 0842674da2a7d..0e200709f60cf 100644 --- a/pandas/tests/series/apply/test_series_transform.py +++ b/pandas/tests/series/apply/test_series_transform.py @@ -121,15 +121,20 @@ def test_transform_bad_dtype(op): s = Series(3 * [object]) # Series that will fail on most transforms if op in ("backfill", "shift", "pad", "bfill", "ffill"): 
pytest.xfail("Transform function works on any datatype") + msg = "Transform function failed" - with pytest.raises(ValueError, match=msg): - s.transform(op) - with pytest.raises(ValueError, match=msg): - s.transform([op]) - with pytest.raises(ValueError, match=msg): - s.transform({"A": op}) - with pytest.raises(ValueError, match=msg): - s.transform({"A": [op]}) + + # tshift is deprecated + warn = None if op != "tshift" else FutureWarning + with tm.assert_produces_warning(warn, check_stacklevel=False): + with pytest.raises(ValueError, match=msg): + s.transform(op) + with pytest.raises(ValueError, match=msg): + s.transform([op]) + with pytest.raises(ValueError, match=msg): + s.transform({"A": op}) + with pytest.raises(ValueError, match=msg): + s.transform({"A": [op]}) @pytest.mark.parametrize("use_apply", [True, False]) From 2dd515cc97b97579bc963d5eb8d9b4eca2d9345d Mon Sep 17 00:00:00 2001 From: Jonas Laursen Date: Fri, 25 Sep 2020 17:30:01 -0700 Subject: [PATCH 0905/1025] DOC: Replaced single backticks with double backticks in several rst files (#36627) --- .pre-commit-config.yaml | 17 ------ doc/source/getting_started/overview.rst | 2 +- doc/source/reference/panel.rst | 2 +- doc/source/reference/plotting.rst | 2 +- doc/source/user_guide/10min.rst | 4 +- doc/source/user_guide/basics.rst | 16 ++--- doc/source/user_guide/cookbook.rst | 2 +- doc/source/user_guide/enhancingperf.rst | 6 +- doc/source/user_guide/groupby.rst | 4 +- doc/source/user_guide/io.rst | 80 ++++++++++++------------- doc/source/user_guide/merging.rst | 8 +-- doc/source/user_guide/missing_data.rst | 2 +- doc/source/user_guide/options.rst | 14 ++--- doc/source/user_guide/reshaping.rst | 4 +- doc/source/user_guide/scale.rst | 4 +- doc/source/user_guide/text.rst | 2 +- doc/source/user_guide/timeseries.rst | 6 +- doc/source/user_guide/visualization.rst | 8 +-- 18 files changed, 83 insertions(+), 100 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 
d01956bb79e11..ad36a68c448a9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,23 +36,6 @@ repos: - id: rst-backticks # these exclusions should be removed and the files fixed exclude: (?x)( - text\.rst| - timeseries\.rst| - visualization\.rst| - missing_data\.rst| - options\.rst| - reshaping\.rst| - scale\.rst| - merging\.rst| - cookbook\.rst| - enhancingperf\.rst| - groupby\.rst| - io\.rst| - overview\.rst| - panel\.rst| - plotting\.rst| - 10min\.rst| - basics\.rst| categorical\.rst| contributing\.rst| contributing_docstring\.rst| diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index 032ba73a7293d..57d87d4ec8a91 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -40,7 +40,7 @@ Here are just a few of the things that pandas does well: higher dimensional objects - Automatic and explicit **data alignment**: objects can be explicitly aligned to a set of labels, or the user can simply ignore the labels and - let `Series`, `DataFrame`, etc. automatically align the data for you in + let ``Series``, ``DataFrame``, etc. automatically align the data for you in computations - Powerful, flexible **group by** functionality to perform split-apply-combine operations on data sets, for both aggregating and diff --git a/doc/source/reference/panel.rst b/doc/source/reference/panel.rst index 94bfe87fe39f0..37d48c2dadf2e 100644 --- a/doc/source/reference/panel.rst +++ b/doc/source/reference/panel.rst @@ -7,4 +7,4 @@ Panel ===== .. currentmodule:: pandas -`Panel` was removed in 0.25.0. For prior documentation, see the `0.24 documentation `_ +``Panel`` was removed in 0.25.0. For prior documentation, see the `0.24 documentation `_ diff --git a/doc/source/reference/plotting.rst b/doc/source/reference/plotting.rst index 95657dfa5fde5..632b39a1fa858 100644 --- a/doc/source/reference/plotting.rst +++ b/doc/source/reference/plotting.rst @@ -7,7 +7,7 @@ Plotting ======== .. 
currentmodule:: pandas.plotting -The following functions are contained in the `pandas.plotting` module. +The following functions are contained in the ``pandas.plotting`` module. .. autosummary:: :toctree: api/ diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 93c50fff40305..c3746cbe777a3 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -431,9 +431,9 @@ See more at :ref:`Histogramming and Discretization `. String Methods ~~~~~~~~~~~~~~ -Series is equipped with a set of string processing methods in the `str` +Series is equipped with a set of string processing methods in the ``str`` attribute that make it easy to operate on each element of the array, as in the -code snippet below. Note that pattern-matching in `str` generally uses `regular +code snippet below. Note that pattern-matching in ``str`` generally uses `regular expressions `__ by default (and in some cases always uses them). See more at :ref:`Vectorized String Methods `. diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 6b13319061ea4..e348111fe7881 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -1459,7 +1459,7 @@ for altering the ``Series.name`` attribute. .. versionadded:: 0.24.0 The methods :meth:`DataFrame.rename_axis` and :meth:`Series.rename_axis` -allow specific names of a `MultiIndex` to be changed (as opposed to the +allow specific names of a ``MultiIndex`` to be changed (as opposed to the labels). .. ipython:: python @@ -1592,7 +1592,7 @@ index value along with a Series containing the data in each row: row All values in ``row``, returned as a Series, are now upcasted - to floats, also the original integer value in column `x`: + to floats, also the original integer value in column ``x``: .. ipython:: python @@ -1787,8 +1787,8 @@ used to sort a pandas object by its index levels. .. 
versionadded:: 1.1.0 Sorting by index also supports a ``key`` parameter that takes a callable -function to apply to the index being sorted. For `MultiIndex` objects, -the key is applied per-level to the levels specified by `level`. +function to apply to the index being sorted. For ``MultiIndex`` objects, +the key is applied per-level to the levels specified by ``level``. .. ipython:: python @@ -1812,8 +1812,8 @@ For information on key sorting by value, see :ref:`value sorting By values ~~~~~~~~~ -The :meth:`Series.sort_values` method is used to sort a `Series` by its values. The -:meth:`DataFrame.sort_values` method is used to sort a `DataFrame` by its column or row values. +The :meth:`Series.sort_values` method is used to sort a ``Series`` by its values. The +:meth:`DataFrame.sort_values` method is used to sort a ``DataFrame`` by its column or row values. The optional ``by`` parameter to :meth:`DataFrame.sort_values` may used to specify one or more columns to use to determine the sorted order. @@ -1855,8 +1855,8 @@ to apply to the values being sorted. s1.sort_values() s1.sort_values(key=lambda x: x.str.lower()) -`key` will be given the :class:`Series` of values and should return a ``Series`` -or array of the same shape with the transformed values. For `DataFrame` objects, +``key`` will be given the :class:`Series` of values and should return a ``Series`` +or array of the same shape with the transformed values. For ``DataFrame`` objects, the key is applied per column, so the key should still expect a Series and return a Series, e.g. 
diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 7542e1dc7df6f..e33e85d3d2224 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -1270,7 +1270,7 @@ Often it's useful to obtain the lower (or upper) triangular form of a correlatio corr_mat.where(mask) -The `method` argument within `DataFrame.corr` can accept a callable in addition to the named correlation types. Here we compute the `distance correlation `__ matrix for a `DataFrame` object. +The ``method`` argument within ``DataFrame.corr`` can accept a callable in addition to the named correlation types. Here we compute the ``distance correlation ``__ matrix for a ``DataFrame`` object. .. ipython:: python diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index 9e101c1a20371..ce9db0a5279c3 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -488,9 +488,9 @@ These operations are supported by :func:`pandas.eval`: * Attribute access, e.g., ``df.a`` * Subscript expressions, e.g., ``df[0]`` * Simple variable evaluation, e.g., ``pd.eval('df')`` (this is not very useful) -* Math functions: `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`, - `sqrt`, `sinh`, `cosh`, `tanh`, `arcsin`, `arccos`, `arctan`, `arccosh`, - `arcsinh`, `arctanh`, `abs`, `arctan2` and `log10`. +* Math functions: ``sin``, ``cos``, ``exp``, ``log``, ``expm1``, ``log1p``, + ``sqrt``, ``sinh``, ``cosh``, ``tanh``, ``arcsin``, ``arccos``, ``arctan``, ``arccosh``, + ``arcsinh``, ``arctanh``, ``abs``, ``arctan2`` and ``log10``. This Python syntax is **not** allowed: diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index f745dab00bab8..52342de98de79 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -216,10 +216,10 @@ in case you want to include ``NA`` values in group keys, you could pass ``dropna .. 
ipython:: python - # Default `dropna` is set to True, which will exclude NaNs in keys + # Default ``dropna`` is set to True, which will exclude NaNs in keys df_dropna.groupby(by=["b"], dropna=True).sum() - # In order to allow NaN in keys, set `dropna` to False + # In order to allow NaN in keys, set ``dropna`` to False df_dropna.groupby(by=["b"], dropna=False).sum() The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index a0b16e5fe5d1c..fc5aad12cd5e8 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -117,9 +117,9 @@ index_col : int, str, sequence of int / str, or False, default ``None`` usecols : list-like or callable, default ``None`` Return a subset of the columns. If list-like, all elements must either be positional (i.e. integer indices into the document columns) or strings - that correspond to column names provided either by the user in `names` or + that correspond to column names provided either by the user in ``names`` or inferred from the document header row(s). For example, a valid list-like - `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. + ``usecols`` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``. Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. To instantiate a DataFrame from ``data`` with element order preserved use @@ -157,7 +157,7 @@ General parsing configuration dtype : Type name or dict of column -> type, default ``None`` Data type for data or columns. E.g. ``{'a': np.float64, 'b': np.int32}`` - (unsupported with ``engine='python'``). Use `str` or `object` together + (unsupported with ``engine='python'``). Use ``str`` or ``object`` together with suitable ``na_values`` settings to preserve and not interpret dtype. 
engine : {``'c'``, ``'python'``} @@ -215,19 +215,19 @@ na_values : scalar, str, list-like, or dict, default ``None`` keep_default_na : boolean, default ``True`` Whether or not to include the default NaN values when parsing the data. - Depending on whether `na_values` is passed in, the behavior is as follows: + Depending on whether ``na_values`` is passed in, the behavior is as follows: - * If `keep_default_na` is ``True``, and `na_values` are specified, `na_values` + * If ``keep_default_na`` is ``True``, and ``na_values`` are specified, ``na_values`` is appended to the default NaN values used for parsing. - * If `keep_default_na` is ``True``, and `na_values` are not specified, only + * If ``keep_default_na`` is ``True``, and ``na_values`` are not specified, only the default NaN values are used for parsing. - * If `keep_default_na` is ``False``, and `na_values` are specified, only - the NaN values specified `na_values` are used for parsing. - * If `keep_default_na` is ``False``, and `na_values` are not specified, no + * If ``keep_default_na`` is ``False``, and ``na_values`` are specified, only + the NaN values specified ``na_values`` are used for parsing. + * If ``keep_default_na`` is ``False``, and ``na_values`` are not specified, no strings will be parsed as NaN. - Note that if `na_filter` is passed in as ``False``, the `keep_default_na` and - `na_values` parameters will be ignored. + Note that if ``na_filter`` is passed in as ``False``, the ``keep_default_na`` and + ``na_values`` parameters will be ignored. na_filter : boolean, default ``True`` Detect missing value markers (empty strings and the value of na_values). In data without any NAs, passing ``na_filter=False`` can improve the performance @@ -276,10 +276,10 @@ Iteration +++++++++ iterator : boolean, default ``False`` - Return `TextFileReader` object for iteration or getting chunks with + Return ``TextFileReader`` object for iteration or getting chunks with ``get_chunk()``. 
chunksize : int, default ``None`` - Return `TextFileReader` object for iteration. See :ref:`iterating and chunking + Return ``TextFileReader`` object for iteration. See :ref:`iterating and chunking ` below. Quoting, compression, and file format @@ -299,7 +299,7 @@ compression : {``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None`` .. versionchanged:: 0.24.0 'infer' option added and set to default. .. versionchanged:: 1.1.0 dict option extended to support ``gzip`` and ``bz2``. - .. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to `gzip.open`. + .. versionchanged:: 1.2.0 Previous versions forwarded dict entries for 'gzip' to ``gzip.open``. thousands : str, default ``None`` Thousands separator. decimal : str, default ``'.'`` @@ -327,17 +327,17 @@ comment : str, default ``None`` Indicates remainder of line should not be parsed. If found at the beginning of a line, the line will be ignored altogether. This parameter must be a single character. Like empty lines (as long as ``skip_blank_lines=True``), fully - commented lines are ignored by the parameter `header` but not by `skiprows`. + commented lines are ignored by the parameter ``header`` but not by ``skiprows``. For example, if ``comment='#'``, parsing '#empty\\na,b,c\\n1,2,3' with - `header=0` will result in 'a,b,c' being treated as the header. + ``header=0`` will result in 'a,b,c' being treated as the header. encoding : str, default ``None`` Encoding to use for UTF when reading/writing (e.g. ``'utf-8'``). `List of Python standard encodings `_. dialect : str or :class:`python:csv.Dialect` instance, default ``None`` If provided, this parameter will override values (default or not) for the - following parameters: `delimiter`, `doublequote`, `escapechar`, - `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to + following parameters: ``delimiter``, ``doublequote``, ``escapechar``, + ``skipinitialspace``, ``quotechar``, and ``quoting``. 
If it is necessary to override values, a ParserWarning will be issued. See :class:`python:csv.Dialect` documentation for more details. @@ -436,7 +436,7 @@ worth trying. mixed_df['col_1'].apply(type).value_counts() mixed_df['col_1'].dtype - will result with `mixed_df` containing an ``int`` dtype for certain chunks + will result with ``mixed_df`` containing an ``int`` dtype for certain chunks of the column, and ``str`` for others due to the mixed dtypes from the data that was read in. It is important to note that the overall column will be marked with a ``dtype`` of ``object``, which is used for columns with mixed dtypes. @@ -896,7 +896,7 @@ You can also use a dict to specify custom name columns: df It is important to remember that if multiple text columns are to be parsed into -a single date column, then a new column is prepended to the data. The `index_col` +a single date column, then a new column is prepended to the data. The ``index_col`` specification is based off of this new set of columns rather than the original data columns: @@ -937,7 +937,7 @@ Pandas will try to call the ``date_parser`` function in three different ways. If an exception is raised, the next one is tried: 1. ``date_parser`` is first called with one or more arrays as arguments, - as defined using `parse_dates` (e.g., ``date_parser(['2013', '2013'], ['1', '2'])``). + as defined using ``parse_dates`` (e.g., ``date_parser(['2013', '2013'], ['1', '2'])``). 2. If #1 fails, ``date_parser`` is called with all the columns concatenated row-wise into a single array (e.g., ``date_parser(['2013 1', '2013 2'])``). @@ -1369,7 +1369,7 @@ Files with fixed width columns While :func:`read_csv` reads delimited data, the :func:`read_fwf` function works with data files that have known and fixed column widths. 
The function parameters -to ``read_fwf`` are largely the same as `read_csv` with two extra parameters, and +to ``read_fwf`` are largely the same as ``read_csv`` with two extra parameters, and a different usage of the ``delimiter`` parameter: * ``colspecs``: A list of pairs (tuples) giving the extents of the @@ -1402,7 +1402,7 @@ Consider a typical fixed-width data file: print(open('bar.csv').read()) In order to parse this file into a ``DataFrame``, we simply need to supply the -column specifications to the `read_fwf` function along with the file name: +column specifications to the ``read_fwf`` function along with the file name: .. ipython:: python @@ -1718,7 +1718,7 @@ The ``Series`` and ``DataFrame`` objects have an instance method ``to_csv`` whic allows storing the contents of the object as a comma-separated-values file. The function takes a number of arguments. Only the first is required. -* ``path_or_buf``: A string path to the file to write or a file object. If a file object it must be opened with `newline=''` +* ``path_or_buf``: A string path to the file to write or a file object. If a file object it must be opened with ``newline=''`` * ``sep`` : Field delimiter for the output file (default ",") * ``na_rep``: A string representation of a missing value (default '') * ``float_format``: Format string for floating point numbers @@ -1726,13 +1726,13 @@ function takes a number of arguments. Only the first is required. * ``header``: Whether to write out the column names (default True) * ``index``: whether to write row (index) names (default True) * ``index_label``: Column label(s) for index column(s) if desired. If None - (default), and `header` and `index` are True, then the index names are + (default), and ``header`` and ``index`` are True, then the index names are used. (A sequence should be given if the ``DataFrame`` uses MultiIndex). 
* ``mode`` : Python write mode, default 'w' * ``encoding``: a string representing the encoding to use if the contents are non-ASCII, for Python versions prior to 3 -* ``line_terminator``: Character sequence denoting line end (default `os.linesep`) -* ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL). Note that if you have set a `float_format` then floats are converted to strings and csv.QUOTE_NONNUMERIC will treat them as non-numeric +* ``line_terminator``: Character sequence denoting line end (default ``os.linesep``) +* ``quoting``: Set quoting rules as in csv module (default csv.QUOTE_MINIMAL). Note that if you have set a ``float_format`` then floats are converted to strings and csv.QUOTE_NONNUMERIC will treat them as non-numeric * ``quotechar``: Character used to quote fields (default '"') * ``doublequote``: Control quoting of ``quotechar`` in fields (default True) * ``escapechar``: Character used to escape ``sep`` and ``quotechar`` when @@ -1885,7 +1885,7 @@ preservation of metadata including but not limited to dtypes and index names. Any orient option that encodes to a JSON object will not preserve the ordering of index and column labels during round-trip serialization. If you wish to preserve - label ordering use the `split` option as it uses ordered containers. + label ordering use the ``split`` option as it uses ordered containers. 
Date handling +++++++++++++ @@ -2240,7 +2240,7 @@ For line-delimited json files, pandas can also return an iterator which reads in df df.to_json(orient='records', lines=True) - # reader is an iterator that returns `chunksize` lines each iteration + # reader is an iterator that returns ``chunksize`` lines each iteration reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) reader for chunk in reader: @@ -3092,7 +3092,7 @@ Dtype specifications ++++++++++++++++++++ As an alternative to converters, the type for an entire column can -be specified using the `dtype` keyword, which takes a dictionary +be specified using the ``dtype`` keyword, which takes a dictionary mapping column names to types. To interpret data with no type inference, use the type ``str`` or ``object``. @@ -3748,8 +3748,8 @@ Passing ``min_itemsize={`values`: size}`` as a parameter to append will set a larger minimum for the string columns. Storing ``floats, strings, ints, bools, datetime64`` are currently supported. For string columns, passing ``nan_rep = 'nan'`` to append will change the default -nan representation on disk (which converts to/from `np.nan`), this -defaults to `nan`. +nan representation on disk (which converts to/from ``np.nan``), this +defaults to ``nan``. .. ipython:: python @@ -4045,7 +4045,7 @@ Query via data columns ++++++++++++++++++++++ You can designate (and index) certain columns that you want to be able -to perform queries (other than the `indexable` columns, which you can +to perform queries (other than the ``indexable`` columns, which you can always query). For instance say you want to perform this common operation, on-disk, and return just the frame that matches this query. You can specify ``data_columns = True`` to force all columns to @@ -4076,7 +4076,7 @@ be ``data_columns``. store.root.df_dc.table There is some performance degradation by making lots of columns into -`data columns`, so it is up to the user to designate these. 
In addition, +``data columns``, so it is up to the user to designate these. In addition, you cannot change data columns (nor indexables) after the first append/put operation (Of course you can simply read in the data and create a new table!). @@ -4203,7 +4203,7 @@ having a very wide table, but enables more efficient queries. The ``append_to_multiple`` method splits a given single DataFrame into multiple tables according to ``d``, a dictionary that maps the -table names to a list of 'columns' you want in that table. If `None` +table names to a list of 'columns' you want in that table. If ``None`` is used in place of a list, that table will have the remaining unspecified columns of the given DataFrame. The argument ``selector`` defines which table is the selector table (which you can make queries from). @@ -4843,8 +4843,8 @@ Parquet supports partitioning of data based on the values of one or more columns df.to_parquet(path='test', engine='pyarrow', partition_cols=['a'], compression=None) -The `path` specifies the parent directory to which data will be saved. -The `partition_cols` are the column names by which the dataset will be partitioned. +The ``path`` specifies the parent directory to which data will be saved. +The ``partition_cols`` are the column names by which the dataset will be partitioned. Columns are partitioned in the order they are given. The partition splits are determined by the unique values in the partition columns. The above example creates a partitioned dataset that may look like: @@ -5495,7 +5495,7 @@ SAS formats ----------- The top-level function :func:`read_sas` can read (but not write) SAS -`xport` (.XPT) and (since *v0.18.0*) `SAS7BDAT` (.sas7bdat) format files. +XPORT (.xpt) and (since *v0.18.0*) SAS7BDAT (.sas7bdat) format files. SAS files only contain two value types: ASCII text and floating point values (usually 8 bytes but sometimes truncated). For xport files, @@ -5543,7 +5543,7 @@ SPSS formats .. 
versionadded:: 0.25.0 The top-level function :func:`read_spss` can read (but not write) SPSS -`sav` (.sav) and `zsav` (.zsav) format files. +SAV (.sav) and ZSAV (.zsav) format files. SPSS files contain column names. By default the whole file is read, categorical columns are converted into ``pd.Categorical``, @@ -5566,7 +5566,7 @@ avoid converting categorical columns into ``pd.Categorical``: df = pd.read_spss('spss_data.sav', usecols=['foo', 'bar'], convert_categoricals=False) -More information about the `sav` and `zsav` file format is available here_. +More information about the SAV and ZSAV file formats is available here_. .. _here: https://www.ibm.com/support/knowledgecenter/en/SSLVMB_22.0.0/com.ibm.spss.statistics.help/spss/base/savedatatypes.htm diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index bc8fc5a7e4f4e..aee56a2565310 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -77,7 +77,7 @@ some configurable handling of "what to do with the other axes": levels=None, names=None, verify_integrity=False, copy=True) * ``objs`` : a sequence or mapping of Series or DataFrame objects. If a - dict is passed, the sorted keys will be used as the `keys` argument, unless + dict is passed, the sorted keys will be used as the ``keys`` argument, unless it is passed, in which case the values will be selected (see below). Any None objects will be dropped silently unless they are all None in which case a ValueError will be raised. @@ -1234,7 +1234,7 @@ resetting indexes. DataFrame. .. note:: - When DataFrames are merged using only some of the levels of a `MultiIndex`, + When DataFrames are merged using only some of the levels of a ``MultiIndex``, the extra levels will be dropped from the resulting merge. In order to preserve those levels, use ``reset_index`` on those level names to move those levels to columns prior to doing the merge. 
@@ -1487,7 +1487,7 @@ compare two DataFrame or Series, respectively, and summarize their differences. This feature was added in :ref:`V1.1.0 `. -For example, you might want to compare two `DataFrame` and stack their differences +For example, you might want to compare two ``DataFrame`` and stack their differences side by side. .. ipython:: python @@ -1523,7 +1523,7 @@ If you wish, you may choose to stack the differences on rows. df.compare(df2, align_axis=0) -If you wish to keep all original rows and columns, set `keep_shape` argument +If you wish to keep all original rows and columns, set ``keep_shape`` argument to ``True``. .. ipython:: python diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 06a7c6e33768e..9294897686d46 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -251,7 +251,7 @@ can propagate non-NA values forward or backward: **Limit the amount of filling** If we only want consecutive gaps filled up to a certain number of data points, -we can use the `limit` keyword: +we can use the ``limit`` keyword: .. ipython:: python :suppress: diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 398336960e769..563fc941294d1 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -109,7 +109,7 @@ It's also possible to reset multiple options at once (using a regex): ``option_context`` context manager has been exposed through the top-level API, allowing you to execute code with given option values. Option values -are restored automatically when you exit the `with` block: +are restored automatically when you exit the ``with`` block: .. ipython:: python @@ -306,10 +306,10 @@ display.encoding UTF-8 Defaults to the detected en meant to be displayed on the console. 
display.expand_frame_repr True Whether to print out the full DataFrame repr for wide DataFrames across - multiple lines, `max_columns` is + multiple lines, ``max_columns`` is still respected, but the output will wrap-around across multiple "pages" - if its width exceeds `display.width`. + if its width exceeds ``display.width``. display.float_format None The callable should accept a floating point number and return a string with the desired format of the number. @@ -371,11 +371,11 @@ display.max_rows 60 This sets the maximum numbe fully or just a truncated or summary repr. 'None' value means unlimited. display.min_rows 10 The numbers of rows to show in a truncated - repr (when `max_rows` is exceeded). Ignored - when `max_rows` is set to None or 0. When set - to None, follows the value of `max_rows`. + repr (when ``max_rows`` is exceeded). Ignored + when ``max_rows`` is set to None or 0. When set + to None, follows the value of ``max_rows``. display.max_seq_items 100 when pretty-printing a long sequence, - no more then `max_seq_items` will + no more then ``max_seq_items`` will be printed. If items are omitted, they will be denoted by the addition of "..." to the resulting string. diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 1b90aeb00cf9c..e6797512ce3cf 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -609,8 +609,8 @@ This function is often used along with discretization functions like ``cut``: See also :func:`Series.str.get_dummies `. :func:`get_dummies` also accepts a ``DataFrame``. By default all categorical -variables (categorical in the statistical sense, those with `object` or -`categorical` dtype) are encoded as dummy variables. +variables (categorical in the statistical sense, those with ``object`` or +``categorical`` dtype) are encoded as dummy variables. .. 
ipython:: python diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index cddc3cb2600fd..206d8dd0f4739 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -214,7 +214,7 @@ work for arbitrary-sized datasets. for path in files: # Only one dataframe is in memory at a time... df = pd.read_parquet(path) - # ... plus a small Series `counts`, which is updated. + # ... plus a small Series ``counts``, which is updated. counts = counts.add(df['name'].value_counts(), fill_value=0) counts.astype(int) @@ -349,7 +349,7 @@ Now we can do things like fast random access with ``.loc``. ddf.loc['2002-01-01 12:01':'2002-01-01 12:05'].compute() -Dask knows to just look in the 3rd partition for selecting values in `2002`. It +Dask knows to just look in the 3rd partition for selecting values in 2002. It doesn't need to look at any other data. Many workflows involve a large amount of data and processing it in a way that diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index e03ba74f95c90..dd6ac37d88f08 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -266,7 +266,7 @@ i.e., from the end of the string to the beginning of the string: Some caution must be taken to keep regular expressions in mind! For example, the following code will cause trouble because of the regular expression meaning of -`$`: +``$``: .. ipython:: python diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 868bf5a1672ff..253fea122b3f8 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1800,12 +1800,12 @@ See :ref:`groupby.iterating-label` or :class:`Resampler.__iter__` for more. .. 
_timeseries.adjust-the-start-of-the-bins: -Use `origin` or `offset` to adjust the start of the bins -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Use ``origin`` or ``offset`` to adjust the start of the bins +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. versionadded:: 1.1.0 -The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like `30D`) or that divide a day evenly (like `90s` or `1min`). This can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed Timestamp with the argument ``origin``. +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like ``30D``) or that divide a day evenly (like ``90s`` or ``1min``). This can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can specify a fixed Timestamp with the argument ``origin``. For example: diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 8ce4b30c717a4..f41912445455d 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -67,7 +67,7 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column @savefig frame_plot_basic.png df.plot(); -You can plot one column versus another using the `x` and `y` keywords in +You can plot one column versus another using the ``x`` and ``y`` keywords in :meth:`~DataFrame.plot`: .. ipython:: python @@ -496,7 +496,7 @@ Area plot You can create area plots with :meth:`Series.plot.area` and :meth:`DataFrame.plot.area`. Area plots are stacked by default. To produce stacked area plot, each column must be either all positive or all negative values. 
-When input data contains `NaN`, it will be automatically filled by 0. If you want to drop or fill by different values, use :func:`dataframe.dropna` or :func:`dataframe.fillna` before calling `plot`. +When input data contains ``NaN``, it will be automatically filled by 0. If you want to drop or fill by different values, use :func:`dataframe.dropna` or :func:`dataframe.fillna` before calling ``plot``. .. ipython:: python :suppress: @@ -1078,7 +1078,7 @@ layout and formatting of the returned plot: plt.close('all') -For each kind of plot (e.g. `line`, `bar`, `scatter`) any additional arguments +For each kind of plot (e.g. ``line``, ``bar``, ``scatter``) any additional arguments keywords are passed along to the corresponding matplotlib function (:meth:`ax.plot() `, :meth:`ax.bar() `, @@ -1271,7 +1271,7 @@ Using the ``x_compat`` parameter, you can suppress this behavior: plt.close('all') If you have more than one plot that needs to be suppressed, the ``use`` method -in ``pandas.plotting.plot_params`` can be used in a `with statement`: +in ``pandas.plotting.plot_params`` can be used in a ``with`` statement: .. 
ipython:: python From 019d03a11b5cdeb7e59636b2ef5fe1c1fb5b37ac Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Sep 2020 18:14:07 -0700 Subject: [PATCH 0906/1025] CLN: simplify interpolate_2d and callers (#36624) --- pandas/core/arrays/categorical.py | 2 +- pandas/core/internals/blocks.py | 11 ++++------- pandas/core/missing.py | 13 ++----------- pandas/tests/series/methods/test_interpolate.py | 8 ++++++++ 4 files changed, 15 insertions(+), 19 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d2f88b353e1c1..4e83284cb96ed 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1643,7 +1643,7 @@ def fillna(self, value=None, method=None, limit=None): # TODO: dispatch when self.categories is EA-dtype values = np.asarray(self).reshape(-1, len(self)) - values = interpolate_2d(values, method, 0, None, value).astype( + values = interpolate_2d(values, method, 0, None).astype( self.categories.dtype )[0] codes = _get_codes_for_values(values, self.categories) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f18bc4d0bcf85..278d71068b7bf 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1181,12 +1181,15 @@ def interpolate( m = None if m is not None: + if fill_value is not None: + # similar to validate_fillna_kwargs + raise ValueError("Cannot pass both fill_value and method") + return self._interpolate_with_fill( method=m, axis=axis, inplace=inplace, limit=limit, - fill_value=fill_value, coerce=coerce, downcast=downcast, ) @@ -1214,7 +1217,6 @@ def _interpolate_with_fill( axis: int = 0, inplace: bool = False, limit: Optional[int] = None, - fill_value: Optional[Any] = None, coerce: bool = False, downcast: Optional[str] = None, ) -> List["Block"]: @@ -1232,16 +1234,11 @@ def _interpolate_with_fill( values = self.values if inplace else self.values.copy() - # We only get here for non-ExtensionBlock - fill_value = 
convert_scalar_for_putitemlike(fill_value, self.values.dtype) - values = missing.interpolate_2d( values, method=method, axis=axis, limit=limit, - fill_value=fill_value, - dtype=self.dtype, ) blocks = [self.make_block_same_class(values, ndim=self.ndim)] diff --git a/pandas/core/missing.py b/pandas/core/missing.py index edcdf2f54bc4c..f4182027e9e04 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -545,8 +545,6 @@ def interpolate_2d( method="pad", axis=0, limit=None, - fill_value=None, - dtype: Optional[DtypeObj] = None, ): """ Perform an actual interpolation of values, values will be make 2-d if @@ -563,18 +561,11 @@ def interpolate_2d( raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0") values = values.reshape(tuple((1,) + values.shape)) - if fill_value is None: - mask = None - else: # todo create faster fill func without masking - mask = mask_missing(transf(values), fill_value) - method = clean_fill_method(method) if method == "pad": - values = transf(pad_2d(transf(values), limit=limit, mask=mask, dtype=dtype)) + values = transf(pad_2d(transf(values), limit=limit)) else: - values = transf( - backfill_2d(transf(values), limit=limit, mask=mask, dtype=dtype) - ) + values = transf(backfill_2d(transf(values), limit=limit)) # reshape back if ndim == 1: diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py index cba9443005f2f..9fc468221ee2d 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -340,6 +340,14 @@ def test_interp_invalid_method(self, invalid_method): with pytest.raises(ValueError, match=msg): s.interpolate(method=invalid_method, limit=-1) + def test_interp_invalid_method_and_value(self): + # GH#36624 + ser = Series([1, 3, np.nan, 12, np.nan, 25]) + + msg = "Cannot pass both fill_value and method" + with pytest.raises(ValueError, match=msg): + ser.interpolate(fill_value=3, method="pad") + def 
test_interp_limit_forward(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) From d0be299630b2c781df4efb0e55536de38854e754 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Sep 2020 18:15:21 -0700 Subject: [PATCH 0907/1025] BUG: ndarray[td64] // TimedeltaArray (#36646) --- pandas/core/arrays/timedeltas.py | 3 +-- pandas/tests/arithmetic/test_timedelta64.py | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 3eaf428bc64b2..25c10516abb6b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -689,13 +689,12 @@ def __rfloordiv__(self, other): elif is_timedelta64_dtype(other.dtype): other = type(self)(other) - # numpy timedelta64 does not natively support floordiv, so operate # on the i8 values result = other.asi8 // self.asi8 mask = self._isnan | other._isnan if mask.any(): - result = result.astype(np.int64) + result = result.astype(np.float64) result[mask] = np.nan return result diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 64d3d5b6d684d..c5bec61359a07 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1750,6 +1750,10 @@ def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array): tm.assert_equal(result, expected) + # case that goes through __rfloordiv__ with arraylike + result = np.asarray(left) // right + tm.assert_equal(result, expected) + def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td): # GH#18831 td1 = Series([timedelta(minutes=5, seconds=3)] * 3) From 18f5b74cf8ec3ab02eff041017a757bf26479782 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Fri, 25 Sep 2020 20:16:36 -0500 Subject: [PATCH 0908/1025] CLN: Fix some spelling (#36644) --- doc/source/development/policies.rst | 2 +- doc/source/user_guide/timeseries.rst | 6 +++--- 
doc/source/whatsnew/v0.17.0.rst | 2 +- doc/source/whatsnew/v0.20.0.rst | 2 +- doc/source/whatsnew/v0.23.0.rst | 4 ++-- doc/source/whatsnew/v0.24.0.rst | 6 +++--- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/aggregation.py | 2 +- pandas/core/arrays/datetimelike.py | 4 ++-- pandas/core/arrays/sparse/array.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 10 +++++----- pandas/core/groupby/generic.py | 2 +- pandas/core/indexers.py | 2 +- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/internals/managers.py | 2 +- pandas/core/strings.py | 4 ++-- pandas/errors/__init__.py | 2 +- pandas/io/parsers.py | 2 +- pandas/io/stata.py | 2 +- pandas/plotting/_misc.py | 2 +- pandas/tests/groupby/aggregate/test_aggregate.py | 2 +- 23 files changed, 34 insertions(+), 34 deletions(-) diff --git a/doc/source/development/policies.rst b/doc/source/development/policies.rst index a564afc408df9..ced5b686b8246 100644 --- a/doc/source/development/policies.rst +++ b/doc/source/development/policies.rst @@ -16,7 +16,7 @@ deprecations, API compatibility, and version numbering. A pandas release number is made up of ``MAJOR.MINOR.PATCH``. -API breaking changes should only occur in **major** releases. Theses changes +API breaking changes should only occur in **major** releases. These changes will be documented, with clear guidance on what is changing, why it's changing, and how to migrate existing code to the new behavior. diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 253fea122b3f8..d3d2bf8c72ba3 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -282,20 +282,20 @@ You can pass only the columns that you need to assemble. Invalid data ~~~~~~~~~~~~ -The default behavior, ``errors='raise'``, is to raise when unparseable: +The default behavior, ``errors='raise'``, is to raise when unparsable: .. 
code-block:: ipython In [2]: pd.to_datetime(['2009/07/31', 'asd'], errors='raise') ValueError: Unknown string format -Pass ``errors='ignore'`` to return the original input when unparseable: +Pass ``errors='ignore'`` to return the original input when unparsable: .. ipython:: python pd.to_datetime(['2009/07/31', 'asd'], errors='ignore') -Pass ``errors='coerce'`` to convert unparseable data to ``NaT`` (not a time): +Pass ``errors='coerce'`` to convert unparsable data to ``NaT`` (not a time): .. ipython:: python diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index 11c252192be6b..db2790242412f 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -40,7 +40,7 @@ Highlights include: - Plotting methods are now available as attributes of the ``.plot`` accessor, see :ref:`here ` - The sorting API has been revamped to remove some long-time inconsistencies, see :ref:`here ` - Support for a ``datetime64[ns]`` with timezones as a first-class dtype, see :ref:`here ` -- The default for ``to_datetime`` will now be to ``raise`` when presented with unparseable formats, +- The default for ``to_datetime`` will now be to ``raise`` when presented with unparsable formats, previously this would return the original input. Also, date parse functions now return consistent results. See :ref:`here ` - The default for ``dropna`` in ``HDFStore`` has changed to ``False``, to store by default all rows even diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 09980b52b6b3a..3f7a89112958b 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -1201,7 +1201,7 @@ Modules privacy has changed Some formerly public python/c/c++/cython extension modules have been moved and/or renamed. These are all removed from the public API. Furthermore, the ``pandas.core``, ``pandas.compat``, and ``pandas.util`` top-level modules are now considered to be PRIVATE. 
-If indicated, a deprecation warning will be issued if you reference theses modules. (:issue:`12588`) +If indicated, a deprecation warning will be issued if you reference these modules. (:issue:`12588`) .. csv-table:: :header: "Previous Location", "New Location", "Deprecated" diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index f91d89679dad1..61e92e2356da9 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -998,7 +998,7 @@ Datetimelike API changes - Addition and subtraction of ``NaN`` from a :class:`Series` with ``dtype='timedelta64[ns]'`` will raise a ``TypeError`` instead of treating the ``NaN`` as ``NaT`` (:issue:`19274`) - ``NaT`` division with :class:`datetime.timedelta` will now return ``NaN`` instead of raising (:issue:`17876`) - Operations between a :class:`Series` with dtype ``dtype='datetime64[ns]'`` and a :class:`PeriodIndex` will correctly raises ``TypeError`` (:issue:`18850`) -- Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mis-matched timezones will raise ``TypeError`` instead of ``ValueError`` (:issue:`18817`) +- Subtraction of :class:`Series` with timezone-aware ``dtype='datetime64[ns]'`` with mismatched timezones will raise ``TypeError`` instead of ``ValueError`` (:issue:`18817`) - :class:`Timestamp` will no longer silently ignore unused or invalid ``tz`` or ``tzinfo`` keyword arguments (:issue:`17690`) - :class:`Timestamp` will no longer silently ignore invalid ``freq`` arguments (:issue:`5168`) - :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`) @@ -1273,7 +1273,7 @@ Timedelta - Bug in :func:`Period.asfreq` where periods near ``datetime(1, 1, 1)`` could be converted incorrectly (:issue:`19643`, :issue:`19834`) - Bug in :func:`Timedelta.total_seconds()` causing precision errors, for example ``Timedelta('30S').total_seconds()==30.000000000000004`` (:issue:`19458`) - Bug 
in :func:`Timedelta.__rmod__` where operating with a ``numpy.timedelta64`` returned a ``timedelta64`` object instead of a ``Timedelta`` (:issue:`19820`) -- Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mis-match (:issue:`19333`) +- Multiplication of :class:`TimedeltaIndex` by ``TimedeltaIndex`` will now raise ``TypeError`` instead of raising ``ValueError`` in cases of length mismatch (:issue:`19333`) - Bug in indexing a :class:`TimedeltaIndex` with a ``np.timedelta64`` object which was raising a ``TypeError`` (:issue:`20393`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 5bfaa7a5a3e6b..27cbdc9169965 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -419,7 +419,7 @@ Other enhancements - :meth:`Index.difference`, :meth:`Index.intersection`, :meth:`Index.union`, and :meth:`Index.symmetric_difference` now have an optional ``sort`` parameter to control whether the results should be sorted if possible (:issue:`17839`, :issue:`24471`) - :meth:`read_excel()` now accepts ``usecols`` as a list of column names or callable (:issue:`18273`) - :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object. 
-- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`) +- :meth:`DataFrame.to_stata` and :class:`pandas.io.stata.StataWriter117` can write mixed string columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the ``axis`` parameter (:issue:`8839`) - :meth:`DataFrame.to_records` now accepts ``index_dtypes`` and ``column_dtypes`` parameters to allow different data types in stored column and index records (:issue:`18146`) - :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) @@ -510,7 +510,7 @@ even when ``'\n'`` was passed in ``line_terminator``. *New behavior* on Windows: -Passing ``line_terminator`` explicitly, set thes ``line terminator`` to that character. +Passing ``line_terminator`` explicitly, set the ``line terminator`` to that character. .. code-block:: ipython @@ -1885,7 +1885,7 @@ Reshaping - :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) - Constructing a DataFrame with an index argument that wasn't already an instance of :class:`~pandas.core.Index` was broken (:issue:`22227`). - Bug in :class:`DataFrame` prevented list subclasses to be used to construction (:issue:`21226`) -- Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a missleading error message when the resulting DataFrame has more elements than int32 can handle. Now, the error message is improved, pointing towards the actual problem (:issue:`20601`) +- Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a misleading error message when the resulting DataFrame has more elements than int32 can handle. 
Now, the error message is improved, pointing towards the actual problem (:issue:`20601`) - Bug in :func:`DataFrame.unstack` where a ``ValueError`` was raised when unstacking timezone aware values (:issue:`18338`) - Bug in :func:`DataFrame.stack` where timezone aware values were converted to timezone naive values (:issue:`19420`) - Bug in :func:`merge_asof` where a ``TypeError`` was raised when ``by_col`` were timezone aware values (:issue:`21184`) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2a8b6fe3ade6a..031c74b1cc367 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -325,7 +325,7 @@ I/O - :meth:`to_csv` did not support zip compression for binary file object not having a filename (:issue:`35058`) - :meth:`to_csv` and :meth:`read_csv` did not honor ``compression`` and ``encoding`` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`) - :meth:`to_picke` and :meth:`read_pickle` did not support compression for file-objects (:issue:`26237`, :issue:`29054`, and :issue:`29570`) -- Bug in :func:`LongTableBuilder.middle_separator` was duplicating LaTeX longtable entires in the List of Tables of a LaTeX document (:issue:`34360`) +- Bug in :func:`LongTableBuilder.middle_separator` was duplicating LaTeX longtable entries in the List of Tables of a LaTeX document (:issue:`34360`) - Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) - Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index c813b65d3cbb7..71b9a658202a5 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -377,7 +377,7 @@ def validate_func_kwargs( (['one', 'two'], ['min', 'max']) """ 
no_arg_message = "Must provide 'func' or named aggregation **kwargs." - tuple_given_message = "func is expected but recieved {} in **kwargs." + tuple_given_message = "func is expected but received {} in **kwargs." columns = list(kwargs) func = [] for col_func in kwargs.values(): diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 6752a98345b6a..c90610bdd920c 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -168,7 +168,7 @@ def _unbox_scalar(self, value: DTScalarOrNaT, setitem: bool = False) -> int: value : Period, Timestamp, Timedelta, or NaT Depending on subclass. setitem : bool, default False - Whether to check compatiblity with setitem strictness. + Whether to check compatibility with setitem strictness. Returns ------- @@ -1123,7 +1123,7 @@ def _sub_period(self, other): raise TypeError(f"cannot subtract Period from a {type(self).__name__}") def _add_period(self, other: Period): - # Overriden by TimedeltaArray + # Overridden by TimedeltaArray raise TypeError(f"cannot add Period to a {type(self).__name__}") def _add_offset(self, offset): diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 7dbb6e7e47b23..d4ec641794fc2 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -986,7 +986,7 @@ def _concat_same_type(cls, to_concat): # get an identical index as concating the values and then # creating a new index. We don't want to spend the time trying # to merge blocks across arrays in `to_concat`, so the resulting - # BlockIndex may have more blocs. + # BlockIndex may have more blocks. blengths = [] blocs = [] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ef30d989dfbd2..cd85cce361cb4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5133,7 +5133,7 @@ def drop_duplicates( 0 Yum Yum cup 4.0 2 Indomie cup 3.5 - To remove duplicates and keep last occurences, use ``keep``. 
+ To remove duplicates and keep last occurrences, use ``keep``. >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') brand style rating diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a8b48f875c825..bd720151fb15e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3406,7 +3406,7 @@ def _maybe_update_cacher( if cacher is not None: ref = cacher[1]() - # we are trying to reference a dead referant, hence + # we are trying to reference a dead referent, hence # a copy if ref is None: del self._cacher @@ -3420,7 +3420,7 @@ def _maybe_update_cacher( ref._item_cache.pop(cacher[0], None) if verify_is_copy: - self._check_setitem_copy(stacklevel=5, t="referant") + self._check_setitem_copy(stacklevel=5, t="referent") if clear: self._clear_item_cache() @@ -3781,10 +3781,10 @@ def _check_is_chained_assignment_possible(self) -> bool_t: if self._is_view and self._is_cached: ref = self._get_cacher() if ref is not None and ref._is_mixed_type: - self._check_setitem_copy(stacklevel=4, t="referant", force=True) + self._check_setitem_copy(stacklevel=4, t="referent", force=True) return True elif self._is_copy: - self._check_setitem_copy(stacklevel=4, t="referant") + self._check_setitem_copy(stacklevel=4, t="referent") return False def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): @@ -3837,7 +3837,7 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): if isinstance(self._is_copy, str): t = self._is_copy - elif t == "referant": + elif t == "referent": t = ( "\n" "A value is trying to be set on a copy of a slice from a " diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 29f13107f750a..4cbbe08756ca7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1430,7 +1430,7 @@ def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFram except AssertionError: raise except Exception: - # GH#29631 For user-defined function, we cant 
predict what may be + # GH#29631 For user-defined function, we can't predict what may be # raised; see test_transform.test_transform_fastpath_raises return path, res diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 6c88ae1e03cda..e48a42599a2a0 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -144,7 +144,7 @@ def check_setitem_lengths(indexer, value, values) -> bool: no_op = False if isinstance(indexer, (np.ndarray, list)): - # We can ignore other listlikes becasue they are either + # We can ignore other listlikes because they are either # a) not necessarily 1-D indexers, e.g. tuple # b) boolean indexers e.g. BoolArray if is_list_like(value): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 5f2b901844dad..84489c1033d8c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4503,7 +4503,7 @@ def sort_values( idx = ensure_key_mapped(self, key) # GH 35584. Sort missing values according to na_position kwarg - # ignore na_position for MutiIndex + # ignore na_position for MultiIndex if not isinstance( self, (ABCMultiIndex, ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex) ): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 2f43787919faa..3fcc40c90b98e 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1023,7 +1023,7 @@ def intersection( def _intersection_unique(self, other: "IntervalIndex") -> "IntervalIndex": """ Used when the IntervalIndex does not have any common endpoint, - no mater left or right. + no matter left or right. Return the intersection with another IntervalIndex. 
Parameters diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 865412f159ea1..f2480adce89b4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -225,7 +225,7 @@ def set_axis(self, axis: int, new_labels: Index) -> None: @property def _is_single_block(self) -> bool: - # Assumes we are 2D; overriden by SingleBlockManager + # Assumes we are 2D; overridden by SingleBlockManager return len(self.blocks) == 1 def _rebuild_blknos_and_blklocs(self) -> None: diff --git a/pandas/core/strings.py b/pandas/core/strings.py index ab6c9cfb51414..4467c96041dc7 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -1470,7 +1470,7 @@ def str_pad(arr, width, side="left", fillchar=" "): character. Equivalent to ``Series.str.pad(side='left')``. Series.str.ljust : Fills the right side of strings with an arbitrary character. Equivalent to ``Series.str.pad(side='right')``. - Series.str.center : Fills boths sides of strings with an arbitrary + Series.str.center : Fills both sides of strings with an arbitrary character. Equivalent to ``Series.str.pad(side='both')``. Series.str.zfill : Pad strings in the Series/Index by prepending '0' character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. @@ -2918,7 +2918,7 @@ def zfill(self, width): character. Series.str.pad : Fills the specified sides of strings with an arbitrary character. - Series.str.center : Fills boths sides of strings with an arbitrary + Series.str.center : Fills both sides of strings with an arbitrary character. Notes diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 15389ca2c3e61..ea60ae5c1d227 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -225,7 +225,7 @@ class DuplicateLabelError(ValueError): class InvalidIndexError(Exception): """ - Exception raised when attemping to use an invalid index key. + Exception raised when attempting to use an invalid index key. .. 
versionadded:: 1.1.0 """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bc622ab8c1f18..c839129b91e12 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -227,7 +227,7 @@ result 'foo' If a column or index cannot be represented as an array of datetimes, - say because of an unparseable value or a mixture of timezones, the column + say because of an unparsable value or a mixture of timezones, the column or index will be returned unaltered as an object data type. For non-standard datetime parsing, use ``pd.to_datetime`` after ``pd.read_csv``. To parse an index or column with a mixture of timezones, diff --git a/pandas/io/stata.py b/pandas/io/stata.py index a8af84e42918d..5d34b4a7855ce 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -499,7 +499,7 @@ class CategoricalConversionWarning(Warning): dataset with an iterator results in categorical variable with different categories. This occurs since it is not possible to know all possible values until the entire dataset has been read. To avoid this warning, you can either -read dataset without an interator, or manually convert categorical data by +read dataset without an iterator, or manually convert categorical data by ``convert_categoricals`` to False and then accessing the variable labels through the value_labels method of the reader. """ diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index 9410dbfe8e90a..6e473bf5b182c 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -318,7 +318,7 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds): Examples -------- - This example draws a basic bootstap plot for a Series. + This example draws a basic bootstrap plot for a Series. .. 
plot:: :context: close-figs diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index c96333bc48dd4..4a0ea5f520873 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -564,7 +564,7 @@ def test_mangled(self): def test_named_agg_nametuple(self, inp): # GH34422 s = pd.Series([1, 1, 2, 2, 3, 3, 4, 5]) - msg = f"func is expected but recieved {type(inp).__name__}" + msg = f"func is expected but received {type(inp).__name__}" with pytest.raises(TypeError, match=msg): s.groupby(s.values).agg(a=inp) From 6858f29fb566bc8308a05a13455130a42e1dbbdf Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Fri, 25 Sep 2020 21:17:58 -0400 Subject: [PATCH 0909/1025] BUG: propagate dropna in pd.Grouper (#36604) --- pandas/core/groupby/grouper.py | 11 ++++++++++- pandas/core/groupby/ops.py | 2 ++ pandas/tests/groupby/test_groupby_dropna.py | 8 ++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 6263d5337f42f..a509acb3604e1 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -99,6 +99,13 @@ class Grouper: .. versionadded:: 1.1.0 + dropna : bool, default True + If True, and if group keys contain NA values, NA values together with + row/column will be dropped. If False, NA values will also be treated as + the key in groups. + + .. 
versionadded:: 1.2.0 + Returns ------- A specification for a groupby instruction @@ -820,7 +827,9 @@ def is_in_obj(gpr) -> bool: groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) + grouper = ops.BaseGrouper( + group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna + ) return grouper, exclusions, obj diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b3f91d4623c84..17539cdf451e3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -87,6 +87,7 @@ def __init__( group_keys: bool = True, mutated: bool = False, indexer: Optional[np.ndarray] = None, + dropna: bool = True, ): assert isinstance(axis, Index), axis @@ -97,6 +98,7 @@ def __init__( self.group_keys = group_keys self.mutated = mutated self.indexer = indexer + self.dropna = dropna @property def groupings(self) -> List["grouper.Grouping"]: diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index deb73acbb158a..cd6c17955c18d 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -162,6 +162,14 @@ def test_groupby_dropna_series_by(dropna, expected): tm.assert_series_equal(result, expected) +@pytest.mark.parametrize("dropna", (False, True)) +def test_grouper_dropna_propagation(dropna): + # GH 36604 + df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}) + gb = df.groupby("A", dropna=dropna) + assert gb.grouper.dropna == dropna + + @pytest.mark.parametrize( "dropna,df_expected,s_expected", [ From 61ba57bc6dca8226c75c0ebcb4b11b44c157f5da Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Sat, 26 Sep 2020 03:19:03 +0200 Subject: [PATCH 0910/1025] Add generate pip dependency's from conda to pre-commit (#36531) --- .pre-commit-config.yaml | 9 +++++++++ 1 file 
changed, 9 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ad36a68c448a9..53ab61afe900b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -48,3 +48,12 @@ repos: v0\.| v1\.0\.| v1\.1\.[012]) +- repo: local + hooks: + - id: pip_to_conda + name: Generate pip dependency from conda + description: This hook checks if the conda environment.yml and requirements-dev.txt are equal + language: system + entry: python -m scripts.generate_pip_deps_from_conda + files: ^(environment.yml|requirements-dev.txt)$ + pass_filenames: false From cd038326a69b4e7b0ed19496114ec072b358cac4 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 25 Sep 2020 18:19:53 -0700 Subject: [PATCH 0911/1025] CLN: share setitem/getitem validators (#36619) --- pandas/core/arrays/_mixins.py | 2 ++ pandas/core/arrays/categorical.py | 5 ++--- pandas/core/arrays/numpy_.py | 14 -------------- 3 files changed, 4 insertions(+), 17 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 2bf530eb2bad4..4d13a18c8ef0b 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -14,6 +14,7 @@ from pandas.core.algorithms import take, unique from pandas.core.array_algos.transforms import shift from pandas.core.arrays.base import ExtensionArray +from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer _T = TypeVar("_T", bound="NDArrayBackedExtensionArray") @@ -197,6 +198,7 @@ def __getitem__(self, key): return result def _validate_getitem_key(self, key): + key = extract_array(key, extract_numpy=True) return check_array_indexer(self, key) @doc(ExtensionArray.fillna) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4e83284cb96ed..16406dd54b577 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -47,7 +47,7 @@ from pandas.core.base import ExtensionArray, NoNewAttributesMixin, PandasObject 
import pandas.core.common as com from pandas.core.construction import array, extract_array, sanitize_array -from pandas.core.indexers import check_array_indexer, deprecate_ndim_indexing +from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort @@ -1923,8 +1923,7 @@ def _validate_setitem_key(self, key): # else: array of True/False in Series or Categorical - key = check_array_indexer(self, key) - return key + return super()._validate_setitem_key(key) def _reverse_indexer(self) -> Dict[Hashable, np.ndarray]: """ diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index f65b130b396da..6b982bf579f04 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -16,7 +16,6 @@ from pandas.core.array_algos import masked_reductions from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionOpsMixin -from pandas.core.construction import extract_array class PandasDtype(ExtensionDtype): @@ -244,19 +243,6 @@ def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): # ------------------------------------------------------------------------ # Pandas ExtensionArray Interface - def _validate_getitem_key(self, key): - if isinstance(key, type(self)): - key = key._ndarray - - return super()._validate_getitem_key(key) - - def _validate_setitem_value(self, value): - value = extract_array(value, extract_numpy=True) - - if not lib.is_scalar(value): - value = np.asarray(value, dtype=self._ndarray.dtype) - return value - def isna(self) -> np.ndarray: return isna(self._ndarray) From a3a72651d4788a110c8d82db76854fd3bf170499 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Sat, 26 Sep 2020 03:22:28 +0200 Subject: [PATCH 0912/1025] [BUG]: Fix bug with pre epoch normalization (#36557) --- 
doc/source/whatsnew/v1.1.3.rst | 1 + pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/tests/scalar/timestamp/test_unary_ops.py | 6 ++++++ pandas/tests/series/test_datetime_values.py | 8 ++++++++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index c63a78c76572f..4ad85fd6bafa6 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -38,6 +38,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.apply` with ``raw=True`` and user-function returning string (:issue:`35940`) - Fixed regression when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) - Fixed regression in :class:`Period` incorrect value for ordinal over the maximum timestamp (:issue:`36430`) +- Fixed regression in :meth:`Series.dt.normalize` when normalizing pre-epoch dates the result was shifted one day (:issue:`36294`) .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index adf1dfbc1ac72..3b52b4d499694 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -830,7 +830,7 @@ cpdef inline datetime localize_pydatetime(datetime dt, object tz): # ---------------------------------------------------------------------- # Normalization -@cython.cdivision +@cython.cdivision(False) cdef inline int64_t normalize_i8_stamp(int64_t local_val) nogil: """ Round the localized nanosecond timestamp down to the previous midnight. 
diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index 8641bbd0a66f2..e8196cd8328e7 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -397,6 +397,12 @@ def test_normalize(self, tz_naive_fixture, arg): expected = Timestamp("2013-11-30", tz=tz) assert result == expected + def test_normalize_pre_epoch_dates(self): + # GH: 36294 + result = Timestamp("1969-01-01 09:00:00").normalize() + expected = Timestamp("1969-01-01 00:00:00") + assert result == expected + # -------------------------------------------------------------- @td.skip_if_windows diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py index 723bd303b1974..b0926089bd7b4 100644 --- a/pandas/tests/series/test_datetime_values.py +++ b/pandas/tests/series/test_datetime_values.py @@ -702,3 +702,11 @@ def test_week_and_weekofyear_are_deprecated(): series.dt.week with tm.assert_produces_warning(FutureWarning): series.dt.weekofyear + + +def test_normalize_pre_epoch_dates(): + # GH: 36294 + s = pd.to_datetime(pd.Series(["1969-01-01 09:00:00", "2016-01-01 09:00:00"])) + result = s.dt.normalize() + expected = pd.to_datetime(pd.Series(["1969-01-01", "2016-01-01"])) + tm.assert_series_equal(result, expected) From bacc7f9e233baa458dfec425810a81274b8adbd1 Mon Sep 17 00:00:00 2001 From: Number42 <32516498+QuentinN42@users.noreply.github.com> Date: Sat, 26 Sep 2020 03:25:19 +0200 Subject: [PATCH 0913/1025] BUG: inconsistent replace (#36444) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/internals/blocks.py | 5 ++++- pandas/tests/frame/methods/test_replace.py | 25 ++++++++++++++++++++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 4ad85fd6bafa6..7c7e40e633acc 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -34,6 +34,7 @@ 
Fixed regressions - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`) +- Fixed regression in :meth:`DataFrame.replace` inconsistent replace when using a float in the replace method (:issue:`35376`) - Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`, :issue:`36377`) - Fixed regression in :meth:`DataFrame.apply` with ``raw=True`` and user-function returning string (:issue:`35940`) - Fixed regression when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 278d71068b7bf..09f276be7d64a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -36,6 +36,7 @@ is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, + is_float, is_float_dtype, is_integer, is_integer_dtype, @@ -2064,7 +2065,9 @@ def _can_hold_element(self, element: Any) -> bool: and not issubclass(tipo.type, (np.datetime64, np.timedelta64)) and self.dtype.itemsize >= tipo.itemsize ) - return is_integer(element) + # We have not inferred an integer from the dtype + # check if we have a builtin int or a float equal to an int + return is_integer(element) or (is_float(element) and element.is_integer()) class DatetimeLikeBlockMixin: diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index a77753ed9f9d0..a9cf840470ae0 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -974,6 +974,31 @@ def 
test_replace_for_new_dtypes(self, datetime_frame): } ), ), + # GH 35376 + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1.0, + 5, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1, + 5, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1.0, + 5.0, + DataFrame([[5, 5.0], [2, 2.0]]), + ), + ( + DataFrame([[1, 1.0], [2, 2.0]]), + 1, + 5.0, + DataFrame([[5, 5.0], [2, 2.0]]), + ), ], ) def test_replace_dtypes(self, frame, to_replace, value, expected): From cfb8462e5a6ae4b5bb5b1cb1cc9246b61ae2d16c Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 26 Sep 2020 02:31:38 +0100 Subject: [PATCH 0914/1025] Partial Revert "ENH: infer freq in timedelta_range (#32377)" (#36595) --- doc/source/whatsnew/v1.1.3.rst | 3 ++- pandas/core/arrays/timedeltas.py | 4 ---- pandas/core/indexes/timedeltas.py | 4 ++-- pandas/tests/arithmetic/test_timedelta64.py | 17 +++++++++++++++++ .../indexes/timedeltas/test_timedelta_range.py | 6 +++++- 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 7c7e40e633acc..aeb9076617787 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -31,6 +31,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.agg`, :meth:`DataFrame.apply`, :meth:`Series.agg`, and :meth:`Series.apply` where internal suffix is exposed to the users when no relabelling is applied (:issue:`36189`) - Fixed regression in :class:`IntegerArray` unary plus and minus operations raising a ``TypeError`` (:issue:`36063`) +- Fixed regression when adding a :meth:`timedelta_range` to a :class:``Timestamp`` raised an ``ValueError`` (:issue:`35897`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - Fixed regression in 
:meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`) @@ -62,7 +63,7 @@ Bug fixes Other ~~~~~ -- +- Reverted enhancement added in pandas-1.1.0 where :func:`timedelta_range` infers a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 25c10516abb6b..145380ecce9fd 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -264,10 +264,6 @@ def _generate_range(cls, start, end, periods, freq, closed=None): index = generate_regular_range(start, end, periods, freq) else: index = np.linspace(start.value, end.value, periods).astype("i8") - if len(index) >= 2: - # Infer a frequency - td = Timedelta(index[1] - index[0]) - freq = to_offset(td) if not left_closed: index = index[1:] diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index df08fda78823d..20ebc80c7e0af 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -328,8 +328,8 @@ def timedelta_range( >>> pd.timedelta_range(start='1 day', end='5 days', periods=4) TimedeltaIndex(['1 days 00:00:00', '2 days 08:00:00', '3 days 16:00:00', - '5 days 00:00:00'], - dtype='timedelta64[ns]', freq='32H') + '5 days 00:00:00'], + dtype='timedelta64[ns]', freq=None) """ if freq is None and com.any_none(periods, start, end): freq = "D" diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index c5bec61359a07..68bedcc099a91 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -2140,3 +2140,20 @@ def test_td64arr_pow_invalid(self, scalar_td, box_with_array): with pytest.raises(TypeError, match=pattern): td1 ** scalar_td + + +def 
test_add_timestamp_to_timedelta(): + # GH: 35897 + timestamp = pd.Timestamp.now() + result = timestamp + pd.timedelta_range("0s", "1s", periods=31) + expected = pd.DatetimeIndex( + [ + timestamp + + ( + pd.to_timedelta("0.033333333s") * i + + pd.to_timedelta("0.000000001s") * divmod(i, 3)[0] + ) + for i in range(31) + ] + ) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/timedeltas/test_timedelta_range.py b/pandas/tests/indexes/timedeltas/test_timedelta_range.py index 7d78fbf9ff190..dc3df4427f351 100644 --- a/pandas/tests/indexes/timedeltas/test_timedelta_range.py +++ b/pandas/tests/indexes/timedeltas/test_timedelta_range.py @@ -38,7 +38,6 @@ def test_linspace_behavior(self, periods, freq): result = timedelta_range(start="0 days", end="4 days", periods=periods) expected = timedelta_range(start="0 days", end="4 days", freq=freq) tm.assert_index_equal(result, expected) - assert result.freq == freq def test_errors(self): # not enough params @@ -79,3 +78,8 @@ def test_timedelta_range_freq_divide_end(self, start, end, freq, expected_period assert Timedelta(start) == res[0] assert Timedelta(end) >= res[-1] assert len(res) == expected_periods + + def test_timedelta_range_infer_freq(self): + # https://github.com/pandas-dev/pandas/issues/35897 + result = timedelta_range("0s", "1s", periods=31) + assert result.freq is None From 435d496aa86d59d33f8df79e3d6b832631f523ec Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Sat, 26 Sep 2020 14:11:25 +0700 Subject: [PATCH 0915/1025] PERF: fix long string representation (#36638) --- pandas/io/formats/format.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 8e1deb21bf8ea..acc6c47efd236 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -10,7 +10,6 @@ from functools import partial from io import StringIO import math -from 
operator import itemgetter import re from shutil import get_terminal_size from typing import ( @@ -592,6 +591,7 @@ def __init__( self.max_cols_fitted = self._calc_max_cols_fitted() self.max_rows_fitted = self._calc_max_rows_fitted() + self.tr_frame = self.frame self._truncate() self.adj = get_adjustment() @@ -730,8 +730,6 @@ def _truncate(self) -> None: """ Check whether the frame should be truncated. If so, slice the frame up. """ - self.tr_frame = self.frame.copy() - if self.is_truncated_horizontally: self._truncate_horizontally() @@ -749,17 +747,16 @@ def _truncate_horizontally(self) -> None: assert self.max_cols_fitted is not None col_num = self.max_cols_fitted // 2 if col_num >= 1: - cols_to_keep = [ - x - for x in range(self.frame.shape[1]) - if x < col_num or x >= len(self.frame.columns) - col_num - ] - self.tr_frame = self.tr_frame.iloc[:, cols_to_keep] + left = self.tr_frame.iloc[:, :col_num] + right = self.tr_frame.iloc[:, -col_num:] + self.tr_frame = concat((left, right), axis=1) # truncate formatter if isinstance(self.formatters, (list, tuple)): - slicer = itemgetter(*cols_to_keep) - self.formatters = slicer(self.formatters) + self.formatters = [ + *self.formatters[:col_num], + *self.formatters[-col_num:], + ] else: col_num = cast(int, self.max_cols) self.tr_frame = self.tr_frame.iloc[:, :col_num] @@ -775,12 +772,9 @@ def _truncate_vertically(self) -> None: assert self.max_rows_fitted is not None row_num = self.max_rows_fitted // 2 if row_num >= 1: - rows_to_keep = [ - x - for x in range(self.frame.shape[0]) - if x < row_num or x >= len(self.frame) - row_num - ] - self.tr_frame = self.tr_frame.iloc[rows_to_keep, :] + head = self.tr_frame.iloc[:row_num, :] + tail = self.tr_frame.iloc[-row_num:, :] + self.tr_frame = concat((head, tail)) else: row_num = cast(int, self.max_rows) self.tr_frame = self.tr_frame.iloc[:row_num, :] From b2f687476b66117aeb847de2925ae3ea23c8aac4 Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: 
Sat, 26 Sep 2020 12:31:10 +0200 Subject: [PATCH 0916/1025] [BUG]: Fix regression in read_table with delim_whitespace=True (#36560) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/io/parsers.py | 10 ++++++++++ pandas/tests/io/parser/test_common.py | 21 +++++++++++++++++++++ 3 files changed, 32 insertions(+) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index aeb9076617787..eded30ca45025 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -40,6 +40,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.apply` with ``raw=True`` and user-function returning string (:issue:`35940`) - Fixed regression when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) - Fixed regression in :class:`Period` incorrect value for ordinal over the maximum timestamp (:issue:`36430`) +- Fixed regression in :func:`read_table` raised ``ValueError`` when ``delim_whitespace`` was set to ``True`` (:issue:`35958`) - Fixed regression in :meth:`Series.dt.normalize` when normalizing pre-epoch dates the result was shifted one day (:issue:`36294`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c839129b91e12..e5b7aea895f86 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -757,6 +757,16 @@ def read_table( memory_map=False, float_precision=None, ): + # TODO: validation duplicated in read_csv + if delim_whitespace and (delimiter is not None or sep != "\t"): + raise ValueError( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." 
+ ) + if delim_whitespace: + # In this case sep is not used so we set it to the read_csv + # default to avoid a ValueError + sep = "," return read_csv(**locals()) diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 08eab69900400..78c2f2bce5a02 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -2200,3 +2200,24 @@ def test_read_csv_with_use_inf_as_na(all_parsers): result = parser.read_csv(StringIO(data), header=None) expected = DataFrame([1.0, np.nan, 3.0]) tm.assert_frame_equal(result, expected) + + +def test_read_table_delim_whitespace_default_sep(all_parsers): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + result = parser.read_table(f, delim_whitespace=True) + expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]}) + tm.assert_frame_equal(result, expected) + + +def test_read_table_delim_whitespace_non_default_sep(all_parsers): + # GH: 35958 + f = StringIO("a b c\n1 -2 -3\n4 5 6") + parser = all_parsers + msg = ( + "Specified a delimiter with both sep and " + "delim_whitespace=True; you can only specify one." + ) + with pytest.raises(ValueError, match=msg): + parser.read_table(f, delim_whitespace=True, sep=",") From 886344f4bf13124192b5cb35e48c6e3d55a0da68 Mon Sep 17 00:00:00 2001 From: Scott Lasley Date: Sat, 26 Sep 2020 07:31:15 -0400 Subject: [PATCH 0917/1025] DOC: Add notes about M and Y to to_timedelata documentation. (#34968) (#34979) * DOC: Add notes about M and Y to to_timedelata documentation. (#34968) * DOC: Update notes about M and Y to to_timedelata documentation. (#34968) * DOC: Add notes about M and Y to to_timedelata documentation. 
(#34968) --- pandas/core/tools/timedeltas.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index e457a8819f27a..791d5095283ba 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -25,7 +25,10 @@ def to_timedelta(arg, unit=None, errors="raise"): Parameters ---------- arg : str, timedelta, list-like or Series - The data to be converted to timedelta. + The data to be converted to timedelta. The character M by itself, + e.g. '1M', is treated as minute, not month. The characters Y and y + are treated as the mean length of the Gregorian calendar year - + 365.2425 days or 365 days 5 hours 49 minutes 12 seconds. unit : str, optional Denotes the unit of the arg for numeric `arg`. Defaults to ``"ns"``. From ab644b5b9f8a89b962e1ddbfda9dae26f251e410 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 26 Sep 2020 17:08:53 +0100 Subject: [PATCH 0918/1025] DOC: minor fix for 1.1.3 release notes (#36664) --- doc/source/whatsnew/v1.1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index eded30ca45025..97db7a3e4862d 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -31,7 +31,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.agg`, :meth:`DataFrame.apply`, :meth:`Series.agg`, and :meth:`Series.apply` where internal suffix is exposed to the users when no relabelling is applied (:issue:`36189`) - Fixed regression in :class:`IntegerArray` unary plus and minus operations raising a ``TypeError`` (:issue:`36063`) -- Fixed regression when adding a :meth:`timedelta_range` to a :class:``Timestamp`` raised an ``ValueError`` (:issue:`35897`) +- Fixed regression when adding a :meth:`timedelta_range` to a :class:`Timestamp` raised an ``ValueError`` (:issue:`35897`) - Fixed regression in :meth:`Series.__getitem__` 
incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`) From 235f08660fef8d3fbf70493a96d436e20efacbcc Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sat, 26 Sep 2020 11:51:51 -0500 Subject: [PATCH 0919/1025] DOC: Fix release note typo (#36670) --- doc/source/whatsnew/v1.1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 97db7a3e4862d..91b9cf59687b3 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -31,7 +31,7 @@ Fixed regressions ~~~~~~~~~~~~~~~~~ - Fixed regression in :meth:`DataFrame.agg`, :meth:`DataFrame.apply`, :meth:`Series.agg`, and :meth:`Series.apply` where internal suffix is exposed to the users when no relabelling is applied (:issue:`36189`) - Fixed regression in :class:`IntegerArray` unary plus and minus operations raising a ``TypeError`` (:issue:`36063`) -- Fixed regression when adding a :meth:`timedelta_range` to a :class:`Timestamp` raised an ``ValueError`` (:issue:`35897`) +- Fixed regression when adding a :meth:`timedelta_range` to a :class:`Timestamp` raised a ``ValueError`` (:issue:`35897`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`) From d37cf2c24d6bc05f50723db41660a8e3f26d7d54 Mon Sep 17 00:00:00 2001 From: Shubham Mehra 
<43473352+Shubhamsm@users.noreply.github.com> Date: Sun, 27 Sep 2020 04:20:27 +0530 Subject: [PATCH 0920/1025] Replace single with double backticks in RST file #36617 (#36632) --- .pre-commit-config.yaml | 14 ---- doc/source/development/contributing.rst | 20 +++--- .../development/contributing_docstring.rst | 62 ++++++++-------- doc/source/development/extending.rst | 2 +- doc/source/ecosystem.rst | 2 +- .../comparison/comparison_with_sql.rst | 6 +- doc/source/getting_started/install.rst | 2 +- .../06_calculate_statistics.rst | 2 +- .../intro_tutorials/08_combine_dataframes.rst | 6 +- doc/source/user_guide/categorical.rst | 72 +++++++++---------- doc/source/whatsnew/v0.10.1.rst | 8 +-- doc/source/whatsnew/v0.11.0.rst | 8 +-- doc/source/whatsnew/v0.13.0.rst | 8 +-- doc/source/whatsnew/v0.13.1.rst | 4 +- doc/source/whatsnew/v0.14.0.rst | 30 ++++---- doc/source/whatsnew/v0.14.1.rst | 8 +-- doc/source/whatsnew/v0.15.0.rst | 8 +-- doc/source/whatsnew/v0.15.1.rst | 4 +- doc/source/whatsnew/v0.15.2.rst | 6 +- doc/source/whatsnew/v0.16.0.rst | 4 +- doc/source/whatsnew/v0.16.1.rst | 10 +-- doc/source/whatsnew/v0.16.2.rst | 2 +- doc/source/whatsnew/v0.17.0.rst | 8 +-- doc/source/whatsnew/v0.18.0.rst | 2 +- doc/source/whatsnew/v0.19.1.rst | 2 +- doc/source/whatsnew/v0.23.0.rst | 22 +++--- doc/source/whatsnew/v0.23.1.rst | 6 +- doc/source/whatsnew/v0.24.0.rst | 24 +++---- doc/source/whatsnew/v0.24.1.rst | 4 +- doc/source/whatsnew/v0.25.0.rst | 6 +- doc/source/whatsnew/v0.25.1.rst | 14 ++-- doc/source/whatsnew/v0.6.0.rst | 2 +- doc/source/whatsnew/v0.6.1.rst | 6 +- doc/source/whatsnew/v0.7.0.rst | 4 +- doc/source/whatsnew/v0.8.0.rst | 8 +-- doc/source/whatsnew/v0.9.0.rst | 2 +- doc/source/whatsnew/v0.9.1.rst | 26 +++---- doc/source/whatsnew/v1.0.0.rst | 32 ++++----- doc/source/whatsnew/v1.1.0.rst | 22 +++--- 39 files changed, 232 insertions(+), 246 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 53ab61afe900b..7f669ee77c3eb 100644 --- 
a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -34,20 +34,6 @@ repos: rev: v1.6.0 hooks: - id: rst-backticks - # these exclusions should be removed and the files fixed - exclude: (?x)( - categorical\.rst| - contributing\.rst| - contributing_docstring\.rst| - extending\.rst| - ecosystem\.rst| - comparison_with_sql\.rst| - install\.rst| - calculate_statistics\.rst| - combine_dataframes\.rst| - v0\.| - v1\.0\.| - v1\.1\.[012]) - repo: local hooks: - id: pip_to_conda diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index bb13fbed09677..d6955c5d4b8d2 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -31,13 +31,13 @@ comment letting others know they are working on an issue. While this is ok, you check each issue individually, and it's not possible to find the unassigned ones. For this reason, we implemented a workaround consisting of adding a comment with the exact -text `take`. When you do it, a GitHub action will automatically assign you the issue +text ``take``. When you do it, a GitHub action will automatically assign you the issue (this will take seconds, and may require refreshing the page to see it). By doing this, it's possible to filter the list of issues and find only the unassigned ones. So, a good way to find an issue to start contributing to pandas is to check the list of `unassigned good first issues `_ -and assign yourself one you like by writing a comment with the exact text `take`. +and assign yourself one you like by writing a comment with the exact text ``take``. If for whatever reason you are not able to continue working with the issue, please try to unassign it, so other people know it's available again. 
You can check the list of @@ -133,7 +133,7 @@ want to clone your fork to your machine:: cd pandas-yourname git remote add upstream https://github.com/pandas-dev/pandas.git -This creates the directory `pandas-yourname` and connects your repository to +This creates the directory ``pandas-yourname`` and connects your repository to the upstream (main project) *pandas* repository. Note that performing a shallow clone (with ``--depth==N``, for some ``N`` greater @@ -155,12 +155,12 @@ Using a Docker container Instead of manually setting up a development environment, you can use `Docker `_ to automatically create the environment with just several -commands. Pandas provides a `DockerFile` in the root directory to build a Docker image +commands. Pandas provides a ``DockerFile`` in the root directory to build a Docker image with a full pandas development environment. **Docker Commands** -Pass your GitHub username in the `DockerFile` to use your own fork:: +Pass your GitHub username in the ``DockerFile`` to use your own fork:: # Build the image pandas-yourname-env docker build --tag pandas-yourname-env . @@ -172,7 +172,7 @@ Even easier, you can integrate Docker with the following IDEs: **Visual Studio Code** You can use the DockerFile to launch a remote session with Visual Studio Code, -a popular free IDE, using the `.devcontainer.json` file. +a popular free IDE, using the ``.devcontainer.json`` file. See https://code.visualstudio.com/docs/remote/containers for details. **PyCharm (Professional)** @@ -782,7 +782,7 @@ As part of :ref:`Continuous Integration ` checks we run:: isort --check-only pandas -to check that imports are correctly formatted as per the `setup.cfg`. +to check that imports are correctly formatted as per the ``setup.cfg``. If you see output like the below in :ref:`Continuous Integration ` checks: @@ -979,7 +979,7 @@ For example, quite a few functions in pandas accept a ``dtype`` argument. This c def as_type(dtype: Dtype) -> ...: ... 
-This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like `axis`. Development of this module is active so be sure to refer to the source for the most up to date list of available types. +This module will ultimately house types for repeatedly used concepts like "path-like", "array-like", "numeric", etc... and can also hold aliases for commonly appearing parameters like ``axis``. Development of this module is active so be sure to refer to the source for the most up to date list of available types. Validating type hints ~~~~~~~~~~~~~~~~~~~~~ @@ -1302,7 +1302,7 @@ Or with one of the following constructs:: Using `pytest-xdist `_, one can speed up local testing on multicore machines. To use this feature, you will -need to install `pytest-xdist` via:: +need to install ``pytest-xdist`` via:: pip install pytest-xdist @@ -1465,7 +1465,7 @@ The following defines how a commit message should be structured. Please referen relevant GitHub issues in your commit message using GH1234 or #1234. Either style is fine, but the former is generally preferred: -* a subject line with `< 80` chars. +* a subject line with ``< 80`` chars. * One blank line. * Optionally, a commit message body. diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst index 33f30e1d97512..26cdd0687706c 100644 --- a/doc/source/development/contributing_docstring.rst +++ b/doc/source/development/contributing_docstring.rst @@ -25,7 +25,7 @@ The next example gives an idea of what a docstring looks like: """ Add up two integer numbers. - This function simply wraps the `+` operator, and does not + This function simply wraps the ``+`` operator, and does not do anything interesting, except for illustrating what the docstring of a very simple function looks like. 
@@ -39,7 +39,7 @@ The next example gives an idea of what a docstring looks like: Returns ------- int - The sum of `num1` and `num2`. + The sum of ``num1`` and ``num2``. See Also -------- @@ -126,9 +126,9 @@ backticks. The following are considered inline code: def add_values(arr): """ - Add the values in `arr`. + Add the values in ``arr``. - This is equivalent to Python `sum` of :meth:`pandas.Series.sum`. + This is equivalent to Python ``sum`` of :meth:`pandas.Series.sum`. Some sections are omitted here for simplicity. """ @@ -144,13 +144,13 @@ backticks. The following are considered inline code: With several mistakes in the docstring. - It has a blank like after the signature `def func():`. + It has a blank like after the signature ``def func():``. The text 'Some function' should go in the line after the opening quotes of the docstring, not in the same line. There is a blank line between the docstring and the first line - of code `foo = 1`. + of code ``foo = 1``. The closing quotes should be in the next line, not in this one.""" @@ -269,11 +269,11 @@ after, and not between the line with the word "Parameters" and the one with the hyphens. After the title, each parameter in the signature must be documented, including -`*args` and `**kwargs`, but not `self`. +``*args`` and ``**kwargs``, but not ``self``. The parameters are defined by their name, followed by a space, a colon, another space, and the type (or types). Note that the space between the name and the -colon is important. Types are not defined for `*args` and `**kwargs`, but must +colon is important. Types are not defined for ``*args`` and ``**kwargs``, but must be defined for all other parameters. After the parameter definition, it is required to have a line with the parameter description, which is indented, and can have multiple lines. The description must start with a capital letter, and @@ -285,13 +285,13 @@ comma at the end of the type. 
The exact form of the type in this case will be argument means, which can be added after a comma "int, default -1, meaning all cpus". -In cases where the default value is `None`, meaning that the value will not be -used. Instead of "str, default None", it is preferred to write "str, optional". -When `None` is a value being used, we will keep the form "str, default None". -For example, in `df.to_csv(compression=None)`, `None` is not a value being used, +In cases where the default value is ``None``, meaning that the value will not be +used. Instead of ``"str, default None"``, it is preferred to write ``"str, optional"``. +When ``None`` is a value being used, we will keep the form "str, default None". +For example, in ``df.to_csv(compression=None)``, ``None`` is not a value being used, but means that compression is optional, and no compression is being used if not -provided. In this case we will use `str, optional`. Only in cases like -`func(value=None)` and `None` is being used in the same way as `0` or `foo` +provided. In this case we will use ``"str, optional"``. Only in cases like +``func(value=None)`` and ``None`` is being used in the same way as ``0`` or ``foo`` would be used, then we will specify "str, int or None, default None". **Good:** @@ -331,13 +331,13 @@ would be used, then we will specify "str, int or None, default None". specified kind. Note the blank line between the parameters title and the first - parameter. Also, note that after the name of the parameter `kind` + parameter. Also, note that after the name of the parameter ``kind`` and before the colon, a space is missing. Also, note that the parameter descriptions do not start with a capital letter, and do not finish with a dot. - Finally, the `**kwargs` parameter is missing. + Finally, the ``**kwargs`` parameter is missing. Parameters ---------- @@ -361,9 +361,9 @@ boolean, etc): * str * bool -For complex types, define the subtypes. 
For `dict` and `tuple`, as more than +For complex types, define the subtypes. For ``dict`` and ``tuple``, as more than one type is present, we use the brackets to help read the type (curly brackets -for `dict` and normal brackets for `tuple`): +for ``dict`` and normal brackets for ``tuple``): * list of int * dict of {str : int} @@ -512,8 +512,8 @@ This section is used to let users know about pandas functionality related to the one being documented. In rare cases, if no related methods or functions can be found at all, this section can be skipped. -An obvious example would be the `head()` and `tail()` methods. As `tail()` does -the equivalent as `head()` but at the end of the `Series` or `DataFrame` +An obvious example would be the ``head()`` and ``tail()`` methods. As ``tail()`` does +the equivalent as ``head()`` but at the end of the ``Series`` or ``DataFrame`` instead of at the beginning, it is good to let the users know about it. To give an intuition on what can be considered related, here there are some @@ -608,8 +608,8 @@ Examples in docstrings, besides illustrating the usage of the function or method, must be valid Python code, that returns the given output in a deterministic way, and that can be copied and run by users. -Examples are presented as a session in the Python terminal. `>>>` is used to -present code. `...` is used for code continuing from the previous line. +Examples are presented as a session in the Python terminal. ``>>>`` is used to +present code. ``...`` is used for code continuing from the previous line. Output is presented immediately after the last line of code generating the output (no blank lines in between). Comments describing the examples can be added with blank lines before and after them. 
@@ -664,7 +664,7 @@ A simple example could be: 4 Falcon dtype: object - With the `n` parameter, we can change the number of returned rows: + With the ``n`` parameter, we can change the number of returned rows: >>> s.head(n=3) 0 Ant @@ -742,7 +742,7 @@ positional arguments ``head(3)``. def fillna(self, value): """ - Replace missing values by `value`. + Replace missing values by ``value``. Examples -------- @@ -771,7 +771,7 @@ positional arguments ``head(3)``. def contains(self, pattern, case_sensitive=True, na=numpy.nan): """ - Return whether each value contains `pattern`. + Return whether each value contains ``pattern``. In this case, we are illustrating how to use sections, even if the example is simple enough and does not require them. @@ -788,8 +788,8 @@ positional arguments ``head(3)``. **Case sensitivity** - With `case_sensitive` set to `False` we can match `a` with both - `a` and `A`: + With ``case_sensitive`` set to ``False`` we can match ``a`` with both + ``a`` and ``A``: >>> s.contains(pattern='a', case_sensitive=False) 0 True @@ -800,7 +800,7 @@ positional arguments ``head(3)``. **Missing values** - We can fill missing values in the output using the `na` parameter: + We can fill missing values in the output using the ``na`` parameter: >>> s.contains(pattern='a', na=False) 0 False @@ -824,9 +824,9 @@ positional arguments ``head(3)``. Try to use meaningful data, when it makes the example easier to understand. - Try to avoid positional arguments like in `df.method(1)`. They + Try to avoid positional arguments like in ``df.method(1)``. They can be all right if previously defined with a meaningful name, - like in `present_value(interest_rate)`, but avoid them otherwise. + like in ``present_value(interest_rate)``, but avoid them otherwise. When presenting the behavior with different parameters, do not place all the calls one next to the other. Instead, add a short sentence @@ -914,7 +914,7 @@ plot will be generated automatically when building the documentation. 
class Series: def plot(self): """ - Generate a plot with the `Series` data. + Generate a plot with the ``Series`` data. Examples -------- diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 46c2cbbe39b34..c708ebb361ed1 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -61,7 +61,7 @@ This can be a convenient way to extend pandas objects without subclassing them. If you write a custom accessor, make a pull request adding it to our :ref:`ecosystem` page. -We highly recommend validating the data in your accessor's `__init__`. +We highly recommend validating the data in your accessor's ``__init__``. In our ``GeoAccessor``, we validate that the data contains the expected columns, raising an ``AttributeError`` when the validation fails. For a ``Series`` accessor, you should validate the ``dtype`` if the accessor diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 624c0551de607..ed6ce7e9759b6 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -436,7 +436,7 @@ arrays can be stored inside pandas' Series and DataFrame. `Pint-Pandas`_ ~~~~~~~~~~~~~~ -`Pint-Pandas ` provides an extension type for +``Pint-Pandas `` provides an extension type for storing numeric arrays with units. These arrays can be stored inside pandas' Series and DataFrame. Operations between Series and DataFrame columns which use pint's extension array are then units aware. diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index aa7218c3e4fad..04f97a27cde39 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -19,7 +19,7 @@ As is customary, we import pandas and NumPy as follows: import numpy as np Most of the examples will utilize the ``tips`` dataset found within pandas tests. 
We'll read -the data into a DataFrame called `tips` and assume we have a database table of the same name and +the data into a DataFrame called ``tips`` and assume we have a database table of the same name and structure. .. ipython:: python @@ -429,7 +429,7 @@ Top n rows per group .query('rn < 3') .sort_values(['day', 'rn'])) -the same using `rank(method='first')` function +the same using ``rank(method='first')`` function .. ipython:: python @@ -453,7 +453,7 @@ the same using `rank(method='first')` function Let's find tips with (rank < 3) per gender group for (tips < 2). Notice that when using ``rank(method='min')`` function -`rnk_min` remains the same for the same `tip` +``rnk_min`` remains the same for the same ``tip`` (as Oracle's RANK() function) .. ipython:: python diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 2196c908ecf37..78bd76bbd230f 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -179,7 +179,7 @@ In Linux/Mac you can run ``which python`` on your terminal and it will tell you using. If it's something like "/usr/bin/python", you're using the Python from the system, which is not recommended. It is highly recommended to use ``conda``, for quick installation and for package and dependency updates. -You can find simple installation instructions for pandas in this document: `installation instructions `. +You can find simple installation instructions for pandas in this document: ``installation instructions ``. 
Installing from source ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index c7363b94146ac..bd85160d2622a 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -197,7 +197,7 @@ on the grouped data as well: :align: center .. note:: - The `Pclass` column contains numerical data but actually + The ``Pclass`` column contains numerical data but actually represents 3 categories (or factors) with respectively the labels ‘1’, ‘2’ and ‘3’. Calculating statistics on these does not make much sense. Therefore, pandas provides a ``Categorical`` data type to handle this diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index 600a75b156ac4..d6da9a0aa4f22 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -123,9 +123,9 @@ concatenated tables to verify the operation: .. ipython:: python - print('Shape of the `air_quality_pm25` table: ', air_quality_pm25.shape) - print('Shape of the `air_quality_no2` table: ', air_quality_no2.shape) - print('Shape of the resulting `air_quality` table: ', air_quality.shape) + print('Shape of the ``air_quality_pm25`` table: ', air_quality_pm25.shape) + print('Shape of the ``air_quality_no2`` table: ', air_quality_no2.shape) + print('Shape of the resulting ``air_quality`` table: ', air_quality.shape) Hence, the resulting table has 3178 = 1110 + 2068 rows. 
diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index b7475ae7bb132..9da5d2a9fc92f 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -9,9 +9,9 @@ Categorical data This is an introduction to pandas categorical data type, including a short comparison with R's ``factor``. -`Categoricals` are a pandas data type corresponding to categorical variables in +``Categoricals`` are a pandas data type corresponding to categorical variables in statistics. A categorical variable takes on a limited, and usually fixed, -number of possible values (`categories`; `levels` in R). Examples are gender, +number of possible values (``categories``; ``levels`` in R). Examples are gender, social class, blood type, country affiliation, observation time or rating via Likert scales. @@ -19,10 +19,10 @@ In contrast to statistical categorical variables, categorical data might have an 'strongly agree' vs 'agree' or 'first observation' vs. 'second observation'), but numerical operations (additions, divisions, ...) are not possible. -All values of categorical data are either in `categories` or `np.nan`. Order is defined by -the order of `categories`, not lexical order of the values. Internally, the data structure -consists of a `categories` array and an integer array of `codes` which point to the real value in -the `categories` array. +All values of categorical data are either in ``categories`` or ``np.nan``. Order is defined by +the order of ``categories``, not lexical order of the values. Internally, the data structure +consists of a ``categories`` array and an integer array of ``codes`` which point to the real value in +the ``categories`` array. The categorical data type is useful in the following cases: @@ -196,13 +196,13 @@ To get back to the original ``Series`` or NumPy array, use .. 
note:: - In contrast to R's `factor` function, categorical data is not converting input values to + In contrast to R's ``factor`` function, categorical data is not converting input values to strings; categories will end up the same data type as the original values. .. note:: - In contrast to R's `factor` function, there is currently no way to assign/change labels at - creation time. Use `categories` to change the categories after creation time. + In contrast to R's ``factor`` function, there is currently no way to assign/change labels at + creation time. Use ``categories`` to change the categories after creation time. .. _categorical.categoricaldtype: @@ -228,7 +228,7 @@ by default. CategoricalDtype() A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas -expects a `dtype`. For example :func:`pandas.read_csv`, +expects a ``dtype``. For example :func:`pandas.read_csv`, :func:`pandas.DataFrame.astype`, or in the ``Series`` constructor. .. note:: @@ -288,7 +288,7 @@ output to a ``Series`` or ``DataFrame`` of type ``string``. Working with categories ----------------------- -Categorical data has a `categories` and a `ordered` property, which list their +Categorical data has a ``categories`` and a ``ordered`` property, which list their possible values and whether the ordering matters or not. These properties are exposed as ``s.cat.categories`` and ``s.cat.ordered``. If you don't manually specify categories and ordering, they are inferred from the passed arguments. @@ -353,14 +353,14 @@ Renaming categories is done by assigning new values to the .. note:: - In contrast to R's `factor`, categorical data can have categories of other types than string. + In contrast to R's ``factor``, categorical data can have categories of other types than string. .. note:: Be aware that assigning new categories is an inplace operation, while most other operations - under ``Series.cat`` per default return a new ``Series`` of dtype `category`. 
+ under ``Series.cat`` per default return a new ``Series`` of dtype ``category``. -Categories must be unique or a `ValueError` is raised: +Categories must be unique or a ``ValueError`` is raised: .. ipython:: python @@ -369,7 +369,7 @@ Categories must be unique or a `ValueError` is raised: except ValueError as e: print("ValueError:", str(e)) -Categories must also not be ``NaN`` or a `ValueError` is raised: +Categories must also not be ``NaN`` or a ``ValueError`` is raised: .. ipython:: python @@ -535,7 +535,7 @@ Comparing categorical data with other objects is possible in three cases: * Comparing equality (``==`` and ``!=``) to a list-like object (list, Series, array, ...) of the same length as the categorical data. * All comparisons (``==``, ``!=``, ``>``, ``>=``, ``<``, and ``<=``) of categorical data to - another categorical Series, when ``ordered==True`` and the `categories` are the same. + another categorical Series, when ``ordered==True`` and the ``categories`` are the same. * All comparisons of a categorical data to a scalar. All other comparisons, especially "non-equality" comparisons of two categoricals with different @@ -657,7 +657,7 @@ Data munging The optimized pandas data access methods ``.loc``, ``.iloc``, ``.at``, and ``.iat``, work as normal. The only difference is the return type (for getting) and -that only values already in `categories` can be assigned. +that only values already in ``categories`` can be assigned. Getting ~~~~~~~ @@ -695,8 +695,8 @@ of length "1". df.at["h", "cats"] # returns a string .. note:: - The is in contrast to R's `factor` function, where ``factor(c(1,2,3))[1]`` - returns a single value `factor`. + This is in contrast to R's ``factor`` function, where ``factor(c(1,2,3))[1]`` + returns a single value ``factor``. 
To get a single value ``Series`` of type ``category``, you pass in a list with a single value: @@ -732,7 +732,7 @@ an appropriate type: That means, that the returned values from methods and properties on the accessors of a ``Series`` and the returned values from methods and properties on the accessors of this -``Series`` transformed to one of type `category` will be equal: +``Series`` transformed to one of type ``category`` will be equal: .. ipython:: python @@ -753,7 +753,7 @@ Setting ~~~~~~~ Setting values in a categorical column (or ``Series``) works as long as the -value is included in the `categories`: +value is included in the ``categories``: .. ipython:: python @@ -770,7 +770,7 @@ value is included in the `categories`: except ValueError as e: print("ValueError:", str(e)) -Setting values by assigning categorical data will also check that the `categories` match: +Setting values by assigning categorical data will also check that the ``categories`` match: .. ipython:: python @@ -941,7 +941,7 @@ See :ref:`here ` for an example and caveats. Writing to a CSV file will convert the data, effectively removing any information about the categorical (categories and ordering). So if you read back the CSV file you have to convert the -relevant columns back to `category` and assign the right categories and categories ordering. +relevant columns back to ``category`` and assign the right categories and categories ordering. .. ipython:: python @@ -970,7 +970,7 @@ The same holds for writing to a SQL database with ``to_sql``. Missing data ------------ -pandas primarily uses the value `np.nan` to represent missing data. It is by +pandas primarily uses the value ``np.nan`` to represent missing data. It is by default not included in computations. See the :ref:`Missing Data section `. @@ -998,20 +998,20 @@ Methods for working with missing data, e.g. 
:meth:`~Series.isna`, :meth:`~Series pd.isna(s) s.fillna("a") -Differences to R's `factor` ---------------------------- +Differences to R's ``factor`` +----------------------------- The following differences to R's factor functions can be observed: -* R's `levels` are named `categories`. -* R's `levels` are always of type string, while `categories` in pandas can be of any dtype. +* R's ``levels`` are named ``categories``. +* R's ``levels`` are always of type string, while ``categories`` in pandas can be of any dtype. * It's not possible to specify labels at creation time. Use ``s.cat.rename_categories(new_labels)`` afterwards. -* In contrast to R's `factor` function, using categorical data as the sole input to create a +* In contrast to R's ``factor`` function, using categorical data as the sole input to create a new categorical series will *not* remove unused categories but create a new categorical series which is equal to the passed in one! -* R allows for missing values to be included in its `levels` (pandas' `categories`). Pandas - does not allow `NaN` categories, but missing values can still be in the `values`. +* R allows for missing values to be included in its ``levels`` (pandas' ``categories``). Pandas + does not allow ``NaN`` categories, but missing values can still be in the ``values``. Gotchas @@ -1053,13 +1053,13 @@ an ``object`` dtype is a constant times the length of the data. s.astype('category').nbytes -`Categorical` is not a `numpy` array -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``Categorical`` is not a ``numpy`` array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Currently, categorical data and the underlying ``Categorical`` is implemented as a Python object and not as a low-level NumPy array dtype. This leads to some problems. -NumPy itself doesn't know about the new `dtype`: +NumPy itself doesn't know about the new ``dtype``: .. 
ipython:: python @@ -1088,7 +1088,7 @@ To check if a Series contains Categorical data, use ``hasattr(s, 'cat')``: hasattr(pd.Series(['a'], dtype='category'), 'cat') hasattr(pd.Series(['a']), 'cat') -Using NumPy functions on a ``Series`` of type ``category`` should not work as `Categoricals` +Using NumPy functions on a ``Series`` of type ``category`` should not work as ``Categoricals`` are not numeric data (even in the case that ``.categories`` is numeric). .. ipython:: python @@ -1107,7 +1107,7 @@ dtype in apply ~~~~~~~~~~~~~~ Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get -a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a +a ``Series`` of ``object`` ``dtype`` (same as getting a row -> getting one element will return a basic type) and applying along columns will also convert to object. ``NaN`` values are unaffected. You can use ``fillna`` to handle missing values before applying a function. diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index 1e9eafd2700e9..3dc680c46a4d9 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -189,8 +189,8 @@ combined result, by using ``where`` on a selector table. - ``HDFStore`` now can read native PyTables table format tables - You can pass ``nan_rep = 'my_nan_rep'`` to append, to change the default nan - representation on disk (which converts to/from `np.nan`), this defaults to - `nan`. + representation on disk (which converts to/from ``np.nan``), this defaults to + ``nan``. - You can pass ``index`` to ``append``. This defaults to ``True``. This will automagically create indices on the *indexables* and *data columns* of the @@ -224,7 +224,7 @@ combined result, by using ``where`` on a selector table. - Function to reset Google Analytics token store so users can recover from improperly setup client secrets (:issue:`2687`). 
- Fixed groupby bug resulting in segfault when passing in MultiIndex (:issue:`2706`) -- Fixed bug where passing a Series with datetime64 values into `to_datetime` +- Fixed bug where passing a Series with datetime64 values into ``to_datetime`` results in bogus output values (:issue:`2699`) - Fixed bug in ``pattern in HDFStore`` expressions when pattern is not a valid regex (:issue:`2694`) @@ -240,7 +240,7 @@ combined result, by using ``where`` on a selector table. - Fixed C file parser behavior when the file has more columns than data (:issue:`2668`) - Fixed file reader bug that misaligned columns with data in the presence of an - implicit column and a specified `usecols` value + implicit column and a specified ``usecols`` value - DataFrames with numerical or datetime indices are now sorted prior to plotting (:issue:`2609`) - Fixed DataFrame.from_records error when passed columns, index, but empty diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst index c0bc74c9ff036..eb91ac427063f 100644 --- a/doc/source/whatsnew/v0.11.0.rst +++ b/doc/source/whatsnew/v0.11.0.rst @@ -425,13 +425,13 @@ Enhancements - Cursor coordinate information is now displayed in time-series plots. - - added option `display.max_seq_items` to control the number of + - added option ``display.max_seq_items`` to control the number of elements printed per sequence pprinting it. (:issue:`2979`) - - added option `display.chop_threshold` to control display of small numerical + - added option ``display.chop_threshold`` to control display of small numerical values. (:issue:`2739`) - - added option `display.max_info_rows` to prevent verbose_info from being + - added option ``display.max_info_rows`` to prevent verbose_info from being calculated for frames above 1M rows (configurable). 
(:issue:`2807`, :issue:`2918`) - value_counts() now accepts a "normalize" argument, for normalized @@ -440,7 +440,7 @@ Enhancements - DataFrame.from_records now accepts not only dicts but any instance of the collections.Mapping ABC. - - added option `display.mpl_style` providing a sleeker visual style + - added option ``display.mpl_style`` providing a sleeker visual style for plots. Based on https://gist.github.com/huyng/816622 (:issue:`3075`). - Treat boolean values as integers (values 1 and 0) for numeric diff --git a/doc/source/whatsnew/v0.13.0.rst b/doc/source/whatsnew/v0.13.0.rst index 5a904d6c85c61..bc607409546c6 100644 --- a/doc/source/whatsnew/v0.13.0.rst +++ b/doc/source/whatsnew/v0.13.0.rst @@ -214,7 +214,7 @@ These were announced changes in 0.12 or prior that are taking effect as of 0.13. - Remove deprecated ``read_clipboard/to_clipboard/ExcelFile/ExcelWriter`` from ``pandas.io.parsers`` (:issue:`3717`) These are available as functions in the main pandas namespace (e.g. ``pd.read_clipboard``) - default for ``tupleize_cols`` is now ``False`` for both ``to_csv`` and ``read_csv``. Fair warning in 0.12 (:issue:`3604`) -- default for `display.max_seq_len` is now 100 rather than `None`. This activates +- default for ``display.max_seq_len`` is now 100 rather than ``None``. This activates truncated display ("...") of long sequences in various places. (:issue:`3391`) Deprecations @@ -498,7 +498,7 @@ Enhancements - ``to_dict`` now takes ``records`` as a possible out type. Returns an array of column-keyed dictionaries. (:issue:`4936`) -- ``NaN`` handing in get_dummies (:issue:`4446`) with `dummy_na` +- ``NaN`` handing in get_dummies (:issue:`4446`) with ``dummy_na`` .. ipython:: python @@ -1071,7 +1071,7 @@ Bug fixes as the docstring says (:issue:`4362`). 
- ``as_index`` is no longer ignored when doing groupby apply (:issue:`4648`, :issue:`3417`) -- JSON NaT handling fixed, NaTs are now serialized to `null` (:issue:`4498`) +- JSON NaT handling fixed, NaTs are now serialized to ``null`` (:issue:`4498`) - Fixed JSON handling of escapable characters in JSON object keys (:issue:`4593`) - Fixed passing ``keep_default_na=False`` when ``na_values=None`` @@ -1188,7 +1188,7 @@ Bug fixes single column and passing a list for ``ascending``, the argument for ``ascending`` was being interpreted as ``True`` (:issue:`4839`, :issue:`4846`) -- Fixed ``Panel.tshift`` not working. Added `freq` support to ``Panel.shift`` +- Fixed ``Panel.tshift`` not working. Added ``freq`` support to ``Panel.shift`` (:issue:`4853`) - Fix an issue in TextFileReader w/ Python engine (i.e. PythonParser) with thousands != "," (:issue:`4596`) diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 6fe010be8fb2d..9e416f8eeb3f1 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -379,7 +379,7 @@ Performance improvements for 0.13.1 - Series datetime/timedelta binary operations (:issue:`5801`) - DataFrame ``count/dropna`` for ``axis=1`` -- Series.str.contains now has a `regex=False` keyword which can be faster for plain (non-regex) string patterns. (:issue:`5879`) +- Series.str.contains now has a ``regex=False`` keyword which can be faster for plain (non-regex) string patterns. (:issue:`5879`) - Series.str.extract (:issue:`5944`) - ``dtypes/ftypes`` methods (:issue:`5968`) - indexing with object dtypes (:issue:`5968`) @@ -399,7 +399,7 @@ Bug fixes - Bug in ``io.wb.get_countries`` not including all countries (:issue:`6008`) - Bug in Series replace with timestamp dict (:issue:`5797`) -- read_csv/read_table now respects the `prefix` kwarg (:issue:`5732`). +- read_csv/read_table now respects the ``prefix`` kwarg (:issue:`5732`). 
- Bug in selection with missing values via ``.ix`` from a duplicate indexed DataFrame failing (:issue:`5835`) - Fix issue of boolean comparison on empty DataFrames (:issue:`5808`) - Bug in isnull handling ``NaT`` in an object array (:issue:`5443`) diff --git a/doc/source/whatsnew/v0.14.0.rst b/doc/source/whatsnew/v0.14.0.rst index 847a42b3a7643..421ef81427210 100644 --- a/doc/source/whatsnew/v0.14.0.rst +++ b/doc/source/whatsnew/v0.14.0.rst @@ -82,7 +82,7 @@ API changes - The :meth:`DataFrame.interpolate` keyword ``downcast`` default has been changed from ``infer`` to ``None``. This is to preserve the original dtype unless explicitly requested otherwise (:issue:`6290`). -- When converting a dataframe to HTML it used to return `Empty DataFrame`. This special case has +- When converting a dataframe to HTML it used to return ``Empty DataFrame``. This special case has been removed, instead a header with the column names is returned (:issue:`6062`). - ``Series`` and ``Index`` now internally share more common operations, e.g. ``factorize(),nunique(),value_counts()`` are now supported on ``Index`` types as well. The ``Series.weekday`` property from is removed @@ -291,12 +291,12 @@ Display changes - Regression in the display of a MultiIndexed Series with ``display.max_rows`` is less than the length of the series (:issue:`7101`) - Fixed a bug in the HTML repr of a truncated Series or DataFrame not showing the class name with the - `large_repr` set to 'info' (:issue:`7105`) -- The `verbose` keyword in ``DataFrame.info()``, which controls whether to shorten the ``info`` + ``large_repr`` set to 'info' (:issue:`7105`) +- The ``verbose`` keyword in ``DataFrame.info()``, which controls whether to shorten the ``info`` representation, is now ``None`` by default. This will follow the global setting in ``display.max_info_columns``. The global setting can be overridden with ``verbose=True`` or ``verbose=False``. 
-- Fixed a bug with the `info` repr not honoring the `display.max_info_columns` setting (:issue:`6939`) +- Fixed a bug with the ``info`` repr not honoring the ``display.max_info_columns`` setting (:issue:`6939`) - Offset/freq info now in Timestamp __repr__ (:issue:`4553`) .. _whatsnew_0140.parsing: @@ -603,11 +603,11 @@ Plotting - Following keywords are now acceptable for :meth:`DataFrame.plot` with ``kind='bar'`` and ``kind='barh'``: - - `width`: Specify the bar width. In previous versions, static value 0.5 was passed to matplotlib and it cannot be overwritten. (:issue:`6604`) - - `align`: Specify the bar alignment. Default is `center` (different from matplotlib). In previous versions, pandas passes `align='edge'` to matplotlib and adjust the location to `center` by itself, and it results `align` keyword is not applied as expected. (:issue:`4525`) - - `position`: Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1(right/top-end). Default is 0.5 (center). (:issue:`6604`) + - ``width``: Specify the bar width. In previous versions, static value 0.5 was passed to matplotlib and it cannot be overwritten. (:issue:`6604`) + - ``align``: Specify the bar alignment. Default is ``center`` (different from matplotlib). In previous versions, pandas passes ``align='edge'`` to matplotlib and adjust the location to ``center`` by itself, and it results ``align`` keyword is not applied as expected. (:issue:`4525`) + - ``position``: Specify relative alignments for bar plot layout. From 0 (left/bottom-end) to 1(right/top-end). Default is 0.5 (center). (:issue:`6604`) - Because of the default `align` value changes, coordinates of bar plots are now located on integer values (0.0, 1.0, 2.0 ...). This is intended to make bar plot be located on the same coordinates as line plot. However, bar plot may differs unexpectedly when you manually adjust the bar location or drawing area, such as using `set_xlim`, `set_ylim`, etc. 
In this cases, please modify your script to meet with new coordinates. + Because of the default ``align`` value changes, coordinates of bar plots are now located on integer values (0.0, 1.0, 2.0 ...). This is intended to make bar plot be located on the same coordinates as line plot. However, bar plots may differ unexpectedly when you manually adjust the bar location or drawing area, such as using ``set_xlim``, ``set_ylim``, etc. In these cases, please modify your script to meet with new coordinates. - The :func:`parallel_coordinates` function now takes argument ``color`` instead of ``colors``. A ``FutureWarning`` is raised to alert that @@ -618,7 +618,7 @@ Plotting raised if the old ``data`` argument is used by name. (:issue:`6956`) - :meth:`DataFrame.boxplot` now supports ``layout`` keyword (:issue:`6769`) -- :meth:`DataFrame.boxplot` has a new keyword argument, `return_type`. It accepts ``'dict'``, +- :meth:`DataFrame.boxplot` has a new keyword argument, ``return_type``. It accepts ``'dict'``, ``'axes'``, or ``'both'``, in which case a namedtuple with the matplotlib axes and a dict of matplotlib Lines is returned. @@ -721,8 +721,8 @@ Deprecations - The following ``io.sql`` functions have been deprecated: ``tquery``, ``uquery``, ``read_frame``, ``frame_query``, ``write_frame``. -- The `percentile_width` keyword argument in :meth:`~DataFrame.describe` has been deprecated. - Use the `percentiles` keyword instead, which takes a list of percentiles to display. The +- The ``percentile_width`` keyword argument in :meth:`~DataFrame.describe` has been deprecated. + Use the ``percentiles`` keyword instead, which takes a list of percentiles to display. The default output is unchanged. 
- The default return type of :func:`boxplot` will change from a dict to a matplotlib Axes @@ -851,7 +851,7 @@ Enhancements - Arrays of strings can be wrapped to a specified width (``str.wrap``) (:issue:`6999`) - Add :meth:`~Series.nsmallest` and :meth:`Series.nlargest` methods to Series, See :ref:`the docs ` (:issue:`3960`) -- `PeriodIndex` fully supports partial string indexing like `DatetimeIndex` (:issue:`7043`) +- ``PeriodIndex`` fully supports partial string indexing like ``DatetimeIndex`` (:issue:`7043`) .. ipython:: python @@ -868,7 +868,7 @@ Enhancements - ``Series.rank()`` now has a percentage rank option (:issue:`5971`) - ``Series.rank()`` and ``DataFrame.rank()`` now accept ``method='dense'`` for ranks without gaps (:issue:`6514`) - Support passing ``encoding`` with xlwt (:issue:`3710`) -- Refactor Block classes removing `Block.items` attributes to avoid duplication +- Refactor Block classes removing ``Block.items`` attributes to avoid duplication in item handling (:issue:`6745`, :issue:`6988`). 
- Testing statements updated to use specialized asserts (:issue:`6175`) @@ -1063,10 +1063,10 @@ Bug fixes - Bug in ``MultiIndex.get_level_values`` doesn't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`7092`) - Bug in ``Groupby`` doesn't preserve ``tz`` (:issue:`3950`) - Bug in ``PeriodIndex`` partial string slicing (:issue:`6716`) -- Bug in the HTML repr of a truncated Series or DataFrame not showing the class name with the `large_repr` set to 'info' +- Bug in the HTML repr of a truncated Series or DataFrame not showing the class name with the ``large_repr`` set to 'info' (:issue:`7105`) - Bug in ``DatetimeIndex`` specifying ``freq`` raises ``ValueError`` when passed value is too short (:issue:`7098`) -- Fixed a bug with the `info` repr not honoring the `display.max_info_columns` setting (:issue:`6939`) +- Fixed a bug with the ``info`` repr not honoring the ``display.max_info_columns`` setting (:issue:`6939`) - Bug ``PeriodIndex`` string slicing with out of bounds values (:issue:`5407`) - Fixed a memory error in the hashtable implementation/factorizer on resizing of large tables (:issue:`7157`) - Bug in ``isnull`` when applied to 0-dimensional object arrays (:issue:`7176`) diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 5de193007474c..354d67a525d0e 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -108,7 +108,7 @@ Enhancements - ``PeriodIndex`` is represented as the same format as ``DatetimeIndex`` (:issue:`7601`) - ``StringMethods`` now work on empty Series (:issue:`7242`) - The file parsers ``read_csv`` and ``read_table`` now ignore line comments provided by - the parameter `comment`, which accepts only a single character for the C reader. + the parameter ``comment``, which accepts only a single character for the C reader. 
In particular, they allow for comments before file data begins (:issue:`2685`) - Add ``NotImplementedError`` for simultaneous use of ``chunksize`` and ``nrows`` for read_csv() (:issue:`6774`). @@ -150,7 +150,7 @@ Performance - Improvements in Series.transform for significant performance gains (:issue:`6496`) - Improvements in DataFrame.transform with ufuncs and built-in grouper functions for significant performance gains (:issue:`7383`) - Regression in groupby aggregation of datetime64 dtypes (:issue:`7555`) -- Improvements in `MultiIndex.from_product` for large iterables (:issue:`7627`) +- Improvements in ``MultiIndex.from_product`` for large iterables (:issue:`7627`) .. _whatsnew_0141.experimental: @@ -217,7 +217,7 @@ Bug fixes - Bug in ``.loc`` with a list of indexers on a single-multi index level (that is not nested) (:issue:`7349`) - Bug in ``Series.map`` when mapping a dict with tuple keys of different lengths (:issue:`7333`) - Bug all ``StringMethods`` now work on empty Series (:issue:`7242`) -- Fix delegation of `read_sql` to `read_sql_query` when query does not contain 'select' (:issue:`7324`). +- Fix delegation of ``read_sql`` to ``read_sql_query`` when query does not contain 'select' (:issue:`7324`). - Bug where a string column name assignment to a ``DataFrame`` with a ``Float64Index`` raised a ``TypeError`` during a call to ``np.isnan`` (:issue:`7366`). 
@@ -269,7 +269,7 @@ Bug fixes - Bug in ``pandas.core.strings.str_contains`` does not properly match in a case insensitive fashion when ``regex=False`` and ``case=False`` (:issue:`7505`) - Bug in ``expanding_cov``, ``expanding_corr``, ``rolling_cov``, and ``rolling_corr`` for two arguments with mismatched index (:issue:`7512`) - Bug in ``to_sql`` taking the boolean column as text column (:issue:`7678`) -- Bug in grouped `hist` doesn't handle `rot` kw and `sharex` kw properly (:issue:`7234`) +- Bug in grouped ``hist`` doesn't handle ``rot`` kw and ``sharex`` kw properly (:issue:`7234`) - Bug in ``.loc`` performing fallback integer indexing with ``object`` dtype indices (:issue:`7496`) - Bug (regression) in ``PeriodIndex`` constructor when passed ``Series`` objects (:issue:`7701`). diff --git a/doc/source/whatsnew/v0.15.0.rst b/doc/source/whatsnew/v0.15.0.rst index b80ed7446f805..1f054930b3709 100644 --- a/doc/source/whatsnew/v0.15.0.rst +++ b/doc/source/whatsnew/v0.15.0.rst @@ -61,7 +61,7 @@ New features Categoricals in Series/DataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new +:class:`~pandas.Categorical` can now be included in ``Series`` and ``DataFrames`` and gained new methods to manipulate. Thanks to Jan Schulz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`, :issue:`7914`, :issue:`7768`, :issue:`8006`, :issue:`3678`, :issue:`8075`, :issue:`8076`, :issue:`8143`, :issue:`8453`, :issue:`8518`). @@ -808,7 +808,7 @@ Other notable API changes: .. _whatsnew_0150.blanklines: -- Made both the C-based and Python engines for `read_csv` and `read_table` ignore empty lines in input as well as +- Made both the C-based and Python engines for ``read_csv`` and ``read_table`` ignore empty lines in input as well as white space-filled lines, as long as ``sep`` is not white space. 
This is an API change that can be controlled by the keyword parameter ``skip_blank_lines``. See :ref:`the docs ` (:issue:`4466`) @@ -830,7 +830,7 @@ Other notable API changes: Previously this would have yielded a column of ``datetime64`` dtype, but without timezone info. - The behaviour of assigning a column to an existing dataframe as `df['a'] = i` + The behaviour of assigning a column to an existing dataframe as ``df['a'] = i`` remains unchanged (this already returned an ``object`` column with a timezone). - When passing multiple levels to :meth:`~pandas.DataFrame.stack()`, it will now raise a ``ValueError`` when the @@ -894,7 +894,7 @@ a transparent change with only very limited API implications (:issue:`5080`, :is - you may need to unpickle pandas version < 0.15.0 pickles using ``pd.read_pickle`` rather than ``pickle.load``. See :ref:`pickle docs ` - when plotting with a ``PeriodIndex``, the matplotlib internal axes will now be arrays of ``Period`` rather than a ``PeriodIndex`` (this is similar to how a ``DatetimeIndex`` passes arrays of ``datetimes`` now) - MultiIndexes will now raise similarly to other pandas objects w.r.t. truth testing, see :ref:`here ` (:issue:`7897`). -- When plotting a DatetimeIndex directly with matplotlib's `plot` function, +- When plotting a DatetimeIndex directly with matplotlib's ``plot`` function, the axis labels will no longer be formatted as dates but as integers (the internal representation of a ``datetime64``). **UPDATE** This is fixed in 0.15.1, see :ref:`here `. diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index f9c17058dc3ee..da56f07e84d9f 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -249,7 +249,7 @@ Enhancements dfi.memory_usage(index=True) -- Added Index properties `is_monotonic_increasing` and `is_monotonic_decreasing` (:issue:`8680`). +- Added Index properties ``is_monotonic_increasing`` and ``is_monotonic_decreasing`` (:issue:`8680`). 
- Added option to select columns when importing Stata files (:issue:`7935`) @@ -305,7 +305,7 @@ Bug fixes - Fixed a bug where plotting a column ``y`` and specifying a label would mutate the index name of the original DataFrame (:issue:`8494`) - Fix regression in plotting of a DatetimeIndex directly with matplotlib (:issue:`8614`). - Bug in ``date_range`` where partially-specified dates would incorporate current date (:issue:`6961`) -- Bug in Setting by indexer to a scalar value with a mixed-dtype `Panel4d` was failing (:issue:`8702`) +- Bug in Setting by indexer to a scalar value with a mixed-dtype ``Panel4d`` was failing (:issue:`8702`) - Bug where ``DataReader``'s would fail if one of the symbols passed was invalid. Now returns data for valid symbols and np.nan for invalid (:issue:`8494`) - Bug in ``get_quote_yahoo`` that wouldn't allow non-float return values (:issue:`5229`). diff --git a/doc/source/whatsnew/v0.15.2.rst b/doc/source/whatsnew/v0.15.2.rst index a4eabb97471de..95ca925f18692 100644 --- a/doc/source/whatsnew/v0.15.2.rst +++ b/doc/source/whatsnew/v0.15.2.rst @@ -137,7 +137,7 @@ Enhancements - Added ability to export Categorical data to Stata (:issue:`8633`). See :ref:`here ` for limitations of categorical variables exported to Stata data files. - Added flag ``order_categoricals`` to ``StataReader`` and ``read_stata`` to select whether to order imported categorical data (:issue:`8836`). See :ref:`here ` for more information on importing categorical variables from Stata data files. - Added ability to export Categorical data to to/from HDF5 (:issue:`7621`). Queries work the same as if it was an object array. However, the ``category`` dtyped data is stored in a more efficient manner. See :ref:`here ` for an example and caveats w.r.t. prior versions of pandas. -- Added support for ``searchsorted()`` on `Categorical` class (:issue:`8420`). +- Added support for ``searchsorted()`` on ``Categorical`` class (:issue:`8420`). 
Other enhancements: @@ -171,7 +171,7 @@ Other enhancements: 3 False True False True 4 True True True True -- Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on `Timestamp` class (:issue:`5351`). +- Added support for ``utcfromtimestamp()``, ``fromtimestamp()``, and ``combine()`` on ``Timestamp`` class (:issue:`5351`). - Added Google Analytics (`pandas.io.ga`) basic documentation (:issue:`8835`). See `here `__. - ``Timedelta`` arithmetic returns ``NotImplemented`` in unknown cases, allowing extensions by custom classes (:issue:`8813`). - ``Timedelta`` now supports arithmetic with ``numpy.ndarray`` objects of the appropriate dtype (numpy 1.8 or newer only) (:issue:`8884`). @@ -241,7 +241,7 @@ Bug fixes - Bug in ``MultiIndex`` where ``__contains__`` returns wrong result if index is not lexically sorted or unique (:issue:`7724`) - BUG CSV: fix problem with trailing white space in skipped rows, (:issue:`8679`), (:issue:`8661`), (:issue:`8983`) - Regression in ``Timestamp`` does not parse 'Z' zone designator for UTC (:issue:`8771`) -- Bug in `StataWriter` the produces writes strings with 244 characters irrespective of actual size (:issue:`8969`) +- Bug in ``StataWriter`` the produces writes strings with 244 characters irrespective of actual size (:issue:`8969`) - Fixed ValueError raised by cummin/cummax when datetime64 Series contains NaT. (:issue:`8965`) - Bug in DataReader returns object dtype if there are missing values (:issue:`8980`) - Bug in plotting if sharex was enabled and index was a timeseries, would show labels on multiple axes (:issue:`3964`). diff --git a/doc/source/whatsnew/v0.16.0.rst b/doc/source/whatsnew/v0.16.0.rst index 4ad533e68e275..8d0d6854cbf85 100644 --- a/doc/source/whatsnew/v0.16.0.rst +++ b/doc/source/whatsnew/v0.16.0.rst @@ -89,7 +89,7 @@ See the :ref:`documentation ` for more. 
(:issue:`922 Interaction with scipy.sparse ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:issue:`8048`) for converting to and from ``scipy.sparse.coo_matrix`` instances (see :ref:`here `). For example, given a SparseSeries with MultiIndex we can convert to a `scipy.sparse.coo_matrix` by specifying the row and column labels as index levels: +Added :meth:`SparseSeries.to_coo` and :meth:`SparseSeries.from_coo` methods (:issue:`8048`) for converting to and from ``scipy.sparse.coo_matrix`` instances (see :ref:`here `). For example, given a SparseSeries with MultiIndex we can convert to a ``scipy.sparse.coo_matrix`` by specifying the row and column labels as index levels: .. code-block:: python @@ -630,7 +630,7 @@ Bug fixes - Bug in ``Series.values_counts`` with excluding ``NaN`` for categorical type ``Series`` with ``dropna=True`` (:issue:`9443`) - Fixed missing numeric_only option for ``DataFrame.std/var/sem`` (:issue:`9201`) - Support constructing ``Panel`` or ``Panel4D`` with scalar data (:issue:`8285`) -- ``Series`` text representation disconnected from `max_rows`/`max_columns` (:issue:`7508`). +- ``Series`` text representation disconnected from ``max_rows``/``max_columns`` (:issue:`7508`). \ diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index 8dcac4c1044be..a89ede8f024a0 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -232,7 +232,7 @@ enhancements make string operations easier and more consistent with standard pyt idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank']) idx.str.strip() - One special case for the `.str` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor + One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor will return a ``np.array`` instead of a boolean ``Index`` (:issue:`8875`). 
This enables the following expression to work naturally: @@ -310,7 +310,7 @@ Other enhancements - ``get_dummies`` function now accepts ``sparse`` keyword. If set to ``True``, the return ``DataFrame`` is sparse, e.g. ``SparseDataFrame``. (:issue:`8823`) - ``Period`` now accepts ``datetime64`` as value input. (:issue:`9054`) -- Allow timedelta string conversion when leading zero is missing from time definition, ie `0:00:00` vs `00:00:00`. (:issue:`9570`) +- Allow timedelta string conversion when leading zero is missing from time definition, ie ``0:00:00`` vs ``00:00:00``. (:issue:`9570`) - Allow ``Panel.shift`` with ``axis='items'`` (:issue:`9890`) - Trying to write an excel file now raises ``NotImplementedError`` if the ``DataFrame`` has a ``MultiIndex`` instead of writing a broken Excel file. (:issue:`9794`) @@ -329,11 +329,11 @@ Other enhancements API changes ~~~~~~~~~~~ -- When passing in an ax to ``df.plot( ..., ax=ax)``, the `sharex` kwarg will now default to `False`. +- When passing in an ax to ``df.plot( ..., ax=ax)``, the ``sharex`` kwarg will now default to ``False``. The result is that the visibility of xlabels and xticklabels will not anymore be changed. You have to do that by yourself for the right axes in your figure or set ``sharex=True`` explicitly (but this changes the visible for all axes in the figure, not only the one which is passed in!). - If pandas creates the subplots itself (e.g. no passed in `ax` kwarg), then the + If pandas creates the subplots itself (e.g. no passed in ``ax`` kwarg), then the default is still ``sharex=True`` and the visibility changes are applied. - :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously @@ -442,7 +442,7 @@ Bug fixes - Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. 
(:issue:`9832`) - Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`) - Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9785`) -- Bug in which ``SparseDataFrame`` could not take `nan` as a column name (:issue:`8822`) +- Bug in which ``SparseDataFrame`` could not take ``nan`` as a column name (:issue:`8822`) - Bug in ``to_msgpack`` and ``read_msgpack`` zlib and blosc compression support (:issue:`9783`) - Bug ``GroupBy.size`` doesn't attach index name properly if grouped by ``TimeGrouper`` (:issue:`9925`) - Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`) diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index a3c34db09f555..2cb0cbec68eff 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -89,7 +89,7 @@ See the :ref:`documentation ` for more. (:issue:`10129`) Other enhancements ^^^^^^^^^^^^^^^^^^ -- Added `rsplit` to Index/Series StringMethods (:issue:`10303`) +- Added ``rsplit`` to Index/Series StringMethods (:issue:`10303`) - Removed the hard-coded size limits on the ``DataFrame`` HTML representation in the IPython notebook, and leave this to IPython itself (only for IPython diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index db2790242412f..e8f37a72f6417 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -273,9 +273,9 @@ Support for math functions in .eval() df = pd.DataFrame({'a': np.random.randn(10)}) df.eval("b = sin(a)") -The support math functions are `sin`, `cos`, `exp`, `log`, `expm1`, `log1p`, -`sqrt`, `sinh`, `cosh`, `tanh`, `arcsin`, `arccos`, `arctan`, `arccosh`, -`arcsinh`, `arctanh`, `abs` and `arctan2`. 
+The supported math functions are ``sin``, ``cos``, ``exp``, ``log``, ``expm1``, ``log1p``, +``sqrt``, ``sinh``, ``cosh``, ``tanh``, ``arcsin``, ``arccos``, ``arctan``, ``arccosh``, +``arcsinh``, ``arctanh``, ``abs`` and ``arctan2``. These functions map to the intrinsics for the ``NumExpr`` engine. For the Python engine, they are mapped to ``NumPy`` calls. @@ -519,7 +519,7 @@ Other enhancements - ``DataFrame.apply`` will return a Series of dicts if the passed function returns a dict and ``reduce=True`` (:issue:`8735`). -- Allow passing `kwargs` to the interpolation methods (:issue:`10378`). +- Allow passing ``kwargs`` to the interpolation methods (:issue:`10378`). - Improved error message when concatenating an empty iterable of ``Dataframe`` objects (:issue:`9157`) diff --git a/doc/source/whatsnew/v0.18.0.rst b/doc/source/whatsnew/v0.18.0.rst index fbe24675ddfe2..ef5242b0e33c8 100644 --- a/doc/source/whatsnew/v0.18.0.rst +++ b/doc/source/whatsnew/v0.18.0.rst @@ -290,7 +290,7 @@ A new, friendlier ``ValueError`` is added to protect against the mistake of supp .. code-block:: ipython In [2]: pd.Series(['a', 'b', np.nan, 'c']).str.cat(' ') - ValueError: Did you mean to supply a `sep` keyword? + ValueError: Did you mean to supply a `sep` keyword? .. _whatsnew_0180.enhancements.rounding: diff --git a/doc/source/whatsnew/v0.19.1.rst b/doc/source/whatsnew/v0.19.1.rst index 9e6b884e08587..f8b60f457b33f 100644 --- a/doc/source/whatsnew/v0.19.1.rst +++ b/doc/source/whatsnew/v0.19.1.rst @@ -29,7 +29,7 @@ Performance improvements - Fixed performance regression in ``Series.asof(where)`` when ``where`` is a scalar (:issue:`14461`) - Improved performance in ``DataFrame.asof(where)`` when ``where`` is a scalar (:issue:`14461`) - Improved performance in ``.to_json()`` when ``lines=True`` (:issue:`14408`) -- Improved performance in certain types of `loc` indexing with a MultiIndex (:issue:`14551`).
+- Improved performance in certain types of ``loc`` indexing with a MultiIndex (:issue:`14551`). .. _whatsnew_0191.bug_fixes: diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index 61e92e2356da9..cb811fd83d90d 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -64,7 +64,7 @@ A ``DataFrame`` can now be written to and subsequently read back via JSON while new_df new_df.dtypes -Please note that the string `index` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. +Please note that the string ``index`` is not supported with the round trip format, as it is used by default in ``write_json`` to indicate a missing index name. .. ipython:: python :okwarning: @@ -457,7 +457,7 @@ These bugs were squashed: Previously, :meth:`Series.str.cat` did not -- in contrast to most of ``pandas`` -- align :class:`Series` on their index before concatenation (see :issue:`18657`). The method has now gained a keyword ``join`` to control the manner of alignment, see examples below and :ref:`here `. -In v.0.23 `join` will default to None (meaning no alignment), but this default will change to ``'left'`` in a future version of pandas. +In v.0.23 ``join`` will default to None (meaning no alignment), but this default will change to ``'left'`` in a future version of pandas. .. ipython:: python :okwarning: @@ -836,7 +836,7 @@ Build changes Index division by zero fills correctly ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and `0 / 0` with ``np.nan``. This matches existing ``Series`` behavior. 
(:issue:`19322`, :issue:`19347`) +Division operations on ``Index`` and subclasses will now fill division of positive numbers by zero with ``np.inf``, division of negative numbers by zero with ``-np.inf`` and ``0 / 0`` with ``np.nan``. This matches existing ``Series`` behavior. (:issue:`19322`, :issue:`19347`) Previous behavior: @@ -974,7 +974,7 @@ automatically so that the printed data frame fits within the current terminal width (``pd.options.display.max_columns=0``) (:issue:`17023`). If Python runs as a Jupyter kernel (such as the Jupyter QtConsole or a Jupyter notebook, as well as in many IDEs), this value cannot be inferred automatically and is thus -set to `20` as in previous versions. In a terminal, this results in a much +set to ``20`` as in previous versions. In a terminal, this results in a much nicer output: .. image:: ../_static/print_df_new.png @@ -1011,7 +1011,7 @@ Datetimelike API changes - Restricted ``DateOffset`` keyword arguments. Previously, ``DateOffset`` subclasses allowed arbitrary keyword arguments which could lead to unexpected behavior. Now, only valid arguments will be accepted. (:issue:`17176`, :issue:`18226`). 
- :func:`pandas.merge` provides a more informative error message when trying to merge on timezone-aware and timezone-naive columns (:issue:`15800`) - For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with ``freq=None``, addition or subtraction of integer-dtyped array or ``Index`` will raise ``NullFrequencyError`` instead of ``TypeError`` (:issue:`19895`) -- :class:`Timestamp` constructor now accepts a `nanosecond` keyword or positional argument (:issue:`18898`) +- :class:`Timestamp` constructor now accepts a ``nanosecond`` keyword or positional argument (:issue:`18898`) - :class:`DatetimeIndex` will now raise an ``AttributeError`` when the ``tz`` attribute is set after instantiation (:issue:`3746`) - :class:`DatetimeIndex` with a ``pytz`` timezone will now return a consistent ``pytz`` timezone (:issue:`18595`) @@ -1049,7 +1049,7 @@ Other API changes - :class:`DateOffset` objects render more simply, e.g. ```` instead of ```` (:issue:`19403`) - ``Categorical.fillna`` now validates its ``value`` and ``method`` keyword arguments. 
It now raises when both or none are specified, matching the behavior of :meth:`Series.fillna` (:issue:`19682`) - ``pd.to_datetime('today')`` now returns a datetime, consistent with ``pd.Timestamp('today')``; previously ``pd.to_datetime('today')`` returned a ``.normalized()`` datetime (:issue:`19935`) -- :func:`Series.str.replace` now takes an optional `regex` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`) +- :func:`Series.str.replace` now takes an optional ``regex`` keyword which, when set to ``False``, uses literal string replacement rather than regex replacement (:issue:`16808`) - :func:`DatetimeIndex.strftime` and :func:`PeriodIndex.strftime` now return an ``Index`` instead of a numpy array to be consistent with similar accessors (:issue:`20127`) - Constructing a Series from a list of length 1 no longer broadcasts this list when a longer index is specified (:issue:`19714`, :issue:`20391`). - :func:`DataFrame.to_dict` with ``orient='index'`` no longer casts int columns to float for a DataFrame with only int and float columns (:issue:`18580`) @@ -1234,7 +1234,7 @@ Categorical - Bug in ``Categorical.__iter__`` not converting to Python types (:issue:`19909`) - Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. 
This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`) - Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`) -- Bug in :meth:`Series.take` with categorical data interpreting ``-1`` in `indices` as missing value markers, rather than the last element of the Series (:issue:`20664`) +- Bug in :meth:`Series.take` with categorical data interpreting ``-1`` in ``indices`` as missing value markers, rather than the last element of the Series (:issue:`20664`) Datetimelike ^^^^^^^^^^^^ @@ -1316,7 +1316,7 @@ Numeric Strings ^^^^^^^ -- Bug in :func:`Series.str.get` with a dictionary in the values and the index not in the keys, raising `KeyError` (:issue:`20671`) +- Bug in :func:`Series.str.get` with a dictionary in the values and the index not in the keys, raising ``KeyError`` (:issue:`20671`) Indexing @@ -1369,7 +1369,7 @@ IO ^^ - :func:`read_html` now rewinds seekable IO objects after parse failure, before attempting to parse with a new parser. If a parser errors and the object is non-seekable, an informative error is raised suggesting the use of a different parser (:issue:`17975`) -- :meth:`DataFrame.to_html` now has an option to add an id to the leading `` tag (:issue:`8496`) +- :meth:`DataFrame.to_html` now has an option to add an id to the leading ``
    `` tag (:issue:`8496`) - Bug in :func:`read_msgpack` with a non existent file is passed in Python 2 (:issue:`15296`) - Bug in :func:`read_csv` where a ``MultiIndex`` with duplicate columns was not being mangled appropriately (:issue:`18062`) - Bug in :func:`read_csv` where missing values were not being handled properly when ``keep_default_na=False`` with dictionary ``na_values`` (:issue:`19227`) @@ -1378,7 +1378,7 @@ IO - Bug in :func:`DataFrame.to_latex()` where pairs of braces meant to serve as invisible placeholders were escaped (:issue:`18667`) - Bug in :func:`DataFrame.to_latex()` where a ``NaN`` in a ``MultiIndex`` would cause an ``IndexError`` or incorrect output (:issue:`14249`) - Bug in :func:`DataFrame.to_latex()` where a non-string index-level name would result in an ``AttributeError`` (:issue:`19981`) -- Bug in :func:`DataFrame.to_latex()` where the combination of an index name and the `index_names=False` option would result in incorrect output (:issue:`18326`) +- Bug in :func:`DataFrame.to_latex()` where the combination of an index name and the ``index_names=False`` option would result in incorrect output (:issue:`18326`) - Bug in :func:`DataFrame.to_latex()` where a ``MultiIndex`` with an empty string as its name would result in incorrect output (:issue:`18669`) - Bug in :func:`DataFrame.to_latex()` where missing space characters caused wrong escaping and produced non-valid latex in some cases (:issue:`20859`) - Bug in :func:`read_json` where large numeric values were causing an ``OverflowError`` (:issue:`18842`) @@ -1412,7 +1412,7 @@ GroupBy/resample/rolling - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) - Bug in :func:`DataFrame.groupby` where aggregation by ``first``/``last``/``min``/``max`` was causing timestamps to lose precision (:issue:`19526`) - Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to 
match the dtype(s) of the grouped data (:issue:`19200`) -- Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) +- Bug in :func:`DataFrame.groupby` passing the ``on=`` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) - Bug in :func:`DataFrame.resample().aggregate ` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) - Bug in :func:`DataFrameGroupBy.cumsum` and :func:`DataFrameGroupBy.cumprod` when ``skipna`` was passed (:issue:`19806`) - Bug in :func:`DataFrame.resample` that dropped timezone information (:issue:`13238`) diff --git a/doc/source/whatsnew/v0.23.1.rst b/doc/source/whatsnew/v0.23.1.rst index 03b7d9db6bc63..b51368c87f991 100644 --- a/doc/source/whatsnew/v0.23.1.rst +++ b/doc/source/whatsnew/v0.23.1.rst @@ -74,10 +74,10 @@ In addition, ordering comparisons will raise a ``TypeError`` in the future. a tz-aware time instead of tz-naive (:issue:`21267`) and :attr:`DatetimeIndex.date` returned incorrect date when the input date has a non-UTC timezone (:issue:`21230`). - Fixed regression in :meth:`pandas.io.json.json_normalize` when called with ``None`` values - in nested levels in JSON, and to not drop keys with value as `None` (:issue:`21158`, :issue:`21356`). + in nested levels in JSON, and to not drop keys with value as ``None`` (:issue:`21158`, :issue:`21356`). 
- Bug in :meth:`~DataFrame.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`) - Bug preventing pandas from being importable with -OO optimization (:issue:`21071`) -- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when `value` the individual categories are iterable and `value` is an iterable (:issue:`21097`, :issue:`19788`) +- Bug in :meth:`Categorical.fillna` incorrectly raising a ``TypeError`` when ``value`` the individual categories are iterable and ``value`` is an iterable (:issue:`21097`, :issue:`19788`) - Fixed regression in constructors coercing NA values like ``None`` to strings when passing ``dtype=str`` (:issue:`21083`) - Regression in :func:`pivot_table` where an ordered ``Categorical`` with missing values for the pivot's ``index`` would give a mis-aligned result (:issue:`21133`) @@ -106,7 +106,7 @@ Bug fixes **Data-type specific** -- Bug in :meth:`Series.str.replace()` where the method throws `TypeError` on Python 3.5.2 (:issue:`21078`) +- Bug in :meth:`Series.str.replace()` where the method throws ``TypeError`` on Python 3.5.2 (:issue:`21078`) - Bug in :class:`Timedelta` where passing a float with a unit would prematurely round the float precision (:issue:`14156`) - Bug in :func:`pandas.testing.assert_index_equal` which raised ``AssertionError`` incorrectly, when comparing two :class:`CategoricalIndex` objects with param ``check_categorical=False`` (:issue:`19776`) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 27cbdc9169965..9a2e96f717d9b 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -376,7 +376,7 @@ Other enhancements - :func:`DataFrame.to_html` now accepts ``render_links`` as an argument, allowing the user to generate HTML with links to any URLs that appear in the DataFrame. See the :ref:`section on writing HTML ` in the IO docs for example usage. 
(:issue:`2679`) - :func:`pandas.read_csv` now supports pandas extension types as an argument to ``dtype``, allowing the user to use pandas extension types when reading CSVs. (:issue:`23228`) -- The :meth:`~DataFrame.shift` method now accepts `fill_value` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`) +- The :meth:`~DataFrame.shift` method now accepts ``fill_value`` as an argument, allowing the user to specify a value which will be used instead of NA/NaT in the empty periods. (:issue:`15486`) - :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) - :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) - :func:`DataFrame.to_csv` and :func:`Series.to_csv` now support the ``compression`` keyword when a file handle is passed. (:issue:`21227`) @@ -474,8 +474,8 @@ and replaced it with references to ``pyarrow`` (:issue:`21639` and :issue:`23053 .. _whatsnew_0240.api_breaking.csv_line_terminator: -`os.linesep` is used for ``line_terminator`` of ``DataFrame.to_csv`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``os.linesep`` is used for ``line_terminator`` of ``DataFrame.to_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :func:`DataFrame.to_csv` now uses :func:`os.linesep` rather than ``'\n'`` for the default line terminator (:issue:`20353`). @@ -556,8 +556,8 @@ You must pass in the ``line_terminator`` explicitly, even in this case. .. 
_whatsnew_0240.bug_fixes.nan_with_str_dtype: -Proper handling of `np.NaN` in a string data-typed column with the Python engine -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Proper handling of ``np.NaN`` in a string data-typed column with the Python engine +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ There was bug in :func:`read_excel` and :func:`read_csv` with the Python engine, where missing values turned to ``'nan'`` with ``dtype=str`` and @@ -1198,7 +1198,7 @@ Other API changes - :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, raises a ``ValueError`` for incorrect types, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`) - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) +- :class:`DateOffset` attribute ``_cacheable`` and method ``_should_cache`` have been removed (:issue:`23118`) - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`). - :meth:`Categorical.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23466`). - :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather that a ``ValueError``, if a searched for key is not found in its categories (:issue:`23466`). @@ -1317,7 +1317,7 @@ Deprecations - Timezone converting a tz-aware ``datetime.datetime`` or :class:`Timestamp` with :class:`Timestamp` and the ``tz`` argument is now deprecated. 
Instead, use :meth:`Timestamp.tz_convert` (:issue:`23579`) - :func:`pandas.api.types.is_period` is deprecated in favor of ``pandas.api.types.is_period_dtype`` (:issue:`23917`) - :func:`pandas.api.types.is_datetimetz` is deprecated in favor of ``pandas.api.types.is_datetime64tz`` (:issue:`23917`) -- Creating a :class:`TimedeltaIndex`, :class:`DatetimeIndex`, or :class:`PeriodIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range`, :func:`date_range`, or :func:`period_range` (:issue:`23919`) +- Creating a :class:`TimedeltaIndex`, :class:`DatetimeIndex`, or :class:`PeriodIndex` by passing range arguments ``start``, ``end``, and ``periods`` is deprecated in favor of :func:`timedelta_range`, :func:`date_range`, or :func:`period_range` (:issue:`23919`) - Passing a string alias like ``'datetime64[ns, UTC]'`` as the ``unit`` parameter to :class:`DatetimeTZDtype` is deprecated. Use :class:`DatetimeTZDtype.construct_from_string` instead (:issue:`23990`). - The ``skipna`` parameter of :meth:`~pandas.api.types.infer_dtype` will switch to ``True`` by default in a future version of pandas (:issue:`17066`, :issue:`24050`) - In :meth:`Series.where` with Categorical data, providing an ``other`` that is not present in the categories is deprecated. Convert the categorical to a different dtype or add the ``other`` to the categories first (:issue:`24077`). 
@@ -1534,7 +1534,7 @@ Performance improvements - Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) - Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) - Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) -- Improved performance of :func:`concat` for `Series` objects (:issue:`23404`) +- Improved performance of :func:`concat` for ``Series`` objects (:issue:`23404`) - Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`) - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`) - Fixed a performance regression on Windows with Python 3.7 of :func:`read_csv` (:issue:`23516`) @@ -1602,7 +1602,7 @@ Datetimelike - Bug in :class:`DataFrame` when creating a new column from an ndarray of :class:`Timestamp` objects with timezones creating an object-dtype column, rather than datetime with timezone (:issue:`23932`) - Bug in :class:`Timestamp` constructor which would drop the frequency of an input :class:`Timestamp` (:issue:`22311`) - Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`) -- Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`) +- Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and ``dtype=object`` would incorrectly raise a ``ValueError`` (:issue:`23524`) - Bug in :class:`Index` where calling ``np.array(dtindex, dtype=object)`` on a timezone-naive :class:`DatetimeIndex` would return an array of ``datetime`` objects instead of :class:`Timestamp` objects, potentially losing nanosecond portions of the timestamps (:issue:`23524`) - Bug in 
:class:`Categorical.__setitem__` not allowing setting with another ``Categorical`` when both are unordered and have the same categories, but in a different order (:issue:`24142`) - Bug in :func:`date_range` where using dates with millisecond resolution or higher could return incorrect values or the wrong number of values in the index (:issue:`24110`) @@ -1647,7 +1647,7 @@ Timezones - Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) - Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) - Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) -- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) +- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show ``first`` and ``last`` result (:issue:`21328`) - Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware ``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) - Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) - Bug in :func:`DataFrame.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) @@ -1693,7 +1693,7 @@ Numeric - :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`) - Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2\ :sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`) - Calls such as :meth:`DataFrame.round` with a non-unique :meth:`CategoricalIndex` now return expected data. Previously, data would be improperly duplicated (:issue:`21809`). 
-- Added ``log10``, `floor` and `ceil` to the list of supported functions in :meth:`DataFrame.eval` (:issue:`24139`, :issue:`24353`) +- Added ``log10``, ``floor`` and ``ceil`` to the list of supported functions in :meth:`DataFrame.eval` (:issue:`24139`, :issue:`24353`) - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) - Checking PEP 3141 numbers in :func:`~pandas.api.types.is_scalar` function returns ``True`` (:issue:`22903`) - Reduction methods like :meth:`Series.sum` now accept the default value of ``keepdims=False`` when called from a NumPy ufunc, rather than raising a ``TypeError``. Full support for ``keepdims`` has not been implemented (:issue:`24356`). @@ -1859,7 +1859,7 @@ Reshaping ^^^^^^^^^ - Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) -- Bug in :func:`pandas.concat` when joining only `Series` the `names` argument of `concat` is no longer ignored (:issue:`23490`) +- Bug in :func:`pandas.concat` when joining only ``Series`` the ``names`` argument of ``concat`` is no longer ignored (:issue:`23490`) - Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) - Bug in :meth:`DataFrame.where` with an empty DataFrame and empty ``cond`` having non-bool dtype (:issue:`21947`) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index aead8c48eb9b7..1918a1e8caf6c 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -33,7 +33,7 @@ This change will allow ``sort=True`` to mean "always sort" in a future release. The same change applies to :meth:`Index.difference` and :meth:`Index.symmetric_difference`, which would not sort the result when the values could not be compared. 
-The `sort` option for :meth:`Index.intersection` has changed in three ways. +The ``sort`` option for :meth:`Index.intersection` has changed in three ways. 1. The default has changed from ``True`` to ``False``, to restore the pandas 0.23.4 and earlier behavior of not sorting by default. @@ -55,7 +55,7 @@ Fixed regressions - Fixed regression in :class:`Index.intersection` incorrectly sorting the values by default (:issue:`24959`). - Fixed regression in :func:`merge` when merging an empty ``DataFrame`` with multiple timezone-aware columns on one of the timezone-aware columns (:issue:`25014`). - Fixed regression in :meth:`Series.rename_axis` and :meth:`DataFrame.rename_axis` where passing ``None`` failed to remove the axis name (:issue:`25034`) -- Fixed regression in :func:`to_timedelta` with `box=False` incorrectly returning a ``datetime64`` object instead of a ``timedelta64`` object (:issue:`24961`) +- Fixed regression in :func:`to_timedelta` with ``box=False`` incorrectly returning a ``datetime64`` object instead of a ``timedelta64`` object (:issue:`24961`) - Fixed regression where custom hashable types could not be used as column keys in :meth:`DataFrame.set_index` (:issue:`24969`) .. _whatsnew_0241.bug_fixes: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 0f0f009307c75..7b4440148677b 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -14,7 +14,7 @@ What's new in 0.25.0 (July 18, 2019) .. warning:: - `Panel` has been fully removed. For N-D labeled data structures, please + ``Panel`` has been fully removed. For N-D labeled data structures, please use `xarray `_ .. 
warning:: @@ -1167,7 +1167,7 @@ I/O - Fixed bug in :func:`pandas.read_csv` where a BOM would result in incorrect parsing using engine='python' (:issue:`26545`) - :func:`read_excel` now raises a ``ValueError`` when input is of type :class:`pandas.io.excel.ExcelFile` and ``engine`` param is passed since :class:`pandas.io.excel.ExcelFile` has an engine defined (:issue:`26566`) - Bug while selecting from :class:`HDFStore` with ``where=''`` specified (:issue:`26610`). -- Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. `PeriodIndex`) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) +- Fixed bug in :func:`DataFrame.to_excel()` where custom objects (i.e. ``PeriodIndex``) inside merged cells were not being converted into types safe for the Excel writer (:issue:`27006`) - Bug in :meth:`read_hdf` where reading a timezone aware :class:`DatetimeIndex` would raise a ``TypeError`` (:issue:`11926`) - Bug in :meth:`to_msgpack` and :meth:`read_msgpack` which would raise a ``ValueError`` rather than a ``FileNotFoundError`` for an invalid path (:issue:`27160`) - Fixed bug in :meth:`DataFrame.to_parquet` which would raise a ``ValueError`` when the dataframe had no columns (:issue:`27339`) @@ -1262,7 +1262,7 @@ Other - Removed unused C functions from vendored UltraJSON implementation (:issue:`26198`) - Allow :class:`Index` and :class:`RangeIndex` to be passed to numpy ``min`` and ``max`` functions (:issue:`26125`) - Use actual class name in repr of empty objects of a ``Series`` subclass (:issue:`27001`). -- Bug in :class:`DataFrame` where passing an object array of timezone-aware `datetime` objects would incorrectly raise ``ValueError`` (:issue:`13287`) +- Bug in :class:`DataFrame` where passing an object array of timezone-aware ``datetime`` objects would incorrectly raise ``ValueError`` (:issue:`13287`) .. 
_whatsnew_0.250.contributors: diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 944021ca0fcae..2a2b511356a69 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -9,10 +9,10 @@ including other versions of pandas. I/O and LZMA ~~~~~~~~~~~~ -Some users may unknowingly have an incomplete Python installation lacking the `lzma` module from the standard library. In this case, `import pandas` failed due to an `ImportError` (:issue:`27575`). -Pandas will now warn, rather than raising an `ImportError` if the `lzma` module is not present. Any subsequent attempt to use `lzma` methods will raise a `RuntimeError`. -A possible fix for the lack of the `lzma` module is to ensure you have the necessary libraries and then re-install Python. -For example, on MacOS installing Python with `pyenv` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like `xz`). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. +Some users may unknowingly have an incomplete Python installation lacking the ``lzma`` module from the standard library. In this case, ``import pandas`` failed due to an ``ImportError`` (:issue:`27575`). +Pandas will now warn, rather than raising an ``ImportError`` if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``. +A possible fix for the lack of the ``lzma`` module is to ensure you have the necessary libraries and then re-install Python. +For example, on MacOS installing Python with ``pyenv`` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like ``xz``). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. .. 
_whatsnew_0251.bug_fixes: @@ -52,7 +52,7 @@ Conversion Interval ^^^^^^^^ -- Bug in :class:`IntervalIndex` where `dir(obj)` would raise ``ValueError`` (:issue:`27571`) +- Bug in :class:`IntervalIndex` where ``dir(obj)`` would raise ``ValueError`` (:issue:`27571`) Indexing ^^^^^^^^ @@ -89,13 +89,13 @@ Groupby/resample/rolling - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) - Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`) - Bug in windowing over read-only arrays (:issue:`27766`) -- Fixed segfault in `pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`) +- Fixed segfault in ``pandas.core.groupby.DataFrameGroupBy.quantile`` when an invalid quantile was passed (:issue:`27470`) Reshaping ^^^^^^^^^ - A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`) -- Bug :meth:`merge_asof` could not merge :class:`Timedelta` objects when passing `tolerance` kwarg (:issue:`27642`) +- Bug :meth:`merge_asof` could not merge :class:`Timedelta` objects when passing ``tolerance`` kwarg (:issue:`27642`) - Bug in :meth:`DataFrame.crosstab` when ``margins`` set to ``True`` and ``normalize`` is not ``False``, an error is raised. 
(:issue:`27500`) - :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`) - Bug in :meth:`DataFrame.join` raising with readonly arrays (:issue:`27943`) diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index f984b9ad71b63..1cb9dcbe159aa 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -52,7 +52,7 @@ New features Performance enhancements ~~~~~~~~~~~~~~~~~~~~~~~~ - VBENCH Cythonized ``cache_readonly``, resulting in substantial micro-performance enhancements throughout the code base (:issue:`361`) -- VBENCH Special Cython matrix iterator for applying arbitrary reduction operations with 3-5x better performance than `np.apply_along_axis` (:issue:`309`) +- VBENCH Special Cython matrix iterator for applying arbitrary reduction operations with 3-5x better performance than ``np.apply_along_axis`` (:issue:`309`) - VBENCH Improved performance of ``MultiIndex.from_tuples`` - VBENCH Special Cython matrix iterator for applying arbitrary reduction operations - VBENCH + DOCUMENT Add ``raw`` option to ``DataFrame.apply`` for getting better performance when diff --git a/doc/source/whatsnew/v0.6.1.rst b/doc/source/whatsnew/v0.6.1.rst index 8eea0a07f1f79..8ee80fa2c44b1 100644 --- a/doc/source/whatsnew/v0.6.1.rst +++ b/doc/source/whatsnew/v0.6.1.rst @@ -16,12 +16,12 @@ New features - Add PyQt table widget to sandbox (:issue:`435`) - DataFrame.align can :ref:`accept Series arguments ` and an :ref:`axis option ` (:issue:`461`) -- Implement new :ref:`SparseArray ` and `SparseList` +- Implement new :ref:`SparseArray ` and ``SparseList`` data structures. 
SparseSeries now derives from SparseArray (:issue:`463`) - :ref:`Better console printing options ` (:issue:`453`) - Implement fast :ref:`data ranking ` for Series and DataFrame, fast versions of scipy.stats.rankdata (:issue:`428`) -- Implement `DataFrame.from_items` alternate +- Implement ``DataFrame.from_items`` alternate constructor (:issue:`444`) - DataFrame.convert_objects method for :ref:`inferring better dtypes ` for object columns (:issue:`302`) @@ -37,7 +37,7 @@ New features Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Improve memory usage of `DataFrame.describe` (do not copy data +- Improve memory usage of ``DataFrame.describe`` (do not copy data unnecessarily) (PR #425) - Optimize scalar value lookups in the general case by 25% or more in Series diff --git a/doc/source/whatsnew/v0.7.0.rst b/doc/source/whatsnew/v0.7.0.rst index a193b8049e951..2fe686d8858a2 100644 --- a/doc/source/whatsnew/v0.7.0.rst +++ b/doc/source/whatsnew/v0.7.0.rst @@ -20,7 +20,7 @@ New features ``DataFrame.append`` (:issue:`468`, :issue:`479`, :issue:`273`) - :ref:`Can ` pass multiple DataFrames to - `DataFrame.append` to concatenate (stack) and multiple Series to + ``DataFrame.append`` to concatenate (stack) and multiple Series to ``Series.append`` too - :ref:`Can` pass list of dicts (e.g., a @@ -282,7 +282,7 @@ Performance improvements - Substantially improve performance of multi-GroupBy aggregation when a Python function is passed, reuse ndarray object in Cython (:issue:`496`) - Can store objects indexed by tuples and floats in HDFStore (:issue:`492`) -- Don't print length by default in Series.to_string, add `length` option (:issue:`489`) +- Don't print length by default in Series.to_string, add ``length`` option (:issue:`489`) - Improve Cython code for multi-groupby to aggregate without having to sort the data (:issue:`93`) - Improve MultiIndex reindexing speed by storing tuples in the MultiIndex, diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst 
index 2a49315cc3b12..9bba68d8c331d 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -69,15 +69,15 @@ Time Series changes and improvements series. Replaces now deprecated DateRange class - New ``PeriodIndex`` and ``Period`` classes for representing :ref:`time spans ` and performing **calendar logic**, - including the `12 fiscal quarterly frequencies `. + including the ``12 fiscal quarterly frequencies ``. This is a partial port of, and a substantial enhancement to, elements of the scikits.timeseries code base. Support for conversion between PeriodIndex and DatetimeIndex -- New Timestamp data type subclasses `datetime.datetime`, providing the same +- New Timestamp data type subclasses ``datetime.datetime``, providing the same interface while enabling working with nanosecond-resolution data. Also provides :ref:`easy time zone conversions `. - Enhanced support for :ref:`time zones `. Add - `tz_convert` and ``tz_localize`` methods to TimeSeries and DataFrame. All + ``tz_convert`` and ``tz_localize`` methods to TimeSeries and DataFrame. All timestamps are stored as UTC; Timestamps from DatetimeIndex objects with time zone set will be localized to local time. Time zone conversions are therefore essentially free. 
User needs to know very little about pytz library now; only @@ -91,7 +91,7 @@ Time Series changes and improvements matplotlib-based plotting code - New ``date_range``, ``bdate_range``, and ``period_range`` :ref:`factory functions ` -- Robust **frequency inference** function `infer_freq` and ``inferred_freq`` +- Robust **frequency inference** function ``infer_freq`` and ``inferred_freq`` property of DatetimeIndex, with option to infer frequency on construction of DatetimeIndex - to_datetime function efficiently **parses array of strings** to diff --git a/doc/source/whatsnew/v0.9.0.rst b/doc/source/whatsnew/v0.9.0.rst index 565b965c116db..5172b1989765d 100644 --- a/doc/source/whatsnew/v0.9.0.rst +++ b/doc/source/whatsnew/v0.9.0.rst @@ -8,7 +8,7 @@ Version 0.9.0 (October 7, 2012) This is a major release from 0.8.1 and includes several new features and enhancements along with a large number of bug fixes. New features include -vectorized unicode encoding/decoding for `Series.str`, `to_latex` method to +vectorized unicode encoding/decoding for ``Series.str``, ``to_latex`` method to DataFrame, more flexible parsing of boolean values, and enabling the download of options data from Yahoo! Finance. diff --git a/doc/source/whatsnew/v0.9.1.rst b/doc/source/whatsnew/v0.9.1.rst index 3b2924d175cdf..6b05e5bcded7e 100644 --- a/doc/source/whatsnew/v0.9.1.rst +++ b/doc/source/whatsnew/v0.9.1.rst @@ -15,7 +15,7 @@ DataFrame. New features ~~~~~~~~~~~~ - - `Series.sort`, `DataFrame.sort`, and `DataFrame.sort_index` can now be + - ``Series.sort``, ``DataFrame.sort``, and ``DataFrame.sort_index`` can now be specified in a per-column manner to support multiple sort orders (:issue:`928`) .. 
code-block:: ipython @@ -34,8 +34,8 @@ New features 1 1 0 0 5 1 0 0 - - `DataFrame.rank` now supports additional argument values for the - `na_option` parameter so missing values can be assigned either the largest + - ``DataFrame.rank`` now supports additional argument values for the + ``na_option`` parameter so missing values can be assigned either the largest or the smallest rank (:issue:`1508`, :issue:`2159`) .. ipython:: python @@ -51,10 +51,10 @@ New features df.rank(na_option='bottom') - - DataFrame has new `where` and `mask` methods to select values according to a + - DataFrame has new ``where`` and ``mask`` methods to select values according to a given boolean mask (:issue:`2109`, :issue:`2151`) - DataFrame currently supports slicing via a boolean vector the same length as the DataFrame (inside the `[]`). + DataFrame currently supports slicing via a boolean vector the same length as the DataFrame (inside the ``[]``). The returned DataFrame has the same number of columns as the original, but is sliced on its index. .. ipython:: python @@ -67,8 +67,8 @@ New features If a DataFrame is sliced with a DataFrame based boolean condition (with the same size as the original DataFrame), then a DataFrame the same size (index and columns) as the original is returned, with - elements that do not meet the boolean condition as `NaN`. This is accomplished via - the new method `DataFrame.where`. In addition, `where` takes an optional `other` argument for replacement. + elements that do not meet the boolean condition as ``NaN``. This is accomplished via + the new method ``DataFrame.where``. In addition, ``where`` takes an optional ``other`` argument for replacement. .. ipython:: python @@ -78,8 +78,8 @@ New features df.where(df>0,-df) - Furthermore, `where` now aligns the input boolean condition (ndarray or DataFrame), such that partial selection - with setting is possible. 
This is analogous to partial setting via `.ix` (but on the contents rather than the axis labels) + Furthermore, ``where`` now aligns the input boolean condition (ndarray or DataFrame), such that partial selection + with setting is possible. This is analogous to partial setting via ``.ix`` (but on the contents rather than the axis labels) .. ipython:: python @@ -87,7 +87,7 @@ New features df2[ df2[1:4] > 0 ] = 3 df2 - `DataFrame.mask` is the inverse boolean operation of `where`. + ``DataFrame.mask`` is the inverse boolean operation of ``where``. .. ipython:: python @@ -103,9 +103,9 @@ New features - Added option to disable pandas-style tick locators and formatters - using `series.plot(x_compat=True)` or `pandas.plot_params['x_compat'] = - True` (:issue:`2205`) - - Existing TimeSeries methods `at_time` and `between_time` were added to + using ``series.plot(x_compat=True)`` or ``pandas.plot_params['x_compat'] = + True`` (:issue:`2205`) + - Existing TimeSeries methods ``at_time`` and ``between_time`` were added to DataFrame (:issue:`2149`) - DataFrame.dot can now accept ndarrays (:issue:`2042`) - DataFrame.drop now supports non-unique indexes (:issue:`2101`) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 4f0ca97310d85..32175d344c320 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -250,7 +250,7 @@ Other enhancements - :func:`read_excel` now can read binary Excel (``.xlsb``) files by passing ``engine='pyxlsb'``. For more details and example usage, see the :ref:`Binary Excel files documentation `. Closes :issue:`8540`. 
- The ``partition_cols`` argument in :meth:`DataFrame.to_parquet` now accepts a string (:issue:`27117`) - :func:`pandas.read_json` now parses ``NaN``, ``Infinity`` and ``-Infinity`` (:issue:`12213`) -- DataFrame constructor preserve `ExtensionArray` dtype with `ExtensionArray` (:issue:`11363`) +- DataFrame constructor preserve ``ExtensionArray`` dtype with ``ExtensionArray`` (:issue:`11363`) - :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`) - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`) - :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`) @@ -610,7 +610,7 @@ When :class:`Categorical` contains ``np.nan``, Default dtype of empty :class:`pandas.Series` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Initialising an empty :class:`pandas.Series` without specifying a dtype will raise a `DeprecationWarning` now +Initialising an empty :class:`pandas.Series` without specifying a dtype will raise a ``DeprecationWarning`` now (:issue:`17261`). The default dtype will change from ``float64`` to ``object`` in future releases so that it is consistent with the behaviour of :class:`DataFrame` and :class:`Index`. @@ -974,7 +974,7 @@ or ``matplotlib.Axes.plot``. See :ref:`plotting.formatters` for more. - The 'outer' method on Numpy ufuncs, e.g. 
``np.subtract.outer`` operating on :class:`Series` objects is no longer supported, and will raise ``NotImplementedError`` (:issue:`27198`) - Removed ``Series.get_dtype_counts`` and ``DataFrame.get_dtype_counts`` (:issue:`27145`) - Changed the default "fill_value" argument in :meth:`Categorical.take` from ``True`` to ``False`` (:issue:`20841`) -- Changed the default value for the `raw` argument in :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` from ``None`` to ``False`` (:issue:`20584`) +- Changed the default value for the ``raw`` argument in :func:`Series.rolling().apply() `, :func:`DataFrame.rolling().apply() `, :func:`Series.expanding().apply() `, and :func:`DataFrame.expanding().apply() ` from ``None`` to ``False`` (:issue:`20584`) - Removed deprecated behavior of :meth:`Series.argmin` and :meth:`Series.argmax`, use :meth:`Series.idxmin` and :meth:`Series.idxmax` for the old behavior (:issue:`16955`) - Passing a tz-aware ``datetime.datetime`` or :class:`Timestamp` into the :class:`Timestamp` constructor with the ``tz`` argument now raises a ``ValueError`` (:issue:`23621`) - Removed ``Series.base``, ``Index.base``, ``Categorical.base``, ``Series.flags``, ``Index.flags``, ``PeriodArray.flags``, ``Series.strides``, ``Index.strides``, ``Series.itemsize``, ``Index.itemsize``, ``Series.data``, ``Index.data`` (:issue:`20721`) @@ -1058,7 +1058,7 @@ Datetimelike - Bug in :class:`Series` and :class:`DataFrame` with integer dtype failing to raise ``TypeError`` when adding or subtracting a ``np.datetime64`` object (:issue:`28080`) - Bug in :meth:`Series.astype`, :meth:`Index.astype`, and :meth:`DataFrame.astype` failing to handle ``NaT`` when casting to an integer dtype (:issue:`28492`) - Bug in :class:`Week` with ``weekday`` incorrectly raising ``AttributeError`` instead of ``TypeError`` when adding or subtracting an invalid type (:issue:`28530`) -- Bug in 
:class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype `'timedelta64[ns]'` (:issue:`28049`) +- Bug in :class:`DataFrame` arithmetic operations when operating with a :class:`Series` with dtype ``'timedelta64[ns]'`` (:issue:`28049`) - Bug in :func:`core.groupby.generic.SeriesGroupBy.apply` raising ``ValueError`` when a column in the original DataFrame is a datetime and the column labels are not standard integers (:issue:`28247`) - Bug in :func:`pandas._config.localization.get_locales` where the ``locales -a`` encodes the locales list as windows-1252 (:issue:`23638`, :issue:`24760`, :issue:`27368`) - Bug in :meth:`Series.var` failing to raise ``TypeError`` when called with ``timedelta64[ns]`` dtype (:issue:`28289`) @@ -1066,7 +1066,7 @@ Datetimelike - Bug in masking datetime-like arrays with a boolean mask of an incorrect length not raising an ``IndexError`` (:issue:`30308`) - Bug in :attr:`Timestamp.resolution` being a property instead of a class attribute (:issue:`29910`) - Bug in :func:`pandas.to_datetime` when called with ``None`` raising ``TypeError`` instead of returning ``NaT`` (:issue:`30011`) -- Bug in :func:`pandas.to_datetime` failing for `deques` when using ``cache=True`` (the default) (:issue:`29403`) +- Bug in :func:`pandas.to_datetime` failing for ``deques`` when using ``cache=True`` (the default) (:issue:`29403`) - Bug in :meth:`Series.item` with ``datetime64`` or ``timedelta64`` dtype, :meth:`DatetimeIndex.item`, and :meth:`TimedeltaIndex.item` returning an integer instead of a :class:`Timestamp` or :class:`Timedelta` (:issue:`30175`) - Bug in :class:`DatetimeIndex` addition when adding a non-optimized :class:`DateOffset` incorrectly dropping timezone information (:issue:`30336`) - Bug in :meth:`DataFrame.drop` where attempting to drop non-existent values from a DatetimeIndex would yield a confusing error message (:issue:`30399`) @@ -1095,10 +1095,10 @@ Numeric ^^^^^^^ - Bug in :meth:`DataFrame.quantile` with 
zero-column :class:`DataFrame` incorrectly raising (:issue:`23925`) - :class:`DataFrame` flex inequality comparisons methods (:meth:`DataFrame.lt`, :meth:`DataFrame.le`, :meth:`DataFrame.gt`, :meth:`DataFrame.ge`) with object-dtype and ``complex`` entries failing to raise ``TypeError`` like their :class:`Series` counterparts (:issue:`28079`) -- Bug in :class:`DataFrame` logical operations (`&`, `|`, `^`) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) +- Bug in :class:`DataFrame` logical operations (``&``, ``|``, ``^``) not matching :class:`Series` behavior by filling NA values (:issue:`28741`) - Bug in :meth:`DataFrame.interpolate` where specifying axis by name references variable before it is assigned (:issue:`29142`) - Bug in :meth:`Series.var` not computing the right value with a nullable integer dtype series not passing through ddof argument (:issue:`29128`) -- Improved error message when using `frac` > 1 and `replace` = False (:issue:`27451`) +- Improved error message when using ``frac`` > 1 and ``replace`` = False (:issue:`27451`) - Bug in numeric indexes resulted in it being possible to instantiate an :class:`Int64Index`, :class:`UInt64Index`, or :class:`Float64Index` with an invalid dtype (e.g. 
datetime-like) (:issue:`29539`) - Bug in :class:`UInt64Index` precision loss while constructing from a list with values in the ``np.uint64`` range (:issue:`29526`) - Bug in :class:`NumericIndex` construction that caused indexing to fail when integers in the ``np.uint64`` range were used (:issue:`28023`) @@ -1137,8 +1137,8 @@ Indexing - Bug in assignment using a reverse slicer (:issue:`26939`) - Bug in :meth:`DataFrame.explode` would duplicate frame in the presence of duplicates in the index (:issue:`28010`) -- Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a `Period` (:issue:`28323`) (:issue:`28337`) -- Fix assignment of column via `.loc` with numpy non-ns datetime type (:issue:`27395`) +- Bug in reindexing a :meth:`PeriodIndex` with another type of index that contained a ``Period`` (:issue:`28323`) (:issue:`28337`) +- Fix assignment of column via ``.loc`` with numpy non-ns datetime type (:issue:`27395`) - Bug in :meth:`Float64Index.astype` where ``np.inf`` was not handled properly when casting to an integer dtype (:issue:`28475`) - :meth:`Index.union` could fail when the left contained duplicates (:issue:`28257`) - Bug when indexing with ``.loc`` where the index was a :class:`CategoricalIndex` with non-string categories didn't work (:issue:`17569`, :issue:`30225`) @@ -1159,7 +1159,7 @@ MultiIndex ^^^^^^^^^^ - Constructor for :class:`MultiIndex` verifies that the given ``sortorder`` is compatible with the actual ``lexsort_depth`` if ``verify_integrity`` parameter is ``True`` (the default) (:issue:`28735`) -- Series and MultiIndex `.drop` with `MultiIndex` raise exception if labels not in given in level (:issue:`8594`) +- Series and MultiIndex ``.drop`` with ``MultiIndex`` raise exception if labels not in given in level (:issue:`8594`) - I/O @@ -1171,7 +1171,7 @@ I/O - Bug in :meth:`DataFrame.to_csv` where values were truncated when the length of ``na_rep`` was shorter than the text input data. 
(:issue:`25099`) - Bug in :func:`DataFrame.to_string` where values were truncated using display options instead of outputting the full content (:issue:`9784`) - Bug in :meth:`DataFrame.to_json` where a datetime column label would not be written out in ISO format with ``orient="table"`` (:issue:`28130`) -- Bug in :func:`DataFrame.to_parquet` where writing to GCS would fail with `engine='fastparquet'` if the file did not already exist (:issue:`28326`) +- Bug in :func:`DataFrame.to_parquet` where writing to GCS would fail with ``engine='fastparquet'`` if the file did not already exist (:issue:`28326`) - Bug in :func:`read_hdf` closing stores that it didn't open when Exceptions are raised (:issue:`28699`) - Bug in :meth:`DataFrame.read_json` where using ``orient="index"`` would not maintain the order (:issue:`28557`) - Bug in :meth:`DataFrame.to_html` where the length of the ``formatters`` argument was not verified (:issue:`28469`) @@ -1183,9 +1183,9 @@ I/O - Bug in :func:`read_json` where default encoding was not set to ``utf-8`` (:issue:`29565`) - Bug in :class:`PythonParser` where str and bytes were being mixed when dealing with the decimal field (:issue:`29650`) - :meth:`read_gbq` now accepts ``progress_bar_type`` to display progress bar while the data downloads. 
(:issue:`29857`) -- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by `record_path` would raise a ``TypeError`` (:issue:`30148`) +- Bug in :func:`pandas.io.json.json_normalize` where a missing value in the location specified by ``record_path`` would raise a ``TypeError`` (:issue:`30148`) - :func:`read_excel` now accepts binary data (:issue:`15914`) -- Bug in :meth:`read_csv` in which encoding handling was limited to just the string `utf-16` for the C engine (:issue:`24130`) +- Bug in :meth:`read_csv` in which encoding handling was limited to just the string ``utf-16`` for the C engine (:issue:`24130`) Plotting ^^^^^^^^ @@ -1236,7 +1236,7 @@ Reshaping - Bug in :func:`merge`, did not append suffixes correctly with MultiIndex (:issue:`28518`) - :func:`qcut` and :func:`cut` now handle boolean input (:issue:`20303`) - Fix to ensure all int dtypes can be used in :func:`merge_asof` when using a tolerance value. Previously every non-int64 type would raise an erroneous ``MergeError`` (:issue:`28870`). -- Better error message in :func:`get_dummies` when `columns` isn't a list-like value (:issue:`28383`) +- Better error message in :func:`get_dummies` when ``columns`` isn't a list-like value (:issue:`28383`) - Bug in :meth:`Index.join` that caused infinite recursion error for mismatched ``MultiIndex`` name orders. 
(:issue:`25760`, :issue:`28956`) - Bug :meth:`Series.pct_change` where supplying an anchored frequency would throw a ``ValueError`` (:issue:`28664`) - Bug where :meth:`DataFrame.equals` returned True incorrectly in some cases when two DataFrames had the same columns in different orders (:issue:`28839`) @@ -1244,8 +1244,8 @@ Reshaping - Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`) - Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`) - Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`) -- Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`) -- Bug in missing `fill_na` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`) +- Improved error message and docstring in :func:`cut` and :func:`qcut` when ``labels=True`` (:issue:`13318`) +- Bug in missing ``fill_na`` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`) Sparse ^^^^^^ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a49b29d691692..54ed407ed0a0a 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -66,10 +66,10 @@ For example: .. 
_whatsnew_110.dataframe_or_series_comparing: -Comparing two `DataFrame` or two `Series` and summarizing the differences -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Comparing two ``DataFrame`` or two ``Series`` and summarizing the differences +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -We've added :meth:`DataFrame.compare` and :meth:`Series.compare` for comparing two `DataFrame` or two `Series` (:issue:`30429`) +We've added :meth:`DataFrame.compare` and :meth:`Series.compare` for comparing two ``DataFrame`` or two ``Series`` (:issue:`30429`) .. ipython:: python @@ -116,10 +116,10 @@ compatibility (:issue:`3729`) .. ipython:: python - # Default `dropna` is set to True, which will exclude NaNs in keys + # Default ``dropna`` is set to True, which will exclude NaNs in keys df_dropna.groupby(by=["b"], dropna=True).sum() - # In order to allow NaN in keys, set `dropna` to False + # In order to allow NaN in keys, set ``dropna`` to False df_dropna.groupby(by=["b"], dropna=False).sum() The default setting of ``dropna`` argument is ``True`` which means ``NA`` are not included in group keys. @@ -155,8 +155,8 @@ method, we get s.sort_values(key=lambda x: x.str.lower()) -When applied to a `DataFrame`, they key is applied per-column to all columns or a subset if -`by` is specified, e.g. +When applied to a ``DataFrame``, they key is applied per-column to all columns or a subset if +``by`` is specified, e.g. .. ipython:: python @@ -217,7 +217,7 @@ Grouper and resample now supports the arguments origin and offset :class:`Grouper` and :meth:`DataFrame.resample` now supports the arguments ``origin`` and ``offset``. It let the user control the timestamp on which to adjust the grouping. (:issue:`31809`) -The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. 
This works well with frequencies that are multiples of a day (like `30D`) or that divides a day (like `90s` or `1min`). But it can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can now specify a fixed timestamp with the argument ``origin``. +The bins of the grouping are adjusted based on the beginning of the day of the time series starting point. This works well with frequencies that are multiples of a day (like ``30D``) or that divides a day (like ``90s`` or ``1min``). But it can create inconsistencies with some frequencies that do not meet this criteria. To change this behavior you can now specify a fixed timestamp with the argument ``origin``. Two arguments are now deprecated (more information in the documentation of :meth:`DataFrame.resample`): @@ -289,7 +289,7 @@ Other enhancements - Added :meth:`api.extensions.ExtensionArray.argmax` and :meth:`api.extensions.ExtensionArray.argmin` (:issue:`24382`) - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`) - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`) -- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the :class:`Series`, similar to `re.fullmatch` (:issue:`32806`). +- :class:`Series.str` now has a ``fullmatch`` method that matches a regular expression against the entire string in each row of the :class:`Series`, similar to ``re.fullmatch`` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`Index.union` will now raise ``RuntimeWarning`` for :class:`MultiIndex` objects if the object inside are unsortable. 
Pass ``sort=False`` to suppress this warning (:issue:`33015`) - Added :meth:`Series.dt.isocalendar` and :meth:`DatetimeIndex.isocalendar` that returns a :class:`DataFrame` with year, week, and day calculated according to the ISO 8601 calendar (:issue:`33206`, :issue:`34392`). @@ -319,7 +319,7 @@ Other enhancements :class:`~pandas.io.stata.StataWriter`, :class:`~pandas.io.stata.StataWriter117`, and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`). - :meth:`HDFStore.put` now accepts a ``track_times`` parameter. This parameter is passed to the ``create_table`` method of ``PyTables`` (:issue:`32682`). -- :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts `xlabel` and `ylabel` parameters to present labels on x and y axis (:issue:`9093`). +- :meth:`Series.plot` and :meth:`DataFrame.plot` now accepts ``xlabel`` and ``ylabel`` parameters to present labels on x and y axis (:issue:`9093`). - Made :class:`pandas.core.window.rolling.Rolling` and :class:`pandas.core.window.expanding.Expanding` iterable(:issue:`11704`) - Made ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`). - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now accept an ``errors`` argument (:issue:`22610`) @@ -340,7 +340,7 @@ Other enhancements - :class:`pandas.core.window.ExponentialMovingWindow` now supports a ``times`` argument that allows ``mean`` to be calculated with observations spaced by the timestamps in ``times`` (:issue:`34839`) - :meth:`DataFrame.agg` and :meth:`Series.agg` now accept named aggregation for renaming the output columns/indexes. (:issue:`26513`) - ``compute.use_numba`` now exists as a configuration option that utilizes the numba engine when available (:issue:`33966`, :issue:`35374`) -- :meth:`Series.plot` now supports asymmetric error bars. 
Previously, if :meth:`Series.plot` received a "2xN" array with error values for `yerr` and/or `xerr`, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) +- :meth:`Series.plot` now supports asymmetric error bars. Previously, if :meth:`Series.plot` received a "2xN" array with error values for ``yerr`` and/or ``xerr``, the left/lower values (first row) were mirrored, while the right/upper values (second row) were ignored. Now, the first row represents the left/lower error values and the second row the right/upper error values. (:issue:`9536`) .. --------------------------------------------------------------------------- From 1e3574db3cefca38c9d358e018d0236ac14ddb5b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 26 Sep 2020 16:19:52 -0700 Subject: [PATCH 0921/1025] CLN: moments/test_moments_rolling.py for apply (#36676) --- .../window/moments/test_moments_rolling.py | 19 --- .../moments/test_moments_rolling_apply.py | 151 ++++++++++++++++++ 2 files changed, 151 insertions(+), 19 deletions(-) create mode 100644 pandas/tests/window/moments/test_moments_rolling_apply.py diff --git a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index da256e80dff7e..ad7cdee89e6f8 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -1,5 +1,4 @@ import copy -import warnings import numpy as np from numpy.random import randn @@ -856,24 +855,6 @@ def test_rolling_quantile_param(): ser.rolling(3).quantile("foo") -def test_rolling_apply(raw, series, frame): - # suppress warnings about empty slices, as we are deliberately testing - # with a 0-length Series - - def f(x): - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - message=".*(empty slice|0 for 
slice).*", - category=RuntimeWarning, - ) - return x[np.isfinite(x)].mean() - - _check_moment_func( - np.mean, name="apply", func=f, raw=raw, series=series, frame=frame - ) - - def test_rolling_std(raw, series, frame): _check_moment_func( lambda x: np.std(x, ddof=1), name="std", raw=raw, series=series, frame=frame diff --git a/pandas/tests/window/moments/test_moments_rolling_apply.py b/pandas/tests/window/moments/test_moments_rolling_apply.py new file mode 100644 index 0000000000000..e48d88b365d8d --- /dev/null +++ b/pandas/tests/window/moments/test_moments_rolling_apply.py @@ -0,0 +1,151 @@ +import warnings + +import numpy as np +import pytest + +from pandas import DataFrame, Series, concat, isna, notna +import pandas._testing as tm + +import pandas.tseries.offsets as offsets + + +def f(x): + # suppress warnings about empty slices, as we are deliberately testing + # with a 0-length Series + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + message=".*(empty slice|0 for slice).*", + category=RuntimeWarning, + ) + return x[np.isfinite(x)].mean() + + +def test_series(raw, series): + result = series.rolling(50).apply(f, raw=raw) + assert isinstance(result, Series) + tm.assert_almost_equal(result.iloc[-1], np.mean(series[-50:])) + + +def test_frame(raw, frame): + result = frame.rolling(50).apply(f, raw=raw) + assert isinstance(result, DataFrame) + tm.assert_series_equal( + result.iloc[-1, :], + frame.iloc[-50:, :].apply(np.mean, axis=0, raw=raw), + check_names=False, + ) + + +def test_time_rule_series(raw, series): + win = 25 + minp = 10 + ser = series[::2].resample("B").mean() + series_result = ser.rolling(window=win, min_periods=minp).apply(f, raw=raw) + last_date = series_result.index[-1] + prev_date = last_date - 24 * offsets.BDay() + + trunc_series = series[::2].truncate(prev_date, last_date) + tm.assert_almost_equal(series_result[-1], np.mean(trunc_series)) + + +def test_time_rule_frame(raw, frame): + win = 25 + minp = 10 + frm = 
frame[::2].resample("B").mean() + frame_result = frm.rolling(window=win, min_periods=minp).apply(f, raw=raw) + last_date = frame_result.index[-1] + prev_date = last_date - 24 * offsets.BDay() + + trunc_frame = frame[::2].truncate(prev_date, last_date) + tm.assert_series_equal( + frame_result.xs(last_date), + trunc_frame.apply(np.mean, raw=raw), + check_names=False, + ) + + +def test_nans(raw): + obj = Series(np.random.randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + + result = obj.rolling(50, min_periods=30).apply(f, raw=raw) + tm.assert_almost_equal(result.iloc[-1], np.mean(obj[10:-10])) + + # min_periods is working correctly + result = obj.rolling(20, min_periods=15).apply(f, raw=raw) + assert isna(result.iloc[23]) + assert not isna(result.iloc[24]) + + assert not isna(result.iloc[-6]) + assert isna(result.iloc[-5]) + + obj2 = Series(np.random.randn(20)) + result = obj2.rolling(10, min_periods=5).apply(f, raw=raw) + assert isna(result.iloc[3]) + assert notna(result.iloc[4]) + + result0 = obj.rolling(20, min_periods=0).apply(f, raw=raw) + result1 = obj.rolling(20, min_periods=1).apply(f, raw=raw) + tm.assert_almost_equal(result0, result1) + + +@pytest.mark.parametrize("minp", [0, 99, 100]) +def test_min_periods(raw, series, minp): + result = series.rolling(len(series) + 1, min_periods=minp).apply(f, raw=raw) + expected = series.rolling(len(series), min_periods=minp).apply(f, raw=raw) + nan_mask = isna(result) + tm.assert_series_equal(nan_mask, isna(expected)) + + nan_mask = ~nan_mask + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) + + +def test_center(raw): + obj = Series(np.random.randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + + result = obj.rolling(20, min_periods=15, center=True).apply(f, raw=raw) + expected = ( + concat([obj, Series([np.NaN] * 9)]) + .rolling(20, min_periods=15) + .apply(f, raw=raw)[9:] + .reset_index(drop=True) + ) + tm.assert_series_equal(result, expected) + + +def test_center_reindex_series(raw, series): + # 
shifter index + s = [f"x{x:d}" for x in range(12)] + minp = 10 + + series_xp = ( + series.reindex(list(series.index) + s) + .rolling(window=25, min_periods=minp) + .apply(f, raw=raw) + .shift(-12) + .reindex(series.index) + ) + series_rs = series.rolling(window=25, min_periods=minp, center=True).apply( + f, raw=raw + ) + tm.assert_series_equal(series_xp, series_rs) + + +def test_center_reindex_frame(raw, frame): + # shifter index + s = [f"x{x:d}" for x in range(12)] + minp = 10 + + frame_xp = ( + frame.reindex(list(frame.index) + s) + .rolling(window=25, min_periods=minp) + .apply(f, raw=raw) + .shift(-12) + .reindex(frame.index) + ) + frame_rs = frame.rolling(window=25, min_periods=minp, center=True).apply(f, raw=raw) + tm.assert_frame_equal(frame_xp, frame_rs) From 4fd13bf24f871cf04dc7afad9ddb827ec9733d0b Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 26 Sep 2020 19:21:41 -0400 Subject: [PATCH 0922/1025] CLN: Break up aggregate.transform (#36618) --- pandas/core/aggregation.py | 86 ++++++++++++++++++++++---------------- 1 file changed, 51 insertions(+), 35 deletions(-) diff --git a/pandas/core/aggregation.py b/pandas/core/aggregation.py index 71b9a658202a5..f2eb282d1e498 100644 --- a/pandas/core/aggregation.py +++ b/pandas/core/aggregation.py @@ -418,8 +418,6 @@ def transform( ValueError If the transform function fails or does not transform. 
""" - from pandas.core.reshape.concat import concat - is_series = obj.ndim == 1 if obj._get_axis_number(axis) == 1: @@ -433,42 +431,11 @@ def transform( func = {col: func for col in obj} if isinstance(func, dict): - if not is_series: - cols = sorted(set(func.keys()) - set(obj.columns)) - if len(cols) > 0: - raise SpecificationError(f"Column(s) {cols} do not exist") - - if any(isinstance(v, dict) for v in func.values()): - # GH 15931 - deprecation of renaming keys - raise SpecificationError("nested renamer is not supported") - - results = {} - for name, how in func.items(): - colg = obj._gotitem(name, ndim=1) - try: - results[name] = transform(colg, how, 0, *args, **kwargs) - except Exception as e: - if str(e) == "Function did not transform": - raise e - - # combine results - if len(results) == 0: - raise ValueError("Transform function failed") - return concat(results, axis=1) + return transform_dict_like(obj, func, *args, **kwargs) # func is either str or callable try: - if isinstance(func, str): - result = obj._try_aggregate_string_function(func, *args, **kwargs) - else: - f = obj._get_cython_func(func) - if f and not args and not kwargs: - result = getattr(obj, f)() - else: - try: - result = obj.apply(func, args=args, **kwargs) - except Exception: - result = func(obj, *args, **kwargs) + result = transform_str_or_callable(obj, func, *args, **kwargs) except Exception: raise ValueError("Transform function failed") @@ -482,3 +449,52 @@ def transform( raise ValueError("Function did not transform") return result + + +def transform_dict_like(obj, func, *args, **kwargs): + """ + Compute transform in the case of a dict-like func + """ + from pandas.core.reshape.concat import concat + + if obj.ndim != 1: + cols = sorted(set(func.keys()) - set(obj.columns)) + if len(cols) > 0: + raise SpecificationError(f"Column(s) {cols} do not exist") + + if any(isinstance(v, dict) for v in func.values()): + # GH 15931 - deprecation of renaming keys + raise SpecificationError("nested 
renamer is not supported") + + results = {} + for name, how in func.items(): + colg = obj._gotitem(name, ndim=1) + try: + results[name] = transform(colg, how, 0, *args, **kwargs) + except Exception as e: + if str(e) == "Function did not transform": + raise e + + # combine results + if len(results) == 0: + raise ValueError("Transform function failed") + return concat(results, axis=1) + + +def transform_str_or_callable(obj, func, *args, **kwargs): + """ + Compute transform in the case of a string or callable func + """ + if isinstance(func, str): + return obj._try_aggregate_string_function(func, *args, **kwargs) + + if not args and not kwargs: + f = obj._get_cython_func(func) + if f: + return getattr(obj, f)() + + # Two possible ways to use a UDF - apply or call directly + try: + return obj.apply(func, args=args, **kwargs) + except Exception: + return func(obj, *args, **kwargs) From bc18c183880d3b46f7fcbf2ff8064c216e63367d Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 27 Sep 2020 19:59:37 +0100 Subject: [PATCH 0923/1025] CFG use black profile in setup.cfg for isort (#36639) --- setup.cfg | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/setup.cfg b/setup.cfg index e7d7df7ff19a2..d938d2ef3972a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -105,11 +105,7 @@ known_pre_core = pandas._libs,pandas._typing,pandas.util._*,pandas.compat,pandas known_dtypes = pandas.core.dtypes known_post_core = pandas.tseries,pandas.io,pandas.plotting sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER -known_first_party = pandas -known_third_party = announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,validate_unwanted_patterns,yaml,odf -multi_line_output = 3 -include_trailing_comma = True -force_grid_wrap = 0 +profile = black combine_as_imports = True line_length = 88 
force_sort_within_sections = True From 54dacb8cc00ec4a4aaa7c5b7c7b426dbff1c55f7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sun, 27 Sep 2020 20:33:21 +0100 Subject: [PATCH 0924/1025] remove unnecessary noqas (#36684) --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/boolean.py | 4 ++-- pandas/core/arrays/integer.py | 4 ++-- pandas/core/arrays/sparse/dtype.py | 4 ++-- pandas/core/arrays/string_.py | 4 ++-- pandas/core/dtypes/base.py | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 18 +++++++----------- pandas/core/groupby/generic.py | 2 +- pandas/core/indexes/accessors.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 2 +- pandas/core/indexing.py | 2 +- pandas/core/internals/construction.py | 4 ++-- pandas/core/internals/ops.py | 4 ++-- pandas/core/ops/__init__.py | 2 +- pandas/core/reshape/melt.py | 2 +- pandas/core/reshape/merge.py | 2 +- pandas/core/sorting.py | 2 +- pandas/core/tools/datetimes.py | 4 ++-- 20 files changed, 33 insertions(+), 37 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index ba08d26fbc24f..d2005d46bbbf1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -60,7 +60,7 @@ from pandas.core.indexers import validate_indices if TYPE_CHECKING: - from pandas import Categorical, DataFrame, Series # noqa:F401 + from pandas import Categorical, DataFrame, Series _shared_docs: Dict[str, str] = {} diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 3bd36209b3c71..0a6a65bbbd5a0 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -27,7 +27,7 @@ from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: - import pyarrow # noqa: F401 + import pyarrow @register_extension_dtype @@ -98,7 +98,7 @@ def __from_arrow__( """ Construct BooleanArray from pyarrow Array/ChunkedArray. 
""" - import pyarrow # noqa: F811 + import pyarrow if isinstance(array, pyarrow.Array): chunks = [array] diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 94af013d6df2c..8a51b7293082e 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -33,7 +33,7 @@ from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: - import pyarrow # noqa: F401 + import pyarrow class _IntegerDtype(BaseMaskedDtype): @@ -115,7 +115,7 @@ def __from_arrow__( """ Construct IntegerArray from pyarrow Array/ChunkedArray. """ - import pyarrow # noqa: F811 + import pyarrow from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index ccf2825162f51..c0662911d40da 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -22,7 +22,7 @@ from pandas.core.dtypes.missing import isna, na_value_for_dtype if TYPE_CHECKING: - from pandas.core.arrays.sparse.array import SparseArray # noqa: F401 + from pandas.core.arrays.sparse.array import SparseArray @register_extension_dtype @@ -180,7 +180,7 @@ def construct_array_type(cls) -> Type["SparseArray"]: ------- type """ - from pandas.core.arrays.sparse.array import SparseArray # noqa: F811 + from pandas.core.arrays.sparse.array import SparseArray return SparseArray diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index cb1144c18e49c..5e7066e32ea39 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -18,7 +18,7 @@ from pandas.core.missing import isna if TYPE_CHECKING: - import pyarrow # noqa: F401 + import pyarrow @register_extension_dtype @@ -79,7 +79,7 @@ def __from_arrow__( """ Construct StringArray from pyarrow Array/ChunkedArray. 
""" - import pyarrow # noqa: F811 + import pyarrow if isinstance(array, pyarrow.Array): chunks = [array] diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 3ae5cabf9c73f..96de54380c7ad 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -12,7 +12,7 @@ from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries if TYPE_CHECKING: - from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas.core.arrays import ExtensionArray class ExtensionDtype: diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 05759ffb43dde..c5ea24145ae9e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -77,7 +77,7 @@ if TYPE_CHECKING: from pandas import Series - from pandas.core.arrays import ExtensionArray # noqa: F401 + from pandas.core.arrays import ExtensionArray _int8_max = np.iinfo(np.int8).max _int16_max = np.iinfo(np.int16).max diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 2e5dc15131e70..bf8d50db8416e 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -29,14 +29,10 @@ from pandas.core.dtypes.inference import is_bool, is_list_like if TYPE_CHECKING: - import pyarrow # noqa: F401 + import pyarrow - from pandas import Categorical # noqa: F401 - from pandas.core.arrays import ( # noqa: F401 - DatetimeArray, - IntervalArray, - PeriodArray, - ) + from pandas import Categorical + from pandas.core.arrays import DatetimeArray, IntervalArray, PeriodArray str_type = str @@ -457,7 +453,7 @@ def construct_array_type(cls) -> Type["Categorical"]: ------- type """ - from pandas import Categorical # noqa: F811 + from pandas import Categorical return Categorical @@ -706,7 +702,7 @@ def construct_array_type(cls) -> Type["DatetimeArray"]: ------- type """ - from pandas.core.arrays import DatetimeArray # noqa: F811 + from pandas.core.arrays import DatetimeArray return DatetimeArray @@ -959,7 +955,7 @@ def __from_arrow__( 
""" Construct PeriodArray from pyarrow Array/ChunkedArray. """ - import pyarrow # noqa: F811 + import pyarrow from pandas.core.arrays import PeriodArray from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask @@ -1157,7 +1153,7 @@ def __from_arrow__( """ Construct IntervalArray from pyarrow Array/ChunkedArray. """ - import pyarrow # noqa: F811 + import pyarrow from pandas.core.arrays import IntervalArray diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4cbbe08756ca7..e7e812737d48e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -83,7 +83,7 @@ from pandas.plotting import boxplot_frame_groupby if TYPE_CHECKING: - from pandas.core.internals import Block # noqa:F401 + from pandas.core.internals import Block NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index aa2c04e48eb81..b9b2c4b07d37a 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -24,7 +24,7 @@ from pandas.core.indexes.timedeltas import TimedeltaIndex if TYPE_CHECKING: - from pandas import Series # noqa:F401 + from pandas import Series class Properties(PandasDelegate, PandasObject, NoNewAttributesMixin): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3fcc40c90b98e..6b877b378a140 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -57,7 +57,7 @@ from pandas.core.ops import get_op_result_name if TYPE_CHECKING: - from pandas import CategoricalIndex # noqa:F401 + from pandas import CategoricalIndex _index_doc_kwargs = dict(ibase._index_doc_kwargs) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index cd3e384837280..1de392f6fc03f 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -65,7 +65,7 @@ ) if TYPE_CHECKING: - from pandas import Series # noqa:F401 + from pandas import 
Series _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 8aef150078e5b..fc1b9bee9ba03 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -35,7 +35,7 @@ from pandas.core.indexes.api import Index if TYPE_CHECKING: - from pandas import DataFrame, Series # noqa:F401 + from pandas import DataFrame, Series # "null slice" _NS = slice(None, None) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d19a0dd8f29e3..6244f1bf0a2d2 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -52,7 +52,7 @@ ) if TYPE_CHECKING: - from pandas import Series # noqa:F401 + from pandas import Series # --------------------------------------------------------------------- # BlockManager Interface @@ -244,7 +244,7 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays: Union[Sequence[Any], "Series"] if columns is not None: - from pandas.core.series import Series # noqa:F811 + from pandas.core.series import Series arrays = Series(data, index=columns, dtype=object) data_names = arrays.index diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 05f5f9a00ae1b..d7ea5d613d96a 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -6,8 +6,8 @@ from pandas._typing import ArrayLike if TYPE_CHECKING: - from pandas.core.internals.blocks import Block # noqa:F401 - from pandas.core.internals.managers import BlockManager # noqa:F401 + from pandas.core.internals.blocks import Block + from pandas.core.internals.managers import BlockManager BlockPairInfo = namedtuple( diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 6763db1e2b138..2dc97a3583dfb 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -55,7 +55,7 @@ ) if TYPE_CHECKING: - from pandas import DataFrame, Series # noqa:F401 
+ from pandas import DataFrame, Series # ----------------------------------------------------------------------------- # constants diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 7f5fb6b45f014..83a5f43c2a340 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -19,7 +19,7 @@ from pandas.core.tools.numeric import to_numeric if TYPE_CHECKING: - from pandas import DataFrame, Series # noqa: F401 + from pandas import DataFrame, Series @Appender(_shared_docs["melt"] % dict(caller="pd.melt(df, ", other="DataFrame.melt")) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 5a6518995c554..493ba87565220 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -50,7 +50,7 @@ from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: - from pandas import DataFrame # noqa:F401 + from pandas import DataFrame @Substitution("\nleft : DataFrame") diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 1fec2bbbf5fdc..e02b565ed5d7b 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -469,7 +469,7 @@ def ensure_key_mapped(values, key: Optional[Callable], levels=None): levels : Optional[List], if values is a MultiIndex, list of levels to apply the key to. 
""" - from pandas.core.indexes.api import Index # noqa:F811 + from pandas.core.indexes.api import Index if not key: return values diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index ddb44898dbfad..7b384c9bbb47d 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -53,9 +53,9 @@ from pandas.core.indexes.datetimes import DatetimeIndex if TYPE_CHECKING: - from pandas._libs.tslibs.nattype import NaTType # noqa:F401 + from pandas._libs.tslibs.nattype import NaTType - from pandas import Series # noqa:F401 + from pandas import Series # --------------------------------------------------------------------- # types used in annotations From d7363adba4c1b264b97eb7b7ebdd1c2a3bb6aa09 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 27 Sep 2020 18:57:39 -0700 Subject: [PATCH 0925/1025] CLN: test_moments_rolling.py for mean/std/var/count/median/min/max (#36678) * CLN: test_moments_rolling.py for mean * Add missing test * Parameterize over more functions * Remove usused param since apply was moved * Remove copy import Co-authored-by: Matt Roeschke --- .../window/moments/test_moments_rolling.py | 76 +---- .../moments/test_moments_rolling_functions.py | 302 ++++++++++++++++++ 2 files changed, 304 insertions(+), 74 deletions(-) create mode 100644 pandas/tests/window/moments/test_moments_rolling_functions.py diff --git a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index ad7cdee89e6f8..880316ec6111a 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -1,5 +1,3 @@ -import copy - import numpy as np from numpy.random import randn import pytest @@ -26,12 +24,6 @@ def _check_moment_func( frame=None, **kwargs, ): - - # inject raw - if name == "apply": - kwargs = copy.copy(kwargs) - kwargs["raw"] = raw - def get_result(obj, window, min_periods=None, center=False): r = 
obj.rolling(window=window, min_periods=min_periods, center=center) return getattr(r, name)(**kwargs) @@ -211,34 +203,6 @@ def test_centered_axis_validation(): (DataFrame(np.ones((10, 10))).rolling(window=3, center=True, axis=2).mean()) -def test_rolling_sum(raw, series, frame): - _check_moment_func( - np.nansum, - name="sum", - zero_min_periods_equal=False, - raw=raw, - series=series, - frame=frame, - ) - - -def test_rolling_count(raw, series, frame): - counter = lambda x: np.isfinite(x).astype(float).sum() - _check_moment_func( - counter, - name="count", - has_min_periods=False, - fill_value=0, - raw=raw, - series=series, - frame=frame, - ) - - -def test_rolling_mean(raw, series, frame): - _check_moment_func(np.mean, name="mean", raw=raw, series=series, frame=frame) - - @td.skip_if_no_scipy def test_cmov_mean(): # GH 8238 @@ -733,13 +697,7 @@ def test_cmov_window_special_linear_range(win_types_special): tm.assert_series_equal(xp, rs) -def test_rolling_median(raw, series, frame): - _check_moment_func(np.median, name="median", raw=raw, series=series, frame=frame) - - -def test_rolling_min(raw, series, frame): - _check_moment_func(np.min, name="min", raw=raw, series=series, frame=frame) - +def test_rolling_min_min_periods(): a = pd.Series([1, 2, 3, 4, 5]) result = a.rolling(window=100, min_periods=1).min() expected = pd.Series(np.ones(len(a))) @@ -749,9 +707,7 @@ def test_rolling_min(raw, series, frame): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).min() -def test_rolling_max(raw, series, frame): - _check_moment_func(np.max, name="max", raw=raw, series=series, frame=frame) - +def test_rolling_max_min_periods(): a = pd.Series([1, 2, 3, 4, 5], dtype=np.float64) b = a.rolling(window=100, min_periods=1).max() tm.assert_almost_equal(a, b) @@ -855,20 +811,6 @@ def test_rolling_quantile_param(): ser.rolling(3).quantile("foo") -def test_rolling_std(raw, series, frame): - _check_moment_func( - lambda x: np.std(x, ddof=1), name="std", raw=raw, series=series, 
frame=frame - ) - _check_moment_func( - lambda x: np.std(x, ddof=0), - name="std", - ddof=0, - raw=raw, - series=series, - frame=frame, - ) - - def test_rolling_std_1obs(): vals = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0]) @@ -905,20 +847,6 @@ def test_rolling_std_neg_sqrt(): assert np.isfinite(b[2:]).all() -def test_rolling_var(raw, series, frame): - _check_moment_func( - lambda x: np.var(x, ddof=1), name="var", raw=raw, series=series, frame=frame - ) - _check_moment_func( - lambda x: np.var(x, ddof=0), - name="var", - ddof=0, - raw=raw, - series=series, - frame=frame, - ) - - @td.skip_if_no_scipy def test_rolling_skew(raw, series, frame): from scipy.stats import skew diff --git a/pandas/tests/window/moments/test_moments_rolling_functions.py b/pandas/tests/window/moments/test_moments_rolling_functions.py new file mode 100644 index 0000000000000..98c7a0a055bd3 --- /dev/null +++ b/pandas/tests/window/moments/test_moments_rolling_functions.py @@ -0,0 +1,302 @@ +import numpy as np +import pytest + +from pandas import DataFrame, Series, concat, isna, notna +import pandas._testing as tm + +import pandas.tseries.offsets as offsets + + +@pytest.mark.parametrize( + "compare_func, roll_func, kwargs", + [ + [np.mean, "mean", {}], + [np.nansum, "sum", {}], + [lambda x: np.isfinite(x).astype(float).sum(), "count", {}], + [np.median, "median", {}], + [np.min, "min", {}], + [np.max, "max", {}], + [lambda x: np.std(x, ddof=1), "std", {}], + [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}], + [lambda x: np.var(x, ddof=1), "var", {}], + [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}], + ], +) +def test_series(series, compare_func, roll_func, kwargs): + result = getattr(series.rolling(50), roll_func)(**kwargs) + assert isinstance(result, Series) + tm.assert_almost_equal(result.iloc[-1], compare_func(series[-50:])) + + +@pytest.mark.parametrize( + "compare_func, roll_func, kwargs", + [ + [np.mean, "mean", {}], + [np.nansum, "sum", {}], + [lambda x: np.isfinite(x).astype(float).sum(), 
"count", {}], + [np.median, "median", {}], + [np.min, "min", {}], + [np.max, "max", {}], + [lambda x: np.std(x, ddof=1), "std", {}], + [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}], + [lambda x: np.var(x, ddof=1), "var", {}], + [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}], + ], +) +def test_frame(raw, frame, compare_func, roll_func, kwargs): + result = getattr(frame.rolling(50), roll_func)(**kwargs) + assert isinstance(result, DataFrame) + tm.assert_series_equal( + result.iloc[-1, :], + frame.iloc[-50:, :].apply(compare_func, axis=0, raw=raw), + check_names=False, + ) + + +@pytest.mark.parametrize( + "compare_func, roll_func, kwargs, minp", + [ + [np.mean, "mean", {}, 10], + [np.nansum, "sum", {}, 10], + [lambda x: np.isfinite(x).astype(float).sum(), "count", {}, 0], + [np.median, "median", {}, 10], + [np.min, "min", {}, 10], + [np.max, "max", {}, 10], + [lambda x: np.std(x, ddof=1), "std", {}, 10], + [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}, 10], + [lambda x: np.var(x, ddof=1), "var", {}, 10], + [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}, 10], + ], +) +def test_time_rule_series(series, compare_func, roll_func, kwargs, minp): + win = 25 + ser = series[::2].resample("B").mean() + series_result = getattr(ser.rolling(window=win, min_periods=minp), roll_func)( + **kwargs + ) + last_date = series_result.index[-1] + prev_date = last_date - 24 * offsets.BDay() + + trunc_series = series[::2].truncate(prev_date, last_date) + tm.assert_almost_equal(series_result[-1], compare_func(trunc_series)) + + +@pytest.mark.parametrize( + "compare_func, roll_func, kwargs, minp", + [ + [np.mean, "mean", {}, 10], + [np.nansum, "sum", {}, 10], + [lambda x: np.isfinite(x).astype(float).sum(), "count", {}, 0], + [np.median, "median", {}, 10], + [np.min, "min", {}, 10], + [np.max, "max", {}, 10], + [lambda x: np.std(x, ddof=1), "std", {}, 10], + [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}, 10], + [lambda x: np.var(x, ddof=1), "var", {}, 10], + [lambda x: np.var(x, 
ddof=0), "var", {"ddof": 0}, 10], + ], +) +def test_time_rule_frame(raw, frame, compare_func, roll_func, kwargs, minp): + win = 25 + frm = frame[::2].resample("B").mean() + frame_result = getattr(frm.rolling(window=win, min_periods=minp), roll_func)( + **kwargs + ) + last_date = frame_result.index[-1] + prev_date = last_date - 24 * offsets.BDay() + + trunc_frame = frame[::2].truncate(prev_date, last_date) + tm.assert_series_equal( + frame_result.xs(last_date), + trunc_frame.apply(compare_func, raw=raw), + check_names=False, + ) + + +@pytest.mark.parametrize( + "compare_func, roll_func, kwargs", + [ + [np.mean, "mean", {}], + [np.nansum, "sum", {}], + [np.median, "median", {}], + [np.min, "min", {}], + [np.max, "max", {}], + [lambda x: np.std(x, ddof=1), "std", {}], + [lambda x: np.std(x, ddof=0), "std", {"ddof": 0}], + [lambda x: np.var(x, ddof=1), "var", {}], + [lambda x: np.var(x, ddof=0), "var", {"ddof": 0}], + ], +) +def test_nans(compare_func, roll_func, kwargs): + obj = Series(np.random.randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + + result = getattr(obj.rolling(50, min_periods=30), roll_func)(**kwargs) + tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10])) + + # min_periods is working correctly + result = getattr(obj.rolling(20, min_periods=15), roll_func)(**kwargs) + assert isna(result.iloc[23]) + assert not isna(result.iloc[24]) + + assert not isna(result.iloc[-6]) + assert isna(result.iloc[-5]) + + obj2 = Series(np.random.randn(20)) + result = getattr(obj2.rolling(10, min_periods=5), roll_func)(**kwargs) + assert isna(result.iloc[3]) + assert notna(result.iloc[4]) + + if roll_func != "sum": + result0 = getattr(obj.rolling(20, min_periods=0), roll_func)(**kwargs) + result1 = getattr(obj.rolling(20, min_periods=1), roll_func)(**kwargs) + tm.assert_almost_equal(result0, result1) + + +def test_nans_count(): + obj = Series(np.random.randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + result = obj.rolling(50, min_periods=30).count() + 
tm.assert_almost_equal( + result.iloc[-1], np.isfinite(obj[10:-10]).astype(float).sum() + ) + + +@pytest.mark.parametrize( + "roll_func, kwargs", + [ + ["mean", {}], + ["sum", {}], + ["median", {}], + ["min", {}], + ["max", {}], + ["std", {}], + ["std", {"ddof": 0}], + ["var", {}], + ["var", {"ddof": 0}], + ], +) +@pytest.mark.parametrize("minp", [0, 99, 100]) +def test_min_periods(series, minp, roll_func, kwargs): + result = getattr(series.rolling(len(series) + 1, min_periods=minp), roll_func)( + **kwargs + ) + expected = getattr(series.rolling(len(series), min_periods=minp), roll_func)( + **kwargs + ) + nan_mask = isna(result) + tm.assert_series_equal(nan_mask, isna(expected)) + + nan_mask = ~nan_mask + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) + + +def test_min_periods_count(series): + result = series.rolling(len(series) + 1, min_periods=0).count() + expected = series.rolling(len(series), min_periods=0).count() + nan_mask = isna(result) + tm.assert_series_equal(nan_mask, isna(expected)) + + nan_mask = ~nan_mask + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) + + +@pytest.mark.parametrize( + "roll_func, kwargs, minp", + [ + ["mean", {}, 15], + ["sum", {}, 15], + ["count", {}, 0], + ["median", {}, 15], + ["min", {}, 15], + ["max", {}, 15], + ["std", {}, 15], + ["std", {"ddof": 0}, 15], + ["var", {}, 15], + ["var", {"ddof": 0}, 15], + ], +) +def test_center(roll_func, kwargs, minp): + obj = Series(np.random.randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + + result = getattr(obj.rolling(20, min_periods=minp, center=True), roll_func)( + **kwargs + ) + expected = getattr( + concat([obj, Series([np.NaN] * 9)]).rolling(20, min_periods=minp), roll_func + )(**kwargs)[9:].reset_index(drop=True) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "roll_func, kwargs, minp, fill_value", + [ + ["mean", {}, 10, None], + ["sum", {}, 10, None], + ["count", {}, 0, 0], + ["median", {}, 10, None], + ["min", {}, 10, 
None], + ["max", {}, 10, None], + ["std", {}, 10, None], + ["std", {"ddof": 0}, 10, None], + ["var", {}, 10, None], + ["var", {"ddof": 0}, 10, None], + ], +) +def test_center_reindex_series(series, roll_func, kwargs, minp, fill_value): + # shifter index + s = [f"x{x:d}" for x in range(12)] + + series_xp = ( + getattr( + series.reindex(list(series.index) + s).rolling(window=25, min_periods=minp), + roll_func, + )(**kwargs) + .shift(-12) + .reindex(series.index) + ) + series_rs = getattr( + series.rolling(window=25, min_periods=minp, center=True), roll_func + )(**kwargs) + if fill_value is not None: + series_xp = series_xp.fillna(fill_value) + tm.assert_series_equal(series_xp, series_rs) + + +@pytest.mark.parametrize( + "roll_func, kwargs, minp, fill_value", + [ + ["mean", {}, 10, None], + ["sum", {}, 10, None], + ["count", {}, 0, 0], + ["median", {}, 10, None], + ["min", {}, 10, None], + ["max", {}, 10, None], + ["std", {}, 10, None], + ["std", {"ddof": 0}, 10, None], + ["var", {}, 10, None], + ["var", {"ddof": 0}, 10, None], + ], +) +def test_center_reindex_frame(frame, roll_func, kwargs, minp, fill_value): + # shifter index + s = [f"x{x:d}" for x in range(12)] + + frame_xp = ( + getattr( + frame.reindex(list(frame.index) + s).rolling(window=25, min_periods=minp), + roll_func, + )(**kwargs) + .shift(-12) + .reindex(frame.index) + ) + frame_rs = getattr( + frame.rolling(window=25, min_periods=minp, center=True), roll_func + )(**kwargs) + if fill_value is not None: + frame_xp = frame_xp.fillna(fill_value) + tm.assert_frame_equal(frame_xp, frame_rs) From 9f0112f3b9b9bb6a5ea4d0895259168cccba529d Mon Sep 17 00:00:00 2001 From: Christoph Deil Date: Mon, 28 Sep 2020 04:43:52 +0200 Subject: [PATCH 0926/1025] DOC: Fix typo in timeseries.rst (#36687) --- doc/source/user_guide/timeseries.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index d3d2bf8c72ba3..61902b4a41b7c 100644 --- 
a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -2210,7 +2210,7 @@ Time zone handling ------------------ pandas provides rich support for working with timestamps in different time -zones using the ``pytz`` and ``dateutil`` libraries or class:`datetime.timezone` +zones using the ``pytz`` and ``dateutil`` libraries or :class:`datetime.timezone` objects from the standard library. From 5a89814e8b6d0cf866e3ff8c9e457dd559e2b524 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 28 Sep 2020 08:39:31 -0700 Subject: [PATCH 0927/1025] CLN: assorted (#36606) * docstring fixups, closes #26982 * update RangeIndex docstring, closes #22373 * CLN: misc * CLN: update Makefil * update nat docstrings to match * revert controversial --- pandas/_libs/tslibs/nattype.pyx | 4 +-- pandas/_libs/tslibs/offsets.pyx | 36 +++++++++++++++----- pandas/_libs/tslibs/timestamps.pyx | 4 +-- pandas/core/indexes/range.py | 6 ++-- scripts/validate_rst_title_capitalization.py | 2 +- 5 files changed, 36 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index e346a14b531c5..3a628f997e5d6 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -392,7 +392,7 @@ class NaTType(_NaT): Returns ------- - string + str """, ) day_name = _make_nan_func( @@ -407,7 +407,7 @@ class NaTType(_NaT): Returns ------- - string + str """, ) # _nat_methods diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 161e5f4e54f51..a78de3eace98c 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -219,7 +219,9 @@ cdef _wrap_timedelta_result(result): cdef _get_calendar(weekmask, holidays, calendar): - """Generate busdaycalendar""" + """ + Generate busdaycalendar + """ if isinstance(calendar, np.busdaycalendar): if not holidays: holidays = tuple(calendar.holidays) @@ -659,14 +661,18 @@ cdef class BaseOffset: return nint def __setstate__(self, 
state): - """Reconstruct an instance from a pickled state""" + """ + Reconstruct an instance from a pickled state + """ self.n = state.pop("n") self.normalize = state.pop("normalize") self._cache = state.pop("_cache", {}) # At this point we expect state to be empty def __getstate__(self): - """Return a pickleable state""" + """ + Return a pickleable state + """ state = {} state["n"] = self.n state["normalize"] = self.normalize @@ -971,7 +977,9 @@ cdef class RelativeDeltaOffset(BaseOffset): object.__setattr__(self, key, val) def __getstate__(self): - """Return a pickleable state""" + """ + Return a pickleable state + """ # RelativeDeltaOffset (technically DateOffset) is the only non-cdef # class, so the only one with __dict__ state = self.__dict__.copy() @@ -980,7 +988,9 @@ cdef class RelativeDeltaOffset(BaseOffset): return state def __setstate__(self, state): - """Reconstruct an instance from a pickled state""" + """ + Reconstruct an instance from a pickled state + """ if "offset" in state: # Older (<0.22.0) versions have offset attribute instead of _offset @@ -3604,7 +3614,9 @@ def shift_day(other: datetime, days: int) -> datetime: cdef inline int year_add_months(npy_datetimestruct dts, int months) nogil: - """new year number after shifting npy_datetimestruct number of months""" + """ + New year number after shifting npy_datetimestruct number of months. 
+ """ return dts.year + (dts.month + months - 1) // 12 @@ -3702,7 +3714,9 @@ cdef inline void _shift_months(const int64_t[:] dtindex, Py_ssize_t count, int months, str day_opt) nogil: - """See shift_months.__doc__""" + """ + See shift_months.__doc__ + """ cdef: Py_ssize_t i int months_to_roll @@ -3734,7 +3748,9 @@ cdef inline void _shift_quarters(const int64_t[:] dtindex, int q1start_month, str day_opt, int modby) nogil: - """See shift_quarters.__doc__""" + """ + See shift_quarters.__doc__ + """ cdef: Py_ssize_t i int months_since, n @@ -3990,7 +4006,9 @@ cdef inline int _roll_qtrday(npy_datetimestruct* dts, int n, int months_since, str day_opt) nogil except? -1: - """See roll_qtrday.__doc__""" + """ + See roll_qtrday.__doc__ + """ if n > 0: if months_since < 0 or (months_since == 0 and diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index a01ef98b83693..78f7b2150f720 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -503,7 +503,7 @@ cdef class _Timestamp(ABCTimestamp): Returns ------- - string + str """ return self._get_date_name_field("day_name", locale) @@ -518,7 +518,7 @@ cdef class _Timestamp(ABCTimestamp): Returns ------- - string + str """ return self._get_date_name_field("month_name", locale) diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 684691501de5c..4dffda2605ef7 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -53,10 +53,12 @@ class RangeIndex(Int64Index): If int and "stop" is not given, interpreted as "stop" instead. stop : int (default: 0) step : int (default: 1) - name : object, optional - Name to be stored in the index. + dtype : np.int64 + Unused, accepted for homogeneity with other index types. copy : bool, default False Unused, accepted for homogeneity with other index types. + name : object, optional + Name to be stored in the index. 
Attributes ---------- diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py index b654e27737359..c5f3701cc3c3f 100755 --- a/scripts/validate_rst_title_capitalization.py +++ b/scripts/validate_rst_title_capitalization.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ Validate that the titles in the rst files follow the proper capitalization convention. From ccb32bc7d4ddf1a93437ab66f1c6b9e410bddccf Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 28 Sep 2020 14:36:32 -0700 Subject: [PATCH 0928/1025] CI: npdev new exception message (#36706) * npdev fix * gh ref * catch message --- pandas/tests/arithmetic/common.py | 7 +++++++ pandas/tests/frame/test_arithmetic.py | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index 755fbd0d9036c..cd8dd102dc27c 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -76,6 +76,13 @@ def assert_invalid_comparison(left, right, box): "Cannot compare type", "not supported between", "invalid type promotion", + ( + # GH#36706 npdev 1.20.0 2020-09-28 + r"The DTypes and " + r" do not have a common DType. " + "For example they cannot be stored in a single array unless the " + "dtype is `object`." + ), ] ) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 6dd8d890e8a4b..b3aa5e403e795 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -53,6 +53,11 @@ def check(df, df2): msgs = [ r"Invalid comparison between dtype=datetime64\[ns\] and ndarray", "invalid type promotion", + ( + # npdev 1.20.0 + r"The DTypes and " + r" do not have a common DType." 
+ ), ] msg = "|".join(msgs) with pytest.raises(TypeError, match=msg): From 1d0744569c68e56509d56ebb930bc70882094452 Mon Sep 17 00:00:00 2001 From: Zak Kohler Date: Mon, 28 Sep 2020 18:40:37 -0400 Subject: [PATCH 0929/1025] DOC: Fix remaining typos in docstrings 'handler' --> 'handle' (#36650) --- pandas/core/frame.py | 2 +- pandas/io/feather_format.py | 2 +- pandas/io/json/_json.py | 2 +- pandas/io/orc.py | 2 +- pandas/io/parquet.py | 4 ++-- pandas/io/parsers.py | 4 ++-- pandas/io/pytables.py | 2 +- pandas/io/sas/sasreader.py | 2 +- pandas/io/stata.py | 2 +- 9 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cd85cce361cb4..9b2540a1ce043 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2298,7 +2298,7 @@ def to_parquet( path : str or file-like object If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, - we refer to objects with a write() method, such as a file handler + we refer to objects with a write() method, such as a file handle (e.g. via builtin open function) or io.BytesIO. The engine fastparquet does not accept file-like objects. diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index ed3cd3cefe96e..9a42b8289ab47 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -104,7 +104,7 @@ def read_feather( ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. columns : sequence, default None If not provided, all columns are read. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index a0ceb18c8bd20..51bcb4acddd7e 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -393,7 +393,7 @@ def read_json( ``os.PathLike``. 
By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. orient : str Indication of expected JSON string format. diff --git a/pandas/io/orc.py b/pandas/io/orc.py index f1b1aa6a43cb5..829ff6408d86d 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -31,7 +31,7 @@ def read_orc( ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. columns : list, default None If not None, only these columns will be read from the file. diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 07f2078931687..55256c928aad9 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -255,7 +255,7 @@ def to_parquet( path : str or file-like object If a string, it will be used as Root Directory path when writing a partitioned dataset. By file-like object, - we refer to objects with a write() method, such as a file handler + we refer to objects with a write() method, such as a file handle (e.g. via builtin open function) or io.BytesIO. The engine fastparquet does not accept file-like objects. @@ -333,7 +333,7 @@ def read_parquet(path, engine: str = "auto", columns=None, **kwargs): ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto' Parquet library to use. If 'auto', then the option diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e5b7aea895f86..dd3588faedf7a 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -92,7 +92,7 @@ If you want to pass in a path object, pandas accepts any ``os.PathLike``. 
By file-like object, we refer to objects with a ``read()`` method, such as - a file handler (e.g. via builtin ``open`` function) or ``StringIO``. + a file handle (e.g. via builtin ``open`` function) or ``StringIO``. sep : str, default {_default_sep} Delimiter to use. If sep is None, the C engine cannot automatically detect the separator, but the Python parsing engine can, meaning the latter will @@ -798,7 +798,7 @@ def read_fwf( ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. colspecs : list of tuple (int, int) or 'infer'. optional A list of tuples giving the extents of the fixed-width diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 5e5a89d96f0e5..d62480baed71e 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -310,7 +310,7 @@ def read_hdf( Alternatively, pandas accepts an open :class:`pandas.HDFStore` object. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. key : object, optional The group identifier in the store. Can be omitted if the HDF file diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 31d1a6ad471ea..893a6286f74d4 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -74,7 +74,7 @@ def read_sas( ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. format : str {'xport', 'sas7bdat'} or None If None, file format is inferred from file extension. 
If 'xport' or diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5d34b4a7855ce..d36bd42e7da8d 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -110,7 +110,7 @@ If you want to pass in a path object, pandas accepts any ``os.PathLike``. By file-like object, we refer to objects with a ``read()`` method, - such as a file handler (e.g. via builtin ``open`` function) + such as a file handle (e.g. via builtin ``open`` function) or ``StringIO``. {_statafile_processing_params1} {_statafile_processing_params2} From 098ca4ba5434ccaa7d80acc7f4b23ba2b93ec321 Mon Sep 17 00:00:00 2001 From: S Mono <10430241+xh2@users.noreply.github.com> Date: Tue, 29 Sep 2020 00:07:50 +0100 Subject: [PATCH 0930/1025] Fix small typo (#36711) --- pandas/core/tools/timedeltas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 791d5095283ba..372eac29bad9e 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -93,7 +93,7 @@ def to_timedelta(arg, unit=None, errors="raise"): unit = parse_timedelta_unit(unit) if errors not in ("ignore", "raise", "coerce"): - raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'}") + raise ValueError("errors must be one of 'ignore', 'raise', or 'coerce'.") if unit in {"Y", "y", "M"}: raise ValueError( From 3cbd4f568dd8335051e0412ca6719a3d7fb4fad5 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Tue, 29 Sep 2020 08:02:12 -0500 Subject: [PATCH 0931/1025] Add pytest-instafail to environment.yml (#36686) --- environment.yml | 1 + requirements-dev.txt | 1 + 2 files changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 7f6ce8cb9fa3b..f97f8e2457585 100644 --- a/environment.yml +++ b/environment.yml @@ -58,6 +58,7 @@ dependencies: - pytest-cov - pytest-xdist>=1.21 - pytest-asyncio + - pytest-instafail # downstream tests - seaborn diff --git 
a/requirements-dev.txt b/requirements-dev.txt index 690a3368c7aca..5a1c6a80334ed 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -39,6 +39,7 @@ pytest>=5.0.1 pytest-cov pytest-xdist>=1.21 pytest-asyncio +pytest-instafail seaborn statsmodels ipywidgets From f8fbd9f966342c5831757ccaa3fa790ae65c9e93 Mon Sep 17 00:00:00 2001 From: Micael Jarniac Date: Tue, 29 Sep 2020 12:41:02 -0300 Subject: [PATCH 0932/1025] DOC: Fix typo in docstring (#36723) --- pandas/core/generic.py | 2 +- pandas/core/indexes/base.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bd720151fb15e..18a9c78912ba5 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3970,7 +3970,7 @@ def reindex_like( Maximum number of consecutive labels to fill for inexact matches. tolerance : optional Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations most + matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Tolerance may be a scalar value, which applies the same tolerance diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 84489c1033d8c..35948a3f3dcf1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2922,7 +2922,7 @@ def get_loc(self, key, method=None, tolerance=None): distances are broken by preferring the larger index value. tolerance : int or float, optional Maximum distance from index value for inexact matches. The value of - the index at the matching location most satisfy the equation + the index at the matching location must satisfy the equation ``abs(index[loc] - key) <= tolerance``. Returns @@ -2987,7 +2987,7 @@ def get_loc(self, key, method=None, tolerance=None): inexact matches. tolerance : optional Maximum distance between original and new labels for inexact - matches. 
The values of the index at the matching locations most + matches. The values of the index at the matching locations must satisfy the equation ``abs(index[indexer] - target) <= tolerance``. Tolerance may be a scalar value, which applies the same tolerance From ed7e29ca8e15c1c2cf79d1dd3d9b38e9370ff137 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 29 Sep 2020 18:07:02 +0100 Subject: [PATCH 0933/1025] TYP: update setup.cfg (#36725) --- setup.cfg | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/setup.cfg b/setup.cfg index d938d2ef3972a..494c7ad328648 100644 --- a/setup.cfg +++ b/setup.cfg @@ -182,9 +182,6 @@ check_untyped_defs=False [mypy-pandas.core.groupby.base] check_untyped_defs=False -[mypy-pandas.core.groupby.generic] -check_untyped_defs=False - [mypy-pandas.core.groupby.grouper] check_untyped_defs=False @@ -263,21 +260,12 @@ check_untyped_defs=False [mypy-pandas.io.clipboard] check_untyped_defs=False -[mypy-pandas.io.common] -check_untyped_defs=False - [mypy-pandas.io.excel._base] check_untyped_defs=False -[mypy-pandas.io.excel._util] -check_untyped_defs=False - [mypy-pandas.io.formats.console] check_untyped_defs=False -[mypy-pandas.io.formats.csvs] -check_untyped_defs=False - [mypy-pandas.io.formats.excel] check_untyped_defs=False From e45eb0c6d06ec9d5537836b164fe111154ada88d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 29 Sep 2020 12:52:59 -0700 Subject: [PATCH 0934/1025] CLN: Remove unnecessary rolling subclass (#36721) * Remove unnecessary subclass * Rename _Window -> BaseWindow * Add Rolling._constructor property * Black Co-authored-by: Matt Roeschke --- pandas/core/window/ewm.py | 4 ++-- pandas/core/window/rolling.py | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index 34d9d9d8c00ef..25938b57d9720 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -15,7 +15,7 @@ import pandas.core.common as 
common from pandas.core.window.common import _doc_template, _shared_docs, zsqrt -from pandas.core.window.rolling import RollingMixin, flex_binary_moment +from pandas.core.window.rolling import BaseWindow, flex_binary_moment _bias_template = """ Parameters @@ -60,7 +60,7 @@ def get_center_of_mass( return float(comass) -class ExponentialMovingWindow(RollingMixin): +class ExponentialMovingWindow(BaseWindow): r""" Provide exponential weighted (EW) functions. diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 335fc3db5cd86..6ab42dda865e7 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -147,7 +147,9 @@ def func(arg, window, min_periods=None): return func -class _Window(ShallowMixin, SelectionMixin): +class BaseWindow(ShallowMixin, SelectionMixin): + """Provides utilities for performing windowing operations.""" + _attributes: List[str] = [ "window", "min_periods", @@ -184,10 +186,6 @@ def __init__( self.axis = obj._get_axis_number(axis) if axis is not None else None self.validate() - @property - def _constructor(self): - return Window - @property def is_datetimelike(self) -> Optional[bool]: return None @@ -862,7 +860,7 @@ def aggregate(self, func, *args, **kwargs): ) -class Window(_Window): +class Window(BaseWindow): """ Provide rolling window calculations. 
@@ -1040,6 +1038,10 @@ class Window(_Window): 2013-01-01 09:00:06 4.0 """ + @property + def _constructor(self): + return Window + def validate(self): super().validate() @@ -1220,13 +1222,7 @@ def std(self, ddof=1, *args, **kwargs): return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) -class RollingMixin(_Window): - @property - def _constructor(self): - return Rolling - - -class RollingAndExpandingMixin(RollingMixin): +class RollingAndExpandingMixin(BaseWindow): _shared_docs["count"] = dedent( r""" @@ -1939,6 +1935,10 @@ def _on(self) -> Index: "must be a column (of DataFrame), an Index or None" ) + @property + def _constructor(self): + return Rolling + def validate(self): super().validate() From 2f8cbb3329df19d2a07e179ea2c0ef6cf02bc4be Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Tue, 29 Sep 2020 15:29:24 -0500 Subject: [PATCH 0935/1025] CLN: Format doc code blocks (#36700) --- doc/source/development/code_style.rst | 3 +- doc/source/development/developer.rst | 12 +- doc/source/development/internals.rst | 5 +- doc/source/user_guide/categorical.rst | 178 ++++++++--------- doc/source/user_guide/integer_na.rst | 10 +- doc/source/user_guide/options.rst | 117 ++++++----- doc/source/user_guide/reshaping.rst | 272 ++++++++++++++------------ doc/source/user_guide/timedeltas.rst | 139 ++++++------- setup.cfg | 1 + 9 files changed, 385 insertions(+), 352 deletions(-) diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst index 11d0c35f92ff5..387f65ea583a0 100644 --- a/doc/source/development/code_style.rst +++ b/doc/source/development/code_style.rst @@ -172,5 +172,6 @@ Reading from a url .. 
code-block:: python from pandas.io.common import urlopen - with urlopen('http://www.google.com') as url: + + with urlopen("http://www.google.com") as url: raw_text = url.read() diff --git a/doc/source/development/developer.rst b/doc/source/development/developer.rst index fbd83af3de82e..bdbcf5ca337b8 100644 --- a/doc/source/development/developer.rst +++ b/doc/source/development/developer.rst @@ -71,11 +71,13 @@ descriptor format for these as is follows: .. code-block:: python index = pd.RangeIndex(0, 10, 2) - {'kind': 'range', - 'name': index.name, - 'start': index.start, - 'stop': index.stop, - 'step': index.step} + { + "kind": "range", + "name": index.name, + "start": index.start, + "stop": index.stop, + "step": index.step, + } Other index types must be serialized as data columns along with the other DataFrame columns. The metadata for these is a string indicating the name of diff --git a/doc/source/development/internals.rst b/doc/source/development/internals.rst index 8f1c3d5d818c2..cec385dd087db 100644 --- a/doc/source/development/internals.rst +++ b/doc/source/development/internals.rst @@ -68,8 +68,9 @@ integer **codes** (until version 0.24 named *labels*), and the level **names**: .. ipython:: python - index = pd.MultiIndex.from_product([range(3), ['one', 'two']], - names=['first', 'second']) + index = pd.MultiIndex.from_product( + [range(3), ["one", "two"]], names=["first", "second"] + ) index index.levels index.codes diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 9da5d2a9fc92f..926c2d9be74c2 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -58,7 +58,7 @@ By converting an existing ``Series`` or column to a ``category`` dtype: .. 
ipython:: python df = pd.DataFrame({"A": ["a", "b", "c", "a"]}) - df["B"] = df["A"].astype('category') + df["B"] = df["A"].astype("category") df By using special functions, such as :func:`~pandas.cut`, which groups data into @@ -66,18 +66,19 @@ discrete bins. See the :ref:`example on tiling ` in the docs .. ipython:: python - df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) + df = pd.DataFrame({"value": np.random.randint(0, 100, 20)}) labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] - df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) + df["group"] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) df.head(10) By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it to a ``DataFrame``. .. ipython:: python - raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"], - ordered=False) + raw_cat = pd.Categorical( + ["a", "b", "c", "a"], categories=["b", "c", "d"], ordered=False + ) s = pd.Series(raw_cat) s df = pd.DataFrame({"A": ["a", "b", "c", "a"]}) @@ -100,7 +101,7 @@ This can be done during construction by specifying ``dtype="category"`` in the ` .. ipython:: python - df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype="category") + df = pd.DataFrame({"A": list("abca"), "B": list("bccd")}, dtype="category") df.dtypes Note that the categories present in each column differ; the conversion is done column by column, so @@ -108,24 +109,24 @@ only labels present in a given column are categories: .. ipython:: python - df['A'] - df['B'] + df["A"] + df["B"] Analogously, all columns in an existing ``DataFrame`` can be batch converted using :meth:`DataFrame.astype`: .. ipython:: python - df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) - df_cat = df.astype('category') + df = pd.DataFrame({"A": list("abca"), "B": list("bccd")}) + df_cat = df.astype("category") df_cat.dtypes This conversion is likewise done column by column: .. 
ipython:: python - df_cat['A'] - df_cat['B'] + df_cat["A"] + df_cat["B"] Controlling behavior @@ -143,9 +144,9 @@ of :class:`~pandas.api.types.CategoricalDtype`. .. ipython:: python from pandas.api.types import CategoricalDtype + s = pd.Series(["a", "b", "c", "a"]) - cat_type = CategoricalDtype(categories=["b", "c", "d"], - ordered=True) + cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True) s_cat = s.astype(cat_type) s_cat @@ -155,12 +156,12 @@ are consistent among all columns. .. ipython:: python from pandas.api.types import CategoricalDtype - df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}) - cat_type = CategoricalDtype(categories=list('abcd'), - ordered=True) + + df = pd.DataFrame({"A": list("abca"), "B": list("bccd")}) + cat_type = CategoricalDtype(categories=list("abcd"), ordered=True) df_cat = df.astype(cat_type) - df_cat['A'] - df_cat['B'] + df_cat["A"] + df_cat["B"] .. note:: @@ -175,8 +176,7 @@ during normal constructor mode: .. ipython:: python splitter = np.random.choice([0, 1], 5, p=[0.5, 0.5]) - s = pd.Series(pd.Categorical.from_codes(splitter, - categories=["train", "test"])) + s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) Regaining original data @@ -189,7 +189,7 @@ To get back to the original ``Series`` or NumPy array, use s = pd.Series(["a", "b", "c", "a"]) s - s2 = s.astype('category') + s2 = s.astype("category") s2 s2.astype(str) np.asarray(s2) @@ -223,8 +223,9 @@ by default. .. ipython:: python from pandas.api.types import CategoricalDtype - CategoricalDtype(['a', 'b', 'c']) - CategoricalDtype(['a', 'b', 'c'], ordered=True) + + CategoricalDtype(["a", "b", "c"]) + CategoricalDtype(["a", "b", "c"], ordered=True) CategoricalDtype() A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas @@ -248,19 +249,19 @@ unordered categoricals, the order of the ``categories`` is not considered. .. 
ipython:: python - c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False) + c1 = CategoricalDtype(["a", "b", "c"], ordered=False) # Equal, since order is not considered when ordered=False - c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False) + c1 == CategoricalDtype(["b", "c", "a"], ordered=False) # Unequal, since the second CategoricalDtype is ordered - c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True) + c1 == CategoricalDtype(["a", "b", "c"], ordered=True) All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. .. ipython:: python - c1 == 'category' + c1 == "category" .. warning:: @@ -303,8 +304,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(pd.Categorical(["a", "b", "c", "a"], - categories=["c", "b", "a"])) + s = pd.Series(pd.Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"])) s.cat.categories s.cat.ordered @@ -322,7 +322,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd'))) + s = pd.Series(list("babc")).astype(CategoricalDtype(list("abcd"))) s # categories @@ -348,7 +348,7 @@ Renaming categories is done by assigning new values to the s = s.cat.rename_categories([1, 2, 3]) s # You can also pass a dict-like object to map the renaming - s = s.cat.rename_categories({1: 'x', 2: 'y', 3: 'z'}) + s = s.cat.rename_categories({1: "x", 2: "y", 3: "z"}) s .. note:: @@ -409,8 +409,7 @@ Removing unused categories can also be done: .. ipython:: python - s = pd.Series(pd.Categorical(["a", "b", "a"], - categories=["a", "b", "c", "d"])) + s = pd.Series(pd.Categorical(["a", "b", "a"], categories=["a", "b", "c", "d"])) s s.cat.remove_unused_categories() @@ -446,9 +445,7 @@ meaning and certain operations are possible. 
If the categorical is unordered, `` s = pd.Series(pd.Categorical(["a", "b", "c", "a"], ordered=False)) s.sort_values(inplace=True) - s = pd.Series(["a", "b", "c", "a"]).astype( - CategoricalDtype(ordered=True) - ) + s = pd.Series(["a", "b", "c", "a"]).astype(CategoricalDtype(ordered=True)) s.sort_values(inplace=True) s s.min(), s.max() @@ -514,18 +511,20 @@ The ordering of the categorical is determined by the ``categories`` of that colu .. ipython:: python - dfs = pd.DataFrame({'A': pd.Categorical(list('bbeebbaa'), - categories=['e', 'a', 'b'], - ordered=True), - 'B': [1, 2, 1, 2, 2, 1, 2, 1]}) - dfs.sort_values(by=['A', 'B']) + dfs = pd.DataFrame( + { + "A": pd.Categorical(list("bbeebbaa"), categories=["e", "a", "b"], ordered=True), + "B": [1, 2, 1, 2, 2, 1, 2, 1], + } + ) + dfs.sort_values(by=["A", "B"]) Reordering the ``categories`` changes a future sort. .. ipython:: python - dfs['A'] = dfs['A'].cat.reorder_categories(['a', 'b', 'e']) - dfs.sort_values(by=['A', 'B']) + dfs["A"] = dfs["A"].cat.reorder_categories(["a", "b", "e"]) + dfs.sort_values(by=["A", "B"]) Comparisons ----------- @@ -550,15 +549,9 @@ categories or a categorical with any list-like object, will raise a ``TypeError` .. ipython:: python - cat = pd.Series([1, 2, 3]).astype( - CategoricalDtype([3, 2, 1], ordered=True) - ) - cat_base = pd.Series([2, 2, 2]).astype( - CategoricalDtype([3, 2, 1], ordered=True) - ) - cat_base2 = pd.Series([2, 2, 2]).astype( - CategoricalDtype(ordered=True) - ) + cat = pd.Series([1, 2, 3]).astype(CategoricalDtype([3, 2, 1], ordered=True)) + cat_base = pd.Series([2, 2, 2]).astype(CategoricalDtype([3, 2, 1], ordered=True)) + cat_base2 = pd.Series([2, 2, 2]).astype(CategoricalDtype(ordered=True)) cat cat_base @@ -607,8 +600,8 @@ When you compare two unordered categoricals with the same categories, the order .. 
ipython:: python - c1 = pd.Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) - c2 = pd.Categorical(['a', 'b'], categories=['b', 'a'], ordered=False) + c1 = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=False) + c2 = pd.Categorical(["a", "b"], categories=["b", "a"], ordered=False) c1 == c2 Operations @@ -622,23 +615,21 @@ even if some categories are not present in the data: .. ipython:: python - s = pd.Series(pd.Categorical(["a", "b", "c", "c"], - categories=["c", "a", "b", "d"])) + s = pd.Series(pd.Categorical(["a", "b", "c", "c"], categories=["c", "a", "b", "d"])) s.value_counts() Groupby will also show "unused" categories: .. ipython:: python - cats = pd.Categorical(["a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"]) + cats = pd.Categorical( + ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"] + ) df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 2, 3, 4, 5]}) df.groupby("cats").mean() cats2 = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) - df2 = pd.DataFrame({"cats": cats2, - "B": ["c", "d", "c", "d"], - "values": [1, 2, 3, 4]}) + df2 = pd.DataFrame({"cats": cats2, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]}) df2.groupby(["cats", "B"]).mean() @@ -647,10 +638,8 @@ Pivot tables: .. ipython:: python raw_cat = pd.Categorical(["a", "a", "b", "b"], categories=["a", "b", "c"]) - df = pd.DataFrame({"A": raw_cat, - "B": ["c", "d", "c", "d"], - "values": [1, 2, 3, 4]}) - pd.pivot_table(df, values='values', index=['A', 'B']) + df = pd.DataFrame({"A": raw_cat, "B": ["c", "d", "c", "d"], "values": [1, 2, 3, 4]}) + pd.pivot_table(df, values="values", index=["A", "B"]) Data munging ------------ @@ -668,8 +657,7 @@ If the slicing operation returns either a ``DataFrame`` or a column of type .. 
ipython:: python idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) - cats = pd.Series(["a", "b", "b", "b", "c", "c", "c"], - dtype="category", index=idx) + cats = pd.Series(["a", "b", "b", "b", "c", "c", "c"], dtype="category", index=idx) values = [1, 2, 2, 2, 3, 4, 5] df = pd.DataFrame({"cats": cats, "values": values}, index=idx) df.iloc[2:4, :] @@ -714,13 +702,13 @@ an appropriate type: .. ipython:: python - str_s = pd.Series(list('aabb')) - str_cat = str_s.astype('category') + str_s = pd.Series(list("aabb")) + str_cat = str_s.astype("category") str_cat str_cat.str.contains("a") - date_s = pd.Series(pd.date_range('1/1/2015', periods=5)) - date_cat = date_s.astype('category') + date_s = pd.Series(pd.date_range("1/1/2015", periods=5)) + date_cat = date_s.astype("category") date_cat date_cat.dt.day @@ -758,8 +746,7 @@ value is included in the ``categories``: .. ipython:: python idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"]) - cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"], - categories=["a", "b"]) + cats = pd.Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"]) values = [1, 1, 1, 1, 1, 1, 1] df = pd.DataFrame({"cats": cats, "values": values}, index=idx) @@ -777,8 +764,7 @@ Setting values by assigning categorical data will also check that the ``categori df.loc["j":"k", "cats"] = pd.Categorical(["a", "a"], categories=["a", "b"]) df try: - df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], - categories=["a", "b", "c"]) + df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], categories=["a", "b", "c"]) except ValueError as e: print("ValueError:", str(e)) @@ -809,12 +795,12 @@ dtypes will likely have higher memory usage. 
Use ``.astype`` or from pandas.api.types import union_categoricals # same categories - s1 = pd.Series(['a', 'b'], dtype='category') - s2 = pd.Series(['a', 'b', 'a'], dtype='category') + s1 = pd.Series(["a", "b"], dtype="category") + s2 = pd.Series(["a", "b", "a"], dtype="category") pd.concat([s1, s2]) # different categories - s3 = pd.Series(['b', 'c'], dtype='category') + s3 = pd.Series(["b", "c"], dtype="category") pd.concat([s1, s3]) # Output dtype is inferred based on categories values @@ -822,7 +808,7 @@ dtypes will likely have higher memory usage. Use ``.astype`` or float_cats = pd.Series([3.0, 4.0], dtype="category") pd.concat([int_cats, float_cats]) - pd.concat([s1, s3]).astype('category') + pd.concat([s1, s3]).astype("category") union_categoricals([s1.array, s3.array]) The following table summarizes the results of merging ``Categoricals``: @@ -853,6 +839,7 @@ the categories being combined. .. ipython:: python from pandas.api.types import union_categoricals + a = pd.Categorical(["b", "c"]) b = pd.Categorical(["a", "b"]) union_categoricals([a, b]) @@ -900,8 +887,8 @@ the resulting array will always be a plain ``Categorical``: .. ipython:: python - a = pd.Series(["b", "c"], dtype='category') - b = pd.Series(["a", "b"], dtype='category') + a = pd.Series(["b", "c"], dtype="category") + b = pd.Series(["a", "b"], dtype="category") union_categoricals([a, b]) .. note:: @@ -946,7 +933,8 @@ relevant columns back to ``category`` and assign the right categories and catego .. 
ipython:: python import io - s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd'])) + + s = pd.Series(pd.Categorical(["a", "b", "b", "a", "a", "d"])) # rename the categories s.cat.categories = ["very good", "good", "bad"] # reorder the categories and add missing categories @@ -959,9 +947,9 @@ relevant columns back to ``category`` and assign the right categories and catego df2["cats"] # Redo the category df2["cats"] = df2["cats"].astype("category") - df2["cats"].cat.set_categories(["very bad", "bad", "medium", - "good", "very good"], - inplace=True) + df2["cats"].cat.set_categories( + ["very bad", "bad", "medium", "good", "very good"], inplace=True + ) df2.dtypes df2["cats"] @@ -1029,13 +1017,13 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = pd.Series(['foo', 'bar'] * 1000) + s = pd.Series(["foo", "bar"] * 1000) # object dtype s.nbytes # category dtype - s.astype('category').nbytes + s.astype("category").nbytes .. note:: @@ -1044,13 +1032,13 @@ an ``object`` dtype is a constant times the length of the data. .. ipython:: python - s = pd.Series(['foo%04d' % i for i in range(2000)]) + s = pd.Series(["foo%04d" % i for i in range(2000)]) # object dtype s.nbytes # category dtype - s.astype('category').nbytes + s.astype("category").nbytes ``Categorical`` is not a ``numpy`` array @@ -1085,8 +1073,8 @@ To check if a Series contains Categorical data, use ``hasattr(s, 'cat')``: .. ipython:: python - hasattr(pd.Series(['a'], dtype='category'), 'cat') - hasattr(pd.Series(['a']), 'cat') + hasattr(pd.Series(["a"], dtype="category"), "cat") + hasattr(pd.Series(["a"]), "cat") Using NumPy functions on a ``Series`` of type ``category`` should not work as ``Categoricals`` are not numeric data (even in the case that ``.categories`` is numeric). @@ -1113,9 +1101,9 @@ You can use ``fillna`` to handle missing values before applying a function. .. 
ipython:: python - df = pd.DataFrame({"a": [1, 2, 3, 4], - "b": ["a", "b", "c", "d"], - "cats": pd.Categorical([1, 2, 3, 2])}) + df = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"], "cats": pd.Categorical([1, 2, 3, 2])} + ) df.apply(lambda row: type(row["cats"]), axis=1) df.apply(lambda col: col.dtype, axis=0) diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index a45d7a4fa1547..acee1638570f7 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -112,7 +112,7 @@ dtype if needed. s.iloc[1:3] # operate with other dtypes - s + s.iloc[1:3].astype('Int8') + s + s.iloc[1:3].astype("Int8") # coerce when needed s + 0.01 @@ -121,7 +121,7 @@ These dtypes can operate as part of of ``DataFrame``. .. ipython:: python - df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) + df = pd.DataFrame({"A": s, "B": [1, 1, 3], "C": list("aab")}) df df.dtypes @@ -130,15 +130,15 @@ These dtypes can be merged & reshaped & casted. .. ipython:: python - pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes - df['A'].astype(float) + pd.concat([df[["A"]], df[["B", "C"]]], axis=1).dtypes + df["A"].astype(float) Reduction and groupby operations such as 'sum' work as well. .. ipython:: python df.sum() - df.groupby('B').A.sum() + df.groupby("B").A.sum() Scalar NA Value --------------- diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 563fc941294d1..d222297abc70b 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -17,6 +17,7 @@ You can get/set options directly as attributes of the top-level ``options`` attr .. ipython:: python import pandas as pd + pd.options.display.max_rows pd.options.display.max_rows = 999 pd.options.display.max_rows @@ -77,9 +78,9 @@ are available from the pandas namespace. To change an option, call .. 
ipython:: python - pd.get_option('mode.sim_interactive') - pd.set_option('mode.sim_interactive', True) - pd.get_option('mode.sim_interactive') + pd.get_option("mode.sim_interactive") + pd.set_option("mode.sim_interactive", True) + pd.get_option("mode.sim_interactive") **Note:** The option 'mode.sim_interactive' is mostly used for debugging purposes. @@ -135,8 +136,9 @@ More information can be found in the `ipython documentation .. code-block:: python import pandas as pd - pd.set_option('display.max_rows', 999) - pd.set_option('precision', 5) + + pd.set_option("display.max_rows", 999) + pd.set_option("precision", 5) .. _options.frequently_used: @@ -151,27 +153,27 @@ lines are replaced by an ellipsis. .. ipython:: python df = pd.DataFrame(np.random.randn(7, 2)) - pd.set_option('max_rows', 7) + pd.set_option("max_rows", 7) df - pd.set_option('max_rows', 5) + pd.set_option("max_rows", 5) df - pd.reset_option('max_rows') + pd.reset_option("max_rows") Once the ``display.max_rows`` is exceeded, the ``display.min_rows`` options determines how many rows are shown in the truncated repr. .. ipython:: python - pd.set_option('max_rows', 8) - pd.set_option('min_rows', 4) + pd.set_option("max_rows", 8) + pd.set_option("min_rows", 4) # below max_rows -> all rows shown df = pd.DataFrame(np.random.randn(7, 2)) df # above max_rows -> only min_rows (4) rows shown df = pd.DataFrame(np.random.randn(9, 2)) df - pd.reset_option('max_rows') - pd.reset_option('min_rows') + pd.reset_option("max_rows") + pd.reset_option("min_rows") ``display.expand_frame_repr`` allows for the representation of dataframes to stretch across pages, wrapped over the full column vs row-wise. @@ -179,11 +181,11 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise. .. 
ipython:: python df = pd.DataFrame(np.random.randn(5, 10)) - pd.set_option('expand_frame_repr', True) + pd.set_option("expand_frame_repr", True) df - pd.set_option('expand_frame_repr', False) + pd.set_option("expand_frame_repr", False) df - pd.reset_option('expand_frame_repr') + pd.reset_option("expand_frame_repr") ``display.large_repr`` lets you select whether to display dataframes that exceed ``max_columns`` or ``max_rows`` as a truncated frame, or as a summary. @@ -191,26 +193,32 @@ dataframes to stretch across pages, wrapped over the full column vs row-wise. .. ipython:: python df = pd.DataFrame(np.random.randn(10, 10)) - pd.set_option('max_rows', 5) - pd.set_option('large_repr', 'truncate') + pd.set_option("max_rows", 5) + pd.set_option("large_repr", "truncate") df - pd.set_option('large_repr', 'info') + pd.set_option("large_repr", "info") df - pd.reset_option('large_repr') - pd.reset_option('max_rows') + pd.reset_option("large_repr") + pd.reset_option("max_rows") ``display.max_colwidth`` sets the maximum width of columns. Cells of this length or longer will be truncated with an ellipsis. .. ipython:: python - df = pd.DataFrame(np.array([['foo', 'bar', 'bim', 'uncomfortably long string'], - ['horse', 'cow', 'banana', 'apple']])) - pd.set_option('max_colwidth', 40) + df = pd.DataFrame( + np.array( + [ + ["foo", "bar", "bim", "uncomfortably long string"], + ["horse", "cow", "banana", "apple"], + ] + ) + ) + pd.set_option("max_colwidth", 40) df - pd.set_option('max_colwidth', 6) + pd.set_option("max_colwidth", 6) df - pd.reset_option('max_colwidth') + pd.reset_option("max_colwidth") ``display.max_info_columns`` sets a threshold for when by-column info will be given. @@ -218,11 +226,11 @@ will be given. .. 
ipython:: python df = pd.DataFrame(np.random.randn(10, 10)) - pd.set_option('max_info_columns', 11) + pd.set_option("max_info_columns", 11) df.info() - pd.set_option('max_info_columns', 5) + pd.set_option("max_info_columns", 5) df.info() - pd.reset_option('max_info_columns') + pd.reset_option("max_info_columns") ``display.max_info_rows``: ``df.info()`` will usually show null-counts for each column. For large frames this can be quite slow. ``max_info_rows`` and ``max_info_cols`` @@ -233,11 +241,11 @@ can specify the option ``df.info(null_counts=True)`` to override on showing a pa df = pd.DataFrame(np.random.choice([0, 1, np.nan], size=(10, 10))) df - pd.set_option('max_info_rows', 11) + pd.set_option("max_info_rows", 11) df.info() - pd.set_option('max_info_rows', 5) + pd.set_option("max_info_rows", 5) df.info() - pd.reset_option('max_info_rows') + pd.reset_option("max_info_rows") ``display.precision`` sets the output display precision in terms of decimal places. This is only a suggestion. @@ -245,9 +253,9 @@ This is only a suggestion. .. ipython:: python df = pd.DataFrame(np.random.randn(5, 5)) - pd.set_option('precision', 7) + pd.set_option("precision", 7) df - pd.set_option('precision', 4) + pd.set_option("precision", 4) df ``display.chop_threshold`` sets at what level pandas rounds to zero when @@ -257,26 +265,27 @@ precision at which the number is stored. .. ipython:: python df = pd.DataFrame(np.random.randn(6, 6)) - pd.set_option('chop_threshold', 0) + pd.set_option("chop_threshold", 0) df - pd.set_option('chop_threshold', .5) + pd.set_option("chop_threshold", 0.5) df - pd.reset_option('chop_threshold') + pd.reset_option("chop_threshold") ``display.colheader_justify`` controls the justification of the headers. The options are 'right', and 'left'. .. 
ipython:: python - df = pd.DataFrame(np.array([np.random.randn(6), - np.random.randint(1, 9, 6) * .1, - np.zeros(6)]).T, - columns=['A', 'B', 'C'], dtype='float') - pd.set_option('colheader_justify', 'right') + df = pd.DataFrame( + np.array([np.random.randn(6), np.random.randint(1, 9, 6) * 0.1, np.zeros(6)]).T, + columns=["A", "B", "C"], + dtype="float", + ) + pd.set_option("colheader_justify", "right") df - pd.set_option('colheader_justify', 'left') + pd.set_option("colheader_justify", "left") df - pd.reset_option('colheader_justify') + pd.reset_option("colheader_justify") @@ -481,9 +490,9 @@ For instance: import numpy as np pd.set_eng_float_format(accuracy=3, use_eng_prefix=True) - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - s / 1.e3 - s / 1.e6 + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) + s / 1.0e3 + s / 1.0e6 .. ipython:: python :suppress: @@ -510,7 +519,7 @@ If a DataFrame or Series contains these characters, the default output mode may .. ipython:: python - df = pd.DataFrame({'国籍': ['UK', '日本'], '名前': ['Alice', 'しのぶ']}) + df = pd.DataFrame({"国籍": ["UK", "日本"], "名前": ["Alice", "しのぶ"]}) df .. image:: ../_static/option_unicode01.png @@ -521,7 +530,7 @@ times than the standard ``len`` function. .. ipython:: python - pd.set_option('display.unicode.east_asian_width', True) + pd.set_option("display.unicode.east_asian_width", True) df .. image:: ../_static/option_unicode02.png @@ -533,7 +542,7 @@ By default, an "Ambiguous" character's width, such as "¡" (inverted exclamation .. ipython:: python - df = pd.DataFrame({'a': ['xxx', '¡¡'], 'b': ['yyy', '¡¡']}) + df = pd.DataFrame({"a": ["xxx", "¡¡"], "b": ["yyy", "¡¡"]}) df .. image:: ../_static/option_unicode03.png @@ -545,7 +554,7 @@ However, setting this option incorrectly for your terminal will cause these char .. ipython:: python - pd.set_option('display.unicode.ambiguous_as_wide', True) + pd.set_option("display.unicode.ambiguous_as_wide", True) df .. 
image:: ../_static/option_unicode04.png @@ -553,8 +562,8 @@ However, setting this option incorrectly for your terminal will cause these char .. ipython:: python :suppress: - pd.set_option('display.unicode.east_asian_width', False) - pd.set_option('display.unicode.ambiguous_as_wide', False) + pd.set_option("display.unicode.east_asian_width", False) + pd.set_option("display.unicode.ambiguous_as_wide", False) .. _options.table_schema: @@ -567,7 +576,7 @@ by default. False by default, this can be enabled globally with the .. ipython:: python - pd.set_option('display.html.table_schema', True) + pd.set_option("display.html.table_schema", True) Only ``'display.max_rows'`` are serialized and published. @@ -575,4 +584,4 @@ Only ``'display.max_rows'`` are serialized and published. .. ipython:: python :suppress: - pd.reset_option('display.html.table_schema') + pd.reset_option("display.html.table_schema") diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index e6797512ce3cf..2061185b25416 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -18,14 +18,18 @@ Reshaping by pivoting DataFrame objects import pandas._testing as tm + def unpivot(frame): N, K = frame.shape - data = {'value': frame.to_numpy().ravel('F'), - 'variable': np.asarray(frame.columns).repeat(N), - 'date': np.tile(np.asarray(frame.index), K)} - columns = ['date', 'variable', 'value'] + data = { + "value": frame.to_numpy().ravel("F"), + "variable": np.asarray(frame.columns).repeat(N), + "date": np.tile(np.asarray(frame.index), K), + } + columns = ["date", "variable", "value"] return pd.DataFrame(data, columns=columns) + df = unpivot(tm.makeTimeDataFrame(3)) Data is often stored in so-called "stacked" or "record" format: @@ -41,12 +45,15 @@ For the curious here is how the above ``DataFrame`` was created: import pandas._testing as tm + def unpivot(frame): N, K = frame.shape - data = {'value': frame.to_numpy().ravel('F'), - 'variable': 
np.asarray(frame.columns).repeat(N), - 'date': np.tile(np.asarray(frame.index), K)} - return pd.DataFrame(data, columns=['date', 'variable', 'value']) + data = { + "value": frame.to_numpy().ravel("F"), + "variable": np.asarray(frame.columns).repeat(N), + "date": np.tile(np.asarray(frame.index), K), + } + return pd.DataFrame(data, columns=["date", "variable", "value"]) df = unpivot(tm.makeTimeDataFrame(3)) @@ -55,7 +62,7 @@ To select out everything for variable ``A`` we could do: .. ipython:: python - df[df['variable'] == 'A'] + df[df["variable"] == "A"] But suppose we wish to do time series operations with the variables. A better representation would be where the ``columns`` are the unique variables and an @@ -65,7 +72,7 @@ top level function :func:`~pandas.pivot`): .. ipython:: python - df.pivot(index='date', columns='variable', values='value') + df.pivot(index="date", columns="variable", values="value") If the ``values`` argument is omitted, and the input ``DataFrame`` has more than one column of values which are not used as column or index inputs to ``pivot``, @@ -75,15 +82,15 @@ column: .. ipython:: python - df['value2'] = df['value'] * 2 - pivoted = df.pivot(index='date', columns='variable') + df["value2"] = df["value"] * 2 + pivoted = df.pivot(index="date", columns="variable") pivoted You can then select subsets from the pivoted ``DataFrame``: .. ipython:: python - pivoted['value2'] + pivoted["value2"] Note that this returns a view on the underlying data in the case where the data are homogeneously-typed. @@ -121,12 +128,16 @@ from the hierarchical indexing section: .. 
ipython:: python - tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', - 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', - 'one', 'two', 'one', 'two']])) - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B']) + tuples = list( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + ) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"]) df2 = df[:4] df2 @@ -163,7 +174,7 @@ the level numbers: .. ipython:: python - stacked.unstack('second') + stacked.unstack("second") .. image:: ../_static/reshaping_unstack_0.png @@ -174,8 +185,8 @@ will result in a **sorted** copy of the original ``DataFrame`` or ``Series``: .. ipython:: python - index = pd.MultiIndex.from_product([[2, 1], ['a', 'b']]) - df = pd.DataFrame(np.random.randn(4), index=index, columns=['A']) + index = pd.MultiIndex.from_product([[2, 1], ["a", "b"]]) + df = pd.DataFrame(np.random.randn(4), index=index, columns=["A"]) df all(df.unstack().stack() == df.sort_index()) @@ -193,15 +204,19 @@ processed individually. .. ipython:: python - columns = pd.MultiIndex.from_tuples([ - ('A', 'cat', 'long'), ('B', 'cat', 'long'), - ('A', 'dog', 'short'), ('B', 'dog', 'short')], - names=['exp', 'animal', 'hair_length'] + columns = pd.MultiIndex.from_tuples( + [ + ("A", "cat", "long"), + ("B", "cat", "long"), + ("A", "dog", "short"), + ("B", "dog", "short"), + ], + names=["exp", "animal", "hair_length"], ) df = pd.DataFrame(np.random.randn(4, 4), columns=columns) df - df.stack(level=['animal', 'hair_length']) + df.stack(level=["animal", "hair_length"]) The list of levels can contain either level names or level numbers (but not a mixture of the two). @@ -222,12 +237,12 @@ calling ``sort_index``, of course). Here is a more complex example: .. 
ipython:: python - columns = pd.MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'), - ('B', 'cat'), ('A', 'dog')], - names=['exp', 'animal']) - index = pd.MultiIndex.from_product([('bar', 'baz', 'foo', 'qux'), - ('one', 'two')], - names=['first', 'second']) + columns = pd.MultiIndex.from_tuples( + [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")], names=["exp", "animal"] + ) + index = pd.MultiIndex.from_product( + [("bar", "baz", "foo", "qux"), ("one", "two")], names=["first", "second"] + ) df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns) df2 = df.iloc[[0, 1, 2, 4, 5, 7]] df2 @@ -237,8 +252,8 @@ which level in the columns to stack: .. ipython:: python - df2.stack('exp') - df2.stack('animal') + df2.stack("exp") + df2.stack("animal") Unstacking can result in missing values if subgroups do not have the same set of labels. By default, missing values will be replaced with the default @@ -288,13 +303,17 @@ For instance, .. ipython:: python - cheese = pd.DataFrame({'first': ['John', 'Mary'], - 'last': ['Doe', 'Bo'], - 'height': [5.5, 6.0], - 'weight': [130, 150]}) + cheese = pd.DataFrame( + { + "first": ["John", "Mary"], + "last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + } + ) cheese - cheese.melt(id_vars=['first', 'last']) - cheese.melt(id_vars=['first', 'last'], var_name='quantity') + cheese.melt(id_vars=["first", "last"]) + cheese.melt(id_vars=["first", "last"], var_name="quantity") When transforming a DataFrame using :func:`~pandas.melt`, the index will be ignored. The original index values can be kept around by setting the ``ignore_index`` parameter to ``False`` (default is ``True``). This will however duplicate them. @@ -302,15 +321,19 @@ When transforming a DataFrame using :func:`~pandas.melt`, the index will be igno .. 
ipython:: python - index = pd.MultiIndex.from_tuples([('person', 'A'), ('person', 'B')]) - cheese = pd.DataFrame({'first': ['John', 'Mary'], - 'last': ['Doe', 'Bo'], - 'height': [5.5, 6.0], - 'weight': [130, 150]}, - index=index) + index = pd.MultiIndex.from_tuples([("person", "A"), ("person", "B")]) + cheese = pd.DataFrame( + { + "first": ["John", "Mary"], + "last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + }, + index=index, + ) cheese - cheese.melt(id_vars=['first', 'last']) - cheese.melt(id_vars=['first', 'last'], ignore_index=False) + cheese.melt(id_vars=["first", "last"]) + cheese.melt(id_vars=["first", "last"], ignore_index=False) Another way to transform is to use the :func:`~pandas.wide_to_long` panel data convenience function. It is less flexible than :func:`~pandas.melt`, but more @@ -318,12 +341,15 @@ user-friendly. .. ipython:: python - dft = pd.DataFrame({"A1970": {0: "a", 1: "b", 2: "c"}, - "A1980": {0: "d", 1: "e", 2: "f"}, - "B1970": {0: 2.5, 1: 1.2, 2: .7}, - "B1980": {0: 3.2, 1: 1.3, 2: .1}, - "X": dict(zip(range(3), np.random.randn(3))) - }) + dft = pd.DataFrame( + { + "A1970": {0: "a", 1: "b", 2: "c"}, + "A1980": {0: "d", 1: "e", 2: "f"}, + "B1970": {0: 2.5, 1: 1.2, 2: 0.7}, + "B1980": {0: 3.2, 1: 1.3, 2: 0.1}, + "X": dict(zip(range(3), np.random.randn(3))), + } + ) dft["id"] = dft.index dft pd.wide_to_long(dft, ["A", "B"], i="id", j="year") @@ -380,23 +406,27 @@ Consider a data set like this: .. 
ipython:: python import datetime - df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 6, - 'B': ['A', 'B', 'C'] * 8, - 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 4, - 'D': np.random.randn(24), - 'E': np.random.randn(24), - 'F': [datetime.datetime(2013, i, 1) for i in range(1, 13)] - + [datetime.datetime(2013, i, 15) for i in range(1, 13)]}) + + df = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": np.random.randn(24), + "E": np.random.randn(24), + "F": [datetime.datetime(2013, i, 1) for i in range(1, 13)] + + [datetime.datetime(2013, i, 15) for i in range(1, 13)], + } + ) df We can produce pivot tables from this data very easily: .. ipython:: python - pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) - pd.pivot_table(df, values='D', index=['B'], columns=['A', 'C'], aggfunc=np.sum) - pd.pivot_table(df, values=['D', 'E'], index=['B'], columns=['A', 'C'], - aggfunc=np.sum) + pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"]) + pd.pivot_table(df, values="D", index=["B"], columns=["A", "C"], aggfunc=np.sum) + pd.pivot_table(df, values=["D", "E"], index=["B"], columns=["A", "C"], aggfunc=np.sum) The result object is a ``DataFrame`` having potentially hierarchical indexes on the rows and columns. If the ``values`` column name is not given, the pivot table @@ -405,22 +435,21 @@ hierarchy in the columns: .. ipython:: python - pd.pivot_table(df, index=['A', 'B'], columns=['C']) + pd.pivot_table(df, index=["A", "B"], columns=["C"]) Also, you can use ``Grouper`` for ``index`` and ``columns`` keywords. For detail of ``Grouper``, see :ref:`Grouping with a Grouper specification `. .. 
ipython:: python - pd.pivot_table(df, values='D', index=pd.Grouper(freq='M', key='F'), - columns='C') + pd.pivot_table(df, values="D", index=pd.Grouper(freq="M", key="F"), columns="C") You can render a nice output of the table omitting the missing values by calling ``to_string`` if you wish: .. ipython:: python - table = pd.pivot_table(df, index=['A', 'B'], columns=['C']) - print(table.to_string(na_rep='')) + table = pd.pivot_table(df, index=["A", "B"], columns=["C"]) + print(table.to_string(na_rep="")) Note that ``pivot_table`` is also available as an instance method on DataFrame, i.e. :meth:`DataFrame.pivot_table`. @@ -436,7 +465,7 @@ rows and columns: .. ipython:: python - df.pivot_table(index=['A', 'B'], columns='C', margins=True, aggfunc=np.std) + df.pivot_table(index=["A", "B"], columns="C", margins=True, aggfunc=np.std) .. _reshaping.crosstabulations: @@ -470,30 +499,31 @@ For example: .. ipython:: python - foo, bar, dull, shiny, one, two = 'foo', 'bar', 'dull', 'shiny', 'one', 'two' + foo, bar, dull, shiny, one, two = "foo", "bar", "dull", "shiny", "one", "two" a = np.array([foo, foo, bar, bar, foo, foo], dtype=object) b = np.array([one, one, two, one, two, one], dtype=object) c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object) - pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) If ``crosstab`` receives only two Series, it will provide a frequency table. .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 2, 2, 2], 'B': [3, 3, 4, 4, 4], - 'C': [1, 1, np.nan, 1, 1]}) + df = pd.DataFrame( + {"A": [1, 2, 2, 2, 2], "B": [3, 3, 4, 4, 4], "C": [1, 1, np.nan, 1, 1]} + ) df - pd.crosstab(df['A'], df['B']) + pd.crosstab(df["A"], df["B"]) ``crosstab`` can also be implemented to ``Categorical`` data. .. 
ipython:: python - foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) - bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) + foo = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) + bar = pd.Categorical(["d", "e"], categories=["d", "e", "f"]) pd.crosstab(foo, bar) If you want to include **all** of data categories even if the actual data does @@ -513,13 +543,13 @@ using the ``normalize`` argument: .. ipython:: python - pd.crosstab(df['A'], df['B'], normalize=True) + pd.crosstab(df["A"], df["B"], normalize=True) ``normalize`` can also normalize values within each row or within each column: .. ipython:: python - pd.crosstab(df['A'], df['B'], normalize='columns') + pd.crosstab(df["A"], df["B"], normalize="columns") ``crosstab`` can also be passed a third ``Series`` and an aggregation function (``aggfunc``) that will be applied to the values of the third ``Series`` within @@ -527,7 +557,7 @@ each group defined by the first two ``Series``: .. ipython:: python - pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum) + pd.crosstab(df["A"], df["B"], values=df["C"], aggfunc=np.sum) Adding margins ~~~~~~~~~~~~~~ @@ -536,8 +566,9 @@ Finally, one can also add margins or normalize this output. .. ipython:: python - pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc=np.sum, normalize=True, - margins=True) + pd.crosstab( + df["A"], df["B"], values=df["C"], aggfunc=np.sum, normalize=True, margins=True + ) .. _reshaping.tile: .. _reshaping.tile.cut: @@ -581,19 +612,19 @@ values, can derive a ``DataFrame`` containing ``k`` columns of 1s and 0s using .. ipython:: python - df = pd.DataFrame({'key': list('bbacab'), 'data1': range(6)}) + df = pd.DataFrame({"key": list("bbacab"), "data1": range(6)}) - pd.get_dummies(df['key']) + pd.get_dummies(df["key"]) Sometimes it's useful to prefix the column names, for example when merging the result with the original ``DataFrame``: .. 
ipython:: python - dummies = pd.get_dummies(df['key'], prefix='key') + dummies = pd.get_dummies(df["key"], prefix="key") dummies - df[['data1']].join(dummies) + df[["data1"]].join(dummies) This function is often used along with discretization functions like ``cut``: @@ -615,8 +646,7 @@ variables (categorical in the statistical sense, those with ``object`` or .. ipython:: python - df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['c', 'c', 'b'], - 'C': [1, 2, 3]}) + df = pd.DataFrame({"A": ["a", "b", "a"], "B": ["c", "c", "b"], "C": [1, 2, 3]}) pd.get_dummies(df) All non-object columns are included untouched in the output. You can control @@ -624,7 +654,7 @@ the columns that are encoded with the ``columns`` keyword. .. ipython:: python - pd.get_dummies(df, columns=['A']) + pd.get_dummies(df, columns=["A"]) Notice that the ``B`` column is still included in the output, it just hasn't been encoded. You can drop ``B`` before calling ``get_dummies`` if you don't @@ -641,11 +671,11 @@ the prefix separator. You can specify ``prefix`` and ``prefix_sep`` in 3 ways: .. ipython:: python - simple = pd.get_dummies(df, prefix='new_prefix') + simple = pd.get_dummies(df, prefix="new_prefix") simple - from_list = pd.get_dummies(df, prefix=['from_A', 'from_B']) + from_list = pd.get_dummies(df, prefix=["from_A", "from_B"]) from_list - from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'}) + from_dict = pd.get_dummies(df, prefix={"B": "from_B", "A": "from_A"}) from_dict Sometimes it will be useful to only keep k-1 levels of a categorical @@ -654,7 +684,7 @@ You can switch to this mode by turn on ``drop_first``. .. ipython:: python - s = pd.Series(list('abcaa')) + s = pd.Series(list("abcaa")) pd.get_dummies(s) @@ -664,7 +694,7 @@ When a column contains only one level, it will be omitted in the result. .. 
ipython:: python - df = pd.DataFrame({'A': list('aaaaa'), 'B': list('ababc')}) + df = pd.DataFrame({"A": list("aaaaa"), "B": list("ababc")}) pd.get_dummies(df) @@ -675,7 +705,7 @@ To choose another dtype, use the ``dtype`` argument: .. ipython:: python - df = pd.DataFrame({'A': list('abc'), 'B': [1.1, 2.2, 3.3]}) + df = pd.DataFrame({"A": list("abc"), "B": [1.1, 2.2, 3.3]}) pd.get_dummies(df, dtype=bool).dtypes @@ -689,7 +719,7 @@ To encode 1-d values as an enumerated type use :func:`~pandas.factorize`: .. ipython:: python - x = pd.Series(['A', 'A', np.nan, 'B', 3.14, np.inf]) + x = pd.Series(["A", "A", np.nan, "B", 3.14, np.inf]) x labels, uniques = pd.factorize(x) labels @@ -733,11 +763,12 @@ DataFrame will be pivoted in the answers below. np.random.seed([3, 1415]) n = 20 - cols = np.array(['key', 'row', 'item', 'col']) - df = cols + pd.DataFrame((np.random.randint(5, size=(n, 4)) - // [2, 1, 2, 1]).astype(str)) + cols = np.array(["key", "row", "item", "col"]) + df = cols + pd.DataFrame( + (np.random.randint(5, size=(n, 4)) // [2, 1, 2, 1]).astype(str) + ) df.columns = cols - df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix('val')) + df = df.join(pd.DataFrame(np.random.rand(n, 2).round(2)).add_prefix("val")) df @@ -762,24 +793,21 @@ This solution uses :func:`~pandas.pivot_table`. Also note that .. ipython:: python - df.pivot_table( - values='val0', index='row', columns='col', aggfunc='mean') + df.pivot_table(values="val0", index="row", columns="col", aggfunc="mean") Note that we can also replace the missing values by using the ``fill_value`` parameter. .. ipython:: python - df.pivot_table( - values='val0', index='row', columns='col', aggfunc='mean', fill_value=0) + df.pivot_table(values="val0", index="row", columns="col", aggfunc="mean", fill_value=0) Also note that we can pass in other aggregation functions as well. For example, we can also pass in ``sum``. .. 
ipython:: python - df.pivot_table( - values='val0', index='row', columns='col', aggfunc='sum', fill_value=0) + df.pivot_table(values="val0", index="row", columns="col", aggfunc="sum", fill_value=0) Another aggregation we can do is calculate the frequency in which the columns and rows occur together a.k.a. "cross tabulation". To do this, we can pass @@ -787,7 +815,7 @@ and rows occur together a.k.a. "cross tabulation". To do this, we can pass .. ipython:: python - df.pivot_table(index='row', columns='col', fill_value=0, aggfunc='size') + df.pivot_table(index="row", columns="col", fill_value=0, aggfunc="size") Pivoting with multiple aggregations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -797,24 +825,21 @@ We can also perform multiple aggregations. For example, to perform both a .. ipython:: python - df.pivot_table( - values='val0', index='row', columns='col', aggfunc=['mean', 'sum']) + df.pivot_table(values="val0", index="row", columns="col", aggfunc=["mean", "sum"]) Note to aggregate over multiple value columns, we can pass in a list to the ``values`` parameter. .. ipython:: python - df.pivot_table( - values=['val0', 'val1'], index='row', columns='col', aggfunc=['mean']) + df.pivot_table(values=["val0", "val1"], index="row", columns="col", aggfunc=["mean"]) Note to subdivide over multiple columns we can pass in a list to the ``columns`` parameter. .. ipython:: python - df.pivot_table( - values=['val0'], index='row', columns=['item', 'col'], aggfunc=['mean']) + df.pivot_table(values=["val0"], index="row", columns=["item", "col"], aggfunc=["mean"]) .. _reshaping.explode: @@ -827,28 +852,28 @@ Sometimes the values in a column are list-like. .. 
ipython:: python - keys = ['panda1', 'panda2', 'panda3'] - values = [['eats', 'shoots'], ['shoots', 'leaves'], ['eats', 'leaves']] - df = pd.DataFrame({'keys': keys, 'values': values}) + keys = ["panda1", "panda2", "panda3"] + values = [["eats", "shoots"], ["shoots", "leaves"], ["eats", "leaves"]] + df = pd.DataFrame({"keys": keys, "values": values}) df We can 'explode' the ``values`` column, transforming each list-like to a separate row, by using :meth:`~Series.explode`. This will replicate the index values from the original row: .. ipython:: python - df['values'].explode() + df["values"].explode() You can also explode the column in the ``DataFrame``. .. ipython:: python - df.explode('values') + df.explode("values") :meth:`Series.explode` will replace empty lists with ``np.nan`` and preserve scalar entries. The dtype of the resulting ``Series`` is always ``object``. .. ipython:: python - s = pd.Series([[1, 2, 3], 'foo', [], ['a', 'b']]) + s = pd.Series([[1, 2, 3], "foo", [], ["a", "b"]]) s s.explode() @@ -856,12 +881,11 @@ Here is a typical usecase. You have comma separated strings in a column and want .. ipython:: python - df = pd.DataFrame([{'var1': 'a,b,c', 'var2': 1}, - {'var1': 'd,e,f', 'var2': 2}]) + df = pd.DataFrame([{"var1": "a,b,c", "var2": 1}, {"var1": "d,e,f", "var2": 2}]) df Creating a long form DataFrame is now straightforward using explode and chained operations .. 
ipython:: python - df.assign(var1=df.var1.str.split(',')).explode('var1') + df.assign(var1=df.var1.str.split(",")).explode("var1") diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 3979ad1f3e949..971a415088220 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -25,33 +25,33 @@ You can construct a ``Timedelta`` scalar through various arguments, including `I import datetime # strings - pd.Timedelta('1 days') - pd.Timedelta('1 days 00:00:00') - pd.Timedelta('1 days 2 hours') - pd.Timedelta('-1 days 2 min 3us') + pd.Timedelta("1 days") + pd.Timedelta("1 days 00:00:00") + pd.Timedelta("1 days 2 hours") + pd.Timedelta("-1 days 2 min 3us") # like datetime.timedelta # note: these MUST be specified as keyword arguments pd.Timedelta(days=1, seconds=1) # integers with a unit - pd.Timedelta(1, unit='d') + pd.Timedelta(1, unit="d") # from a datetime.timedelta/np.timedelta64 pd.Timedelta(datetime.timedelta(days=1, seconds=1)) - pd.Timedelta(np.timedelta64(1, 'ms')) + pd.Timedelta(np.timedelta64(1, "ms")) # negative Timedeltas have this string repr # to be more consistent with datetime.timedelta conventions - pd.Timedelta('-1us') + pd.Timedelta("-1us") # a NaT - pd.Timedelta('nan') - pd.Timedelta('nat') + pd.Timedelta("nan") + pd.Timedelta("nat") # ISO 8601 Duration strings - pd.Timedelta('P0DT0H1M0S') - pd.Timedelta('P0DT0H0M0.000000123S') + pd.Timedelta("P0DT0H1M0S") + pd.Timedelta("P0DT0H0M0.000000123S") :ref:`DateOffsets` (``Day, Hour, Minute, Second, Milli, Micro, Nano``) can also be used in construction. @@ -63,8 +63,9 @@ Further, operations among the scalars yield another scalar ``Timedelta``. .. 
ipython:: python - pd.Timedelta(pd.offsets.Day(2)) + pd.Timedelta(pd.offsets.Second(2)) +\ - pd.Timedelta('00:00:00.000123') + pd.Timedelta(pd.offsets.Day(2)) + pd.Timedelta(pd.offsets.Second(2)) + pd.Timedelta( + "00:00:00.000123" + ) to_timedelta ~~~~~~~~~~~~ @@ -78,21 +79,21 @@ You can parse a single string to a Timedelta: .. ipython:: python - pd.to_timedelta('1 days 06:05:01.00003') - pd.to_timedelta('15.5us') + pd.to_timedelta("1 days 06:05:01.00003") + pd.to_timedelta("15.5us") or a list/array of strings: .. ipython:: python - pd.to_timedelta(['1 days 06:05:01.00003', '15.5us', 'nan']) + pd.to_timedelta(["1 days 06:05:01.00003", "15.5us", "nan"]) The ``unit`` keyword argument specifies the unit of the Timedelta: .. ipython:: python - pd.to_timedelta(np.arange(5), unit='s') - pd.to_timedelta(np.arange(5), unit='d') + pd.to_timedelta(np.arange(5), unit="s") + pd.to_timedelta(np.arange(5), unit="d") .. _timedeltas.limitations: @@ -118,11 +119,11 @@ subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``. .. ipython:: python - s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) + s = pd.Series(pd.date_range("2012-1-1", periods=3, freq="D")) td = pd.Series([pd.Timedelta(days=i) for i in range(3)]) - df = pd.DataFrame({'A': s, 'B': td}) + df = pd.DataFrame({"A": s, "B": td}) df - df['C'] = df['A'] + df['B'] + df["C"] = df["A"] + df["B"] df df.dtypes @@ -165,10 +166,10 @@ Operands can also appear in a reversed order (a singular object operated with a .. ipython:: python - A = s - pd.Timestamp('20120101') - pd.Timedelta('00:05:05') - B = s - pd.Series(pd.date_range('2012-1-2', periods=3, freq='D')) + A = s - pd.Timestamp("20120101") - pd.Timedelta("00:05:05") + B = s - pd.Series(pd.date_range("2012-1-2", periods=3, freq="D")) - df = pd.DataFrame({'A': A, 'B': B}) + df = pd.DataFrame({"A": A, "B": B}) df df.min() @@ -192,17 +193,17 @@ You can fillna on timedeltas, passing a timedelta to get a particular value. .. 
ipython:: python y.fillna(pd.Timedelta(0)) - y.fillna(pd.Timedelta(10, unit='s')) - y.fillna(pd.Timedelta('-1 days, 00:00:05')) + y.fillna(pd.Timedelta(10, unit="s")) + y.fillna(pd.Timedelta("-1 days, 00:00:05")) You can also negate, multiply and use ``abs`` on ``Timedeltas``: .. ipython:: python - td1 = pd.Timedelta('-1 days 2 hours 3 seconds') + td1 = pd.Timedelta("-1 days 2 hours 3 seconds") td1 -1 * td1 - - td1 + -td1 abs(td1) .. _timedeltas.timedeltas_reductions: @@ -215,12 +216,13 @@ Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` ob .. ipython:: python - y2 = pd.Series(pd.to_timedelta(['-1 days +00:00:05', 'nat', - '-1 days +00:00:05', '1 days'])) + y2 = pd.Series( + pd.to_timedelta(["-1 days +00:00:05", "nat", "-1 days +00:00:05", "1 days"]) + ) y2 y2.mean() y2.median() - y2.quantile(.1) + y2.quantile(0.1) y2.sum() .. _timedeltas.timedeltas_convert: @@ -234,8 +236,8 @@ Note that division by the NumPy scalar is true division, while astyping is equiv .. ipython:: python - december = pd.Series(pd.date_range('20121201', periods=4)) - january = pd.Series(pd.date_range('20130101', periods=4)) + december = pd.Series(pd.date_range("20121201", periods=4)) + january = pd.Series(pd.date_range("20130101", periods=4)) td = january - december td[2] += datetime.timedelta(minutes=5, seconds=3) @@ -243,15 +245,15 @@ Note that division by the NumPy scalar is true division, while astyping is equiv td # to days - td / np.timedelta64(1, 'D') - td.astype('timedelta64[D]') + td / np.timedelta64(1, "D") + td.astype("timedelta64[D]") # to seconds - td / np.timedelta64(1, 's') - td.astype('timedelta64[s]') + td / np.timedelta64(1, "s") + td.astype("timedelta64[s]") # to months (these are constant months) - td / np.timedelta64(1, 'M') + td / np.timedelta64(1, "M") Dividing or multiplying a ``timedelta64[ns]`` Series by an integer or integer Series yields another ``timedelta64[ns]`` dtypes Series. 
@@ -305,7 +307,7 @@ You can access the value of the fields for a scalar ``Timedelta`` directly. .. ipython:: python - tds = pd.Timedelta('31 days 5 min 3 sec') + tds = pd.Timedelta("31 days 5 min 3 sec") tds.days tds.seconds (-tds).seconds @@ -325,9 +327,9 @@ You can convert a ``Timedelta`` to an `ISO 8601 Duration`_ string with the .. ipython:: python - pd.Timedelta(days=6, minutes=50, seconds=3, - milliseconds=10, microseconds=10, - nanoseconds=12).isoformat() + pd.Timedelta( + days=6, minutes=50, seconds=3, milliseconds=10, microseconds=10, nanoseconds=12 + ).isoformat() .. _ISO 8601 Duration: https://en.wikipedia.org/wiki/ISO_8601#Durations @@ -344,15 +346,21 @@ or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent miss .. ipython:: python - pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64(2, 'D'), - datetime.timedelta(days=2, seconds=2)]) + pd.TimedeltaIndex( + [ + "1 days", + "1 days, 00:00:05", + np.timedelta64(2, "D"), + datetime.timedelta(days=2, seconds=2), + ] + ) The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation: .. ipython:: python - pd.TimedeltaIndex(['0 days', '10 days', '20 days'], freq='infer') + pd.TimedeltaIndex(["0 days", "10 days", "20 days"], freq="infer") Generating ranges of time deltas ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -363,24 +371,24 @@ calendar day: .. ipython:: python - pd.timedelta_range(start='1 days', periods=5) + pd.timedelta_range(start="1 days", periods=5) Various combinations of ``start``, ``end``, and ``periods`` can be used with ``timedelta_range``: .. ipython:: python - pd.timedelta_range(start='1 days', end='5 days') + pd.timedelta_range(start="1 days", end="5 days") - pd.timedelta_range(end='10 days', periods=4) + pd.timedelta_range(end="10 days", periods=4) The ``freq`` parameter can passed a variety of :ref:`frequency aliases `: .. 
ipython:: python - pd.timedelta_range(start='1 days', end='2 days', freq='30T') + pd.timedelta_range(start="1 days", end="2 days", freq="30T") - pd.timedelta_range(start='1 days', periods=5, freq='2D5H') + pd.timedelta_range(start="1 days", periods=5, freq="2D5H") Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced @@ -389,9 +397,9 @@ in the resulting ``TimedeltaIndex``: .. ipython:: python - pd.timedelta_range('0 days', '4 days', periods=5) + pd.timedelta_range("0 days", "4 days", periods=5) - pd.timedelta_range('0 days', '4 days', periods=10) + pd.timedelta_range("0 days", "4 days", periods=10) Using the TimedeltaIndex ~~~~~~~~~~~~~~~~~~~~~~~~ @@ -401,23 +409,22 @@ Similarly to other of the datetime-like indices, ``DatetimeIndex`` and ``PeriodI .. ipython:: python - s = pd.Series(np.arange(100), - index=pd.timedelta_range('1 days', periods=100, freq='h')) + s = pd.Series(np.arange(100), index=pd.timedelta_range("1 days", periods=100, freq="h")) s Selections work similarly, with coercion on string-likes and slices: .. ipython:: python - s['1 day':'2 day'] - s['1 day 01:00:00'] - s[pd.Timedelta('1 day 1h')] + s["1 day":"2 day"] + s["1 day 01:00:00"] + s[pd.Timedelta("1 day 1h")] Furthermore you can use partial string selection and the range will be inferred: .. ipython:: python - s['1 day':'1 day 5 hours'] + s["1 day":"1 day 5 hours"] Operations ~~~~~~~~~~ @@ -426,9 +433,9 @@ Finally, the combination of ``TimedeltaIndex`` with ``DatetimeIndex`` allow cert .. ipython:: python - tdi = pd.TimedeltaIndex(['1 days', pd.NaT, '2 days']) + tdi = pd.TimedeltaIndex(["1 days", pd.NaT, "2 days"]) tdi.to_list() - dti = pd.date_range('20130101', periods=3) + dti = pd.date_range("20130101", periods=3) dti.to_list() (dti + tdi).to_list() (dti - tdi).to_list() @@ -440,22 +447,22 @@ Similarly to frequency conversion on a ``Series`` above, you can convert these i .. 
ipython:: python - tdi / np.timedelta64(1, 's') - tdi.astype('timedelta64[s]') + tdi / np.timedelta64(1, "s") + tdi.astype("timedelta64[s]") Scalars type ops work as well. These can potentially return a *different* type of index. .. ipython:: python # adding or timedelta and date -> datelike - tdi + pd.Timestamp('20130101') + tdi + pd.Timestamp("20130101") # subtraction of a date and a timedelta -> datelike # note that trying to subtract a date from a Timedelta will raise an exception - (pd.Timestamp('20130101') - tdi).to_list() + (pd.Timestamp("20130101") - tdi).to_list() # timedelta + timedelta -> timedelta - tdi + pd.Timedelta('10 days') + tdi + pd.Timedelta("10 days") # division can result in a Timedelta if the divisor is an integer tdi / 2 @@ -472,4 +479,4 @@ Similar to :ref:`timeseries resampling `, we can resample .. ipython:: python - s.resample('D').mean() + s.resample("D").mean() diff --git a/setup.cfg b/setup.cfg index 494c7ad328648..b2b4b4ce0351d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,6 +33,7 @@ exclude = env # exclude asv benchmark environments from linting [flake8-rst] +max-line-length = 88 bootstrap = import numpy as np import pandas as pd From 4280aafe4185db14580760c4d4a7d31d1ce77796 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Tue, 29 Sep 2020 21:30:54 +0100 Subject: [PATCH 0936/1025] TYP: Ignore remaining mypy errors for pandas\tests\* (#36724) --- pandas/conftest.py | 4 +++- pandas/tests/window/conftest.py | 9 +++++++-- setup.cfg | 3 --- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 604815d496f80..5fb333acd718d 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -298,7 +298,9 @@ def unique_nulls_fixture(request): # ---------------------------------------------------------------- # Classes # ---------------------------------------------------------------- -@pytest.fixture(params=[pd.Index, pd.Series], ids=["index", "series"]) +@pytest.fixture( + params=[pd.Index, 
pd.Series], ids=["index", "series"] # type: ignore[list-item] +) def index_or_series(request): """ Fixture to parametrize over Index and Series, made necessary by a mypy diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 7f03fa2a5ea0d..3b4ed4859b1cc 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -76,7 +76,12 @@ def nopython(request): @pytest.fixture( - params=[pytest.param("numba", marks=td.skip_if_no("numba", "0.46.0")), "cython"] + params=[ + pytest.param( + "numba", marks=td.skip_if_no("numba", "0.46.0") + ), # type: ignore[list-item] + "cython", + ] ) def engine(request): """engine keyword argument for rolling.apply""" @@ -327,7 +332,7 @@ def halflife_with_times(request): "float64", "m8[ns]", "M8[ns]", - pytest.param( + pytest.param( # type: ignore[list-item] "datetime64[ns, UTC]", marks=pytest.mark.skip( "direct creation of extension dtype datetime64[ns, UTC] " diff --git a/setup.cfg b/setup.cfg index b2b4b4ce0351d..73986f692b6cd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -129,9 +129,6 @@ ignore_errors=True [mypy-pandas.tests.*] check_untyped_defs=False -[mypy-pandas.conftest,pandas.tests.window.conftest] -ignore_errors=True - [mypy-pandas._testing] check_untyped_defs=False From 49d96cacd5d7b7e0b3d398744a8e8aa3202a05e9 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Wed, 30 Sep 2020 03:32:59 +0700 Subject: [PATCH 0937/1025] CLN: cleanup DataFrameInfo (#36641) --- pandas/io/formats/info.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 7a53b46a4ac0f..e8e41d4325103 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -111,7 +111,6 @@ def _get_mem_usage(self, deep: bool) -> int: mem_usage : int Object's total memory usage in bytes. 
""" - pass @abstractmethod def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: @@ -125,7 +124,6 @@ def _get_ids_and_dtypes(self) -> Tuple["Index", "Series"]: dtypes : Series Dtype of each of the DataFrame's columns. """ - pass @abstractmethod def _verbose_repr( @@ -145,7 +143,6 @@ def _verbose_repr( show_counts : bool If True, count of non-NA cells for each column will be appended to `lines`. """ - pass @abstractmethod def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: @@ -159,7 +156,6 @@ def _non_verbose_repr(self, lines: List[str], ids: "Index") -> None: ids : Index The DataFrame's column names. """ - pass def info(self) -> None: """ @@ -296,7 +292,6 @@ def _verbose_repr( len_id = len(pprint_thing(id_head)) space_num = max(max_id, len_id) + col_space - header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: counts = self.data.count() if col_count != len(counts): # pragma: no cover @@ -319,17 +314,26 @@ def _verbose_repr( len_dtype = len(dtype_header) max_dtypes = max(len(pprint_thing(k)) for k in dtypes) space_dtype = max(len_dtype, max_dtypes) - header += _put_str(count_header, space_count) + _put_str( - dtype_header, space_dtype - ) + header = "".join( + [ + _put_str(id_head, space_num), + _put_str(column_head, space), + _put_str(count_header, space_count), + _put_str(dtype_header, space_dtype), + ] + ) lines.append(header) - lines.append( - _put_str("-" * len_id, space_num) - + _put_str("-" * len_column, space) - + _put_str("-" * len_count, space_count) - + _put_str("-" * len_dtype, space_dtype) + + top_separator = "".join( + [ + _put_str("-" * len_id, space_num), + _put_str("-" * len_column, space), + _put_str("-" * len_count, space_count), + _put_str("-" * len_dtype, space_dtype), + ] ) + lines.append(top_separator) for i, col in enumerate(ids): dtype = dtypes[i] From 57559fff92371f8d473b71e867ce6d81ef92329a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Sep 2020 15:48:22 -0700 Subject: [PATCH 
0938/1025] CLN: OrderedDict->dict (#36693) --- pandas/compat/numpy/function.py | 17 ++++++++--------- pandas/tests/frame/apply/test_frame_apply.py | 9 ++++----- .../tests/frame/methods/test_select_dtypes.py | 5 +---- pandas/tests/frame/test_dtypes.py | 13 +++++-------- pandas/tests/internals/test_internals.py | 3 +-- pandas/tests/io/json/test_pandas.py | 3 +-- pandas/tests/resample/test_resample_api.py | 5 ++--- pandas/tests/reshape/merge/test_merge.py | 3 +-- pandas/tests/reshape/test_concat.py | 6 ++---- pandas/tests/reshape/test_get_dummies.py | 6 +----- pandas/tests/window/test_api.py | 6 +----- 11 files changed, 27 insertions(+), 49 deletions(-) diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 5f627aeade47c..938f57f504b04 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -17,7 +17,6 @@ and methods that are spread throughout the codebase. This module will make it easier to adjust to future upstream changes in the analogous numpy signatures. 
""" -from collections import OrderedDict from distutils.version import LooseVersion from typing import Any, Dict, Optional, Union @@ -117,7 +116,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): return skipna -ARGSORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() +ARGSORT_DEFAULTS: Dict[str, Optional[Union[int, str]]] = {} ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None @@ -133,7 +132,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): # two different signatures of argsort, this second validation # for when the `kind` param is supported -ARGSORT_DEFAULTS_KIND: "OrderedDict[str, Optional[int]]" = OrderedDict() +ARGSORT_DEFAULTS_KIND: Dict[str, Optional[int]] = {} ARGSORT_DEFAULTS_KIND["axis"] = -1 ARGSORT_DEFAULTS_KIND["order"] = None validate_argsort_kind = CompatValidator( @@ -178,7 +177,7 @@ def validate_clip_with_axis(axis, args, kwargs): return axis -CUM_FUNC_DEFAULTS: "OrderedDict[str, Any]" = OrderedDict() +CUM_FUNC_DEFAULTS: Dict[str, Any] = {} CUM_FUNC_DEFAULTS["dtype"] = None CUM_FUNC_DEFAULTS["out"] = None validate_cum_func = CompatValidator( @@ -204,7 +203,7 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): return skipna -ALLANY_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() +ALLANY_DEFAULTS: Dict[str, Optional[bool]] = {} ALLANY_DEFAULTS["dtype"] = None ALLANY_DEFAULTS["out"] = None ALLANY_DEFAULTS["keepdims"] = False @@ -241,13 +240,13 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): ROUND_DEFAULTS, fname="round", method="both", max_fname_arg_count=1 ) -SORT_DEFAULTS: "OrderedDict[str, Optional[Union[int, str]]]" = OrderedDict() +SORT_DEFAULTS: Dict[str, Optional[Union[int, str]]] = {} SORT_DEFAULTS["axis"] = -1 SORT_DEFAULTS["kind"] = "quicksort" SORT_DEFAULTS["order"] = None validate_sort = CompatValidator(SORT_DEFAULTS, fname="sort", method="kwargs") -STAT_FUNC_DEFAULTS: "OrderedDict[str, Optional[Any]]" = 
OrderedDict() +STAT_FUNC_DEFAULTS: Dict[str, Optional[Any]] = {} STAT_FUNC_DEFAULTS["dtype"] = None STAT_FUNC_DEFAULTS["out"] = None @@ -281,13 +280,13 @@ def validate_cum_func_with_skipna(skipna, args, kwargs, name): MEDIAN_DEFAULTS, fname="median", method="both", max_fname_arg_count=1 ) -STAT_DDOF_FUNC_DEFAULTS: "OrderedDict[str, Optional[bool]]" = OrderedDict() +STAT_DDOF_FUNC_DEFAULTS: Dict[str, Optional[bool]] = {} STAT_DDOF_FUNC_DEFAULTS["dtype"] = None STAT_DDOF_FUNC_DEFAULTS["out"] = None STAT_DDOF_FUNC_DEFAULTS["keepdims"] = False validate_stat_ddof_func = CompatValidator(STAT_DDOF_FUNC_DEFAULTS, method="kwargs") -TAKE_DEFAULTS: "OrderedDict[str, Optional[str]]" = OrderedDict() +TAKE_DEFAULTS: Dict[str, Optional[str]] = {} TAKE_DEFAULTS["out"] = None TAKE_DEFAULTS["mode"] = "raise" validate_take = CompatValidator(TAKE_DEFAULTS, fname="take", method="kwargs") diff --git a/pandas/tests/frame/apply/test_frame_apply.py b/pandas/tests/frame/apply/test_frame_apply.py index 3f859bb4ee39e..5c6a47c57970b 100644 --- a/pandas/tests/frame/apply/test_frame_apply.py +++ b/pandas/tests/frame/apply/test_frame_apply.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import datetime from itertools import chain import warnings @@ -1225,7 +1224,7 @@ def test_agg_reduce(self, axis, float_frame): tm.assert_frame_equal(result, expected) # dict input with scalars - func = OrderedDict([(name1, "mean"), (name2, "sum")]) + func = dict([(name1, "mean"), (name2, "sum")]) result = float_frame.agg(func, axis=axis) expected = Series( [ @@ -1237,7 +1236,7 @@ def test_agg_reduce(self, axis, float_frame): tm.assert_series_equal(result, expected) # dict input with lists - func = OrderedDict([(name1, ["mean"]), (name2, ["sum"])]) + func = dict([(name1, ["mean"]), (name2, ["sum"])]) result = float_frame.agg(func, axis=axis) expected = DataFrame( { @@ -1253,10 +1252,10 @@ def test_agg_reduce(self, axis, float_frame): tm.assert_frame_equal(result, expected) # dict input with 
lists with multiple - func = OrderedDict([(name1, ["mean", "sum"]), (name2, ["sum", "max"])]) + func = dict([(name1, ["mean", "sum"]), (name2, ["sum", "max"])]) result = float_frame.agg(func, axis=axis) expected = DataFrame( - OrderedDict( + dict( [ ( name1, diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index fe7baebcf0cf7..4599761909c33 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import numpy as np import pytest @@ -202,9 +200,8 @@ def test_select_dtypes_include_exclude_mixed_scalars_lists(self): def test_select_dtypes_duplicate_columns(self): # GH20839 - odict = OrderedDict df = DataFrame( - odict( + dict( [ ("a", list("abc")), ("b", list(range(1, 4))), diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index f3e3ef9bae5c6..53d417dc10014 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import timedelta import numpy as np @@ -50,10 +49,9 @@ def test_empty_frame_dtypes(self): norows_int_df.dtypes, pd.Series(np.dtype("int32"), index=list("abc")) ) - odict = OrderedDict - df = pd.DataFrame(odict([("a", 1), ("b", True), ("c", 1.0)]), index=[1, 2, 3]) + df = pd.DataFrame(dict([("a", 1), ("b", True), ("c", 1.0)]), index=[1, 2, 3]) ex_dtypes = pd.Series( - odict([("a", np.int64), ("b", np.bool_), ("c", np.float64)]) + dict([("a", np.int64), ("b", np.bool_), ("c", np.float64)]) ) tm.assert_series_equal(df.dtypes, ex_dtypes) @@ -85,17 +83,16 @@ def test_datetime_with_tz_dtypes(self): def test_dtypes_are_correct_after_column_slice(self): # GH6525 df = pd.DataFrame(index=range(5), columns=list("abc"), dtype=np.float_) - odict = OrderedDict tm.assert_series_equal( df.dtypes, - pd.Series(odict([("a", np.float_), ("b", np.float_), ("c", np.float_)])), 
+ pd.Series(dict([("a", np.float_), ("b", np.float_), ("c", np.float_)])), ) tm.assert_series_equal( - df.iloc[:, 2:].dtypes, pd.Series(odict([("c", np.float_)])) + df.iloc[:, 2:].dtypes, pd.Series(dict([("c", np.float_)])) ) tm.assert_series_equal( df.dtypes, - pd.Series(odict([("a", np.float_), ("b", np.float_), ("c", np.float_)])), + pd.Series(dict([("a", np.float_), ("b", np.float_), ("c", np.float_)])), ) def test_dtypes_gh8722(self, float_string_frame): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 1d73d1e35728b..2567f704a4a8d 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import date, datetime import itertools import operator @@ -165,7 +164,7 @@ def create_mgr(descr, item_shape=None): offset = 0 mgr_items = [] - block_placements = OrderedDict() + block_placements = {} for d in descr.split(";"): d = d.strip() if not len(d): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 13152f01abb04..9278e64cc911f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1,4 +1,3 @@ -from collections import OrderedDict import datetime from datetime import timedelta from io import StringIO @@ -470,7 +469,7 @@ def test_blocks_compat_GH9037(self): index = pd.DatetimeIndex(list(index), freq=None) df_mixed = DataFrame( - OrderedDict( + dict( float_1=[ -0.92077639, 0.77434435, diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 73aa01cff84fa..e4af5d93ff771 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import datetime import numpy as np @@ -428,7 +427,7 @@ def test_agg_misc(): msg = r"Column\(s\) \['result1', 'result2'\] do not exist" for t in 
cases: with pytest.raises(pd.core.base.SpecificationError, match=msg): - t[["A", "B"]].agg(OrderedDict([("result1", np.sum), ("result2", np.mean)])) + t[["A", "B"]].agg(dict([("result1", np.sum), ("result2", np.mean)])) # agg with different hows expected = pd.concat( @@ -438,7 +437,7 @@ def test_agg_misc(): [("A", "sum"), ("A", "std"), ("B", "mean"), ("B", "std")] ) for t in cases: - result = t.agg(OrderedDict([("A", ["sum", "std"]), ("B", ["mean", "std"])])) + result = t.agg(dict([("A", ["sum", "std"]), ("B", ["mean", "std"])])) tm.assert_frame_equal(result, expected, check_like=True) # equivalent of using a selection list / or not diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 4fd3c688b8771..aee503235d36c 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -1,4 +1,3 @@ -from collections import OrderedDict from datetime import date, datetime, timedelta import random import re @@ -1931,7 +1930,7 @@ def test_merge_index_types(index): result = left.merge(right, on=["index_col"]) expected = DataFrame( - OrderedDict([("left_data", [1, 2]), ("right_data", [1.0, 2.0])]), index=index + dict([("left_data", [1, 2]), ("right_data", [1.0, 2.0])]), index=index ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 7d6611722d8b5..b0f6a8ef0c517 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1,4 +1,4 @@ -from collections import OrderedDict, abc, deque +from collections import abc, deque import datetime as dt from datetime import datetime from decimal import Decimal @@ -2609,9 +2609,7 @@ def test_concat_odered_dict(self): [pd.Series(range(3)), pd.Series(range(4))], keys=["First", "Another"] ) result = pd.concat( - OrderedDict( - [("First", pd.Series(range(3))), ("Another", pd.Series(range(4)))] - ) + dict([("First", pd.Series(range(3))), ("Another", 
pd.Series(range(4)))]) ) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index 82e0e52c089a2..537bedfd1a6b9 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import numpy as np import pytest @@ -569,9 +567,7 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered): @pytest.mark.parametrize("sparse", [True, False]) def test_get_dummies_dont_sparsify_all_columns(self, sparse): # GH18914 - df = DataFrame.from_dict( - OrderedDict([("GDP", [1, 2]), ("Nation", ["AB", "CD"])]) - ) + df = DataFrame.from_dict(dict([("GDP", [1, 2]), ("Nation", ["AB", "CD"])])) df = get_dummies(df, columns=["Nation"], sparse=sparse) df2 = df.reindex(columns=["GDP"]) diff --git a/pandas/tests/window/test_api.py b/pandas/tests/window/test_api.py index 2c3d8b4608806..eb14ecfba1f51 100644 --- a/pandas/tests/window/test_api.py +++ b/pandas/tests/window/test_api.py @@ -1,5 +1,3 @@ -from collections import OrderedDict - import numpy as np import pytest @@ -335,8 +333,6 @@ def test_multiple_agg_funcs(func, window_size, expected_vals): ) expected = pd.DataFrame(expected_vals, index=index, columns=columns) - result = window.agg( - OrderedDict((("low", ["mean", "max"]), ("high", ["mean", "min"]))) - ) + result = window.agg(dict((("low", ["mean", "max"]), ("high", ["mean", "min"])))) tm.assert_frame_equal(result, expected) From 875bd84ad66f53b1aecfc130000f7055dd819614 Mon Sep 17 00:00:00 2001 From: ebardie Date: Wed, 30 Sep 2020 01:11:51 +0100 Subject: [PATCH 0939/1025] Fix missing tick labels on twinned axes. 
(#33767) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/plotting/_matplotlib/tools.py | 56 ++++++++++++- pandas/tests/plotting/test_misc.py | 114 +++++++++++++++++++++++++++ 3 files changed, 168 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 031c74b1cc367..88ad1dde5c9b0 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -334,6 +334,7 @@ Plotting - Bug in :meth:`DataFrame.plot` was rotating xticklabels when ``subplots=True``, even if the x-axis wasn't an irregular time series (:issue:`29460`) - Bug in :meth:`DataFrame.plot` where a marker letter in the ``style`` keyword sometimes causes a ``ValueError`` (:issue:`21003`) +- Twinned axes were losing their tick labels which should only happen to all but the last row or column of 'externally' shared axes (:issue:`33819`) Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index c5b44f37150bb..aed0c360fc7ce 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -297,6 +297,56 @@ def _remove_labels_from_axis(axis: "Axis"): axis.get_label().set_visible(False) +def _has_externally_shared_axis(ax1: "matplotlib.axes", compare_axis: "str") -> bool: + """ + Return whether an axis is externally shared. + + Parameters + ---------- + ax1 : matplotlib.axes + Axis to query. + compare_axis : str + `"x"` or `"y"` according to whether the X-axis or Y-axis is being + compared. + + Returns + ------- + bool + `True` if the axis is externally shared. Otherwise `False`. + + Notes + ----- + If two axes with different positions are sharing an axis, they can be + referred to as *externally* sharing the common axis. + + If two axes sharing an axis also have the same position, they can be + referred to as *internally* sharing the common axis (a.k.a twinning). 
+ + _handle_shared_axes() is only interested in axes externally sharing an + axis, regardless of whether either of the axes is also internally sharing + with a third axis. + """ + if compare_axis == "x": + axes = ax1.get_shared_x_axes() + elif compare_axis == "y": + axes = ax1.get_shared_y_axes() + else: + raise ValueError( + "_has_externally_shared_axis() needs 'x' or 'y' as a second parameter" + ) + + axes = axes.get_siblings(ax1) + + # Retain ax1 and any of its siblings which aren't in the same position as it + ax1_points = ax1.get_position().get_points() + + for ax2 in axes: + if not np.array_equal(ax1_points, ax2.get_position().get_points()): + return True + + return False + + def handle_shared_axes( axarr: Iterable["Axes"], nplots: int, @@ -328,7 +378,7 @@ def handle_shared_axes( # the last in the column, because below is no subplot/gap. if not layout[row_num(ax) + 1, col_num(ax)]: continue - if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: + if sharex or _has_externally_shared_axis(ax, "x"): _remove_labels_from_axis(ax.xaxis) except IndexError: @@ -337,7 +387,7 @@ def handle_shared_axes( for ax in axarr: if ax.is_last_row(): continue - if sharex or len(ax.get_shared_x_axes().get_siblings(ax)) > 1: + if sharex or _has_externally_shared_axis(ax, "x"): _remove_labels_from_axis(ax.xaxis) if ncols > 1: @@ -347,7 +397,7 @@ def handle_shared_axes( # have a subplot there, we can skip the layout test if ax.is_first_col(): continue - if sharey or len(ax.get_shared_y_axes().get_siblings(ax)) > 1: + if sharey or _has_externally_shared_axis(ax, "y"): _remove_labels_from_axis(ax.yaxis) diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index 0208ab3e0225b..2838bef2a10b0 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -433,3 +433,117 @@ def test_dictionary_color(self): ax = df1.plot(kind="line", color=dic_color) colors = [rect.get_color() for rect in ax.get_lines()[0:2]] assert 
all(color == expected[index] for index, color in enumerate(colors)) + + @pytest.mark.slow + def test_has_externally_shared_axis_x_axis(self): + # GH33819 + # Test _has_externally_shared_axis() works for x-axis + func = plotting._matplotlib.tools._has_externally_shared_axis + + fig = self.plt.figure() + plots = fig.subplots(2, 4) + + # Create *externally* shared axes for first and third columns + plots[0][0] = fig.add_subplot(231, sharex=plots[1][0]) + plots[0][2] = fig.add_subplot(233, sharex=plots[1][2]) + + # Create *internally* shared axes for second and third columns + plots[0][1].twinx() + plots[0][2].twinx() + + # First column is only externally shared + # Second column is only internally shared + # Third column is both + # Fourth column is neither + assert func(plots[0][0], "x") + assert not func(plots[0][1], "x") + assert func(plots[0][2], "x") + assert not func(plots[0][3], "x") + + @pytest.mark.slow + def test_has_externally_shared_axis_y_axis(self): + # GH33819 + # Test _has_externally_shared_axis() works for y-axis + func = plotting._matplotlib.tools._has_externally_shared_axis + + fig = self.plt.figure() + plots = fig.subplots(4, 2) + + # Create *externally* shared axes for first and third rows + plots[0][0] = fig.add_subplot(321, sharey=plots[0][1]) + plots[2][0] = fig.add_subplot(325, sharey=plots[2][1]) + + # Create *internally* shared axes for second and third rows + plots[1][0].twiny() + plots[2][0].twiny() + + # First row is only externally shared + # Second row is only internally shared + # Third row is both + # Fourth row is neither + assert func(plots[0][0], "y") + assert not func(plots[1][0], "y") + assert func(plots[2][0], "y") + assert not func(plots[3][0], "y") + + @pytest.mark.slow + def test_has_externally_shared_axis_invalid_compare_axis(self): + # GH33819 + # Test _has_externally_shared_axis() raises an exception when + # passed an invalid value as compare_axis parameter + func = plotting._matplotlib.tools._has_externally_shared_axis + 
+ fig = self.plt.figure() + plots = fig.subplots(4, 2) + + # Create arbitrary axes + plots[0][0] = fig.add_subplot(321, sharey=plots[0][1]) + + # Check that an invalid compare_axis value triggers the expected exception + msg = "needs 'x' or 'y' as a second parameter" + with pytest.raises(ValueError, match=msg): + func(plots[0][0], "z") + + @pytest.mark.slow + def test_externally_shared_axes(self): + # Example from GH33819 + # Create data + df = DataFrame({"a": np.random.randn(1000), "b": np.random.randn(1000)}) + + # Create figure + fig = self.plt.figure() + plots = fig.subplots(2, 3) + + # Create *externally* shared axes + plots[0][0] = fig.add_subplot(231, sharex=plots[1][0]) + # note: no plots[0][1] that's the twin only case + plots[0][2] = fig.add_subplot(233, sharex=plots[1][2]) + + # Create *internally* shared axes + # note: no plots[0][0] that's the external only case + twin_ax1 = plots[0][1].twinx() + twin_ax2 = plots[0][2].twinx() + + # Plot data to primary axes + df["a"].plot(ax=plots[0][0], title="External share only").set_xlabel( + "this label should never be visible" + ) + df["a"].plot(ax=plots[1][0]) + + df["a"].plot(ax=plots[0][1], title="Internal share (twin) only").set_xlabel( + "this label should always be visible" + ) + df["a"].plot(ax=plots[1][1]) + + df["a"].plot(ax=plots[0][2], title="Both").set_xlabel( + "this label should never be visible" + ) + df["a"].plot(ax=plots[1][2]) + + # Plot data to twinned axes + df["b"].plot(ax=twin_ax1, color="green") + df["b"].plot(ax=twin_ax2, color="yellow") + + assert not plots[0][0].xaxis.get_label().get_visible() + assert plots[0][1].xaxis.get_label().get_visible() + assert not plots[0][2].xaxis.get_label().get_visible() From dbc994f2fb23a80370ded2a9c39c1ea9a5d53338 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Tue, 29 Sep 2020 19:13:02 -0500 Subject: [PATCH 0940/1025] CLN: Remove unused fixtures (#36699) --- pandas/conftest.py | 17 ----------------- 
.../tests/arrays/boolean/test_construction.py | 8 -------- pandas/tests/arrays/boolean/test_function.py | 8 -------- pandas/tests/arrays/sparse/test_array.py | 5 ----- pandas/tests/indexes/interval/test_setops.py | 5 ----- pandas/tests/resample/conftest.py | 6 ------ 6 files changed, 49 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 5fb333acd718d..65c31b1f17c3c 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -174,14 +174,6 @@ def axis(request): axis_frame = axis -@pytest.fixture(params=[0, "index"], ids=lambda x: f"axis {repr(x)}") -def axis_series(request): - """ - Fixture for returning the axis numbers of a Series. - """ - return request.param - - @pytest.fixture(params=[True, False, None]) def observed(request): """ @@ -1241,15 +1233,6 @@ def spmatrix(request): return getattr(sparse, request.param + "_matrix") -@pytest.fixture(params=list(tm.cython_table)) -def cython_table_items(request): - """ - Yields a tuple of a function and its corresponding name. Correspond to - the list of aggregator "Cython functions" used on selected table items. 
- """ - return request.param - - @pytest.fixture( params=[ getattr(pd.offsets, o) diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py index 2f5c61304d415..c9e96c437964f 100644 --- a/pandas/tests/arrays/boolean/test_construction.py +++ b/pandas/tests/arrays/boolean/test_construction.py @@ -7,14 +7,6 @@ from pandas.core.arrays.boolean import coerce_to_array -@pytest.fixture -def data(): - return pd.array( - [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], - dtype="boolean", - ) - - def test_boolean_array_constructor(): values = np.array([True, False, True, False], dtype="bool") mask = np.array([False, False, False, True], dtype="bool") diff --git a/pandas/tests/arrays/boolean/test_function.py b/pandas/tests/arrays/boolean/test_function.py index 49a832f8dda20..1547f08fa66b0 100644 --- a/pandas/tests/arrays/boolean/test_function.py +++ b/pandas/tests/arrays/boolean/test_function.py @@ -5,14 +5,6 @@ import pandas._testing as tm -@pytest.fixture -def data(): - return pd.array( - [True, False] * 4 + [np.nan] + [True, False] * 44 + [np.nan] + [True, False], - dtype="boolean", - ) - - @pytest.mark.parametrize( "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor] ) diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index f18117cfd3d1f..a2a9bb2c4b039 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -14,11 +14,6 @@ from pandas.core.arrays.sparse import SparseArray, SparseDtype -@pytest.fixture(params=["integer", "block"]) -def kind(request): - return request.param - - class TestSparseArray: def setup_method(self, method): self.arr_data = np.array([np.nan, np.nan, 1, 2, 3, np.nan, 4, 5, np.nan, 6]) diff --git a/pandas/tests/indexes/interval/test_setops.py b/pandas/tests/indexes/interval/test_setops.py index e3e5070064aff..562497b29af12 100644 --- 
a/pandas/tests/indexes/interval/test_setops.py +++ b/pandas/tests/indexes/interval/test_setops.py @@ -5,11 +5,6 @@ import pandas._testing as tm -@pytest.fixture(scope="class", params=[None, "foo"]) -def name(request): - return request.param - - def monotonic_index(start, end, dtype="int64", closed="right"): return IntervalIndex.from_breaks(np.arange(start, end, dtype=dtype), closed=closed) diff --git a/pandas/tests/resample/conftest.py b/pandas/tests/resample/conftest.py index fa53e49269f8b..cb62263b885aa 100644 --- a/pandas/tests/resample/conftest.py +++ b/pandas/tests/resample/conftest.py @@ -34,12 +34,6 @@ def downsample_method(request): return request.param -@pytest.fixture(params=upsample_methods) -def upsample_method(request): - """Fixture for parametrization of Grouper upsample methods.""" - return request.param - - @pytest.fixture(params=resample_methods) def resample_method(request): """Fixture for parametrization of Grouper resample methods.""" From 2e4dab574e0422c8281b3e4849234e385b0129f8 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Sep 2020 18:56:34 -0700 Subject: [PATCH 0941/1025] remove unnecesary (#36705) --- pandas/core/indexes/category.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index c798ae0bd4e4d..d3167189dbcc6 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -24,7 +24,6 @@ from pandas.core import accessor from pandas.core.arrays.categorical import Categorical, contains -import pandas.core.common as com from pandas.core.construction import extract_array import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs, maybe_extract_name @@ -574,15 +573,6 @@ def _convert_list_indexer(self, keyarr): return self.get_indexer(keyarr) - @doc(Index._convert_arr_indexer) - def _convert_arr_indexer(self, keyarr): - keyarr = com.asarray_tuplesafe(keyarr) - - if 
self.categories._defer_to_indexing: - return keyarr - - return self._shallow_copy(keyarr) - @doc(Index._maybe_cast_slice_bound) def _maybe_cast_slice_bound(self, label, side, kind): if kind == "loc": From 9f68940c463cc4b58c44802e4a4b3966c80d2688 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?D=C4=81gs=20Gr=C4=ABnbergs?= Date: Wed, 30 Sep 2020 05:04:40 +0300 Subject: [PATCH 0942/1025] TST: implement test to_string_empty_col for Series (GH13653) (#36715) --- pandas/tests/io/formats/test_format.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py index bc1622a61a19d..419b2fac493c7 100644 --- a/pandas/tests/io/formats/test_format.py +++ b/pandas/tests/io/formats/test_format.py @@ -2846,6 +2846,13 @@ def test_to_string_multindex_header(self): exp = " r1 r2\na b \n0 1 2 3" assert res == exp + def test_to_string_empty_col(self): + # GH 13653 + s = pd.Series(["", "Hello", "World", "", "", "Mooooo", "", ""]) + res = s.to_string(index=False) + exp = " \n Hello\n World\n \n \nMooooo\n \n " + assert re.match(exp, res) + class TestGenericArrayFormatter: def test_1d_array(self): From 2b06685613406d1a7488430c3a7b680d5f4bf9c5 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Sep 2020 19:12:03 -0700 Subject: [PATCH 0943/1025] BUG: DatetimeIndex.shift(1) with empty index (#36691) --- doc/source/whatsnew/v1.2.0.rst | 3 ++- pandas/core/arrays/datetimelike.py | 4 ++-- pandas/tests/indexes/datetimelike.py | 5 +++++ pandas/tests/indexes/datetimes/test_shift.py | 6 ++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 88ad1dde5c9b0..c442c6ccc4568 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -252,7 +252,8 @@ Datetimelike - Bug in :meth:`DatetimeIndex.slice_locs` where ``datetime.date`` objects were not accepted (:issue:`34077`) - Bug in :meth:`DatetimeIndex.searchsorted`, 
:meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`, :issue:`36254`) - Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` setitem casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) -- +- Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) + Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index c90610bdd920c..0f723546fb4c2 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1276,8 +1276,8 @@ def _time_shift(self, periods, freq=None): result = self + offset return result - if periods == 0: - # immutable so OK + if periods == 0 or len(self) == 0: + # GH#14811 empty case return self.copy() if self.freq is None: diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index ac3320c6f9fa0..f667e5a610419 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -32,6 +32,11 @@ def test_shift_identity(self): idx = self.create_index() tm.assert_index_equal(idx, idx.shift(0)) + def test_shift_empty(self): + # GH#14811 + idx = self.create_index()[:0] + tm.assert_index_equal(idx, idx.shift(1)) + def test_str(self): # test the string repr diff --git a/pandas/tests/indexes/datetimes/test_shift.py b/pandas/tests/indexes/datetimes/test_shift.py index 8724bfeb05c4d..a2a673ed5d9e0 100644 --- a/pandas/tests/indexes/datetimes/test_shift.py +++ b/pandas/tests/indexes/datetimes/test_shift.py @@ -151,3 +151,9 @@ def test_shift_bmonth(self): with tm.assert_produces_warning(pd.errors.PerformanceWarning): shifted = rng.shift(1, freq=pd.offsets.CDay()) assert shifted[0] == rng[0] + pd.offsets.CDay() + + def 
test_shift_empty(self): + # GH#14811 + dti = date_range(start="2016-10-21", end="2016-10-21", freq="BM") + result = dti.shift(1) + tm.assert_index_equal(result, dti) From 19a9be5d4dba61c6479370dd0922921ec0fb20f6 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 29 Sep 2020 19:17:39 -0700 Subject: [PATCH 0944/1025] DEPR: is_all_dates (#36697) --- doc/source/whatsnew/v1.2.0.rst | 2 ++ pandas/core/generic.py | 8 ++++- pandas/core/indexes/base.py | 15 +++++++- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/numeric.py | 2 +- pandas/core/missing.py | 4 +-- pandas/core/series.py | 10 ++++-- pandas/plotting/_matplotlib/core.py | 2 +- .../tests/indexes/interval/test_interval.py | 2 +- .../tests/indexes/multi/test_equivalence.py | 2 +- pandas/tests/indexes/test_base.py | 3 +- pandas/tests/indexing/test_indexing.py | 8 +++-- pandas/tests/io/pytables/test_store.py | 10 ++++-- pandas/tests/series/test_alter_axes.py | 2 +- pandas/tests/series/test_constructors.py | 8 ++--- pandas/tests/series/test_repr.py | 4 ++- pandas/tests/series/test_timeseries.py | 6 ++-- .../moments/test_moments_rolling_apply.py | 34 +++++++++++-------- 20 files changed, 87 insertions(+), 41 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index c442c6ccc4568..0f21d1dbd9857 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -214,6 +214,8 @@ Deprecations - :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) - The :meth:`Index.to_native_types` is deprecated. 
Use ``.astype(str)`` instead (:issue:`28867`) - Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`) +- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) +- Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 18a9c78912ba5..6f0aa70625c1d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9528,7 +9528,13 @@ def truncate( # if we have a date index, convert to dates, otherwise # treat like a slice - if ax.is_all_dates: + if ax._is_all_dates: + if is_object_dtype(ax.dtype): + warnings.warn( + "Treating object-dtype Index of date objects as DatetimeIndex " + "is deprecated, will be removed in a future version.", + FutureWarning, + ) from pandas.core.tools.datetimes import to_datetime before = to_datetime(before) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 35948a3f3dcf1..8ee09d8ad9be3 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2086,12 +2086,25 @@ def inferred_type(self) -> str_t: return lib.infer_dtype(self._values, skipna=False) @cache_readonly - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: """ Whether or not the index values only consist of dates. """ return is_datetime_array(ensure_object(self._values)) + @cache_readonly + def is_all_dates(self): + """ + Whether or not the index values only consist of dates. + """ + warnings.warn( + "Index.is_all_dates is deprecated, will be removed in a future version. 
" + "check index.inferred_type instead", + FutureWarning, + stacklevel=2, + ) + return self._is_all_dates + # -------------------------------------------------------------------- # Pickle Methods diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index e2f59ceb41db5..3d2820976a6af 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -98,7 +98,7 @@ class DatetimeIndexOpsMixin(ExtensionIndex): _hasnans = hasnans # for index / array -agnostic code @property - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: return True # ------------------------------------------------------------------------ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 6b877b378a140..8855d987af745 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -1092,7 +1092,7 @@ def func(self, other, sort=sort): # -------------------------------------------------------------------- @property - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: """ This is False even when left/right contain datetime-like objects, as the check is done on the Interval itself diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1de392f6fc03f..c0b32c79435ed 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1733,7 +1733,7 @@ def to_flat_index(self): return Index(self._values, tupleize_cols=False) @property - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: return False def is_lexsorted(self) -> bool: diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 574c9adc31808..34bbaca06cc08 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -149,7 +149,7 @@ def _assert_safe_casting(cls, data, subarr): pass @property - def is_all_dates(self) -> bool: + def _is_all_dates(self) -> bool: """ Checks that all the labels are datetime 
objects. """ diff --git a/pandas/core/missing.py b/pandas/core/missing.py index f4182027e9e04..c2926debcb6d6 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -199,7 +199,7 @@ def interpolate_1d( return yvalues if method == "time": - if not getattr(xvalues, "is_all_dates", None): + if not getattr(xvalues, "_is_all_dates", None): # if not issubclass(xvalues.dtype.type, np.datetime64): raise ValueError( "time-weighted interpolation only works " @@ -327,7 +327,7 @@ def _interpolate_scipy_wrapper( "piecewise_polynomial": _from_derivatives, } - if getattr(x, "is_all_dates", False): + if getattr(x, "_is_all_dates", False): # GH 5975, scipy.interp1d can't handle datetime64s x, new_x = x._values.astype("i8"), new_x.astype("i8") diff --git a/pandas/core/series.py b/pandas/core/series.py index 41c3e8fa9d246..d2c702d924136 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -409,14 +409,20 @@ def _set_axis(self, axis: int, labels, fastpath: bool = False) -> None: if not fastpath: labels = ensure_index(labels) - is_all_dates = labels.is_all_dates - if is_all_dates: + if labels._is_all_dates: if not isinstance(labels, (DatetimeIndex, PeriodIndex, TimedeltaIndex)): try: labels = DatetimeIndex(labels) # need to set here because we changed the index if fastpath: self._mgr.set_axis(axis, labels) + warnings.warn( + "Automatically casting object-dtype Index of datetimes to " + "DatetimeIndex is deprecated and will be removed in a " + "future version. Explicitly cast to DatetimeIndex instead.", + FutureWarning, + stacklevel=3, + ) except (tslibs.OutOfBoundsDatetime, ValueError): # labels may exceeds datetime bounds, # or not be a DatetimeIndex diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index 0c64ea824996f..f806325d60eca 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -1238,7 +1238,7 @@ def get_label(i): # would be too close together. 
condition = ( not self._use_dynamic_x() - and (data.index.is_all_dates and self.use_index) + and (data.index._is_all_dates and self.use_index) and (not self.subplots or (self.subplots and self.sharex)) ) diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index b81f0f27e60ad..17a1c69858c11 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -871,7 +871,7 @@ def test_is_all_dates(self): pd.Timestamp("2017-01-01 00:00:00"), pd.Timestamp("2018-01-01 00:00:00") ) year_2017_index = pd.IntervalIndex([year_2017]) - assert not year_2017_index.is_all_dates + assert not year_2017_index._is_all_dates @pytest.mark.parametrize("key", [[5], (2, 3)]) def test_get_value_non_scalar_errors(self, key): diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py index b48f09457b96c..184cedea7dc5c 100644 --- a/pandas/tests/indexes/multi/test_equivalence.py +++ b/pandas/tests/indexes/multi/test_equivalence.py @@ -202,7 +202,7 @@ def test_is_(): def test_is_all_dates(idx): - assert not idx.is_all_dates + assert not idx._is_all_dates def test_is_numeric(idx): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7cafdb61fcb31..77585f4003fe9 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -1153,7 +1153,8 @@ def test_is_object(self, index, expected): indirect=["index"], ) def test_is_all_dates(self, index, expected): - assert index.is_all_dates is expected + with tm.assert_produces_warning(FutureWarning): + assert index.is_all_dates is expected def test_summary(self, index): self._check_method_works(Index._summary, index) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 0cc61cd7df389..7d5fea232817d 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ 
-554,15 +554,17 @@ def test_string_slice(self): # string indexing against datetimelike with object # dtype should properly raises KeyError df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object)) - assert df.index.is_all_dates + assert df.index._is_all_dates with pytest.raises(KeyError, match="'2011'"): df["2011"] with pytest.raises(KeyError, match="'2011'"): - df.loc["2011", 0] + with tm.assert_produces_warning(FutureWarning): + # This does an is_all_dates check + df.loc["2011", 0] df = DataFrame() - assert not df.index.is_all_dates + assert not df.index._is_all_dates with pytest.raises(KeyError, match="'2011'"): df["2011"] diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 0942c79837e7c..10c9475401059 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -2384,10 +2384,16 @@ def test_series(self, setup_path): ts = tm.makeTimeSeries() self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) - ts2 = Series(ts.index, Index(ts.index, dtype=object)) + with tm.assert_produces_warning(FutureWarning): + # auto-casting object->DatetimeIndex deprecated + ts2 = Series(ts.index, Index(ts.index, dtype=object)) self._check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) - ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) + with tm.assert_produces_warning(FutureWarning): + # auto-casting object->DatetimeIndex deprecated + ts3 = Series( + ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object) + ) self._check_roundtrip( ts3, tm.assert_series_equal, path=setup_path, check_index_type=False ) diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 203750757e28d..181d7de43d945 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -52,4 +52,4 @@ def test_set_index_makes_timeseries(self): s = Series(range(10)) s.index = idx - assert 
s.index.is_all_dates + assert s.index._is_all_dates diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1b5fddaf14335..4ad4917533422 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -94,11 +94,11 @@ def test_scalar_conversion(self): def test_constructor(self, datetime_series): with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): empty_series = Series() - assert datetime_series.index.is_all_dates + assert datetime_series.index._is_all_dates # Pass in Series derived = Series(datetime_series) - assert derived.index.is_all_dates + assert derived.index._is_all_dates assert tm.equalContents(derived.index, datetime_series.index) # Ensure new index is not created @@ -109,9 +109,9 @@ def test_constructor(self, datetime_series): assert mixed.dtype == np.object_ assert mixed[1] is np.NaN - assert not empty_series.index.is_all_dates + assert not empty_series.index._is_all_dates with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): - assert not Series().index.is_all_dates + assert not Series().index._is_all_dates # exception raised is of type Exception with pytest.raises(Exception, match="Data must be 1-dimensional"): diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index b861b37b49f89..3aaecc37df56c 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -184,7 +184,9 @@ def test_timeseries_repr_object_dtype(self): index = Index( [datetime(2000, 1, 1) + timedelta(i) for i in range(1000)], dtype=object ) - ts = Series(np.random.randn(len(index)), index) + with tm.assert_produces_warning(FutureWarning): + # Index.is_all_dates deprecated + ts = Series(np.random.randn(len(index)), index) repr(ts) ts = tm.makeTimeSeries(1000) diff --git a/pandas/tests/series/test_timeseries.py b/pandas/tests/series/test_timeseries.py index 15b6481c08a61..bab3853e3bd1d 100644 --- 
a/pandas/tests/series/test_timeseries.py +++ b/pandas/tests/series/test_timeseries.py @@ -9,8 +9,10 @@ class TestTimeSeries: def test_timeseries_coercion(self): idx = tm.makeDateIndex(10000) - ser = Series(np.random.randn(len(idx)), idx.astype(object)) - assert ser.index.is_all_dates + with tm.assert_produces_warning(FutureWarning): + ser = Series(np.random.randn(len(idx)), idx.astype(object)) + with tm.assert_produces_warning(FutureWarning): + assert ser.index.is_all_dates assert isinstance(ser.index, DatetimeIndex) def test_contiguous_boolean_preserve_freq(self): diff --git a/pandas/tests/window/moments/test_moments_rolling_apply.py b/pandas/tests/window/moments/test_moments_rolling_apply.py index e48d88b365d8d..e9e672e1d3dae 100644 --- a/pandas/tests/window/moments/test_moments_rolling_apply.py +++ b/pandas/tests/window/moments/test_moments_rolling_apply.py @@ -122,13 +122,16 @@ def test_center_reindex_series(raw, series): s = [f"x{x:d}" for x in range(12)] minp = 10 - series_xp = ( - series.reindex(list(series.index) + s) - .rolling(window=25, min_periods=minp) - .apply(f, raw=raw) - .shift(-12) - .reindex(series.index) - ) + warn = None if raw else FutureWarning + with tm.assert_produces_warning(warn, check_stacklevel=False): + # GH#36697 is_all_dates deprecated + series_xp = ( + series.reindex(list(series.index) + s) + .rolling(window=25, min_periods=minp) + .apply(f, raw=raw) + .shift(-12) + .reindex(series.index) + ) series_rs = series.rolling(window=25, min_periods=minp, center=True).apply( f, raw=raw ) @@ -140,12 +143,15 @@ def test_center_reindex_frame(raw, frame): s = [f"x{x:d}" for x in range(12)] minp = 10 - frame_xp = ( - frame.reindex(list(frame.index) + s) - .rolling(window=25, min_periods=minp) - .apply(f, raw=raw) - .shift(-12) - .reindex(frame.index) - ) + warn = None if raw else FutureWarning + with tm.assert_produces_warning(warn, check_stacklevel=False): + # GH#36697 is_all_dates deprecated + frame_xp = ( + frame.reindex(list(frame.index) + 
s) + .rolling(window=25, min_periods=minp) + .apply(f, raw=raw) + .shift(-12) + .reindex(frame.index) + ) frame_rs = frame.rolling(window=25, min_periods=minp, center=True).apply(f, raw=raw) tm.assert_frame_equal(frame_xp, frame_rs) From 9d4fce0f36216f7b3e6b2cf6d1040017df655f4e Mon Sep 17 00:00:00 2001 From: Fangchen Li Date: Tue, 29 Sep 2020 21:23:31 -0500 Subject: [PATCH 0945/1025] bump pytables to 3.5.1 #24839 (#36683) --- ci/deps/azure-37-minimum_versions.yaml | 2 +- ci/deps/travis-37-locale.yaml | 4 ++-- doc/source/getting_started/install.rst | 4 ++-- doc/source/whatsnew/v1.2.0.rst | 3 ++- environment.yml | 2 +- pandas/tests/io/pytables/test_complex.py | 5 ----- pandas/tests/io/pytables/test_store.py | 22 +--------------------- pandas/util/_test_decorators.py | 15 --------------- requirements-dev.txt | 2 +- 9 files changed, 10 insertions(+), 49 deletions(-) diff --git a/ci/deps/azure-37-minimum_versions.yaml b/ci/deps/azure-37-minimum_versions.yaml index afd5b07cc6654..f184ea87c89fe 100644 --- a/ci/deps/azure-37-minimum_versions.yaml +++ b/ci/deps/azure-37-minimum_versions.yaml @@ -20,7 +20,7 @@ dependencies: - numexpr=2.6.8 - numpy=1.16.5 - openpyxl=2.6.0 - - pytables=3.4.4 + - pytables=3.5.1 - python-dateutil=2.7.3 - pytz=2017.3 - pyarrow=0.15 diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index cd6341e80be24..ddaf0bea097c7 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -13,7 +13,7 @@ dependencies: # pandas dependencies - beautifulsoup4 - - blosc=1.14.3 + - blosc=1.15.0 - python-blosc - fastparquet=0.3.2 - html5lib @@ -30,7 +30,7 @@ dependencies: - pyarrow>=0.17 - psycopg2=2.7 - pymysql=0.7.11 - - pytables + - pytables>=3.5.1 - python-dateutil - pytz - scipy diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 78bd76bbd230f..a6341451b1b80 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -266,7 +266,7 
@@ PyTables 3.4.4 HDF5-based reading / writing SQLAlchemy 1.2.8 SQL support for databases other than sqlite SciPy 1.12.0 Miscellaneous statistical functions xlsxwriter 1.0.2 Excel writing -blosc 1.14.3 Compression for HDF5 +blosc 1.15.0 Compression for HDF5 fsspec 0.7.4 Handling files aside from local and HTTP fastparquet 0.3.2 Parquet reading / writing gcsfs 0.6.0 Google Cloud Storage access @@ -280,7 +280,7 @@ psycopg2 2.7 PostgreSQL engine for sqlalchemy pyarrow 0.15.0 Parquet, ORC, and feather reading / writing pymysql 0.7.11 MySQL engine for sqlalchemy pyreadstat SPSS files (.sav) reading -pytables 3.4.4 HDF5 reading / writing +pytables 3.5.1 HDF5 reading / writing pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O s3fs 0.4.0 Amazon S3 access diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 0f21d1dbd9857..f87dac0669e00 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -181,7 +181,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | pymysql | 0.7.11 | X | +-----------------+-----------------+---------+ -| pytables | 3.4.4 | X | +| pytables | 3.5.1 | X | +-----------------+-----------------+---------+ | s3fs | 0.4.0 | | +-----------------+-----------------+---------+ @@ -331,6 +331,7 @@ I/O - Bug in :func:`LongTableBuilder.middle_separator` was duplicating LaTeX longtable entries in the List of Tables of a LaTeX document (:issue:`34360`) - Bug in :meth:`read_csv` with ``engine='python'`` truncating data if multiple items present in first row and first element started with BOM (:issue:`36343`) - Removed ``private_key`` and ``verbose`` from :func:`read_gbq` as they are no longer supported in ``pandas-gbq`` (:issue:`34654`, :issue:`30200`) +- Bumped minimum pytables version to 3.5.1 to avoid a ``ValueError`` in :meth:`read_hdf` (:issue:`24839`) Plotting ^^^^^^^^ diff --git a/environment.yml b/environment.yml 
index f97f8e2457585..6e9b417beb0af 100644 --- a/environment.yml +++ b/environment.yml @@ -100,7 +100,7 @@ dependencies: - python-snappy # required by pyarrow - pyqt>=5.9.2 # pandas.read_clipboard - - pytables>=3.4.4 # pandas.read_hdf, DataFrame.to_hdf + - pytables>=3.5.1 # pandas.read_hdf, DataFrame.to_hdf - s3fs>=0.4.0 # file IO when using 's3://...' path - fsspec>=0.7.4 # for generic remote file operations - gcsfs>=0.6.0 # file IO when using 'gcs://...' path diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 543940e674dba..3a7aff3b551c2 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -3,8 +3,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import DataFrame, Series import pandas._testing as tm @@ -12,8 +10,6 @@ from pandas.io.pytables import read_hdf -# GH10447 - def test_complex_fixed(setup_path): df = DataFrame( @@ -62,7 +58,6 @@ def test_complex_table(setup_path): tm.assert_frame_equal(df, reread) -@td.xfail_non_writeable def test_complex_mixed_fixed(setup_path): complex64 = np.array( [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64 diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 10c9475401059..ccb2efbd2c630 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -909,7 +909,6 @@ def test_put_integer(self, setup_path): df = DataFrame(np.random.randn(50, 100)) self._check_roundtrip(df, tm.assert_frame_equal, setup_path) - @td.xfail_non_writeable def test_put_mixed_type(self, setup_path): df = tm.makeTimeDataFrame() df["obj1"] = "foo" @@ -1518,9 +1517,7 @@ def test_to_hdf_with_min_itemsize(self, setup_path): pd.read_hdf(path, "ss4"), pd.concat([df["B"], df2["B"]]) ) - @pytest.mark.parametrize( - "format", [pytest.param("fixed", marks=td.xfail_non_writeable), "table"] - ) + 
@pytest.mark.parametrize("format", ["fixed", "table"]) def test_to_hdf_errors(self, format, setup_path): data = ["\ud800foo"] @@ -1956,7 +1953,6 @@ def test_pass_spec_to_storer(self, setup_path): with pytest.raises(TypeError): store.select("df", where=[("columns=A")]) - @td.xfail_non_writeable def test_append_misc(self, setup_path): with ensure_clean_store(setup_path) as store: @@ -2164,14 +2160,6 @@ def test_unimplemented_dtypes_table_columns(self, setup_path): with pytest.raises(TypeError): store.append("df_unimplemented", df) - @td.xfail_non_writeable - @pytest.mark.skipif( - LooseVersion(np.__version__) == LooseVersion("1.15.0"), - reason=( - "Skipping pytables test when numpy version is " - "exactly equal to 1.15.0: gh-22098" - ), - ) def test_calendar_roundtrip_issue(self, setup_path): # 8591 @@ -2405,7 +2393,6 @@ def test_float_index(self, setup_path): s = Series(np.random.randn(10), index=index) self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) - @td.xfail_non_writeable def test_tuple_index(self, setup_path): # GH #492 @@ -2418,7 +2405,6 @@ def test_tuple_index(self, setup_path): simplefilter("ignore", pd.errors.PerformanceWarning) self._check_roundtrip(DF, tm.assert_frame_equal, path=setup_path) - @td.xfail_non_writeable @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") def test_index_types(self, setup_path): @@ -2480,7 +2466,6 @@ def test_timeseries_preepoch(self, setup_path): except OverflowError: pytest.skip("known failer on some windows platforms") - @td.xfail_non_writeable @pytest.mark.parametrize( "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] ) @@ -2514,7 +2499,6 @@ def test_frame(self, compression, setup_path): # empty self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) - @td.xfail_non_writeable def test_empty_series_frame(self, setup_path): s0 = Series(dtype=object) s1 = Series(name="myseries", dtype=object) @@ -2528,7 +2512,6 @@ def 
test_empty_series_frame(self, setup_path): self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) - @td.xfail_non_writeable @pytest.mark.parametrize( "dtype", [np.int64, np.float64, object, "m8[ns]", "M8[ns]"] ) @@ -2614,7 +2597,6 @@ def test_store_series_name(self, setup_path): recons = store["series"] tm.assert_series_equal(recons, series) - @td.xfail_non_writeable @pytest.mark.parametrize( "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] ) @@ -4182,7 +4164,6 @@ def test_pytables_native2_read(self, datapath, setup_path): d1 = store["detector"] assert isinstance(d1, DataFrame) - @td.xfail_non_writeable def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path): # GH 24510 # legacy table with fixed format written in Python 2 @@ -4356,7 +4337,6 @@ def test_unicode_longer_encoded(self, setup_path): result = store.get("df") tm.assert_frame_equal(result, df) - @td.xfail_non_writeable def test_store_datetime_mixed(self, setup_path): df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}) diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 0e8f6b933cd97..e3b779678c68b 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -75,21 +75,6 @@ def safe_import(mod_name: str, min_version: Optional[str] = None): return False -# TODO: -# remove when gh-24839 is fixed. -# this affects numpy 1.16 and pytables 3.4.4 -tables = safe_import("tables") -xfail_non_writeable = pytest.mark.xfail( - tables - and LooseVersion(np.__version__) >= LooseVersion("1.16") - and LooseVersion(tables.__version__) < LooseVersion("3.5.1"), - reason=( - "gh-25511, gh-24839. 
pytables needs a " - "release beyond 3.4.4 to support numpy 1.16.x" - ), -) - - def _skip_if_no_mpl(): mod = safe_import("matplotlib") if mod: diff --git a/requirements-dev.txt b/requirements-dev.txt index 5a1c6a80334ed..8f3dd20f309aa 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -67,7 +67,7 @@ fastparquet>=0.3.2 pyarrow>=0.15.0 python-snappy pyqt5>=5.9.2 -tables>=3.4.4 +tables>=3.5.1 s3fs>=0.4.0 fsspec>=0.7.4 gcsfs>=0.6.0 From 9bde3002d3e859b72480a64e5e16009cbe38675e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 29 Sep 2020 20:50:56 -0700 Subject: [PATCH 0946/1025] CLN: More pytest idioms in pandas/tests/window (#36657) * Clean test_timeseries_window * CLN: More pytest idioms to pandas/tests/window Co-authored-by: Matt Roeschke --- pandas/tests/window/conftest.py | 33 ++++ pandas/tests/window/test_apply.py | 2 - pandas/tests/window/test_base_indexer.py | 4 +- pandas/tests/window/test_expanding.py | 23 ++- pandas/tests/window/test_grouper.py | 97 +++++---- pandas/tests/window/test_pairwise.py | 185 +++++++++--------- pandas/tests/window/test_rolling.py | 35 ++-- pandas/tests/window/test_timeseries_window.py | 113 ++++++----- pandas/tests/window/test_window.py | 27 ++- 9 files changed, 295 insertions(+), 224 deletions(-) diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 3b4ed4859b1cc..e5c5579d35a5c 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -344,3 +344,36 @@ def halflife_with_times(request): def dtypes(request): """Dtypes for window tests""" return request.param + + +@pytest.fixture( + params=[ + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", "C"]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1.0, 0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0.0, 1]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], 
columns=["C", 1]), + DataFrame([[2.0, 4.0], [1.0, 2.0], [5.0, 2.0], [8.0, 1.0]], columns=[1, 0.0]), + DataFrame([[2, 4.0], [1, 2.0], [5, 2.0], [8, 1.0]], columns=[0, 1.0]), + DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.0]], columns=[1.0, "X"]), + ] +) +def pairwise_frames(request): + """Pairwise frames test_pairwise""" + return request.param + + +@pytest.fixture +def pairwise_target_frame(): + """Pairwise target frame for test_pairwise""" + return DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]) + + +@pytest.fixture +def pairwise_other_frame(): + """Pairwise other frame for test_pairwise""" + return DataFrame( + [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1]], + columns=["Y", "Z", "X"], + ) diff --git a/pandas/tests/window/test_apply.py b/pandas/tests/window/test_apply.py index bc38634da8941..b7343d835fa6e 100644 --- a/pandas/tests/window/test_apply.py +++ b/pandas/tests/window/test_apply.py @@ -155,8 +155,6 @@ def foo(x, par): result = df.rolling(1).apply(foo, args=args_kwargs[0], kwargs=args_kwargs[1]) tm.assert_frame_equal(result, expected) - result = df.rolling(1).apply(foo, args=(10,)) - midx = MultiIndex.from_tuples([(1, 0), (1, 1)], names=["gr", None]) expected = Series([11.0, 12.0], index=midx, name="a") diff --git a/pandas/tests/window/test_base_indexer.py b/pandas/tests/window/test_base_indexer.py index ab73e075eed04..f681b19d57600 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -148,12 +148,12 @@ def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs) match = "Forward-looking windows can't have center=True" with pytest.raises(ValueError, match=match): rolling = constructor(values).rolling(window=indexer, center=True) - result = getattr(rolling, func)() + getattr(rolling, func)() match = "Forward-looking windows don't support setting the closed argument" with pytest.raises(ValueError, match=match): rolling = constructor(values).rolling(window=indexer, 
closed="right") - result = getattr(rolling, func)() + getattr(rolling, func)() rolling = constructor(values).rolling(window=indexer, min_periods=2) result = getattr(rolling, func)() diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 146eca07c523e..e5006fd391f90 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -29,15 +29,22 @@ def test_constructor(which): c(min_periods=1, center=True) c(min_periods=1, center=False) + +@pytest.mark.parametrize("w", [2.0, "foo", np.array([2])]) +@pytest.mark.filterwarnings( + "ignore:The `center` argument on `expanding` will be removed in the future" +) +def test_constructor_invalid(which, w): # not valid - for w in [2.0, "foo", np.array([2])]: - msg = "min_periods must be an integer" - with pytest.raises(ValueError, match=msg): - c(min_periods=w) - - msg = "center must be a boolean" - with pytest.raises(ValueError, match=msg): - c(min_periods=1, center=w) + + c = which.expanding + msg = "min_periods must be an integer" + with pytest.raises(ValueError, match=msg): + c(min_periods=w) + + msg = "center must be a boolean" + with pytest.raises(ValueError, match=msg): + c(min_periods=1, center=w) @pytest.mark.parametrize("method", ["std", "mean", "sum", "max", "min", "var"]) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 786cf68d28871..0eebd657e97b7 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -8,7 +8,7 @@ class TestGrouperGrouping: - def setup_method(self, method): + def setup_method(self): self.series = Series(np.arange(10)) self.frame = DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) @@ -55,19 +55,25 @@ def test_getitem_multiple(self): result = r.B.count() tm.assert_series_equal(result, expected) - def test_rolling(self): + @pytest.mark.parametrize( + "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"] + ) + def 
test_rolling(self, f): g = self.frame.groupby("A") r = g.rolling(window=4) - for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: - result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.rolling(4), f)()) - tm.assert_frame_equal(result, expected) + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.rolling(4), f)()) + tm.assert_frame_equal(result, expected) - for f in ["std", "var"]: - result = getattr(r, f)(ddof=1) - expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("f", ["std", "var"]) + def test_rolling_ddof(self, f): + g = self.frame.groupby("A") + r = g.rolling(window=4) + + result = getattr(r, f)(ddof=1) + expected = g.apply(lambda x: getattr(x.rolling(4), f)(ddof=1)) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] @@ -81,26 +87,26 @@ def test_rolling_quantile(self, interpolation): ) tm.assert_frame_equal(result, expected) - def test_rolling_corr_cov(self): + @pytest.mark.parametrize("f", ["corr", "cov"]) + def test_rolling_corr_cov(self, f): g = self.frame.groupby("A") r = g.rolling(window=4) - for f in ["corr", "cov"]: - result = getattr(r, f)(self.frame) + result = getattr(r, f)(self.frame) - def func(x): - return getattr(x.rolling(4), f)(self.frame) + def func(x): + return getattr(x.rolling(4), f)(self.frame) - expected = g.apply(func) - tm.assert_frame_equal(result, expected) + expected = g.apply(func) + tm.assert_frame_equal(result, expected) - result = getattr(r.B, f)(pairwise=True) + result = getattr(r.B, f)(pairwise=True) - def func(x): - return getattr(x.B.rolling(4), f)(pairwise=True) + def func(x): + return getattr(x.B.rolling(4), f)(pairwise=True) - expected = g.apply(func) - tm.assert_series_equal(result, expected) + expected = g.apply(func) + tm.assert_series_equal(result, expected) def test_rolling_apply(self, raw): g = 
self.frame.groupby("A") @@ -134,20 +140,25 @@ def test_rolling_apply_mutability(self): result = g.rolling(window=2).sum() tm.assert_frame_equal(result, expected) - def test_expanding(self): + @pytest.mark.parametrize( + "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"] + ) + def test_expanding(self, f): g = self.frame.groupby("A") r = g.expanding() - for f in ["sum", "mean", "min", "max", "count", "kurt", "skew"]: + result = getattr(r, f)() + expected = g.apply(lambda x: getattr(x.expanding(), f)()) + tm.assert_frame_equal(result, expected) - result = getattr(r, f)() - expected = g.apply(lambda x: getattr(x.expanding(), f)()) - tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("f", ["std", "var"]) + def test_expanding_ddof(self, f): + g = self.frame.groupby("A") + r = g.expanding() - for f in ["std", "var"]: - result = getattr(r, f)(ddof=0) - expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) - tm.assert_frame_equal(result, expected) + result = getattr(r, f)(ddof=0) + expected = g.apply(lambda x: getattr(x.expanding(), f)(ddof=0)) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "midpoint", "nearest"] @@ -161,26 +172,26 @@ def test_expanding_quantile(self, interpolation): ) tm.assert_frame_equal(result, expected) - def test_expanding_corr_cov(self): + @pytest.mark.parametrize("f", ["corr", "cov"]) + def test_expanding_corr_cov(self, f): g = self.frame.groupby("A") r = g.expanding() - for f in ["corr", "cov"]: - result = getattr(r, f)(self.frame) + result = getattr(r, f)(self.frame) - def func(x): - return getattr(x.expanding(), f)(self.frame) + def func(x): + return getattr(x.expanding(), f)(self.frame) - expected = g.apply(func) - tm.assert_frame_equal(result, expected) + expected = g.apply(func) + tm.assert_frame_equal(result, expected) - result = getattr(r.B, f)(pairwise=True) + result = getattr(r.B, f)(pairwise=True) - def func(x): - return 
getattr(x.B.expanding(), f)(pairwise=True) + def func(x): + return getattr(x.B.expanding(), f)(pairwise=True) - expected = g.apply(func) - tm.assert_series_equal(result, expected) + expected = g.apply(func) + tm.assert_series_equal(result, expected) def test_expanding_apply(self, raw): g = self.frame.groupby("A") diff --git a/pandas/tests/window/test_pairwise.py b/pandas/tests/window/test_pairwise.py index 7f4e85b385b2d..b39d052a702c0 100644 --- a/pandas/tests/window/test_pairwise.py +++ b/pandas/tests/window/test_pairwise.py @@ -11,26 +11,15 @@ class TestPairwise: # GH 7738 - df1s = [ - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", "C"]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[1.0, 0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=[0.0, 1]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1]], columns=["C", 1]), - DataFrame([[2.0, 4.0], [1.0, 2.0], [5.0, 2.0], [8.0, 1.0]], columns=[1, 0.0]), - DataFrame([[2, 4.0], [1, 2.0], [5, 2.0], [8, 1.0]], columns=[0, 1.0]), - DataFrame([[2, 4], [1, 2], [5, 2], [8, 1.0]], columns=[1.0, "X"]), - ] - df2 = DataFrame( - [[None, 1, 1], [None, 1, 2], [None, 3, 2], [None, 8, 1]], - columns=["Y", "Z", "X"], - ) - s = Series([1, 1, 3, 8]) + @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()]) + def test_no_flex(self, pairwise_frames, pairwise_target_frame, f): - def compare(self, result, expected): + # DataFrame methods (which do not call flex_binary_moment()) + result = f(pairwise_frames) + tm.assert_index_equal(result.index, pairwise_frames.columns) + tm.assert_index_equal(result.columns, pairwise_frames.columns) + expected = f(pairwise_target_frame) # since we have sorted the results # we can only compare non-nans result = result.dropna().values @@ -38,19 +27,6 @@ def compare(self, result, 
expected): tm.assert_numpy_array_equal(result, expected, check_dtype=False) - @pytest.mark.parametrize("f", [lambda x: x.cov(), lambda x: x.corr()]) - def test_no_flex(self, f): - - # DataFrame methods (which do not call flex_binary_moment()) - - results = [f(df) for df in self.df1s] - for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index, df.columns) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) - @pytest.mark.parametrize( "f", [ @@ -62,24 +38,27 @@ def test_no_flex(self, f): lambda x: x.ewm(com=3).corr(pairwise=True), ], ) - def test_pairwise_with_self(self, f): + def test_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f): # DataFrame with itself, pairwise=True # note that we may construct the 1st level of the MI # in a non-monotonic way, so compare accordingly - results = [] - for i, df in enumerate(self.df1s): - result = f(df) - tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) - tm.assert_numpy_array_equal( - safe_sort(result.index.levels[1]), safe_sort(df.columns.unique()) - ) - tm.assert_index_equal(result.columns, df.columns) - results.append(df) - - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) + result = f(pairwise_frames) + tm.assert_index_equal( + result.index.levels[0], pairwise_frames.index, check_names=False + ) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), + safe_sort(pairwise_frames.columns.unique()), + ) + tm.assert_index_equal(result.columns, pairwise_frames.columns) + expected = f(pairwise_target_frame) + # since we have sorted the results + # we can only compare non-nans + result = result.dropna().values + expected = expected.dropna().values + + tm.assert_numpy_array_equal(result, expected, check_dtype=False) @pytest.mark.parametrize( "f", @@ -92,16 +71,19 @@ def test_pairwise_with_self(self, f): lambda x: 
x.ewm(com=3).corr(pairwise=False), ], ) - def test_no_pairwise_with_self(self, f): + def test_no_pairwise_with_self(self, pairwise_frames, pairwise_target_frame, f): # DataFrame with itself, pairwise=False - results = [f(df) for df in self.df1s] - for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index, df.index) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) + result = f(pairwise_frames) + tm.assert_index_equal(result.index, pairwise_frames.index) + tm.assert_index_equal(result.columns, pairwise_frames.columns) + expected = f(pairwise_target_frame) + # since we have sorted the results + # we can only compare non-nans + result = result.dropna().values + expected = expected.dropna().values + + tm.assert_numpy_array_equal(result, expected, check_dtype=False) @pytest.mark.parametrize( "f", @@ -114,18 +96,26 @@ def test_no_pairwise_with_self(self, f): lambda x, y: x.ewm(com=3).corr(y, pairwise=True), ], ) - def test_pairwise_with_other(self, f): + def test_pairwise_with_other( + self, pairwise_frames, pairwise_target_frame, pairwise_other_frame, f + ): # DataFrame with another DataFrame, pairwise=True - results = [f(df, self.df2) for df in self.df1s] - for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index.levels[0], df.index, check_names=False) - tm.assert_numpy_array_equal( - safe_sort(result.index.levels[1]), safe_sort(self.df2.columns.unique()) - ) - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) + result = f(pairwise_frames, pairwise_other_frame) + tm.assert_index_equal( + result.index.levels[0], pairwise_frames.index, check_names=False + ) + tm.assert_numpy_array_equal( + safe_sort(result.index.levels[1]), + safe_sort(pairwise_other_frame.columns.unique()), + ) + expected = f(pairwise_target_frame, pairwise_other_frame) + # since we have sorted the results + # we can only compare 
non-nans + result = result.dropna().values + expected = expected.dropna().values + + tm.assert_numpy_array_equal(result, expected, check_dtype=False) @pytest.mark.parametrize( "f", @@ -138,26 +128,29 @@ def test_pairwise_with_other(self, f): lambda x, y: x.ewm(com=3).corr(y, pairwise=False), ], ) - def test_no_pairwise_with_other(self, f): + def test_no_pairwise_with_other(self, pairwise_frames, pairwise_other_frame, f): # DataFrame with another DataFrame, pairwise=False - results = [ - f(df, self.df2) if df.columns.is_unique else None for df in self.df1s - ] - for (df, result) in zip(self.df1s, results): - if result is not None: - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - # we can have int and str columns - expected_index = df.index.union(self.df2.index) - expected_columns = df.columns.union(self.df2.columns) - tm.assert_index_equal(result.index, expected_index) - tm.assert_index_equal(result.columns, expected_columns) - else: - with pytest.raises(ValueError, match="'arg1' columns are not unique"): - f(df, self.df2) - with pytest.raises(ValueError, match="'arg2' columns are not unique"): - f(self.df2, df) + result = ( + f(pairwise_frames, pairwise_other_frame) + if pairwise_frames.columns.is_unique + else None + ) + if result is not None: + with warnings.catch_warnings(record=True): + warnings.simplefilter("ignore", RuntimeWarning) + # we can have int and str columns + expected_index = pairwise_frames.index.union(pairwise_other_frame.index) + expected_columns = pairwise_frames.columns.union( + pairwise_other_frame.columns + ) + tm.assert_index_equal(result.index, expected_index) + tm.assert_index_equal(result.columns, expected_columns) + else: + with pytest.raises(ValueError, match="'arg1' columns are not unique"): + f(pairwise_frames, pairwise_other_frame) + with pytest.raises(ValueError, match="'arg2' columns are not unique"): + f(pairwise_other_frame, pairwise_frames) @pytest.mark.parametrize( "f", @@ -170,18 
+163,28 @@ def test_no_pairwise_with_other(self, f): lambda x, y: x.ewm(com=3).corr(y), ], ) - def test_pairwise_with_series(self, f): + def test_pairwise_with_series(self, pairwise_frames, pairwise_target_frame, f): # DataFrame with a Series - results = [f(df, self.s) for df in self.df1s] + [ - f(self.s, df) for df in self.df1s - ] - for (df, result) in zip(self.df1s, results): - tm.assert_index_equal(result.index, df.index) - tm.assert_index_equal(result.columns, df.columns) - for i, result in enumerate(results): - if i > 0: - self.compare(result, results[0]) + result = f(pairwise_frames, Series([1, 1, 3, 8])) + tm.assert_index_equal(result.index, pairwise_frames.index) + tm.assert_index_equal(result.columns, pairwise_frames.columns) + expected = f(pairwise_target_frame, Series([1, 1, 3, 8])) + # since we have sorted the results + # we can only compare non-nans + result = result.dropna().values + expected = expected.dropna().values + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + result = f(Series([1, 1, 3, 8]), pairwise_frames) + tm.assert_index_equal(result.index, pairwise_frames.index) + tm.assert_index_equal(result.columns, pairwise_frames.columns) + expected = f(Series([1, 1, 3, 8]), pairwise_target_frame) + # since we have sorted the results + # we can only compare non-nans + result = result.dropna().values + expected = expected.dropna().values + tm.assert_numpy_array_equal(result, expected, check_dtype=False) def test_corr_freq_memory_error(self): # GH 31789 diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 4dfa0287bbb03..9ac4871ad24a1 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -39,22 +39,27 @@ def test_constructor(which): with pytest.raises(ValueError, match=msg): c(-1) + +@pytest.mark.parametrize("w", [2.0, "foo", np.array([2])]) +def test_invalid_constructor(which, w): # not valid - for w in [2.0, "foo", np.array([2])]: - msg = ( - "window 
must be an integer|" - "passed window foo is not compatible with a datetimelike index" - ) - with pytest.raises(ValueError, match=msg): - c(window=w) - - msg = "min_periods must be an integer" - with pytest.raises(ValueError, match=msg): - c(window=2, min_periods=w) - - msg = "center must be a boolean" - with pytest.raises(ValueError, match=msg): - c(window=2, min_periods=1, center=w) + + c = which.rolling + + msg = ( + "window must be an integer|" + "passed window foo is not compatible with a datetimelike index" + ) + with pytest.raises(ValueError, match=msg): + c(window=w) + + msg = "min_periods must be an integer" + with pytest.raises(ValueError, match=msg): + c(window=2, min_periods=w) + + msg = "center must be a boolean" + with pytest.raises(ValueError, match=msg): + c(window=2, min_periods=1, center=w) @td.skip_if_no_scipy diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index 8aa4d7103e48a..ea4d7df6700e9 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -50,41 +50,44 @@ def test_doc_string(self): df df.rolling("2s").sum() - def test_valid(self): - - df = self.regular + def test_invalid_window_non_int(self): # not a valid freq msg = "passed window foobar is not compatible with a datetimelike index" with pytest.raises(ValueError, match=msg): - df.rolling(window="foobar") + self.regular.rolling(window="foobar") # not a datetimelike index msg = "window must be an integer" with pytest.raises(ValueError, match=msg): - df.reset_index().rolling(window="foobar") + self.regular.reset_index().rolling(window="foobar") + + @pytest.mark.parametrize("freq", ["2MS", offsets.MonthBegin(2)]) + def test_invalid_window_nonfixed(self, freq): # non-fixed freqs msg = "\\<2 \\* MonthBegins\\> is a non-fixed frequency" - for freq in ["2MS", offsets.MonthBegin(2)]: - with pytest.raises(ValueError, match=msg): - df.rolling(window=freq) + with 
pytest.raises(ValueError, match=msg): + self.regular.rolling(window=freq) - for freq in ["1D", offsets.Day(2), "2ms"]: - df.rolling(window=freq) + @pytest.mark.parametrize("freq", ["1D", offsets.Day(2), "2ms"]) + def test_valid_window(self, freq): + self.regular.rolling(window=freq) + @pytest.mark.parametrize("minp", [1.0, "foo", np.array([1, 2, 3])]) + def test_invalid_minp(self, minp): # non-integer min_periods msg = ( r"local variable 'minp' referenced before assignment|" "min_periods must be an integer" ) - for minp in [1.0, "foo", np.array([1, 2, 3])]: - with pytest.raises(ValueError, match=msg): - df.rolling(window="1D", min_periods=minp) + with pytest.raises(ValueError, match=msg): + self.regular.rolling(window="1D", min_periods=minp) + def test_invalid_center_datetimelike(self): # center is not implemented msg = "center is not implemented for datetimelike and offset based windows" with pytest.raises(NotImplementedError, match=msg): - df.rolling(window="1D", center=True) + self.regular.rolling(window="1D", center=True) def test_on(self): @@ -585,14 +588,9 @@ def test_freqs_ops(self, freq, op, result_data): tm.assert_series_equal(result, expected) - def test_all(self): - - # simple comparison of integer vs time-based windowing - df = self.regular * 2 - er = df.rolling(window=1) - r = df.rolling(window="1s") - - for f in [ + @pytest.mark.parametrize( + "f", + [ "sum", "mean", "count", @@ -603,29 +601,26 @@ def test_all(self): "skew", "min", "max", - ]: + ], + ) + def test_all(self, f): + + # simple comparison of integer vs time-based windowing + df = self.regular * 2 + er = df.rolling(window=1) + r = df.rolling(window="1s") - result = getattr(r, f)() - expected = getattr(er, f)() - tm.assert_frame_equal(result, expected) + result = getattr(r, f)() + expected = getattr(er, f)() + tm.assert_frame_equal(result, expected) result = r.quantile(0.5) expected = er.quantile(0.5) tm.assert_frame_equal(result, expected) - def test_all2(self): - - # more sophisticated 
comparison of integer vs. - # time-based windowing - df = DataFrame( - {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="H") - ) - # in-range data - dft = df.between_time("09:00", "16:00") - - r = dft.rolling(window="5H") - - for f in [ + @pytest.mark.parametrize( + "f", + [ "sum", "mean", "count", @@ -636,25 +631,35 @@ def test_all2(self): "skew", "min", "max", - ]: + ], + ) + def test_all2(self, f): + + # more sophisticated comparison of integer vs. + # time-based windowing + df = DataFrame( + {"B": np.arange(50)}, index=date_range("20130101", periods=50, freq="H") + ) + # in-range data + dft = df.between_time("09:00", "16:00") + + r = dft.rolling(window="5H") - result = getattr(r, f)() + result = getattr(r, f)() - # we need to roll the days separately - # to compare with a time-based roll - # finally groupby-apply will return a multi-index - # so we need to drop the day - def agg_by_day(x): - x = x.between_time("09:00", "16:00") - return getattr(x.rolling(5, min_periods=1), f)() + # we need to roll the days separately + # to compare with a time-based roll + # finally groupby-apply will return a multi-index + # so we need to drop the day + def agg_by_day(x): + x = x.between_time("09:00", "16:00") + return getattr(x.rolling(5, min_periods=1), f)() - expected = ( - df.groupby(df.index.day) - .apply(agg_by_day) - .reset_index(level=0, drop=True) - ) + expected = ( + df.groupby(df.index.day).apply(agg_by_day).reset_index(level=0, drop=True) + ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) def test_groupby_monotonic(self): diff --git a/pandas/tests/window/test_window.py b/pandas/tests/window/test_window.py index a450d29797c41..a3fff3122f80a 100644 --- a/pandas/tests/window/test_window.py +++ b/pandas/tests/window/test_window.py @@ -19,16 +19,25 @@ def test_constructor(which): c(win_type="boxcar", window=2, min_periods=1, center=True) c(win_type="boxcar", window=2, min_periods=1, center=False) + 
+@pytest.mark.parametrize("w", [2.0, "foo", np.array([2])]) +@td.skip_if_no_scipy +def test_invalid_constructor(which, w): # not valid - for w in [2.0, "foo", np.array([2])]: - with pytest.raises(ValueError, match="min_periods must be an integer"): - c(win_type="boxcar", window=2, min_periods=w) - with pytest.raises(ValueError, match="center must be a boolean"): - c(win_type="boxcar", window=2, min_periods=1, center=w) - - for wt in ["foobar", 1]: - with pytest.raises(ValueError, match="Invalid win_type"): - c(win_type=wt, window=2) + + c = which.rolling + with pytest.raises(ValueError, match="min_periods must be an integer"): + c(win_type="boxcar", window=2, min_periods=w) + with pytest.raises(ValueError, match="center must be a boolean"): + c(win_type="boxcar", window=2, min_periods=1, center=w) + + +@pytest.mark.parametrize("wt", ["foobar", 1]) +@td.skip_if_no_scipy +def test_invalid_constructor_wintype(which, wt): + c = which.rolling + with pytest.raises(ValueError, match="Invalid win_type"): + c(win_type=wt, window=2) @td.skip_if_no_scipy From 3e3cc241576a32392574c3305e18c08591ba6965 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 30 Sep 2020 07:22:25 -0500 Subject: [PATCH 0947/1025] REF: Dispatch string methods to ExtensionArray (#36357) --- ci/code_checks.sh | 2 +- pandas/core/arrays/categorical.py | 22 +- pandas/core/arrays/numpy_.py | 10 +- pandas/core/arrays/string_.py | 62 +- pandas/core/strings.py | 3650 --------------------------- pandas/core/strings/__init__.py | 32 + pandas/core/strings/accessor.py | 3080 ++++++++++++++++++++++ pandas/core/strings/base.py | 225 ++ pandas/core/strings/object_array.py | 432 ++++ pandas/tests/test_strings.py | 143 +- 10 files changed, 3942 insertions(+), 3716 deletions(-) delete mode 100644 pandas/core/strings.py create mode 100644 pandas/core/strings/__init__.py create mode 100644 pandas/core/strings/accessor.py create mode 100644 pandas/core/strings/base.py create mode 100644 
pandas/core/strings/object_array.py diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 54aa830379c07..b8f6bd53d4a59 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -335,7 +335,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests strings.py' ; echo $MSG - pytest -q --doctest-modules pandas/core/strings.py + pytest -q --doctest-modules pandas/core/strings/ RET=$(($RET + $?)) ; echo $MSG "DONE" # Directories diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 16406dd54b577..41c4de51fc2e1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -51,6 +51,7 @@ from pandas.core.missing import interpolate_2d from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.sorting import nargsort +from pandas.core.strings.object_array import ObjectStringArrayMixin from pandas.io.formats import console @@ -176,7 +177,7 @@ def contains(cat, key, container): return any(loc_ in container for loc_ in loc) -class Categorical(NDArrayBackedExtensionArray, PandasObject): +class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): """ Represent a categorical variable in classic R / S-plus fashion. @@ -2305,6 +2306,25 @@ def replace(self, to_replace, value, inplace: bool = False): if not inplace: return cat + # ------------------------------------------------------------------------ + # String methods interface + def _str_map(self, f, na_value=np.nan, dtype=np.dtype(object)): + # Optimization to apply the callable `f` to the categories once + # and rebuild the result by `take`ing from the result with the codes. + # Returns the same type as the object-dtype implementation though. 
+ from pandas.core.arrays import PandasArray + + categories = self.categories + codes = self.codes + result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype) + return take_1d(result, codes, fill_value=na_value) + + def _str_get_dummies(self, sep="|"): + # sep may not be in categories. Just bail on this. + from pandas.core.arrays import PandasArray + + return PandasArray(self.astype(str))._str_get_dummies(sep) + # The Series.cat accessor diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 6b982bf579f04..237d571507a3a 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -16,6 +16,7 @@ from pandas.core.array_algos import masked_reductions from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.base import ExtensionOpsMixin +from pandas.core.strings.object_array import ObjectStringArrayMixin class PandasDtype(ExtensionDtype): @@ -114,7 +115,10 @@ def itemsize(self) -> int: class PandasArray( - NDArrayBackedExtensionArray, ExtensionOpsMixin, NDArrayOperatorsMixin + NDArrayBackedExtensionArray, + ExtensionOpsMixin, + NDArrayOperatorsMixin, + ObjectStringArrayMixin, ): """ A pandas ExtensionArray for NumPy data. 
@@ -376,6 +380,10 @@ def arithmetic_method(self, other): _create_comparison_method = _create_arithmetic_method + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = np.nan + PandasArray._add_arithmetic_ops() PandasArray._add_comparison_ops() diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 5e7066e32ea39..fb126b3725237 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -6,8 +6,14 @@ from pandas._libs import lib, missing as libmissing from pandas.core.dtypes.base import ExtensionDtype, register_extension_dtype -from pandas.core.dtypes.common import pandas_dtype -from pandas.core.dtypes.inference import is_array_like +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_integer_dtype, + is_object_dtype, + is_string_dtype, + pandas_dtype, +) from pandas import compat from pandas.core import ops @@ -347,6 +353,58 @@ def _add_arithmetic_ops(cls): cls.__rmul__ = cls._create_arithmetic_method(ops.rmul) _create_comparison_method = _create_arithmetic_method + # ------------------------------------------------------------------------ + # String methods interface + _str_na_value = StringDtype.na_value + + def _str_map(self, f, na_value=None, dtype=None): + from pandas.arrays import BooleanArray, IntegerArray, StringArray + from pandas.core.arrays.string_ import StringDtype + + if dtype is None: + dtype = StringDtype() + if na_value is None: + na_value = self.dtype.na_value + + mask = isna(self) + arr = np.asarray(self) + + if is_integer_dtype(dtype) or is_bool_dtype(dtype): + constructor: Union[Type[IntegerArray], Type[BooleanArray]] + if is_integer_dtype(dtype): + constructor = IntegerArray + else: + constructor = BooleanArray + + na_value_is_na = isna(na_value) + if na_value_is_na: + na_value = 1 + result = lib.map_infer_mask( + arr, + f, + mask.view("uint8"), + convert=False, + na_value=na_value, + 
dtype=np.dtype(dtype), + ) + + if not na_value_is_na: + mask[:] = False + + return constructor(result, mask) + + elif is_string_dtype(dtype) and not is_object_dtype(dtype): + # i.e. StringDtype + result = lib.map_infer_mask( + arr, f, mask.view("uint8"), convert=False, na_value=na_value + ) + return StringArray(result) + else: + # This is when the result type is object. We reach this when + # -> We know the result type is truly object (e.g. .encode returns bytes + # or .findall returns a list). + # -> We don't know the result type. E.g. `.get` can return anything. + return lib.map_infer_mask(arr, f, mask.view("uint8")) StringArray._add_arithmetic_ops() diff --git a/pandas/core/strings.py b/pandas/core/strings.py deleted file mode 100644 index 4467c96041dc7..0000000000000 --- a/pandas/core/strings.py +++ /dev/null @@ -1,3650 +0,0 @@ -import codecs -from functools import wraps -import re -import textwrap -from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union -import warnings - -import numpy as np - -import pandas._libs.lib as lib -import pandas._libs.missing as libmissing -import pandas._libs.ops as libops -from pandas._typing import ArrayLike, Dtype, Scalar -from pandas.util._decorators import Appender - -from pandas.core.dtypes.common import ( - ensure_object, - is_bool_dtype, - is_categorical_dtype, - is_extension_array_dtype, - is_integer, - is_integer_dtype, - is_list_like, - is_object_dtype, - is_re, - is_scalar, - is_string_dtype, -) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndexClass, - ABCMultiIndex, - ABCSeries, -) -from pandas.core.dtypes.missing import isna - -from pandas.core.algorithms import take_1d -from pandas.core.base import NoNewAttributesMixin -from pandas.core.construction import extract_array - -if TYPE_CHECKING: - from pandas.arrays import StringArray - -_cpython_optimized_encoders = ( - "utf-8", - "utf8", - "latin-1", - "latin1", - "iso-8859-1", - "mbcs", - "ascii", -) 
-_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") - -_shared_docs: Dict[str, str] = dict() - - -def cat_core(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat` - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. - """ - if sep == "": - # no need to interleave sep if it is empty - arr_of_cols = np.asarray(list_of_columns, dtype=object) - return np.sum(arr_of_cols, axis=0) - list_with_sep = [sep] * (2 * len(list_of_columns) - 1) - list_with_sep[::2] = list_of_columns - arr_with_sep = np.asarray(list_with_sep, dtype=object) - return np.sum(arr_with_sep, axis=0) - - -def cat_safe(list_of_columns: List, sep: str): - """ - Auxiliary function for :meth:`str.cat`. - - Same signature as cat_core, but handles TypeErrors in concatenation, which - happen if the arrays in list_of columns have the wrong dtypes or content. - - Parameters - ---------- - list_of_columns : list of numpy arrays - List of arrays to be concatenated with sep; - these arrays may not contain NaNs! - sep : string - The separator string for concatenating the columns. - - Returns - ------- - nd.array - The concatenation of list_of_columns with sep. - """ - try: - result = cat_core(list_of_columns, sep) - except TypeError: - # if there are any non-string values (wrong dtype or hidden behind - # object dtype), np.sum will fail; catch and return with better message - for column in list_of_columns: - dtype = lib.infer_dtype(column, skipna=True) - if dtype not in ["string", "empty"]: - raise TypeError( - "Concatenation requires list-likes containing only " - "strings (or missing values). 
Offending values found in " - f"column {dtype}" - ) from None - return result - - -def _na_map(f, arr, na_result=None, dtype=np.dtype(object)): - if is_extension_array_dtype(arr.dtype): - if na_result is None: - na_result = libmissing.NA - # just StringDtype - arr = extract_array(arr) - return _map_stringarray(f, arr, na_value=na_result, dtype=dtype) - if na_result is None: - na_result = np.nan - return _map_object(f, arr, na_mask=True, na_value=na_result, dtype=dtype) - - -def _map_stringarray( - func: Callable[[str], Any], arr: "StringArray", na_value: Any, dtype: Dtype -) -> ArrayLike: - """ - Map a callable over valid elements of a StringArray. - - Parameters - ---------- - func : Callable[[str], Any] - Apply to each valid element. - arr : StringArray - na_value : Any - The value to use for missing values. By default, this is - the original value (NA). - dtype : Dtype - The result dtype to use. Specifying this avoids an intermediate - object-dtype allocation. - - Returns - ------- - ArrayLike - An ExtensionArray for integer or string dtypes, otherwise - an ndarray. - - """ - from pandas.arrays import BooleanArray, IntegerArray, StringArray - - mask = isna(arr) - - assert isinstance(arr, StringArray) - arr = np.asarray(arr) - - if is_integer_dtype(dtype) or is_bool_dtype(dtype): - constructor: Union[Type[IntegerArray], Type[BooleanArray]] - if is_integer_dtype(dtype): - constructor = IntegerArray - else: - constructor = BooleanArray - - na_value_is_na = isna(na_value) - if na_value_is_na: - na_value = 1 - result = lib.map_infer_mask( - arr, - func, - mask.view("uint8"), - convert=False, - na_value=na_value, - dtype=np.dtype(dtype), - ) - - if not na_value_is_na: - mask[:] = False - - return constructor(result, mask) - - elif is_string_dtype(dtype) and not is_object_dtype(dtype): - # i.e. 
StringDtype - result = lib.map_infer_mask( - arr, func, mask.view("uint8"), convert=False, na_value=na_value - ) - return StringArray(result) - else: - # This is when the result type is object. We reach this when - # -> We know the result type is truly object (e.g. .encode returns bytes - # or .findall returns a list). - # -> We don't know the result type. E.g. `.get` can return anything. - return lib.map_infer_mask(arr, func, mask.view("uint8")) - - -def _map_object(f, arr, na_mask=False, na_value=np.nan, dtype=np.dtype(object)): - if not len(arr): - return np.ndarray(0, dtype=dtype) - - if isinstance(arr, ABCSeries): - arr = arr._values # TODO: extract_array? - if not isinstance(arr, np.ndarray): - arr = np.asarray(arr, dtype=object) - if na_mask: - mask = isna(arr) - convert = not np.all(mask) - try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) - except (TypeError, AttributeError) as e: - # Reraise the exception if callable `f` got wrong number of args. - # The user may want to be warned by this, instead of getting NaN - p_err = ( - r"((takes)|(missing)) (?(2)from \d+ to )?\d+ " - r"(?(3)required )positional arguments?" - ) - - if len(e.args) >= 1 and re.search(p_err, e.args[0]): - # FIXME: this should be totally avoidable - raise e - - def g(x): - try: - return f(x) - except (TypeError, AttributeError): - return na_value - - return _map_object(g, arr, dtype=dtype) - if na_value is not np.nan: - np.putmask(result, mask, na_value) - if result.dtype == object: - result = lib.maybe_convert_objects(result) - return result - else: - return lib.map_infer(arr, f) - - -def str_count(arr, pat, flags=0): - """ - Count occurrences of pattern in each string of the Series/Index. - - This function is used to count the number of times a particular regex - pattern is repeated in each of the string elements of the - :class:`~pandas.Series`. - - Parameters - ---------- - pat : str - Valid regular expression. 
- flags : int, default 0, meaning no flags - Flags for the `re` module. For a complete list, `see here - `_. - **kwargs - For compatibility with other string methods. Not used. - - Returns - ------- - Series or Index - Same type as the calling object containing the integer counts. - - See Also - -------- - re : Standard library module for regular expressions. - str.count : Standard library version, without regular expression support. - - Notes - ----- - Some characters need to be escaped when passing in `pat`. - eg. ``'$'`` has a special meaning in regex and must be escaped when - finding this literal character. - - Examples - -------- - >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) - >>> s.str.count('a') - 0 0.0 - 1 0.0 - 2 2.0 - 3 2.0 - 4 NaN - 5 0.0 - 6 1.0 - dtype: float64 - - Escape ``'$'`` to find the literal dollar sign. - - >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) - >>> s.str.count('\\$') - 0 1 - 1 0 - 2 1 - 3 2 - 4 2 - 5 0 - dtype: int64 - - This is also available on Index - - >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') - Int64Index([0, 0, 2, 1], dtype='int64') - """ - regex = re.compile(pat, flags=flags) - f = lambda x: len(regex.findall(x)) - return _na_map(f, arr, dtype="int64") - - -def str_contains(arr, pat, case=True, flags=0, na=np.nan, regex=True): - """ - Test if pattern or regex is contained within a string of a Series or Index. - - Return boolean Series or Index based on whether a given pattern or regex is - contained within a string of a Series or Index. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Flags to pass through to the re module, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - regex : bool, default True - If True, assumes the pat is a regular expression. - - If False, treats the pat as a literal string. 
- - Returns - ------- - Series or Index of boolean values - A Series or Index of boolean values indicating whether the - given pattern is contained within the string of each element - of the Series or Index. - - See Also - -------- - match : Analogous, but stricter, relying on re.match instead of re.search. - Series.str.startswith : Test if the start of each string element matches a - pattern. - Series.str.endswith : Same as startswith, but tests the end of string. - - Examples - -------- - Returning a Series of booleans using only a literal pattern. - - >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) - >>> s1.str.contains('og', regex=False) - 0 False - 1 True - 2 False - 3 False - 4 NaN - dtype: object - - Returning an Index of booleans using only a literal pattern. - - >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) - >>> ind.str.contains('23', regex=False) - Index([False, False, False, True, nan], dtype='object') - - Specifying case sensitivity using `case`. - - >>> s1.str.contains('oG', case=True, regex=True) - 0 False - 1 False - 2 False - 3 False - 4 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN` replaces NaN values - with `False`. If Series or Index does not contain NaN values - the resultant dtype will be `bool`, otherwise, an `object` dtype. - - >>> s1.str.contains('og', na=False, regex=True) - 0 False - 1 True - 2 False - 3 False - 4 False - dtype: bool - - Returning 'house' or 'dog' when either expression occurs in a string. - - >>> s1.str.contains('house|dog', regex=True) - 0 False - 1 True - 2 True - 3 False - 4 NaN - dtype: object - - Ignoring case sensitivity using `flags` with regex. - - >>> import re - >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) - 0 False - 1 False - 2 True - 3 False - 4 NaN - dtype: object - - Returning any digit using regular expression. 
- - >>> s1.str.contains('\\d', regex=True) - 0 False - 1 False - 2 False - 3 True - 4 NaN - dtype: object - - Ensure `pat` is a not a literal pattern when `regex` is set to True. - Note in the following example one might expect only `s2[1]` and `s2[3]` to - return `True`. However, '.0' as a regex matches any character - followed by a 0. - - >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) - >>> s2.str.contains('.0', regex=True) - 0 True - 1 True - 2 False - 3 True - 4 False - dtype: bool - """ - if regex: - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - if regex.groups > 0: - warnings.warn( - "This pattern has match groups. To actually get the " - "groups, use str.extract.", - UserWarning, - stacklevel=3, - ) - - f = lambda x: regex.search(x) is not None - else: - if case: - f = lambda x: pat in x - else: - upper_pat = pat.upper() - f = lambda x: upper_pat in x - uppered = _na_map(lambda x: x.upper(), arr) - return _na_map(f, uppered, na, dtype=np.dtype(bool)) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_startswith(arr, pat, na=np.nan): - """ - Test if the start of each string element matches a pattern. - - Equivalent to :meth:`str.startswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the start of each string element. - - See Also - -------- - str.startswith : Python standard library string method. - Series.str.endswith : Same as startswith, but tests the end of string. - Series.str.contains : Tests if string element contains a pattern. 
- - Examples - -------- - >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) - >>> s - 0 bat - 1 Bear - 2 cat - 3 NaN - dtype: object - - >>> s.str.startswith('b') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. - - >>> s.str.startswith('b', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - f = lambda x: x.startswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_endswith(arr, pat, na=np.nan): - """ - Test if the end of each string element matches a pattern. - - Equivalent to :meth:`str.endswith`. - - Parameters - ---------- - pat : str - Character sequence. Regular expressions are not accepted. - na : object, default NaN - Object shown if element tested is not a string. - - Returns - ------- - Series or Index of bool - A Series of booleans indicating whether the given pattern matches - the end of each string element. - - See Also - -------- - str.endswith : Python standard library string method. - Series.str.startswith : Same as endswith, but tests the start of string. - Series.str.contains : Tests if string element contains a pattern. - - Examples - -------- - >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) - >>> s - 0 bat - 1 bear - 2 caT - 3 NaN - dtype: object - - >>> s.str.endswith('t') - 0 True - 1 False - 2 False - 3 NaN - dtype: object - - Specifying `na` to be `False` instead of `NaN`. - - >>> s.str.endswith('t', na=False) - 0 True - 1 False - 2 False - 3 False - dtype: bool - """ - f = lambda x: x.endswith(pat) - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True): - r""" - Replace each occurrence of pattern/regex in the Series/Index. - - Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on the regex value. - - Parameters - ---------- - pat : str or compiled regex - String can be a character sequence or regular expression. 
- repl : str or callable - Replacement string or a callable. The callable is passed the regex - match object and must return a replacement string to be used. - See :func:`re.sub`. - n : int, default -1 (all) - Number of replacements to make from start. - case : bool, default None - Determines if replace is case sensitive: - - - If True, case sensitive (the default if `pat` is a string) - - Set to False for case insensitive - - Cannot be set if `pat` is a compiled regex. - - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled - regex. - regex : bool, default True - Determines if assumes the passed-in pattern is a regular expression: - - - If True, assumes the passed-in pattern is a regular expression. - - If False, treats the pattern as a literal string - - Cannot be set to False if `pat` is a compiled regex or `repl` is - a callable. - - Returns - ------- - Series or Index of object - A copy of the object with all matching occurrences of `pat` replaced by - `repl`. - - Raises - ------ - ValueError - * if `regex` is False and `repl` is a callable or `pat` is a compiled - regex - * if `pat` is a compiled regex and `case` or `flags` is set - - Notes - ----- - When `pat` is a compiled regex, all flags should be included in the - compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled - regex will raise an error. - - Examples - -------- - When `pat` is a string and `regex` is True (the default), the given `pat` - is compiled as a regex. When `repl` is a string, it replaces matching - regex patterns as with :meth:`re.sub`. 
NaN value(s) in the Series are - left as is: - - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) - 0 bao - 1 baz - 2 NaN - dtype: object - - When `pat` is a string and `regex` is False, every `pat` is replaced with - `repl` as with :meth:`str.replace`: - - >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) - 0 bao - 1 fuz - 2 NaN - dtype: object - - When `repl` is a callable, it is called on every `pat` using - :func:`re.sub`. The callable should expect one positional argument - (a regex object) and return a string. - - To get the idea: - - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr) - 0 oo - 1 uz - 2 NaN - dtype: object - - Reverse every lowercase alphabetic word: - - >>> repl = lambda m: m.group(0)[::-1] - >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) - 0 oof 123 - 1 rab zab - 2 NaN - dtype: object - - Using regex groups (extract second group and swap case): - - >>> pat = r"(?P\w+) (?P\w+) (?P\w+)" - >>> repl = lambda m: m.group('two').swapcase() - >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl) - 0 tWO - 1 bAR - dtype: object - - Using a compiled regex with flags - - >>> import re - >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) - >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') - 0 foo - 1 bar - 2 NaN - dtype: object - """ - # Check whether repl is valid (GH 13438, GH 15055) - if not (isinstance(repl, str) or callable(repl)): - raise TypeError("repl must be a string or callable") - - is_compiled_re = is_re(pat) - if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - if is_compiled_re or len(pat) > 1 or flags or callable(repl): - n = n if n >= 0 else 0 
- compiled = re.compile(pat, flags=flags) - f = lambda x: compiled.sub(repl=repl, string=x, count=n) - else: - f = lambda x: x.replace(pat, repl, n) - else: - if is_compiled_re: - raise ValueError( - "Cannot use a compiled regex as replacement pattern with regex=False" - ) - if callable(repl): - raise ValueError("Cannot use a callable replacement when regex=False") - f = lambda x: x.replace(pat, repl, n) - - return _na_map(f, arr, dtype=str) - - -def str_repeat(arr, repeats): - """ - Duplicate each string in the Series or Index. - - Parameters - ---------- - repeats : int or sequence of int - Same value for all (int) or different value per (sequence). - - Returns - ------- - Series or Index of object - Series or Index of repeated string objects specified by - input parameter repeats. - - Examples - -------- - >>> s = pd.Series(['a', 'b', 'c']) - >>> s - 0 a - 1 b - 2 c - dtype: object - - Single int repeats string in Series - - >>> s.str.repeat(repeats=2) - 0 aa - 1 bb - 2 cc - dtype: object - - Sequence of int repeats corresponding string in Series - - >>> s.str.repeat(repeats=[1, 2, 3]) - 0 a - 1 bb - 2 ccc - dtype: object - """ - if is_scalar(repeats): - - def scalar_rep(x): - try: - return bytes.__mul__(x, repeats) - except TypeError: - return str.__mul__(x, repeats) - - return _na_map(scalar_rep, arr, dtype=str) - else: - - def rep(x, r): - if x is libmissing.NA: - return x - try: - return bytes.__mul__(x, r) - except TypeError: - return str.__mul__(x, r) - - repeats = np.asarray(repeats, dtype=object) - result = libops.vec_binop(np.asarray(arr), repeats, rep) - return result - - -def str_match( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - """ - Determine if each string starts with a match of a regular expression. - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. 
- flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - - Returns - ------- - Series/array of boolean values - - See Also - -------- - fullmatch : Stricter matching that requires the entire string to match. - contains : Analogous, but less strict, relying on re.search instead of - re.match. - extract : Extract matched groups. - """ - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.match(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def str_fullmatch( - arr: ArrayLike, - pat: Union[str, Pattern], - case: bool = True, - flags: int = 0, - na: Scalar = np.nan, -): - """ - Determine if each string entirely matches a regular expression. - - .. versionadded:: 1.1.0 - - Parameters - ---------- - pat : str - Character sequence or regular expression. - case : bool, default True - If True, case sensitive. - flags : int, default 0 (no flags) - Regex module flags, e.g. re.IGNORECASE. - na : default NaN - Fill value for missing values. - - Returns - ------- - Series/array of boolean values - - See Also - -------- - match : Similar, but also returns `True` when only a *prefix* of the string - matches the regular expression. - extract : Extract matched groups. 
- """ - if not case: - flags |= re.IGNORECASE - - regex = re.compile(pat, flags=flags) - - f = lambda x: regex.fullmatch(x) is not None - - return _na_map(f, arr, na, dtype=np.dtype(bool)) - - -def _get_single_group_name(rx): - try: - return list(rx.groupindex.keys()).pop() - except IndexError: - return None - - -def _groups_or_na_fun(regex): - """Used in both extract_noexpand and extract_frame""" - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row - - return f - - -def _result_dtype(arr): - # workaround #27953 - # ideally we just pass `dtype=arr.dtype` unconditionally, but this fails - # when the list of values is empty. - if arr.dtype.name == "string": - return "string" - else: - return object - - -def _str_extract_noexpand(arr, pat, flags=0): - """ - Find groups in each string in the Series using passed regular - expression. This function is called from - str_extract(expand=False), and can return Series, DataFrame, or - Index. 
- - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - - if regex.groups == 1: - result = np.array([groups_or_na(val)[0] for val in arr], dtype=object) - name = _get_single_group_name(regex) - else: - if isinstance(arr, ABCIndexClass): - raise ValueError("only one regex group is supported with Index") - name = None - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - if arr.empty: - result = DataFrame(columns=columns, dtype=object) - else: - dtype = _result_dtype(arr) - result = DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=arr.index, - dtype=dtype, - ) - return result, name - - -def _str_extract_frame(arr, pat, flags=0): - """ - For each subject string in the Series, extract groups from the - first match of regular expression pat. This function is called from - str_extract(expand=True), and always returns a DataFrame. - - """ - from pandas import DataFrame - - regex = re.compile(pat, flags=flags) - groups_or_na = _groups_or_na_fun(regex) - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - - if len(arr) == 0: - return DataFrame(columns=columns, dtype=object) - try: - result_index = arr.index - except AttributeError: - result_index = None - dtype = _result_dtype(arr) - return DataFrame( - [groups_or_na(val) for val in arr], - columns=columns, - index=result_index, - dtype=dtype, - ) - - -def str_extract(arr, pat, flags=0, expand=True): - r""" - Extract capture groups in the regex `pat` as columns in a DataFrame. - - For each subject string in the Series, extract groups from the - first match of regular expression `pat`. - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - Flags from the ``re`` module, e.g. 
``re.IGNORECASE``, that - modify regular expression matching for things like case, - spaces, etc. For more details, see :mod:`re`. - expand : bool, default True - If True, return DataFrame with one column per capture group. - If False, return a Series/Index if there is one capture group - or DataFrame if there are multiple capture groups. - - Returns - ------- - DataFrame or Series or Index - A DataFrame with one row for each subject string, and one - column for each group. Any capture group names in regular - expression pat will be used for column names; otherwise - capture group numbers will be used. The dtype of each result - column is always object, even when no match is found. If - ``expand=False`` and pat has only one capture group, then - return a Series (if subject is a Series) or Index (if subject - is an Index). - - See Also - -------- - extractall : Returns all matches (not just the first match). - - Examples - -------- - A pattern with two groups will return a DataFrame with two columns. - Non-matches will be NaN. - - >>> s = pd.Series(['a1', 'b2', 'c3']) - >>> s.str.extract(r'([ab])(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 NaN NaN - - A pattern may contain optional groups. - - >>> s.str.extract(r'([ab])?(\d)') - 0 1 - 0 a 1 - 1 b 2 - 2 NaN 3 - - Named groups will become column names in the result. - - >>> s.str.extract(r'(?P[ab])(?P\d)') - letter digit - 0 a 1 - 1 b 2 - 2 NaN NaN - - A pattern with one group will return a DataFrame with one column - if expand=True. - - >>> s.str.extract(r'[ab](\d)', expand=True) - 0 - 0 1 - 1 2 - 2 NaN - - A pattern with one group will return a Series if expand=False. 
- - >>> s.str.extract(r'[ab](\d)', expand=False) - 0 1 - 1 2 - 2 NaN - dtype: object - """ - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - if expand: - return _str_extract_frame(arr._orig, pat, flags=flags) - else: - result, name = _str_extract_noexpand(arr._parent, pat, flags=flags) - return arr._wrap_result(result, name=name, expand=expand) - - -def str_extractall(arr, pat, flags=0): - r""" - Extract capture groups in the regex `pat` as columns in DataFrame. - - For each subject string in the Series, extract groups from all - matches of regular expression pat. When each subject string in the - Series has exactly one match, extractall(pat).xs(0, level='match') - is the same as extract(pat). - - Parameters - ---------- - pat : str - Regular expression pattern with capturing groups. - flags : int, default 0 (no flags) - A ``re`` module flag, for example ``re.IGNORECASE``. These allow - to modify regular expression matching for things like case, spaces, - etc. Multiple flags can be combined with the bitwise OR operator, - for example ``re.IGNORECASE | re.MULTILINE``. - - Returns - ------- - DataFrame - A ``DataFrame`` with one row for each match, and one column for each - group. Its rows have a ``MultiIndex`` with first levels that come from - the subject ``Series``. The last level is named 'match' and indexes the - matches in each item of the ``Series``. Any capture group names in - regular expression pat will be used for column names; otherwise capture - group numbers will be used. - - See Also - -------- - extract : Returns first match only (not all matches). - - Examples - -------- - A pattern with one group will return a DataFrame with one column. - Indices with no matches will not appear in the result. - - >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) - >>> s.str.extractall(r"[ab](\d)") - 0 - match - A 0 1 - 1 2 - B 0 1 - - Capture group names are used for column names of the result. 
- - >>> s.str.extractall(r"[ab](?P\d)") - digit - match - A 0 1 - 1 2 - B 0 1 - - A pattern with two groups will return a DataFrame with two columns. - - >>> s.str.extractall(r"(?P[ab])(?P\d)") - letter digit - match - A 0 a 1 - 1 a 2 - B 0 b 1 - - Optional groups that do not match are NaN in the result. - - >>> s.str.extractall(r"(?P[ab])?(?P\d)") - letter digit - match - A 0 a 1 - 1 a 2 - B 0 b 1 - C 0 NaN 1 - """ - regex = re.compile(pat, flags=flags) - # the regex must contain capture groups. - if regex.groups == 0: - raise ValueError("pattern contains no capture groups") - - if isinstance(arr, ABCIndexClass): - arr = arr.to_series().reset_index(drop=True) - - names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) - columns = [names.get(1 + i, i) for i in range(regex.groups)] - match_list = [] - index_list = [] - is_mi = arr.index.nlevels > 1 - - for subject_key, subject in arr.items(): - if isinstance(subject, str): - - if not is_mi: - subject_key = (subject_key,) - - for match_i, match_tuple in enumerate(regex.findall(subject)): - if isinstance(match_tuple, str): - match_tuple = (match_tuple,) - na_tuple = [np.NaN if group == "" else group for group in match_tuple] - match_list.append(na_tuple) - result_key = tuple(subject_key + (match_i,)) - index_list.append(result_key) - - from pandas import MultiIndex - - index = MultiIndex.from_tuples(index_list, names=arr.index.names + ["match"]) - dtype = _result_dtype(arr) - - result = arr._constructor_expanddim( - match_list, index=index, columns=columns, dtype=dtype - ) - return result - - -def str_get_dummies(arr, sep="|"): - """ - Return DataFrame of dummy/indicator variables for Series. - - Each string in Series is split by sep and returned as a DataFrame - of dummy/indicator variables. - - Parameters - ---------- - sep : str, default "|" - String to split on. - - Returns - ------- - DataFrame - Dummy variables corresponding to values of the Series. 
- - See Also - -------- - get_dummies : Convert categorical variable into dummy/indicator - variables. - - Examples - -------- - >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 1 0 0 - 2 1 0 1 - - >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() - a b c - 0 1 1 0 - 1 0 0 0 - 2 1 0 1 - """ - arr = arr.fillna("") - try: - arr = sep + arr + sep - except TypeError: - arr = sep + arr.astype(str) + sep - - tags = set() - for ts in arr.str.split(sep): - tags.update(ts) - tags = sorted(tags - {""}) - - dummies = np.empty((len(arr), len(tags)), dtype=np.int64) - - for i, t in enumerate(tags): - pat = sep + t + sep - dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) - return dummies, tags - - -def str_join(arr, sep): - """ - Join lists contained as elements in the Series/Index with passed delimiter. - - If the elements of a Series are lists themselves, join the content of these - lists using the delimiter passed to the function. - This function is an equivalent to :meth:`str.join`. - - Parameters - ---------- - sep : str - Delimiter to use between list entries. - - Returns - ------- - Series/Index: object - The list entries concatenated by intervening occurrences of the - delimiter. - - Raises - ------ - AttributeError - If the supplied Series contains neither strings nor lists. - - See Also - -------- - str.join : Standard library version of this method. - Series.str.split : Split strings around given separator/delimiter. - - Notes - ----- - If any of the list items is not a string object, the result of the join - will be `NaN`. - - Examples - -------- - Example with a list that contains non-string elements. - - >>> s = pd.Series([['lion', 'elephant', 'zebra'], - ... [1.1, 2.2, 3.3], - ... ['cat', np.nan, 'dog'], - ... ['cow', 4.5, 'goat'], - ... 
['duck', ['swan', 'fish'], 'guppy']]) - >>> s - 0 [lion, elephant, zebra] - 1 [1.1, 2.2, 3.3] - 2 [cat, nan, dog] - 3 [cow, 4.5, goat] - 4 [duck, [swan, fish], guppy] - dtype: object - - Join all lists using a '-'. The lists containing object(s) of types other - than str will produce a NaN. - - >>> s.str.join('-') - 0 lion-elephant-zebra - 1 NaN - 2 NaN - 3 NaN - 4 NaN - dtype: object - """ - return _na_map(sep.join, arr, dtype=str) - - -def str_findall(arr, pat, flags=0): - """ - Find all occurrences of pattern or regular expression in the Series/Index. - - Equivalent to applying :func:`re.findall` to all the elements in the - Series/Index. - - Parameters - ---------- - pat : str - Pattern or regular expression. - flags : int, default 0 - Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which - means no flags). - - Returns - ------- - Series/Index of lists of strings - All non-overlapping matches of pattern or regular expression in each - string of this Series/Index. - - See Also - -------- - count : Count occurrences of pattern or regular expression in each string - of the Series/Index. - extractall : For each string in the Series, extract groups from all matches - of regular expression and return a DataFrame with one row for each - match and one column for each group. - re.findall : The equivalent ``re`` function to all non-overlapping matches - of pattern or regular expression in string, as a list of strings. - - Examples - -------- - >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) - - The search for the pattern 'Monkey' returns one match: - - >>> s.str.findall('Monkey') - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - On the other hand, the search for the pattern 'MONKEY' doesn't return any - match: - - >>> s.str.findall('MONKEY') - 0 [] - 1 [] - 2 [] - dtype: object - - Flags can be added to the pattern or regular expression. 
For instance, - to find the pattern 'MONKEY' ignoring the case: - - >>> import re - >>> s.str.findall('MONKEY', flags=re.IGNORECASE) - 0 [] - 1 [Monkey] - 2 [] - dtype: object - - When the pattern matches more than one string in the Series, all matches - are returned: - - >>> s.str.findall('on') - 0 [on] - 1 [on] - 2 [] - dtype: object - - Regular expressions are supported too. For instance, the search for all the - strings ending with the word 'on' is shown next: - - >>> s.str.findall('on$') - 0 [on] - 1 [] - 2 [] - dtype: object - - If the pattern is found more than once in the same string, then a list of - multiple strings is returned: - - >>> s.str.findall('b') - 0 [] - 1 [] - 2 [b, b] - dtype: object - """ - regex = re.compile(pat, flags=flags) - return _na_map(regex.findall, arr) - - -def str_find(arr, sub, start=0, end=None, side="left"): - """ - Return indexes in each strings in the Series/Index where the - substring is fully contained between [start:end]. Return -1 on failure. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - side : {'left', 'right'}, default 'left' - Specifies a starting side, equivalent to ``find`` or ``rfind``. - - Returns - ------- - Series or Index - Indexes where substring is found. 
- """ - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "find" - elif side == "right": - method = "rfind" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_index(arr, sub, start=0, end=None, side="left"): - if not isinstance(sub, str): - msg = f"expected a string object, not {type(sub).__name__}" - raise TypeError(msg) - - if side == "left": - method = "index" - elif side == "right": - method = "rindex" - else: # pragma: no cover - raise ValueError("Invalid side") - - if end is None: - f = lambda x: getattr(x, method)(sub, start) - else: - f = lambda x: getattr(x, method)(sub, start, end) - - return _na_map(f, arr, dtype=np.dtype("int64")) - - -def str_pad(arr, width, side="left", fillchar=" "): - """ - Pad strings in the Series/Index up to width. - - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with character defined in `fillchar`. - side : {'left', 'right', 'both'}, default 'left' - Side from which to fill resulting string. - fillchar : str, default ' ' - Additional character for filling, default is whitespace. - - Returns - ------- - Series or Index of object - Returns Series or Index with minimum number of char in object. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='left')``. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='right')``. - Series.str.center : Fills both sides of strings with an arbitrary - character. Equivalent to ``Series.str.pad(side='both')``. 
- Series.str.zfill : Pad strings in the Series/Index by prepending '0' - character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. - - Examples - -------- - >>> s = pd.Series(["caribou", "tiger"]) - >>> s - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10) - 0 caribou - 1 tiger - dtype: object - - >>> s.str.pad(width=10, side='right', fillchar='-') - 0 caribou--- - 1 tiger----- - dtype: object - - >>> s.str.pad(width=10, side='both', fillchar='-') - 0 -caribou-- - 1 --tiger--- - dtype: object - """ - if not isinstance(fillchar, str): - msg = f"fillchar must be a character, not {type(fillchar).__name__}" - raise TypeError(msg) - - if len(fillchar) != 1: - raise TypeError("fillchar must be a character, not str") - - if not is_integer(width): - msg = f"width must be of integer type, not {type(width).__name__}" - raise TypeError(msg) - - if side == "left": - f = lambda x: x.rjust(width, fillchar) - elif side == "right": - f = lambda x: x.ljust(width, fillchar) - elif side == "both": - f = lambda x: x.center(width, fillchar) - else: # pragma: no cover - raise ValueError("Invalid side") - - return _na_map(f, arr, dtype=str) - - -def str_split(arr, pat=None, n=None): - - if pat is None: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if len(pat) == 1: - if n is None or n == 0: - n = -1 - f = lambda x: x.split(pat, n) - else: - if n is None or n == -1: - n = 0 - regex = re.compile(pat) - f = lambda x: regex.split(x, maxsplit=n) - res = _na_map(f, arr) - return res - - -def str_rsplit(arr, pat=None, n=None): - - if n is None or n == 0: - n = -1 - f = lambda x: x.rsplit(pat, n) - res = _na_map(f, arr) - return res - - -def str_slice(arr, start=None, stop=None, step=None): - """ - Slice substrings from each element in the Series or Index. - - Parameters - ---------- - start : int, optional - Start position for slice operation. - stop : int, optional - Stop position for slice operation. 
- step : int, optional - Step size for slice operation. - - Returns - ------- - Series or Index of object - Series or Index from sliced substring from original string object. - - See Also - -------- - Series.str.slice_replace : Replace a slice with a string. - Series.str.get : Return element at position. - Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` - being the position. - - Examples - -------- - >>> s = pd.Series(["koala", "fox", "chameleon"]) - >>> s - 0 koala - 1 fox - 2 chameleon - dtype: object - - >>> s.str.slice(start=1) - 0 oala - 1 ox - 2 hameleon - dtype: object - - >>> s.str.slice(start=-1) - 0 a - 1 x - 2 n - dtype: object - - >>> s.str.slice(stop=2) - 0 ko - 1 fo - 2 ch - dtype: object - - >>> s.str.slice(step=2) - 0 kaa - 1 fx - 2 caeen - dtype: object - - >>> s.str.slice(start=0, stop=5, step=3) - 0 kl - 1 f - 2 cm - dtype: object - - Equivalent behaviour to: - - >>> s.str[0:5:3] - 0 kl - 1 f - 2 cm - dtype: object - """ - obj = slice(start, stop, step) - f = lambda x: x[obj] - return _na_map(f, arr, dtype=str) - - -def str_slice_replace(arr, start=None, stop=None, repl=None): - """ - Replace a positional slice of a string with another value. - - Parameters - ---------- - start : int, optional - Left index position to use for the slice. If not specified (None), - the slice is unbounded on the left, i.e. slice from the start - of the string. - stop : int, optional - Right index position to use for the slice. If not specified (None), - the slice is unbounded on the right, i.e. slice until the - end of the string. - repl : str, optional - String for replacement. If not specified (None), the sliced region - is replaced with an empty string. - - Returns - ------- - Series or Index - Same type as the original object. - - See Also - -------- - Series.str.slice : Just slicing without replacement. 
- - Examples - -------- - >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) - >>> s - 0 a - 1 ab - 2 abc - 3 abdc - 4 abcde - dtype: object - - Specify just `start`, meaning replace `start` until the end of the - string with `repl`. - - >>> s.str.slice_replace(1, repl='X') - 0 aX - 1 aX - 2 aX - 3 aX - 4 aX - dtype: object - - Specify just `stop`, meaning the start of the string to `stop` is replaced - with `repl`, and the rest of the string is included. - - >>> s.str.slice_replace(stop=2, repl='X') - 0 X - 1 X - 2 Xc - 3 Xdc - 4 Xcde - dtype: object - - Specify `start` and `stop`, meaning the slice from `start` to `stop` is - replaced with `repl`. Everything before or after `start` and `stop` is - included as is. - - >>> s.str.slice_replace(start=1, stop=3, repl='X') - 0 aX - 1 aX - 2 aX - 3 aXc - 4 aXde - dtype: object - """ - if repl is None: - repl = "" - - def f(x): - if x[start:stop] == "": - local_stop = start - else: - local_stop = stop - y = "" - if start is not None: - y += x[:start] - y += repl - if stop is not None: - y += x[local_stop:] - return y - - return _na_map(f, arr, dtype=str) - - -def str_strip(arr, to_strip=None, side="both"): - """ - Strip whitespace (including newlines) from each string in the - Series/Index. - - Parameters - ---------- - to_strip : str or unicode - side : {'left', 'right', 'both'}, default 'both' - - Returns - ------- - Series or Index - """ - if side == "both": - f = lambda x: x.strip(to_strip) - elif side == "left": - f = lambda x: x.lstrip(to_strip) - elif side == "right": - f = lambda x: x.rstrip(to_strip) - else: # pragma: no cover - raise ValueError("Invalid side") - return _na_map(f, arr, dtype=str) - - -def str_wrap(arr, width, **kwargs): - r""" - Wrap strings in Series/Index at specified line width. - - This method has the same keyword parameters and defaults as - :class:`textwrap.TextWrapper`. - - Parameters - ---------- - width : int - Maximum line width. 
- expand_tabs : bool, optional - If True, tab characters will be expanded to spaces (default: True). - replace_whitespace : bool, optional - If True, each whitespace character (as defined by string.whitespace) - remaining after tab expansion will be replaced by a single space - (default: True). - drop_whitespace : bool, optional - If True, whitespace that, after wrapping, happens to end up at the - beginning or end of a line is dropped (default: True). - break_long_words : bool, optional - If True, then words longer than width will be broken in order to ensure - that no lines are longer than width. If it is false, long words will - not be broken, and some lines may be longer than width (default: True). - break_on_hyphens : bool, optional - If True, wrapping will occur preferably on whitespace and right after - hyphens in compound words, as it is customary in English. If false, - only whitespaces will be considered as potentially good places for line - breaks, but you need to set break_long_words to false if you want truly - insecable words (default: True). - - Returns - ------- - Series or Index - - Notes - ----- - Internally, this method uses a :class:`textwrap.TextWrapper` instance with - default settings. To achieve behavior matching R's stringr library str_wrap - function, use the arguments: - - - expand_tabs = False - - replace_whitespace = True - - drop_whitespace = True - - break_long_words = False - - break_on_hyphens = False - - Examples - -------- - >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) - >>> s.str.wrap(12) - 0 line to be\nwrapped - 1 another line\nto be\nwrapped - dtype: object - """ - kwargs["width"] = width - - tw = textwrap.TextWrapper(**kwargs) - - return _na_map(lambda s: "\n".join(tw.wrap(s)), arr, dtype=str) - - -def str_translate(arr, table): - """ - Map all characters in the string through the given mapping table. - - Equivalent to standard :meth:`str.translate`. 
- - Parameters - ---------- - table : dict - Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or - None. Unmapped characters are left untouched. - Characters mapped to None are deleted. :meth:`str.maketrans` is a - helper function for making translation tables. - - Returns - ------- - Series or Index - """ - return _na_map(lambda x: x.translate(table), arr, dtype=str) - - -def str_get(arr, i): - """ - Extract element from each component at specified position. - - Extract element from lists, tuples, or strings in each element in the - Series/Index. - - Parameters - ---------- - i : int - Position of element to extract. - - Returns - ------- - Series or Index - - Examples - -------- - >>> s = pd.Series(["String", - ... (1, 2, 3), - ... ["a", "b", "c"], - ... 123, - ... -456, - ... {1: "Hello", "2": "World"}]) - >>> s - 0 String - 1 (1, 2, 3) - 2 [a, b, c] - 3 123 - 4 -456 - 5 {1: 'Hello', '2': 'World'} - dtype: object - - >>> s.str.get(1) - 0 t - 1 2 - 2 b - 3 NaN - 4 NaN - 5 Hello - dtype: object - - >>> s.str.get(-1) - 0 g - 1 3 - 2 c - 3 NaN - 4 NaN - 5 None - dtype: object - """ - - def f(x): - if isinstance(x, dict): - return x.get(i) - elif len(x) > i >= -len(x): - return x[i] - return np.nan - - return _na_map(f, arr) - - -def str_decode(arr, encoding, errors="strict"): - """ - Decode character string in the Series/Index using indicated encoding. - - Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in - python3. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - Series or Index - """ - if encoding in _cpython_optimized_decoders: - # CPython optimized implementation - f = lambda x: x.decode(encoding, errors) - else: - decoder = codecs.getdecoder(encoding) - f = lambda x: decoder(x, errors)[0] - return _na_map(f, arr) - - -def str_encode(arr, encoding, errors="strict"): - """ - Encode character string in the Series/Index using indicated encoding. 
- - Equivalent to :meth:`str.encode`. - - Parameters - ---------- - encoding : str - errors : str, optional - - Returns - ------- - encoded : Series/Index of objects - """ - if encoding in _cpython_optimized_encoders: - # CPython optimized implementation - f = lambda x: x.encode(encoding, errors) - else: - encoder = codecs.getencoder(encoding) - f = lambda x: encoder(x, errors)[0] - return _na_map(f, arr) - - -def forbid_nonstring_types(forbidden, name=None): - """ - Decorator to forbid specific types for a method of StringMethods. - - For calling `.str.{method}` on a Series or Index, it is necessary to first - initialize the :class:`StringMethods` object, and then call the method. - However, different methods allow different input types, and so this can not - be checked during :meth:`StringMethods.__init__`, but must be done on a - per-method basis. This decorator exists to facilitate this process, and - make it explicit which (inferred) types are disallowed by the method. - - :meth:`StringMethods.__init__` allows the *union* of types its different - methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), - namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. - - The default string types ['string', 'empty'] are allowed for all methods. - For the additional types ['bytes', 'mixed', 'mixed-integer'], each method - then needs to forbid the types it is not intended for. - - Parameters - ---------- - forbidden : list-of-str or None - List of forbidden non-string types, may be one or more of - `['bytes', 'mixed', 'mixed-integer']`. - name : str, default None - Name of the method to use in the error message. By default, this is - None, in which case the name from the method being wrapped will be - copied. However, for working with further wrappers (like _pat_wrapper - and _noarg_wrapper), it is necessary to specify the name. 
- - Returns - ------- - func : wrapper - The method to which the decorator is applied, with an added check that - enforces the inferred type to not be in the list of forbidden types. - - Raises - ------ - TypeError - If the inferred type of the underlying data is in `forbidden`. - """ - # deal with None - forbidden = [] if forbidden is None else forbidden - - allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( - forbidden - ) - - def _forbid_nonstring_types(func): - func_name = func.__name__ if name is None else name - - @wraps(func) - def wrapper(self, *args, **kwargs): - if self._inferred_dtype not in allowed_types: - msg = ( - f"Cannot use .str.{func_name} with values of " - f"inferred dtype '{self._inferred_dtype}'." - ) - raise TypeError(msg) - return func(self, *args, **kwargs) - - wrapper.__name__ = func_name - return wrapper - - return _forbid_nonstring_types - - -def _noarg_wrapper( - f, - name=None, - docstring=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper(self): - result = _na_map(f, self._parent, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - wrapper.__name__ = f.__name__ if name is None else name - if docstring is not None: - wrapper.__doc__ = docstring - else: - raise ValueError("Provide docstring") - - return wrapper - - -def _pat_wrapper( - f, - flags=False, - na=False, - name=None, - forbidden_types=["bytes"], - returns_string=True, - **kwargs, -): - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper1(self, pat): - result = f(self._parent, pat) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, name=name) - def wrapper2(self, pat, flags=0, **kwargs): - result = f(self._parent, pat, flags=flags, **kwargs) - return self._wrap_result(result, returns_string=returns_string) - - @forbid_nonstring_types(forbidden_types, 
name=name) - def wrapper3(self, pat, na=np.nan): - result = f(self._parent, pat, na=na) - return self._wrap_result(result, returns_string=returns_string, fill_value=na) - - wrapper = wrapper3 if na else wrapper2 if flags else wrapper1 - - wrapper.__name__ = f.__name__ if name is None else name - if f.__doc__: - wrapper.__doc__ = f.__doc__ - - return wrapper - - -def copy(source): - """Copy a docstring from another source function (if present)""" - - def do_copy(target): - if source.__doc__: - target.__doc__ = source.__doc__ - return target - - return do_copy - - -class StringMethods(NoNewAttributesMixin): - """ - Vectorized string functions for Series and Index. - - NAs stay NA unless handled otherwise by a particular method. - Patterned after Python's string methods, with some inspiration from - R's stringr package. - - Examples - -------- - >>> s = pd.Series(["A_Str_Series"]) - >>> s - 0 A_Str_Series - dtype: object - - >>> s.str.split("_") - 0 [A, Str, Series] - dtype: object - - >>> s.str.replace("_", "") - 0 AStrSeries - dtype: object - """ - - def __init__(self, data): - self._inferred_dtype = self._validate(data) - self._is_categorical = is_categorical_dtype(data.dtype) - self._is_string = data.dtype.name == "string" - - # ._values.categories works for both Series/Index - self._parent = data._values.categories if self._is_categorical else data - # save orig to blow up categoricals to the right type - self._orig = data - self._freeze() - - @staticmethod - def _validate(data): - """ - Auxiliary function for StringMethods, infers and checks dtype of data. - - This is a "first line of defence" at the creation of the StringMethods- - object (see _make_accessor), and just checks that the dtype is in the - *union* of the allowed types over all string methods below; this - restriction is then refined on a per-method basis using the decorator - @forbid_nonstring_types (more info in the corresponding docstring). 
- - This really should exclude all series/index with any non-string values, - but that isn't practical for performance reasons until we have a str - dtype (GH 9343 / 13877) - - Parameters - ---------- - data : The content of the Series - - Returns - ------- - dtype : inferred dtype of data - """ - from pandas import StringDtype - - if isinstance(data, ABCMultiIndex): - raise AttributeError( - "Can only use .str accessor with Index, not MultiIndex" - ) - - # see _libs/lib.pyx for list of inferred types - allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] - - values = getattr(data, "values", data) # Series / Index - values = getattr(values, "categories", values) # categorical / normal - - # explicitly allow StringDtype - if isinstance(values.dtype, StringDtype): - return "string" - - try: - inferred_dtype = lib.infer_dtype(values, skipna=True) - except ValueError: - # GH#27571 mostly occurs with ExtensionArray - inferred_dtype = None - - if inferred_dtype not in allowed_types: - raise AttributeError("Can only use .str accessor with string values!") - return inferred_dtype - - def __getitem__(self, key): - if isinstance(key, slice): - return self.slice(start=key.start, stop=key.stop, step=key.step) - else: - return self.get(key) - - def __iter__(self): - warnings.warn( - "Columnar iteration over characters will be deprecated in future releases.", - FutureWarning, - stacklevel=2, - ) - i = 0 - g = self.get(i) - while g.notna().any(): - yield g - i += 1 - g = self.get(i) - - def _wrap_result( - self, - result, - use_codes=True, - name=None, - expand=None, - fill_value=np.nan, - returns_string=True, - ): - - from pandas import Index, MultiIndex, Series - - # for category, we do the stuff on the categories, so blow it up - # to the full series again - # But for some operations, we have to do the stuff on the full values, - # so make it possible to skip this step as the method already did this - # before the transformation... 
- if use_codes and self._is_categorical: - # if self._orig is a CategoricalIndex, there is no .cat-accessor - result = take_1d( - result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value - ) - - if not hasattr(result, "ndim") or not hasattr(result, "dtype"): - return result - assert result.ndim < 3 - - # We can be wrapping a string / object / categorical result, in which - # case we'll want to return the same dtype as the input. - # Or we can be wrapping a numeric output, in which case we don't want - # to return a StringArray. - if self._is_string and returns_string: - dtype = "string" - else: - dtype = None - - if expand is None: - # infer from ndim if expand is not specified - expand = result.ndim != 1 - - elif expand is True and not isinstance(self._orig, ABCIndexClass): - # required when expand=True is explicitly specified - # not needed when inferred - - def cons_row(x): - if is_list_like(x): - return x - else: - return [x] - - result = [cons_row(x) for x in result] - if result: - # propagate nan values to match longest sequence (GH 18450) - max_len = max(len(x) for x in result) - result = [ - x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result - ] - - if not isinstance(expand, bool): - raise ValueError("expand must be True or False") - - if expand is False: - # if expand is False, result should have the same name - # as the original otherwise specified - if name is None: - name = getattr(result, "name", None) - if name is None: - # do not use logical or, _orig may be a DataFrame - # which has "name" column - name = self._orig.name - - # Wait until we are sure result is a Series or Index before - # checking attributes (GH 12180) - if isinstance(self._orig, ABCIndexClass): - # if result is a boolean np.array, return the np.array - # instead of wrapping it into a boolean Index (GH 8875) - if is_bool_dtype(result): - return result - - if expand: - result = list(result) - out = MultiIndex.from_tuples(result, names=name) - if 
out.nlevels == 1: - # We had all tuples of length-one, which are - # better represented as a regular Index. - out = out.get_level_values(0) - return out - else: - return Index(result, name=name) - else: - index = self._orig.index - if expand: - cons = self._orig._constructor_expanddim - result = cons(result, columns=name, index=index, dtype=dtype) - else: - # Must be a Series - cons = self._orig._constructor - result = cons(result, name=name, index=index, dtype=dtype) - return result - - def _get_series_list(self, others): - """ - Auxiliary function for :meth:`str.cat`. Turn potentially mixed input - into a list of Series (elements without an index must match the length - of the calling Series/Index). - - Parameters - ---------- - others : Series, DataFrame, np.ndarray, list-like or list-like of - Objects that are either Series, Index or np.ndarray (1-dim). - - Returns - ------- - list of Series - Others transformed into list of Series. - """ - from pandas import DataFrame, Series - - # self._orig is either Series or Index - idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index - - # Generally speaking, all objects without an index inherit the index - # `idx` of the calling Series/Index - i.e. must have matching length. - # Objects with an index (i.e. Series/Index/DataFrame) keep their own. - if isinstance(others, ABCSeries): - return [others] - elif isinstance(others, ABCIndexClass): - return [Series(others._values, index=idx)] - elif isinstance(others, ABCDataFrame): - return [others[x] for x in others] - elif isinstance(others, np.ndarray) and others.ndim == 2: - others = DataFrame(others, index=idx) - return [others[x] for x in others] - elif is_list_like(others, allow_sets=False): - others = list(others) # ensure iterators do not get read twice etc - - # in case of list-like `others`, all elements must be - # either Series/Index/np.ndarray (1-dim)... 
- if all( - isinstance(x, (ABCSeries, ABCIndexClass)) - or (isinstance(x, np.ndarray) and x.ndim == 1) - for x in others - ): - los = [] - while others: # iterate through list and append each element - los = los + self._get_series_list(others.pop(0)) - return los - # ... or just strings - elif all(not is_list_like(x) for x in others): - return [Series(others, index=idx)] - raise TypeError( - "others must be Series, Index, DataFrame, np.ndarray " - "or list-like (either containing only strings or " - "containing only objects of type Series/Index/" - "np.ndarray[1-dim])" - ) - - @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) - def cat(self, others=None, sep=None, na_rep=None, join="left"): - """ - Concatenate strings in the Series/Index with given separator. - - If `others` is specified, this function concatenates the Series/Index - and elements of `others` element-wise. - If `others` is not passed, then all values in the Series/Index are - concatenated into a single string with a given `sep`. - - Parameters - ---------- - others : Series, Index, DataFrame, np.ndarray or list-like - Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and - other list-likes of strings must have the same length as the - calling Series/Index, with the exception of indexed objects (i.e. - Series/Index/DataFrame) if `join` is not None. - - If others is a list-like that contains a combination of Series, - Index or np.ndarray (1-dim), then all elements will be unpacked and - must satisfy the above criteria individually. - - If others is None, the method returns the concatenation of all - strings in the calling Series/Index. - sep : str, default '' - The separator between the different elements/columns. By default - the empty string `''` is used. - na_rep : str or None, default None - Representation that is inserted for all missing values: - - - If `na_rep` is None, and `others` is None, missing values in the - Series/Index are omitted from the result. 
- - If `na_rep` is None, and `others` is not None, a row containing a - missing value in any of the columns (before concatenation) will - have a missing value in the result. - join : {'left', 'right', 'outer', 'inner'}, default 'left' - Determines the join-style between the calling Series/Index and any - Series/Index/DataFrame in `others` (objects without an index need - to match the length of the calling Series/Index). To disable - alignment, use `.values` on any Series/Index/DataFrame in `others`. - - .. versionchanged:: 1.0.0 - Changed default of `join` from None to `'left'`. - - Returns - ------- - str, Series or Index - If `others` is None, `str` is returned, otherwise a `Series/Index` - (same type as caller) of objects is returned. - - See Also - -------- - split : Split each string in the Series/Index. - join : Join lists contained as elements in the Series/Index. - - Examples - -------- - When not passing `others`, all values are concatenated into a single - string: - - >>> s = pd.Series(['a', 'b', np.nan, 'd']) - >>> s.str.cat(sep=' ') - 'a b d' - - By default, NA values in the Series are ignored. Using `na_rep`, they - can be given a representation: - - >>> s.str.cat(sep=' ', na_rep='?') - 'a b ? d' - - If `others` is specified, corresponding values are concatenated with - the separator. Result will be a Series of strings. - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') - 0 a,A - 1 b,B - 2 NaN - 3 d,D - dtype: object - - Missing values will remain missing in the result, but can again be - represented using `na_rep` - - >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') - 0 a,A - 1 b,B - 2 -,C - 3 d,D - dtype: object - - If `sep` is not specified, the values are concatenated without - separation. - - >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') - 0 aA - 1 bB - 2 -C - 3 dD - dtype: object - - Series with different indexes can be aligned before concatenation. The - `join`-keyword works as in other methods. 
- - >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) - >>> s.str.cat(t, join='left', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='outer', na_rep='-') - 0 aa - 1 b- - 2 -c - 3 dd - 4 -e - dtype: object - >>> - >>> s.str.cat(t, join='inner', na_rep='-') - 0 aa - 2 -c - 3 dd - dtype: object - >>> - >>> s.str.cat(t, join='right', na_rep='-') - 3 dd - 0 aa - 4 -e - 2 -c - dtype: object - - For more examples, see :ref:`here `. - """ - from pandas import Index, Series, concat - - if isinstance(others, str): - raise ValueError("Did you mean to supply a `sep` keyword?") - if sep is None: - sep = "" - - if isinstance(self._orig, ABCIndexClass): - data = Series(self._orig, index=self._orig) - else: # Series - data = self._orig - - # concatenate Series/Index with itself if no "others" - if others is None: - data = ensure_object(data) - na_mask = isna(data) - if na_rep is None and na_mask.any(): - data = data[~na_mask] - elif na_rep is not None and na_mask.any(): - data = np.where(na_mask, na_rep, data) - return sep.join(data) - - try: - # turn anything in "others" into lists of Series - others = self._get_series_list(others) - except ValueError as err: # do not catch TypeError raised by _get_series_list - raise ValueError( - "If `others` contains arrays or lists (or other " - "list-likes without an index), these must all be " - "of the same length as the calling Series/Index." 
- ) from err - - # align if required - if any(not data.index.equals(x.index) for x in others): - # Need to add keys for uniqueness in case of duplicate columns - others = concat( - others, - axis=1, - join=(join if join == "inner" else "outer"), - keys=range(len(others)), - sort=False, - copy=False, - ) - data, others = data.align(others, join=join) - others = [others[x] for x in others] # again list of Series - - all_cols = [ensure_object(x) for x in [data] + others] - na_masks = np.array([isna(x) for x in all_cols]) - union_mask = np.logical_or.reduce(na_masks, axis=0) - - if na_rep is None and union_mask.any(): - # no na_rep means NaNs for all rows where any column has a NaN - # only necessary if there are actually any NaNs - result = np.empty(len(data), dtype=object) - np.putmask(result, union_mask, np.nan) - - not_masked = ~union_mask - result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) - elif na_rep is not None and union_mask.any(): - # fill NaNs with na_rep in case there are actually any NaNs - all_cols = [ - np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) - ] - result = cat_safe(all_cols, sep) - else: - # no NaNs - can just concatenate - result = cat_safe(all_cols, sep) - - if isinstance(self._orig, ABCIndexClass): - # add dtype for case that result is all-NA - result = Index(result, dtype=object, name=self._orig.name) - else: # Series - if is_categorical_dtype(self._orig.dtype): - # We need to infer the new categories. - dtype = None - else: - dtype = self._orig.dtype - result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) - return result - - _shared_docs[ - "str_split" - ] = r""" - Split strings around given separator/delimiter. - - Splits the string in the Series/Index from the %(side)s, - at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - pat : str, optional - String or regular expression to split on. - If not specified, split on whitespace. 
- n : int, default -1 (all) - Limit number of splits in output. - ``None``, 0 and -1 will be interpreted as return all splits. - expand : bool, default False - Expand the split strings into separate columns. - - * If ``True``, return DataFrame/MultiIndex expanding dimensionality. - * If ``False``, return Series/Index, containing lists of strings. - - Returns - ------- - Series, Index, DataFrame or MultiIndex - Type matches caller unless ``expand=True`` (see Notes). - - See Also - -------- - Series.str.split : Split strings around given separator/delimiter. - Series.str.rsplit : Splits string around given separator/delimiter, - starting from the right. - Series.str.join : Join lists contained as elements in the Series/Index - with passed delimiter. - str.split : Standard library version for split. - str.rsplit : Standard library version for rsplit. - - Notes - ----- - The handling of the `n` keyword depends on the number of found splits: - - - If found splits > `n`, make first `n` splits only - - If found splits <= `n`, make all splits - - If for a certain row the number of found splits < `n`, - append `None` for padding up to `n` if ``expand=True`` - - If using ``expand=True``, Series and Index callers return DataFrame and - MultiIndex objects, respectively. - - Examples - -------- - >>> s = pd.Series( - ... [ - ... "this is a regular sentence", - ... "https://docs.python.org/3/tutorial/index.html", - ... np.nan - ... ] - ... ) - >>> s - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html - 2 NaN - dtype: object - - In the default setting, the string is split by whitespace. - - >>> s.str.split() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - Without the `n` parameter, the outputs of `rsplit` and `split` - are identical. 
- - >>> s.str.rsplit() - 0 [this, is, a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `n` parameter can be used to limit the number of splits on the - delimiter. The outputs of `split` and `rsplit` are different. - - >>> s.str.split(n=2) - 0 [this, is, a regular sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - >>> s.str.rsplit(n=2) - 0 [this is a, regular, sentence] - 1 [https://docs.python.org/3/tutorial/index.html] - 2 NaN - dtype: object - - The `pat` parameter can be used to split by other characters. - - >>> s.str.split(pat="/") - 0 [this is a regular sentence] - 1 [https:, , docs.python.org, 3, tutorial, index... - 2 NaN - dtype: object - - When using ``expand=True``, the split elements will expand out into - separate columns. If NaN is present, it is propagated throughout - the columns during the split. - - >>> s.str.split(expand=True) - 0 1 2 3 4 - 0 this is a regular sentence - 1 https://docs.python.org/3/tutorial/index.html None None None None - 2 NaN NaN NaN NaN NaN - - For slightly more complex use cases like splitting the html document name - from a url, a combination of parameter settings can be used. - - >>> s.str.rsplit("/", n=1, expand=True) - 0 1 - 0 this is a regular sentence None - 1 https://docs.python.org/3/tutorial index.html - 2 NaN NaN - - Remember to escape special characters when explicitly using regular - expressions. 
- - >>> s = pd.Series(["1+1=2"]) - >>> s - 0 1+1=2 - dtype: object - >>> s.str.split(r"\+|=", expand=True) - 0 1 2 - 0 1 1 2 - """ - - @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) - @forbid_nonstring_types(["bytes"]) - def split(self, pat=None, n=-1, expand=False): - result = str_split(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) - @forbid_nonstring_types(["bytes"]) - def rsplit(self, pat=None, n=-1, expand=False): - result = str_rsplit(self._parent, pat, n=n) - return self._wrap_result(result, expand=expand, returns_string=expand) - - _shared_docs[ - "str_partition" - ] = """ - Split the string at the %(side)s occurrence of `sep`. - - This method splits the string at the %(side)s occurrence of `sep`, - and returns 3 elements containing the part before the separator, - the separator itself, and the part after the separator. - If the separator is not found, return %(return)s. - - Parameters - ---------- - sep : str, default whitespace - String to split on. - expand : bool, default True - If True, return DataFrame/MultiIndex expanding dimensionality. - If False, return Series/Index. - - Returns - ------- - DataFrame/MultiIndex or Series/Index of objects - - See Also - -------- - %(also)s - Series.str.split : Split strings around given separators. - str.partition : Standard library version. 
- - Examples - -------- - - >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) - >>> s - 0 Linda van der Berg - 1 George Pitt-Rivers - dtype: object - - >>> s.str.partition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by the last space instead of the first one: - - >>> s.str.rpartition() - 0 1 2 - 0 Linda van der Berg - 1 George Pitt-Rivers - - To partition by something different than a space: - - >>> s.str.partition('-') - 0 1 2 - 0 Linda van der Berg - 1 George Pitt - Rivers - - To return a Series containing tuples instead of a DataFrame: - - >>> s.str.partition('-', expand=False) - 0 (Linda van der Berg, , ) - 1 (George Pitt, -, Rivers) - dtype: object - - Also available on indices: - - >>> idx = pd.Index(['X 123', 'Y 999']) - >>> idx - Index(['X 123', 'Y 999'], dtype='object') - - Which will create a MultiIndex: - - >>> idx.str.partition() - MultiIndex([('X', ' ', '123'), - ('Y', ' ', '999')], - ) - - Or an index with tuples with ``expand=False``: - - >>> idx.str.partition(expand=False) - Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') - """ - - @Appender( - _shared_docs["str_partition"] - % { - "side": "first", - "return": "3 elements containing the string itself, followed by two " - "empty strings", - "also": "rpartition : Split the string at the last occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def partition(self, sep=" ", expand=True): - f = lambda x: x.partition(sep) - result = _na_map(f, self._parent) - return self._wrap_result(result, expand=expand, returns_string=expand) - - @Appender( - _shared_docs["str_partition"] - % { - "side": "last", - "return": "3 elements containing two empty strings, followed by the " - "string itself", - "also": "partition : Split the string at the first occurrence of `sep`.", - } - ) - @forbid_nonstring_types(["bytes"]) - def rpartition(self, sep=" ", expand=True): - f = lambda x: x.rpartition(sep) - result = _na_map(f, self._parent) - return 
self._wrap_result(result, expand=expand, returns_string=expand) - - @copy(str_get) - def get(self, i): - result = str_get(self._parent, i) - return self._wrap_result(result) - - @copy(str_join) - @forbid_nonstring_types(["bytes"]) - def join(self, sep): - result = str_join(self._parent, sep) - return self._wrap_result(result) - - @copy(str_contains) - @forbid_nonstring_types(["bytes"]) - def contains(self, pat, case=True, flags=0, na=np.nan, regex=True): - result = str_contains( - self._parent, pat, case=case, flags=flags, na=na, regex=regex - ) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_match) - @forbid_nonstring_types(["bytes"]) - def match(self, pat, case=True, flags=0, na=np.nan): - result = str_match(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_fullmatch) - @forbid_nonstring_types(["bytes"]) - def fullmatch(self, pat, case=True, flags=0, na=np.nan): - result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na) - return self._wrap_result(result, fill_value=na, returns_string=False) - - @copy(str_replace) - @forbid_nonstring_types(["bytes"]) - def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): - result = str_replace( - self._parent, pat, repl, n=n, case=case, flags=flags, regex=regex - ) - return self._wrap_result(result) - - @copy(str_repeat) - @forbid_nonstring_types(["bytes"]) - def repeat(self, repeats): - result = str_repeat(self._parent, repeats) - return self._wrap_result(result) - - @copy(str_pad) - @forbid_nonstring_types(["bytes"]) - def pad(self, width, side="left", fillchar=" "): - result = str_pad(self._parent, width, side=side, fillchar=fillchar) - return self._wrap_result(result) - - _shared_docs[ - "str_pad" - ] = """ - Pad %(side)s side of strings in the Series/Index. - - Equivalent to :meth:`str.%(method)s`. 
- - Parameters - ---------- - width : int - Minimum width of resulting string; additional characters will be filled - with ``fillchar``. - fillchar : str - Additional character for filling, default is whitespace. - - Returns - ------- - filled : Series/Index of objects. - """ - - @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) - @forbid_nonstring_types(["bytes"]) - def center(self, width, fillchar=" "): - return self.pad(width, side="both", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) - @forbid_nonstring_types(["bytes"]) - def ljust(self, width, fillchar=" "): - return self.pad(width, side="right", fillchar=fillchar) - - @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) - @forbid_nonstring_types(["bytes"]) - def rjust(self, width, fillchar=" "): - return self.pad(width, side="left", fillchar=fillchar) - - @forbid_nonstring_types(["bytes"]) - def zfill(self, width): - """ - Pad strings in the Series/Index by prepending '0' characters. - - Strings in the Series/Index are padded with '0' characters on the - left of the string to reach a total string length `width`. Strings - in the Series/Index with length greater or equal to `width` are - unchanged. - - Parameters - ---------- - width : int - Minimum length of resulting string; strings with length less - than `width` be prepended with '0' characters. - - Returns - ------- - Series/Index of objects. - - See Also - -------- - Series.str.rjust : Fills the left side of strings with an arbitrary - character. - Series.str.ljust : Fills the right side of strings with an arbitrary - character. - Series.str.pad : Fills the specified sides of strings with an arbitrary - character. - Series.str.center : Fills both sides of strings with an arbitrary - character. - - Notes - ----- - Differs from :meth:`str.zfill` which has special handling - for '+'/'-' in the string. 
- - Examples - -------- - >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) - >>> s - 0 -1 - 1 1 - 2 1000 - 3 10 - 4 NaN - dtype: object - - Note that ``10`` and ``NaN`` are not strings, therefore they are - converted to ``NaN``. The minus sign in ``'-1'`` is treated as a - regular character and the zero is added to the left of it - (:meth:`str.zfill` would have moved it to the left). ``1000`` - remains unchanged as it is longer than `width`. - - >>> s.str.zfill(3) - 0 0-1 - 1 001 - 2 1000 - 3 NaN - 4 NaN - dtype: object - """ - result = str_pad(self._parent, width, side="left", fillchar="0") - return self._wrap_result(result) - - @copy(str_slice) - def slice(self, start=None, stop=None, step=None): - result = str_slice(self._parent, start, stop, step) - return self._wrap_result(result) - - @copy(str_slice_replace) - @forbid_nonstring_types(["bytes"]) - def slice_replace(self, start=None, stop=None, repl=None): - result = str_slice_replace(self._parent, start, stop, repl) - return self._wrap_result(result) - - @copy(str_decode) - def decode(self, encoding, errors="strict"): - # need to allow bytes here - result = str_decode(self._parent, encoding, errors) - # TODO: Not sure how to handle this. - return self._wrap_result(result, returns_string=False) - - @copy(str_encode) - @forbid_nonstring_types(["bytes"]) - def encode(self, encoding, errors="strict"): - result = str_encode(self._parent, encoding, errors) - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "str_strip" - ] = r""" - Remove %(position)s characters. - - Strip whitespaces (including newlines) or a set of specified characters - from each string in the Series/Index from %(side)s. - Equivalent to :meth:`str.%(method)s`. - - Parameters - ---------- - to_strip : str or None, default None - Specifying the set of characters to be removed. - All combinations of this set of characters will be stripped. - If None then whitespaces are removed. 
- - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.strip : Remove leading and trailing characters in Series/Index. - Series.str.lstrip : Remove leading characters in Series/Index. - Series.str.rstrip : Remove trailing characters in Series/Index. - - Examples - -------- - >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan]) - >>> s - 0 1. Ant. - 1 2. Bee!\n - 2 3. Cat?\t - 3 NaN - dtype: object - - >>> s.str.strip() - 0 1. Ant. - 1 2. Bee! - 2 3. Cat? - 3 NaN - dtype: object - - >>> s.str.lstrip('123.') - 0 Ant. - 1 Bee!\n - 2 Cat?\t - 3 NaN - dtype: object - - >>> s.str.rstrip('.!? \n\t') - 0 1. Ant - 1 2. Bee - 2 3. Cat - 3 NaN - dtype: object - - >>> s.str.strip('123.!? \n\t') - 0 Ant - 1 Bee - 2 Cat - 3 NaN - dtype: object - """ - - @Appender( - _shared_docs["str_strip"] - % dict( - side="left and right sides", method="strip", position="leading and trailing" - ) - ) - @forbid_nonstring_types(["bytes"]) - def strip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="both") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="left side", method="lstrip", position="leading") - ) - @forbid_nonstring_types(["bytes"]) - def lstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="left") - return self._wrap_result(result) - - @Appender( - _shared_docs["str_strip"] - % dict(side="right side", method="rstrip", position="trailing") - ) - @forbid_nonstring_types(["bytes"]) - def rstrip(self, to_strip=None): - result = str_strip(self._parent, to_strip, side="right") - return self._wrap_result(result) - - @copy(str_wrap) - @forbid_nonstring_types(["bytes"]) - def wrap(self, width, **kwargs): - result = str_wrap(self._parent, width, **kwargs) - return self._wrap_result(result) - - @copy(str_get_dummies) - @forbid_nonstring_types(["bytes"]) - def get_dummies(self, sep="|"): - # we need to cast to Series of strings as only that has all - # methods 
available for making the dummies... - data = self._orig.astype(str) if self._is_categorical else self._parent - result, name = str_get_dummies(data, sep) - return self._wrap_result( - result, - use_codes=(not self._is_categorical), - name=name, - expand=True, - returns_string=False, - ) - - @copy(str_translate) - @forbid_nonstring_types(["bytes"]) - def translate(self, table): - result = str_translate(self._parent, table) - return self._wrap_result(result) - - count = _pat_wrapper(str_count, flags=True, name="count", returns_string=False) - startswith = _pat_wrapper( - str_startswith, na=True, name="startswith", returns_string=False - ) - endswith = _pat_wrapper( - str_endswith, na=True, name="endswith", returns_string=False - ) - findall = _pat_wrapper( - str_findall, flags=True, name="findall", returns_string=False - ) - - @copy(str_extract) - @forbid_nonstring_types(["bytes"]) - def extract(self, pat, flags=0, expand=True): - return str_extract(self, pat, flags=flags, expand=expand) - - @copy(str_extractall) - @forbid_nonstring_types(["bytes"]) - def extractall(self, pat, flags=0): - return str_extractall(self._orig, pat, flags=flags) - - _shared_docs[ - "find" - ] = """ - Return %(side)s indexes in each strings in the Series/Index. - - Each of returned indexes corresponds to the position where the - substring is fully contained between [start:end]. Return -1 on - failure. Equivalent to standard :meth:`str.%(method)s`. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. - - Returns - ------- - Series or Index of int. 
- - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["find"] - % dict( - side="lowest", - method="find", - also="rfind : Return highest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def find(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["find"] - % dict( - side="highest", - method="rfind", - also="find : Return lowest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def rfind(self, sub, start=0, end=None): - result = str_find(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result, returns_string=False) - - @forbid_nonstring_types(["bytes"]) - def normalize(self, form): - """ - Return the Unicode normal form for the strings in the Series/Index. - - For more information on the forms, see the - :func:`unicodedata.normalize`. - - Parameters - ---------- - form : {'NFC', 'NFKC', 'NFD', 'NFKD'} - Unicode form. - - Returns - ------- - normalized : Series/Index of objects - """ - import unicodedata - - f = lambda x: unicodedata.normalize(form, x) - result = _na_map(f, self._parent, dtype=str) - return self._wrap_result(result) - - _shared_docs[ - "index" - ] = """ - Return %(side)s indexes in each string in Series/Index. - - Each of the returned indexes corresponds to the position where the - substring is fully contained between [start:end]. This is the same - as ``str.%(similar)s`` except instead of returning -1, it raises a - ValueError when the substring is not found. Equivalent to standard - ``str.%(method)s``. - - Parameters - ---------- - sub : str - Substring being searched. - start : int - Left edge index. - end : int - Right edge index. 
- - Returns - ------- - Series or Index of object - - See Also - -------- - %(also)s - """ - - @Appender( - _shared_docs["index"] - % dict( - side="lowest", - similar="find", - method="index", - also="rindex : Return highest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def index(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="left") - return self._wrap_result(result, returns_string=False) - - @Appender( - _shared_docs["index"] - % dict( - side="highest", - similar="rfind", - method="rindex", - also="index : Return lowest indexes in each strings.", - ) - ) - @forbid_nonstring_types(["bytes"]) - def rindex(self, sub, start=0, end=None): - result = str_index(self._parent, sub, start=start, end=end, side="right") - return self._wrap_result(result, returns_string=False) - - _shared_docs[ - "len" - ] = """ - Compute the length of each element in the Series/Index. - - The element may be a sequence (such as a string, tuple or list) or a collection - (such as a dictionary). - - Returns - ------- - Series or Index of int - A Series or Index of integer values indicating the length of each - element in the Series or Index. - - See Also - -------- - str.len : Python built-in function returning the length of an object. - Series.size : Returns the length of the Series. - - Examples - -------- - Returns the length (number of characters) in a string. Returns the - number of entries for dictionaries, lists or tuples. - - >>> s = pd.Series(['dog', - ... '', - ... 5, - ... {'foo' : 'bar'}, - ... [2, 3, 5, 7], - ... 
('one', 'two', 'three')]) - >>> s - 0 dog - 1 - 2 5 - 3 {'foo': 'bar'} - 4 [2, 3, 5, 7] - 5 (one, two, three) - dtype: object - >>> s.str.len() - 0 3.0 - 1 0.0 - 2 NaN - 3 1.0 - 4 4.0 - 5 3.0 - dtype: float64 - """ - len = _noarg_wrapper( - len, - docstring=_shared_docs["len"], - forbidden_types=None, - dtype=np.dtype("int64"), - returns_string=False, - ) - - _shared_docs[ - "casemethods" - ] = """ - Convert strings in the Series/Index to %(type)s. - %(version)s - Equivalent to :meth:`str.%(method)s`. - - Returns - ------- - Series or Index of object - - See Also - -------- - Series.str.lower : Converts all characters to lowercase. - Series.str.upper : Converts all characters to uppercase. - Series.str.title : Converts first character of each word to uppercase and - remaining to lowercase. - Series.str.capitalize : Converts first character to uppercase and - remaining to lowercase. - Series.str.swapcase : Converts uppercase to lowercase and lowercase to - uppercase. - Series.str.casefold: Removes all case distinctions in the string. 
- - Examples - -------- - >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) - >>> s - 0 lower - 1 CAPITALS - 2 this is a sentence - 3 SwApCaSe - dtype: object - - >>> s.str.lower() - 0 lower - 1 capitals - 2 this is a sentence - 3 swapcase - dtype: object - - >>> s.str.upper() - 0 LOWER - 1 CAPITALS - 2 THIS IS A SENTENCE - 3 SWAPCASE - dtype: object - - >>> s.str.title() - 0 Lower - 1 Capitals - 2 This Is A Sentence - 3 Swapcase - dtype: object - - >>> s.str.capitalize() - 0 Lower - 1 Capitals - 2 This is a sentence - 3 Swapcase - dtype: object - - >>> s.str.swapcase() - 0 LOWER - 1 capitals - 2 THIS IS A SENTENCE - 3 sWaPcAsE - dtype: object - """ - - # _doc_args holds dict of strings to use in substituting casemethod docs - _doc_args: Dict[str, Dict[str, str]] = {} - _doc_args["lower"] = dict(type="lowercase", method="lower", version="") - _doc_args["upper"] = dict(type="uppercase", method="upper", version="") - _doc_args["title"] = dict(type="titlecase", method="title", version="") - _doc_args["capitalize"] = dict( - type="be capitalized", method="capitalize", version="" - ) - _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") - _doc_args["casefold"] = dict( - type="be casefolded", - method="casefold", - version="\n .. 
versionadded:: 0.25.0\n", - ) - lower = _noarg_wrapper( - lambda x: x.lower(), - name="lower", - docstring=_shared_docs["casemethods"] % _doc_args["lower"], - dtype=str, - ) - upper = _noarg_wrapper( - lambda x: x.upper(), - name="upper", - docstring=_shared_docs["casemethods"] % _doc_args["upper"], - dtype=str, - ) - title = _noarg_wrapper( - lambda x: x.title(), - name="title", - docstring=_shared_docs["casemethods"] % _doc_args["title"], - dtype=str, - ) - capitalize = _noarg_wrapper( - lambda x: x.capitalize(), - name="capitalize", - docstring=_shared_docs["casemethods"] % _doc_args["capitalize"], - dtype=str, - ) - swapcase = _noarg_wrapper( - lambda x: x.swapcase(), - name="swapcase", - docstring=_shared_docs["casemethods"] % _doc_args["swapcase"], - dtype=str, - ) - casefold = _noarg_wrapper( - lambda x: x.casefold(), - name="casefold", - docstring=_shared_docs["casemethods"] % _doc_args["casefold"], - dtype=str, - ) - - _shared_docs[ - "ismethods" - ] = """ - Check whether all characters in each string are %(type)s. - - This is equivalent to running the Python string method - :meth:`str.%(method)s` for each element of the Series/Index. If a string - has zero characters, ``False`` is returned for that check. - - Returns - ------- - Series or Index of bool - Series or Index of boolean values with the same length as the original - Series/Index. - - See Also - -------- - Series.str.isalpha : Check whether all characters are alphabetic. - Series.str.isnumeric : Check whether all characters are numeric. - Series.str.isalnum : Check whether all characters are alphanumeric. - Series.str.isdigit : Check whether all characters are digits. - Series.str.isdecimal : Check whether all characters are decimal. - Series.str.isspace : Check whether all characters are whitespace. - Series.str.islower : Check whether all characters are lowercase. - Series.str.isupper : Check whether all characters are uppercase. 
- Series.str.istitle : Check whether all characters are titlecase. - - Examples - -------- - **Checks for Alphabetic and Numeric Characters** - - >>> s1 = pd.Series(['one', 'one1', '1', '']) - - >>> s1.str.isalpha() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s1.str.isnumeric() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - >>> s1.str.isalnum() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - Note that checks against characters mixed with any additional punctuation - or whitespace will evaluate to false for an alphanumeric check. - - >>> s2 = pd.Series(['A B', '1.5', '3,000']) - >>> s2.str.isalnum() - 0 False - 1 False - 2 False - dtype: bool - - **More Detailed Checks for Numeric Characters** - - There are several different but overlapping sets of numeric characters that - can be checked for. - - >>> s3 = pd.Series(['23', '³', '⅕', '']) - - The ``s3.str.isdecimal`` method checks for characters used to form numbers - in base 10. - - >>> s3.str.isdecimal() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also - includes special digits, like superscripted and subscripted digits in - unicode. - - >>> s3.str.isdigit() - 0 True - 1 True - 2 False - 3 False - dtype: bool - - The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also - includes other characters that can represent quantities such as unicode - fractions. 
- - >>> s3.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool - - **Checks for Whitespace** - - >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) - >>> s4.str.isspace() - 0 True - 1 True - 2 False - dtype: bool - - **Checks for Character Case** - - >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - - >>> s5.str.islower() - 0 True - 1 False - 2 False - 3 False - dtype: bool - - >>> s5.str.isupper() - 0 False - 1 False - 2 True - 3 False - dtype: bool - - The ``s5.str.istitle`` method checks for whether all words are in title - case (whether only the first letter of each word is capitalized). Words are - assumed to be as any sequence of non-numeric characters separated by - whitespace characters. - - >>> s5.str.istitle() - 0 False - 1 True - 2 False - 3 False - dtype: bool - """ - _doc_args["isalnum"] = dict(type="alphanumeric", method="isalnum") - _doc_args["isalpha"] = dict(type="alphabetic", method="isalpha") - _doc_args["isdigit"] = dict(type="digits", method="isdigit") - _doc_args["isspace"] = dict(type="whitespace", method="isspace") - _doc_args["islower"] = dict(type="lowercase", method="islower") - _doc_args["isupper"] = dict(type="uppercase", method="isupper") - _doc_args["istitle"] = dict(type="titlecase", method="istitle") - _doc_args["isnumeric"] = dict(type="numeric", method="isnumeric") - _doc_args["isdecimal"] = dict(type="decimal", method="isdecimal") - # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) - isalnum = _noarg_wrapper( - lambda x: x.isalnum(), - name="isalnum", - docstring=_shared_docs["ismethods"] % _doc_args["isalnum"], - returns_string=False, - dtype=np.dtype(bool), - ) - isalpha = _noarg_wrapper( - lambda x: x.isalpha(), - name="isalpha", - docstring=_shared_docs["ismethods"] % _doc_args["isalpha"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdigit = _noarg_wrapper( - lambda x: x.isdigit(), - name="isdigit", - docstring=_shared_docs["ismethods"] % _doc_args["isdigit"], - 
returns_string=False, - dtype=np.dtype(bool), - ) - isspace = _noarg_wrapper( - lambda x: x.isspace(), - name="isspace", - docstring=_shared_docs["ismethods"] % _doc_args["isspace"], - returns_string=False, - dtype=np.dtype(bool), - ) - islower = _noarg_wrapper( - lambda x: x.islower(), - name="islower", - docstring=_shared_docs["ismethods"] % _doc_args["islower"], - returns_string=False, - dtype=np.dtype(bool), - ) - isupper = _noarg_wrapper( - lambda x: x.isupper(), - name="isupper", - docstring=_shared_docs["ismethods"] % _doc_args["isupper"], - returns_string=False, - dtype=np.dtype(bool), - ) - istitle = _noarg_wrapper( - lambda x: x.istitle(), - name="istitle", - docstring=_shared_docs["ismethods"] % _doc_args["istitle"], - returns_string=False, - dtype=np.dtype(bool), - ) - isnumeric = _noarg_wrapper( - lambda x: x.isnumeric(), - name="isnumeric", - docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"], - returns_string=False, - dtype=np.dtype(bool), - ) - isdecimal = _noarg_wrapper( - lambda x: x.isdecimal(), - name="isdecimal", - docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"], - returns_string=False, - dtype=np.dtype(bool), - ) - - @classmethod - def _make_accessor(cls, data): - cls._validate(data) - return cls(data) diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py new file mode 100644 index 0000000000000..243250f0360a0 --- /dev/null +++ b/pandas/core/strings/__init__.py @@ -0,0 +1,32 @@ +""" +Implementation of pandas.Series.str and its interface. + +* strings.accessor.StringMethods : Accessor for Series.str +* strings.base.BaseStringArrayMethods: Mixin ABC for EAs to implement str methods + +Most methods on the StringMethods accessor follow the pattern: + + 1. extract the array from the series (or index) + 2. Call that array's implementation of the string method + 3. 
Wrap the result (in a Series, index, or DataFrame) + +Pandas extension arrays implementing string methods should inherit from +pandas.core.strings.base.BaseStringArrayMethods. This is an ABC defining +the various string methods. To avoid namespace clashes and pollution, +these are prefixed with `_str_`. So ``Series.str.upper()`` calls +``Series.array._str_upper()``. The interface isn't currently public +to other string extension arrays. +""" +# Pandas current implementation is in ObjectStringArrayMixin. This is designed +# to work on object-dtype ndarrays. +# +# BaseStringArrayMethods +# - ObjectStringArrayMixin +# - StringArray +# - PandasArray +# - Categorical + +from .accessor import StringMethods +from .base import BaseStringArrayMethods + +__all__ = ["StringMethods", "BaseStringArrayMethods"] diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py new file mode 100644 index 0000000000000..cae8cc1baf1df --- /dev/null +++ b/pandas/core/strings/accessor.py @@ -0,0 +1,3080 @@ +import codecs +from functools import wraps +import re +from typing import Dict, List, Optional +import warnings + +import numpy as np + +import pandas._libs.lib as lib +from pandas.util._decorators import Appender + +from pandas.core.dtypes.common import ( + ensure_object, + is_bool_dtype, + is_categorical_dtype, + is_integer, + is_list_like, +) +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndexClass, + ABCMultiIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.base import NoNewAttributesMixin + +_shared_docs: Dict[str, str] = dict() +_cpython_optimized_encoders = ( + "utf-8", + "utf8", + "latin-1", + "latin1", + "iso-8859-1", + "mbcs", + "ascii", +) +_cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") + + +def forbid_nonstring_types(forbidden, name=None): + """ + Decorator to forbid specific types for a method of StringMethods. 
+ + For calling `.str.{method}` on a Series or Index, it is necessary to first + initialize the :class:`StringMethods` object, and then call the method. + However, different methods allow different input types, and so this can not + be checked during :meth:`StringMethods.__init__`, but must be done on a + per-method basis. This decorator exists to facilitate this process, and + make it explicit which (inferred) types are disallowed by the method. + + :meth:`StringMethods.__init__` allows the *union* of types its different + methods allow (after skipping NaNs; see :meth:`StringMethods._validate`), + namely: ['string', 'empty', 'bytes', 'mixed', 'mixed-integer']. + + The default string types ['string', 'empty'] are allowed for all methods. + For the additional types ['bytes', 'mixed', 'mixed-integer'], each method + then needs to forbid the types it is not intended for. + + Parameters + ---------- + forbidden : list-of-str or None + List of forbidden non-string types, may be one or more of + `['bytes', 'mixed', 'mixed-integer']`. + name : str, default None + Name of the method to use in the error message. By default, this is + None, in which case the name from the method being wrapped will be + copied. However, for working with further wrappers (like _pat_wrapper + and _noarg_wrapper), it is necessary to specify the name. + + Returns + ------- + func : wrapper + The method to which the decorator is applied, with an added check that + enforces the inferred type to not be in the list of forbidden types. + + Raises + ------ + TypeError + If the inferred type of the underlying data is in `forbidden`. 
+ """ + # deal with None + forbidden = [] if forbidden is None else forbidden + + allowed_types = {"string", "empty", "bytes", "mixed", "mixed-integer"} - set( + forbidden + ) + + def _forbid_nonstring_types(func): + func_name = func.__name__ if name is None else name + + @wraps(func) + def wrapper(self, *args, **kwargs): + if self._inferred_dtype not in allowed_types: + msg = ( + f"Cannot use .str.{func_name} with values of " + f"inferred dtype '{self._inferred_dtype}'." + ) + raise TypeError(msg) + return func(self, *args, **kwargs) + + wrapper.__name__ = func_name + return wrapper + + return _forbid_nonstring_types + + +def _map_and_wrap(name, docstring): + @forbid_nonstring_types(["bytes"], name=name) + def wrapper(self): + result = getattr(self._array, f"_str_{name}")() + return self._wrap_result(result) + + wrapper.__doc__ = docstring + return wrapper + + +class StringMethods(NoNewAttributesMixin): + """ + Vectorized string functions for Series and Index. + + NAs stay NA unless handled otherwise by a particular method. + Patterned after Python's string methods, with some inspiration from + R's stringr package. + + Examples + -------- + >>> s = pd.Series(["A_Str_Series"]) + >>> s + 0 A_Str_Series + dtype: object + + >>> s.str.split("_") + 0 [A, Str, Series] + dtype: object + + >>> s.str.replace("_", "") + 0 AStrSeries + dtype: object + """ + + # Note: see the docstring in pandas.core.strings.__init__ + # for an explanation of the implementation. 
+ # TODO: Dispatch all the methods + # Currently the following are not dispatched to the array + # * cat + # * extract + # * extractall + + def __init__(self, data): + from pandas.core.arrays.string_ import StringDtype + + self._inferred_dtype = self._validate(data) + self._is_categorical = is_categorical_dtype(data.dtype) + self._is_string = isinstance(data.dtype, StringDtype) + array = data.array + self._array = array + + if isinstance(data, ABCSeries): + self._index = data.index + self._name = data.name + else: + self._index = self._name = None + + # ._values.categories works for both Series/Index + self._parent = data._values.categories if self._is_categorical else data + # save orig to blow up categoricals to the right type + self._orig = data + self._freeze() + + @staticmethod + def _validate(data): + """ + Auxiliary function for StringMethods, infers and checks dtype of data. + + This is a "first line of defence" at the creation of the StringMethods- + object, and just checks that the dtype is in the + *union* of the allowed types over all string methods below; this + restriction is then refined on a per-method basis using the decorator + @forbid_nonstring_types (more info in the corresponding docstring). 
+ + This really should exclude all series/index with any non-string values, + but that isn't practical for performance reasons until we have a str + dtype (GH 9343 / 13877) + + Parameters + ---------- + data : The content of the Series + + Returns + ------- + dtype : inferred dtype of data + """ + from pandas import StringDtype + + if isinstance(data, ABCMultiIndex): + raise AttributeError( + "Can only use .str accessor with Index, not MultiIndex" + ) + + # see _libs/lib.pyx for list of inferred types + allowed_types = ["string", "empty", "bytes", "mixed", "mixed-integer"] + + values = getattr(data, "values", data) # Series / Index + values = getattr(values, "categories", values) # categorical / normal + + # explicitly allow StringDtype + if isinstance(values.dtype, StringDtype): + return "string" + + try: + inferred_dtype = lib.infer_dtype(values, skipna=True) + except ValueError: + # GH#27571 mostly occurs with ExtensionArray + inferred_dtype = None + + if inferred_dtype not in allowed_types: + raise AttributeError("Can only use .str accessor with string values!") + return inferred_dtype + + def __getitem__(self, key): + result = self._array._str_getitem(key) + return self._wrap_result(result) + + def __iter__(self): + warnings.warn( + "Columnar iteration over characters will be deprecated in future releases.", + FutureWarning, + stacklevel=2, + ) + i = 0 + g = self.get(i) + while g.notna().any(): + yield g + i += 1 + g = self.get(i) + + def _wrap_result( + self, + result, + name=None, + expand=None, + fill_value=np.nan, + returns_string=True, + ): + from pandas import Index, MultiIndex + + if not hasattr(result, "ndim") or not hasattr(result, "dtype"): + return result + assert result.ndim < 3 + + # We can be wrapping a string / object / categorical result, in which + # case we'll want to return the same dtype as the input. + # Or we can be wrapping a numeric output, in which case we don't want + # to return a StringArray. 
+ # Ideally the array method returns the right array type. + if expand is None: + # infer from ndim if expand is not specified + expand = result.ndim != 1 + + elif expand is True and not isinstance(self._orig, ABCIndexClass): + # required when expand=True is explicitly specified + # not needed when inferred + + def cons_row(x): + if is_list_like(x): + return x + else: + return [x] + + result = [cons_row(x) for x in result] + if result: + # propagate nan values to match longest sequence (GH 18450) + max_len = max(len(x) for x in result) + result = [ + x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result + ] + + if not isinstance(expand, bool): + raise ValueError("expand must be True or False") + + if expand is False: + # if expand is False, result should have the same name + # as the original otherwise specified + if name is None: + name = getattr(result, "name", None) + if name is None: + # do not use logical or, _orig may be a DataFrame + # which has "name" column + name = self._orig.name + + # Wait until we are sure result is a Series or Index before + # checking attributes (GH 12180) + if isinstance(self._orig, ABCIndexClass): + # if result is a boolean np.array, return the np.array + # instead of wrapping it into a boolean Index (GH 8875) + if is_bool_dtype(result): + return result + + if expand: + result = list(result) + out = MultiIndex.from_tuples(result, names=name) + if out.nlevels == 1: + # We had all tuples of length-one, which are + # better represented as a regular Index. + out = out.get_level_values(0) + return out + else: + return Index(result, name=name) + else: + index = self._orig.index + # This is a mess. 
+ dtype: Optional[str] + if self._is_string and returns_string: + dtype = "string" + else: + dtype = None + + if expand: + cons = self._orig._constructor_expanddim + result = cons(result, columns=name, index=index, dtype=dtype) + else: + # Must be a Series + cons = self._orig._constructor + result = cons(result, name=name, index=index) + return result + + def _get_series_list(self, others): + """ + Auxiliary function for :meth:`str.cat`. Turn potentially mixed input + into a list of Series (elements without an index must match the length + of the calling Series/Index). + + Parameters + ---------- + others : Series, DataFrame, np.ndarray, list-like or list-like of + Objects that are either Series, Index or np.ndarray (1-dim). + + Returns + ------- + list of Series + Others transformed into list of Series. + """ + from pandas import DataFrame, Series + + # self._orig is either Series or Index + idx = self._orig if isinstance(self._orig, ABCIndexClass) else self._orig.index + + # Generally speaking, all objects without an index inherit the index + # `idx` of the calling Series/Index - i.e. must have matching length. + # Objects with an index (i.e. Series/Index/DataFrame) keep their own. + if isinstance(others, ABCSeries): + return [others] + elif isinstance(others, ABCIndexClass): + return [Series(others._values, index=idx)] + elif isinstance(others, ABCDataFrame): + return [others[x] for x in others] + elif isinstance(others, np.ndarray) and others.ndim == 2: + others = DataFrame(others, index=idx) + return [others[x] for x in others] + elif is_list_like(others, allow_sets=False): + others = list(others) # ensure iterators do not get read twice etc + + # in case of list-like `others`, all elements must be + # either Series/Index/np.ndarray (1-dim)... 
+ if all( + isinstance(x, (ABCSeries, ABCIndexClass)) + or (isinstance(x, np.ndarray) and x.ndim == 1) + for x in others + ): + los: List[Series] = [] + while others: # iterate through list and append each element + los = los + self._get_series_list(others.pop(0)) + return los + # ... or just strings + elif all(not is_list_like(x) for x in others): + return [Series(others, index=idx)] + raise TypeError( + "others must be Series, Index, DataFrame, np.ndarray " + "or list-like (either containing only strings or " + "containing only objects of type Series/Index/" + "np.ndarray[1-dim])" + ) + + @forbid_nonstring_types(["bytes", "mixed", "mixed-integer"]) + def cat(self, others=None, sep=None, na_rep=None, join="left"): + """ + Concatenate strings in the Series/Index with given separator. + + If `others` is specified, this function concatenates the Series/Index + and elements of `others` element-wise. + If `others` is not passed, then all values in the Series/Index are + concatenated into a single string with a given `sep`. + + Parameters + ---------- + others : Series, Index, DataFrame, np.ndarray or list-like + Series, Index, DataFrame, np.ndarray (one- or two-dimensional) and + other list-likes of strings must have the same length as the + calling Series/Index, with the exception of indexed objects (i.e. + Series/Index/DataFrame) if `join` is not None. + + If others is a list-like that contains a combination of Series, + Index or np.ndarray (1-dim), then all elements will be unpacked and + must satisfy the above criteria individually. + + If others is None, the method returns the concatenation of all + strings in the calling Series/Index. + sep : str, default '' + The separator between the different elements/columns. By default + the empty string `''` is used. + na_rep : str or None, default None + Representation that is inserted for all missing values: + + - If `na_rep` is None, and `others` is None, missing values in the + Series/Index are omitted from the result. 
+ - If `na_rep` is None, and `others` is not None, a row containing a + missing value in any of the columns (before concatenation) will + have a missing value in the result. + join : {'left', 'right', 'outer', 'inner'}, default 'left' + Determines the join-style between the calling Series/Index and any + Series/Index/DataFrame in `others` (objects without an index need + to match the length of the calling Series/Index). To disable + alignment, use `.values` on any Series/Index/DataFrame in `others`. + + .. versionadded:: 0.23.0 + .. versionchanged:: 1.0.0 + Changed default of `join` from None to `'left'`. + + Returns + ------- + str, Series or Index + If `others` is None, `str` is returned, otherwise a `Series/Index` + (same type as caller) of objects is returned. + + See Also + -------- + split : Split each string in the Series/Index. + join : Join lists contained as elements in the Series/Index. + + Examples + -------- + When not passing `others`, all values are concatenated into a single + string: + + >>> s = pd.Series(['a', 'b', np.nan, 'd']) + >>> s.str.cat(sep=' ') + 'a b d' + + By default, NA values in the Series are ignored. Using `na_rep`, they + can be given a representation: + + >>> s.str.cat(sep=' ', na_rep='?') + 'a b ? d' + + If `others` is specified, corresponding values are concatenated with + the separator. Result will be a Series of strings. + + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',') + 0 a,A + 1 b,B + 2 NaN + 3 d,D + dtype: object + + Missing values will remain missing in the result, but can again be + represented using `na_rep` + + >>> s.str.cat(['A', 'B', 'C', 'D'], sep=',', na_rep='-') + 0 a,A + 1 b,B + 2 -,C + 3 d,D + dtype: object + + If `sep` is not specified, the values are concatenated without + separation. + + >>> s.str.cat(['A', 'B', 'C', 'D'], na_rep='-') + 0 aA + 1 bB + 2 -C + 3 dD + dtype: object + + Series with different indexes can be aligned before concatenation. The + `join`-keyword works as in other methods. 
+ + >>> t = pd.Series(['d', 'a', 'e', 'c'], index=[3, 0, 4, 2]) + >>> s.str.cat(t, join='left', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='outer', na_rep='-') + 0 aa + 1 b- + 2 -c + 3 dd + 4 -e + dtype: object + >>> + >>> s.str.cat(t, join='inner', na_rep='-') + 0 aa + 2 -c + 3 dd + dtype: object + >>> + >>> s.str.cat(t, join='right', na_rep='-') + 3 dd + 0 aa + 4 -e + 2 -c + dtype: object + + For more examples, see :ref:`here `. + """ + # TODO: dispatch + from pandas import Index, Series, concat + + if isinstance(others, str): + raise ValueError("Did you mean to supply a `sep` keyword?") + if sep is None: + sep = "" + + if isinstance(self._orig, ABCIndexClass): + data = Series(self._orig, index=self._orig) + else: # Series + data = self._orig + + # concatenate Series/Index with itself if no "others" + if others is None: + data = ensure_object(data) + na_mask = isna(data) + if na_rep is None and na_mask.any(): + data = data[~na_mask] + elif na_rep is not None and na_mask.any(): + data = np.where(na_mask, na_rep, data) + return sep.join(data) + + try: + # turn anything in "others" into lists of Series + others = self._get_series_list(others) + except ValueError as err: # do not catch TypeError raised by _get_series_list + raise ValueError( + "If `others` contains arrays or lists (or other " + "list-likes without an index), these must all be " + "of the same length as the calling Series/Index." 
+ ) from err + + # align if required + if any(not data.index.equals(x.index) for x in others): + # Need to add keys for uniqueness in case of duplicate columns + others = concat( + others, + axis=1, + join=(join if join == "inner" else "outer"), + keys=range(len(others)), + sort=False, + copy=False, + ) + data, others = data.align(others, join=join) + others = [others[x] for x in others] # again list of Series + + all_cols = [ensure_object(x) for x in [data] + others] + na_masks = np.array([isna(x) for x in all_cols]) + union_mask = np.logical_or.reduce(na_masks, axis=0) + + if na_rep is None and union_mask.any(): + # no na_rep means NaNs for all rows where any column has a NaN + # only necessary if there are actually any NaNs + result = np.empty(len(data), dtype=object) + np.putmask(result, union_mask, np.nan) + + not_masked = ~union_mask + result[not_masked] = cat_safe([x[not_masked] for x in all_cols], sep) + elif na_rep is not None and union_mask.any(): + # fill NaNs with na_rep in case there are actually any NaNs + all_cols = [ + np.where(nm, na_rep, col) for nm, col in zip(na_masks, all_cols) + ] + result = cat_safe(all_cols, sep) + else: + # no NaNs - can just concatenate + result = cat_safe(all_cols, sep) + + if isinstance(self._orig, ABCIndexClass): + # add dtype for case that result is all-NA + result = Index(result, dtype=object, name=self._orig.name) + else: # Series + if is_categorical_dtype(self._orig.dtype): + # We need to infer the new categories. + dtype = None + else: + dtype = self._orig.dtype + result = Series(result, dtype=dtype, index=data.index, name=self._orig.name) + return result + + _shared_docs[ + "str_split" + ] = r""" + Split strings around given separator/delimiter. + + Splits the string in the Series/Index from the %(side)s, + at the specified delimiter string. Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + pat : str, optional + String or regular expression to split on. + If not specified, split on whitespace. 
+ n : int, default -1 (all) + Limit number of splits in output. + ``None``, 0 and -1 will be interpreted as return all splits. + expand : bool, default False + Expand the split strings into separate columns. + + * If ``True``, return DataFrame/MultiIndex expanding dimensionality. + * If ``False``, return Series/Index, containing lists of strings. + + Returns + ------- + Series, Index, DataFrame or MultiIndex + Type matches caller unless ``expand=True`` (see Notes). + + See Also + -------- + Series.str.split : Split strings around given separator/delimiter. + Series.str.rsplit : Splits string around given separator/delimiter, + starting from the right. + Series.str.join : Join lists contained as elements in the Series/Index + with passed delimiter. + str.split : Standard library version for split. + str.rsplit : Standard library version for rsplit. + + Notes + ----- + The handling of the `n` keyword depends on the number of found splits: + + - If found splits > `n`, make first `n` splits only + - If found splits <= `n`, make all splits + - If for a certain row the number of found splits < `n`, + append `None` for padding up to `n` if ``expand=True`` + + If using ``expand=True``, Series and Index callers return DataFrame and + MultiIndex objects, respectively. + + Examples + -------- + >>> s = pd.Series( + ... [ + ... "this is a regular sentence", + ... "https://docs.python.org/3/tutorial/index.html", + ... np.nan + ... ] + ... ) + >>> s + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html + 2 NaN + dtype: object + + In the default setting, the string is split by whitespace. + + >>> s.str.split() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + Without the `n` parameter, the outputs of `rsplit` and `split` + are identical. 
+ + >>> s.str.rsplit() + 0 [this, is, a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `n` parameter can be used to limit the number of splits on the + delimiter. The outputs of `split` and `rsplit` are different. + + >>> s.str.split(n=2) + 0 [this, is, a regular sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + >>> s.str.rsplit(n=2) + 0 [this is a, regular, sentence] + 1 [https://docs.python.org/3/tutorial/index.html] + 2 NaN + dtype: object + + The `pat` parameter can be used to split by other characters. + + >>> s.str.split(pat="/") + 0 [this is a regular sentence] + 1 [https:, , docs.python.org, 3, tutorial, index... + 2 NaN + dtype: object + + When using ``expand=True``, the split elements will expand out into + separate columns. If NaN is present, it is propagated throughout + the columns during the split. + + >>> s.str.split(expand=True) + 0 1 2 3 4 + 0 this is a regular sentence + 1 https://docs.python.org/3/tutorial/index.html None None None None + 2 NaN NaN NaN NaN NaN + + For slightly more complex use cases like splitting the html document name + from a url, a combination of parameter settings can be used. + + >>> s.str.rsplit("/", n=1, expand=True) + 0 1 + 0 this is a regular sentence None + 1 https://docs.python.org/3/tutorial index.html + 2 NaN NaN + + Remember to escape special characters when explicitly using regular + expressions. 
+ + >>> s = pd.Series(["1+1=2"]) + >>> s + 0 1+1=2 + dtype: object + >>> s.str.split(r"\+|=", expand=True) + 0 1 2 + 0 1 1 2 + """ + + @Appender(_shared_docs["str_split"] % {"side": "beginning", "method": "split"}) + @forbid_nonstring_types(["bytes"]) + def split(self, pat=None, n=-1, expand=False): + result = self._array._str_split(pat, n, expand) + return self._wrap_result(result, returns_string=expand, expand=expand) + + @Appender(_shared_docs["str_split"] % {"side": "end", "method": "rsplit"}) + @forbid_nonstring_types(["bytes"]) + def rsplit(self, pat=None, n=-1, expand=False): + result = self._array._str_rsplit(pat, n=n) + return self._wrap_result(result, expand=expand, returns_string=expand) + + _shared_docs[ + "str_partition" + ] = """ + Split the string at the %(side)s occurrence of `sep`. + + This method splits the string at the %(side)s occurrence of `sep`, + and returns 3 elements containing the part before the separator, + the separator itself, and the part after the separator. + If the separator is not found, return %(return)s. + + Parameters + ---------- + sep : str, default whitespace + String to split on. + expand : bool, default True + If True, return DataFrame/MultiIndex expanding dimensionality. + If False, return Series/Index. + + Returns + ------- + DataFrame/MultiIndex or Series/Index of objects + + See Also + -------- + %(also)s + Series.str.split : Split strings around given separators. + str.partition : Standard library version. 
+ + Examples + -------- + + >>> s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers']) + >>> s + 0 Linda van der Berg + 1 George Pitt-Rivers + dtype: object + + >>> s.str.partition() + 0 1 2 + 0 Linda van der Berg + 1 George Pitt-Rivers + + To partition by the last space instead of the first one: + + >>> s.str.rpartition() + 0 1 2 + 0 Linda van der Berg + 1 George Pitt-Rivers + + To partition by something different than a space: + + >>> s.str.partition('-') + 0 1 2 + 0 Linda van der Berg + 1 George Pitt - Rivers + + To return a Series containing tuples instead of a DataFrame: + + >>> s.str.partition('-', expand=False) + 0 (Linda van der Berg, , ) + 1 (George Pitt, -, Rivers) + dtype: object + + Also available on indices: + + >>> idx = pd.Index(['X 123', 'Y 999']) + >>> idx + Index(['X 123', 'Y 999'], dtype='object') + + Which will create a MultiIndex: + + >>> idx.str.partition() + MultiIndex([('X', ' ', '123'), + ('Y', ' ', '999')], + ) + + Or an index with tuples with ``expand=False``: + + >>> idx.str.partition(expand=False) + Index([('X', ' ', '123'), ('Y', ' ', '999')], dtype='object') + """ + + @Appender( + _shared_docs["str_partition"] + % { + "side": "first", + "return": "3 elements containing the string itself, followed by two " + "empty strings", + "also": "rpartition : Split the string at the last occurrence of `sep`.", + } + ) + @forbid_nonstring_types(["bytes"]) + def partition(self, sep=" ", expand=True): + result = self._array._str_partition(sep, expand) + return self._wrap_result(result, expand=expand, returns_string=expand) + + @Appender( + _shared_docs["str_partition"] + % { + "side": "last", + "return": "3 elements containing two empty strings, followed by the " + "string itself", + "also": "partition : Split the string at the first occurrence of `sep`.", + } + ) + @forbid_nonstring_types(["bytes"]) + def rpartition(self, sep=" ", expand=True): + result = self._array._str_rpartition(sep, expand) + return self._wrap_result(result, 
expand=expand, returns_string=expand) + + def get(self, i): + """ + Extract element from each component at specified position. + + Extract element from lists, tuples, or strings in each element in the + Series/Index. + + Parameters + ---------- + i : int + Position of element to extract. + + Returns + ------- + Series or Index + + Examples + -------- + >>> s = pd.Series(["String", + ... (1, 2, 3), + ... ["a", "b", "c"], + ... 123, + ... -456, + ... {1: "Hello", "2": "World"}]) + >>> s + 0 String + 1 (1, 2, 3) + 2 [a, b, c] + 3 123 + 4 -456 + 5 {1: 'Hello', '2': 'World'} + dtype: object + + >>> s.str.get(1) + 0 t + 1 2 + 2 b + 3 NaN + 4 NaN + 5 Hello + dtype: object + + >>> s.str.get(-1) + 0 g + 1 3 + 2 c + 3 NaN + 4 NaN + 5 None + dtype: object + """ + result = self._array._str_get(i) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def join(self, sep): + """ + Join lists contained as elements in the Series/Index with passed delimiter. + + If the elements of a Series are lists themselves, join the content of these + lists using the delimiter passed to the function. + This function is an equivalent to :meth:`str.join`. + + Parameters + ---------- + sep : str + Delimiter to use between list entries. + + Returns + ------- + Series/Index: object + The list entries concatenated by intervening occurrences of the + delimiter. + + Raises + ------ + AttributeError + If the supplied Series contains neither strings nor lists. + + See Also + -------- + str.join : Standard library version of this method. + Series.str.split : Split strings around given separator/delimiter. + + Notes + ----- + If any of the list items is not a string object, the result of the join + will be `NaN`. + + Examples + -------- + Example with a list that contains non-string elements. + + >>> s = pd.Series([['lion', 'elephant', 'zebra'], + ... [1.1, 2.2, 3.3], + ... ['cat', np.nan, 'dog'], + ... ['cow', 4.5, 'goat'], + ... 
['duck', ['swan', 'fish'], 'guppy']]) + >>> s + 0 [lion, elephant, zebra] + 1 [1.1, 2.2, 3.3] + 2 [cat, nan, dog] + 3 [cow, 4.5, goat] + 4 [duck, [swan, fish], guppy] + dtype: object + + Join all lists using a '-'. The lists containing object(s) of types other + than str will produce a NaN. + + >>> s.str.join('-') + 0 lion-elephant-zebra + 1 NaN + 2 NaN + 3 NaN + 4 NaN + dtype: object + """ + result = self._array._str_join(sep) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def contains(self, pat, case=True, flags=0, na=None, regex=True): + r""" + Test if pattern or regex is contained within a string of a Series or Index. + + Return boolean Series or Index based on whether a given pattern or regex is + contained within a string of a Series or Index. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Flags to pass through to the re module, e.g. re.IGNORECASE. + na : scalar, optional + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. + regex : bool, default True + If True, assumes the pat is a regular expression. + + If False, treats the pat as a literal string. + + Returns + ------- + Series or Index of boolean values + A Series or Index of boolean values indicating whether the + given pattern is contained within the string of each element + of the Series or Index. + + See Also + -------- + match : Analogous, but stricter, relying on re.match instead of re.search. + Series.str.startswith : Test if the start of each string element matches a + pattern. + Series.str.endswith : Same as startswith, but tests the end of string. + + Examples + -------- + Returning a Series of booleans using only a literal pattern. 
+ + >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN]) + >>> s1.str.contains('og', regex=False) + 0 False + 1 True + 2 False + 3 False + 4 NaN + dtype: object + + Returning an Index of booleans using only a literal pattern. + + >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]) + >>> ind.str.contains('23', regex=False) + Index([False, False, False, True, nan], dtype='object') + + Specifying case sensitivity using `case`. + + >>> s1.str.contains('oG', case=True, regex=True) + 0 False + 1 False + 2 False + 3 False + 4 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN` replaces NaN values + with `False`. If Series or Index does not contain NaN values + the resultant dtype will be `bool`, otherwise, an `object` dtype. + + >>> s1.str.contains('og', na=False, regex=True) + 0 False + 1 True + 2 False + 3 False + 4 False + dtype: bool + + Returning 'house' or 'dog' when either expression occurs in a string. + + >>> s1.str.contains('house|dog', regex=True) + 0 False + 1 True + 2 True + 3 False + 4 NaN + dtype: object + + Ignoring case sensitivity using `flags` with regex. + + >>> import re + >>> s1.str.contains('PARROT', flags=re.IGNORECASE, regex=True) + 0 False + 1 False + 2 True + 3 False + 4 NaN + dtype: object + + Returning any digit using regular expression. + + >>> s1.str.contains('\\d', regex=True) + 0 False + 1 False + 2 False + 3 True + 4 NaN + dtype: object + + Ensure `pat` is not a literal pattern when `regex` is set to True. + Note in the following example one might expect only `s2[1]` and `s2[3]` to + return `True`. However, '.0' as a regex matches any character + followed by a 0.
+ + >>> s2 = pd.Series(['40', '40.0', '41', '41.0', '35']) + >>> s2.str.contains('.0', regex=True) + 0 True + 1 True + 2 False + 3 True + 4 False + dtype: bool + """ + result = self._array._str_contains(pat, case, flags, na, regex) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def match(self, pat, case=True, flags=0, na=None): + """ + Determine if each string starts with a match of a regular expression. + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : scalar, optional + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. + + Returns + ------- + Series/array of boolean values + + See Also + -------- + fullmatch : Stricter matching that requires the entire string to match. + contains : Analogous, but less strict, relying on re.search instead of + re.match. + extract : Extract matched groups. + """ + result = self._array._str_match(pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def fullmatch(self, pat, case=True, flags=0, na=None): + """ + Determine if each string entirely matches a regular expression. + + .. versionadded:: 1.1.0 + + Parameters + ---------- + pat : str + Character sequence or regular expression. + case : bool, default True + If True, case sensitive. + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. + na : scalar, optional. + Fill value for missing values. The default depends on dtype of the + array. For object-dtype, ``numpy.nan`` is used. For ``StringDtype``, + ``pandas.NA`` is used. 
+ + Returns + ------- + Series/array of boolean values + + See Also + -------- + match : Similar, but also returns `True` when only a *prefix* of the string + matches the regular expression. + extract : Extract matched groups. + """ + result = self._array._str_fullmatch(pat, case=case, flags=flags, na=na) + return self._wrap_result(result, fill_value=na, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): + r""" + Replace each occurrence of pattern/regex in the Series/Index. + + Equivalent to :meth:`str.replace` or :func:`re.sub`, depending on + the regex value. + + Parameters + ---------- + pat : str or compiled regex + String can be a character sequence or regular expression. + repl : str or callable + Replacement string or a callable. The callable is passed the regex + match object and must return a replacement string to be used. + See :func:`re.sub`. + n : int, default -1 (all) + Number of replacements to make from start. + case : bool, default None + Determines if replace is case sensitive: + + - If True, case sensitive (the default if `pat` is a string) + - Set to False for case insensitive + - Cannot be set if `pat` is a compiled regex. + + flags : int, default 0 (no flags) + Regex module flags, e.g. re.IGNORECASE. Cannot be set if `pat` is a compiled + regex. + regex : bool, default True + Determines if assumes the passed-in pattern is a regular expression: + + - If True, assumes the passed-in pattern is a regular expression. + - If False, treats the pattern as a literal string + - Cannot be set to False if `pat` is a compiled regex or `repl` is + a callable. + + .. versionadded:: 0.23.0 + + Returns + ------- + Series or Index of object + A copy of the object with all matching occurrences of `pat` replaced by + `repl`. 
+ + Raises + ------ + ValueError + * if `regex` is False and `repl` is a callable or `pat` is a compiled + regex + * if `pat` is a compiled regex and `case` or `flags` is set + + Notes + ----- + When `pat` is a compiled regex, all flags should be included in the + compiled regex. Use of `case`, `flags`, or `regex=False` with a compiled + regex will raise an error. + + Examples + -------- + When `pat` is a string and `regex` is True (the default), the given `pat` + is compiled as a regex. When `repl` is a string, it replaces matching + regex patterns as with :meth:`re.sub`. NaN value(s) in the Series are + left as is: + + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f.', 'ba', regex=True) + 0 bao + 1 baz + 2 NaN + dtype: object + + When `pat` is a string and `regex` is False, every `pat` is replaced with + `repl` as with :meth:`str.replace`: + + >>> pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False) + 0 bao + 1 fuz + 2 NaN + dtype: object + + When `repl` is a callable, it is called on every `pat` using + :func:`re.sub`. The callable should expect one positional argument + (a regex object) and return a string. 
+ + To get the idea: + + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace('f', repr) + 0 <re.Match object; span=(0, 1), match='f'>oo + 1 <re.Match object; span=(0, 1), match='f'>uz + 2 NaN + dtype: object + + Reverse every lowercase alphabetic word: + + >>> repl = lambda m: m.group(0)[::-1] + >>> pd.Series(['foo 123', 'bar baz', np.nan]).str.replace(r'[a-z]+', repl) + 0 oof 123 + 1 rab zab + 2 NaN + dtype: object + + Using regex groups (extract second group and swap case): + + >>> pat = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" + >>> repl = lambda m: m.group('two').swapcase() + >>> pd.Series(['One Two Three', 'Foo Bar Baz']).str.replace(pat, repl) + 0 tWO + 1 bAR + dtype: object + + Using a compiled regex with flags + + >>> import re + >>> regex_pat = re.compile(r'FUZ', flags=re.IGNORECASE) + >>> pd.Series(['foo', 'fuz', np.nan]).str.replace(regex_pat, 'bar') + 0 foo + 1 bar + 2 NaN + dtype: object + """ + result = self._array._str_replace( + pat, repl, n=n, case=case, flags=flags, regex=regex + ) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def repeat(self, repeats): + """ + Duplicate each string in the Series or Index. + + Parameters + ---------- + repeats : int or sequence of int + Same value for all (int) or different value per (sequence). + + Returns + ------- + Series or Index of object + Series or Index of repeated string objects specified by + input parameter repeats. + + Examples + -------- + >>> s = pd.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + + Single int repeats string in Series + + >>> s.str.repeat(repeats=2) + 0 aa + 1 bb + 2 cc + dtype: object + + Sequence of int repeats corresponding string in Series + + >>> s.str.repeat(repeats=[1, 2, 3]) + 0 a + 1 bb + 2 ccc + dtype: object + """ + result = self._array._str_repeat(repeats) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def pad(self, width, side="left", fillchar=" "): + """ + Pad strings in the Series/Index up to width.
+ + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with character defined in `fillchar`. + side : {'left', 'right', 'both'}, default 'left' + Side from which to fill resulting string. + fillchar : str, default ' ' + Additional character for filling, default is whitespace. + + Returns + ------- + Series or Index of object + Returns Series or Index with minimum number of char in object. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='left')``. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='right')``. + Series.str.center : Fills both sides of strings with an arbitrary + character. Equivalent to ``Series.str.pad(side='both')``. + Series.str.zfill : Pad strings in the Series/Index by prepending '0' + character. Equivalent to ``Series.str.pad(side='left', fillchar='0')``. + + Examples + -------- + >>> s = pd.Series(["caribou", "tiger"]) + >>> s + 0 caribou + 1 tiger + dtype: object + + >>> s.str.pad(width=10) + 0 caribou + 1 tiger + dtype: object + + >>> s.str.pad(width=10, side='right', fillchar='-') + 0 caribou--- + 1 tiger----- + dtype: object + + >>> s.str.pad(width=10, side='both', fillchar='-') + 0 -caribou-- + 1 --tiger--- + dtype: object + """ + if not isinstance(fillchar, str): + msg = f"fillchar must be a character, not {type(fillchar).__name__}" + raise TypeError(msg) + + if len(fillchar) != 1: + raise TypeError("fillchar must be a character, not str") + + if not is_integer(width): + msg = f"width must be of integer type, not {type(width).__name__}" + raise TypeError(msg) + + result = self._array._str_pad(width, side=side, fillchar=fillchar) + return self._wrap_result(result) + + _shared_docs[ + "str_pad" + ] = """ + Pad %(side)s side of strings in the Series/Index. + + Equivalent to :meth:`str.%(method)s`. 
+ + Parameters + ---------- + width : int + Minimum width of resulting string; additional characters will be filled + with ``fillchar``. + fillchar : str + Additional character for filling, default is whitespace. + + Returns + ------- + filled : Series/Index of objects. + """ + + @Appender(_shared_docs["str_pad"] % dict(side="left and right", method="center")) + @forbid_nonstring_types(["bytes"]) + def center(self, width, fillchar=" "): + return self.pad(width, side="both", fillchar=fillchar) + + @Appender(_shared_docs["str_pad"] % dict(side="right", method="ljust")) + @forbid_nonstring_types(["bytes"]) + def ljust(self, width, fillchar=" "): + return self.pad(width, side="right", fillchar=fillchar) + + @Appender(_shared_docs["str_pad"] % dict(side="left", method="rjust")) + @forbid_nonstring_types(["bytes"]) + def rjust(self, width, fillchar=" "): + return self.pad(width, side="left", fillchar=fillchar) + + @forbid_nonstring_types(["bytes"]) + def zfill(self, width): + """ + Pad strings in the Series/Index by prepending '0' characters. + + Strings in the Series/Index are padded with '0' characters on the + left of the string to reach a total string length `width`. Strings + in the Series/Index with length greater than or equal to `width` are + unchanged. + + Parameters + ---------- + width : int + Minimum length of resulting string; strings with length less + than `width` will be prepended with '0' characters. + + Returns + ------- + Series/Index of objects. + + See Also + -------- + Series.str.rjust : Fills the left side of strings with an arbitrary + character. + Series.str.ljust : Fills the right side of strings with an arbitrary + character. + Series.str.pad : Fills the specified sides of strings with an arbitrary + character. + Series.str.center : Fills both sides of strings with an arbitrary + character. + + Notes + ----- + Differs from :meth:`str.zfill` which has special handling + for '+'/'-' in the string.
+ + Examples + -------- + >>> s = pd.Series(['-1', '1', '1000', 10, np.nan]) + >>> s + 0 -1 + 1 1 + 2 1000 + 3 10 + 4 NaN + dtype: object + + Note that ``10`` and ``NaN`` are not strings, therefore they are + converted to ``NaN``. The minus sign in ``'-1'`` is treated as a + regular character and the zero is added to the left of it + (:meth:`str.zfill` would have moved it to the left). ``1000`` + remains unchanged as it is longer than `width`. + + >>> s.str.zfill(3) + 0 0-1 + 1 001 + 2 1000 + 3 NaN + 4 NaN + dtype: object + """ + result = self.pad(width, side="left", fillchar="0") + return self._wrap_result(result) + + def slice(self, start=None, stop=None, step=None): + """ + Slice substrings from each element in the Series or Index. + + Parameters + ---------- + start : int, optional + Start position for slice operation. + stop : int, optional + Stop position for slice operation. + step : int, optional + Step size for slice operation. + + Returns + ------- + Series or Index of object + Series or Index from sliced substring from original string object. + + See Also + -------- + Series.str.slice_replace : Replace a slice with a string. + Series.str.get : Return element at position. + Equivalent to `Series.str.slice(start=i, stop=i+1)` with `i` + being the position. 
+ + Examples + -------- + >>> s = pd.Series(["koala", "fox", "chameleon"]) + >>> s + 0 koala + 1 fox + 2 chameleon + dtype: object + + >>> s.str.slice(start=1) + 0 oala + 1 ox + 2 hameleon + dtype: object + + >>> s.str.slice(start=-1) + 0 a + 1 x + 2 n + dtype: object + + >>> s.str.slice(stop=2) + 0 ko + 1 fo + 2 ch + dtype: object + + >>> s.str.slice(step=2) + 0 kaa + 1 fx + 2 caeen + dtype: object + + >>> s.str.slice(start=0, stop=5, step=3) + 0 kl + 1 f + 2 cm + dtype: object + + Equivalent behaviour to: + + >>> s.str[0:5:3] + 0 kl + 1 f + 2 cm + dtype: object + """ + result = self._array._str_slice(start, stop, step) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def slice_replace(self, start=None, stop=None, repl=None): + """ + Replace a positional slice of a string with another value. + + Parameters + ---------- + start : int, optional + Left index position to use for the slice. If not specified (None), + the slice is unbounded on the left, i.e. slice from the start + of the string. + stop : int, optional + Right index position to use for the slice. If not specified (None), + the slice is unbounded on the right, i.e. slice until the + end of the string. + repl : str, optional + String for replacement. If not specified (None), the sliced region + is replaced with an empty string. + + Returns + ------- + Series or Index + Same type as the original object. + + See Also + -------- + Series.str.slice : Just slicing without replacement. + + Examples + -------- + >>> s = pd.Series(['a', 'ab', 'abc', 'abdc', 'abcde']) + >>> s + 0 a + 1 ab + 2 abc + 3 abdc + 4 abcde + dtype: object + + Specify just `start`, meaning replace `start` until the end of the + string with `repl`. + + >>> s.str.slice_replace(1, repl='X') + 0 aX + 1 aX + 2 aX + 3 aX + 4 aX + dtype: object + + Specify just `stop`, meaning the start of the string to `stop` is replaced + with `repl`, and the rest of the string is included. 
+ + >>> s.str.slice_replace(stop=2, repl='X') + 0 X + 1 X + 2 Xc + 3 Xdc + 4 Xcde + dtype: object + + Specify `start` and `stop`, meaning the slice from `start` to `stop` is + replaced with `repl`. Everything before or after `start` and `stop` is + included as is. + + >>> s.str.slice_replace(start=1, stop=3, repl='X') + 0 aX + 1 aX + 2 aX + 3 aXc + 4 aXde + dtype: object + """ + result = self._array._str_slice_replace(start, stop, repl) + return self._wrap_result(result) + + def decode(self, encoding, errors="strict"): + """ + Decode character string in the Series/Index using indicated encoding. + + Equivalent to :meth:`str.decode` in python2 and :meth:`bytes.decode` in + python3. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + Series or Index + """ + # TODO: Add a similar _bytes interface. + if encoding in _cpython_optimized_decoders: + # CPython optimized implementation + f = lambda x: x.decode(encoding, errors) + else: + decoder = codecs.getdecoder(encoding) + f = lambda x: decoder(x, errors)[0] + arr = self._array + # assert isinstance(arr, (StringArray,)) + result = arr._str_map(f) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def encode(self, encoding, errors="strict"): + """ + Encode character string in the Series/Index using indicated encoding. + + Equivalent to :meth:`str.encode`. + + Parameters + ---------- + encoding : str + errors : str, optional + + Returns + ------- + encoded : Series/Index of objects + """ + result = self._array._str_encode(encoding, errors) + return self._wrap_result(result, returns_string=False) + + _shared_docs[ + "str_strip" + ] = r""" + Remove %(position)s characters. + + Strip whitespaces (including newlines) or a set of specified characters + from each string in the Series/Index from %(side)s. + Equivalent to :meth:`str.%(method)s`. + + Parameters + ---------- + to_strip : str or None, default None + Specifying the set of characters to be removed. 
+ All combinations of this set of characters will be stripped. + If None then whitespaces are removed. + + Returns + ------- + Series or Index of object + + See Also + -------- + Series.str.strip : Remove leading and trailing characters in Series/Index. + Series.str.lstrip : Remove leading characters in Series/Index. + Series.str.rstrip : Remove trailing characters in Series/Index. + + Examples + -------- + >>> s = pd.Series(['1. Ant. ', '2. Bee!\n', '3. Cat?\t', np.nan]) + >>> s + 0 1. Ant. + 1 2. Bee!\n + 2 3. Cat?\t + 3 NaN + dtype: object + + >>> s.str.strip() + 0 1. Ant. + 1 2. Bee! + 2 3. Cat? + 3 NaN + dtype: object + + >>> s.str.lstrip('123.') + 0 Ant. + 1 Bee!\n + 2 Cat?\t + 3 NaN + dtype: object + + >>> s.str.rstrip('.!? \n\t') + 0 1. Ant + 1 2. Bee + 2 3. Cat + 3 NaN + dtype: object + + >>> s.str.strip('123.!? \n\t') + 0 Ant + 1 Bee + 2 Cat + 3 NaN + dtype: object + """ + + @Appender( + _shared_docs["str_strip"] + % dict( + side="left and right sides", method="strip", position="leading and trailing" + ) + ) + @forbid_nonstring_types(["bytes"]) + def strip(self, to_strip=None): + result = self._array._str_strip(to_strip) + return self._wrap_result(result) + + @Appender( + _shared_docs["str_strip"] + % dict(side="left side", method="lstrip", position="leading") + ) + @forbid_nonstring_types(["bytes"]) + def lstrip(self, to_strip=None): + result = self._array._str_lstrip(to_strip) + return self._wrap_result(result) + + @Appender( + _shared_docs["str_strip"] + % dict(side="right side", method="rstrip", position="trailing") + ) + @forbid_nonstring_types(["bytes"]) + def rstrip(self, to_strip=None): + result = self._array._str_rstrip(to_strip) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def wrap(self, width, **kwargs): + r""" + Wrap strings in Series/Index at specified line width. + + This method has the same keyword parameters and defaults as + :class:`textwrap.TextWrapper`. 
+ + Parameters + ---------- + width : int + Maximum line width. + expand_tabs : bool, optional + If True, tab characters will be expanded to spaces (default: True). + replace_whitespace : bool, optional + If True, each whitespace character (as defined by string.whitespace) + remaining after tab expansion will be replaced by a single space + (default: True). + drop_whitespace : bool, optional + If True, whitespace that, after wrapping, happens to end up at the + beginning or end of a line is dropped (default: True). + break_long_words : bool, optional + If True, then words longer than width will be broken in order to ensure + that no lines are longer than width. If it is false, long words will + not be broken, and some lines may be longer than width (default: True). + break_on_hyphens : bool, optional + If True, wrapping will occur preferably on whitespace and right after + hyphens in compound words, as it is customary in English. If false, + only whitespaces will be considered as potentially good places for line + breaks, but you need to set break_long_words to false if you want truly + insecable words (default: True). + + Returns + ------- + Series or Index + + Notes + ----- + Internally, this method uses a :class:`textwrap.TextWrapper` instance with + default settings. To achieve behavior matching R's stringr library str_wrap + function, use the arguments: + + - expand_tabs = False + - replace_whitespace = True + - drop_whitespace = True + - break_long_words = False + - break_on_hyphens = False + + Examples + -------- + >>> s = pd.Series(['line to be wrapped', 'another line to be wrapped']) + >>> s.str.wrap(12) + 0 line to be\nwrapped + 1 another line\nto be\nwrapped + dtype: object + """ + result = self._array._str_wrap(width, **kwargs) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def get_dummies(self, sep="|"): + """ + Return DataFrame of dummy/indicator variables for Series. 
+ + Each string in Series is split by sep and returned as a DataFrame + of dummy/indicator variables. + + Parameters + ---------- + sep : str, default "|" + String to split on. + + Returns + ------- + DataFrame + Dummy variables corresponding to values of the Series. + + See Also + -------- + get_dummies : Convert categorical variable into dummy/indicator + variables. + + Examples + -------- + >>> pd.Series(['a|b', 'a', 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 1 0 0 + 2 1 0 1 + + >>> pd.Series(['a|b', np.nan, 'a|c']).str.get_dummies() + a b c + 0 1 1 0 + 1 0 0 0 + 2 1 0 1 + """ + # we need to cast to Series of strings as only that has all + # methods available for making the dummies... + result, name = self._array._str_get_dummies(sep) + return self._wrap_result( + result, + name=name, + expand=True, + returns_string=False, + ) + + @forbid_nonstring_types(["bytes"]) + def translate(self, table): + """ + Map all characters in the string through the given mapping table. + + Equivalent to standard :meth:`str.translate`. + + Parameters + ---------- + table : dict + Table is a mapping of Unicode ordinals to Unicode ordinals, strings, or + None. Unmapped characters are left untouched. + Characters mapped to None are deleted. :meth:`str.maketrans` is a + helper function for making translation tables. + + Returns + ------- + Series or Index + """ + result = self._array._str_translate(table) + return self._wrap_result(result) + + @forbid_nonstring_types(["bytes"]) + def count(self, pat, flags=0): + r""" + Count occurrences of pattern in each string of the Series/Index. + + This function is used to count the number of times a particular regex + pattern is repeated in each of the string elements of the + :class:`~pandas.Series`. + + Parameters + ---------- + pat : str + Valid regular expression. + flags : int, default 0, meaning no flags + Flags for the `re` module. For a complete list, `see here + <https://docs.python.org/3/howto/regex.html#compilation-flags>`_. + **kwargs + For compatibility with other string methods. Not used.
+ + Returns + ------- + Series or Index + Same type as the calling object containing the integer counts. + + See Also + -------- + re : Standard library module for regular expressions. + str.count : Standard library version, without regular expression support. + + Notes + ----- + Some characters need to be escaped when passing in `pat`. + eg. ``'$'`` has a special meaning in regex and must be escaped when + finding this literal character. + + Examples + -------- + >>> s = pd.Series(['A', 'B', 'Aaba', 'Baca', np.nan, 'CABA', 'cat']) + >>> s.str.count('a') + 0 0.0 + 1 0.0 + 2 2.0 + 3 2.0 + 4 NaN + 5 0.0 + 6 1.0 + dtype: float64 + + Escape ``'$'`` to find the literal dollar sign. + + >>> s = pd.Series(['$', 'B', 'Aab$', '$$ca', 'C$B$', 'cat']) + >>> s.str.count('\\$') + 0 1 + 1 0 + 2 1 + 3 2 + 4 2 + 5 0 + dtype: int64 + + This is also available on Index + + >>> pd.Index(['A', 'A', 'Aaba', 'cat']).str.count('a') + Int64Index([0, 0, 2, 1], dtype='int64') + """ + result = self._array._str_count(pat, flags) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def startswith(self, pat, na=None): + """ + Test if the start of each string element matches a pattern. + + Equivalent to :meth:`str.startswith`. + + Parameters + ---------- + pat : str + Character sequence. Regular expressions are not accepted. + na : object, default NaN + Object shown if element tested is not a string. The default depends + on dtype of the array. For object-dtype, ``numpy.nan`` is used. + For ``StringDtype``, ``pandas.NA`` is used. + + Returns + ------- + Series or Index of bool + A Series of booleans indicating whether the given pattern matches + the start of each string element. + + See Also + -------- + str.startswith : Python standard library string method. + Series.str.endswith : Same as startswith, but tests the end of string. + Series.str.contains : Tests if string element contains a pattern. 
+ + Examples + -------- + >>> s = pd.Series(['bat', 'Bear', 'cat', np.nan]) + >>> s + 0 bat + 1 Bear + 2 cat + 3 NaN + dtype: object + + >>> s.str.startswith('b') + 0 True + 1 False + 2 False + 3 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN`. + + >>> s.str.startswith('b', na=False) + 0 True + 1 False + 2 False + 3 False + dtype: bool + """ + result = self._array._str_startswith(pat, na=na) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def endswith(self, pat, na=None): + """ + Test if the end of each string element matches a pattern. + + Equivalent to :meth:`str.endswith`. + + Parameters + ---------- + pat : str + Character sequence. Regular expressions are not accepted. + na : object, default NaN + Object shown if element tested is not a string. The default depends + on dtype of the array. For object-dtype, ``numpy.nan`` is used. + For ``StringDtype``, ``pandas.NA`` is used. + + Returns + ------- + Series or Index of bool + A Series of booleans indicating whether the given pattern matches + the end of each string element. + + See Also + -------- + str.endswith : Python standard library string method. + Series.str.startswith : Same as endswith, but tests the start of string. + Series.str.contains : Tests if string element contains a pattern. + + Examples + -------- + >>> s = pd.Series(['bat', 'bear', 'caT', np.nan]) + >>> s + 0 bat + 1 bear + 2 caT + 3 NaN + dtype: object + + >>> s.str.endswith('t') + 0 True + 1 False + 2 False + 3 NaN + dtype: object + + Specifying `na` to be `False` instead of `NaN`. + + >>> s.str.endswith('t', na=False) + 0 True + 1 False + 2 False + 3 False + dtype: bool + """ + result = self._array._str_endswith(pat, na=na) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def findall(self, pat, flags=0): + """ + Find all occurrences of pattern or regular expression in the Series/Index. 
+ + Equivalent to applying :func:`re.findall` to all the elements in the + Series/Index. + + Parameters + ---------- + pat : str + Pattern or regular expression. + flags : int, default 0 + Flags from ``re`` module, e.g. `re.IGNORECASE` (default is 0, which + means no flags). + + Returns + ------- + Series/Index of lists of strings + All non-overlapping matches of pattern or regular expression in each + string of this Series/Index. + + See Also + -------- + count : Count occurrences of pattern or regular expression in each string + of the Series/Index. + extractall : For each string in the Series, extract groups from all matches + of regular expression and return a DataFrame with one row for each + match and one column for each group. + re.findall : The equivalent ``re`` function to all non-overlapping matches + of pattern or regular expression in string, as a list of strings. + + Examples + -------- + >>> s = pd.Series(['Lion', 'Monkey', 'Rabbit']) + + The search for the pattern 'Monkey' returns one match: + + >>> s.str.findall('Monkey') + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + On the other hand, the search for the pattern 'MONKEY' doesn't return any + match: + + >>> s.str.findall('MONKEY') + 0 [] + 1 [] + 2 [] + dtype: object + + Flags can be added to the pattern or regular expression. For instance, + to find the pattern 'MONKEY' ignoring the case: + + >>> import re + >>> s.str.findall('MONKEY', flags=re.IGNORECASE) + 0 [] + 1 [Monkey] + 2 [] + dtype: object + + When the pattern matches more than one string in the Series, all matches + are returned: + + >>> s.str.findall('on') + 0 [on] + 1 [on] + 2 [] + dtype: object + + Regular expressions are supported too. 
For instance, the search for all the + strings ending with the word 'on' is shown next: + + >>> s.str.findall('on$') + 0 [on] + 1 [] + 2 [] + dtype: object + + If the pattern is found more than once in the same string, then a list of + multiple strings is returned: + + >>> s.str.findall('b') + 0 [] + 1 [] + 2 [b, b] + dtype: object + """ + result = self._array._str_findall(pat, flags) + return self._wrap_result(result, returns_string=False) + + @forbid_nonstring_types(["bytes"]) + def extract(self, pat, flags=0, expand=True): + r""" + Extract capture groups in the regex `pat` as columns in a DataFrame. + + For each subject string in the Series, extract groups from the + first match of regular expression `pat`. + + Parameters + ---------- + pat : str + Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + Flags from the ``re`` module, e.g. ``re.IGNORECASE``, that + modify regular expression matching for things like case, + spaces, etc. For more details, see :mod:`re`. + expand : bool, default True + If True, return DataFrame with one column per capture group. + If False, return a Series/Index if there is one capture group + or DataFrame if there are multiple capture groups. + + Returns + ------- + DataFrame or Series or Index + A DataFrame with one row for each subject string, and one + column for each group. Any capture group names in regular + expression pat will be used for column names; otherwise + capture group numbers will be used. The dtype of each result + column is always object, even when no match is found. If + ``expand=False`` and pat has only one capture group, then + return a Series (if subject is a Series) or Index (if subject + is an Index). + + See Also + -------- + extractall : Returns all matches (not just the first match). + + Examples + -------- + A pattern with two groups will return a DataFrame with two columns. + Non-matches will be NaN. 
+ + >>> s = pd.Series(['a1', 'b2', 'c3']) + >>> s.str.extract(r'([ab])(\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 NaN NaN + + A pattern may contain optional groups. + + >>> s.str.extract(r'([ab])?(\d)') + 0 1 + 0 a 1 + 1 b 2 + 2 NaN 3 + + Named groups will become column names in the result. + + >>> s.str.extract(r'(?P<letter>[ab])(?P<digit>\d)') + letter digit + 0 a 1 + 1 b 2 + 2 NaN NaN + + A pattern with one group will return a DataFrame with one column + if expand=True. + + >>> s.str.extract(r'[ab](\d)', expand=True) + 0 + 0 1 + 1 2 + 2 NaN + + A pattern with one group will return a Series if expand=False. + + >>> s.str.extract(r'[ab](\d)', expand=False) + 0 1 + 1 2 + 2 NaN + dtype: object + """ + # TODO: dispatch + return str_extract(self, pat, flags, expand=expand) + + @forbid_nonstring_types(["bytes"]) + def extractall(self, pat, flags=0): + r""" + Extract capture groups in the regex `pat` as columns in DataFrame. + + For each subject string in the Series, extract groups from all + matches of regular expression pat. When each subject string in the + Series has exactly one match, extractall(pat).xs(0, level='match') + is the same as extract(pat). + + Parameters + ---------- + pat : str + Regular expression pattern with capturing groups. + flags : int, default 0 (no flags) + A ``re`` module flag, for example ``re.IGNORECASE``. These allow + to modify regular expression matching for things like case, spaces, + etc. Multiple flags can be combined with the bitwise OR operator, + for example ``re.IGNORECASE | re.MULTILINE``. + + Returns + ------- + DataFrame + A ``DataFrame`` with one row for each match, and one column for each + group. Its rows have a ``MultiIndex`` with first levels that come from + the subject ``Series``. The last level is named 'match' and indexes the + matches in each item of the ``Series``. Any capture group names in + regular expression pat will be used for column names; otherwise capture + group numbers will be used.
+ + See Also + -------- + extract : Returns first match only (not all matches). + + Examples + -------- + A pattern with one group will return a DataFrame with one column. + Indices with no matches will not appear in the result. + + >>> s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"]) + >>> s.str.extractall(r"[ab](\d)") + 0 + match + A 0 1 + 1 2 + B 0 1 + + Capture group names are used for column names of the result. + + >>> s.str.extractall(r"[ab](?P\d)") + digit + match + A 0 1 + 1 2 + B 0 1 + + A pattern with two groups will return a DataFrame with two columns. + + >>> s.str.extractall(r"(?P[ab])(?P\d)") + letter digit + match + A 0 a 1 + 1 a 2 + B 0 b 1 + + Optional groups that do not match are NaN in the result. + + >>> s.str.extractall(r"(?P[ab])?(?P\d)") + letter digit + match + A 0 a 1 + 1 a 2 + B 0 b 1 + C 0 NaN 1 + """ + # TODO: dispatch + return str_extractall(self._orig, pat, flags) + + _shared_docs[ + "find" + ] = """ + Return %(side)s indexes in each strings in the Series/Index. + + Each of returned indexes corresponds to the position where the + substring is fully contained between [start:end]. Return -1 on + failure. Equivalent to standard :meth:`str.%(method)s`. + + Parameters + ---------- + sub : str + Substring being searched. + start : int + Left edge index. + end : int + Right edge index. + + Returns + ------- + Series or Index of int. 
@Appender(
    _shared_docs["find"]
    % {
        "side": "lowest",
        "method": "find",
        "also": "rfind : Return highest indexes in each strings.",
    }
)
@forbid_nonstring_types(["bytes"])
def find(self, sub, start=0, end=None):
    # No docstring here on purpose: @Appender attaches the shared "find"
    # template, filled in for the lowest-index variant.
    if isinstance(sub, str):
        positions = self._array._str_find(sub, start, end)
        return self._wrap_result(positions, returns_string=False)

    # Anything but a str needle is rejected before touching the array.
    raise TypeError(f"expected a string object, not {type(sub).__name__}")
@Appender(
    _shared_docs["index"]
    % {
        "side": "lowest",
        "similar": "find",
        "method": "index",
        "also": "rindex : Return highest indexes in each strings.",
    }
)
@forbid_nonstring_types(["bytes"])
def index(self, sub, start=0, end=None):
    # No docstring here on purpose: @Appender attaches the shared "index"
    # template, filled in for the lowest-index variant.
    if isinstance(sub, str):
        positions = self._array._str_index(sub, start=start, end=end)
        return self._wrap_result(positions, returns_string=False)

    # Anything but a str needle is rejected before touching the array.
    raise TypeError(f"expected a string object, not {type(sub).__name__}")
('one', 'two', 'three')]) + >>> s + 0 dog + 1 + 2 5 + 3 {'foo': 'bar'} + 4 [2, 3, 5, 7] + 5 (one, two, three) + dtype: object + >>> s.str.len() + 0 3.0 + 1 0.0 + 2 NaN + 3 1.0 + 4 4.0 + 5 3.0 + dtype: float64 + """ + result = self._array._str_len() + return self._wrap_result(result, returns_string=False) + + _shared_docs[ + "casemethods" + ] = """ + Convert strings in the Series/Index to %(type)s. + %(version)s + Equivalent to :meth:`str.%(method)s`. + + Returns + ------- + Series or Index of object + + See Also + -------- + Series.str.lower : Converts all characters to lowercase. + Series.str.upper : Converts all characters to uppercase. + Series.str.title : Converts first character of each word to uppercase and + remaining to lowercase. + Series.str.capitalize : Converts first character to uppercase and + remaining to lowercase. + Series.str.swapcase : Converts uppercase to lowercase and lowercase to + uppercase. + Series.str.casefold: Removes all case distinctions in the string. + + Examples + -------- + >>> s = pd.Series(['lower', 'CAPITALS', 'this is a sentence', 'SwApCaSe']) + >>> s + 0 lower + 1 CAPITALS + 2 this is a sentence + 3 SwApCaSe + dtype: object + + >>> s.str.lower() + 0 lower + 1 capitals + 2 this is a sentence + 3 swapcase + dtype: object + + >>> s.str.upper() + 0 LOWER + 1 CAPITALS + 2 THIS IS A SENTENCE + 3 SWAPCASE + dtype: object + + >>> s.str.title() + 0 Lower + 1 Capitals + 2 This Is A Sentence + 3 Swapcase + dtype: object + + >>> s.str.capitalize() + 0 Lower + 1 Capitals + 2 This is a sentence + 3 Swapcase + dtype: object + + >>> s.str.swapcase() + 0 LOWER + 1 capitals + 2 THIS IS A SENTENCE + 3 sWaPcAsE + dtype: object + """ + # Types: + # cases: + # upper, lower, title, capitalize, swapcase, casefold + # boolean: + # isalpha, isnumeric isalnum isdigit isdecimal isspace islower isupper istitle + # _doc_args holds dict of strings to use in substituting casemethod docs + _doc_args: Dict[str, Dict[str, str]] = {} + _doc_args["lower"] = 
dict(type="lowercase", method="lower", version="") + _doc_args["upper"] = dict(type="uppercase", method="upper", version="") + _doc_args["title"] = dict(type="titlecase", method="title", version="") + _doc_args["capitalize"] = dict( + type="be capitalized", method="capitalize", version="" + ) + _doc_args["swapcase"] = dict(type="be swapcased", method="swapcase", version="") + _doc_args["casefold"] = dict( + type="be casefolded", + method="casefold", + version="\n .. versionadded:: 0.25.0\n", + ) + + @Appender(_shared_docs["casemethods"] % _doc_args["lower"]) + @forbid_nonstring_types(["bytes"]) + def lower(self): + result = self._array._str_lower() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["upper"]) + @forbid_nonstring_types(["bytes"]) + def upper(self): + result = self._array._str_upper() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["title"]) + @forbid_nonstring_types(["bytes"]) + def title(self): + result = self._array._str_title() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["capitalize"]) + @forbid_nonstring_types(["bytes"]) + def capitalize(self): + result = self._array._str_capitalize() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["swapcase"]) + @forbid_nonstring_types(["bytes"]) + def swapcase(self): + result = self._array._str_swapcase() + return self._wrap_result(result) + + @Appender(_shared_docs["casemethods"] % _doc_args["casefold"]) + @forbid_nonstring_types(["bytes"]) + def casefold(self): + result = self._array._str_casefold() + return self._wrap_result(result) + + _shared_docs[ + "ismethods" + ] = """ + Check whether all characters in each string are %(type)s. + + This is equivalent to running the Python string method + :meth:`str.%(method)s` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. 
+ + Returns + ------- + Series or Index of bool + Series or Index of boolean values with the same length as the original + Series/Index. + + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + + Examples + -------- + **Checks for Alphabetic and Numeric Characters** + + >>> s1 = pd.Series(['one', 'one1', '1', '']) + + >>> s1.str.isalpha() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + >>> s1.str.isnumeric() + 0 False + 1 False + 2 True + 3 False + dtype: bool + + >>> s1.str.isalnum() + 0 True + 1 True + 2 True + 3 False + dtype: bool + + Note that checks against characters mixed with any additional punctuation + or whitespace will evaluate to false for an alphanumeric check. + + >>> s2 = pd.Series(['A B', '1.5', '3,000']) + >>> s2.str.isalnum() + 0 False + 1 False + 2 False + dtype: bool + + **More Detailed Checks for Numeric Characters** + + There are several different but overlapping sets of numeric characters that + can be checked for. + + >>> s3 = pd.Series(['23', '³', '⅕', '']) + + The ``s3.str.isdecimal`` method checks for characters used to form numbers + in base 10. + + >>> s3.str.isdecimal() + 0 True + 1 False + 2 False + 3 False + dtype: bool + + The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also + includes special digits, like superscripted and subscripted digits in + unicode. 
# Boolean "is*" accessors. Each one is produced by _map_and_wrap and receives
# the shared "ismethods" docstring template filled in with its OWN _doc_args
# entry, so the rendered help names the matching ``str.<method>``.
isalnum = _map_and_wrap(
    "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"]
)
isalpha = _map_and_wrap(
    "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"]
)
isdigit = _map_and_wrap(
    "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"]
)
# Fixed: this previously substituted _doc_args["isalnum"], so the isspace
# docstring claimed to check for alphanumeric characters.
isspace = _map_and_wrap(
    "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"]
)
islower = _map_and_wrap(
    "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"]
)
isupper = _map_and_wrap(
    "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"]
)
istitle = _map_and_wrap(
    "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"]
)
isnumeric = _map_and_wrap(
    "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"]
)
isdecimal = _map_and_wrap(
    "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"]
)
def cat_core(list_of_columns: List, sep: str):
    """
    Concatenate columns element-wise, with `sep` between neighbours.

    Helper behind :meth:`str.cat`.

    Parameters
    ----------
    list_of_columns : list of numpy arrays
        The arrays to glue together position by position; they must not
        contain NaNs.
    sep : string
        Separator placed between consecutive columns.

    Returns
    -------
    nd.array
        Element-wise concatenation of `list_of_columns` joined by `sep`.
    """
    if sep != "":
        # Weave the separator in: [col0, sep, col1, sep, ..., colN].
        pieces = [sep] * (2 * len(list_of_columns) - 1)
        pieces[::2] = list_of_columns
    else:
        # An empty separator contributes nothing, so sum the columns directly.
        pieces = list_of_columns

    # Summing object arrays along axis 0 applies ``+`` (string concatenation)
    # across the pieces for every row.
    stacked = np.asarray(pieces, dtype=object)
    return np.sum(stacked, axis=0)
def _str_extract_frame(arr, pat, flags=0):
    """
    Extract the groups of the first match of `pat` from every subject string.

    Backs ``str_extract(expand=True)``; the result is always a DataFrame with
    one column per capture group.
    """
    from pandas import DataFrame

    regex = re.compile(pat, flags=flags)
    # Built first so that a pattern without capture groups fails right away.
    row_builder = _groups_or_na_fun(regex)

    # Column label per group: its name when the group is named, otherwise
    # its zero-based position.
    name_by_number = {number: name for name, number in regex.groupindex.items()}
    columns = [name_by_number.get(pos + 1, pos) for pos in range(regex.groups)]

    if len(arr) == 0:
        return DataFrame(columns=columns, dtype=object)

    # Inputs without an ``index`` attribute get the default index.
    result_index = getattr(arr, "index", None)
    rows = [row_builder(val) for val in arr]
    return DataFrame(
        rows,
        columns=columns,
        index=result_index,
        dtype=_result_dtype(arr),
    )
class BaseStringArrayMethods(abc.ABC):
    """
    Base class for extension arrays implementing string methods.

    This is where our ExtensionArrays can override the implementation of
    Series.str.<method>. We don't expect this to work with
    3rd-party extension arrays.

    * User calls Series.str.<method>
    * pandas extracts the extension array from the Series
    * pandas calls ``extension_array._str_<method>(*args, **kwargs)``
    * pandas wraps the result, to return to the user.

    See :ref:`Series.str` for the docstring of each method.
    """

    def _str_getitem(self, key):
        # The only concrete method: a slice key is routed to _str_slice,
        # any other key to _str_get.
        if isinstance(key, slice):
            return self._str_slice(start=key.start, stop=key.stop, step=key.step)
        else:
            return self._str_get(key)

    # ------------------------------------------------------------------
    # Everything below is abstract; subclasses must provide each method.

    # Searching, matching and replacing
    @abc.abstractmethod
    def _str_count(self, pat, flags=0):
        pass

    @abc.abstractmethod
    def _str_pad(self, width, side="left", fillchar=" "):
        pass

    @abc.abstractmethod
    def _str_contains(self, pat, case=True, flags=0, na=None, regex=True):
        pass

    @abc.abstractmethod
    def _str_startswith(self, pat, na=None):
        pass

    @abc.abstractmethod
    def _str_endswith(self, pat, na=None):
        pass

    @abc.abstractmethod
    def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
        pass

    @abc.abstractmethod
    def _str_repeat(self, repeats):
        pass

    @abc.abstractmethod
    def _str_match(
        self,
        pat: Union[str, Pattern],
        case: bool = True,
        flags: int = 0,
        na: Scalar = np.nan,
    ):
        pass

    @abc.abstractmethod
    def _str_fullmatch(
        self,
        pat: Union[str, Pattern],
        case: bool = True,
        flags: int = 0,
        na: Scalar = np.nan,
    ):
        pass

    @abc.abstractmethod
    def _str_encode(self, encoding, errors="strict"):
        pass

    @abc.abstractmethod
    def _str_find(self, sub, start=0, end=None):
        pass

    @abc.abstractmethod
    def _str_rfind(self, sub, start=0, end=None):
        pass

    @abc.abstractmethod
    def _str_findall(self, pat, flags=0):
        pass

    @abc.abstractmethod
    def _str_get(self, i):
        pass

    @abc.abstractmethod
    def _str_index(self, sub, start=0, end=None):
        pass

    @abc.abstractmethod
    def _str_rindex(self, sub, start=0, end=None):
        pass

    # Joining, partitioning, slicing
    @abc.abstractmethod
    def _str_join(self, sep):
        pass

    @abc.abstractmethod
    def _str_partition(self, sep, expand):
        pass

    @abc.abstractmethod
    def _str_rpartition(self, sep, expand):
        pass

    @abc.abstractmethod
    def _str_len(self):
        pass

    @abc.abstractmethod
    def _str_slice(self, start=None, stop=None, step=None):
        pass

    @abc.abstractmethod
    def _str_slice_replace(self, start=None, stop=None, repl=None):
        pass

    @abc.abstractmethod
    def _str_translate(self, table):
        pass

    @abc.abstractmethod
    def _str_wrap(self, width, **kwargs):
        pass

    @abc.abstractmethod
    def _str_get_dummies(self, sep="|"):
        pass

    # Character-class predicates
    @abc.abstractmethod
    def _str_isalnum(self):
        pass

    @abc.abstractmethod
    def _str_isalpha(self):
        pass

    @abc.abstractmethod
    def _str_isdecimal(self):
        pass

    @abc.abstractmethod
    def _str_isdigit(self):
        pass

    @abc.abstractmethod
    def _str_islower(self):
        pass

    @abc.abstractmethod
    def _str_isnumeric(self):
        pass

    @abc.abstractmethod
    def _str_isspace(self):
        pass

    @abc.abstractmethod
    def _str_istitle(self):
        pass

    @abc.abstractmethod
    def _str_isupper(self):
        pass

    # Case conversion and normalization
    @abc.abstractmethod
    def _str_capitalize(self):
        pass

    @abc.abstractmethod
    def _str_casefold(self):
        pass

    @abc.abstractmethod
    def _str_title(self):
        pass

    @abc.abstractmethod
    def _str_swapcase(self):
        pass

    @abc.abstractmethod
    def _str_lower(self):
        pass

    @abc.abstractmethod
    def _str_upper(self):
        pass

    @abc.abstractmethod
    def _str_normalize(self, form):
        pass

    # Stripping and splitting
    @abc.abstractmethod
    def _str_strip(self, to_strip=None):
        pass

    @abc.abstractmethod
    def _str_lstrip(self, to_strip=None):
        pass

    @abc.abstractmethod
    def _str_rstrip(self, to_strip=None):
        pass

    @abc.abstractmethod
    def _str_split(self, pat=None, n=-1, expand=False):
        pass

    @abc.abstractmethod
    def _str_rsplit(self, pat=None, n=-1):
        pass
def _str_map(self, f, na_value=None, dtype=None):
    """
    Map a callable over the valid elements of the array.

    Parameters
    ----------
    f : Callable
        A function to call on each non-NA element.
    na_value : Scalar, optional
        The value to set for NA values. Might also be used for the
        fill value if the callable `f` raises an exception.
        This defaults to ``self._str_na_value`` which is ``np.nan``
        for object-dtype and Categorical and ``pd.NA`` for StringArray.
    dtype : Dtype, optional
        The dtype of the result array. Defaults to object dtype.

    Notes
    -----
    If `f` raises ``TypeError`` or ``AttributeError`` the mapping is retried
    with a wrapper that returns `na_value` for the failing elements, unless
    the error message looks like a wrong-number-of-arguments error, which is
    re-raised.
    """
    arr = self
    if dtype is None:
        dtype = np.dtype("object")
    if na_value is None:
        na_value = self._str_na_value

    # Empty input: nothing to map, hand back a zero-length array of `dtype`.
    if not len(arr):
        return np.ndarray(0, dtype=dtype)

    if not isinstance(arr, np.ndarray):
        arr = np.asarray(arr, dtype=object)
    mask = isna(arr)
    # Conversion is switched off when every element is NA.
    convert = not np.all(mask)
    try:
        # NOTE(review): presumably applies `f` only where the mask is unset;
        # confirm against lib.map_infer_mask.
        result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
    except (TypeError, AttributeError) as e:
        # Reraise the exception if callable `f` got wrong number of args.
        # The user may want to be warned by this, instead of getting NaN
        p_err = (
            r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
            r"(?(3)required )positional arguments?"
        )

        if len(e.args) >= 1 and re.search(p_err, e.args[0]):
            # FIXME: this should be totally avoidable
            raise e

        def g(x):
            # This type of fallback behavior can be removed once
            # we remove object-dtype .str accessor.
            try:
                return f(x)
            except (TypeError, AttributeError):
                return na_value

        # Second pass with the forgiving wrapper; `g` swallows the errors
        # that triggered this branch, so the retry does not loop on them.
        return self._str_map(g, na_value=na_value, dtype=dtype)
    if na_value is not np.nan:
        # Overwrite the masked (NA) positions with the requested fill value.
        np.putmask(result, mask, na_value)
        if result.dtype == object:
            result = lib.maybe_convert_objects(result)
    return result
def _str_replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
    """
    Replace occurrences of `pat` with `repl` in every element.

    `pat` may be a plain string or a compiled regex; `repl` may be a string
    or a callable. With ``regex=False`` only literal, non-callable
    replacement is allowed. A negative `n` means "replace all".

    Raises
    ------
    TypeError
        If `repl` is neither a string nor callable.
    ValueError
        If `case`/`flags` are given together with a compiled `pat`, or if a
        compiled `pat` or callable `repl` is combined with ``regex=False``.
    """
    # Validate repl up front (GH 13438, GH 15055)
    if not isinstance(repl, str) and not callable(repl):
        raise TypeError("repl must be a string or callable")

    pat_is_compiled = is_re(pat)

    def literal_replace(text):
        return text.replace(pat, repl, n)

    if not regex:
        if pat_is_compiled:
            raise ValueError(
                "Cannot use a compiled regex as replacement pattern with "
                "regex=False"
            )
        if callable(repl):
            raise ValueError("Cannot use a callable replacement when regex=False")
        return self._str_map(literal_replace, dtype=str)

    if pat_is_compiled:
        # A compiled pattern already carries its own flags.
        if case is not None or flags != 0:
            raise ValueError(
                "case and flags cannot be set when pat is a compiled regex"
            )
    elif case is False:
        # case=None means "case sensitive"; only an explicit False adds the flag.
        flags |= re.IGNORECASE

    # A single plain character with no flags and a string repl does not need
    # the regex engine.
    needs_regex = pat_is_compiled or len(pat) > 1 or flags or callable(repl)
    if not needs_regex:
        return self._str_map(literal_replace, dtype=str)

    # re.sub uses count=0 for "replace all", whereas str.replace uses -1.
    count = n if n >= 0 else 0
    compiled = re.compile(pat, flags=flags)

    def regex_replace(text):
        return compiled.sub(repl=repl, string=text, count=count)

    return self._str_map(regex_replace, dtype=str)
def _str_find_(self, sub, start, end, side):
    """
    Shared implementation behind ``_str_find`` and ``_str_rfind``.

    ``side="left"`` uses ``str.find`` and ``side="right"`` uses
    ``str.rfind``; any other value raises ``ValueError``. When `end` is
    None only `start` is forwarded to the string method.
    """
    if side not in ("left", "right"):  # pragma: no cover
        raise ValueError("Invalid side")
    method = "find" if side == "left" else "rfind"

    # Forward `end` only when the caller supplied one.
    bounds = (start,) if end is None else (start, end)

    def locate(x):
        return getattr(x, method)(sub, *bounds)

    return self._str_map(locate, dtype="int64")
def _str_index(self, sub, start=0, end=None):
    """
    Return the lowest index of `sub` in each element, like ``str.index``.

    Parameters
    ----------
    sub : str
        Substring being searched.
    start : int, default 0
        Left edge index.
    end : int, optional
        Right edge index; ``None`` searches to the end of the string.

    Raises
    ------
    ValueError
        Propagated from ``str.index`` when `sub` is not found in an element.
    """

    # ``str.index`` accepts ``end=None`` directly, so the former
    # ``if end: ... else: ...`` (whose two branches were identical) is not
    # needed.
    def locate(x):
        return x.index(sub, start, end)

    return self._str_map(locate, dtype="int64")
textwrap.TextWrapper(**kwargs) + return self._str_map(lambda s: "\n".join(tw.wrap(s))) + + def _str_get_dummies(self, sep="|"): + from pandas import Series + + arr = Series(self).fillna("") + try: + arr = sep + arr + sep + except TypeError: + arr = cast(Series, arr) + arr = sep + arr.astype(str) + sep + arr = cast(Series, arr) + + tags: Set[str] = set() + for ts in Series(arr).str.split(sep): + tags.update(ts) + tags2 = sorted(tags - {""}) + + dummies = np.empty((len(arr), len(tags2)), dtype=np.int64) + + for i, t in enumerate(tags2): + pat = sep + t + sep + dummies[:, i] = lib.map_infer(arr.to_numpy(), lambda x: pat in x) + return dummies, tags2 + + def _str_upper(self): + return self._str_map(lambda x: x.upper()) + + def _str_isalnum(self): + return self._str_map(str.isalnum, dtype="bool") + + def _str_isalpha(self): + return self._str_map(str.isalpha, dtype="bool") + + def _str_isdecimal(self): + return self._str_map(str.isdecimal, dtype="bool") + + def _str_isdigit(self): + return self._str_map(str.isdigit, dtype="bool") + + def _str_islower(self): + return self._str_map(str.islower, dtype="bool") + + def _str_isnumeric(self): + return self._str_map(str.isnumeric, dtype="bool") + + def _str_isspace(self): + return self._str_map(str.isspace, dtype="bool") + + def _str_istitle(self): + return self._str_map(str.istitle, dtype="bool") + + def _str_isupper(self): + return self._str_map(str.isupper, dtype="bool") + + def _str_capitalize(self): + return self._str_map(str.capitalize) + + def _str_casefold(self): + return self._str_map(str.casefold) + + def _str_title(self): + return self._str_map(str.title) + + def _str_swapcase(self): + return self._str_map(str.swapcase) + + def _str_lower(self): + return self._str_map(str.lower) + + def _str_normalize(self, form): + f = lambda x: unicodedata.normalize(form, x) + return self._str_map(f) + + def _str_strip(self, to_strip=None): + return self._str_map(lambda x: x.strip(to_strip)) + + def _str_lstrip(self, 
to_strip=None): + return self._str_map(lambda x: x.lstrip(to_strip)) + + def _str_rstrip(self, to_strip=None): + return self._str_map(lambda x: x.rstrip(to_strip)) diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py index c792a48d3ef08..6ad55639ae5d8 100644 --- a/pandas/tests/test_strings.py +++ b/pandas/tests/test_strings.py @@ -728,10 +728,6 @@ def test_count(self): ["foo", "foofoo", np.nan, "foooofooofommmfoo"], dtype=np.object_ ) - result = strings.str_count(values, "f[o]+") - exp = np.array([1, 2, np.nan, 4]) - tm.assert_numpy_array_equal(result, exp) - result = Series(values).str.count("f[o]+") exp = Series([1, 2, np.nan, 4]) assert isinstance(result, Series) @@ -742,10 +738,6 @@ def test_count(self): ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) - rs = strings.str_count(mixed, "a") - xp = np.array([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) - tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.count("a") xp = Series([1, np.nan, 0, np.nan, np.nan, 0, np.nan, np.nan, np.nan]) assert isinstance(rs, Series) @@ -755,46 +747,55 @@ def test_contains(self): values = np.array( ["foo", np.nan, "fooommm__foo", "mmm_", "foommm[_]+bar"], dtype=np.object_ ) + values = Series(values) pat = "mmm[_]+" - result = strings.str_contains(values, pat) - expected = np.array([False, np.nan, True, True, False], dtype=np.object_) - tm.assert_numpy_array_equal(result, expected) + result = values.str.contains(pat) + expected = Series( + np.array([False, np.nan, True, True, False], dtype=np.object_) + ) + tm.assert_series_equal(result, expected) - result = strings.str_contains(values, pat, regex=False) - expected = np.array([False, np.nan, False, False, True], dtype=np.object_) - tm.assert_numpy_array_equal(result, expected) + result = values.str.contains(pat, regex=False) + expected = Series( + np.array([False, np.nan, False, False, True], dtype=np.object_) + ) + tm.assert_series_equal(result, expected) - 
values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object) - result = strings.str_contains(values, pat) - expected = np.array([False, False, True, True]) + values = Series(np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=object)) + result = values.str.contains(pat) + expected = Series(np.array([False, False, True, True])) assert result.dtype == np.bool_ - tm.assert_numpy_array_equal(result, expected) + tm.assert_series_equal(result, expected) # case insensitive using regex - values = np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object) - result = strings.str_contains(values, "FOO|mmm", case=False) - expected = np.array([True, False, True, True]) - tm.assert_numpy_array_equal(result, expected) + values = Series(np.array(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dtype=object)) + result = values.str.contains("FOO|mmm", case=False) + expected = Series(np.array([True, False, True, True])) + tm.assert_series_equal(result, expected) # case insensitive without regex - result = strings.str_contains(values, "foo", regex=False, case=False) - expected = np.array([True, False, True, False]) - tm.assert_numpy_array_equal(result, expected) + result = Series(values).str.contains("foo", regex=False, case=False) + expected = Series(np.array([True, False, True, False])) + tm.assert_series_equal(result, expected) # mixed - mixed = np.array( - ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], - dtype=object, + mixed = Series( + np.array( + ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], + dtype=object, + ) ) - rs = strings.str_contains(mixed, "o") - xp = np.array( - [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], - dtype=np.object_, + rs = mixed.str.contains("o") + xp = Series( + np.array( + [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], + dtype=np.object_, + ) ) - tm.assert_numpy_array_equal(rs, xp) + tm.assert_series_equal(rs, xp) - rs = Series(mixed).str.contains("o") + rs = 
mixed.str.contains("o") xp = Series( [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan] ) @@ -802,22 +803,26 @@ def test_contains(self): tm.assert_series_equal(rs, xp) # unicode - values = np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_) + values = Series( + np.array(["foo", np.nan, "fooommm__foo", "mmm_"], dtype=np.object_) + ) pat = "mmm[_]+" - result = strings.str_contains(values, pat) - expected = np.array([False, np.nan, True, True], dtype=np.object_) - tm.assert_numpy_array_equal(result, expected) + result = values.str.contains(pat) + expected = Series(np.array([False, np.nan, True, True], dtype=np.object_)) + tm.assert_series_equal(result, expected) - result = strings.str_contains(values, pat, na=False) - expected = np.array([False, False, True, True]) - tm.assert_numpy_array_equal(result, expected) + result = values.str.contains(pat, na=False) + expected = Series(np.array([False, False, True, True])) + tm.assert_series_equal(result, expected) - values = np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_) - result = strings.str_contains(values, pat) - expected = np.array([False, False, True, True]) + values = Series( + np.array(["foo", "xyz", "fooommm__foo", "mmm_"], dtype=np.object_) + ) + result = values.str.contains(pat) + expected = Series(np.array([False, False, True, True])) assert result.dtype == np.bool_ - tm.assert_numpy_array_equal(result, expected) + tm.assert_series_equal(result, expected) def test_contains_for_object_category(self): # gh 22158 @@ -865,15 +870,7 @@ def test_startswith(self, dtype, null_value, na): ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=np.object_, ) - rs = strings.str_startswith(mixed, "f") - xp = np.array( - [False, np.nan, False, np.nan, np.nan, True, np.nan, np.nan, np.nan], - dtype=np.object_, - ) - tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.startswith("f") - assert isinstance(rs, Series) xp = Series( [False, np.nan, False, 
np.nan, np.nan, True, np.nan, np.nan, np.nan] ) @@ -902,18 +899,10 @@ def test_endswith(self, dtype, null_value, na): ["a", np.nan, "b", True, datetime.today(), "foo", None, 1, 2.0], dtype=object, ) - rs = strings.str_endswith(mixed, "f") - xp = np.array( - [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan], - dtype=np.object_, - ) - tm.assert_numpy_array_equal(rs, xp) - rs = Series(mixed).str.endswith("f") xp = Series( [False, np.nan, False, np.nan, np.nan, False, np.nan, np.nan, np.nan] ) - assert isinstance(rs, Series) tm.assert_series_equal(rs, xp) def test_title(self): @@ -1213,6 +1202,11 @@ def test_match(self): exp = Series([True, np.nan, np.nan]) tm.assert_series_equal(exp, res) + values = Series(["ab", "AB", "abc", "ABC"]) + result = values.str.match("ab", case=False) + expected = Series([True, True, True, True]) + tm.assert_series_equal(result, expected) + def test_fullmatch(self): # GH 32806 values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]) @@ -1229,6 +1223,11 @@ def test_fullmatch(self): string_exp = Series([True, False, np.nan, False], dtype="boolean") tm.assert_series_equal(result, string_exp) + values = Series(["ab", "AB", "abc", "ABC"]) + result = values.str.fullmatch("ab", case=False) + expected = Series([True, True, False, False]) + tm.assert_series_equal(result, expected) + def test_extract_expand_None(self): values = Series(["fooBAD__barBAD", np.nan, "foo"]) with pytest.raises(ValueError, match="expand must be True or False"): @@ -2252,6 +2251,9 @@ def _check(result, expected): with pytest.raises(TypeError, match=msg): result = s.str.index(0) + with pytest.raises(TypeError, match=msg): + result = s.str.rindex(0) + # test with nan s = Series(["abcb", "ab", "bcbe", np.nan]) result = s.str.index("b") @@ -2539,6 +2541,18 @@ def test_split(self): exp = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]]) tm.assert_series_equal(result, exp) + @pytest.mark.parametrize("dtype", [object, "string"]) 
+ @pytest.mark.parametrize("method", ["split", "rsplit"]) + def test_split_n(self, dtype, method): + s = pd.Series(["a b", pd.NA, "b c"], dtype=dtype) + expected = pd.Series([["a", "b"], pd.NA, ["b", "c"]]) + + result = getattr(s.str, method)(" ", n=None) + tm.assert_series_equal(result, expected) + + result = getattr(s.str, method)(" ", n=0) + tm.assert_series_equal(result, expected) + def test_rsplit(self): values = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"]) result = values.str.rsplit("_") @@ -3641,3 +3655,10 @@ def test_cat_different_classes(klass): result = s.str.cat(klass(["x", "y", "z"])) expected = pd.Series(["ax", "by", "cz"]) tm.assert_series_equal(result, expected) + + +def test_str_get_stringarray_multiple_nans(): + s = pd.Series(pd.array(["a", "ab", pd.NA, "abc"])) + result = s.str.get(2) + expected = pd.Series(pd.array([pd.NA, pd.NA, pd.NA, "c"])) + tm.assert_series_equal(result, expected) From b5f60e54def9023053a5a4a3b2e12b6a7db38e26 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 30 Sep 2020 14:31:51 +0200 Subject: [PATCH 0948/1025] ENH: nullable Float32/64 ExtensionArray (#34307) --- doc/source/whatsnew/v1.2.0.rst | 48 ++ pandas/__init__.py | 2 + pandas/_testing.py | 1 + pandas/arrays/__init__.py | 2 + pandas/conftest.py | 11 + pandas/core/api.py | 1 + pandas/core/arrays/__init__.py | 4 + pandas/core/arrays/boolean.py | 8 +- pandas/core/arrays/floating.py | 618 ++++++++++++++++++ pandas/core/arrays/integer.py | 34 +- pandas/core/arrays/masked.py | 20 +- pandas/core/arrays/string_.py | 18 +- pandas/core/construction.py | 21 +- pandas/core/dtypes/cast.py | 4 +- pandas/core/dtypes/common.py | 7 +- pandas/core/groupby/ops.py | 3 +- pandas/tests/api/test_api.py | 2 + pandas/tests/arrays/floating/__init__.py | 0 pandas/tests/arrays/floating/conftest.py | 36 + .../tests/arrays/floating/test_arithmetic.py | 182 ++++++ pandas/tests/arrays/floating/test_astype.py | 120 ++++ .../tests/arrays/floating/test_comparison.py | 117 ++++ 
pandas/tests/arrays/floating/test_concat.py | 21 + .../arrays/floating/test_construction.py | 167 +++++ pandas/tests/arrays/floating/test_function.py | 154 +++++ pandas/tests/arrays/floating/test_repr.py | 45 ++ pandas/tests/arrays/floating/test_to_numpy.py | 132 ++++ pandas/tests/arrays/integer/test_concat.py | 6 +- pandas/tests/arrays/integer/test_dtypes.py | 7 + pandas/tests/arrays/masked/test_arithmetic.py | 9 +- .../tests/arrays/masked/test_arrow_compat.py | 1 + pandas/tests/arrays/test_array.py | 19 + pandas/tests/extension/test_floating.py | 219 +++++++ 33 files changed, 1992 insertions(+), 47 deletions(-) create mode 100644 pandas/core/arrays/floating.py create mode 100644 pandas/tests/arrays/floating/__init__.py create mode 100644 pandas/tests/arrays/floating/conftest.py create mode 100644 pandas/tests/arrays/floating/test_arithmetic.py create mode 100644 pandas/tests/arrays/floating/test_astype.py create mode 100644 pandas/tests/arrays/floating/test_comparison.py create mode 100644 pandas/tests/arrays/floating/test_concat.py create mode 100644 pandas/tests/arrays/floating/test_construction.py create mode 100644 pandas/tests/arrays/floating/test_function.py create mode 100644 pandas/tests/arrays/floating/test_repr.py create mode 100644 pandas/tests/arrays/floating/test_to_numpy.py create mode 100644 pandas/tests/extension/test_floating.py diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index f87dac0669e00..ddee06aeab779 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -109,6 +109,54 @@ Beginning with this version, the default is now to use the more accurate parser ``floating_precision="legacy"`` to use the legacy parser. The change to using the higher precision parser by default should have no impact on performance. (:issue:`17154`) +.. 
_whatsnew_120.floating: + +Experimental nullable data types for float data +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We've added :class:`Float32Dtype` / :class:`Float64Dtype` and :class:`~arrays.FloatingArray`, +an extension data type dedicated to floating point data that can hold the +``pd.NA`` missing value indicator (:issue:`32265`, :issue:`34307`). + +While the default float data type already supports missing values using ``np.nan``, +this new data type uses ``pd.NA`` (and its corresponding behaviour) as missing +value indicator, in line with the already existing nullable :ref:`integer <integer_na>` +and :ref:`boolean <boolean>` data types. + +One example where the behaviour of ``np.nan`` and ``pd.NA`` is different is +comparison operations: + +.. ipython:: python + + # the default numpy float64 dtype + s1 = pd.Series([1.5, None]) + s1 + s1 > 1 + +.. ipython:: python + + # the new nullable float64 dtype + s2 = pd.Series([1.5, None], dtype="Float64") + s2 + s2 > 1 + +See the :ref:`missing_data.NA` doc section for more details on the behaviour +when using the ``pd.NA`` missing value indicator. + +As shown above, the dtype can be specified using the "Float64" or "Float32" +string (capitalized to distinguish it from the default "float64" data type). +Alternatively, you can also use the dtype object: + +.. ipython:: python + + pd.Series([1.5, None], dtype=pd.Float32Dtype()) + +.. warning:: + + Experimental: the new floating data types are currently experimental, and their + behaviour or API may still change without warning. Especially the behaviour + regarding NaN (distinct from NA missing values) is subject to change. + ..
_whatsnew_120.enhancements.other: Other enhancements diff --git a/pandas/__init__.py b/pandas/__init__.py index 70bb0c8a2cb51..cf7ae2505b72d 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -58,6 +58,8 @@ UInt16Dtype, UInt32Dtype, UInt64Dtype, + Float32Dtype, + Float64Dtype, CategoricalDtype, PeriodDtype, IntervalDtype, diff --git a/pandas/_testing.py b/pandas/_testing.py index 3e3ba480ebfeb..78b6b3c4f9072 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -84,6 +84,7 @@ ALL_EA_INT_DTYPES = UNSIGNED_EA_INT_DTYPES + SIGNED_EA_INT_DTYPES FLOAT_DTYPES: List[Dtype] = [float, "float32", "float64"] +FLOAT_EA_DTYPES: List[Dtype] = ["Float32", "Float64"] COMPLEX_DTYPES: List[Dtype] = [complex, "complex64", "complex128"] STRING_DTYPES: List[Dtype] = [str, "str", "U"] diff --git a/pandas/arrays/__init__.py b/pandas/arrays/__init__.py index 61832a8b6d621..0fa070b6e4fc4 100644 --- a/pandas/arrays/__init__.py +++ b/pandas/arrays/__init__.py @@ -7,6 +7,7 @@ BooleanArray, Categorical, DatetimeArray, + FloatingArray, IntegerArray, IntervalArray, PandasArray, @@ -20,6 +21,7 @@ "BooleanArray", "Categorical", "DatetimeArray", + "FloatingArray", "IntegerArray", "IntervalArray", "PandasArray", diff --git a/pandas/conftest.py b/pandas/conftest.py index 65c31b1f17c3c..3865d287c6905 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -978,6 +978,17 @@ def float_dtype(request): return request.param +@pytest.fixture(params=tm.FLOAT_EA_DTYPES) +def float_ea_dtype(request): + """ + Parameterized fixture for float dtypes. 
+ + * 'Float32' + * 'Float64' + """ + return request.param + + @pytest.fixture(params=tm.COMPLEX_DTYPES) def complex_dtype(request): """ diff --git a/pandas/core/api.py b/pandas/core/api.py index 348e9206d6e19..67e86c2076329 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -14,6 +14,7 @@ from pandas.core.algorithms import factorize, unique, value_counts from pandas.core.arrays import Categorical from pandas.core.arrays.boolean import BooleanDtype +from pandas.core.arrays.floating import Float32Dtype, Float64Dtype from pandas.core.arrays.integer import ( Int8Dtype, Int16Dtype, diff --git a/pandas/core/arrays/__init__.py b/pandas/core/arrays/__init__.py index 1d538824e6d82..e5258a6aecd30 100644 --- a/pandas/core/arrays/__init__.py +++ b/pandas/core/arrays/__init__.py @@ -6,8 +6,10 @@ from pandas.core.arrays.boolean import BooleanArray from pandas.core.arrays.categorical import Categorical from pandas.core.arrays.datetimes import DatetimeArray +from pandas.core.arrays.floating import FloatingArray from pandas.core.arrays.integer import IntegerArray, integer_array from pandas.core.arrays.interval import IntervalArray +from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.numpy_ import PandasArray, PandasDtype from pandas.core.arrays.period import PeriodArray, period_array from pandas.core.arrays.sparse import SparseArray @@ -18,9 +20,11 @@ "ExtensionArray", "ExtensionOpsMixin", "ExtensionScalarOpsMixin", + "BaseMaskedArray", "BooleanArray", "Categorical", "DatetimeArray", + "FloatingArray", "IntegerArray", "integer_array", "IntervalArray", diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 0a6a65bbbd5a0..dd750bce7842e 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -58,8 +58,9 @@ class BooleanDtype(BaseMaskedDtype): name = "boolean" + # mypy: https://github.com/python/mypy/issues/4125 @property - def type(self) -> Type[np.bool_]: + def type(self) -> Type: # type: 
ignore[override] return np.bool_ @property @@ -606,10 +607,9 @@ def logical_method(self, other): def _create_comparison_method(cls, op): @ops.unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): - from pandas.arrays import IntegerArray + from pandas.arrays import FloatingArray, IntegerArray - if isinstance(other, IntegerArray): - # Rely on pandas to unbox and dispatch to us. + if isinstance(other, (IntegerArray, FloatingArray)): return NotImplemented mask = None diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py new file mode 100644 index 0000000000000..c3710196a8611 --- /dev/null +++ b/pandas/core/arrays/floating.py @@ -0,0 +1,618 @@ +import numbers +from typing import TYPE_CHECKING, List, Optional, Tuple, Type, Union +import warnings + +import numpy as np + +from pandas._libs import lib, missing as libmissing +from pandas._typing import ArrayLike, DtypeObj +from pandas.compat import set_function_name +from pandas.compat.numpy import function as nv +from pandas.util._decorators import cache_readonly + +from pandas.core.dtypes.cast import astype_nansafe +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_datetime64_dtype, + is_float, + is_float_dtype, + is_integer, + is_integer_dtype, + is_list_like, + is_object_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import register_extension_dtype +from pandas.core.dtypes.missing import isna + +from pandas.core import nanops, ops +from pandas.core.array_algos import masked_reductions +from pandas.core.ops import invalid_comparison +from pandas.core.ops.common import unpack_zerodim_and_defer +from pandas.core.tools.numeric import to_numeric + +from .masked import BaseMaskedArray, BaseMaskedDtype + +if TYPE_CHECKING: + import pyarrow # noqa: F401 + + +class FloatingDtype(BaseMaskedDtype): + """ + An ExtensionDtype to hold a single size of floating dtype. + + These specific implementations are subclasses of the non-public + FloatingDtype. 
For example we have Float32Dtype to represent float32. + + The attributes name & type are set when these subclasses are created. + """ + + def __repr__(self) -> str: + return f"{self.name}Dtype()" + + @property + def _is_numeric(self) -> bool: + return True + + @classmethod + def construct_array_type(cls) -> Type["FloatingArray"]: + """ + Return the array type associated with this dtype. + + Returns + ------- + type + """ + return FloatingArray + + def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: + # for now only handle other floating types + if not all(isinstance(t, FloatingDtype) for t in dtypes): + return None + np_dtype = np.find_common_type( + [t.numpy_dtype for t in dtypes], [] # type: ignore[union-attr] + ) + if np.issubdtype(np_dtype, np.floating): + return FLOAT_STR_TO_DTYPE[str(np_dtype)] + return None + + def __from_arrow__( + self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"] + ) -> "FloatingArray": + """ + Construct FloatingArray from pyarrow Array/ChunkedArray. + """ + import pyarrow # noqa: F811 + + from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask + + pyarrow_type = pyarrow.from_numpy_dtype(self.type) + if not array.type.equals(pyarrow_type): + array = array.cast(pyarrow_type) + + if isinstance(array, pyarrow.Array): + chunks = [array] + else: + # pyarrow.ChunkedArray + chunks = array.chunks + + results = [] + for arr in chunks: + data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.type) + float_arr = FloatingArray(data.copy(), ~mask, copy=False) + results.append(float_arr) + + return FloatingArray._concat_same_type(results) + + +def coerce_to_array( + values, dtype=None, mask=None, copy: bool = False +) -> Tuple[np.ndarray, np.ndarray]: + """ + Coerce the input values array to numpy arrays with a mask. 
+ + Parameters + ---------- + values : 1D list-like + dtype : float dtype + mask : bool 1D array, optional + copy : bool, default False + if True, copy the input + + Returns + ------- + tuple of (values, mask) + """ + # if values is floating numpy array, preserve it's dtype + if dtype is None and hasattr(values, "dtype"): + if is_float_dtype(values.dtype): + dtype = values.dtype + + if dtype is not None: + if isinstance(dtype, str) and dtype.startswith("Float"): + # Avoid DeprecationWarning from NumPy about np.dtype("Float64") + # https://github.com/numpy/numpy/pull/7476 + dtype = dtype.lower() + + if not issubclass(type(dtype), FloatingDtype): + try: + dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))] + except KeyError as err: + raise ValueError(f"invalid dtype specified {dtype}") from err + + if isinstance(values, FloatingArray): + values, mask = values._data, values._mask + if dtype is not None: + values = values.astype(dtype.numpy_dtype, copy=False) + + if copy: + values = values.copy() + mask = mask.copy() + return values, mask + + values = np.array(values, copy=copy) + if is_object_dtype(values): + inferred_type = lib.infer_dtype(values, skipna=True) + if inferred_type == "empty": + values = np.empty(len(values)) + values.fill(np.nan) + elif inferred_type not in [ + "floating", + "integer", + "mixed-integer", + "integer-na", + "mixed-integer-float", + ]: + raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + + elif is_bool_dtype(values) and is_float_dtype(dtype): + values = np.array(values, dtype=float, copy=copy) + + elif not (is_integer_dtype(values) or is_float_dtype(values)): + raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype") + + if mask is None: + mask = isna(values) + else: + assert len(mask) == len(values) + + if not values.ndim == 1: + raise TypeError("values must be a 1D list-like") + if not mask.ndim == 1: + raise TypeError("mask must be a 1D list-like") + + # infer dtype if needed + if dtype is None: + 
dtype = np.dtype("float64") + else: + dtype = dtype.type + + # if we are float, let's make sure that we can + # safely cast + + # we copy as need to coerce here + # TODO should this be a safe cast? + if mask.any(): + values = values.copy() + values[mask] = np.nan + values = values.astype(dtype, copy=False) # , casting="safe") + else: + values = values.astype(dtype, copy=False) # , casting="safe") + + return values, mask + + +class FloatingArray(BaseMaskedArray): + """ + Array of floating (optional missing) values. + + .. versionadded:: 1.2.0 + + .. warning:: + + FloatingArray is currently experimental, and its API or internal + implementation may change without warning. Expecially the behaviour + regarding NaN (distinct from NA missing values) is subject to change. + + We represent a FloatingArray with 2 numpy arrays: + + - data: contains a numpy float array of the appropriate dtype + - mask: a boolean array holding a mask on the data, True is missing + + To construct an FloatingArray from generic array-like input, use + :func:`pandas.array` with one of the float dtypes (see examples). + + See :ref:`integer_na` for more. + + Parameters + ---------- + values : numpy.ndarray + A 1-d float-dtype array. + mask : numpy.ndarray + A 1-d boolean-dtype array indicating missing values. + copy : bool, default False + Whether to copy the `values` and `mask`. + + Attributes + ---------- + None + + Methods + ------- + None + + Returns + ------- + FloatingArray + + Examples + -------- + Create an FloatingArray with :func:`pandas.array`: + + >>> pd.array([0.1, None, 0.3], dtype=pd.Float32Dtype()) + + [0.1, , 0.3] + Length: 3, dtype: Float32 + + String aliases for the dtypes are also available. They are capitalized. 
+ + >>> pd.array([0.1, None, 0.3], dtype="Float32") + + [0.1, , 0.3] + Length: 3, dtype: Float32 + """ + + # The value used to fill '_data' to avoid upcasting + _internal_fill_value = 0.0 + + @cache_readonly + def dtype(self) -> FloatingDtype: + return FLOAT_STR_TO_DTYPE[str(self._data.dtype)] + + def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): + if not (isinstance(values, np.ndarray) and values.dtype.kind == "f"): + raise TypeError( + "values should be floating numpy array. Use " + "the 'pd.array' function instead" + ) + super().__init__(values, mask, copy=copy) + + @classmethod + def _from_sequence(cls, scalars, dtype=None, copy: bool = False) -> "FloatingArray": + values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy) + return FloatingArray(values, mask) + + @classmethod + def _from_sequence_of_strings( + cls, strings, dtype=None, copy: bool = False + ) -> "FloatingArray": + scalars = to_numeric(strings, errors="raise") + return cls._from_sequence(scalars, dtype, copy) + + _HANDLED_TYPES = (np.ndarray, numbers.Number) + + def __array_ufunc__(self, ufunc, method: str, *inputs, **kwargs): + # For FloatingArray inputs, we apply the ufunc to ._data + # and mask the result. + if method == "reduce": + # Not clear how to handle missing values in reductions. Raise. 
+ raise NotImplementedError("The 'reduce' method is not supported.") + out = kwargs.get("out", ()) + + for x in inputs + out: + if not isinstance(x, self._HANDLED_TYPES + (FloatingArray,)): + return NotImplemented + + # for binary ops, use our custom dunder methods + result = ops.maybe_dispatch_ufunc_to_dunder_op( + self, ufunc, method, *inputs, **kwargs + ) + if result is not NotImplemented: + return result + + mask = np.zeros(len(self), dtype=bool) + inputs2 = [] + for x in inputs: + if isinstance(x, FloatingArray): + mask |= x._mask + inputs2.append(x._data) + else: + inputs2.append(x) + + def reconstruct(x): + # we don't worry about scalar `x` here, since we + # raise for reduce up above. + + # TODO + if is_float_dtype(x.dtype): + m = mask.copy() + return FloatingArray(x, m) + else: + x[mask] = np.nan + return x + + result = getattr(ufunc, method)(*inputs2, **kwargs) + if isinstance(result, tuple): + tuple(reconstruct(x) for x in result) + else: + return reconstruct(result) + + def _coerce_to_array(self, value) -> Tuple[np.ndarray, np.ndarray]: + return coerce_to_array(value, dtype=self.dtype) + + def astype(self, dtype, copy: bool = True) -> ArrayLike: + """ + Cast to a NumPy array or ExtensionArray with 'dtype'. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + ndarray or ExtensionArray + NumPy ndarray, or BooleanArray, IntegerArray or FloatingArray with + 'dtype' for its dtype. 
+ + Raises + ------ + TypeError + if incompatible type with an FloatingDtype, equivalent of same_kind + casting + """ + from pandas.core.arrays.string_ import StringArray, StringDtype + + dtype = pandas_dtype(dtype) + + # if the dtype is exactly the same, we can fastpath + if self.dtype == dtype: + # return the same object for copy=False + return self.copy() if copy else self + # if we are astyping to another nullable masked dtype, we can fastpath + if isinstance(dtype, BaseMaskedDtype): + # TODO deal with NaNs + data = self._data.astype(dtype.numpy_dtype, copy=copy) + # mask is copied depending on whether the data was copied, and + # not directly depending on the `copy` keyword + mask = self._mask if data is self._data else self._mask.copy() + return dtype.construct_array_type()(data, mask, copy=False) + elif isinstance(dtype, StringDtype): + return StringArray._from_sequence(self, copy=False) + + # coerce + if is_float_dtype(dtype): + # In astype, we consider dtype=float to also mean na_value=np.nan + kwargs = dict(na_value=np.nan) + elif is_datetime64_dtype(dtype): + kwargs = dict(na_value=np.datetime64("NaT")) + else: + kwargs = {} + + data = self.to_numpy(dtype=dtype, **kwargs) + return astype_nansafe(data, dtype, copy=False) + + def _values_for_argsort(self) -> np.ndarray: + return self._data + + @classmethod + def _create_comparison_method(cls, op): + op_name = op.__name__ + + @unpack_zerodim_and_defer(op.__name__) + def cmp_method(self, other): + from pandas.arrays import BooleanArray, IntegerArray + + mask = None + + if isinstance(other, (BooleanArray, IntegerArray, FloatingArray)): + other, mask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + + if other is libmissing.NA: + # numpy does not handle pd.NA well as "other" scalar (it returns + # a scalar False instead of an array) + # This may be fixed by NA.__array_ufunc__. 
Revisit this check + # once that's implemented. + result = np.zeros(self._data.shape, dtype="bool") + mask = np.ones(self._data.shape, dtype="bool") + else: + with warnings.catch_warnings(): + # numpy may show a FutureWarning: + # elementwise comparison failed; returning scalar instead, + # but in the future will perform elementwise comparison + # before returning NotImplemented. We fall back to the correct + # behavior today, so that should be fine to ignore. + warnings.filterwarnings("ignore", "elementwise", FutureWarning) + with np.errstate(all="ignore"): + method = getattr(self._data, f"__{op_name}__") + result = method(other) + + if result is NotImplemented: + result = invalid_comparison(self._data, other, op) + + # nans propagate + if mask is None: + mask = self._mask.copy() + else: + mask = self._mask | mask + + return BooleanArray(result, mask) + + name = f"__{op.__name__}__" + return set_function_name(cmp_method, name, cls) + + def _reduce(self, name: str, skipna: bool = True, **kwargs): + data = self._data + mask = self._mask + + if name in {"sum", "prod", "min", "max"}: + op = getattr(masked_reductions, name) + return op(data, mask, skipna=skipna, **kwargs) + + # coerce to a nan-aware float if needed + # (we explicitly use NaN within reductions) + if self._hasna: + data = self.to_numpy("float64", na_value=np.nan) + + op = getattr(nanops, "nan" + name) + result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + + if np.isnan(result): + return libmissing.NA + + return result + + def sum(self, skipna=True, min_count=0, **kwargs): + nv.validate_sum((), kwargs) + result = masked_reductions.sum( + values=self._data, mask=self._mask, skipna=skipna, min_count=min_count + ) + return result + + def _maybe_mask_result(self, result, mask, other, op_name: str): + """ + Parameters + ---------- + result : array-like + mask : array-like bool + other : scalar or array-like + op_name : str + """ + # TODO are there cases we don't end up with float? 
+ # if we have a float operand we are by-definition + # a float result + # or our op is a divide + # if (is_float_dtype(other) or is_float(other)) or ( + # op_name in ["rtruediv", "truediv"] + # ): + # result[mask] = np.nan + # return result + + return type(self)(result, mask, copy=False) + + @classmethod + def _create_arithmetic_method(cls, op): + op_name = op.__name__ + + @unpack_zerodim_and_defer(op.__name__) + def floating_arithmetic_method(self, other): + from pandas.arrays import IntegerArray + + omask = None + + if getattr(other, "ndim", 0) > 1: + raise NotImplementedError("can only perform ops with 1-d structures") + + if isinstance(other, (IntegerArray, FloatingArray)): + other, omask = other._data, other._mask + + elif is_list_like(other): + other = np.asarray(other) + if other.ndim > 1: + raise NotImplementedError( + "can only perform ops with 1-d structures" + ) + if len(self) != len(other): + raise ValueError("Lengths must match") + if not (is_float_dtype(other) or is_integer_dtype(other)): + raise TypeError("can only perform ops with numeric values") + + else: + if not (is_float(other) or is_integer(other) or other is libmissing.NA): + raise TypeError("can only perform ops with numeric values") + + if omask is None: + mask = self._mask.copy() + if other is libmissing.NA: + mask |= True + else: + mask = self._mask | omask + + if op_name == "pow": + # 1 ** x is 1. + mask = np.where((self._data == 1) & ~self._mask, False, mask) + # x ** 0 is 1. + if omask is not None: + mask = np.where((other == 0) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 0, False, mask) + + elif op_name == "rpow": + # 1 ** x is 1. + if omask is not None: + mask = np.where((other == 1) & ~omask, False, mask) + elif other is not libmissing.NA: + mask = np.where(other == 1, False, mask) + # x ** 0 is 1. 
+ mask = np.where((self._data == 0) & ~self._mask, False, mask) + + if other is libmissing.NA: + result = np.ones_like(self._data) + else: + with np.errstate(all="ignore"): + result = op(self._data, other) + + # divmod returns a tuple + if op_name == "divmod": + div, mod = result + return ( + self._maybe_mask_result(div, mask, other, "floordiv"), + self._maybe_mask_result(mod, mask, other, "mod"), + ) + + return self._maybe_mask_result(result, mask, other, op_name) + + name = f"__{op.__name__}__" + return set_function_name(floating_arithmetic_method, name, cls) + + +FloatingArray._add_arithmetic_ops() +FloatingArray._add_comparison_ops() + + +_dtype_docstring = """ +An ExtensionDtype for {dtype} data. + +This dtype uses ``pd.NA`` as missing value indicator. + +Attributes +---------- +None + +Methods +------- +None +""" + +# create the Dtype + + +@register_extension_dtype +class Float32Dtype(FloatingDtype): + type = np.float32 + name = "Float32" + __doc__ = _dtype_docstring.format(dtype="float32") + + +@register_extension_dtype +class Float64Dtype(FloatingDtype): + type = np.float64 + name = "Float64" + __doc__ = _dtype_docstring.format(dtype="float64") + + +FLOAT_STR_TO_DTYPE = { + "float32": Float32Dtype(), + "float64": Float64Dtype(), +} diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 8a51b7293082e..04c4c73954671 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -46,10 +46,6 @@ class _IntegerDtype(BaseMaskedDtype): The attributes name & type are set when these subclasses are created. 
""" - name: str - base = None - type: Type - def __repr__(self) -> str: sign = "U" if self.is_unsigned_integer else "" return f"{sign}Int{8 * self.itemsize}Dtype()" @@ -66,20 +62,6 @@ def is_unsigned_integer(self) -> bool: def _is_numeric(self) -> bool: return True - @cache_readonly - def numpy_dtype(self) -> np.dtype: - """ Return an instance of our numpy dtype """ - return np.dtype(self.type) - - @cache_readonly - def kind(self) -> str: - return self.numpy_dtype.kind - - @cache_readonly - def itemsize(self) -> int: - """ Return the number of bytes in this dtype """ - return self.numpy_dtype.itemsize - @classmethod def construct_array_type(cls) -> Type["IntegerArray"]: """ @@ -106,7 +88,11 @@ def _get_common_dtype(self, dtypes: List[DtypeObj]) -> Optional[DtypeObj]: [t.numpy_dtype if isinstance(t, BaseMaskedDtype) else t for t in dtypes], [] ) if np.issubdtype(np_dtype, np.integer): - return STR_TO_DTYPE[str(np_dtype)] + return INT_STR_TO_DTYPE[str(np_dtype)] + elif np.issubdtype(np_dtype, np.floating): + from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE + + return FLOAT_STR_TO_DTYPE[str(np_dtype)] return None def __from_arrow__( @@ -214,7 +200,7 @@ def coerce_to_array( if not issubclass(type(dtype), _IntegerDtype): try: - dtype = STR_TO_DTYPE[str(np.dtype(dtype))] + dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))] except KeyError as err: raise ValueError(f"invalid dtype specified {dtype}") from err @@ -354,7 +340,7 @@ class IntegerArray(BaseMaskedArray): @cache_readonly def dtype(self) -> _IntegerDtype: - return STR_TO_DTYPE[str(self._data.dtype)] + return INT_STR_TO_DTYPE[str(self._data.dtype)] def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): if not (isinstance(values, np.ndarray) and values.dtype.kind in ["i", "u"]): @@ -513,11 +499,11 @@ def _create_comparison_method(cls, op): @unpack_zerodim_and_defer(op.__name__) def cmp_method(self, other): - from pandas.arrays import BooleanArray + from pandas.core.arrays import 
BaseMaskedArray, BooleanArray mask = None - if isinstance(other, (BooleanArray, IntegerArray)): + if isinstance(other, BaseMaskedArray): other, mask = other._data, other._mask elif is_list_like(other): @@ -744,7 +730,7 @@ class UInt64Dtype(_IntegerDtype): __doc__ = _dtype_docstring.format(dtype="uint64") -STR_TO_DTYPE: Dict[str, _IntegerDtype] = { +INT_STR_TO_DTYPE: Dict[str, _IntegerDtype] = { "int8": Int8Dtype(), "int16": Int16Dtype(), "int32": Int32Dtype(), diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 31274232e2525..97ade0dc70843 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -5,7 +5,7 @@ from pandas._libs import lib, missing as libmissing from pandas._typing import Scalar from pandas.errors import AbstractMethodError -from pandas.util._decorators import doc +from pandas.util._decorators import cache_readonly, doc from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( @@ -34,11 +34,25 @@ class BaseMaskedDtype(ExtensionDtype): Base class for dtypes for BasedMaskedArray subclasses. 
""" + name: str + base = None + type: Type + na_value = libmissing.NA - @property + @cache_readonly def numpy_dtype(self) -> np.dtype: - raise AbstractMethodError + """ Return an instance of our numpy dtype """ + return np.dtype(self.type) + + @cache_readonly + def kind(self) -> str: + return self.numpy_dtype.kind + + @cache_readonly + def itemsize(self) -> int: + """ Return the number of bytes in this dtype """ + return self.numpy_dtype.itemsize @classmethod def construct_array_type(cls) -> Type["BaseMaskedArray"]: diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index fb126b3725237..bf8b93b5a4164 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -204,10 +204,20 @@ def _from_sequence(cls, scalars, dtype=None, copy=False): if dtype: assert dtype == "string" - # convert non-na-likes to str, and nan-likes to StringDtype.na_value - result = lib.ensure_string_array( - scalars, na_value=StringDtype.na_value, copy=copy - ) + from pandas.core.arrays.masked import BaseMaskedArray + + if isinstance(scalars, BaseMaskedArray): + # avoid costly conversion to object dtype + na_values = scalars._mask + result = scalars._data + result = lib.ensure_string_array(result, copy=copy, convert_na_value=False) + result[na_values] = StringDtype.na_value + + else: + # convert non-na-likes to str, and nan-likes to StringDtype.na_value + result = lib.ensure_string_array( + scalars, na_value=StringDtype.na_value, copy=copy + ) # Manually creating new array avoids the validation step in the __init__, so is # faster. Refactor need for validation? 
diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 3ec5bc90d521d..4751f6076f869 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -102,6 +102,7 @@ def array( :class:`datetime.datetime` :class:`pandas.arrays.DatetimeArray` :class:`datetime.timedelta` :class:`pandas.arrays.TimedeltaArray` :class:`int` :class:`pandas.arrays.IntegerArray` + :class:`float` :class:`pandas.arrays.FloatingArray` :class:`str` :class:`pandas.arrays.StringArray` :class:`bool` :class:`pandas.arrays.BooleanArray` ============================== ===================================== @@ -114,6 +115,11 @@ def array( string dtype for string data, and nullable-boolean dtype for boolean data. + .. versionchanged:: 1.2.0 + + Pandas now also infers nullable-floating dtype for float-like + input data + copy : bool, default True Whether to copy the data, even if not necessary. Depending on the type of `data`, creating the new array may require @@ -205,6 +211,11 @@ def array( [1, 2, ] Length: 3, dtype: Int64 + >>> pd.array([1.1, 2.2]) + + [1.1, 2.2] + Length: 2, dtype: Float64 + >>> pd.array(["a", None, "c"]) ['a', , 'c'] @@ -231,10 +242,10 @@ def array( If pandas does not infer a dedicated extension type a :class:`arrays.PandasArray` is returned. 
- >>> pd.array([1.1, 2.2]) + >>> pd.array([1 + 1j, 3 + 2j]) - [1.1, 2.2] - Length: 2, dtype: float64 + [(1+1j), (3+2j)] + Length: 2, dtype: complex128 As mentioned in the "Notes" section, new extension types may be added in the future (by pandas or 3rd party libraries), causing the return @@ -258,6 +269,7 @@ def array( from pandas.core.arrays import ( BooleanArray, DatetimeArray, + FloatingArray, IntegerArray, IntervalArray, PandasArray, @@ -320,6 +332,9 @@ def array( elif inferred_dtype == "integer": return IntegerArray._from_sequence(data, copy=copy) + elif inferred_dtype in ("floating", "mixed-integer-float"): + return FloatingArray._from_sequence(data, copy=copy) + elif inferred_dtype == "boolean": return BooleanArray._from_sequence(data, copy=copy) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c5ea24145ae9e..3aa1317f6db6d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1156,9 +1156,9 @@ def convert_dtypes( target_int_dtype = "Int64" if is_integer_dtype(input_array.dtype): - from pandas.core.arrays.integer import STR_TO_DTYPE + from pandas.core.arrays.integer import INT_STR_TO_DTYPE - inferred_dtype = STR_TO_DTYPE.get( + inferred_dtype = INT_STR_TO_DTYPE.get( input_array.dtype.name, target_int_dtype ) if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index acbdbfd7707e3..14184f044ae95 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -83,7 +83,12 @@ def ensure_float(arr): float_arr : The original array cast to the float dtype if possible. Otherwise, the original array is returned. 
""" - if issubclass(arr.dtype.type, (np.integer, np.bool_)): + if is_extension_array_dtype(arr.dtype): + if is_float_dtype(arr.dtype): + arr = arr.to_numpy(dtype=arr.dtype.numpy_dtype, na_value=np.nan) + else: + arr = arr.to_numpy(dtype="float64", na_value=np.nan) + elif issubclass(arr.dtype.type, (np.integer, np.bool_)): arr = arr.astype(float) return arr diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 17539cdf451e3..6051aa3022da1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -20,6 +20,7 @@ from pandas.core.dtypes.cast import maybe_cast_result from pandas.core.dtypes.common import ( + ensure_float, ensure_float64, ensure_int64, ensure_int_or_float, @@ -491,7 +492,7 @@ def _cython_operation( else: values = ensure_int_or_float(values) elif is_numeric and not is_complex_dtype(values): - values = ensure_float64(values) + values = ensure_float64(ensure_float(values)) else: values = values.astype(object) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 54da13c3c620b..541c2988a0636 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -92,6 +92,8 @@ class TestPDApi(Base): "UInt16Dtype", "UInt32Dtype", "UInt64Dtype", + "Float32Dtype", + "Float64Dtype", "NamedAgg", ] diff --git a/pandas/tests/arrays/floating/__init__.py b/pandas/tests/arrays/floating/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/arrays/floating/conftest.py b/pandas/tests/arrays/floating/conftest.py new file mode 100644 index 0000000000000..1e80518e15941 --- /dev/null +++ b/pandas/tests/arrays/floating/conftest.py @@ -0,0 +1,36 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.floating import Float32Dtype, Float64Dtype + + +@pytest.fixture(params=[Float32Dtype, Float64Dtype]) +def dtype(request): + return request.param() + + +@pytest.fixture +def data(dtype): + return pd.array( + list(np.arange(0.1, 0.9, 0.1)) + + 
[pd.NA] + + list(np.arange(1, 9.8, 0.1)) + + [pd.NA] + + [9.9, 10.0], + dtype=dtype, + ) + + +@pytest.fixture +def data_missing(dtype): + return pd.array([np.nan, 0.1], dtype=dtype) + + +@pytest.fixture(params=["data", "data_missing"]) +def all_data(request, data, data_missing): + """Parametrized fixture giving 'data' and 'data_missing'""" + if request.param == "data": + return data + elif request.param == "data_missing": + return data_missing diff --git a/pandas/tests/arrays/floating/test_arithmetic.py b/pandas/tests/arrays/floating/test_arithmetic.py new file mode 100644 index 0000000000000..7ba4da8a5ede9 --- /dev/null +++ b/pandas/tests/arrays/floating/test_arithmetic.py @@ -0,0 +1,182 @@ +import operator + +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import FloatingArray + +# Basic test for the arithmetic array ops +# ----------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "opname, exp", + [ + ("add", [1.1, 2.2, None, None, 5.5]), + ("mul", [0.1, 0.4, None, None, 2.5]), + ("sub", [0.9, 1.8, None, None, 4.5]), + ("truediv", [10.0, 10.0, None, None, 10.0]), + ("floordiv", [9.0, 9.0, None, None, 10.0]), + ("mod", [0.1, 0.2, None, None, 0.0]), + ], + ids=["add", "mul", "sub", "div", "floordiv", "mod"], +) +def test_array_op(dtype, opname, exp): + a = pd.array([1.0, 2.0, None, 4.0, 5.0], dtype=dtype) + b = pd.array([0.1, 0.2, 0.3, None, 0.5], dtype=dtype) + + op = getattr(operator, opname) + + result = op(a, b) + expected = pd.array(exp, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("zero, negative", [(0, False), (0.0, False), (-0.0, True)]) +def test_divide_by_zero(dtype, zero, negative): + # TODO pending NA/NaN discussion + # https://github.com/pandas-dev/pandas/issues/32265/ + a = pd.array([0, 1, -1, None], dtype=dtype) + result = a / zero + expected = FloatingArray( + np.array([np.nan, np.inf, 
-np.inf, np.nan], dtype=dtype.numpy_dtype), + np.array([False, False, False, True]), + ) + if negative: + expected *= -1 + tm.assert_extension_array_equal(result, expected) + + +def test_pow_scalar(dtype): + a = pd.array([-1, 0, 1, None, 2], dtype=dtype) + result = a ** 0 + expected = pd.array([1, 1, 1, 1, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = a ** 1 + expected = pd.array([-1, 0, 1, None, 2], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = a ** pd.NA + expected = pd.array([None, None, 1, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = a ** np.nan + # TODO np.nan should be converted to pd.NA / missing before operation? + expected = FloatingArray( + np.array([np.nan, np.nan, 1, np.nan, np.nan], dtype=dtype.numpy_dtype), + mask=a._mask, + ) + tm.assert_extension_array_equal(result, expected) + + # reversed + a = a[1:] # Can't raise integers to negative powers. + + result = 0 ** a + expected = pd.array([1, 0, None, 0], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = 1 ** a + expected = pd.array([1, 1, 1, 1], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = pd.NA ** a + expected = pd.array([1, None, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + result = np.nan ** a + expected = FloatingArray( + np.array([1, np.nan, np.nan, np.nan], dtype=dtype.numpy_dtype), mask=a._mask + ) + tm.assert_extension_array_equal(result, expected) + + +def test_pow_array(dtype): + a = pd.array([0, 0, 0, 1, 1, 1, None, None, None], dtype=dtype) + b = pd.array([0, 1, None, 0, 1, None, 0, 1, None], dtype=dtype) + result = a ** b + expected = pd.array([1, 0, None, 1, 1, 1, 1, None, None], dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_rpow_one_to_na(): + # https://github.com/pandas-dev/pandas/issues/22022 + # 
https://github.com/pandas-dev/pandas/issues/29997 + arr = pd.array([np.nan, np.nan], dtype="Float64") + result = np.array([1.0, 2.0]) ** arr + expected = pd.array([1.0, np.nan], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("other", [0, 0.5]) +def test_arith_zero_dim_ndarray(other): + arr = pd.array([1, None, 2], dtype="Float64") + result = arr + np.array(other) + expected = arr + other + tm.assert_equal(result, expected) + + +# Test generic characteristics / errors +# ----------------------------------------------------------------------------- + + +def test_error_invalid_values(data, all_arithmetic_operators): + + op = all_arithmetic_operators + s = pd.Series(data) + ops = getattr(s, op) + + # invalid scalars + msg = ( + r"(:?can only perform ops with numeric values)" + r"|(:?FloatingArray cannot perform the operation mod)" + ) + with pytest.raises(TypeError, match=msg): + ops("foo") + with pytest.raises(TypeError, match=msg): + ops(pd.Timestamp("20180101")) + + # invalid array-likes + with pytest.raises(TypeError, match=msg): + ops(pd.Series("foo", index=s.index)) + + if op != "__rpow__": + # TODO(extension) + # rpow with a datetimelike coerces the integer array incorrectly + msg = ( + "can only perform ops with numeric values|" + "cannot perform .* with this index type: DatetimeArray|" + "Addition/subtraction of integers and integer-arrays " + "with DatetimeArray is no longer supported. 
*" + ) + with pytest.raises(TypeError, match=msg): + ops(pd.Series(pd.date_range("20180101", periods=len(s)))) + + +# Various +# ----------------------------------------------------------------------------- + + +def test_cross_type_arithmetic(): + + df = pd.DataFrame( + { + "A": pd.array([1, 2, np.nan], dtype="Float64"), + "B": pd.array([1, np.nan, 3], dtype="Float32"), + "C": np.array([1, 2, 3], dtype="float64"), + } + ) + + result = df.A + df.C + expected = pd.Series([2, 4, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) + + result = (df.A + df.C) * 3 == 12 + expected = pd.Series([False, True, None], dtype="boolean") + tm.assert_series_equal(result, expected) + + result = df.A + df.B + expected = pd.Series([2, np.nan, np.nan], dtype="Float64") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py new file mode 100644 index 0000000000000..828d80d2f9a51 --- /dev/null +++ b/pandas/tests/arrays/floating/test_astype.py @@ -0,0 +1,120 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +def test_astype(): + # with missing values + arr = pd.array([0.1, 0.2, None], dtype="Float64") + + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype NumPy"): + arr.astype("int64") + + with pytest.raises(ValueError, match="cannot convert to 'bool'-dtype NumPy"): + arr.astype("bool") + + result = arr.astype("float64") + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + # no missing values + arr = pd.array([0.0, 1.0, 0.5], dtype="Float64") + result = arr.astype("int64") + expected = np.array([0, 1, 0], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype("bool") + expected = np.array([False, True, True], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_to_floating_array(): + # astype to 
FloatingArray + arr = pd.array([0.0, 1.0, None], dtype="Float64") + + result = arr.astype("Float64") + tm.assert_extension_array_equal(result, arr) + result = arr.astype(pd.Float64Dtype()) + tm.assert_extension_array_equal(result, arr) + result = arr.astype("Float32") + expected = pd.array([0.0, 1.0, None], dtype="Float32") + tm.assert_extension_array_equal(result, expected) + + +def test_astype_to_boolean_array(): + # astype to BooleanArray + arr = pd.array([0.0, 1.0, None], dtype="Float64") + + result = arr.astype("boolean") + expected = pd.array([False, True, None], dtype="boolean") + tm.assert_extension_array_equal(result, expected) + result = arr.astype(pd.BooleanDtype()) + tm.assert_extension_array_equal(result, expected) + + +def test_astype_to_integer_array(): + # astype to IntegerArray + arr = pd.array([0.0, 1.5, None], dtype="Float64") + + result = arr.astype("Int64") + expected = pd.array([0, 1, None], dtype="Int64") + tm.assert_extension_array_equal(result, expected) + + +def test_astype_str(): + a = pd.array([0.1, 0.2, None], dtype="Float64") + expected = np.array(["0.1", "0.2", ""], dtype=object) + + tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_numpy_array_equal(a.astype("str"), expected) + + +def test_astype_copy(): + arr = pd.array([0.1, 0.2, None], dtype="Float64") + orig = pd.array([0.1, 0.2, None], dtype="Float64") + + # copy=True -> ensure both data and mask are actual copies + result = arr.astype("Float64", copy=True) + assert result is not arr + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + # copy=False + result = arr.astype("Float64", copy=False) + assert result is arr + assert np.shares_memory(result._data, arr._data) + assert np.shares_memory(result._mask, arr._mask) + result[0] = 10 + assert arr[0] == 10 + result[0] = pd.NA + 
assert arr[0] is pd.NA + + # astype to different dtype -> always needs a copy -> even with copy=False + # we need to ensure that also the mask is actually copied + arr = pd.array([0.1, 0.2, None], dtype="Float64") + orig = pd.array([0.1, 0.2, None], dtype="Float64") + + result = arr.astype("Float32", copy=False) + assert not np.shares_memory(result._data, arr._data) + assert not np.shares_memory(result._mask, arr._mask) + result[0] = 10 + tm.assert_extension_array_equal(arr, orig) + result[0] = pd.NA + tm.assert_extension_array_equal(arr, orig) + + +def test_astype_object(dtype): + arr = pd.array([1.0, pd.NA], dtype=dtype) + + result = arr.astype(object) + expected = np.array([1.0, pd.NA], dtype=object) + tm.assert_numpy_array_equal(result, expected) + # check exact element types + assert isinstance(result[0], float) + assert result[1] is pd.NA diff --git a/pandas/tests/arrays/floating/test_comparison.py b/pandas/tests/arrays/floating/test_comparison.py new file mode 100644 index 0000000000000..5538367f49e5b --- /dev/null +++ b/pandas/tests/arrays/floating/test_comparison.py @@ -0,0 +1,117 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.tests.extension.base import BaseOpsUtil + + +class TestComparisonOps(BaseOpsUtil): + def _compare_other(self, data, op_name, other): + op = self.get_op_from_name(op_name) + + # array + result = pd.Series(op(data, other)) + expected = pd.Series(op(data._data, other), dtype="boolean") + + # fill the nan locations + expected[data._mask] = pd.NA + + tm.assert_series_equal(result, expected) + + # series + s = pd.Series(data) + result = op(s, other) + + expected = op(pd.Series(data._data), other) + + # fill the nan locations + expected[data._mask] = pd.NA + expected = expected.astype("boolean") + + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("other", [True, False, pd.NA, -1.0, 0.0, 1]) + def test_scalar(self, other, all_compare_operators): + op = 
self.get_op_from_name(all_compare_operators) + a = pd.array([1.0, 0.0, None], dtype="Float64") + + result = op(a, other) + + if other is pd.NA: + expected = pd.array([None, None, None], dtype="boolean") + else: + values = op(a._data, other) + expected = pd.arrays.BooleanArray(values, a._mask, copy=True) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal(a, pd.array([1.0, 0.0, None], dtype="Float64")) + + def test_array(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, 2, None, None, None], dtype="Float64") + b = pd.array([0, 1, None, 0, 1, None], dtype="Float64") + + result = op(a, b) + values = op(a._data, b._data) + mask = a._mask | b._mask + + expected = pd.arrays.BooleanArray(values, mask) + tm.assert_extension_array_equal(result, expected) + + # ensure we haven't mutated anything inplace + result[0] = pd.NA + tm.assert_extension_array_equal( + a, pd.array([0, 1, 2, None, None, None], dtype="Float64") + ) + tm.assert_extension_array_equal( + b, pd.array([0, 1, None, 0, 1, None], dtype="Float64") + ) + + def test_compare_with_booleanarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([True, False, None] * 3, dtype="boolean") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Float64") + other = pd.array([False] * 3 + [True] * 3 + [None] * 3, dtype="boolean") + expected = op(a, other) + result = op(a, b) + tm.assert_extension_array_equal(result, expected) + expected = op(other, a) + result = op(b, a) + tm.assert_extension_array_equal(result, expected) + + def test_compare_with_integerarray(self, all_compare_operators): + op = self.get_op_from_name(all_compare_operators) + a = pd.array([0, 1, None] * 3, dtype="Int64") + b = pd.array([0] * 3 + [1] * 3 + [None] * 3, dtype="Float64") + other = b.astype("Int64") + expected = op(a, other) + result = op(a, b) + 
tm.assert_extension_array_equal(result, expected) + expected = op(other, a) + result = op(b, a) + tm.assert_extension_array_equal(result, expected) + + def test_no_shared_mask(self, data): + result = data + 1 + assert np.shares_memory(result._mask, data._mask) is False + + def test_compare_to_string(self, dtype): + # GH 28930 + s = pd.Series([1, None], dtype=dtype) + result = s == "a" + expected = pd.Series([False, pd.NA], dtype="boolean") + + self.assert_series_equal(result, expected) + + +def test_equals(): + # GH-30652 + # equals is generally tested in /tests/extension/base/methods, but this + # specifically tests that two arrays of the same class but different dtype + # do not evaluate equal + a1 = pd.array([1, 2, None], dtype="Float64") + a2 = pd.array([1, 2, None], dtype="Float32") + assert a1.equals(a2) is False diff --git a/pandas/tests/arrays/floating/test_concat.py b/pandas/tests/arrays/floating/test_concat.py new file mode 100644 index 0000000000000..dcb021045c6a7 --- /dev/null +++ b/pandas/tests/arrays/floating/test_concat.py @@ -0,0 +1,21 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "to_concat_dtypes, result_dtype", + [ + (["Float64", "Float64"], "Float64"), + (["Float32", "Float64"], "Float64"), + (["Float32", "Float32"], "Float32"), + ], +) +def test_concat_series(to_concat_dtypes, result_dtype): + + result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes]) + expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype( + result_dtype + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_construction.py b/pandas/tests/arrays/floating/test_construction.py new file mode 100644 index 0000000000000..69147f8f3a54a --- /dev/null +++ b/pandas/tests/arrays/floating/test_construction.py @@ -0,0 +1,167 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import 
FloatingArray +from pandas.core.arrays.floating import Float32Dtype, Float64Dtype + + +def test_uses_pandas_na(): + a = pd.array([1, None], dtype=pd.Float64Dtype()) + assert a[1] is pd.NA + + +def test_floating_array_constructor(): + values = np.array([1, 2, 3, 4], dtype="float64") + mask = np.array([False, False, False, True], dtype="bool") + + result = FloatingArray(values, mask) + expected = pd.array([1, 2, 3, np.nan], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + tm.assert_numpy_array_equal(result._data, values) + tm.assert_numpy_array_equal(result._mask, mask) + + msg = r".* should be .* numpy array. Use the 'pd.array' function instead" + with pytest.raises(TypeError, match=msg): + FloatingArray(values.tolist(), mask) + + with pytest.raises(TypeError, match=msg): + FloatingArray(values, mask.tolist()) + + with pytest.raises(TypeError, match=msg): + FloatingArray(values.astype(int), mask) + + msg = r"__init__\(\) missing 1 required positional argument: 'mask'" + with pytest.raises(TypeError, match=msg): + FloatingArray(values) + + +def test_floating_array_constructor_copy(): + values = np.array([1, 2, 3, 4], dtype="float64") + mask = np.array([False, False, False, True], dtype="bool") + + result = FloatingArray(values, mask) + assert result._data is values + assert result._mask is mask + + result = FloatingArray(values, mask, copy=True) + assert result._data is not values + assert result._mask is not mask + + +def test_to_array(): + result = pd.array([0.1, 0.2, 0.3, 0.4]) + expected = pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "a, b", + [ + ([1, None], [1, pd.NA]), + ([None], [pd.NA]), + ([None, np.nan], [pd.NA, pd.NA]), + ([1, np.nan], [1, pd.NA]), + ([np.nan], [pd.NA]), + ], +) +def test_to_array_none_is_nan(a, b): + result = pd.array(a, dtype="Float64") + expected = pd.array(b, dtype="Float64") + tm.assert_extension_array_equal(result, expected) 
+ + +def test_to_array_mixed_integer_float(): + result = pd.array([1, 2.0]) + expected = pd.array([1.0, 2.0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = pd.array([1, None, 2.0]) + expected = pd.array([1.0, None, 2.0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["foo", "bar"], + ["1", "2"], + "foo", + 1, + 1.0, + pd.date_range("20130101", periods=2), + np.array(["foo"]), + [[1, 2], [3, 4]], + [np.nan, {"a": 1}], + ], +) +def test_to_array_error(values): + # error in converting existing arrays to FloatingArray + msg = ( + r"(:?.* cannot be converted to a FloatingDtype)" + r"|(:?values must be a 1D list-like)" + r"|(:?Cannot pass scalar)" + ) + with pytest.raises((TypeError, ValueError), match=msg): + pd.array(values, dtype="Float64") + + +def test_to_array_inferred_dtype(): + # if values has dtype -> respect it + result = pd.array(np.array([1, 2], dtype="float32")) + assert result.dtype == Float32Dtype() + + # if values have no dtype -> always float64 + result = pd.array([1.0, 2.0]) + assert result.dtype == Float64Dtype() + + +def test_to_array_dtype_keyword(): + result = pd.array([1, 2], dtype="Float32") + assert result.dtype == Float32Dtype() + + # if values has dtype -> override it + result = pd.array(np.array([1, 2], dtype="float32"), dtype="Float64") + assert result.dtype == Float64Dtype() + + +def test_to_array_integer(): + result = pd.array([1, 2], dtype="Float64") + expected = pd.array([1.0, 2.0], dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + # for integer dtypes, the itemsize is not preserved + # TODO can we specify "floating" in general? 
+ result = pd.array(np.array([1, 2], dtype="int32"), dtype="Float64") + assert result.dtype == Float64Dtype() + + +@pytest.mark.parametrize( + "bool_values, values, target_dtype, expected_dtype", + [ + ([False, True], [0, 1], Float64Dtype(), Float64Dtype()), + ([False, True], [0, 1], "Float64", Float64Dtype()), + ([False, True, np.nan], [0, 1, np.nan], Float64Dtype(), Float64Dtype()), + ], +) +def test_to_array_bool(bool_values, values, target_dtype, expected_dtype): + result = pd.array(bool_values, dtype=target_dtype) + assert result.dtype == expected_dtype + expected = pd.array(values, dtype=target_dtype) + tm.assert_extension_array_equal(result, expected) + + +def test_series_from_float(data): + # construct from our dtype & string dtype + dtype = data.dtype + + # from float + expected = pd.Series(data) + result = pd.Series(data.to_numpy(na_value=np.nan, dtype="float"), dtype=str(dtype)) + tm.assert_series_equal(result, expected) + + # from list + expected = pd.Series(data) + result = pd.Series(np.array(data).tolist(), dtype=str(dtype)) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py new file mode 100644 index 0000000000000..84c650f880541 --- /dev/null +++ b/pandas/tests/arrays/floating/test_function.py @@ -0,0 +1,154 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize("ufunc", [np.abs, np.sign]) +# np.sign emits a warning with nans, +@pytest.mark.filterwarnings("ignore:invalid value encountered in sign") +def test_ufuncs_single(ufunc): + a = pd.array([1, 2, -3, np.nan], dtype="Float64") + result = ufunc(a) + expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + result = ufunc(s) + expected = pd.Series(expected) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.log, np.exp, 
np.sin, np.cos, np.sqrt]) +def test_ufuncs_single_float(ufunc): + a = pd.array([1.0, 0.2, 3.0, np.nan], dtype="Float64") + with np.errstate(invalid="ignore"): + result = ufunc(a) + expected = pd.array(ufunc(a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + s = pd.Series(a) + with np.errstate(invalid="ignore"): + result = ufunc(s) + expected = pd.Series(ufunc(s.astype(float)), dtype="Float64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("ufunc", [np.add, np.subtract]) +def test_ufuncs_binary_float(ufunc): + # two FloatingArrays + a = pd.array([1, 0.2, -3, np.nan], dtype="Float64") + result = ufunc(a, a) + expected = pd.array(ufunc(a.astype(float), a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + # FloatingArray with numpy array + arr = np.array([1, 2, 3, 4]) + result = ufunc(a, arr) + expected = pd.array(ufunc(a.astype(float), arr), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = ufunc(arr, a) + expected = pd.array(ufunc(arr, a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + # FloatingArray with scalar + result = ufunc(a, 1) + expected = pd.array(ufunc(a.astype(float), 1), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + result = ufunc(1, a) + expected = pd.array(ufunc(1, a.astype(float)), dtype="Float64") + tm.assert_extension_array_equal(result, expected) + + +@pytest.mark.parametrize("values", [[0, 1], [0, None]]) +def test_ufunc_reduce_raises(values): + a = pd.array(values, dtype="Float64") + msg = r"The 'reduce' method is not supported." 
+ with pytest.raises(NotImplementedError, match=msg): + np.add.reduce(a) + + +@pytest.mark.parametrize( + "pandasmethname, kwargs", + [ + ("var", {"ddof": 0}), + ("var", {"ddof": 1}), + ("kurtosis", {}), + ("skew", {}), + ("sem", {}), + ], +) +def test_stat_method(pandasmethname, kwargs): + s = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, np.nan, np.nan], dtype="Float64") + pandasmeth = getattr(s, pandasmethname) + result = pandasmeth(**kwargs) + s2 = pd.Series(data=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype="float64") + pandasmeth = getattr(s2, pandasmethname) + expected = pandasmeth(**kwargs) + assert expected == result + + +def test_value_counts_na(): + arr = pd.array([0.1, 0.2, 0.1, pd.NA], dtype="Float64") + result = arr.value_counts(dropna=False) + expected = pd.Series([2, 1, 1], index=[0.1, 0.2, pd.NA], dtype="Int64") + tm.assert_series_equal(result, expected) + + result = arr.value_counts(dropna=True) + expected = pd.Series([2, 1], index=[0.1, 0.2], dtype="Int64") + tm.assert_series_equal(result, expected) + + +def test_value_counts_empty(): + s = pd.Series([], dtype="Float64") + result = s.value_counts() + idx = pd.Index([], dtype="object") + expected = pd.Series([], index=idx, dtype="Int64") + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("min_count", [0, 4]) +def test_floating_array_sum(skipna, min_count): + arr = pd.array([1, 2, 3, None], dtype="Float64") + result = arr.sum(skipna=skipna, min_count=min_count) + if skipna and min_count == 0: + assert result == 6.0 + else: + assert result is pd.NA + + +@pytest.mark.parametrize( + "values, expected", [([1, 2, 3], 6.0), ([1, 2, 3, None], 6.0), ([None], 0.0)] +) +def test_floating_array_numpy_sum(values, expected): + arr = pd.array(values, dtype="Float64") + result = np.sum(arr) + assert result == expected + + +@pytest.mark.parametrize("op", ["sum", "min", "max", "prod"]) +def test_preserve_dtypes(op): + df = pd.DataFrame( + { + "A": 
["a", "b", "b"], + "B": [1, None, 3], + "C": pd.array([0.1, None, 3.0], dtype="Float64"), + } + ) + + # op + result = getattr(df.C, op)() + assert isinstance(result, np.float64) + + # groupby + result = getattr(df.groupby("A"), op)() + + expected = pd.DataFrame( + {"B": np.array([1.0, 3.0]), "C": pd.array([0.1, 3], dtype="Float64")}, + index=pd.Index(["a", "b"], name="A"), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/arrays/floating/test_repr.py b/pandas/tests/arrays/floating/test_repr.py new file mode 100644 index 0000000000000..8767b79242c83 --- /dev/null +++ b/pandas/tests/arrays/floating/test_repr.py @@ -0,0 +1,45 @@ +import numpy as np +import pytest + +import pandas as pd +from pandas.core.arrays.floating import Float32Dtype, Float64Dtype + + +def test_dtypes(dtype): + # smoke tests on auto dtype construction + + np.dtype(dtype.type).kind == "f" + assert dtype.name is not None + + +@pytest.mark.parametrize( + "dtype, expected", + [(Float32Dtype(), "Float32Dtype()"), (Float64Dtype(), "Float64Dtype()")], +) +def test_repr_dtype(dtype, expected): + assert repr(dtype) == expected + + +def test_repr_array(): + result = repr(pd.array([1.0, None, 3.0])) + expected = "\n[1.0, , 3.0]\nLength: 3, dtype: Float64" + assert result == expected + + +def test_repr_array_long(): + data = pd.array([1.0, 2.0, None] * 1000) + expected = """ +[ 1.0, 2.0, , 1.0, 2.0, , 1.0, 2.0, , 1.0, + ... 
+ , 1.0, 2.0, , 1.0, 2.0, , 1.0, 2.0, ] +Length: 3000, dtype: Float64""" + result = repr(data) + assert result == expected + + +def test_frame_repr(data_missing): + + df = pd.DataFrame({"A": data_missing}) + result = repr(df) + expected = " A\n0 \n1 0.1" + assert result == expected diff --git a/pandas/tests/arrays/floating/test_to_numpy.py b/pandas/tests/arrays/floating/test_to_numpy.py new file mode 100644 index 0000000000000..26e5687b1b4a0 --- /dev/null +++ b/pandas/tests/arrays/floating/test_to_numpy.py @@ -0,0 +1,132 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +from pandas.core.arrays import FloatingArray + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy(box): + con = pd.Series if box else pd.array + + # default (with or without missing values) -> object dtype + arr = con([0.1, 0.2, 0.3], dtype="Float64") + result = arr.to_numpy() + expected = np.array([0.1, 0.2, 0.3], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + arr = con([0.1, 0.2, None], dtype="Float64") + result = arr.to_numpy() + expected = np.array([0.1, 0.2, pd.NA], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_float(box): + con = pd.Series if box else pd.array + + # no missing values -> can convert to float, otherwise raises + arr = con([0.1, 0.2, 0.3], dtype="Float64") + result = arr.to_numpy(dtype="float64") + expected = np.array([0.1, 0.2, 0.3], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + arr = con([0.1, 0.2, None], dtype="Float64") + with pytest.raises(ValueError, match="cannot convert to 'float64'-dtype"): + result = arr.to_numpy(dtype="float64") + + # need to explicitly specify na_value + result = arr.to_numpy(dtype="float64", na_value=np.nan) + expected = np.array([0.1, 0.2, np.nan], dtype="float64") + tm.assert_numpy_array_equal(result, 
expected) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_int(box): + con = pd.Series if box else pd.array + + # no missing values -> can convert to int, otherwise raises + arr = con([1.0, 2.0, 3.0], dtype="Float64") + result = arr.to_numpy(dtype="int64") + expected = np.array([1, 2, 3], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + arr = con([1.0, 2.0, None], dtype="Float64") + with pytest.raises(ValueError, match="cannot convert to 'int64'-dtype"): + result = arr.to_numpy(dtype="int64") + + # automatic casting (floors the values) + arr = con([0.1, 0.9, 1.1], dtype="Float64") + result = arr.to_numpy(dtype="int64") + expected = np.array([0, 0, 1], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_na_value(box): + con = pd.Series if box else pd.array + + arr = con([0.0, 1.0, None], dtype="Float64") + result = arr.to_numpy(dtype=object, na_value=None) + expected = np.array([0.0, 1.0, None], dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype=bool, na_value=False) + expected = np.array([False, True, False], dtype="bool") + tm.assert_numpy_array_equal(result, expected) + + result = arr.to_numpy(dtype="int64", na_value=-99) + expected = np.array([0, 1, -99], dtype="int64") + tm.assert_numpy_array_equal(result, expected) + + +def test_to_numpy_na_value_with_nan(): + # array with both NaN and NA -> only fill NA with `na_value` + arr = FloatingArray(np.array([0.0, np.nan, 0.0]), np.array([False, False, True])) + result = arr.to_numpy(dtype="float64", na_value=-1) + expected = np.array([0.0, np.nan, -1.0], dtype="float64") + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def 
test_to_numpy_dtype(box, dtype): + con = pd.Series if box else pd.array + arr = con([0.0, 1.0], dtype="Float64") + + result = arr.to_numpy(dtype=dtype) + expected = np.array([0, 1], dtype=dtype) + tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["float64", "float32", "int32", "int64", "bool"]) +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_na_raises(box, dtype): + con = pd.Series if box else pd.array + arr = con([0.0, 1.0, None], dtype="Float64") + with pytest.raises(ValueError, match=dtype): + arr.to_numpy(dtype=dtype) + + +@pytest.mark.parametrize("box", [True, False], ids=["series", "array"]) +def test_to_numpy_string(box, dtype): + con = pd.Series if box else pd.array + arr = con([0.0, 1.0, None], dtype="Float64") + + result = arr.to_numpy(dtype="str") + expected = np.array([0.0, 1.0, pd.NA], dtype=" Date: Wed, 30 Sep 2020 07:37:41 -0500 Subject: [PATCH 0949/1025] DOC: Update roadmap for completions (#36728) --- doc/source/development/roadmap.rst | 31 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/doc/source/development/roadmap.rst b/doc/source/development/roadmap.rst index efee21b5889ed..8223edcf6f63a 100644 --- a/doc/source/development/roadmap.rst +++ b/doc/source/development/roadmap.rst @@ -141,20 +141,6 @@ ways for users to apply their own Numba-jitted functions where pandas accepts us and in groupby and window contexts). This will improve the performance of user-defined-functions in these operations by staying within compiled code. - -Documentation improvements --------------------------- - -We'd like to improve the content, structure, and presentation of the pandas documentation. -Some specific goals include - -* Overhaul the HTML theme with a modern, responsive design (:issue:`15556`) -* Improve the "Getting Started" documentation, designing and writing learning paths - for users different backgrounds (e.g. 
brand new to programming, familiar with - other languages like R, already familiar with Python). -* Improve the overall organization of the documentation and specific subsections - of the documentation to make navigation and finding content easier. - Performance monitoring ---------------------- @@ -203,3 +189,20 @@ should be notified of the proposal. When there's agreement that an implementation would be welcome, the roadmap should be updated to include the summary and a link to the discussion issue. + +Completed items +--------------- + +This section records now completed items from the pandas roadmap. + +Documentation improvements +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +We improved the pandas documentation + +* The pandas community worked with others to build the `pydata-sphinx-theme`_, + which is now used for https://pandas.pydata.org/docs/ (:issue:`15556`). +* :ref:`getting_started` contains a number of resources intended for new + pandas users coming from a variety of backgrounds (:issue:`26831`). + +.. 
_pydata-sphinx-theme: https://github.com/pandas-dev/pydata-sphinx-theme From 4b747fb9cfe630de3020ad4e22e309efb6b5ae59 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Wed, 30 Sep 2020 19:38:17 +0700 Subject: [PATCH 0950/1025] REF: rearrange test_to_latex.py (#36714) --- pandas/tests/io/formats/test_to_latex.py | 1176 +++++++++++----------- 1 file changed, 604 insertions(+), 572 deletions(-) diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 7a0d305758802..d3d865158309c 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -27,65 +27,13 @@ def _dedent(string): return dedent(string).lstrip() -class TestToLatex: - @pytest.fixture - def df_short(self): - """Short dataframe for testing table/tabular/longtable LaTeX env.""" - return DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - - @pytest.fixture - def caption_table(self): - """Caption for table/tabular LaTeX environment.""" - return "a table in a \\texttt{table/tabular} environment" - - @pytest.fixture - def label_table(self): - """Label for table/tabular LaTeX environment.""" - return "tab:table_tabular" - - @pytest.fixture - def caption_longtable(self): - """Caption for longtable LaTeX environment.""" - return "a table in a \\texttt{longtable} environment" - - @pytest.fixture - def label_longtable(self): - """Label for longtable LaTeX environment.""" - return "tab:longtable" - - @pytest.fixture - def multiindex_frame(self): - """Multiindex dataframe for testing multirow LaTeX macros.""" - yield DataFrame.from_dict( - { - ("c1", 0): pd.Series({x: x for x in range(4)}), - ("c1", 1): pd.Series({x: x + 4 for x in range(4)}), - ("c2", 0): pd.Series({x: x for x in range(4)}), - ("c2", 1): pd.Series({x: x + 4 for x in range(4)}), - ("c3", 0): pd.Series({x: x for x in range(4)}), - } - ).T - - @pytest.fixture - def multicolumn_frame(self): - """Multicolumn dataframe for testing 
multicolumn LaTeX macros.""" - yield pd.DataFrame( - { - ("c1", 0): {x: x for x in range(5)}, - ("c1", 1): {x: x + 5 for x in range(5)}, - ("c2", 0): {x: x for x in range(5)}, - ("c2", 1): {x: x + 5 for x in range(5)}, - ("c3", 0): {x: x for x in range(5)}, - } - ) +@pytest.fixture +def df_short(): + """Short dataframe for testing table/tabular/longtable LaTeX env.""" + return DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - @pytest.fixture - def df_with_symbols(self): - """Dataframe with special characters for testing chars escaping.""" - a = "a" - b = "b" - yield DataFrame({"co$e^x$": {a: "a", b: "b"}, "co^l1": {a: "a", b: "b"}}) +class TestToLatex: def test_to_latex_to_file(self, float_frame): with tm.ensure_clean("test.tex") as path: float_frame.to_latex(path) @@ -152,10 +100,11 @@ def test_to_latex_bad_column_format(self, bad_column_format): with pytest.raises(ValueError, match=msg): df.to_latex(column_format=bad_column_format) - def test_to_latex_column_format(self, float_frame): + def test_to_latex_column_format_just_works(self, float_frame): # GH Bug #9402 float_frame.to_latex(column_format="lcr") + def test_to_latex_column_format(self): df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) result = df.to_latex(column_format="lcr") expected = _dedent( @@ -188,6 +137,45 @@ def test_to_latex_empty_tabular(self): ) assert result == expected + def test_to_latex_series(self): + s = Series(["a", "b", "c"]) + result = s.to_latex() + expected = _dedent( + r""" + \begin{tabular}{ll} + \toprule + {} & 0 \\ + \midrule + 0 & a \\ + 1 & b \\ + 2 & c \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected + + def test_to_latex_midrule_location(self): + # GH 18326 + df = pd.DataFrame({"a": [1, 2]}) + df.index.name = "foo" + result = df.to_latex(index_names=False) + expected = _dedent( + r""" + \begin{tabular}{lr} + \toprule + {} & a \\ + \midrule + 0 & 1 \\ + 1 & 2 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected + + +class 
TestToLatexLongtable: def test_to_latex_empty_longtable(self): df = DataFrame() result = df.to_latex(longtable=True) @@ -203,329 +191,347 @@ def test_to_latex_empty_longtable(self): ) assert result == expected - def test_to_latex_with_formatters(self): - df = DataFrame( - { - "datetime64": [ - datetime(2016, 1, 1), - datetime(2016, 2, 5), - datetime(2016, 3, 3), - ], - "float": [1.0, 2.0, 3.0], - "int": [1, 2, 3], - "object": [(1, 2), True, False], - } - ) - - formatters = { - "datetime64": lambda x: x.strftime("%Y-%m"), - "float": lambda x: f"[{x: 4.1f}]", - "int": lambda x: f"0x{x:x}", - "object": lambda x: f"-{x!s}-", - "__index__": lambda x: f"index: {x}", - } - result = df.to_latex(formatters=dict(formatters)) - + def test_to_latex_longtable_with_index(self): + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(longtable=True) expected = _dedent( r""" - \begin{tabular}{llrrl} + \begin{longtable}{lrl} \toprule - {} & datetime64 & float & int & object \\ + {} & a & b \\ \midrule - index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ - index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ - index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ + \endfirsthead + + \toprule + {} & a & b \\ + \midrule + \endhead + \midrule + \multicolumn{3}{r}{{Continued on next page}} \\ + \midrule + \endfoot + \bottomrule - \end{tabular} + \endlastfoot + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \end{longtable} """ ) assert result == expected - def test_to_latex_multiindex_column_tabular(self): - df = DataFrame({("x", "y"): ["a"]}) - result = df.to_latex() + def test_to_latex_longtable_without_index(self): + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(index=False, longtable=True) expected = _dedent( r""" - \begin{tabular}{ll} + \begin{longtable}{rl} \toprule - {} & x \\ - {} & y \\ + a & b \\ \midrule - 0 & a \\ + \endfirsthead + + \toprule + a & b \\ + \midrule + \endhead + \midrule + \multicolumn{2}{r}{{Continued on next page}} \\ + \midrule + \endfoot + 
\bottomrule - \end{tabular} + \endlastfoot + 1 & b1 \\ + 2 & b2 \\ + \end{longtable} """ ) assert result == expected - def test_to_latex_multiindex_small_tabular(self): - df = DataFrame({("x", "y"): ["a"]}) - result = df.T.to_latex() + @pytest.mark.parametrize( + "df, expected_number", + [ + (DataFrame({"a": [1, 2]}), 1), + (DataFrame({"a": [1, 2], "b": [3, 4]}), 2), + (DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}), 3), + ], + ) + def test_to_latex_longtable_continued_on_next_page(self, df, expected_number): + result = df.to_latex(index=False, longtable=True) + assert fr"\multicolumn{{{expected_number}}}" in result + + +class TestToLatexHeader: + def test_to_latex_no_header_with_index(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(header=False) expected = _dedent( r""" - \begin{tabular}{lll} + \begin{tabular}{lrl} \toprule - & & 0 \\ - \midrule - x & y & a \\ + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_multiindex_tabular(self, multiindex_frame): - result = multiindex_frame.to_latex() + def test_to_latex_no_header_without_index(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(index=False, header=False) expected = _dedent( r""" - \begin{tabular}{llrrrr} + \begin{tabular}{rl} \toprule - & & 0 & 1 & 2 & 3 \\ - \midrule - c1 & 0 & 0 & 1 & 2 & 3 \\ - & 1 & 4 & 5 & 6 & 7 \\ - c2 & 0 & 0 & 1 & 2 & 3 \\ - & 1 & 4 & 5 & 6 & 7 \\ - c3 & 0 & 0 & 1 & 2 & 3 \\ + 1 & b1 \\ + 2 & b2 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_multicolumn_tabular(self, multiindex_frame): - # GH 14184 - df = multiindex_frame.T - df.columns.names = ["a", "b"] - result = df.to_latex() + def test_to_latex_specified_header_with_index(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(header=["AA", "BB"]) expected = _dedent( r""" - \begin{tabular}{lrrrrr} + 
\begin{tabular}{lrl} \toprule - a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ - b & 0 & 1 & 0 & 1 & 0 \\ + {} & AA & BB \\ \midrule - 0 & 0 & 4 & 0 & 4 & 0 \\ - 1 & 1 & 5 & 1 & 5 & 1 \\ - 2 & 2 & 6 & 2 & 6 & 2 \\ - 3 & 3 & 7 & 3 & 7 & 3 \\ + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_index_has_name_tabular(self): - # GH 10660 - df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) - result = df.set_index(["a", "b"]).to_latex() + def test_to_latex_specified_header_without_index(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(header=["AA", "BB"], index=False) expected = _dedent( r""" - \begin{tabular}{llr} + \begin{tabular}{rl} \toprule - & & c \\ - a & b & \\ + AA & BB \\ \midrule - 0 & a & 1 \\ - & b & 2 \\ - 1 & a & 3 \\ - & b & 4 \\ + 1 & b1 \\ + 2 & b2 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_groupby_tabular(self): - # GH 10660 - df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) - result = df.groupby("a").describe().to_latex() - expected = _dedent( - r""" - \begin{tabular}{lrrrrrrrr} - \toprule - {} & \multicolumn{8}{l}{c} \\ - {} & count & mean & std & min & 25\% & 50\% & 75\% & max \\ - a & & & & & & & & \\ - \midrule - 0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\ - 1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\ - \bottomrule - \end{tabular} - """ - ) - assert result == expected + @pytest.mark.parametrize( + "header, num_aliases", + [ + (["A"], 1), + (("B",), 1), + (("Col1", "Col2", "Col3"), 3), + (("Col1", "Col2", "Col3", "Col4"), 4), + ], + ) + def test_to_latex_number_of_items_in_header_missmatch_raises( + self, + header, + num_aliases, + ): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + msg = f"Writing 2 cols but got {num_aliases} aliases" + with pytest.raises(ValueError, match=msg): + 
df.to_latex(header=header) - def test_to_latex_multiindex_dupe_level(self): - # see gh-14484 - # - # If an index is repeated in subsequent rows, it should be - # replaced with a blank in the created table. This should - # ONLY happen if all higher order indices (to the left) are - # equal too. In this test, 'c' has to be printed both times - # because the higher order index 'A' != 'B'. - df = pd.DataFrame( - index=pd.MultiIndex.from_tuples([("A", "c"), ("B", "c")]), columns=["col"] - ) - result = df.to_latex() + def test_to_latex_decimal(self): + # GH 12031 + df = DataFrame({"a": [1.0, 2.1], "b": ["b1", "b2"]}) + result = df.to_latex(decimal=",") expected = _dedent( r""" - \begin{tabular}{lll} + \begin{tabular}{lrl} \toprule - & & col \\ + {} & a & b \\ \midrule - A & c & NaN \\ - B & c & NaN \\ + 0 & 1,0 & b1 \\ + 1 & 2,1 & b2 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_multicolumn_default(self, multicolumn_frame): - result = multicolumn_frame.to_latex() + +class TestToLatexBold: + def test_to_latex_bold_rows(self): + # GH 16707 + df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(bold_rows=True) expected = _dedent( r""" - \begin{tabular}{lrrrrr} + \begin{tabular}{lrl} \toprule - {} & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ - {} & 0 & 1 & 0 & 1 & 0 \\ + {} & a & b \\ \midrule - 0 & 0 & 5 & 0 & 5 & 0 \\ - 1 & 1 & 6 & 1 & 6 & 1 \\ - 2 & 2 & 7 & 2 & 7 & 2 \\ - 3 & 3 & 8 & 3 & 8 & 3 \\ - 4 & 4 & 9 & 4 & 9 & 4 \\ + \textbf{0} & 1 & b1 \\ + \textbf{1} & 2 & b2 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_multicolumn_false(self, multicolumn_frame): - result = multicolumn_frame.to_latex(multicolumn=False) + def test_to_latex_no_bold_rows(self): + # GH 16707 + df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(bold_rows=False) expected = _dedent( r""" - \begin{tabular}{lrrrrr} + \begin{tabular}{lrl} \toprule - {} & c1 & & c2 & & c3 \\ - 
{} & 0 & 1 & 0 & 1 & 0 \\ + {} & a & b \\ \midrule - 0 & 0 & 5 & 0 & 5 & 0 \\ - 1 & 1 & 6 & 1 & 6 & 1 \\ - 2 & 2 & 7 & 2 & 7 & 2 \\ - 3 & 3 & 8 & 3 & 8 & 3 \\ - 4 & 4 & 9 & 4 & 9 & 4 \\ + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_multirow_true(self, multicolumn_frame): - result = multicolumn_frame.T.to_latex(multirow=True) + +class TestToLatexCaptionLabel: + @pytest.fixture + def caption_table(self): + """Caption for table/tabular LaTeX environment.""" + return "a table in a \\texttt{table/tabular} environment" + + @pytest.fixture + def label_table(self): + """Label for table/tabular LaTeX environment.""" + return "tab:table_tabular" + + @pytest.fixture + def caption_longtable(self): + """Caption for longtable LaTeX environment.""" + return "a table in a \\texttt{longtable} environment" + + @pytest.fixture + def label_longtable(self): + """Label for longtable LaTeX environment.""" + return "tab:longtable" + + def test_to_latex_caption_only(self, df_short, caption_table): + # GH 25436 + result = df_short.to_latex(caption=caption_table) expected = _dedent( r""" - \begin{tabular}{llrrrrr} + \begin{table} + \centering + \caption{a table in a \texttt{table/tabular} environment} + \begin{tabular}{lrl} \toprule - & & 0 & 1 & 2 & 3 & 4 \\ + {} & a & b \\ \midrule - \multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ - & 1 & 5 & 6 & 7 & 8 & 9 \\ - \cline{1-7} - \multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ - & 1 & 5 & 6 & 7 & 8 & 9 \\ - \cline{1-7} - c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ \bottomrule \end{tabular} + \end{table} """ ) assert result == expected - def test_to_latex_multicolumnrow_with_multicol_format(self, multicolumn_frame): - multicolumn_frame.index = multicolumn_frame.T.index - result = multicolumn_frame.T.to_latex( - multirow=True, - multicolumn=True, - multicolumn_format="c", - ) + def test_to_latex_label_only(self, df_short, label_table): + # GH 25436 + result = 
df_short.to_latex(label=label_table) expected = _dedent( r""" - \begin{tabular}{llrrrrr} + \begin{table} + \centering + \label{tab:table_tabular} + \begin{tabular}{lrl} \toprule - & & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\ - & & 0 & 1 & 0 & 1 & 0 \\ + {} & a & b \\ \midrule - \multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ - & 1 & 5 & 6 & 7 & 8 & 9 \\ - \cline{1-7} - \multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ - & 1 & 5 & 6 & 7 & 8 & 9 \\ - \cline{1-7} - c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ \bottomrule \end{tabular} + \end{table} """ ) assert result == expected - def test_to_latex_escape_false(self, df_with_symbols): - result = df_with_symbols.to_latex(escape=False) + def test_to_latex_caption_and_label(self, df_short, caption_table, label_table): + # GH 25436 + result = df_short.to_latex(caption=caption_table, label=label_table) expected = _dedent( r""" - \begin{tabular}{lll} + \begin{table} + \centering + \caption{a table in a \texttt{table/tabular} environment} + \label{tab:table_tabular} + \begin{tabular}{lrl} \toprule - {} & co$e^x$ & co^l1 \\ + {} & a & b \\ \midrule - a & a & a \\ - b & b & b \\ + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ \bottomrule \end{tabular} + \end{table} """ ) assert result == expected - def test_to_latex_escape_default(self, df_with_symbols): - result = df_with_symbols.to_latex() # default: escape=True + def test_to_latex_longtable_caption_only(self, df_short, caption_longtable): + # GH 25436 + # test when no caption and no label is provided + # is performed by test_to_latex_longtable() + result = df_short.to_latex(longtable=True, caption=caption_longtable) expected = _dedent( r""" - \begin{tabular}{lll} + \begin{longtable}{lrl} + \caption{a table in a \texttt{longtable} environment}\\ \toprule - {} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\ + {} & a & b \\ \midrule - a & a & a \\ - b & b & b \\ - \bottomrule - \end{tabular} - """ - ) - assert result == expected - - def 
test_to_latex_special_escape(self): - df = DataFrame([r"a\b\c", r"^a^b^c", r"~a~b~c"]) - result = df.to_latex() - expected = _dedent( - r""" - \begin{tabular}{ll} + \endfirsthead + \caption[]{a table in a \texttt{longtable} environment} \\ \toprule - {} & 0 \\ + {} & a & b \\ \midrule - 0 & a\textbackslash b\textbackslash c \\ - 1 & \textasciicircum a\textasciicircum b\textasciicircum c \\ - 2 & \textasciitilde a\textasciitilde b\textasciitilde c \\ + \endhead + \midrule + \multicolumn{3}{r}{{Continued on next page}} \\ + \midrule + \endfoot + \bottomrule - \end{tabular} + \endlastfoot + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \end{longtable} """ ) assert result == expected - def test_to_latex_longtable_with_index(self): - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - result = df.to_latex(longtable=True) + def test_to_latex_longtable_label_only(self, df_short, label_longtable): + # GH 25436 + result = df_short.to_latex(longtable=True, label=label_longtable) expected = _dedent( r""" \begin{longtable}{lrl} + \label{tab:longtable}\\ \toprule {} & a & b \\ \midrule @@ -549,150 +555,179 @@ def test_to_latex_longtable_with_index(self): ) assert result == expected - def test_to_latex_longtable_without_index(self): - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - result = df.to_latex(index=False, longtable=True) + def test_to_latex_longtable_caption_and_label( + self, + df_short, + caption_longtable, + label_longtable, + ): + # GH 25436 + result = df_short.to_latex( + longtable=True, + caption=caption_longtable, + label=label_longtable, + ) expected = _dedent( r""" - \begin{longtable}{rl} + \begin{longtable}{lrl} + \caption{a table in a \texttt{longtable} environment} + \label{tab:longtable}\\ \toprule - a & b \\ + {} & a & b \\ \midrule \endfirsthead - + \caption[]{a table in a \texttt{longtable} environment} \\ \toprule - a & b \\ + {} & a & b \\ \midrule \endhead \midrule - \multicolumn{2}{r}{{Continued on next page}} \\ + \multicolumn{3}{r}{{Continued on next page}} 
\\ \midrule \endfoot \bottomrule \endlastfoot - 1 & b1 \\ - 2 & b2 \\ - \end{longtable} + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ + \end{longtable} """ ) assert result == expected - @pytest.mark.parametrize( - "df, expected_number", - [ - (DataFrame({"a": [1, 2]}), 1), - (DataFrame({"a": [1, 2], "b": [3, 4]}), 2), - (DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}), 3), - ], - ) - def test_to_latex_longtable_continued_on_next_page(self, df, expected_number): - result = df.to_latex(index=False, longtable=True) - assert fr"\multicolumn{{{expected_number}}}" in result - def test_to_latex_caption_only(self, df_short, caption_table): - # GH 25436 - result = df_short.to_latex(caption=caption_table) +class TestToLatexEscape: + @pytest.fixture + def df_with_symbols(self): + """Dataframe with special characters for testing chars escaping.""" + a = "a" + b = "b" + yield DataFrame({"co$e^x$": {a: "a", b: "b"}, "co^l1": {a: "a", b: "b"}}) + + def test_to_latex_escape_false(self, df_with_symbols): + result = df_with_symbols.to_latex(escape=False) expected = _dedent( r""" - \begin{table} - \centering - \caption{a table in a \texttt{table/tabular} environment} - \begin{tabular}{lrl} + \begin{tabular}{lll} \toprule - {} & a & b \\ + {} & co$e^x$ & co^l1 \\ \midrule - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ + a & a & a \\ + b & b & b \\ \bottomrule \end{tabular} - \end{table} """ ) assert result == expected - def test_to_latex_label_only(self, df_short, label_table): - # GH 25436 - result = df_short.to_latex(label=label_table) + def test_to_latex_escape_default(self, df_with_symbols): + result = df_with_symbols.to_latex() # default: escape=True expected = _dedent( r""" - \begin{table} - \centering - \label{tab:table_tabular} - \begin{tabular}{lrl} + \begin{tabular}{lll} \toprule - {} & a & b \\ + {} & co\$e\textasciicircum x\$ & co\textasciicircum l1 \\ \midrule - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ + a & a & a \\ + b & b & b \\ \bottomrule \end{tabular} - \end{table} """ ) assert result == expected - def 
test_to_latex_caption_and_label(self, df_short, caption_table, label_table): - # GH 25436 - result = df_short.to_latex(caption=caption_table, label=label_table) + def test_to_latex_special_escape(self): + df = DataFrame([r"a\b\c", r"^a^b^c", r"~a~b~c"]) + result = df.to_latex() expected = _dedent( r""" - \begin{table} - \centering - \caption{a table in a \texttt{table/tabular} environment} - \label{tab:table_tabular} - \begin{tabular}{lrl} + \begin{tabular}{ll} \toprule - {} & a & b \\ + {} & 0 \\ \midrule - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ + 0 & a\textbackslash b\textbackslash c \\ + 1 & \textasciicircum a\textasciicircum b\textasciicircum c \\ + 2 & \textasciitilde a\textasciitilde b\textasciitilde c \\ \bottomrule \end{tabular} - \end{table} """ ) assert result == expected - def test_to_latex_longtable_caption_only(self, df_short, caption_longtable): - # GH 25436 - # test when no caption and no label is provided - # is performed by test_to_latex_longtable() - result = df_short.to_latex(longtable=True, caption=caption_longtable) + def test_to_latex_escape_special_chars(self): + special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"] + df = DataFrame(data=special_characters) + result = df.to_latex() expected = _dedent( r""" - \begin{longtable}{lrl} - \caption{a table in a \texttt{longtable} environment}\\ + \begin{tabular}{ll} \toprule - {} & a & b \\ + {} & 0 \\ \midrule - \endfirsthead - \caption[]{a table in a \texttt{longtable} environment} \\ + 0 & \& \\ + 1 & \% \\ + 2 & \$ \\ + 3 & \# \\ + 4 & \_ \\ + 5 & \{ \\ + 6 & \} \\ + 7 & \textasciitilde \\ + 8 & \textasciicircum \\ + 9 & \textbackslash \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected + + def test_to_latex_specified_header_special_chars_without_escape(self): + # GH 7124 + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(header=["$A$", "$B$"], escape=False) + expected = _dedent( + r""" + \begin{tabular}{lrl} \toprule - {} & a & b \\ - \midrule 
- \endhead - \midrule - \multicolumn{3}{r}{{Continued on next page}} \\ + {} & $A$ & $B$ \\ \midrule - \endfoot - + 0 & 1 & b1 \\ + 1 & 2 & b2 \\ \bottomrule - \endlastfoot + \end{tabular} + """ + ) + assert result == expected + + +class TestToLatexPosition: + def test_to_latex_position(self): + the_position = "h" + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(position=the_position) + expected = _dedent( + r""" + \begin{table}[h] + \centering + \begin{tabular}{lrl} + \toprule + {} & a & b \\ + \midrule 0 & 1 & b1 \\ 1 & 2 & b2 \\ - \end{longtable} + \bottomrule + \end{tabular} + \end{table} """ ) assert result == expected - def test_to_latex_longtable_label_only(self, df_short, label_longtable): - # GH 25436 - result = df_short.to_latex(longtable=True, label=label_longtable) + def test_to_latex_longtable_position(self): + the_position = "t" + df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) + result = df.to_latex(longtable=True, position=the_position) expected = _dedent( r""" - \begin{longtable}{lrl} - \label{tab:longtable}\\ + \begin{longtable}[t]{lrl} \toprule {} & a & b \\ \midrule @@ -716,282 +751,370 @@ def test_to_latex_longtable_label_only(self, df_short, label_longtable): ) assert result == expected - def test_to_latex_longtable_caption_and_label( - self, - df_short, - caption_longtable, - label_longtable, - ): - # GH 25436 - result = df_short.to_latex( - longtable=True, - caption=caption_longtable, - label=label_longtable, + +class TestToLatexFormatters: + def test_to_latex_with_formatters(self): + df = DataFrame( + { + "datetime64": [ + datetime(2016, 1, 1), + datetime(2016, 2, 5), + datetime(2016, 3, 3), + ], + "float": [1.0, 2.0, 3.0], + "int": [1, 2, 3], + "object": [(1, 2), True, False], + } ) + + formatters = { + "datetime64": lambda x: x.strftime("%Y-%m"), + "float": lambda x: f"[{x: 4.1f}]", + "int": lambda x: f"0x{x:x}", + "object": lambda x: f"-{x!s}-", + "__index__": lambda x: f"index: {x}", + } + result = 
df.to_latex(formatters=dict(formatters)) + expected = _dedent( r""" - \begin{longtable}{lrl} - \caption{a table in a \texttt{longtable} environment} - \label{tab:longtable}\\ + \begin{tabular}{llrrl} \toprule - {} & a & b \\ + {} & datetime64 & float & int & object \\ \midrule - \endfirsthead - \caption[]{a table in a \texttt{longtable} environment} \\ + index: 0 & 2016-01 & [ 1.0] & 0x1 & -(1, 2)- \\ + index: 1 & 2016-02 & [ 2.0] & 0x2 & -True- \\ + index: 2 & 2016-03 & [ 3.0] & 0x3 & -False- \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected + + def test_to_latex_float_format_no_fixed_width_3decimals(self): + # GH 21625 + df = DataFrame({"x": [0.19999]}) + result = df.to_latex(float_format="%.3f") + expected = _dedent( + r""" + \begin{tabular}{lr} \toprule - {} & a & b \\ - \midrule - \endhead - \midrule - \multicolumn{3}{r}{{Continued on next page}} \\ + {} & x \\ \midrule - \endfoot - + 0 & 0.200 \\ \bottomrule - \endlastfoot - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ - \end{longtable} + \end{tabular} """ ) assert result == expected - def test_to_latex_position(self): - the_position = "h" - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - result = df.to_latex(position=the_position) + def test_to_latex_float_format_no_fixed_width_integer(self): + # GH 22270 + df = DataFrame({"x": [100.0]}) + result = df.to_latex(float_format="%.0f") expected = _dedent( r""" - \begin{table}[h] - \centering - \begin{tabular}{lrl} + \begin{tabular}{lr} \toprule - {} & a & b \\ + {} & x \\ \midrule - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ + 0 & 100 \\ \bottomrule \end{tabular} - \end{table} """ ) assert result == expected - def test_to_latex_longtable_position(self): - the_position = "t" - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - result = df.to_latex(longtable=True, position=the_position) + +class TestToLatexMultiindex: + @pytest.fixture + def multiindex_frame(self): + """Multiindex dataframe for testing multirow LaTeX macros.""" + yield DataFrame.from_dict( + { + ("c1", 
0): pd.Series({x: x for x in range(4)}), + ("c1", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c2", 0): pd.Series({x: x for x in range(4)}), + ("c2", 1): pd.Series({x: x + 4 for x in range(4)}), + ("c3", 0): pd.Series({x: x for x in range(4)}), + } + ).T + + @pytest.fixture + def multicolumn_frame(self): + """Multicolumn dataframe for testing multicolumn LaTeX macros.""" + yield pd.DataFrame( + { + ("c1", 0): {x: x for x in range(5)}, + ("c1", 1): {x: x + 5 for x in range(5)}, + ("c2", 0): {x: x for x in range(5)}, + ("c2", 1): {x: x + 5 for x in range(5)}, + ("c3", 0): {x: x for x in range(5)}, + } + ) + + def test_to_latex_multindex_header(self): + # GH 16718 + df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}) + df = df.set_index(["a", "b"]) + observed = df.to_latex(header=["r1", "r2"]) expected = _dedent( r""" - \begin{longtable}[t]{lrl} + \begin{tabular}{llrr} \toprule - {} & a & b \\ + & & r1 & r2 \\ + a & b & & \\ \midrule - \endfirsthead + 0 & 1 & 2 & 3 \\ + \bottomrule + \end{tabular} + """ + ) + assert observed == expected + def test_to_latex_multiindex_empty_name(self): + # GH 18669 + mi = pd.MultiIndex.from_product([[1, 2]], names=[""]) + df = pd.DataFrame(-1, index=mi, columns=range(4)) + observed = df.to_latex() + expected = _dedent( + r""" + \begin{tabular}{lrrrr} \toprule - {} & a & b \\ - \midrule - \endhead - \midrule - \multicolumn{3}{r}{{Continued on next page}} \\ + & 0 & 1 & 2 & 3 \\ + {} & & & & \\ \midrule - \endfoot - + 1 & -1 & -1 & -1 & -1 \\ + 2 & -1 & -1 & -1 & -1 \\ \bottomrule - \endlastfoot - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ - \end{longtable} + \end{tabular} """ ) - assert result == expected + assert observed == expected - def test_to_latex_escape_special_chars(self): - special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"] - df = DataFrame(data=special_characters) + def test_to_latex_multiindex_column_tabular(self): + df = DataFrame({("x", "y"): ["a"]}) result = df.to_latex() expected = _dedent( r""" 
\begin{tabular}{ll} \toprule - {} & 0 \\ + {} & x \\ + {} & y \\ \midrule - 0 & \& \\ - 1 & \% \\ - 2 & \$ \\ - 3 & \# \\ - 4 & \_ \\ - 5 & \{ \\ - 6 & \} \\ - 7 & \textasciitilde \\ - 8 & \textasciicircum \\ - 9 & \textbackslash \\ + 0 & a \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_no_header_with_index(self): - # GH 7124 - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - result = df.to_latex(header=False) + def test_to_latex_multiindex_small_tabular(self): + df = DataFrame({("x", "y"): ["a"]}).T + result = df.to_latex() expected = _dedent( r""" - \begin{tabular}{lrl} + \begin{tabular}{lll} \toprule - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ + & & 0 \\ + \midrule + x & y & a \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_no_header_without_index(self): - # GH 7124 - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - result = df.to_latex(index=False, header=False) + def test_to_latex_multiindex_tabular(self, multiindex_frame): + result = multiindex_frame.to_latex() + expected = _dedent( + r""" + \begin{tabular}{llrrrr} + \toprule + & & 0 & 1 & 2 & 3 \\ + \midrule + c1 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ + c2 & 0 & 0 & 1 & 2 & 3 \\ + & 1 & 4 & 5 & 6 & 7 \\ + c3 & 0 & 0 & 1 & 2 & 3 \\ + \bottomrule + \end{tabular} + """ + ) + assert result == expected + + def test_to_latex_multicolumn_tabular(self, multiindex_frame): + # GH 14184 + df = multiindex_frame.T + df.columns.names = ["a", "b"] + result = df.to_latex() expected = _dedent( r""" - \begin{tabular}{rl} + \begin{tabular}{lrrrrr} \toprule - 1 & b1 \\ - 2 & b2 \\ + a & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ + b & 0 & 1 & 0 & 1 & 0 \\ + \midrule + 0 & 0 & 4 & 0 & 4 & 0 \\ + 1 & 1 & 5 & 1 & 5 & 1 \\ + 2 & 2 & 6 & 2 & 6 & 2 \\ + 3 & 3 & 7 & 3 & 7 & 3 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_specified_header_with_index(self): - # GH 7124 - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - 
result = df.to_latex(header=["AA", "BB"]) + def test_to_latex_index_has_name_tabular(self): + # GH 10660 + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) + result = df.set_index(["a", "b"]).to_latex() expected = _dedent( r""" - \begin{tabular}{lrl} + \begin{tabular}{llr} \toprule - {} & AA & BB \\ + & & c \\ + a & b & \\ \midrule - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ + 0 & a & 1 \\ + & b & 2 \\ + 1 & a & 3 \\ + & b & 4 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_specified_header_without_index(self): - # GH 7124 - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - result = df.to_latex(header=["AA", "BB"], index=False) + def test_to_latex_groupby_tabular(self): + # GH 10660 + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": list("abab"), "c": [1, 2, 3, 4]}) + result = df.groupby("a").describe().to_latex() expected = _dedent( r""" - \begin{tabular}{rl} + \begin{tabular}{lrrrrrrrr} \toprule - AA & BB \\ + {} & \multicolumn{8}{l}{c} \\ + {} & count & mean & std & min & 25\% & 50\% & 75\% & max \\ + a & & & & & & & & \\ \midrule - 1 & b1 \\ - 2 & b2 \\ + 0 & 2.0 & 1.5 & 0.707107 & 1.0 & 1.25 & 1.5 & 1.75 & 2.0 \\ + 1 & 2.0 & 3.5 & 0.707107 & 3.0 & 3.25 & 3.5 & 3.75 & 4.0 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_specified_header_special_chars_without_escape(self): - # GH 7124 - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - result = df.to_latex(header=["$A$", "$B$"], escape=False) + def test_to_latex_multiindex_dupe_level(self): + # see gh-14484 + # + # If an index is repeated in subsequent rows, it should be + # replaced with a blank in the created table. This should + # ONLY happen if all higher order indices (to the left) are + # equal too. In this test, 'c' has to be printed both times + # because the higher order index 'A' != 'B'. 
+ df = pd.DataFrame( + index=pd.MultiIndex.from_tuples([("A", "c"), ("B", "c")]), columns=["col"] + ) + result = df.to_latex() expected = _dedent( r""" - \begin{tabular}{lrl} + \begin{tabular}{lll} \toprule - {} & $A$ & $B$ \\ + & & col \\ \midrule - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ + A & c & NaN \\ + B & c & NaN \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_number_of_items_in_header_missmatch_raises(self): - # GH 7124 - df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - msg = "Writing 2 cols but got 1 aliases" - with pytest.raises(ValueError, match=msg): - df.to_latex(header=["A"]) - - def test_to_latex_decimal(self): - # GH 12031 - df = DataFrame({"a": [1.0, 2.1], "b": ["b1", "b2"]}) - result = df.to_latex(decimal=",") + def test_to_latex_multicolumn_default(self, multicolumn_frame): + result = multicolumn_frame.to_latex() expected = _dedent( r""" - \begin{tabular}{lrl} + \begin{tabular}{lrrrrr} \toprule - {} & a & b \\ + {} & \multicolumn{2}{l}{c1} & \multicolumn{2}{l}{c2} & c3 \\ + {} & 0 & 1 & 0 & 1 & 0 \\ \midrule - 0 & 1,0 & b1 \\ - 1 & 2,1 & b2 \\ + 0 & 0 & 5 & 0 & 5 & 0 \\ + 1 & 1 & 6 & 1 & 6 & 1 \\ + 2 & 2 & 7 & 2 & 7 & 2 \\ + 3 & 3 & 8 & 3 & 8 & 3 \\ + 4 & 4 & 9 & 4 & 9 & 4 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_series(self): - s = Series(["a", "b", "c"]) - result = s.to_latex() + def test_to_latex_multicolumn_false(self, multicolumn_frame): + result = multicolumn_frame.to_latex(multicolumn=False) expected = _dedent( r""" - \begin{tabular}{ll} + \begin{tabular}{lrrrrr} \toprule - {} & 0 \\ + {} & c1 & & c2 & & c3 \\ + {} & 0 & 1 & 0 & 1 & 0 \\ \midrule - 0 & a \\ - 1 & b \\ - 2 & c \\ + 0 & 0 & 5 & 0 & 5 & 0 \\ + 1 & 1 & 6 & 1 & 6 & 1 \\ + 2 & 2 & 7 & 2 & 7 & 2 \\ + 3 & 3 & 8 & 3 & 8 & 3 \\ + 4 & 4 & 9 & 4 & 9 & 4 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_bold_rows(self): - # GH 16707 - df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - 
result = df.to_latex(bold_rows=True) + def test_to_latex_multirow_true(self, multicolumn_frame): + result = multicolumn_frame.T.to_latex(multirow=True) expected = _dedent( r""" - \begin{tabular}{lrl} + \begin{tabular}{llrrrrr} \toprule - {} & a & b \\ + & & 0 & 1 & 2 & 3 & 4 \\ \midrule - \textbf{0} & 1 & b1 \\ - \textbf{1} & 2 & b2 \\ + \multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ + \cline{1-7} + \multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ + \cline{1-7} + c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ \bottomrule \end{tabular} """ ) assert result == expected - def test_to_latex_no_bold_rows(self): - # GH 16707 - df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]}) - result = df.to_latex(bold_rows=False) + def test_to_latex_multicolumnrow_with_multicol_format(self, multicolumn_frame): + multicolumn_frame.index = multicolumn_frame.T.index + result = multicolumn_frame.T.to_latex( + multirow=True, + multicolumn=True, + multicolumn_format="c", + ) expected = _dedent( r""" - \begin{tabular}{lrl} + \begin{tabular}{llrrrrr} \toprule - {} & a & b \\ + & & \multicolumn{2}{c}{c1} & \multicolumn{2}{c}{c2} & c3 \\ + & & 0 & 1 & 0 & 1 & 0 \\ \midrule - 0 & 1 & b1 \\ - 1 & 2 & b2 \\ + \multirow{2}{*}{c1} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ + \cline{1-7} + \multirow{2}{*}{c2} & 0 & 0 & 1 & 2 & 3 & 4 \\ + & 1 & 5 & 6 & 7 & 8 & 9 \\ + \cline{1-7} + c3 & 0 & 0 & 1 & 2 & 3 & 4 \\ \bottomrule \end{tabular} """ @@ -1061,7 +1184,8 @@ def test_to_latex_multiindex_nans(self, one_row): def test_to_latex_non_string_index(self): # GH 19981 - observed = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]).to_latex() + df = pd.DataFrame([[1, 2, 3]] * 2).set_index([0, 1]) + result = df.to_latex() expected = _dedent( r""" \begin{tabular}{llr} @@ -1075,100 +1199,8 @@ def test_to_latex_non_string_index(self): \end{tabular} """ ) - assert observed == expected - - def test_to_latex_midrule_location(self): - # GH 18326 - df = 
pd.DataFrame({"a": [1, 2]}) - df.index.name = "foo" - result = df.to_latex(index_names=False) - expected = _dedent( - r""" - \begin{tabular}{lr} - \toprule - {} & a \\ - \midrule - 0 & 1 \\ - 1 & 2 \\ - \bottomrule - \end{tabular} - """ - ) - assert result == expected - - def test_to_latex_multiindex_empty_name(self): - # GH 18669 - mi = pd.MultiIndex.from_product([[1, 2]], names=[""]) - df = pd.DataFrame(-1, index=mi, columns=range(4)) - observed = df.to_latex() - expected = _dedent( - r""" - \begin{tabular}{lrrrr} - \toprule - & 0 & 1 & 2 & 3 \\ - {} & & & & \\ - \midrule - 1 & -1 & -1 & -1 & -1 \\ - 2 & -1 & -1 & -1 & -1 \\ - \bottomrule - \end{tabular} - """ - ) - assert observed == expected - - def test_to_latex_float_format_no_fixed_width_3decimals(self): - # GH 21625 - df = DataFrame({"x": [0.19999]}) - result = df.to_latex(float_format="%.3f") - expected = _dedent( - r""" - \begin{tabular}{lr} - \toprule - {} & x \\ - \midrule - 0 & 0.200 \\ - \bottomrule - \end{tabular} - """ - ) - assert result == expected - - def test_to_latex_float_format_no_fixed_width_integer(self): - # GH 22270 - df = DataFrame({"x": [100.0]}) - result = df.to_latex(float_format="%.0f") - expected = _dedent( - r""" - \begin{tabular}{lr} - \toprule - {} & x \\ - \midrule - 0 & 100 \\ - \bottomrule - \end{tabular} - """ - ) assert result == expected - def test_to_latex_multindex_header(self): - # GH 16718 - df = pd.DataFrame({"a": [0], "b": [1], "c": [2], "d": [3]}) - df = df.set_index(["a", "b"]) - observed = df.to_latex(header=["r1", "r2"]) - expected = _dedent( - r""" - \begin{tabular}{llrr} - \toprule - & & r1 & r2 \\ - a & b & & \\ - \midrule - 0 & 1 & 2 & 3 \\ - \bottomrule - \end{tabular} - """ - ) - assert observed == expected - class TestTableBuilder: @pytest.fixture From 8e6ca4f87356f6c245cf2be2d1939e7bbc4785ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 30 Sep 2020 09:10:04 -0400 Subject: [PATCH 0951/1025] TST: read binary file objects with 
read_fwf (#36735) --- pandas/tests/io/parser/test_read_fwf.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 127d0dc4c9829..13519154f82b8 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -6,6 +6,7 @@ from datetime import datetime from io import BytesIO, StringIO +from pathlib import Path import numpy as np import pytest @@ -614,3 +615,22 @@ def test_fwf_compression(compression_only, infer): result = read_fwf(path, **kwargs) tm.assert_frame_equal(result, expected) + + +def test_binary_mode(): + """ + read_fwf supports opening files in binary mode. + + GH 18035. + """ + data = """aas aas aas +bba bab b a""" + df_reference = pd.DataFrame( + [["bba", "bab", "b a"]], columns=["aas", "aas.1", "aas.2"], index=[0] + ) + with tm.ensure_clean() as path: + Path(path).write_text(data) + with open(path, "rb") as file: + df = pd.read_fwf(file) + file.seek(0) + tm.assert_frame_equal(df, df_reference) From 4ef947f1e202b6e5b2edf2b5f42d173799251100 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Wed, 30 Sep 2020 12:17:05 -0500 Subject: [PATCH 0952/1025] Comment on stale PRs (#36622) --- .github/workflows/stale-pr.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index e77bf2b81fc86..2f55a180bc88c 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -2,7 +2,7 @@ name: "Stale PRs" on: schedule: # * is a special character in YAML so you have to quote this string - - cron: "0 */6 * * *" + - cron: "0 0 * * *" jobs: stale: @@ -11,8 +11,8 @@ jobs: - uses: actions/stale@v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity." 
- skip-stale-pr-message: true + stale-pr-message: "This pull request is stale because it has been open for thirty days with no activity. Please update or respond to this comment if you're still interested in working on this." + skip-stale-pr-message: false stale-pr-label: "Stale" exempt-pr-labels: "Needs Review,Blocked,Needs Discussion" days-before-stale: 30 From 37c3ecd5e18a819c94d83455fb260a3597248715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Wed, 30 Sep 2020 13:22:41 -0400 Subject: [PATCH 0953/1025] TST: honor encoding in read_fwf for memory-mapped files (#36737) --- pandas/tests/io/parser/test_read_fwf.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 13519154f82b8..d45317aaa3458 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -634,3 +634,24 @@ def test_binary_mode(): df = pd.read_fwf(file) file.seek(0) tm.assert_frame_equal(df, df_reference) + + +@pytest.mark.parametrize("memory_map", [True, False]) +def test_encoding_mmap(memory_map): + """ + encoding should be working, even when using a memory-mapped file. + + GH 23254. 
+ """ + encoding = "iso8859_1" + data = BytesIO(" 1 A Ä 2\n".encode(encoding)) + df = pd.read_fwf( + data, + header=None, + widths=[2, 2, 2, 2], + encoding=encoding, + memory_map=memory_map, + ) + data.seek(0) + df_reference = pd.DataFrame([[1, "A", "Ä", 2]]) + tm.assert_frame_equal(df, df_reference) From 3fdbf27184e2956758a38d7a44ce896771849042 Mon Sep 17 00:00:00 2001 From: krajatcl <53620269+krajatcl@users.noreply.github.com> Date: Wed, 30 Sep 2020 23:19:25 +0530 Subject: [PATCH 0954/1025] TST: insert 'match' to bare pytest raises in pandas/tests/tseries/offsets/test_ticks.py (#36682) --- pandas/tests/tseries/offsets/test_ticks.py | 23 ++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py index cc23f5f3201da..c1621669bffd0 100644 --- a/pandas/tests/tseries/offsets/test_ticks.py +++ b/pandas/tests/tseries/offsets/test_ticks.py @@ -266,10 +266,15 @@ def test_tick_rdiv(cls): off = cls(10) delta = off.delta td64 = delta.to_timedelta64() + instance__type = ".".join([cls.__module__, cls.__name__]) + msg = ( + "unsupported operand type\\(s\\) for \\/: 'int'|'float' and " + f"'{instance__type}'" + ) - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): 2 / off - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): 2.0 / off assert (td64 * 2.5) / off == 2.5 @@ -330,14 +335,20 @@ def test_compare_ticks_to_strs(cls): assert not off == "infer" assert not "foo" == off + instance_type = ".".join([cls.__module__, cls.__name__]) + msg = ( + "'<'|'<='|'>'|'>=' not supported between instances of " + f"'str' and '{instance_type}'|'{instance_type}' and 'str'" + ) + for left, right in [("infer", off), (off, "infer")]: - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): left < right - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): left <= right - with 
pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): left > right - with pytest.raises(TypeError): + with pytest.raises(TypeError, match=msg): left >= right From db5d0c03508135cf3abfad30985df577e99c85f2 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 30 Sep 2020 12:55:14 -0700 Subject: [PATCH 0955/1025] CI: disable ARM build (#36733) --- .travis.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 81cd461dd2c87..2ef8e0e03aaf8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -46,16 +46,16 @@ matrix: - env: - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="(not slow and not network and not clipboard)" - - arch: arm64 - env: - - JOB="3.7, arm64" PYTEST_WORKERS=1 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" - - env: - JOB="3.7, locale" ENV_FILE="ci/deps/travis-37-locale.yaml" PATTERN="((not slow and not network and not clipboard) or (single and db))" LOCALE_OVERRIDE="zh_CN.UTF-8" SQL="1" services: - mysql - postgresql + - arch: arm64 + env: + - JOB="3.7, arm64" PYTEST_WORKERS=1 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" + - env: # Enabling Deprecations when running tests # PANDAS_TESTING_MODE="deprecate" causes DeprecationWarning messages to be displayed in the logs @@ -65,6 +65,12 @@ matrix: - mysql - postgresql + allow_failures: + # Moved to allowed_failures 2020-09-29 due to timeouts https://github.com/pandas-dev/pandas/issues/36719 + - arch: arm64 + env: + - JOB="3.7, arm64" PYTEST_WORKERS=1 ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" + before_install: - echo "before_install" From 3bf7c0810994553838923f1fcacfe5727ea915cf Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 30 Sep 2020 21:27:07 +0100 Subject: [PATCH 0956/1025] REGR: Series.__mod__ behaves different with 
numexpr (#36552) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/computation/expressions.py | 5 +++- pandas/core/ops/methods.py | 2 -- pandas/tests/test_expressions.py | 40 +++++++++++++++++++++++++- 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 91b9cf59687b3..15777abcb8084 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -34,6 +34,7 @@ Fixed regressions - Fixed regression when adding a :meth:`timedelta_range` to a :class:`Timestamp` raised a ``ValueError`` (:issue:`35897`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a tuple (:issue:`35534`) - Fixed regression in :meth:`Series.__getitem__` incorrectly raising when the input was a frozenset (:issue:`35747`) +- Fixed regression in modulo of :class:`Index`, :class:`Series` and :class:`DataFrame` using ``numexpr`` using C not Python semantics (:issue:`36047`, :issue:`36526`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`) - Fixed regression in :meth:`DataFrame.replace` inconsistent replace when using a float in the replace method (:issue:`35376`) - Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`, :issue:`36377`) diff --git a/pandas/core/computation/expressions.py b/pandas/core/computation/expressions.py index 0032fe97b8b33..5bfd2e93a9247 100644 --- a/pandas/core/computation/expressions.py +++ b/pandas/core/computation/expressions.py @@ -133,7 +133,10 @@ def _evaluate_numexpr(op, op_str, a, b): roperator.rtruediv: "/", operator.floordiv: "//", roperator.rfloordiv: "//", - operator.mod: "%", + # we require Python semantics for mod of negative for backwards compatibility + # see https://github.com/pydata/numexpr/issues/365 + # so sticking with unaccelerated 
for now + operator.mod: None, roperator.rmod: "%", operator.pow: "**", roperator.rpow: "**", diff --git a/pandas/core/ops/methods.py b/pandas/core/ops/methods.py index e04db92b58c36..852157e52d5fe 100644 --- a/pandas/core/ops/methods.py +++ b/pandas/core/ops/methods.py @@ -171,8 +171,6 @@ def _create_methods(cls, arith_method, comp_method, bool_method, special): mul=arith_method(cls, operator.mul, special), truediv=arith_method(cls, operator.truediv, special), floordiv=arith_method(cls, operator.floordiv, special), - # Causes a floating point exception in the tests when numexpr enabled, - # so for now no speedup mod=arith_method(cls, operator.mod, special), pow=arith_method(cls, operator.pow, special), # not entirely sure why this is necessary, but previously was included diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py index da7f8b9b4a721..6db1078fcde4f 100644 --- a/pandas/tests/test_expressions.py +++ b/pandas/tests/test_expressions.py @@ -6,7 +6,7 @@ import pytest import pandas._testing as tm -from pandas.core.api import DataFrame +from pandas.core.api import DataFrame, Index, Series from pandas.core.computation import expressions as expr _frame = DataFrame(randn(10000, 4), columns=list("ABCD"), dtype="float64") @@ -380,3 +380,41 @@ def test_frame_series_axis(self, axis, arith): result = op_func(other, axis=axis) tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( + "op", + [ + "__mod__", + pytest.param("__rmod__", marks=pytest.mark.xfail(reason="GH-36552")), + "__floordiv__", + "__rfloordiv__", + ], + ) + @pytest.mark.parametrize("box", [DataFrame, Series, Index]) + @pytest.mark.parametrize("scalar", [-5, 5]) + def test_python_semantics_with_numexpr_installed(self, op, box, scalar): + # https://github.com/pandas-dev/pandas/issues/36047 + expr._MIN_ELEMENTS = 0 + data = np.arange(-50, 50) + obj = box(data) + method = getattr(obj, op) + result = method(scalar) + + # compare result with numpy + 
expr.set_use_numexpr(False) + expected = method(scalar) + expr.set_use_numexpr(True) + tm.assert_equal(result, expected) + + # compare result element-wise with Python + for i, elem in enumerate(data): + if box == DataFrame: + scalar_result = result.iloc[i, 0] + else: + scalar_result = result[i] + try: + expected = getattr(int(elem), op)(scalar) + except ZeroDivisionError: + pass + else: + assert scalar_result == expected From dba02b3961e4c85a70f2e755dc66886b02d7d7f5 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Wed, 30 Sep 2020 20:14:21 -0500 Subject: [PATCH 0957/1025] DOC: Format more code blocks (#36734) --- doc/source/user_guide/io.rst | 1472 +++++++++++++++++--------------- doc/source/user_guide/text.rst | 173 ++-- 2 files changed, 845 insertions(+), 800 deletions(-) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index fc5aad12cd5e8..e483cebf71614 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -135,12 +135,10 @@ usecols : list-like or callable, default ``None`` import pandas as pd from io import StringIO - data = ('col1,col2,col3\n' - 'a,b,1\n' - 'a,b,2\n' - 'c,d,3') + + data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3" pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3']) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["COL1", "COL3"]) Using this parameter results in much faster parsing time and lower memory usage. squeeze : boolean, default ``False`` @@ -181,10 +179,7 @@ skiprows : list-like or integer, default ``None`` .. ipython:: python - data = ('col1,col2,col3\n' - 'a,b,1\n' - 'a,b,2\n' - 'c,d,3') + data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3" pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) @@ -365,17 +360,14 @@ columns: .. 
ipython:: python import numpy as np - data = ('a,b,c,d\n' - '1,2,3,4\n' - '5,6,7,8\n' - '9,10,11') + + data = "a,b,c,d\n1,2,3,4\n5,6,7,8\n9,10,11" print(data) df = pd.read_csv(StringIO(data), dtype=object) df - df['a'][0] - df = pd.read_csv(StringIO(data), - dtype={'b': object, 'c': np.float64, 'd': 'Int64'}) + df["a"][0] + df = pd.read_csv(StringIO(data), dtype={"b": object, "c": np.float64, "d": "Int64"}) df.dtypes Fortunately, pandas offers more than one way to ensure that your column(s) @@ -390,14 +382,10 @@ of :func:`~pandas.read_csv`: .. ipython:: python - data = ("col_1\n" - "1\n" - "2\n" - "'A'\n" - "4.22") - df = pd.read_csv(StringIO(data), converters={'col_1': str}) + data = "col_1\n1\n2\n'A'\n4.22" + df = pd.read_csv(StringIO(data), converters={"col_1": str}) df - df['col_1'].apply(type).value_counts() + df["col_1"].apply(type).value_counts() Or you can use the :func:`~pandas.to_numeric` function to coerce the dtypes after reading in the data, @@ -405,9 +393,9 @@ dtypes after reading in the data, .. ipython:: python df2 = pd.read_csv(StringIO(data)) - df2['col_1'] = pd.to_numeric(df2['col_1'], errors='coerce') + df2["col_1"] = pd.to_numeric(df2["col_1"], errors="coerce") df2 - df2['col_1'].apply(type).value_counts() + df2["col_1"].apply(type).value_counts() which will convert all valid parsing to floats, leaving the invalid parsing as ``NaN``. @@ -429,12 +417,12 @@ worth trying. .. 
ipython:: python :okwarning: - col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000)) - df = pd.DataFrame({'col_1': col_1}) - df.to_csv('foo.csv') - mixed_df = pd.read_csv('foo.csv') - mixed_df['col_1'].apply(type).value_counts() - mixed_df['col_1'].dtype + col_1 = list(range(500000)) + ["a", "b"] + list(range(500000)) + df = pd.DataFrame({"col_1": col_1}) + df.to_csv("foo.csv") + mixed_df = pd.read_csv("foo.csv") + mixed_df["col_1"].apply(type).value_counts() + mixed_df["col_1"].dtype will result with ``mixed_df`` containing an ``int`` dtype for certain chunks of the column, and ``str`` for others due to the mixed dtypes from the @@ -445,7 +433,8 @@ worth trying. :suppress: import os - os.remove('foo.csv') + + os.remove("foo.csv") .. _io.categorical: @@ -457,21 +446,18 @@ Specifying categorical dtype .. ipython:: python - data = ('col1,col2,col3\n' - 'a,b,1\n' - 'a,b,2\n' - 'c,d,3') + data = "col1,col2,col3\na,b,1\na,b,2\nc,d,3" pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data)).dtypes - pd.read_csv(StringIO(data), dtype='category').dtypes + pd.read_csv(StringIO(data), dtype="category").dtypes Individual columns can be parsed as a ``Categorical`` using a dict specification: .. ipython:: python - pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes + pd.read_csv(StringIO(data), dtype={"col1": "category"}).dtypes Specifying ``dtype='category'`` will result in an unordered ``Categorical`` whose ``categories`` are the unique values observed in the data. For more @@ -482,16 +468,17 @@ that column's ``dtype``. .. ipython:: python from pandas.api.types import CategoricalDtype - dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) - pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes + + dtype = CategoricalDtype(["d", "c", "b", "a"], ordered=True) + pd.read_csv(StringIO(data), dtype={"col1": dtype}).dtypes When using ``dtype=CategoricalDtype``, "unexpected" values outside of ``dtype.categories`` are treated as missing values. .. 
ipython:: python - dtype = CategoricalDtype(['a', 'b', 'd']) # No 'c' - pd.read_csv(StringIO(data), dtype={'col1': dtype}).col1 + dtype = CategoricalDtype(["a", "b", "d"]) # No 'c' + pd.read_csv(StringIO(data), dtype={"col1": dtype}).col1 This matches the behavior of :meth:`Categorical.set_categories`. @@ -507,11 +494,11 @@ This matches the behavior of :meth:`Categorical.set_categories`. .. ipython:: python - df = pd.read_csv(StringIO(data), dtype='category') + df = pd.read_csv(StringIO(data), dtype="category") df.dtypes - df['col3'] - df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories) - df['col3'] + df["col3"] + df["col3"].cat.categories = pd.to_numeric(df["col3"].cat.categories) + df["col3"] Naming and using columns @@ -527,10 +514,7 @@ used as the column names: .. ipython:: python - data = ('a,b,c\n' - '1,2,3\n' - '4,5,6\n' - '7,8,9') + data = "a,b,c\n1,2,3\n4,5,6\n7,8,9" print(data) pd.read_csv(StringIO(data)) @@ -541,19 +525,15 @@ any): .. ipython:: python print(data) - pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=0) - pd.read_csv(StringIO(data), names=['foo', 'bar', 'baz'], header=None) + pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=0) + pd.read_csv(StringIO(data), names=["foo", "bar", "baz"], header=None) If the header is in a row other than the first, pass the row number to ``header``. This will skip the preceding rows: .. ipython:: python - data = ('skip this skip it\n' - 'a,b,c\n' - '1,2,3\n' - '4,5,6\n' - '7,8,9') + data = "skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9" pd.read_csv(StringIO(data), header=1) .. note:: @@ -574,9 +554,7 @@ distinguish between them so as to prevent overwriting data: .. ipython:: python - data = ('a,b,a\n' - '0,1,2\n' - '3,4,5') + data = "a,b,a\n0,1,2\n3,4,5" pd.read_csv(StringIO(data)) There is no more duplicate data because ``mangle_dupe_cols=True`` by default, @@ -613,18 +591,18 @@ file, either using the column names, position numbers or a callable: .. 
ipython:: python - data = 'a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz' + data = "a,b,c,d\n1,2,3,foo\n4,5,6,bar\n7,8,9,baz" pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), usecols=['b', 'd']) + pd.read_csv(StringIO(data), usecols=["b", "d"]) pd.read_csv(StringIO(data), usecols=[0, 2, 3]) - pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['A', 'C']) + pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ["A", "C"]) The ``usecols`` argument can also be used to specify which columns not to use in the final result: .. ipython:: python - pd.read_csv(StringIO(data), usecols=lambda x: x not in ['a', 'c']) + pd.read_csv(StringIO(data), usecols=lambda x: x not in ["a", "c"]) In this case, the callable is specifying that we exclude the "a" and "c" columns from the output. @@ -642,26 +620,15 @@ be ignored. By default, completely blank lines will be ignored as well. .. ipython:: python - data = ('\n' - 'a,b,c\n' - ' \n' - '# commented line\n' - '1,2,3\n' - '\n' - '4,5,6') + data = "\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6" print(data) - pd.read_csv(StringIO(data), comment='#') + pd.read_csv(StringIO(data), comment="#") If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = ('a,b,c\n' - '\n' - '1,2,3\n' - '\n' - '\n' - '4,5,6') + data = "a,b,c\n\n1,2,3\n\n\n4,5,6" pd.read_csv(StringIO(data), skip_blank_lines=False) .. warning:: @@ -672,32 +639,28 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. 
ipython:: python - data = ('#comment\n' - 'a,b,c\n' - 'A,B,C\n' - '1,2,3') - pd.read_csv(StringIO(data), comment='#', header=1) - data = ('A,B,C\n' - '#comment\n' - 'a,b,c\n' - '1,2,3') - pd.read_csv(StringIO(data), comment='#', skiprows=2) + data = "#comment\na,b,c\nA,B,C\n1,2,3" + pd.read_csv(StringIO(data), comment="#", header=1) + data = "A,B,C\n#comment\na,b,c\n1,2,3" + pd.read_csv(StringIO(data), comment="#", skiprows=2) If both ``header`` and ``skiprows`` are specified, ``header`` will be relative to the end of ``skiprows``. For example: .. ipython:: python - data = ('# empty\n' - '# second empty line\n' - '# third emptyline\n' - 'X,Y,Z\n' - '1,2,3\n' - 'A,B,C\n' - '1,2.,4.\n' - '5.,NaN,10.0\n') + data = ( + "# empty\n" + "# second empty line\n" + "# third emptyline\n" + "X,Y,Z\n" + "1,2,3\n" + "A,B,C\n" + "1,2.,4.\n" + "5.,NaN,10.0\n" + ) print(data) - pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + pd.read_csv(StringIO(data), comment="#", skiprows=4, header=1) .. _io.comments: @@ -709,36 +672,38 @@ Sometimes comments or meta data may be included in a file: .. ipython:: python :suppress: - data = ("ID,level,category\n" - "Patient1,123000,x # really unpleasant\n" - "Patient2,23000,y # wouldn't take his medicine\n" - "Patient3,1234018,z # awesome") + data = ( + "ID,level,category\n" + "Patient1,123000,x # really unpleasant\n" + "Patient2,23000,y # wouldn't take his medicine\n" + "Patient3,1234018,z # awesome" + ) - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) .. ipython:: python - print(open('tmp.csv').read()) + print(open("tmp.csv").read()) By default, the parser includes the comments in the output: .. ipython:: python - df = pd.read_csv('tmp.csv') + df = pd.read_csv("tmp.csv") df We can suppress the comments using the ``comment`` keyword: .. ipython:: python - df = pd.read_csv('tmp.csv', comment='#') + df = pd.read_csv("tmp.csv", comment="#") df .. 
ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.unicode: @@ -751,13 +716,12 @@ result in byte strings being decoded to unicode in the result: .. ipython:: python from io import BytesIO - data = (b'word,length\n' - b'Tr\xc3\xa4umen,7\n' - b'Gr\xc3\xbc\xc3\x9fe,5') - data = data.decode('utf8').encode('latin-1') - df = pd.read_csv(BytesIO(data), encoding='latin-1') + + data = b"word,length\n" b"Tr\xc3\xa4umen,7\n" b"Gr\xc3\xbc\xc3\x9fe,5" + data = data.decode("utf8").encode("latin-1") + df = pd.read_csv(BytesIO(data), encoding="latin-1") df - df['word'][1] + df["word"][1] Some formats which encode all characters as multiple bytes, like UTF-16, won't parse correctly at all without specifying the encoding. `Full list of Python @@ -774,16 +738,12 @@ first column will be used as the ``DataFrame``'s row names: .. ipython:: python - data = ('a,b,c\n' - '4,apple,bat,5.7\n' - '8,orange,cow,10') + data = "a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" pd.read_csv(StringIO(data)) .. ipython:: python - data = ('index,a,b,c\n' - '4,apple,bat,5.7\n' - '8,orange,cow,10') + data = "index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10" pd.read_csv(StringIO(data), index_col=0) Ordinarily, you can achieve this behavior using the ``index_col`` option. @@ -794,9 +754,7 @@ index column inference and discard the last column, pass ``index_col=False``: .. ipython:: python - data = ('a,b,c\n' - '4,apple,bat,\n' - '8,orange,cow,') + data = "a,b,c\n4,apple,bat,\n8,orange,cow," print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), index_col=False) @@ -806,12 +764,10 @@ If a subset of data is being parsed using the ``usecols`` option, the .. 
ipython:: python - data = ('a,b,c\n' - '4,apple,bat,\n' - '8,orange,cow,') + data = "a,b,c\n4,apple,bat,\n8,orange,cow," print(data) - pd.read_csv(StringIO(data), usecols=['b', 'c']) - pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0) + pd.read_csv(StringIO(data), usecols=["b", "c"]) + pd.read_csv(StringIO(data), usecols=["b", "c"], index_col=0) .. _io.parse_dates: @@ -831,14 +787,14 @@ The simplest case is to just pass in ``parse_dates=True``: .. ipython:: python :suppress: - f = open('foo.csv', 'w') - f.write('date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f = open("foo.csv", "w") + f.write("date,A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5") f.close() .. ipython:: python # Use a column as an index, and parse it as dates. - df = pd.read_csv('foo.csv', index_col=0, parse_dates=True) + df = pd.read_csv("foo.csv", index_col=0, parse_dates=True) df # These are Python datetime objects @@ -856,20 +812,22 @@ column names: .. ipython:: python :suppress: - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + data = ( + "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900" + ) - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) .. 
ipython:: python - print(open('tmp.csv').read()) - df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]]) + print(open("tmp.csv").read()) + df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]]) df By default the parser removes the component date columns, but you can choose @@ -877,8 +835,9 @@ to retain them via the ``keep_date_col`` keyword: .. ipython:: python - df = pd.read_csv('tmp.csv', header=None, parse_dates=[[1, 2], [1, 3]], - keep_date_col=True) + df = pd.read_csv( + "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True + ) df Note that if you wish to combine multiple columns into a single date column, a @@ -891,8 +850,8 @@ You can also use a dict to specify custom name columns: .. ipython:: python - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec) + date_spec = {"nominal": [1, 2], "actual": [1, 3]} + df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec) df It is important to remember that if multiple text columns are to be parsed into @@ -903,9 +862,10 @@ data columns: .. ipython:: python - date_spec = {'nominal': [1, 2], 'actual': [1, 3]} - df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - index_col=0) # index is the nominal column + date_spec = {"nominal": [1, 2], "actual": [1, 3]} + df = pd.read_csv( + "tmp.csv", header=None, parse_dates=date_spec, index_col=0 + ) # index is the nominal column df .. note:: @@ -929,8 +889,9 @@ take full advantage of the flexibility of the date parsing API: .. ipython:: python - df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - date_parser=pd.to_datetime) + df = pd.read_csv( + "tmp.csv", header=None, parse_dates=date_spec, date_parser=pd.to_datetime + ) df Pandas will try to call the ``date_parser`` function in three different ways. If @@ -957,7 +918,7 @@ Note that performance-wise, you should try these methods of parsing dates in ord .. 
ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.csv.mixed_timezones: @@ -976,17 +937,20 @@ an object-dtype column with strings, even with ``parse_dates``. a 2000-01-01T00:00:00+05:00 2000-01-01T00:00:00+06:00""" - df = pd.read_csv(StringIO(content), parse_dates=['a']) - df['a'] + df = pd.read_csv(StringIO(content), parse_dates=["a"]) + df["a"] To parse the mixed-timezone values as a datetime column, pass a partially-applied :func:`to_datetime` with ``utc=True`` as the ``date_parser``. .. ipython:: python - df = pd.read_csv(StringIO(content), parse_dates=['a'], - date_parser=lambda col: pd.to_datetime(col, utc=True)) - df['a'] + df = pd.read_csv( + StringIO(content), + parse_dates=["a"], + date_parser=lambda col: pd.to_datetime(col, utc=True), + ) + df["a"] .. _io.dayfirst: @@ -1022,14 +986,13 @@ Note that ``infer_datetime_format`` is sensitive to ``dayfirst``. With .. ipython:: python # Try to infer the format for the index column - df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, - infer_datetime_format=True) + df = pd.read_csv("foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True) df .. ipython:: python :suppress: - os.remove('foo.csv') + os.remove("foo.csv") International date formats ++++++++++++++++++++++++++ @@ -1040,19 +1003,16 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: .. ipython:: python :suppress: - data = ("date,value,cat\n" - "1/6/2000,5,a\n" - "2/6/2000,10,b\n" - "3/6/2000,15,c") - with open('tmp.csv', 'w') as fh: + data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + with open("tmp.csv", "w") as fh: fh.write(data) .. 
ipython:: python - print(open('tmp.csv').read()) + print(open("tmp.csv").read()) - pd.read_csv('tmp.csv', parse_dates=[0]) - pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0]) + pd.read_csv("tmp.csv", parse_dates=[0]) + pd.read_csv("tmp.csv", dayfirst=True, parse_dates=[0]) Writing CSVs to binary file objects +++++++++++++++++++++++++++++++++++ @@ -1084,14 +1044,16 @@ writing to a file). For example: .. ipython:: python - val = '0.3066101993807095471566981359501369297504425048828125' - data = 'a,b,c\n1,2,{0}'.format(val) - abs(pd.read_csv(StringIO(data), engine='c', - float_precision=None)['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', - float_precision='high')['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', - float_precision='round_trip')['c'][0] - float(val)) + val = "0.3066101993807095471566981359501369297504425048828125" + data = "a,b,c\n1,2,{0}".format(val) + abs(pd.read_csv(StringIO(data), engine="c", float_precision=None)["c"][0] - float(val)) + abs( + pd.read_csv(StringIO(data), engine="c", float_precision="high")["c"][0] - float(val) + ) + abs( + pd.read_csv(StringIO(data), engine="c", float_precision="round_trip")["c"][0] + - float(val) + ) .. _io.thousands: @@ -1106,20 +1068,22 @@ correctly: .. ipython:: python :suppress: - data = ("ID|level|category\n" - "Patient1|123,000|x\n" - "Patient2|23,000|y\n" - "Patient3|1,234,018|z") + data = ( + "ID|level|category\n" + "Patient1|123,000|x\n" + "Patient2|23,000|y\n" + "Patient3|1,234,018|z" + ) - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) By default, numbers with a thousands separator will be parsed as strings: .. ipython:: python - print(open('tmp.csv').read()) - df = pd.read_csv('tmp.csv', sep='|') + print(open("tmp.csv").read()) + df = pd.read_csv("tmp.csv", sep="|") df df.level.dtype @@ -1128,8 +1092,8 @@ The ``thousands`` keyword allows integers to be parsed correctly: .. 
ipython:: python - print(open('tmp.csv').read()) - df = pd.read_csv('tmp.csv', sep='|', thousands=',') + print(open("tmp.csv").read()) + df = pd.read_csv("tmp.csv", sep="|", thousands=",") df df.level.dtype @@ -1137,7 +1101,7 @@ The ``thousands`` keyword allows integers to be parsed correctly: .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.na_values: @@ -1162,7 +1126,7 @@ Let us consider some examples: .. code-block:: python - pd.read_csv('path_to_file.csv', na_values=[5]) + pd.read_csv("path_to_file.csv", na_values=[5]) In the example above ``5`` and ``5.0`` will be recognized as ``NaN``, in addition to the defaults. A string will first be interpreted as a numerical @@ -1170,19 +1134,19 @@ addition to the defaults. A string will first be interpreted as a numerical .. code-block:: python - pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=[""]) + pd.read_csv("path_to_file.csv", keep_default_na=False, na_values=[""]) Above, only an empty field will be recognized as ``NaN``. .. code-block:: python - pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=["NA", "0"]) + pd.read_csv("path_to_file.csv", keep_default_na=False, na_values=["NA", "0"]) Above, both ``NA`` and ``0`` as strings are ``NaN``. .. code-block:: python - pd.read_csv('path_to_file.csv', na_values=["Nope"]) + pd.read_csv("path_to_file.csv", na_values=["Nope"]) The default values, in addition to the string ``"Nope"`` are recognized as ``NaN``. @@ -1205,19 +1169,16 @@ as a ``Series``: .. ipython:: python :suppress: - data = ("level\n" - "Patient1,123000\n" - "Patient2,23000\n" - "Patient3,1234018") + data = "level\nPatient1,123000\nPatient2,23000\nPatient3,1234018" - with open('tmp.csv', 'w') as fh: + with open("tmp.csv", "w") as fh: fh.write(data) .. 
ipython:: python - print(open('tmp.csv').read()) + print(open("tmp.csv").read()) - output = pd.read_csv('tmp.csv', squeeze=True) + output = pd.read_csv("tmp.csv", squeeze=True) output type(output) @@ -1225,7 +1186,7 @@ as a ``Series``: .. ipython:: python :suppress: - os.remove('tmp.csv') + os.remove("tmp.csv") .. _io.boolean: @@ -1239,12 +1200,10 @@ options as follows: .. ipython:: python - data = ('a,b,c\n' - '1,Yes,2\n' - '3,No,4') + data = "a,b,c\n1,Yes,2\n3,No,4" print(data) pd.read_csv(StringIO(data)) - pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No']) + pd.read_csv(StringIO(data), true_values=["Yes"], false_values=["No"]) .. _io.bad_lines: @@ -1258,10 +1217,7 @@ too many fields will raise an error by default: .. ipython:: python :okexcept: - data = ('a,b,c\n' - '1,2,3\n' - '4,5,6,7\n' - '8,9,10') + data = "a,b,c\n1,2,3\n4,5,6,7\n8,9,10" pd.read_csv(StringIO(data)) You can elect to skip bad lines: @@ -1301,9 +1257,7 @@ or a :class:`python:csv.Dialect` instance. .. ipython:: python :suppress: - data = ('label1,label2,label3\n' - 'index1,"a,c,e\n' - 'index2,b,d,f') + data = "label1,label2,label3\n" 'index1,"a,c,e\n' "index2,b,d,f" Suppose you had data with unenclosed quotes: @@ -1321,6 +1275,7 @@ We can get around this using ``dialect``: :okwarning: import csv + dia = csv.excel() dia.quoting = csv.QUOTE_NONE pd.read_csv(StringIO(data), dialect=dia) @@ -1329,15 +1284,15 @@ All of the dialect options can be specified separately by keyword arguments: .. ipython:: python - data = 'a,b,c~1,2,3~4,5,6' - pd.read_csv(StringIO(data), lineterminator='~') + data = "a,b,c~1,2,3~4,5,6" + pd.read_csv(StringIO(data), lineterminator="~") Another common dialect option is ``skipinitialspace``, to skip any whitespace after a delimiter: .. 
ipython:: python - data = 'a, b, c\n1, 2, 3\n4, 5, 6' + data = "a, b, c\n1, 2, 3\n4, 5, 6" print(data) pd.read_csv(StringIO(data), skipinitialspace=True) @@ -1359,7 +1314,7 @@ should pass the ``escapechar`` option: data = 'a,b\n"hello, \\"Bob\\", nice to see you",5' print(data) - pd.read_csv(StringIO(data), escapechar='\\') + pd.read_csv(StringIO(data), escapechar="\\") .. _io.fwf_reader: .. _io.fwf: @@ -1386,12 +1341,14 @@ a different usage of the ``delimiter`` parameter: .. ipython:: python :suppress: - f = open('bar.csv', 'w') - data1 = ("id8141 360.242940 149.910199 11950.7\n" - "id1594 444.953632 166.985655 11788.4\n" - "id1849 364.136849 183.628767 11806.2\n" - "id1230 413.836124 184.375703 11916.8\n" - "id1948 502.953953 173.237159 12468.3") + f = open("bar.csv", "w") + data1 = ( + "id8141 360.242940 149.910199 11950.7\n" + "id1594 444.953632 166.985655 11788.4\n" + "id1849 364.136849 183.628767 11806.2\n" + "id1230 413.836124 184.375703 11916.8\n" + "id1948 502.953953 173.237159 12468.3" + ) f.write(data1) f.close() @@ -1399,7 +1356,7 @@ Consider a typical fixed-width data file: .. ipython:: python - print(open('bar.csv').read()) + print(open("bar.csv").read()) In order to parse this file into a ``DataFrame``, we simply need to supply the column specifications to the ``read_fwf`` function along with the file name: @@ -1408,7 +1365,7 @@ column specifications to the ``read_fwf`` function along with the file name: # Column specifications are a list of half-intervals colspecs = [(0, 6), (8, 20), (21, 33), (34, 43)] - df = pd.read_fwf('bar.csv', colspecs=colspecs, header=None, index_col=0) + df = pd.read_fwf("bar.csv", colspecs=colspecs, header=None, index_col=0) df Note how the parser automatically picks column names X. 
when @@ -1419,7 +1376,7 @@ column widths for contiguous columns: # Widths are a list of integers widths = [6, 14, 13, 10] - df = pd.read_fwf('bar.csv', widths=widths, header=None) + df = pd.read_fwf("bar.csv", widths=widths, header=None) df The parser will take care of extra white spaces around the columns @@ -1432,7 +1389,7 @@ is whitespace). .. ipython:: python - df = pd.read_fwf('bar.csv', header=None, index_col=0) + df = pd.read_fwf("bar.csv", header=None, index_col=0) df ``read_fwf`` supports the ``dtype`` parameter for specifying the types of @@ -1440,13 +1397,13 @@ parsed columns to be different from the inferred type. .. ipython:: python - pd.read_fwf('bar.csv', header=None, index_col=0).dtypes - pd.read_fwf('bar.csv', header=None, dtype={2: 'object'}).dtypes + pd.read_fwf("bar.csv", header=None, index_col=0).dtypes + pd.read_fwf("bar.csv", header=None, dtype={2: "object"}).dtypes .. ipython:: python :suppress: - os.remove('bar.csv') + os.remove("bar.csv") Indexes @@ -1458,8 +1415,8 @@ Files with an "implicit" index column .. ipython:: python :suppress: - f = open('foo.csv', 'w') - f.write('A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5') + f = open("foo.csv", "w") + f.write("A,B,C\n20090101,a,1,2\n20090102,b,3,4\n20090103,c,4,5") f.close() Consider a file with one less entry in the header than the number of data @@ -1467,27 +1424,27 @@ column: .. ipython:: python - print(open('foo.csv').read()) + print(open("foo.csv").read()) In this special case, ``read_csv`` assumes that the first column is to be used as the index of the ``DataFrame``: .. ipython:: python - pd.read_csv('foo.csv') + pd.read_csv("foo.csv") Note that the dates weren't automatically parsed. In that case you would need to do as before: .. ipython:: python - df = pd.read_csv('foo.csv', parse_dates=True) + df = pd.read_csv("foo.csv", parse_dates=True) df.index .. 
ipython:: python :suppress: - os.remove('foo.csv') + os.remove("foo.csv") Reading an index with a ``MultiIndex`` @@ -1499,7 +1456,7 @@ Suppose you have data indexed by two columns: .. ipython:: python - print(open('data/mindex_ex.csv').read()) + print(open("data/mindex_ex.csv").read()) The ``index_col`` argument to ``read_csv`` can take a list of column numbers to turn multiple columns into a ``MultiIndex`` for the index of the @@ -1523,10 +1480,11 @@ rows will skip the intervening rows. .. ipython:: python from pandas._testing import makeCustomDataframe as mkdf + df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - df.to_csv('mi.csv') - print(open('mi.csv').read()) - pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1]) + df.to_csv("mi.csv") + print(open("mi.csv").read()) + pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1]) ``read_csv`` is also able to interpret a more common format of multi-columns indices. @@ -1535,14 +1493,14 @@ of multi-columns indices. :suppress: data = ",a,a,a,b,c,c\n,q,r,s,t,u,v\none,1,2,3,4,5,6\ntwo,7,8,9,10,11,12" - fh = open('mi2.csv', 'w') + fh = open("mi2.csv", "w") fh.write(data) fh.close() .. ipython:: python - print(open('mi2.csv').read()) - pd.read_csv('mi2.csv', header=[0, 1], index_col=0) + print(open("mi2.csv").read()) + pd.read_csv("mi2.csv", header=[0, 1], index_col=0) Note: If an ``index_col`` is not specified (e.g. you don't have an index, or wrote it with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index will be *lost*. @@ -1550,8 +1508,8 @@ with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index wi .. ipython:: python :suppress: - os.remove('mi.csv') - os.remove('mi2.csv') + os.remove("mi.csv") + os.remove("mi2.csv") .. _io.sniff: @@ -1566,13 +1524,13 @@ class of the csv module. For this, you have to specify ``sep=None``. 
:suppress: df = pd.DataFrame(np.random.randn(10, 4)) - df.to_csv('tmp.sv', sep='|') - df.to_csv('tmp2.sv', sep=':') + df.to_csv("tmp.sv", sep="|") + df.to_csv("tmp2.sv", sep=":") .. ipython:: python - print(open('tmp2.sv').read()) - pd.read_csv('tmp2.sv', sep=None, engine='python') + print(open("tmp2.sv").read()) + pd.read_csv("tmp2.sv", sep=None, engine="python") .. _io.multiple_files: @@ -1593,8 +1551,8 @@ rather than reading the entire file into memory, such as the following: .. ipython:: python - print(open('tmp.sv').read()) - table = pd.read_csv('tmp.sv', sep='|') + print(open("tmp.sv").read()) + table = pd.read_csv("tmp.sv", sep="|") table @@ -1603,7 +1561,7 @@ value will be an iterable object of type ``TextFileReader``: .. ipython:: python - reader = pd.read_csv('tmp.sv', sep='|', chunksize=4) + reader = pd.read_csv("tmp.sv", sep="|", chunksize=4) reader for chunk in reader: @@ -1614,14 +1572,14 @@ Specifying ``iterator=True`` will also return the ``TextFileReader`` object: .. ipython:: python - reader = pd.read_csv('tmp.sv', sep='|', iterator=True) + reader = pd.read_csv("tmp.sv", sep="|", iterator=True) reader.get_chunk(5) .. ipython:: python :suppress: - os.remove('tmp.sv') - os.remove('tmp2.sv') + os.remove("tmp.sv") + os.remove("tmp2.sv") Specifying the parser engine '''''''''''''''''''''''''''' @@ -1649,8 +1607,7 @@ functions - the following example shows reading a CSV file: .. code-block:: python - df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item', - sep='\t') + df = pd.read_csv("https://download.bls.gov/pub/time.series/cu/cu.item", sep="\t") All URLs which are not local files or HTTP(s) are handled by `fsspec`_, if installed, and its various filesystem implementations @@ -1662,7 +1619,7 @@ S3 URLs require the `s3fs .. 
code-block:: python - df = pd.read_json('s3://pandas-test/adatafile.json') + df = pd.read_json("s3://pandas-test/adatafile.json") When dealing with remote storage systems, you might need extra configuration with environment variables or config files in @@ -1683,9 +1640,11 @@ specifying an anonymous connection, such as .. code-block:: python - pd.read_csv("s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013" - "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", - storage_options={"anon": True}) + pd.read_csv( + "s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013" + "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", + storage_options={"anon": True}, + ) ``fsspec`` also allows complex URLs, for accessing data in compressed archives, local caching of files, and more. To locally cache the above @@ -1693,9 +1652,11 @@ example, you would modify the call to .. code-block:: python - pd.read_csv("simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/" - "SaKe2013-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", - storage_options={"s3": {"anon": True}}) + pd.read_csv( + "simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/" + "SaKe2013-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv", + storage_options={"s3": {"anon": True}}, + ) where we specify that the "anon" parameter is meant for the "s3" part of the implementation, not to the caching implementation. Note that this caches to a temporary @@ -1819,7 +1780,7 @@ Note ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datet .. ipython:: python - dfj = pd.DataFrame(np.random.randn(5, 2), columns=list('AB')) + dfj = pd.DataFrame(np.random.randn(5, 2), columns=list("AB")) json = dfj.to_json() json @@ -1831,10 +1792,13 @@ file / string. Consider the following ``DataFrame`` and ``Series``: .. 
ipython:: python - dfjo = pd.DataFrame(dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), - columns=list('ABC'), index=list('xyz')) + dfjo = pd.DataFrame( + dict(A=range(1, 4), B=range(4, 7), C=range(7, 10)), + columns=list("ABC"), + index=list("xyz"), + ) dfjo - sjo = pd.Series(dict(x=15, y=16, z=17), name='D') + sjo = pd.Series(dict(x=15, y=16, z=17), name="D") sjo **Column oriented** (the default for ``DataFrame``) serializes the data as @@ -1894,24 +1858,24 @@ Writing in ISO date format: .. ipython:: python - dfd = pd.DataFrame(np.random.randn(5, 2), columns=list('AB')) - dfd['date'] = pd.Timestamp('20130101') + dfd = pd.DataFrame(np.random.randn(5, 2), columns=list("AB")) + dfd["date"] = pd.Timestamp("20130101") dfd = dfd.sort_index(1, ascending=False) - json = dfd.to_json(date_format='iso') + json = dfd.to_json(date_format="iso") json Writing in ISO date format, with microseconds: .. ipython:: python - json = dfd.to_json(date_format='iso', date_unit='us') + json = dfd.to_json(date_format="iso", date_unit="us") json Epoch timestamps, in seconds: .. ipython:: python - json = dfd.to_json(date_format='epoch', date_unit='s') + json = dfd.to_json(date_format="epoch", date_unit="s") json Writing to a file, with a date index and a date column: @@ -1919,13 +1883,13 @@ Writing to a file, with a date index and a date column: .. ipython:: python dfj2 = dfj.copy() - dfj2['date'] = pd.Timestamp('20130101') - dfj2['ints'] = list(range(5)) - dfj2['bools'] = True - dfj2.index = pd.date_range('20130101', periods=5) - dfj2.to_json('test.json') + dfj2["date"] = pd.Timestamp("20130101") + dfj2["ints"] = list(range(5)) + dfj2["bools"] = True + dfj2.index = pd.date_range("20130101", periods=5) + dfj2.to_json("test.json") - with open('test.json') as fh: + with open("test.json") as fh: print(fh.read()) Fallback behavior @@ -2060,26 +2024,27 @@ Reading from a file: .. 
ipython:: python - pd.read_json('test.json') + pd.read_json("test.json") Don't convert any data (but still convert axes and dates): .. ipython:: python - pd.read_json('test.json', dtype=object).dtypes + pd.read_json("test.json", dtype=object).dtypes Specify dtypes for conversion: .. ipython:: python - pd.read_json('test.json', dtype={'A': 'float32', 'bools': 'int8'}).dtypes + pd.read_json("test.json", dtype={"A": "float32", "bools": "int8"}).dtypes Preserve string indices: .. ipython:: python - si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)), - index=[str(i) for i in range(4)]) + si = pd.DataFrame( + np.zeros((4, 4)), columns=list(range(4)), index=[str(i) for i in range(4)] + ) si si.index si.columns @@ -2094,10 +2059,10 @@ Dates written in nanoseconds need to be read back in nanoseconds: .. ipython:: python - json = dfj2.to_json(date_unit='ns') + json = dfj2.to_json(date_unit="ns") # Try to parse timestamps as milliseconds -> Won't Work - dfju = pd.read_json(json, date_unit='ms') + dfju = pd.read_json(json, date_unit="ms") dfju # Let pandas detect the correct precision @@ -2105,7 +2070,7 @@ Dates written in nanoseconds need to be read back in nanoseconds: dfju # Or specify that all timestamps are in nanoseconds - dfju = pd.read_json(json, date_unit='ns') + dfju = pd.read_json(json, date_unit="ns") dfju The Numpy parameter @@ -2127,7 +2092,7 @@ data: randfloats = np.random.uniform(-100, 1000, 10000) randfloats.shape = (1000, 10) - dffloats = pd.DataFrame(randfloats, columns=list('ABCDEFGHIJ')) + dffloats = pd.DataFrame(randfloats, columns=list("ABCDEFGHIJ")) jsonfloats = dffloats.to_json() @@ -2174,7 +2139,7 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python :suppress: - os.remove('test.json') + os.remove("test.json") .. _io.json_normalize: @@ -2186,38 +2151,54 @@ into a flat table. .. 
ipython:: python - data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}}, - {'name': {'given': 'Mose', 'family': 'Regner'}}, - {'id': 2, 'name': 'Faye Raker'}] + data = [ + {"id": 1, "name": {"first": "Coleen", "last": "Volk"}}, + {"name": {"given": "Mose", "family": "Regner"}}, + {"id": 2, "name": "Faye Raker"}, + ] pd.json_normalize(data) .. ipython:: python - data = [{'state': 'Florida', - 'shortname': 'FL', - 'info': {'governor': 'Rick Scott'}, - 'county': [{'name': 'Dade', 'population': 12345}, - {'name': 'Broward', 'population': 40000}, - {'name': 'Palm Beach', 'population': 60000}]}, - {'state': 'Ohio', - 'shortname': 'OH', - 'info': {'governor': 'John Kasich'}, - 'county': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}]}] - - pd.json_normalize(data, 'county', ['state', 'shortname', ['info', 'governor']]) + data = [ + { + "state": "Florida", + "shortname": "FL", + "info": {"governor": "Rick Scott"}, + "county": [ + {"name": "Dade", "population": 12345}, + {"name": "Broward", "population": 40000}, + {"name": "Palm Beach", "population": 60000}, + ], + }, + { + "state": "Ohio", + "shortname": "OH", + "info": {"governor": "John Kasich"}, + "county": [ + {"name": "Summit", "population": 1234}, + {"name": "Cuyahoga", "population": 1337}, + ], + }, + ] + + pd.json_normalize(data, "county", ["state", "shortname", ["info", "governor"]]) The max_level parameter provides more control over which level to end normalization. With max_level=1 the following snippet normalizes until 1st nesting level of the provided dict. .. ipython:: python - data = [{'CreatedBy': {'Name': 'User001'}, - 'Lookup': {'TextField': 'Some text', - 'UserField': {'Id': 'ID001', - 'Name': 'Name001'}}, - 'Image': {'a': 'b'} - }] + data = [ + { + "CreatedBy": {"Name": "User001"}, + "Lookup": { + "TextField": "Some text", + "UserField": {"Id": "ID001", "Name": "Name001"}, + }, + "Image": {"a": "b"}, + } + ] pd.json_normalize(data, max_level=1) .. 
_io.jsonl: @@ -2232,13 +2213,13 @@ For line-delimited json files, pandas can also return an iterator which reads in .. ipython:: python - jsonl = ''' + jsonl = """ {"a": 1, "b": 2} {"a": 3, "b": 4} - ''' + """ df = pd.read_json(jsonl, lines=True) df - df.to_json(orient='records', lines=True) + df.to_json(orient="records", lines=True) # reader is an iterator that returns ``chunksize`` lines each iteration reader = pd.read_json(StringIO(jsonl), lines=True, chunksize=1) @@ -2258,12 +2239,16 @@ a JSON string with two fields, ``schema`` and ``data``. .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, - index=pd.Index(range(3), name='idx')) + df = pd.DataFrame( + { + "A": [1, 2, 3], + "B": ["a", "b", "c"], + "C": pd.date_range("2016-01-01", freq="d", periods=3), + }, + index=pd.Index(range(3), name="idx"), + ) df - df.to_json(orient='table', date_format="iso") + df.to_json(orient="table", date_format="iso") The ``schema`` field contains the ``fields`` key, which itself contains a list of column name to type pairs, including the ``Index`` or ``MultiIndex`` @@ -2302,7 +2287,8 @@ A few notes on the generated table schema: .. ipython:: python from pandas.io.json import build_table_schema - s = pd.Series(pd.date_range('2016', periods=4)) + + s = pd.Series(pd.date_range("2016", periods=4)) build_table_schema(s) * datetimes with a timezone (before serializing), include an additional field @@ -2310,8 +2296,7 @@ A few notes on the generated table schema: .. ipython:: python - s_tz = pd.Series(pd.date_range('2016', periods=12, - tz='US/Central')) + s_tz = pd.Series(pd.date_range("2016", periods=12, tz="US/Central")) build_table_schema(s_tz) * Periods are converted to timestamps before serialization, and so have the @@ -2320,8 +2305,7 @@ A few notes on the generated table schema: .. 
ipython:: python - s_per = pd.Series(1, index=pd.period_range('2016', freq='A-DEC', - periods=4)) + s_per = pd.Series(1, index=pd.period_range("2016", freq="A-DEC", periods=4)) build_table_schema(s_per) * Categoricals use the ``any`` type and an ``enum`` constraint listing @@ -2329,7 +2313,7 @@ A few notes on the generated table schema: .. ipython:: python - s_cat = pd.Series(pd.Categorical(['a', 'b', 'a'])) + s_cat = pd.Series(pd.Categorical(["a", "b", "a"])) build_table_schema(s_cat) * A ``primaryKey`` field, containing an array of labels, is included @@ -2345,8 +2329,7 @@ A few notes on the generated table schema: .. ipython:: python - s_multi = pd.Series(1, index=pd.MultiIndex.from_product([('a', 'b'), - (0, 1)])) + s_multi = pd.Series(1, index=pd.MultiIndex.from_product([("a", "b"), (0, 1)])) build_table_schema(s_multi) * The default naming roughly follows these rules: @@ -2366,16 +2349,20 @@ round-trippable manner. .. ipython:: python - df = pd.DataFrame({'foo': [1, 2, 3, 4], - 'bar': ['a', 'b', 'c', 'd'], - 'baz': pd.date_range('2018-01-01', freq='d', periods=4), - 'qux': pd.Categorical(['a', 'b', 'c', 'c']) - }, index=pd.Index(range(4), name='idx')) + df = pd.DataFrame( + { + "foo": [1, 2, 3, 4], + "bar": ["a", "b", "c", "d"], + "baz": pd.date_range("2018-01-01", freq="d", periods=4), + "qux": pd.Categorical(["a", "b", "c", "c"]), + }, + index=pd.Index(range(4), name="idx"), + ) df df.dtypes - df.to_json('test.json', orient='table') - new_df = pd.read_json('test.json', orient='table') + df.to_json("test.json", orient="table") + new_df = pd.read_json("test.json", orient="table") new_df new_df.dtypes @@ -2387,15 +2374,15 @@ indicate missing values and the subsequent read cannot distinguish the intent. .. 
ipython:: python :okwarning: - df.index.name = 'index' - df.to_json('test.json', orient='table') - new_df = pd.read_json('test.json', orient='table') + df.index.name = "index" + df.to_json("test.json", orient="table") + new_df = pd.read_json("test.json", orient="table") print(new_df.index.name) .. ipython:: python :suppress: - os.remove('test.json') + os.remove("test.json") .. _Table Schema: https://specs.frictionlessdata.io/table-schema/ @@ -2425,7 +2412,7 @@ Read a URL with no options: .. ipython:: python - url = 'https://www.fdic.gov/bank/individual/failed/banklist.html' + url = "https://www.fdic.gov/bank/individual/failed/banklist.html" dfs = pd.read_html(url) dfs @@ -2440,11 +2427,11 @@ as a string: .. ipython:: python :suppress: - file_path = os.path.abspath(os.path.join('source', '_static', 'banklist.html')) + file_path = os.path.abspath(os.path.join("source", "_static", "banklist.html")) .. ipython:: python - with open(file_path, 'r') as f: + with open(file_path, "r") as f: dfs = pd.read_html(f.read()) dfs @@ -2452,7 +2439,7 @@ You can even pass in an instance of ``StringIO`` if you so desire: .. ipython:: python - with open(file_path, 'r') as f: + with open(file_path, "r") as f: sio = StringIO(f.read()) dfs = pd.read_html(sio) @@ -2471,7 +2458,7 @@ Read a URL and match a table that contains specific text: .. code-block:: python - match = 'Metcalf Bank' + match = "Metcalf Bank" df_list = pd.read_html(url, match=match) Specify a header row (by default ``
    `` or ```` elements located within a @@ -2506,15 +2493,15 @@ Specify an HTML attribute: .. code-block:: python - dfs1 = pd.read_html(url, attrs={'id': 'table'}) - dfs2 = pd.read_html(url, attrs={'class': 'sortable'}) + dfs1 = pd.read_html(url, attrs={"id": "table"}) + dfs2 = pd.read_html(url, attrs={"class": "sortable"}) print(np.array_equal(dfs1[0], dfs2[0])) # Should be True Specify values that should be converted to NaN: .. code-block:: python - dfs = pd.read_html(url, na_values=['No Acquirer']) + dfs = pd.read_html(url, na_values=["No Acquirer"]) Specify whether to keep the default set of NaN values: @@ -2529,22 +2516,21 @@ columns to strings. .. code-block:: python - url_mcc = 'https://en.wikipedia.org/wiki/Mobile_country_code' - dfs = pd.read_html(url_mcc, match='Telekom Albania', header=0, - converters={'MNC': str}) + url_mcc = "https://en.wikipedia.org/wiki/Mobile_country_code" + dfs = pd.read_html(url_mcc, match="Telekom Albania", header=0, converters={"MNC": str}) Use some combination of the above: .. code-block:: python - dfs = pd.read_html(url, match='Metcalf Bank', index_col=0) + dfs = pd.read_html(url, match="Metcalf Bank", index_col=0) Read in pandas ``to_html`` output (with some loss of floating point precision): .. code-block:: python df = pd.DataFrame(np.random.randn(2, 2)) - s = df.to_html(float_format='{0:.40g}'.format) + s = df.to_html(float_format="{0:.40g}".format) dfin = pd.read_html(s, index_col=0) The ``lxml`` backend will raise an error on a failed parse if that is the only @@ -2554,13 +2540,13 @@ for example, the function expects a sequence of strings. You may use: .. code-block:: python - dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml']) + dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml"]) Or you could pass ``flavor='lxml'`` without a list: .. 
code-block:: python - dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor='lxml') + dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor="lxml") However, if you have bs4 and html5lib installed and pass ``None`` or ``['lxml', 'bs4']`` then the parse will most likely succeed. Note that *as soon as a parse @@ -2568,7 +2554,7 @@ succeeds, the function will return*. .. code-block:: python - dfs = pd.read_html(url, 'Metcalf Bank', index_col=0, flavor=['lxml', 'bs4']) + dfs = pd.read_html(url, "Metcalf Bank", index_col=0, flavor=["lxml", "bs4"]) .. _io.html: @@ -2590,8 +2576,8 @@ in the method ``to_string`` described above. :suppress: def write_html(df, filename, *args, **kwargs): - static = os.path.abspath(os.path.join('source', '_static')) - with open(os.path.join(static, filename + '.html'), 'w') as f: + static = os.path.abspath(os.path.join("source", "_static")) + with open(os.path.join(static, filename + ".html"), "w") as f: df.to_html(f, *args, **kwargs) .. ipython:: python @@ -2603,7 +2589,7 @@ in the method ``to_string`` described above. .. ipython:: python :suppress: - write_html(df, 'basic') + write_html(df, "basic") HTML: @@ -2619,7 +2605,7 @@ The ``columns`` argument will limit the columns shown: .. ipython:: python :suppress: - write_html(df, 'columns', columns=[0]) + write_html(df, "columns", columns=[0]) HTML: @@ -2631,12 +2617,12 @@ point values: .. ipython:: python - print(df.to_html(float_format='{0:.10f}'.format)) + print(df.to_html(float_format="{0:.10f}".format)) .. ipython:: python :suppress: - write_html(df, 'float_format', float_format='{0:.10f}'.format) + write_html(df, "float_format", float_format="{0:.10f}".format) HTML: @@ -2653,7 +2639,7 @@ off: .. ipython:: python :suppress: - write_html(df, 'nobold', bold_rows=False) + write_html(df, "nobold", bold_rows=False) .. raw:: html :file: ../_static/nobold.html @@ -2664,7 +2650,7 @@ table CSS classes. Note that these classes are *appended* to the existing .. 
ipython:: python - print(df.to_html(classes=['awesome_table_class', 'even_more_awesome_class'])) + print(df.to_html(classes=["awesome_table_class", "even_more_awesome_class"])) The ``render_links`` argument provides the ability to add hyperlinks to cells that contain URLs. @@ -2673,15 +2659,18 @@ that contain URLs. .. ipython:: python - url_df = pd.DataFrame({ - 'name': ['Python', 'Pandas'], - 'url': ['https://www.python.org/', 'https://pandas.pydata.org']}) + url_df = pd.DataFrame( + { + "name": ["Python", "Pandas"], + "url": ["https://www.python.org/", "https://pandas.pydata.org"], + } + ) print(url_df.to_html(render_links=True)) .. ipython:: python :suppress: - write_html(url_df, 'render_links', render_links=True) + write_html(url_df, "render_links", render_links=True) HTML: @@ -2694,14 +2683,14 @@ Finally, the ``escape`` argument allows you to control whether the .. ipython:: python - df = pd.DataFrame({'a': list('&<>'), 'b': np.random.randn(3)}) + df = pd.DataFrame({"a": list("&<>"), "b": np.random.randn(3)}) .. ipython:: python :suppress: - write_html(df, 'escape') - write_html(df, 'noescape', escape=False) + write_html(df, "escape") + write_html(df, "noescape", escape=False) Escaped: @@ -2828,7 +2817,7 @@ file, and the ``sheet_name`` indicating which sheet to parse. .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls', sheet_name='Sheet1') + pd.read_excel("path_to_file.xls", sheet_name="Sheet1") .. _io.excel.excelfile_class: @@ -2843,16 +2832,16 @@ read into memory only once. .. code-block:: python - xlsx = pd.ExcelFile('path_to_file.xls') - df = pd.read_excel(xlsx, 'Sheet1') + xlsx = pd.ExcelFile("path_to_file.xls") + df = pd.read_excel(xlsx, "Sheet1") The ``ExcelFile`` class can also be used as a context manager. .. 
code-block:: python - with pd.ExcelFile('path_to_file.xls') as xls: - df1 = pd.read_excel(xls, 'Sheet1') - df2 = pd.read_excel(xls, 'Sheet2') + with pd.ExcelFile("path_to_file.xls") as xls: + df1 = pd.read_excel(xls, "Sheet1") + df2 = pd.read_excel(xls, "Sheet2") The ``sheet_names`` property will generate a list of the sheet names in the file. @@ -2864,10 +2853,9 @@ different parameters: data = {} # For when Sheet1's format differs from Sheet2 - with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, - na_values=['NA']) - data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=1) + with pd.ExcelFile("path_to_file.xls") as xls: + data["Sheet1"] = pd.read_excel(xls, "Sheet1", index_col=None, na_values=["NA"]) + data["Sheet2"] = pd.read_excel(xls, "Sheet2", index_col=1) Note that if the same parsing parameters are used for all sheets, a list of sheet names can simply be passed to ``read_excel`` with no loss in performance. @@ -2876,15 +2864,14 @@ of sheet names can simply be passed to ``read_excel`` with no loss in performanc # using the ExcelFile class data = {} - with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, - na_values=['NA']) - data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=None, - na_values=['NA']) + with pd.ExcelFile("path_to_file.xls") as xls: + data["Sheet1"] = pd.read_excel(xls, "Sheet1", index_col=None, na_values=["NA"]) + data["Sheet2"] = pd.read_excel(xls, "Sheet2", index_col=None, na_values=["NA"]) # equivalent using the read_excel function - data = pd.read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], - index_col=None, na_values=['NA']) + data = pd.read_excel( + "path_to_file.xls", ["Sheet1", "Sheet2"], index_col=None, na_values=["NA"] + ) ``ExcelFile`` can also be called with a ``xlrd.book.Book`` object as a parameter. This allows the user to control how the excel file is read. @@ -2894,10 +2881,11 @@ with ``on_demand=True``. 
.. code-block:: python import xlrd - xlrd_book = xlrd.open_workbook('path_to_file.xls', on_demand=True) + + xlrd_book = xlrd.open_workbook("path_to_file.xls", on_demand=True) with pd.ExcelFile(xlrd_book) as xls: - df1 = pd.read_excel(xls, 'Sheet1') - df2 = pd.read_excel(xls, 'Sheet2') + df1 = pd.read_excel(xls, "Sheet1") + df2 = pd.read_excel(xls, "Sheet2") .. _io.excel.specifying_sheets: @@ -2919,35 +2907,35 @@ Specifying sheets .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + pd.read_excel("path_to_file.xls", "Sheet1", index_col=None, na_values=["NA"]) Using the sheet index: .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + pd.read_excel("path_to_file.xls", 0, index_col=None, na_values=["NA"]) Using all default values: .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xls') + pd.read_excel("path_to_file.xls") Using None to get all sheets: .. code-block:: python # Returns a dictionary of DataFrames - pd.read_excel('path_to_file.xls', sheet_name=None) + pd.read_excel("path_to_file.xls", sheet_name=None) Using a list to get multiple sheets: .. code-block:: python # Returns the 1st and 4th sheet, as a dictionary of DataFrames. - pd.read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) + pd.read_excel("path_to_file.xls", sheet_name=["Sheet1", 3]) ``read_excel`` can read more than one sheet, by setting ``sheet_name`` to either a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. @@ -2968,10 +2956,12 @@ For example, to read in a ``MultiIndex`` index without names: .. 
ipython:: python - df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, - index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']])) - df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) + df = pd.DataFrame( + {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]}, + index=pd.MultiIndex.from_product([["a", "b"], ["c", "d"]]), + ) + df.to_excel("path_to_file.xlsx") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df If the index has level names, they will parsed as well, using the same @@ -2979,9 +2969,9 @@ parameters. .. ipython:: python - df.index = df.index.set_names(['lvl1', 'lvl2']) - df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) + df.index = df.index.set_names(["lvl1", "lvl2"]) + df.to_excel("path_to_file.xlsx") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1]) df @@ -2990,16 +2980,15 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python - df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], - names=['c1', 'c2']) - df.to_excel('path_to_file.xlsx') - df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1]) + df.columns = pd.MultiIndex.from_product([["a"], ["b", "d"]], names=["c1", "c2"]) + df.to_excel("path_to_file.xlsx") + df = pd.read_excel("path_to_file.xlsx", index_col=[0, 1], header=[0, 1]) df .. ipython:: python :suppress: - os.remove('path_to_file.xlsx') + os.remove("path_to_file.xlsx") Parsing specific columns @@ -3018,14 +3007,14 @@ You can specify a comma-delimited set of Excel columns and ranges as a string: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols='A,C:E') + pd.read_excel("path_to_file.xls", "Sheet1", usecols="A,C:E") If ``usecols`` is a list of integers, then it is assumed to be the file column indices to be parsed. .. 
code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) + pd.read_excel("path_to_file.xls", "Sheet1", usecols=[0, 2, 3]) Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. @@ -3037,7 +3026,7 @@ document header row(s). Those strings define which columns will be parsed: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) + pd.read_excel("path_to_file.xls", "Sheet1", usecols=["foo", "bar"]) Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``. @@ -3048,7 +3037,7 @@ the column names, returning names where the callable function evaluates to ``Tru .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) + pd.read_excel("path_to_file.xls", "Sheet1", usecols=lambda x: x.isalpha()) Parsing dates +++++++++++++ @@ -3060,7 +3049,7 @@ use the ``parse_dates`` keyword to parse those strings to datetimes: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) + pd.read_excel("path_to_file.xls", "Sheet1", parse_dates=["date_strings"]) Cell converters @@ -3071,7 +3060,7 @@ option. For instance, to convert a column to boolean: .. code-block:: python - pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) + pd.read_excel("path_to_file.xls", "Sheet1", converters={"MyBools": bool}) This options handles missing values and treats exceptions in the converters as missing data. Transformations are applied cell by cell rather than to the @@ -3086,7 +3075,7 @@ missing data to recover integer dtype: return int(x) if x else -1 - pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + pd.read_excel("path_to_file.xls", "Sheet1", converters={"MyInts": cfun}) Dtype specifications ++++++++++++++++++++ @@ -3098,7 +3087,7 @@ no type inference, use the type ``str`` or ``object``. .. 
code-block:: python - pd.read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str}) + pd.read_excel("path_to_file.xls", dtype={"MyInts": "int64", "MyText": str}) .. _io.excel_writer: @@ -3116,7 +3105,7 @@ written. For example: .. code-block:: python - df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') + df.to_excel("path_to_file.xlsx", sheet_name="Sheet1") Files with a ``.xls`` extension will be written using ``xlwt`` and those with a ``.xlsx`` extension will be written using ``xlsxwriter`` (if available) or @@ -3129,16 +3118,16 @@ row instead of the first. You can place it in the first row by setting the .. code-block:: python - df.to_excel('path_to_file.xlsx', index_label='label', merge_cells=False) + df.to_excel("path_to_file.xlsx", index_label="label", merge_cells=False) In order to write separate ``DataFrames`` to separate sheets in a single Excel file, one can pass an :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python - with pd.ExcelWriter('path_to_file.xlsx') as writer: - df1.to_excel(writer, sheet_name='Sheet1') - df2.to_excel(writer, sheet_name='Sheet2') + with pd.ExcelWriter("path_to_file.xlsx") as writer: + df1.to_excel(writer, sheet_name="Sheet1") + df2.to_excel(writer, sheet_name="Sheet2") .. note:: @@ -3164,8 +3153,8 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` bio = BytesIO() # By setting the 'engine' in the ExcelWriter constructor. - writer = pd.ExcelWriter(bio, engine='xlsxwriter') - df.to_excel(writer, sheet_name='Sheet1') + writer = pd.ExcelWriter(bio, engine="xlsxwriter") + df.to_excel(writer, sheet_name="Sheet1") # Save the workbook writer.save() @@ -3214,16 +3203,17 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: .. code-block:: python # By setting the 'engine' in the DataFrame 'to_excel()' methods. 
- df.to_excel('path_to_file.xlsx', sheet_name='Sheet1', engine='xlsxwriter') + df.to_excel("path_to_file.xlsx", sheet_name="Sheet1", engine="xlsxwriter") # By setting the 'engine' in the ExcelWriter constructor. - writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') + writer = pd.ExcelWriter("path_to_file.xlsx", engine="xlsxwriter") # Or via pandas configuration. from pandas import options # noqa: E402 - options.io.excel.xlsx.writer = 'xlsxwriter' - df.to_excel('path_to_file.xlsx', sheet_name='Sheet1') + options.io.excel.xlsx.writer = "xlsxwriter" + + df.to_excel("path_to_file.xlsx", sheet_name="Sheet1") .. _io.excel.style: @@ -3254,7 +3244,7 @@ OpenDocument spreadsheets match what can be done for `Excel files`_ using .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.ods', engine='odf') + pd.read_excel("path_to_file.ods", engine="odf") .. note:: @@ -3277,7 +3267,7 @@ in files and will return floats instead. .. code-block:: python # Returns a DataFrame - pd.read_excel('path_to_file.xlsb', engine='pyxlsb') + pd.read_excel("path_to_file.xlsb", engine="pyxlsb") .. note:: @@ -3353,7 +3343,7 @@ All pandas objects are equipped with ``to_pickle`` methods which use Python's .. ipython:: python df - df.to_pickle('foo.pkl') + df.to_pickle("foo.pkl") The ``read_pickle`` function in the ``pandas`` namespace can be used to load any pickled pandas object (or any other pickled object) from file: @@ -3361,12 +3351,12 @@ any pickled pandas object (or any other pickled object) from file: .. ipython:: python - pd.read_pickle('foo.pkl') + pd.read_pickle("foo.pkl") .. ipython:: python :suppress: - os.remove('foo.pkl') + os.remove("foo.pkl") .. warning:: @@ -3400,10 +3390,13 @@ the underlying compression library. .. 
ipython:: python - df = pd.DataFrame({ - 'A': np.random.randn(1000), - 'B': 'foo', - 'C': pd.date_range('20130101', periods=1000, freq='s')}) + df = pd.DataFrame( + { + "A": np.random.randn(1000), + "B": "foo", + "C": pd.date_range("20130101", periods=1000, freq="s"), + } + ) df Using an explicit compression type: @@ -3438,10 +3431,7 @@ Passing options to the compression protocol in order to speed up compression: .. ipython:: python - df.to_pickle( - "data.pkl.gz", - compression={"method": "gzip", 'compresslevel': 1} - ) + df.to_pickle("data.pkl.gz", compression={"method": "gzip", "compresslevel": 1}) .. ipython:: python :suppress: @@ -3462,11 +3452,13 @@ Example pyarrow usage: .. code-block:: python - >>> import pandas as pd - >>> import pyarrow as pa - >>> df = pd.DataFrame({'A': [1, 2, 3]}) - >>> context = pa.default_serialization_context() - >>> df_bytestring = context.serialize(df).to_buffer().to_pybytes() + import pandas as pd + import pyarrow as pa + + df = pd.DataFrame({"A": [1, 2, 3]}) + + context = pa.default_serialization_context() + df_bytestring = context.serialize(df).to_buffer().to_pybytes() For documentation on pyarrow, see `here `__. @@ -3492,11 +3484,11 @@ for some advanced strategies :suppress: :okexcept: - os.remove('store.h5') + os.remove("store.h5") .. ipython:: python - store = pd.HDFStore('store.h5') + store = pd.HDFStore("store.h5") print(store) Objects can be written to the file just like adding key-value pairs to a @@ -3504,15 +3496,14 @@ dict: .. 
ipython:: python - index = pd.date_range('1/1/2000', periods=8) - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = pd.DataFrame(np.random.randn(8, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.date_range("1/1/2000", periods=8) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) + df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) # store.put('s', s) is an equivalent method - store['s'] = s + store["s"] = s - store['df'] = df + store["df"] = df store @@ -3521,7 +3512,7 @@ In a current or later Python session, you can retrieve stored objects: .. ipython:: python # store.get('df') is an equivalent method - store['df'] + store["df"] # dotted (attribute) access provides get as well store.df @@ -3531,7 +3522,7 @@ Deletion of the object specified by the key: .. ipython:: python # store.remove('df') is an equivalent method - del store['df'] + del store["df"] store @@ -3544,14 +3535,14 @@ Closing a Store and using a context manager: store.is_open # Working with, and automatically closing the store using a context manager - with pd.HDFStore('store.h5') as store: + with pd.HDFStore("store.h5") as store: store.keys() .. ipython:: python :suppress: store.close() - os.remove('store.h5') + os.remove("store.h5") @@ -3563,15 +3554,15 @@ similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python - df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))}) - df_tl.to_hdf('store_tl.h5', 'table', append=True) - pd.read_hdf('store_tl.h5', 'table', where=['index>2']) + df_tl = pd.DataFrame({"A": list(range(5)), "B": list(range(5))}) + df_tl.to_hdf("store_tl.h5", "table", append=True) + pd.read_hdf("store_tl.h5", "table", where=["index>2"]) .. ipython:: python :suppress: :okexcept: - os.remove('store_tl.h5') + os.remove("store_tl.h5") HDFStore will by default not drop rows that are all missing. This behavior can be changed by setting ``dropna=True``. 
@@ -3579,24 +3570,23 @@ HDFStore will by default not drop rows that are all missing. This behavior can b .. ipython:: python - df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2], - 'col2': [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame({"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}) df_with_missing - df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w') + df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") - pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf("file.h5", "df_with_missing") - df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w', dropna=True) - pd.read_hdf('file.h5', 'df_with_missing') + df_with_missing.to_hdf( + "file.h5", "df_with_missing", format="table", mode="w", dropna=True + ) + pd.read_hdf("file.h5", "df_with_missing") .. ipython:: python :suppress: - os.remove('file.h5') + os.remove("file.h5") .. _io.hdf5-fixed: @@ -3642,21 +3632,21 @@ enable ``put/append/to_hdf`` to by default store in the ``table`` format. :suppress: :okexcept: - os.remove('store.h5') + os.remove("store.h5") .. ipython:: python - store = pd.HDFStore('store.h5') + store = pd.HDFStore("store.h5") df1 = df[0:4] df2 = df[4:] # append data (creates a table automatically) - store.append('df', df1) - store.append('df', df2) + store.append("df", df1) + store.append("df", df2) store # select the entire object - store.select('df') + store.select("df") # the type of stored data store.root.df._v_attrs.pandas_type @@ -3679,16 +3669,16 @@ everything in the sub-store and **below**, so be *careful*. .. 
ipython:: python - store.put('foo/bar/bah', df) - store.append('food/orange', df) - store.append('food/apple', df) + store.put("foo/bar/bah", df) + store.append("food/orange", df) + store.append("food/apple", df) store # a list of keys are returned store.keys() # remove all nodes under this level - store.remove('food') + store.remove("food") store @@ -3702,10 +3692,10 @@ will yield a tuple for each group key along with the relative keys of its conten for (path, subgroups, subkeys) in store.walk(): for subgroup in subgroups: - print('GROUP: {}/{}'.format(path, subgroup)) + print("GROUP: {}/{}".format(path, subgroup)) for subkey in subkeys: - key = '/'.join([path, subkey]) - print('KEY: {}'.format(key)) + key = "/".join([path, subkey]) + print("KEY: {}".format(key)) print(store.get(key)) @@ -3729,7 +3719,7 @@ will yield a tuple for each group key along with the relative keys of its conten .. ipython:: python - store['foo/bar/bah'] + store["foo/bar/bah"] .. _io.hdf5-types: @@ -3753,19 +3743,22 @@ defaults to ``nan``. .. 
ipython:: python - df_mixed = pd.DataFrame({'A': np.random.randn(8), - 'B': np.random.randn(8), - 'C': np.array(np.random.randn(8), dtype='float32'), - 'string': 'string', - 'int': 1, - 'bool': True, - 'datetime64': pd.Timestamp('20010102')}, - index=list(range(8))) - df_mixed.loc[df_mixed.index[3:5], - ['A', 'B', 'string', 'datetime64']] = np.nan + df_mixed = pd.DataFrame( + { + "A": np.random.randn(8), + "B": np.random.randn(8), + "C": np.array(np.random.randn(8), dtype="float32"), + "string": "string", + "int": 1, + "bool": True, + "datetime64": pd.Timestamp("20010102"), + }, + index=list(range(8)), + ) + df_mixed.loc[df_mixed.index[3:5], ["A", "B", "string", "datetime64"]] = np.nan - store.append('df_mixed', df_mixed, min_itemsize={'values': 50}) - df_mixed1 = store.select('df_mixed') + store.append("df_mixed", df_mixed, min_itemsize={"values": 50}) + df_mixed1 = store.select("df_mixed") df_mixed1 df_mixed1.dtypes.value_counts() @@ -3780,20 +3773,19 @@ storing/selecting from homogeneous index ``DataFrames``. .. ipython:: python - index = pd.MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], - ['one', 'two', 'three']], - codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], - [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], - names=['foo', 'bar']) - df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.MultiIndex( + levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], + codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=["foo", "bar"], + ) + df_mi = pd.DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) df_mi - store.append('df_mi', df_mi) - store.select('df_mi') + store.append("df_mi", df_mi) + store.select("df_mi") # the levels are automatically included as data columns - store.select('df_mi', 'foo=bar') + store.select("df_mi", "foo=bar") .. note:: The ``index`` keyword is reserved and cannot be use as a level name. 
@@ -3870,7 +3862,7 @@ The right-hand side of the sub-expression (after a comparison operator) can be: .. code-block:: python string = "HolyMoly'" - store.select('df', 'index == string') + store.select("df", "index == string") instead of this @@ -3887,7 +3879,7 @@ The right-hand side of the sub-expression (after a comparison operator) can be: .. code-block:: python - store.select('df', 'index == %r' % string) + store.select("df", "index == %r" % string) which will quote ``string``. @@ -3896,21 +3888,24 @@ Here are some examples: .. ipython:: python - dfq = pd.DataFrame(np.random.randn(10, 4), columns=list('ABCD'), - index=pd.date_range('20130101', periods=10)) - store.append('dfq', dfq, format='table', data_columns=True) + dfq = pd.DataFrame( + np.random.randn(10, 4), + columns=list("ABCD"), + index=pd.date_range("20130101", periods=10), + ) + store.append("dfq", dfq, format="table", data_columns=True) Use boolean expressions, with in-line function evaluation. .. ipython:: python - store.select('dfq', "index>pd.Timestamp('20130104') & columns=['A', 'B']") + store.select("dfq", "index>pd.Timestamp('20130104') & columns=['A', 'B']") Use inline column reference. .. ipython:: python - store.select('dfq', where="A>0 or C>0") + store.select("dfq", where="A>0 or C>0") The ``columns`` keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a @@ -3918,7 +3913,7 @@ returned, this is equivalent to passing a .. ipython:: python - store.select('df', "columns=['A', 'B']") + store.select("df", "columns=['A', 'B']") ``start`` and ``stop`` parameters can be specified to limit the total search space. These are in terms of the total number of rows in a table. @@ -3944,14 +3939,19 @@ specified in the format: ``()``, where float may be signed (and fra .. 
ipython:: python from datetime import timedelta - dftd = pd.DataFrame({'A': pd.Timestamp('20130101'), - 'B': [pd.Timestamp('20130101') + timedelta(days=i, - seconds=10) - for i in range(10)]}) - dftd['C'] = dftd['A'] - dftd['B'] + + dftd = pd.DataFrame( + { + "A": pd.Timestamp("20130101"), + "B": [ + pd.Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10) + ], + } + ) + dftd["C"] = dftd["A"] - dftd["B"] dftd - store.append('dftd', dftd, data_columns=True) - store.select('dftd', "C<'-3.5D'") + store.append("dftd", dftd, data_columns=True) + store.select("dftd", "C<'-3.5D'") .. _io.query_multi: @@ -3963,7 +3963,7 @@ Selecting from a ``MultiIndex`` can be achieved by using the name of the level. .. ipython:: python df_mi.index.names - store.select('df_mi', "foo=baz and bar=two") + store.select("df_mi", "foo=baz and bar=two") If the ``MultiIndex`` levels names are ``None``, the levels are automatically made available via the ``level_n`` keyword with ``n`` the level of the ``MultiIndex`` you want to select from. @@ -3974,8 +3974,7 @@ the ``level_n`` keyword with ``n`` the level of the ``MultiIndex`` you want to s levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], ) - df_mi_2 = pd.DataFrame(np.random.randn(10, 3), - index=index, columns=["A", "B", "C"]) + df_mi_2 = pd.DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) df_mi_2 store.append("df_mi_2", df_mi_2) @@ -4006,7 +4005,7 @@ indexed dimension as the ``where``. i.optlevel, i.kind # change an index by passing new parameters - store.create_table_index('df', optlevel=9, kind='full') + store.create_table_index("df", optlevel=9, kind="full") i = store.root.df.table.cols.index.index i.optlevel, i.kind @@ -4014,20 +4013,20 @@ Oftentimes when appending large amounts of data to a store, it is useful to turn .. 
ipython:: python - df_1 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) - df_2 = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) + df_1 = pd.DataFrame(np.random.randn(10, 2), columns=list("AB")) + df_2 = pd.DataFrame(np.random.randn(10, 2), columns=list("AB")) - st = pd.HDFStore('appends.h5', mode='w') - st.append('df', df_1, data_columns=['B'], index=False) - st.append('df', df_2, data_columns=['B'], index=False) - st.get_storer('df').table + st = pd.HDFStore("appends.h5", mode="w") + st.append("df", df_1, data_columns=["B"], index=False) + st.append("df", df_2, data_columns=["B"], index=False) + st.get_storer("df").table Then create the index when finished appending. .. ipython:: python - st.create_table_index('df', columns=['B'], optlevel=9, kind='full') - st.get_storer('df').table + st.create_table_index("df", columns=["B"], optlevel=9, kind="full") + st.get_storer("df").table st.close() @@ -4035,7 +4034,7 @@ Then create the index when finished appending. :suppress: :okexcept: - os.remove('appends.h5') + os.remove("appends.h5") See `here `__ for how to create a completely-sorted-index (CSI) on an existing store. @@ -4054,22 +4053,22 @@ be ``data_columns``. .. 
ipython:: python df_dc = df.copy() - df_dc['string'] = 'foo' - df_dc.loc[df_dc.index[4:6], 'string'] = np.nan - df_dc.loc[df_dc.index[7:9], 'string'] = 'bar' - df_dc['string2'] = 'cool' - df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0 + df_dc["string"] = "foo" + df_dc.loc[df_dc.index[4:6], "string"] = np.nan + df_dc.loc[df_dc.index[7:9], "string"] = "bar" + df_dc["string2"] = "cool" + df_dc.loc[df_dc.index[1:3], ["B", "C"]] = 1.0 df_dc # on-disk operations - store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2']) - store.select('df_dc', where='B > 0') + store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"]) + store.select("df_dc", where="B > 0") # getting creative - store.select('df_dc', 'B > 0 & C > 0 & string == foo') + store.select("df_dc", "B > 0 & C > 0 & string == foo") # this is in-memory version of this type of selection - df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] + df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] # we have automagically created this index and the B/C/string/string2 # columns are stored separately as ``PyTables`` columns @@ -4090,7 +4089,7 @@ The default is 50,000 rows returned in a chunk. .. ipython:: python - for df in store.select('df', chunksize=3): + for df in store.select("df", chunksize=3): print(df) .. note:: @@ -4100,7 +4099,7 @@ The default is 50,000 rows returned in a chunk. .. code-block:: python - for df in pd.read_hdf('store.h5', 'df', chunksize=3): + for df in pd.read_hdf("store.h5", "df", chunksize=3): print(df) Note, that the chunksize keyword applies to the **source** rows. So if you @@ -4112,18 +4111,20 @@ chunks. .. 
ipython:: python - dfeq = pd.DataFrame({'number': np.arange(1, 11)}) + dfeq = pd.DataFrame({"number": np.arange(1, 11)}) dfeq - store.append('dfeq', dfeq, data_columns=['number']) + store.append("dfeq", dfeq, data_columns=["number"]) + def chunks(l, n): - return [l[i:i + n] for i in range(0, len(l), n)] + return [l[i: i + n] for i in range(0, len(l), n)] + evens = [2, 4, 6, 8, 10] - coordinates = store.select_as_coordinates('dfeq', 'number=evens') + coordinates = store.select_as_coordinates("dfeq", "number=evens") for c in chunks(coordinates, 2): - print(store.select('dfeq', where=c)) + print(store.select("dfeq", where=c)) Advanced queries ++++++++++++++++ @@ -4138,8 +4139,8 @@ These do not currently accept the ``where`` selector. .. ipython:: python - store.select_column('df_dc', 'index') - store.select_column('df_dc', 'string') + store.select_column("df_dc", "index") + store.select_column("df_dc", "string") .. _io.hdf5-selecting_coordinates: @@ -4152,12 +4153,13 @@ Sometimes you want to get the coordinates (a.k.a the index locations) of your qu .. ipython:: python - df_coord = pd.DataFrame(np.random.randn(1000, 2), - index=pd.date_range('20000101', periods=1000)) - store.append('df_coord', df_coord) - c = store.select_as_coordinates('df_coord', 'index > 20020101') + df_coord = pd.DataFrame( + np.random.randn(1000, 2), index=pd.date_range("20000101", periods=1000) + ) + store.append("df_coord", df_coord) + c = store.select_as_coordinates("df_coord", "index > 20020101") c - store.select('df_coord', where=c) + store.select("df_coord", where=c) .. _io.hdf5-where_mask: @@ -4170,12 +4172,13 @@ a datetimeindex which are 5. .. 
ipython:: python - df_mask = pd.DataFrame(np.random.randn(1000, 2), - index=pd.date_range('20000101', periods=1000)) - store.append('df_mask', df_mask) - c = store.select_column('df_mask', 'index') + df_mask = pd.DataFrame( + np.random.randn(1000, 2), index=pd.date_range("20000101", periods=1000) + ) + store.append("df_mask", df_mask) + c = store.select_column("df_mask", "index") where = c[pd.DatetimeIndex(c).month == 5].index - store.select('df_mask', where=where) + store.select("df_mask", where=where) Storer object ^^^^^^^^^^^^^ @@ -4186,7 +4189,7 @@ of rows in an object. .. ipython:: python - store.get_storer('df_dc').nrows + store.get_storer("df_dc").nrows Multiple table queries @@ -4219,24 +4222,26 @@ results. .. ipython:: python - df_mt = pd.DataFrame(np.random.randn(8, 6), - index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) - df_mt['foo'] = 'bar' - df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan + df_mt = pd.DataFrame( + np.random.randn(8, 6), + index=pd.date_range("1/1/2000", periods=8), + columns=["A", "B", "C", "D", "E", "F"], + ) + df_mt["foo"] = "bar" + df_mt.loc[df_mt.index[1], ("A", "B")] = np.nan # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, - df_mt, selector='df1_mt') + store.append_to_multiple( + {"df1_mt": ["A", "B"], "df2_mt": None}, df_mt, selector="df1_mt" + ) store # individual tables were created - store.select('df1_mt') - store.select('df2_mt') + store.select("df1_mt") + store.select("df2_mt") # as a multiple - store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector='df1_mt') + store.select_as_multiple(["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt") Delete from a table @@ -4345,14 +4350,15 @@ Enable compression for all objects within the file: .. 
code-block:: python - store_compressed = pd.HDFStore('store_compressed.h5', complevel=9, - complib='blosc:blosclz') + store_compressed = pd.HDFStore( + "store_compressed.h5", complevel=9, complib="blosc:blosclz" + ) Or on-the-fly compression (this only applies to tables) in stores where compression is not enabled: .. code-block:: python - store.append('df', df, complib='zlib', complevel=5) + store.append("df", df, complib="zlib", complevel=5) .. _io.hdf5-ptrepack: @@ -4441,13 +4447,14 @@ stored in a more efficient manner. .. ipython:: python - dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'), - 'B': np.random.randn(8)}) + dfcat = pd.DataFrame( + {"A": pd.Series(list("aabbcdba")).astype("category"), "B": np.random.randn(8)} + ) dfcat dfcat.dtypes - cstore = pd.HDFStore('cats.h5', mode='w') - cstore.append('dfcat', dfcat, format='table', data_columns=['A']) - result = cstore.select('dfcat', where="A in ['b', 'c']") + cstore = pd.HDFStore("cats.h5", mode="w") + cstore.append("dfcat", dfcat, format="table", data_columns=["A"]) + result = cstore.select("dfcat", where="A in ['b', 'c']") result result.dtypes @@ -4456,7 +4463,7 @@ stored in a more efficient manner. :okexcept: cstore.close() - os.remove('cats.h5') + os.remove("cats.h5") String columns @@ -4483,17 +4490,17 @@ Passing a ``min_itemsize`` dict will cause all passed columns to be created as * .. 
ipython:: python - dfs = pd.DataFrame({'A': 'foo', 'B': 'bar'}, index=list(range(5))) + dfs = pd.DataFrame({"A": "foo", "B": "bar"}, index=list(range(5))) dfs # A and B have a size of 30 - store.append('dfs', dfs, min_itemsize=30) - store.get_storer('dfs').table + store.append("dfs", dfs, min_itemsize=30) + store.get_storer("dfs").table # A is created as a data_column with a size of 30 # B is size is calculated - store.append('dfs2', dfs, min_itemsize={'A': 30}) - store.get_storer('dfs2').table + store.append("dfs2", dfs, min_itemsize={"A": 30}) + store.get_storer("dfs2").table **nan_rep** @@ -4502,15 +4509,15 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. ipython:: python - dfss = pd.DataFrame({'A': ['foo', 'bar', 'nan']}) + dfss = pd.DataFrame({"A": ["foo", "bar", "nan"]}) dfss - store.append('dfss', dfss) - store.select('dfss') + store.append("dfss", dfss) + store.select("dfss") # here you need to specify a different nan rep - store.append('dfss2', dfss, nan_rep='_nan_') - store.select('dfss2') + store.append("dfss2", dfss, nan_rep="_nan_") + store.select("dfss2") .. _io.external_compatibility: @@ -4529,21 +4536,25 @@ It is possible to write an ``HDFStore`` object that can easily be imported into .. ipython:: python - df_for_r = pd.DataFrame({"first": np.random.rand(100), - "second": np.random.rand(100), - "class": np.random.randint(0, 2, (100, ))}, - index=range(100)) + df_for_r = pd.DataFrame( + { + "first": np.random.rand(100), + "second": np.random.rand(100), + "class": np.random.randint(0, 2, (100,)), + }, + index=range(100), + ) df_for_r.head() - store_export = pd.HDFStore('export.h5') - store_export.append('df_for_r', df_for_r, data_columns=df_dc.columns) + store_export = pd.HDFStore("export.h5") + store_export.append("df_for_r", df_for_r, data_columns=df_dc.columns) store_export .. 
ipython:: python :suppress: store_export.close() - os.remove('export.h5') + os.remove("export.h5") In R this file can be read into a ``data.frame`` object using the ``rhdf5`` library. The following example function reads the corresponding column names @@ -4630,7 +4641,7 @@ Performance :suppress: store.close() - os.remove('store.h5') + os.remove("store.h5") .. _io.feather: @@ -4660,21 +4671,26 @@ See the `Full Documentation `__. :suppress: import warnings + # This can be removed once building with pyarrow >=0.15.0 warnings.filterwarnings("ignore", "The Sparse", FutureWarning) .. ipython:: python - df = pd.DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.Categorical(list('abc')), - 'g': pd.date_range('20130101', periods=3), - 'h': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'i': pd.date_range('20130101', periods=3, freq='ns')}) + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.Categorical(list("abc")), + "g": pd.date_range("20130101", periods=3), + "h": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "i": pd.date_range("20130101", periods=3, freq="ns"), + } + ) df df.dtypes @@ -4683,13 +4699,13 @@ Write to a feather file. .. ipython:: python - df.to_feather('example.feather') + df.to_feather("example.feather") Read from a feather file. .. ipython:: python - result = pd.read_feather('example.feather') + result = pd.read_feather("example.feather") result # we preserve dtypes @@ -4698,7 +4714,7 @@ Read from a feather file. .. ipython:: python :suppress: - os.remove('example.feather') + os.remove("example.feather") .. _io.parquet: @@ -4743,15 +4759,19 @@ See the documentation for `pyarrow `__ an .. 
ipython:: python - df = pd.DataFrame({'a': list('abc'), - 'b': list(range(1, 4)), - 'c': np.arange(3, 6).astype('u1'), - 'd': np.arange(4.0, 7.0, dtype='float64'), - 'e': [True, False, True], - 'f': pd.date_range('20130101', periods=3), - 'g': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'h': pd.Categorical(list('abc')), - 'i': pd.Categorical(list('abc'), ordered=True)}) + df = pd.DataFrame( + { + "a": list("abc"), + "b": list(range(1, 4)), + "c": np.arange(3, 6).astype("u1"), + "d": np.arange(4.0, 7.0, dtype="float64"), + "e": [True, False, True], + "f": pd.date_range("20130101", periods=3), + "g": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "h": pd.Categorical(list("abc")), + "i": pd.Categorical(list("abc"), ordered=True), + } + ) df df.dtypes @@ -4761,15 +4781,15 @@ Write to a parquet file. .. ipython:: python :okwarning: - df.to_parquet('example_pa.parquet', engine='pyarrow') - df.to_parquet('example_fp.parquet', engine='fastparquet') + df.to_parquet("example_pa.parquet", engine="pyarrow") + df.to_parquet("example_fp.parquet", engine="fastparquet") Read from a parquet file. .. ipython:: python - result = pd.read_parquet('example_fp.parquet', engine='fastparquet') - result = pd.read_parquet('example_pa.parquet', engine='pyarrow') + result = pd.read_parquet("example_fp.parquet", engine="fastparquet") + result = pd.read_parquet("example_pa.parquet", engine="pyarrow") result.dtypes @@ -4777,18 +4797,16 @@ Read only certain columns of a parquet file. .. ipython:: python - result = pd.read_parquet('example_fp.parquet', - engine='fastparquet', columns=['a', 'b']) - result = pd.read_parquet('example_pa.parquet', - engine='pyarrow', columns=['a', 'b']) + result = pd.read_parquet("example_fp.parquet", engine="fastparquet", columns=["a", "b"]) + result = pd.read_parquet("example_pa.parquet", engine="pyarrow", columns=["a", "b"]) result.dtypes .. 
ipython:: python :suppress: - os.remove('example_pa.parquet') - os.remove('example_fp.parquet') + os.remove("example_pa.parquet") + os.remove("example_fp.parquet") Handling indexes @@ -4799,8 +4817,8 @@ more columns in the output file. Thus, this code: .. ipython:: python - df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]}) - df.to_parquet('test.parquet', engine='pyarrow') + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + df.to_parquet("test.parquet", engine="pyarrow") creates a parquet file with *three* columns if you use ``pyarrow`` for serialization: ``a``, ``b``, and ``__index_level_0__``. If you're using ``fastparquet``, the @@ -4815,7 +4833,7 @@ If you want to omit a dataframe's indexes when writing, pass ``index=False`` to .. ipython:: python - df.to_parquet('test.parquet', index=False) + df.to_parquet("test.parquet", index=False) This creates a parquet file with just the two expected columns, ``a`` and ``b``. If your ``DataFrame`` has a custom index, you won't get it back when you load @@ -4827,7 +4845,7 @@ underlying engine's default behavior. .. ipython:: python :suppress: - os.remove('test.parquet') + os.remove("test.parquet") Partitioning Parquet files @@ -4839,9 +4857,8 @@ Parquet supports partitioning of data based on the values of one or more columns .. ipython:: python - df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]}) - df.to_parquet(path='test', engine='pyarrow', - partition_cols=['a'], compression=None) + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}) + df.to_parquet(path="test", engine="pyarrow", partition_cols=["a"], compression=None) The ``path`` specifies the parent directory to which data will be saved. The ``partition_cols`` are the column names by which the dataset will be partitioned. 
@@ -4863,8 +4880,9 @@ The above example creates a partitioned dataset that may look like: :suppress: from shutil import rmtree + try: - rmtree('test') + rmtree("test") except OSError: pass @@ -4932,15 +4950,16 @@ below and the SQLAlchemy `documentation / # where is relative: - engine = create_engine('sqlite:///foo.db') + engine = create_engine("sqlite:///foo.db") # or absolute, starting with a slash: - engine = create_engine('sqlite:////absolute/path/to/foo.db') + engine = create_engine("sqlite:////absolute/path/to/foo.db") For more information see the examples the SQLAlchemy `documentation `__ @@ -5257,21 +5280,25 @@ Use :func:`sqlalchemy.text` to specify query parameters in a backend-neutral way .. ipython:: python import sqlalchemy as sa - pd.read_sql(sa.text('SELECT * FROM data where Col_1=:col1'), - engine, params={'col1': 'X'}) + + pd.read_sql( + sa.text("SELECT * FROM data where Col_1=:col1"), engine, params={"col1": "X"} + ) If you have an SQLAlchemy description of your database you can express where conditions using SQLAlchemy expressions .. ipython:: python metadata = sa.MetaData() - data_table = sa.Table('data', metadata, - sa.Column('index', sa.Integer), - sa.Column('Date', sa.DateTime), - sa.Column('Col_1', sa.String), - sa.Column('Col_2', sa.Float), - sa.Column('Col_3', sa.Boolean), - ) + data_table = sa.Table( + "data", + metadata, + sa.Column("index", sa.Integer), + sa.Column("Date", sa.DateTime), + sa.Column("Col_1", sa.String), + sa.Column("Col_2", sa.Float), + sa.Column("Col_3", sa.Boolean), + ) pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 is True), engine) @@ -5280,8 +5307,9 @@ You can combine SQLAlchemy expressions with parameters passed to :func:`read_sql .. 
ipython:: python import datetime as dt - expr = sa.select([data_table]).where(data_table.c.Date > sa.bindparam('date')) - pd.read_sql(expr, engine, params={'date': dt.datetime(2010, 10, 18)}) + + expr = sa.select([data_table]).where(data_table.c.Date > sa.bindparam("date")) + pd.read_sql(expr, engine, params={"date": dt.datetime(2010, 10, 18)}) Sqlite fallback @@ -5296,13 +5324,14 @@ You can create connections like so: .. code-block:: python import sqlite3 - con = sqlite3.connect(':memory:') + + con = sqlite3.connect(":memory:") And then issue the following queries: .. code-block:: python - data.to_sql('data', con) + data.to_sql("data", con) pd.read_sql_query("SELECT * FROM data", con) @@ -5339,8 +5368,8 @@ into a .dta file. The format version of this file is always 115 (Stata 12). .. ipython:: python - df = pd.DataFrame(np.random.randn(10, 2), columns=list('AB')) - df.to_stata('stata.dta') + df = pd.DataFrame(np.random.randn(10, 2), columns=list("AB")) + df.to_stata("stata.dta") *Stata* data files have limited data type support; only strings with 244 or fewer characters, ``int8``, ``int16``, ``int32``, ``float32`` @@ -5390,7 +5419,7 @@ be used to read the file incrementally. .. ipython:: python - pd.read_stata('stata.dta') + pd.read_stata("stata.dta") Specifying a ``chunksize`` yields a :class:`~pandas.io.stata.StataReader` instance that can be used to @@ -5399,7 +5428,7 @@ object can be used as an iterator. .. ipython:: python - reader = pd.read_stata('stata.dta', chunksize=3) + reader = pd.read_stata("stata.dta", chunksize=3) for df in reader: print(df.shape) @@ -5409,7 +5438,7 @@ For more fine-grained control, use ``iterator=True`` and specify .. ipython:: python - reader = pd.read_stata('stata.dta', iterator=True) + reader = pd.read_stata("stata.dta", iterator=True) chunk1 = reader.read(5) chunk2 = reader.read(5) @@ -5441,7 +5470,7 @@ values will have ``object`` data type. .. ipython:: python :suppress: - os.remove('stata.dta') + os.remove("stata.dta") .. 
_io.stata-categorical: @@ -5513,7 +5542,7 @@ Read a SAS7BDAT file: .. code-block:: python - df = pd.read_sas('sas_data.sas7bdat') + df = pd.read_sas("sas_data.sas7bdat") Obtain an iterator and read an XPORT file 100,000 lines at a time: @@ -5522,7 +5551,8 @@ Obtain an iterator and read an XPORT file 100,000 lines at a time: def do_something(chunk): pass - rdr = pd.read_sas('sas_xport.xpt', chunk=100000) + + rdr = pd.read_sas("sas_xport.xpt", chunk=100000) for chunk in rdr: do_something(chunk) @@ -5556,15 +5586,14 @@ Read an SPSS file: .. code-block:: python - df = pd.read_spss('spss_data.sav') + df = pd.read_spss("spss_data.sav") Extract a subset of columns contained in ``usecols`` from an SPSS file and avoid converting categorical columns into ``pd.Categorical``: .. code-block:: python - df = pd.read_spss('spss_data.sav', usecols=['foo', 'bar'], - convert_categoricals=False) + df = pd.read_spss("spss_data.sav", usecols=["foo", "bar"], convert_categoricals=False) More information about the SAV and ZSAV file formats is available here_. 
@@ -5622,78 +5651,99 @@ Given the next test set: import os sz = 1000000 - df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz}) + df = pd.DataFrame({"A": np.random.randn(sz), "B": [1] * sz}) sz = 1000000 np.random.seed(42) - df = pd.DataFrame({'A': np.random.randn(sz), 'B': [1] * sz}) + df = pd.DataFrame({"A": np.random.randn(sz), "B": [1] * sz}) + def test_sql_write(df): - if os.path.exists('test.sql'): - os.remove('test.sql') - sql_db = sqlite3.connect('test.sql') - df.to_sql(name='test_table', con=sql_db) + if os.path.exists("test.sql"): + os.remove("test.sql") + sql_db = sqlite3.connect("test.sql") + df.to_sql(name="test_table", con=sql_db) sql_db.close() + def test_sql_read(): - sql_db = sqlite3.connect('test.sql') + sql_db = sqlite3.connect("test.sql") pd.read_sql_query("select * from test_table", sql_db) sql_db.close() + def test_hdf_fixed_write(df): - df.to_hdf('test_fixed.hdf', 'test', mode='w') + df.to_hdf("test_fixed.hdf", "test", mode="w") + def test_hdf_fixed_read(): - pd.read_hdf('test_fixed.hdf', 'test') + pd.read_hdf("test_fixed.hdf", "test") + def test_hdf_fixed_write_compress(df): - df.to_hdf('test_fixed_compress.hdf', 'test', mode='w', complib='blosc') + df.to_hdf("test_fixed_compress.hdf", "test", mode="w", complib="blosc") + def test_hdf_fixed_read_compress(): - pd.read_hdf('test_fixed_compress.hdf', 'test') + pd.read_hdf("test_fixed_compress.hdf", "test") + def test_hdf_table_write(df): - df.to_hdf('test_table.hdf', 'test', mode='w', format='table') + df.to_hdf("test_table.hdf", "test", mode="w", format="table") + def test_hdf_table_read(): - pd.read_hdf('test_table.hdf', 'test') + pd.read_hdf("test_table.hdf", "test") + def test_hdf_table_write_compress(df): - df.to_hdf('test_table_compress.hdf', 'test', mode='w', - complib='blosc', format='table') + df.to_hdf( + "test_table_compress.hdf", "test", mode="w", complib="blosc", format="table" + ) + def test_hdf_table_read_compress(): - pd.read_hdf('test_table_compress.hdf', 'test') + 
pd.read_hdf("test_table_compress.hdf", "test") + def test_csv_write(df): - df.to_csv('test.csv', mode='w') + df.to_csv("test.csv", mode="w") + def test_csv_read(): - pd.read_csv('test.csv', index_col=0) + pd.read_csv("test.csv", index_col=0) + def test_feather_write(df): - df.to_feather('test.feather') + df.to_feather("test.feather") + def test_feather_read(): - pd.read_feather('test.feather') + pd.read_feather("test.feather") + def test_pickle_write(df): - df.to_pickle('test.pkl') + df.to_pickle("test.pkl") + def test_pickle_read(): - pd.read_pickle('test.pkl') + pd.read_pickle("test.pkl") + def test_pickle_write_compress(df): - df.to_pickle('test.pkl.compress', compression='xz') + df.to_pickle("test.pkl.compress", compression="xz") + def test_pickle_read_compress(): - pd.read_pickle('test.pkl.compress', compression='xz') + pd.read_pickle("test.pkl.compress", compression="xz") + def test_parquet_write(df): - df.to_parquet('test.parquet') + df.to_parquet("test.parquet") + def test_parquet_read(): - pd.read_parquet('test.parquet') + pd.read_parquet("test.parquet") When writing, the top-three functions in terms of speed are ``test_feather_write``, ``test_hdf_fixed_write`` and ``test_hdf_fixed_write_compress``. diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index dd6ac37d88f08..2ada09117273d 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -46,20 +46,20 @@ infer a list of strings to .. ipython:: python - pd.Series(['a', 'b', 'c']) + pd.Series(["a", "b", "c"]) To explicitly request ``string`` dtype, specify the ``dtype`` .. ipython:: python - pd.Series(['a', 'b', 'c'], dtype="string") - pd.Series(['a', 'b', 'c'], dtype=pd.StringDtype()) + pd.Series(["a", "b", "c"], dtype="string") + pd.Series(["a", "b", "c"], dtype=pd.StringDtype()) Or ``astype`` after the ``Series`` or ``DataFrame`` is created .. 
ipython:: python - s = pd.Series(['a', 'b', 'c']) + s = pd.Series(["a", "b", "c"]) s s.astype("string") @@ -71,7 +71,7 @@ it will be converted to ``string`` dtype: .. ipython:: python - s = pd.Series(['a', 2, np.nan], dtype="string") + s = pd.Series(["a", 2, np.nan], dtype="string") s type(s[1]) @@ -147,15 +147,16 @@ the equivalent (scalar) built-in string methods: .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + s = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s.str.lower() s.str.upper() s.str.len() .. ipython:: python - idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank']) + idx = pd.Index([" jack", "jill ", " jesse ", "frank"]) idx.str.strip() idx.str.lstrip() idx.str.rstrip() @@ -166,8 +167,9 @@ leading or trailing whitespace: .. ipython:: python - df = pd.DataFrame(np.random.randn(3, 2), - columns=[' Column A ', ' Column B '], index=range(3)) + df = pd.DataFrame( + np.random.randn(3, 2), columns=[" Column A ", " Column B "], index=range(3) + ) df Since ``df.columns`` is an Index object, we can use the ``.str`` accessor @@ -183,7 +185,7 @@ and replacing any remaining whitespaces with underscores: .. ipython:: python - df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_') + df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_") df .. note:: @@ -221,21 +223,21 @@ Methods like ``split`` return a Series of lists: .. ipython:: python - s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string") - s2.str.split('_') + s2 = pd.Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype="string") + s2.str.split("_") Elements in the split lists can be accessed using ``get`` or ``[]`` notation: .. ipython:: python - s2.str.split('_').str.get(1) - s2.str.split('_').str[1] + s2.str.split("_").str.get(1) + s2.str.split("_").str[1] It is easy to expand this to return a DataFrame using ``expand``. .. 
ipython:: python - s2.str.split('_', expand=True) + s2.str.split("_", expand=True) When original ``Series`` has :class:`StringDtype`, the output columns will all be :class:`StringDtype` as well. @@ -244,25 +246,25 @@ It is also possible to limit the number of splits: .. ipython:: python - s2.str.split('_', expand=True, n=1) + s2.str.split("_", expand=True, n=1) ``rsplit`` is similar to ``split`` except it works in the reverse direction, i.e., from the end of the string to the beginning of the string: .. ipython:: python - s2.str.rsplit('_', expand=True, n=1) + s2.str.rsplit("_", expand=True, n=1) ``replace`` by default replaces `regular expressions `__: .. ipython:: python - s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', - '', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + s3 = pd.Series( + ["A", "B", "C", "Aaba", "Baca", "", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s3 - s3.str.replace('^.a|dog', 'XX-XX ', case=False) + s3.str.replace("^.a|dog", "XX-XX ", case=False) Some caution must be taken to keep regular expressions in mind! For example, the following code will cause trouble because of the regular expression meaning of @@ -271,16 +273,16 @@ following code will cause trouble because of the regular expression meaning of .. ipython:: python # Consider the following badly formatted financial data - dollars = pd.Series(['12', '-$10', '$10,000'], dtype="string") + dollars = pd.Series(["12", "-$10", "$10,000"], dtype="string") # This does what you'd naively expect: - dollars.str.replace('$', '') + dollars.str.replace("$", "") # But this doesn't: - dollars.str.replace('-$', '-') + dollars.str.replace("-$", "-") # We need to escape the special character (for >1 len patterns) - dollars.str.replace(r'-\$', '-') + dollars.str.replace(r"-\$", "-") If you do want literal replacement of a string (equivalent to :meth:`str.replace`), you can set the optional ``regex`` parameter to @@ -290,8 +292,8 @@ and ``repl`` must be strings: .. 
ipython:: python # These lines are equivalent - dollars.str.replace(r'-\$', '-') - dollars.str.replace('-$', '-', regex=False) + dollars.str.replace(r"-\$", "-") + dollars.str.replace("-$", "-", regex=False) The ``replace`` method can also take a callable as replacement. It is called on every ``pat`` using :func:`re.sub`. The callable should expect one @@ -300,22 +302,24 @@ positional argument (a regex object) and return a string. .. ipython:: python # Reverse every lowercase alphabetic word - pat = r'[a-z]+' + pat = r"[a-z]+" + def repl(m): return m.group(0)[::-1] - pd.Series(['foo 123', 'bar baz', np.nan], - dtype="string").str.replace(pat, repl) + + pd.Series(["foo 123", "bar baz", np.nan], dtype="string").str.replace(pat, repl) # Using regex groups pat = r"(?P\w+) (?P\w+) (?P\w+)" + def repl(m): - return m.group('two').swapcase() + return m.group("two").swapcase() - pd.Series(['Foo Bar Baz', np.nan], - dtype="string").str.replace(pat, repl) + + pd.Series(["Foo Bar Baz", np.nan], dtype="string").str.replace(pat, repl) The ``replace`` method also accepts a compiled regular expression object from :func:`re.compile` as a pattern. All flags should be included in the @@ -324,8 +328,9 @@ compiled regular expression object. .. ipython:: python import re - regex_pat = re.compile(r'^.a|dog', flags=re.IGNORECASE) - s3.str.replace(regex_pat, 'XX-XX ') + + regex_pat = re.compile(r"^.a|dog", flags=re.IGNORECASE) + s3.str.replace(regex_pat, "XX-XX ") Including a ``flags`` argument when calling ``replace`` with a compiled regular expression object will raise a ``ValueError``. @@ -352,8 +357,8 @@ The content of a ``Series`` (or ``Index``) can be concatenated: .. 
ipython:: python - s = pd.Series(['a', 'b', 'c', 'd'], dtype="string") - s.str.cat(sep=',') + s = pd.Series(["a", "b", "c", "d"], dtype="string") + s.str.cat(sep=",") If not specified, the keyword ``sep`` for the separator defaults to the empty string, ``sep=''``: @@ -365,9 +370,9 @@ By default, missing values are ignored. Using ``na_rep``, they can be given a re .. ipython:: python - t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string") - t.str.cat(sep=',') - t.str.cat(sep=',', na_rep='-') + t = pd.Series(["a", "b", np.nan, "d"], dtype="string") + t.str.cat(sep=",") + t.str.cat(sep=",", na_rep="-") Concatenating a Series and something list-like into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -376,14 +381,14 @@ The first argument to :meth:`~Series.str.cat` can be a list-like object, provide .. ipython:: python - s.str.cat(['A', 'B', 'C', 'D']) + s.str.cat(["A", "B", "C", "D"]) Missing values on either side will result in missing values in the result as well, *unless* ``na_rep`` is specified: .. ipython:: python s.str.cat(t) - s.str.cat(t, na_rep='-') + s.str.cat(t, na_rep="-") Concatenating a Series and something array-like into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -395,7 +400,7 @@ The parameter ``others`` can also be two-dimensional. In this case, the number o d = pd.concat([t, s], axis=1) s d - s.str.cat(d, na_rep='-') + s.str.cat(d, na_rep="-") Concatenating a Series and an indexed object into a Series, with alignment ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -406,12 +411,11 @@ the ``join``-keyword. .. ipython:: python :okwarning: - u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2], - dtype="string") + u = pd.Series(["b", "d", "a", "c"], index=[1, 3, 0, 2], dtype="string") s u s.str.cat(u) - s.str.cat(u, join='left') + s.str.cat(u, join="left") .. 
warning:: @@ -423,12 +427,11 @@ In particular, alignment also means that the different lengths do not need to co .. ipython:: python - v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4], - dtype="string") + v = pd.Series(["z", "a", "b", "d", "e"], index=[-1, 0, 1, 3, 4], dtype="string") s v - s.str.cat(v, join='left', na_rep='-') - s.str.cat(v, join='outer', na_rep='-') + s.str.cat(v, join="left", na_rep="-") + s.str.cat(v, join="outer", na_rep="-") The same alignment can be used when ``others`` is a ``DataFrame``: @@ -437,7 +440,7 @@ The same alignment can be used when ``others`` is a ``DataFrame``: f = d.loc[[3, 2, 1, 0], :] s f - s.str.cat(f, join='left', na_rep='-') + s.str.cat(f, join="left", na_rep="-") Concatenating a Series and many objects into a Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -449,7 +452,7 @@ can be combined in a list-like container (including iterators, ``dict``-views, e s u - s.str.cat([u, u.to_numpy()], join='left') + s.str.cat([u, u.to_numpy()], join="left") All elements without an index (e.g. ``np.ndarray``) within the passed list-like must match in length to the calling ``Series`` (or ``Index``), but ``Series`` and ``Index`` may have arbitrary length (as long as alignment is not disabled with ``join=None``): @@ -457,7 +460,7 @@ but ``Series`` and ``Index`` may have arbitrary length (as long as alignment is .. 
ipython:: python v - s.str.cat([v, u, u.to_numpy()], join='outer', na_rep='-') + s.str.cat([v, u, u.to_numpy()], join="outer", na_rep="-") If using ``join='right'`` on a list-like of ``others`` that contains different indexes, the union of these indexes will be used as the basis for the final concatenation: @@ -466,7 +469,7 @@ the union of these indexes will be used as the basis for the final concatenation u.loc[[3]] v.loc[[-1, 0]] - s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join='right', na_rep='-') + s.str.cat([u.loc[[3]], v.loc[[-1, 0]]], join="right", na_rep="-") Indexing with ``.str`` ---------------------- @@ -479,9 +482,9 @@ of the string, the result will be a ``NaN``. .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, - 'CABA', 'dog', 'cat'], - dtype="string") + s = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s.str[0] s.str[1] @@ -512,8 +515,7 @@ DataFrame with one column per group. .. ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'([ab])(\d)', expand=False) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"([ab])(\d)", expand=False) Elements that do not match return a row filled with ``NaN``. Thus, a Series of messy strings can be "converted" into a like-indexed Series @@ -526,16 +528,15 @@ Named groups like .. ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'(?P[ab])(?P\d)', - expand=False) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract( + r"(?P[ab])(?P\d)", expand=False + ) and optional groups like .. ipython:: python - pd.Series(['a1', 'b2', '3'], - dtype="string").str.extract(r'([ab])?(\d)', expand=False) + pd.Series(["a1", "b2", "3"], dtype="string").str.extract(r"([ab])?(\d)", expand=False) can also be used. Note that any capture group names in the regular expression will be used for column names; otherwise capture group @@ -546,23 +547,20 @@ with one column if ``expand=True``. .. 
ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'[ab](\d)', expand=True) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"[ab](\d)", expand=True) It returns a Series if ``expand=False``. .. ipython:: python - pd.Series(['a1', 'b2', 'c3'], - dtype="string").str.extract(r'[ab](\d)', expand=False) + pd.Series(["a1", "b2", "c3"], dtype="string").str.extract(r"[ab](\d)", expand=False) Calling on an ``Index`` with a regex with exactly one capture group returns a ``DataFrame`` with one column if ``expand=True``. .. ipython:: python - s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], - dtype="string") + s = pd.Series(["a1", "b2", "c3"], ["A11", "B22", "C33"], dtype="string") s s.index.str.extract("(?P[a-zA-Z])", expand=True) @@ -607,10 +605,9 @@ Unlike ``extract`` (which returns only the first match), .. ipython:: python - s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], - dtype="string") + s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"], dtype="string") s - two_groups = '(?P[a-z])(?P[0-9])' + two_groups = "(?P[a-z])(?P[0-9])" s.str.extract(two_groups, expand=True) the ``extractall`` method returns every match. The result of @@ -626,7 +623,7 @@ When each subject string in the Series has exactly one match, .. ipython:: python - s = pd.Series(['a3', 'b3', 'c2'], dtype="string") + s = pd.Series(["a3", "b3", "c2"], dtype="string") s then ``extractall(pat).xs(0, level='match')`` gives the same result as @@ -657,23 +654,20 @@ You can check whether elements contain a pattern: .. ipython:: python - pattern = r'[0-9][a-z]' - pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], - dtype="string").str.contains(pattern) + pattern = r"[0-9][a-z]" + pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.contains(pattern) Or whether elements match a pattern: .. 
ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], - dtype="string").str.match(pattern) + pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.match(pattern) .. versionadded:: 1.1.0 .. ipython:: python - pd.Series(['1', '2', '3a', '3b', '03c', '4dx'], - dtype="string").str.fullmatch(pattern) + pd.Series(["1", "2", "3a", "3b", "03c", "4dx"], dtype="string").str.fullmatch(pattern) .. note:: @@ -695,9 +689,10 @@ True or False: .. ipython:: python - s4 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") - s4.str.contains('A', na=False) + s4 = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) + s4.str.contains("A", na=False) .. _text.indicator: @@ -709,15 +704,15 @@ For example if they are separated by a ``'|'``: .. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string") - s.str.get_dummies(sep='|') + s = pd.Series(["a", "a|b", np.nan, "a|c"], dtype="string") + s.str.get_dummies(sep="|") String ``Index`` also supports ``get_dummies`` which returns a ``MultiIndex``. .. ipython:: python - idx = pd.Index(['a', 'a|b', np.nan, 'a|c']) - idx.str.get_dummies(sep='|') + idx = pd.Index(["a", "a|b", np.nan, "a|c"]) + idx.str.get_dummies(sep="|") See also :func:`~pandas.get_dummies`. 
From 5eb4cb16a8c8d0d0a3ca6166c87b48de5b087bda Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 30 Sep 2020 18:15:04 -0700 Subject: [PATCH 0958/1025] CLN: de-duplicate IntervalArray validators (#36653) --- pandas/core/arrays/interval.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1011381f235ca..5105b5b9cc57b 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -852,15 +852,15 @@ def _validate_fill_value(self, value): return self._validate_scalar(value) def _validate_fillna_value(self, value): - if not isinstance(value, Interval): + # This mirrors Datetimelike._validate_fill_value + try: + return self._validate_scalar(value) + except ValueError as err: msg = ( "'IntervalArray.fillna' only supports filling with a " f"scalar 'pandas.Interval'. Got a '{type(value).__name__}' instead." ) - raise TypeError(msg) - - self._check_closed_matches(value, name="value") - return value.left, value.right + raise TypeError(msg) from err def _validate_insert_value(self, value): return self._validate_scalar(value) @@ -887,14 +887,7 @@ def _validate_setitem_value(self, value): value_left, value_right = value.left, value.right else: - try: - # list-like of intervals - array = IntervalArray(value) - value_left, value_right = array.left, array.right - except TypeError as err: - # wrong type: not interval or NA - msg = f"'value' should be an interval type, got {type(value)} instead." 
- raise TypeError(msg) from err + return self._validate_listlike(value) if needs_float_conversion: raise ValueError("Cannot set float NaN to integer-backed IntervalArray") From da5ba00f44469aea324b300b092c3e1385c47eac Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Thu, 1 Oct 2020 03:18:40 +0200 Subject: [PATCH 0959/1025] [DOC]: Add explanation about DataFrame methods use all Categories (#36747) \ --- doc/source/user_guide/categorical.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 926c2d9be74c2..6a8e1767ef7e8 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -618,6 +618,19 @@ even if some categories are not present in the data: s = pd.Series(pd.Categorical(["a", "b", "c", "c"], categories=["c", "a", "b", "d"])) s.value_counts() +``DataFrame`` methods like :meth:`DataFrame.sum` also show "unused" categories. + +.. ipython:: python + + columns = pd.Categorical( + ["One", "One", "Two"], categories=["One", "Two", "Three"], ordered=True + ) + df = pd.DataFrame( + data=[[1, 2, 3], [4, 5, 6]], + columns=pd.MultiIndex.from_arrays([["A", "B", "B"], columns]), + ) + df.sum(axis=1, level=1) + Groupby will also show "unused" categories: .. 
ipython:: python From 3c043dbd373e7bff7036356106870b954cc95b9a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 30 Sep 2020 18:19:44 -0700 Subject: [PATCH 0960/1025] REF: privatize in core.missing, remove unused kwarg (#36645) --- pandas/core/arrays/base.py | 4 ++-- pandas/core/missing.py | 39 +++++++++++++++++++------------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index eae401f9744f0..c2fc72ff753a8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -32,7 +32,7 @@ from pandas.core import ops from pandas.core.algorithms import factorize_array, unique -from pandas.core.missing import backfill_1d, pad_1d +from pandas.core.missing import get_fill_func from pandas.core.sorting import nargminmax, nargsort _extension_array_shared_docs: Dict[str, str] = dict() @@ -616,7 +616,7 @@ def fillna(self, value=None, method=None, limit=None): if mask.any(): if method is not None: - func = pad_1d if method == "pad" else backfill_1d + func = get_fill_func(method) new_values = func(self.astype(object), limit=limit, mask=mask) new_values = self._from_sequence(new_values, dtype=self.dtype) else: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index c2926debcb6d6..f3229b2876e5d 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -562,20 +562,22 @@ def interpolate_2d( values = values.reshape(tuple((1,) + values.shape)) method = clean_fill_method(method) + tvalues = transf(values) if method == "pad": - values = transf(pad_2d(transf(values), limit=limit)) + result = _pad_2d(tvalues, limit=limit) else: - values = transf(backfill_2d(transf(values), limit=limit)) + result = _backfill_2d(tvalues, limit=limit) + result = transf(result) # reshape back if ndim == 1: - values = values[0] + result = result[0] if orig_values.dtype.kind == "M": # convert float back to datetime64 - values = values.astype(orig_values.dtype) + result = 
result.astype(orig_values.dtype) - return values + return result def _cast_values_for_fillna(values, dtype: DtypeObj, has_mask: bool): @@ -597,10 +599,9 @@ def _cast_values_for_fillna(values, dtype: DtypeObj, has_mask: bool): return values -def _fillna_prep(values, mask=None, dtype: Optional[DtypeObj] = None): - # boilerplate for pad_1d, backfill_1d, pad_2d, backfill_2d - if dtype is None: - dtype = values.dtype +def _fillna_prep(values, mask=None): + # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d + dtype = values.dtype has_mask = mask is not None if not has_mask: @@ -613,20 +614,20 @@ def _fillna_prep(values, mask=None, dtype: Optional[DtypeObj] = None): return values, mask -def pad_1d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): - values, mask = _fillna_prep(values, mask, dtype) +def _pad_1d(values, limit=None, mask=None): + values, mask = _fillna_prep(values, mask) algos.pad_inplace(values, mask, limit=limit) return values -def backfill_1d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): - values, mask = _fillna_prep(values, mask, dtype) +def _backfill_1d(values, limit=None, mask=None): + values, mask = _fillna_prep(values, mask) algos.backfill_inplace(values, mask, limit=limit) return values -def pad_2d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): - values, mask = _fillna_prep(values, mask, dtype) +def _pad_2d(values, limit=None, mask=None): + values, mask = _fillna_prep(values, mask) if np.all(values.shape): algos.pad_2d_inplace(values, mask, limit=limit) @@ -636,8 +637,8 @@ def pad_2d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): return values -def backfill_2d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None): - values, mask = _fillna_prep(values, mask, dtype) +def _backfill_2d(values, limit=None, mask=None): + values, mask = _fillna_prep(values, mask) if np.all(values.shape): algos.backfill_2d_inplace(values, mask, limit=limit) @@ -647,7 +648,7 @@ 
def backfill_2d(values, limit=None, mask=None, dtype: Optional[DtypeObj] = None) return values -_fill_methods = {"pad": pad_1d, "backfill": backfill_1d} +_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} def get_fill_func(method): @@ -724,7 +725,7 @@ def inner(invalid, limit): return f_idx & b_idx -def _rolling_window(a, window): +def _rolling_window(a: np.ndarray, window: int): """ [True, True, False, True, False], 2 -> From 901151fee0c29db888054b80118ddf3a05fbd7ce Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 1 Oct 2020 02:22:05 +0100 Subject: [PATCH 0961/1025] DEPR: Deprecate params levels & codes in MultiIndex.copy (#36685) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/indexes/multi.py | 21 ++++++++++++++++++++- pandas/tests/indexes/multi/test_copy.py | 23 +++++++++++++++++++++-- 3 files changed, 42 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ddee06aeab779..e810fc0239b40 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -258,6 +258,7 @@ Deprecations ~~~~~~~~~~~~ - Deprecated parameter ``inplace`` in :meth:`MultiIndex.set_codes` and :meth:`MultiIndex.set_levels` (:issue:`35626`) - Deprecated parameter ``dtype`` in :meth:`~Index.copy` on method all index classes. Use the :meth:`~Index.astype` method instead for changing dtype (:issue:`35853`) +- Deprecated parameters ``levels`` and ``codes`` in :meth:`~MultiIndex.copy`. 
Use the :meth:`~MultiIndex.set_levels` and :meth:`~MultiIndex.set_codes` methods instead (:issue:`36685`) - Date parser functions :func:`~pandas.io.date_converters.parse_date_time`, :func:`~pandas.io.date_converters.parse_date_fields`, :func:`~pandas.io.date_converters.parse_all_fields` and :func:`~pandas.io.date_converters.generic_parser` from ``pandas.io.date_converters`` are deprecated and will be removed in a future version; use :func:`to_datetime` instead (:issue:`35741`) - :meth:`DataFrame.lookup` is deprecated and will be removed in a future version, use :meth:`DataFrame.melt` and :meth:`DataFrame.loc` instead (:issue:`18682`) - The :meth:`Index.to_native_types` is deprecated. Use ``.astype(str)`` instead (:issue:`28867`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index c0b32c79435ed..1628b44be4096 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1132,7 +1132,11 @@ def copy( .. deprecated:: 1.2.0 levels : sequence, optional + + .. deprecated:: 1.2.0 codes : sequence, optional + + .. deprecated:: 1.2.0 deep : bool, default False name : Label Kept for compatibility with 1-dimensional Index. Should not be used. @@ -1148,6 +1152,21 @@ def copy( This could be potentially expensive on large MultiIndex objects. """ names = self._validate_names(name=name, names=names, deep=deep) + if levels is not None: + warnings.warn( + "parameter levels is deprecated and will be removed in a future " + "version. Use the set_levels method instead.", + FutureWarning, + stacklevel=2, + ) + if codes is not None: + warnings.warn( + "parameter codes is deprecated and will be removed in a future " + "version. 
Use the set_codes method instead.", + FutureWarning, + stacklevel=2, + ) + if deep: from copy import deepcopy @@ -1575,7 +1594,7 @@ def dropna(self, how="any"): raise ValueError(f"invalid how option: {how}") new_codes = [level_codes[~indexer] for level_codes in self.codes] - return self.copy(codes=new_codes, deep=True) + return self.set_codes(codes=new_codes) def _get_level_values(self, level, unique=False): """ diff --git a/pandas/tests/indexes/multi/test_copy.py b/pandas/tests/indexes/multi/test_copy.py index 67b815ecba3b8..8dc8572493444 100644 --- a/pandas/tests/indexes/multi/test_copy.py +++ b/pandas/tests/indexes/multi/test_copy.py @@ -69,8 +69,6 @@ def test_copy_method(deep): "kwarg, value", [ ("names", ["third", "fourth"]), - ("levels", [["foo2", "bar2"], ["fizz2", "buzz2"]]), - ("codes", [[1, 0, 0, 0], [1, 1, 0, 0]]), ], ) def test_copy_method_kwargs(deep, kwarg, value): @@ -85,3 +83,24 @@ def test_copy_method_kwargs(deep, kwarg, value): assert getattr(idx_copy, kwarg) == value else: assert [list(i) for i in getattr(idx_copy, kwarg)] == value + + +@pytest.mark.parametrize("deep", [True, False]) +@pytest.mark.parametrize( + "param_name, param_value", + [ + ("levels", [["foo2", "bar2"], ["fizz2", "buzz2"]]), + ("codes", [[1, 0, 0, 0], [1, 1, 0, 0]]), + ], +) +def test_copy_deprecated_parameters(deep, param_name, param_value): + # gh-36685 + idx = MultiIndex( + levels=[["foo", "bar"], ["fizz", "buzz"]], + codes=[[0, 0, 0, 1], [0, 0, 1, 1]], + names=["first", "second"], + ) + with tm.assert_produces_warning(FutureWarning): + idx_copy = idx.copy(deep=deep, **{param_name: param_value}) + + assert [list(i) for i in getattr(idx_copy, param_name)] == param_value From 5f215d7aa82e420374097f32811b4f853d8031cd Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Thu, 1 Oct 2020 02:23:32 +0100 Subject: [PATCH 0962/1025] TYP: some more static definitions of methods for DatetimeIndex (#36742) --- pandas/core/indexes/datetimes.py | 40 +++++++++++++++++++++++++++----- 1 file 
changed, 34 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 016544d823ae3..da78f8ff5d603 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1,6 +1,6 @@ from datetime import date, datetime, time, timedelta, tzinfo import operator -from typing import Optional +from typing import TYPE_CHECKING, Optional import warnings import numpy as np @@ -30,6 +30,9 @@ from pandas.core.indexes.extension import inherit_names from pandas.core.tools.times import to_time +if TYPE_CHECKING: + from pandas import DataFrame, Float64Index, PeriodIndex, TimedeltaIndex + def _new_DatetimeIndex(cls, d): """ @@ -64,8 +67,7 @@ def _new_DatetimeIndex(cls, d): @inherit_names( - ["to_perioddelta", "to_julian_date", "strftime", "isocalendar"] - + DatetimeArray._field_ops + DatetimeArray._field_ops + [ method for method in DatetimeArray._datetimelike_methods @@ -220,7 +222,12 @@ class DatetimeIndex(DatetimeTimedeltaMixin): tz: Optional[tzinfo] # -------------------------------------------------------------------- - # methods that dispatch to array and wrap result in DatetimeIndex + # methods that dispatch to DatetimeArray and wrap result + + @doc(DatetimeArray.strftime) + def strftime(self, date_format) -> Index: + arr = self._data.strftime(date_format) + return Index(arr, name=self.name) @doc(DatetimeArray.tz_convert) def tz_convert(self, tz) -> "DatetimeIndex": @@ -235,9 +242,30 @@ def tz_localize( return type(self)._simple_new(arr, name=self.name) @doc(DatetimeArray.to_period) - def to_period(self, freq=None) -> "DatetimeIndex": + def to_period(self, freq=None) -> "PeriodIndex": + from pandas.core.indexes.api import PeriodIndex + arr = self._data.to_period(freq) - return type(self)._simple_new(arr, name=self.name) + return PeriodIndex._simple_new(arr, name=self.name) + + @doc(DatetimeArray.to_perioddelta) + def to_perioddelta(self, freq) -> "TimedeltaIndex": + from pandas.core.indexes.api 
import TimedeltaIndex + + arr = self._data.to_perioddelta(freq) + return TimedeltaIndex._simple_new(arr, name=self.name) + + @doc(DatetimeArray.to_julian_date) + def to_julian_date(self) -> "Float64Index": + from pandas.core.indexes.api import Float64Index + + arr = self._data.to_julian_date() + return Float64Index._simple_new(arr, name=self.name) + + @doc(DatetimeArray.isocalendar) + def isocalendar(self) -> "DataFrame": + df = self._data.isocalendar() + return df.set_index(self) # -------------------------------------------------------------------- # Constructors From 8e1345b1934ea4ef45d275afcee61ee026618526 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Thu, 1 Oct 2020 02:05:43 -0700 Subject: [PATCH 0963/1025] TYP: mostly datetimelike (#36696) --- pandas/core/arrays/categorical.py | 9 ++---- pandas/core/arrays/datetimelike.py | 4 ++- pandas/core/arrays/datetimes.py | 5 +++- pandas/core/arrays/period.py | 6 ++-- pandas/core/arrays/string_.py | 2 +- pandas/core/arrays/timedeltas.py | 38 ++++++++++++++++---------- pandas/core/base.py | 2 +- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/period.py | 20 +++++++++----- pandas/core/indexes/timedeltas.py | 2 +- pandas/core/series.py | 7 ++--- pandas/tests/extension/arrow/arrays.py | 4 ++- 12 files changed, 60 insertions(+), 41 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 41c4de51fc2e1..9db22df20e66d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -288,6 +288,7 @@ class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMi # tolist is not actually deprecated, just suppressed in the __dir__ _deprecations = PandasObject._deprecations | frozenset(["tolist"]) _typ = "categorical" + _can_hold_na = True def __init__( self, values, categories=None, ordered=None, dtype=None, fastpath=False @@ -1268,10 +1269,10 @@ def __setstate__(self, state): setattr(self, k, v) @property - def nbytes(self): 
+ def nbytes(self) -> int: return self._codes.nbytes + self.dtype.categories.values.nbytes - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: """ Memory usage of my values @@ -2144,10 +2145,6 @@ def equals(self, other: object) -> bool: return np.array_equal(self._codes, other_codes) return False - @property - def _can_hold_na(self): - return True - @classmethod def _concat_same_type(self, to_concat): from pandas.core.dtypes.concat import union_categoricals diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 0f723546fb4c2..83a9c0ba61c2d 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -117,7 +117,9 @@ class AttributesMixin: _data: np.ndarray @classmethod - def _simple_new(cls, values: np.ndarray, **kwargs): + def _simple_new( + cls, values: np.ndarray, freq: Optional[BaseOffset] = None, dtype=None + ): raise AbstractMethodError(cls) @property diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 6b051f1f73467..cd5449058fb33 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -6,6 +6,7 @@ from pandas._libs import lib, tslib from pandas._libs.tslibs import ( + BaseOffset, NaT, NaTType, Resolution, @@ -283,7 +284,9 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): type(self)._validate_frequency(self, freq) @classmethod - def _simple_new(cls, values, freq=None, dtype=DT64NS_DTYPE): + def _simple_new( + cls, values, freq: Optional[BaseOffset] = None, dtype=DT64NS_DTYPE + ) -> "DatetimeArray": assert isinstance(values, np.ndarray) if values.dtype != DT64NS_DTYPE: assert values.dtype == "i8" diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 372ef7df9dc3a..15f2842e39875 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -174,11 +174,13 @@ def __init__(self, values, freq=None, dtype=None, copy=False): 
self._dtype = PeriodDtype(freq) @classmethod - def _simple_new(cls, values: np.ndarray, freq=None, **kwargs) -> "PeriodArray": + def _simple_new( + cls, values: np.ndarray, freq: Optional[BaseOffset] = None, dtype=None + ) -> "PeriodArray": # alias for PeriodArray.__init__ assertion_msg = "Should be numpy array of type i8" assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg - return cls(values, freq=freq, **kwargs) + return cls(values, freq=freq, dtype=dtype) @classmethod def _from_sequence( diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index bf8b93b5a4164..9ea34d4680748 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -308,7 +308,7 @@ def value_counts(self, dropna=False): return value_counts(self._ndarray, dropna=dropna).astype("Int64") - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: result = self._ndarray.nbytes if deep: return result + lib.memory_usage_of_objects(self._ndarray) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 145380ecce9fd..6ca57e7872910 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -1,10 +1,11 @@ from datetime import timedelta -from typing import List, Union +from typing import List, Optional, Union import numpy as np from pandas._libs import lib, tslibs from pandas._libs.tslibs import ( + BaseOffset, NaT, NaTType, Period, @@ -45,8 +46,8 @@ from pandas.core.ops.common import unpack_zerodim_and_defer -def _field_accessor(name, alias, docstring=None): - def f(self): +def _field_accessor(name: str, alias: str, docstring: str): + def f(self) -> np.ndarray: values = self.asi8 result = get_timedelta_field(values, alias) if self._hasnans: @@ -121,7 +122,7 @@ def _box_func(self, x) -> Union[Timedelta, NaTType]: return Timedelta(x, unit="ns") @property - def dtype(self): + def dtype(self) -> np.dtype: """ The dtype for the TimedeltaArray. 
@@ -196,7 +197,9 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): type(self)._validate_frequency(self, freq) @classmethod - def _simple_new(cls, values, freq=None, dtype=TD64NS_DTYPE): + def _simple_new( + cls, values, freq: Optional[BaseOffset] = None, dtype=TD64NS_DTYPE + ) -> "TimedeltaArray": assert dtype == TD64NS_DTYPE, dtype assert isinstance(values, np.ndarray), type(values) if values.dtype != TD64NS_DTYPE: @@ -211,8 +214,13 @@ def _simple_new(cls, values, freq=None, dtype=TD64NS_DTYPE): @classmethod def _from_sequence( - cls, data, dtype=TD64NS_DTYPE, copy=False, freq=lib.no_default, unit=None - ): + cls, + data, + dtype=TD64NS_DTYPE, + copy: bool = False, + freq=lib.no_default, + unit=None, + ) -> "TimedeltaArray": if dtype: _validate_td64_dtype(dtype) @@ -240,7 +248,9 @@ def _from_sequence( return result @classmethod - def _generate_range(cls, start, end, periods, freq, closed=None): + def _generate_range( + cls, start, end, periods, freq, closed=None + ) -> "TimedeltaArray": periods = dtl.validate_periods(periods) if freq is None and any(x is None for x in [periods, start, end]): @@ -298,7 +308,7 @@ def _maybe_clear_freq(self): # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): # We handle # --> timedelta64[ns] # --> timedelta64 @@ -461,7 +471,7 @@ def _addsub_object_array(self, other, op): ) from err @unpack_zerodim_and_defer("__mul__") - def __mul__(self, other): + def __mul__(self, other) -> "TimedeltaArray": if is_scalar(other): # numpy will accept float and int, raise TypeError for others result = self._data * other @@ -737,22 +747,22 @@ def __rdivmod__(self, other): res2 = other - res1 * self return res1, res2 - def __neg__(self): + def __neg__(self) -> "TimedeltaArray": if self.freq is not None: return type(self)(-self._data, freq=-self.freq) return type(self)(-self._data) - def 
__pos__(self): + def __pos__(self) -> "TimedeltaArray": return type(self)(self._data, freq=self.freq) - def __abs__(self): + def __abs__(self) -> "TimedeltaArray": # Note: freq is not preserved return type(self)(np.abs(self._data)) # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timedelta methods - def total_seconds(self): + def total_seconds(self) -> np.ndarray: """ Return total duration of each element expressed in seconds. diff --git a/pandas/core/base.py b/pandas/core/base.py index 4d5cddc086b2a..a50181c1be2f0 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1347,7 +1347,7 @@ def memory_usage(self, deep=False): Parameters ---------- - deep : bool + deep : bool, default False Introspect the data deeply, interrogate `object` dtypes for system-level memory consumption. diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 3d2820976a6af..23cc93b9ecb33 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -105,7 +105,7 @@ def _is_all_dates(self) -> bool: # Abstract data attributes @property - def values(self): + def values(self) -> np.ndarray: # Note: PeriodArray overrides this to return an ndarray of objects. 
return self._data._data diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 900d3f9f1866b..27b60747015de 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -60,7 +60,7 @@ def _new_PeriodIndex(cls, **d): @inherit_names( - ["strftime", "to_timestamp", "start_time", "end_time"] + PeriodArray._field_ops, + ["strftime", "start_time", "end_time"] + PeriodArray._field_ops, PeriodArray, wrap=True, ) @@ -149,12 +149,18 @@ class PeriodIndex(DatetimeIndexOpsMixin, Int64Index): # -------------------------------------------------------------------- # methods that dispatch to array and wrap result in PeriodIndex + # These are defined here instead of via inherit_names for mypy @doc(PeriodArray.asfreq) def asfreq(self, freq=None, how: str = "E") -> "PeriodIndex": arr = self._data.asfreq(freq, how) return type(self)._simple_new(arr, name=self.name) + @doc(PeriodArray.to_timestamp) + def to_timestamp(self, freq=None, how="start") -> DatetimeIndex: + arr = self._data.to_timestamp(freq, how) + return DatetimeIndex._simple_new(arr, name=self.name) + # ------------------------------------------------------------------------ # Index Constructors @@ -244,11 +250,11 @@ def _simple_new(cls, values: PeriodArray, name: Label = None): # Data @property - def values(self): + def values(self) -> np.ndarray: return np.asarray(self) @property - def _has_complex_internals(self): + def _has_complex_internals(self) -> bool: # used to avoid libreduction code paths, which raise or require conversion return True @@ -402,7 +408,7 @@ def asof_locs(self, where, mask: np.ndarray) -> np.ndarray: return result @doc(Index.astype) - def astype(self, dtype, copy=True, how="start"): + def astype(self, dtype, copy: bool = True, how="start"): dtype = pandas_dtype(dtype) if is_datetime64_any_dtype(dtype): @@ -421,7 +427,7 @@ def is_full(self) -> bool: """ if len(self) == 0: return True - if not self.is_monotonic: + if not self.is_monotonic_increasing: raise 
ValueError("Index is not monotonic") values = self.asi8 return ((values[1:] - values[:-1]) < 2).all() @@ -432,7 +438,7 @@ def inferred_type(self) -> str: # indexing return "period" - def insert(self, loc, item): + def insert(self, loc: int, item): if not isinstance(item, Period) or self.freq != item.freq: return self.astype(object).insert(loc, item) @@ -706,7 +712,7 @@ def _union(self, other, sort): # ------------------------------------------------------------------------ - def memory_usage(self, deep=False): + def memory_usage(self, deep: bool = False) -> int: result = super().memory_usage(deep=deep) if hasattr(self, "_cache") and "_int64index" in self._cache: result += self._int64index.memory_usage(deep=deep) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 20ebc80c7e0af..854c4e33eca01 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -184,7 +184,7 @@ def _formatter_func(self): # ------------------------------------------------------------------- @doc(Index.astype) - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): dtype = pandas_dtype(dtype) if is_timedelta64_dtype(dtype) and not is_timedelta64_ns_dtype(dtype): # Have to repeat the check for 'timedelta64' (not ns) dtype diff --git a/pandas/core/series.py b/pandas/core/series.py index d2c702d924136..fca1b6b08b434 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -394,7 +394,7 @@ def _constructor_expanddim(self) -> Type["DataFrame"]: # types @property - def _can_hold_na(self): + def _can_hold_na(self) -> bool: return self._mgr._can_hold_na _index = None @@ -4904,10 +4904,7 @@ def to_timestamp(self, freq=None, how="start", copy=True) -> "Series": if not isinstance(self.index, PeriodIndex): raise TypeError(f"unsupported Type {type(self.index).__name__}") - # error: "PeriodIndex" has no attribute "to_timestamp" - new_index = self.index.to_timestamp( # type: ignore[attr-defined] - 
freq=freq, how=how - ) + new_index = self.index.to_timestamp(freq=freq, how=how) return self._constructor(new_values, index=new_index).__finalize__( self, method="to_timestamp" ) diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py index 8a18f505058bc..5e930b7b22f30 100644 --- a/pandas/tests/extension/arrow/arrays.py +++ b/pandas/tests/extension/arrow/arrays.py @@ -68,6 +68,8 @@ def construct_array_type(cls) -> Type["ArrowStringArray"]: class ArrowExtensionArray(ExtensionArray): + _data: pa.ChunkedArray + @classmethod def from_scalars(cls, values): arr = pa.chunked_array([pa.array(np.asarray(values))]) @@ -129,7 +131,7 @@ def __or__(self, other): return self._boolean_op(other, operator.or_) @property - def nbytes(self): + def nbytes(self) -> int: return sum( x.size for chunk in self._data.chunks From 552d21c39c1834f72998d64c3916a74a819510bc Mon Sep 17 00:00:00 2001 From: krajatcl <53620269+krajatcl@users.noreply.github.com> Date: Thu, 1 Oct 2020 16:24:26 +0530 Subject: [PATCH 0964/1025] TST: insert 'match' to bare pytest raises in pandas/tests/indexing/test_chaining_and_caching.py (#36762) Co-authored-by: Rajat Bishnoi --- .../indexing/test_chaining_and_caching.py | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 1254f1f217a2e..1241d394d7936 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -155,10 +155,11 @@ def test_detect_chained_assignment(self): ) assert df._is_copy is None - with pytest.raises(com.SettingWithCopyError): + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): df["A"][0] = -5 - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): df["A"][1] = np.nan assert 
df["A"]._is_copy is None @@ -171,7 +172,7 @@ def test_detect_chained_assignment(self): } ) - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[0]["A"] = -5 # Doc example @@ -183,17 +184,17 @@ def test_detect_chained_assignment(self): ) assert df._is_copy is None - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): indexer = df.a.str.startswith("o") df[indexer]["c"] = 42 expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): df["A"][0] = 111 - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 df.loc[0, "A"] = 111 @@ -293,7 +294,7 @@ def random_text(nobs=100): df = DataFrame(np.arange(0, 9), columns=["count"]) df["group"] = "b" - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): df.iloc[0:5]["group"] = "a" # Mixed type setting but same dtype & changing dtype @@ -306,13 +307,13 @@ def random_text(nobs=100): ) ) - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[2]["D"] = "foo" - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[2]["C"] = "foo" - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): df["C"][2] = "foo" def test_setting_with_copy_bug(self): @@ -340,8 +341,10 @@ def test_detect_chained_assignment_warnings_errors(self): with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(com.SettingWithCopyWarning): df.loc[0]["A"] = 111 + + msg = "A value is trying to be set on a copy of a slice from a DataFrame" with 
option_context("chained_assignment", "raise"): - with pytest.raises(com.SettingWithCopyError): + with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): From 031f2b5e2bc8ac7b795ffff3c5734f7514069ab7 Mon Sep 17 00:00:00 2001 From: Mayank Chaudhary <62796466+mayank1897@users.noreply.github.com> Date: Thu, 1 Oct 2020 19:59:46 +0530 Subject: [PATCH 0965/1025] Update README.md (#36772) * Update README.md * Update README.md --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a2f2f1c04442a..da8487d76f4a1 100644 --- a/README.md +++ b/README.md @@ -32,32 +32,32 @@ its way towards this goal. Here are just a few of the things that pandas does well: - Easy handling of [**missing data**][missing-data] (represented as - `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data + `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data; - Size mutability: columns can be [**inserted and deleted**][insertion-deletion] from DataFrame and higher dimensional - objects + objects; - Automatic and explicit [**data alignment**][alignment]: objects can be explicitly aligned to a set of labels, or the user can simply ignore the labels and let `Series`, `DataFrame`, etc. 
automatically - align the data for you in computations + align the data for you in computations; - Powerful, flexible [**group by**][groupby] functionality to perform split-apply-combine operations on data sets, for both aggregating - and transforming data + and transforming data; - Make it [**easy to convert**][conversion] ragged, differently-indexed data in other Python and NumPy data structures - into DataFrame objects + into DataFrame objects; - Intelligent label-based [**slicing**][slicing], [**fancy indexing**][fancy-indexing], and [**subsetting**][subsetting] of - large data sets + large data sets; - Intuitive [**merging**][merging] and [**joining**][joining] data - sets + sets; - Flexible [**reshaping**][reshape] and [**pivoting**][pivot-table] of - data sets + data sets; - [**Hierarchical**][mi] labeling of axes (possible to have multiple - labels per tick) + labels per tick); - Robust IO tools for loading data from [**flat files**][flat-files] (CSV and delimited), [**Excel files**][excel], [**databases**][db], - and saving/loading data from the ultrafast [**HDF5 format**][hdfstore] + and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]; - [**Time series**][timeseries]-specific functionality: date range generation and frequency conversion, moving window statistics, date shifting and lagging. From e730e605b744a3f623711cc3e0bacb6504867b5d Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 1 Oct 2020 18:22:57 +0100 Subject: [PATCH 0966/1025] Revert "Update README.md (#36772)" (#36781) --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index da8487d76f4a1..a2f2f1c04442a 100644 --- a/README.md +++ b/README.md @@ -32,32 +32,32 @@ its way towards this goal. 
Here are just a few of the things that pandas does well: - Easy handling of [**missing data**][missing-data] (represented as - `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data; + `NaN`, `NA`, or `NaT`) in floating point as well as non-floating point data - Size mutability: columns can be [**inserted and deleted**][insertion-deletion] from DataFrame and higher dimensional - objects; + objects - Automatic and explicit [**data alignment**][alignment]: objects can be explicitly aligned to a set of labels, or the user can simply ignore the labels and let `Series`, `DataFrame`, etc. automatically - align the data for you in computations; + align the data for you in computations - Powerful, flexible [**group by**][groupby] functionality to perform split-apply-combine operations on data sets, for both aggregating - and transforming data; + and transforming data - Make it [**easy to convert**][conversion] ragged, differently-indexed data in other Python and NumPy data structures - into DataFrame objects; + into DataFrame objects - Intelligent label-based [**slicing**][slicing], [**fancy indexing**][fancy-indexing], and [**subsetting**][subsetting] of - large data sets; + large data sets - Intuitive [**merging**][merging] and [**joining**][joining] data - sets; + sets - Flexible [**reshaping**][reshape] and [**pivoting**][pivot-table] of - data sets; + data sets - [**Hierarchical**][mi] labeling of axes (possible to have multiple - labels per tick); + labels per tick) - Robust IO tools for loading data from [**flat files**][flat-files] (CSV and delimited), [**Excel files**][excel], [**databases**][db], - and saving/loading data from the ultrafast [**HDF5 format**][hdfstore]; + and saving/loading data from the ultrafast [**HDF5 format**][hdfstore] - [**Time series**][timeseries]-specific functionality: date range generation and frequency conversion, moving window statistics, date shifting and lagging. 
From 706fb0400aaf30504b62f0636aca5891bc5529c5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska <48889395+arw2019@users.noreply.github.com> Date: Thu, 1 Oct 2020 13:52:13 -0400 Subject: [PATCH 0967/1025] BUG: use cmath to test complex number equality in pandas._testing (#36580) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/testing.pyx | 12 +++++++ pandas/tests/util/test_assert_almost_equal.py | 31 +++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index e810fc0239b40..302aa3f9aa998 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -325,6 +325,7 @@ Numeric - Bug in :meth:`DataFrame.any` with ``axis=1`` and ``bool_only=True`` ignoring the ``bool_only`` keyword (:issue:`32432`) - Bug in :meth:`Series.equals` where a ``ValueError`` was raised when numpy arrays were compared to scalars (:issue:`35267`) - Bug in :class:`Series` where two :class:`Series` each have a :class:`DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) +- Bug in :meth:`pd._testing.assert_almost_equal` was incorrect for complex numeric types (:issue:`28235`) - Conversion diff --git a/pandas/_libs/testing.pyx b/pandas/_libs/testing.pyx index 64fc8d615ea9c..b2f19fcf5f5da 100644 --- a/pandas/_libs/testing.pyx +++ b/pandas/_libs/testing.pyx @@ -1,3 +1,4 @@ +import cmath import math import numpy as np @@ -7,6 +8,7 @@ from numpy cimport import_array import_array() from pandas._libs.util cimport is_array +from pandas._libs.lib import is_complex from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.missing import array_equivalent, isna @@ -210,4 +212,14 @@ cpdef assert_almost_equal(a, b, f"with rtol={rtol}, atol={atol}") return True + if is_complex(a) and is_complex(b): + if array_equivalent(a, b, strict_nan=True): + # inf comparison + return True + + if not cmath.isclose(a, b, 
rel_tol=rtol, abs_tol=atol): + assert False, (f"expected {b:.5f} but got {a:.5f}, " + f"with rtol={rtol}, atol={atol}") + return True + raise AssertionError(f"{a} != {b}") diff --git a/pandas/tests/util/test_assert_almost_equal.py b/pandas/tests/util/test_assert_almost_equal.py index c25668c33bfc4..c4bc3b7ee352d 100644 --- a/pandas/tests/util/test_assert_almost_equal.py +++ b/pandas/tests/util/test_assert_almost_equal.py @@ -146,6 +146,37 @@ def test_assert_not_almost_equal_numbers_rtol(a, b): _assert_not_almost_equal_both(a, b, rtol=0.05) +@pytest.mark.parametrize( + "a,b,rtol", + [ + (1.00001, 1.00005, 0.001), + (-0.908356 + 0.2j, -0.908358 + 0.2j, 1e-3), + (0.1 + 1.009j, 0.1 + 1.006j, 0.1), + (0.1001 + 2.0j, 0.1 + 2.001j, 0.01), + ], +) +def test_assert_almost_equal_complex_numbers(a, b, rtol): + _assert_almost_equal_both(a, b, rtol=rtol) + _assert_almost_equal_both(np.complex64(a), np.complex64(b), rtol=rtol) + _assert_almost_equal_both(np.complex128(a), np.complex128(b), rtol=rtol) + + +@pytest.mark.parametrize( + "a,b,rtol", + [ + (0.58310768, 0.58330768, 1e-7), + (-0.908 + 0.2j, -0.978 + 0.2j, 0.001), + (0.1 + 1j, 0.1 + 2j, 0.01), + (-0.132 + 1.001j, -0.132 + 1.005j, 1e-5), + (0.58310768j, 0.58330768j, 1e-9), + ], +) +def test_assert_not_almost_equal_complex_numbers(a, b, rtol): + _assert_not_almost_equal_both(a, b, rtol=rtol) + _assert_not_almost_equal_both(np.complex64(a), np.complex64(b), rtol=rtol) + _assert_not_almost_equal_both(np.complex128(a), np.complex128(b), rtol=rtol) + + @pytest.mark.parametrize("a,b", [(0, 0), (0, 0.0), (0, np.float64(0)), (0.00000001, 0)]) def test_assert_almost_equal_numbers_with_zeros(a, b): _assert_almost_equal_both(a, b) From f0ea1adcb5a7dd58d48b82d833027b505e676a8b Mon Sep 17 00:00:00 2001 From: BeanNan <40813941+BeanNan@users.noreply.github.com> Date: Fri, 2 Oct 2020 02:32:24 +0800 Subject: [PATCH 0968/1025] BUG: Index sortlevel ascending add type checking #32334 (#36767) * BUG: Index sortlevel ascending add type 
checking #32334 * DOC: add v1.2 whatsnew entry #32334 * BUG: adjust judgment conditions, len(ascending) > 1 => len(ascending) != 1 * DOC: adjustment whatsnew Co-authored-by: beanan --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/indexes/base.py | 14 ++++++++++++++ pandas/tests/indexes/test_base.py | 25 +++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 302aa3f9aa998..016e8d90e7d21 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -353,7 +353,7 @@ Indexing - Bug in :meth:`PeriodIndex.get_loc` incorrectly raising ``ValueError`` on non-datelike strings instead of ``KeyError``, causing similar errors in :meth:`Series.__geitem__`, :meth:`Series.__contains__`, and :meth:`Series.loc.__getitem__` (:issue:`34240`) - Bug in :meth:`Index.sort_values` where, when empty values were passed, the method would break by trying to compare missing values instead of pushing them to the end of the sort order. (:issue:`35584`) - Bug in :meth:`Index.get_indexer` and :meth:`Index.get_indexer_non_unique` where int64 arrays are returned instead of intp. (:issue:`36359`) -- +- Bug in :meth:`DataFrame.sort_index` where parameter ascending passed as a list on a single level index gives wrong result. 
(:issue:`32334`) Missing ^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 8ee09d8ad9be3..ff3d8bf05f9a5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1515,6 +1515,20 @@ def sortlevel(self, level=None, ascending=True, sort_remaining=None): ------- Index """ + if not isinstance(ascending, (list, bool)): + raise TypeError( + "ascending must be a single bool value or" + "a list of bool values of length 1" + ) + + if isinstance(ascending, list): + if len(ascending) != 1: + raise TypeError("ascending must be a list of bool values of length 1") + ascending = ascending[0] + + if not isinstance(ascending, bool): + raise TypeError("ascending must be a bool value") + return self.sort_values(return_indexer=True, ascending=ascending) def _get_level_values(self, level): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 77585f4003fe9..8db1bcc84bfa6 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -2222,6 +2222,31 @@ def test_contains_method_removed(self, index): with pytest.raises(AttributeError, match=msg): index.contains(1) + def test_sortlevel(self): + index = pd.Index([5, 4, 3, 2, 1]) + with pytest.raises(Exception, match="ascending must be a single bool value or"): + index.sortlevel(ascending="True") + + with pytest.raises( + Exception, match="ascending must be a list of bool values of length 1" + ): + index.sortlevel(ascending=[True, True]) + + with pytest.raises(Exception, match="ascending must be a bool value"): + index.sortlevel(ascending=["True"]) + + expected = pd.Index([1, 2, 3, 4, 5]) + result = index.sortlevel(ascending=[True]) + tm.assert_index_equal(result[0], expected) + + expected = pd.Index([1, 2, 3, 4, 5]) + result = index.sortlevel(ascending=True) + tm.assert_index_equal(result[0], expected) + + expected = pd.Index([5, 4, 3, 2, 1]) + result = index.sortlevel(ascending=False) + tm.assert_index_equal(result[0], 
expected) + class TestMixedIntIndex(Base): # Mostly the tests from common.py for which the results differ From d0b2cad9ed780f19accad35765eb84003d27397f Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Fri, 2 Oct 2020 16:33:45 +0200 Subject: [PATCH 0969/1025] DOC: Fix code style in documentation (#36780) --- .../06_calculate_statistics.rst | 5 +- .../07_reshape_table_layout.rst | 32 ++++---- .../intro_tutorials/08_combine_dataframes.rst | 6 +- .../intro_tutorials/09_timeseries.rst | 7 +- .../intro_tutorials/10_text_data.rst | 3 +- doc/source/user_guide/10min.rst | 76 ++++++++++++------- doc/source/user_guide/sparse.rst | 35 +++++---- 7 files changed, 94 insertions(+), 70 deletions(-) diff --git a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst index bd85160d2622a..7e919777fdf03 100644 --- a/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst +++ b/doc/source/getting_started/intro_tutorials/06_calculate_statistics.rst @@ -122,8 +122,9 @@ aggregating statistics for given columns can be defined using the .. ipython:: python - titanic.agg({'Age': ['min', 'max', 'median', 'skew'], - 'Fare': ['min', 'max', 'median', 'mean']}) + titanic.agg( + {"Age": ["min", "max", "median", "skew"], "Fare": ["min", "max", "median", "mean"]} + ) .. raw:: html diff --git a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst index c16fec6aaba9f..20c36133330c4 100644 --- a/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst +++ b/doc/source/getting_started/intro_tutorials/07_reshape_table_layout.rst @@ -101,8 +101,9 @@ measurement. .. 
ipython:: python - air_quality = pd.read_csv("data/air_quality_long.csv", - index_col="date.utc", parse_dates=True) + air_quality = pd.read_csv( + "data/air_quality_long.csv", index_col="date.utc", parse_dates=True + ) air_quality.head() .. raw:: html @@ -247,8 +248,9 @@ I want the mean concentrations for :math:`NO_2` and :math:`PM_{2.5}` in each of .. ipython:: python - air_quality.pivot_table(values="value", index="location", - columns="parameter", aggfunc="mean") + air_quality.pivot_table( + values="value", index="location", columns="parameter", aggfunc="mean" + ) In the case of :meth:`~DataFrame.pivot`, the data is only rearranged. When multiple values need to be aggregated (in this specific case, the values on @@ -266,9 +268,13 @@ the ``margin`` parameter to ``True``: .. ipython:: python - air_quality.pivot_table(values="value", index="location", - columns="parameter", aggfunc="mean", - margins=True) + air_quality.pivot_table( + values="value", + index="location", + columns="parameter", + aggfunc="mean", + margins=True, + ) .. raw:: html @@ -345,12 +351,12 @@ The :func:`pandas.melt` method can be defined in more detail: .. ipython:: python - no_2 = no2_pivoted.melt(id_vars="date.utc", - value_vars=["BETR801", - "FR04014", - "London Westminster"], - value_name="NO_2", - var_name="id_location") + no_2 = no2_pivoted.melt( + id_vars="date.utc", + value_vars=["BETR801", "FR04014", "London Westminster"], + value_name="NO_2", + var_name="id_location", + ) no_2.head() The result in the same, but in more detail defined: diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index d6da9a0aa4f22..be4c284912db4 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -155,8 +155,7 @@ index. For example: .. 
ipython:: python - air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], - keys=["PM25", "NO2"]) + air_quality_ = pd.concat([air_quality_pm25, air_quality_no2], keys=["PM25", "NO2"]) .. ipython:: python @@ -233,8 +232,7 @@ Add the station coordinates, provided by the stations metadata table, to the cor .. ipython:: python - air_quality = pd.merge(air_quality, stations_coord, - how='left', on='location') + air_quality = pd.merge(air_quality, stations_coord, how="left", on="location") air_quality.head() Using the :meth:`~pandas.merge` function, for each of the rows in the diff --git a/doc/source/getting_started/intro_tutorials/09_timeseries.rst b/doc/source/getting_started/intro_tutorials/09_timeseries.rst index 19351e0e3bc75..598d3514baa15 100644 --- a/doc/source/getting_started/intro_tutorials/09_timeseries.rst +++ b/doc/source/getting_started/intro_tutorials/09_timeseries.rst @@ -204,10 +204,9 @@ Plot the typical :math:`NO_2` pattern during the day of our time series of all s .. ipython:: python fig, axs = plt.subplots(figsize=(12, 4)) - air_quality.groupby( - air_quality["datetime"].dt.hour)["value"].mean().plot(kind='bar', - rot=0, - ax=axs) + air_quality.groupby(air_quality["datetime"].dt.hour)["value"].mean().plot( + kind='bar', rot=0, ax=axs + ) plt.xlabel("Hour of the day"); # custom x label using matplotlib @savefig 09_bar_chart.png plt.ylabel("$NO_2 (µg/m^3)$"); diff --git a/doc/source/getting_started/intro_tutorials/10_text_data.rst b/doc/source/getting_started/intro_tutorials/10_text_data.rst index 93ad35fb1960b..b7fb99a98d78f 100644 --- a/doc/source/getting_started/intro_tutorials/10_text_data.rst +++ b/doc/source/getting_started/intro_tutorials/10_text_data.rst @@ -224,8 +224,7 @@ In the "Sex" column, replace values of "male" by "M" and values of "female" by " .. 
ipython:: python - titanic["Sex_short"] = titanic["Sex"].replace({"male": "M", - "female": "F"}) + titanic["Sex_short"] = titanic["Sex"].replace({"male": "M", "female": "F"}) titanic["Sex_short"] Whereas :meth:`~Series.replace` is not a string method, it provides a convenient way diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index c3746cbe777a3..673f8689736f1 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -43,12 +43,16 @@ Creating a :class:`DataFrame` by passing a dict of objects that can be converted .. ipython:: python - df2 = pd.DataFrame({'A': 1., - 'B': pd.Timestamp('20130102'), - 'C': pd.Series(1, index=list(range(4)), dtype='float32'), - 'D': np.array([3] * 4, dtype='int32'), - 'E': pd.Categorical(["test", "train", "test", "train"]), - 'F': 'foo'}) + df2 = pd.DataFrame( + { + "A": 1.0, + "B": pd.Timestamp("20130102"), + "C": pd.Series(1, index=list(range(4)), dtype="float32"), + "D": np.array([3] * 4, dtype="int32"), + "E": pd.Categorical(["test", "train", "test", "train"]), + "F": "foo", + } + ) df2 The columns of the resulting :class:`DataFrame` have different @@ -512,12 +516,14 @@ See the :ref:`Grouping section `. .. ipython:: python - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) df Grouping and then applying the :meth:`~pandas.core.groupby.GroupBy.sum` function to the resulting @@ -545,10 +551,14 @@ Stack .. 
ipython:: python - tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', - 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', - 'one', 'two', 'one', 'two']])) + tuples = list( + zip( + *[ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + ) + ) index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B']) df2 = df[:4] @@ -578,11 +588,15 @@ See the section on :ref:`Pivot Tables `. .. ipython:: python - df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3, - 'B': ['A', 'B', 'C'] * 4, - 'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, - 'D': np.random.randn(12), - 'E': np.random.randn(12)}) + df = pd.DataFrame( + { + "A": ["one", "one", "two", "three"] * 3, + "B": ["A", "B", "C"] * 4, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 2, + "D": np.random.randn(12), + "E": np.random.randn(12), + } + ) df We can produce pivot tables from this data very easily: @@ -653,8 +667,10 @@ pandas can include categorical data in a :class:`DataFrame`. For full docs, see .. ipython:: python - df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], - "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']}) + df = pd.DataFrame( + {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) + Convert the raw grades to a categorical data type. @@ -674,8 +690,9 @@ Reorder the categories and simultaneously add the missing categories (methods un .. ipython:: python - df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", - "good", "very good"]) + df["grade"] = df["grade"].cat.set_categories( + ["very bad", "bad", "medium", "good", "very good"] + ) df["grade"] Sorting is per order in the categories, not lexical order. @@ -705,8 +722,7 @@ We use the standard convention for referencing the matplotlib API: .. 
ipython:: python - ts = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -717,8 +733,10 @@ of the columns with labels: .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, - columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame( + np.random.randn(1000, 4), index=ts.index, columns=["A", "B", "C", "D"] + ) + df = df.cumsum() plt.figure() diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 35e0e0fb86472..62e35cb994faf 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -303,14 +303,17 @@ The method requires a ``MultiIndex`` with two or more levels. .. ipython:: python s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan]) - s.index = pd.MultiIndex.from_tuples([(1, 2, 'a', 0), - (1, 2, 'a', 1), - (1, 1, 'b', 0), - (1, 1, 'b', 1), - (2, 1, 'b', 0), - (2, 1, 'b', 1)], - names=['A', 'B', 'C', 'D']) - s + s.index = pd.MultiIndex.from_tuples( + [ + (1, 2, "a", 0), + (1, 2, "a", 1), + (1, 1, "b", 0), + (1, 1, "b", 1), + (2, 1, "b", 0), + (2, 1, "b", 1), + ], + names=["A", "B", "C", "D"], + ) ss = s.astype('Sparse') ss @@ -318,9 +321,10 @@ In the example below, we transform the ``Series`` to a sparse representation of .. ipython:: python - A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B'], - column_levels=['C', 'D'], - sort_labels=True) + A, rows, columns = ss.sparse.to_coo( + row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True + ) + A A.todense() @@ -331,9 +335,9 @@ Specifying different row and column labels (and not sorting them) yields a diffe .. 
ipython:: python - A, rows, columns = ss.sparse.to_coo(row_levels=['A', 'B', 'C'], - column_levels=['D'], - sort_labels=False) + A, rows, columns = ss.sparse.to_coo( + row_levels=["A", "B", "C"], column_levels=["D"], sort_labels=False + ) A A.todense() @@ -345,8 +349,7 @@ A convenience method :meth:`Series.sparse.from_coo` is implemented for creating .. ipython:: python from scipy import sparse - A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), - shape=(3, 4)) + A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)) A A.todense() From a504e47e00dbc88a9af139e712c426a2f2db1e6b Mon Sep 17 00:00:00 2001 From: Iqrar Agalosi Nureyza Date: Sat, 3 Oct 2020 00:34:26 +0700 Subject: [PATCH 0970/1025] DOC: Fix PR09 errors in several files (#36763) --- pandas/core/computation/eval.py | 3 ++- pandas/core/flags.py | 2 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 8 ++++---- pandas/core/groupby/groupby.py | 4 ++-- pandas/core/series.py | 2 +- pandas/io/json/_json.py | 2 +- pandas/io/pickle.py | 4 ++-- pandas/io/pytables.py | 4 ++-- 9 files changed, 16 insertions(+), 15 deletions(-) diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py index 630606b4d8111..913f135b449f3 100644 --- a/pandas/core/computation/eval.py +++ b/pandas/core/computation/eval.py @@ -212,7 +212,8 @@ def eval( truediv : bool, optional Whether to use true division, like in Python >= 3. - deprecated:: 1.0.0 + + .. deprecated:: 1.0.0 local_dict : dict or None, optional A dictionary of local variables, taken from locals() by default. diff --git a/pandas/core/flags.py b/pandas/core/flags.py index 15966d8ddce2a..6a09bfa3bd082 100644 --- a/pandas/core/flags.py +++ b/pandas/core/flags.py @@ -10,7 +10,7 @@ class Flags: Parameters ---------- obj : Series or DataFrame - The object these flags are associated with + The object these flags are associated with. 
allows_duplicate_labels : bool, default True Whether to allow duplicate labels in this object. By default, duplicate labels are permitted. Setting this to ``False`` will diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9b2540a1ce043..75fdeb122a074 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2337,7 +2337,7 @@ def to_parquet( be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6f0aa70625c1d..04e1fc91c5fd4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2253,7 +2253,7 @@ def to_json( be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -2777,7 +2777,7 @@ def to_pickle( be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -3286,7 +3286,7 @@ def to_csv( be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -9195,7 +9195,7 @@ def shift( extend the index when shifting and preserve the original data. 
If `freq` is specified as "infer" then it will be inferred from the freq or inferred_freq attributes of the index. If neither of - those attributes exist, a ValueError is thrown + those attributes exist, a ValueError is thrown. axis : {{0 or 'index', 1 or 'columns', None}}, default None Shift direction. fill_value : object, optional diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f1a61f433fc51..887f50f8dbcd5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -285,7 +285,7 @@ class providing the base-class of operations. .. versionchanged:: 1.1.0 *args - Positional arguments to pass to func + Positional arguments to pass to func. engine : str, default None * ``'cython'`` : Runs the function through C-extensions from cython. * ``'numba'`` : Runs the function through JIT compiled code from numba. @@ -394,7 +394,7 @@ class providing the base-class of operations. .. versionchanged:: 1.1.0 *args - Positional arguments to pass to func + Positional arguments to pass to func. engine : str, default None * ``'cython'`` : Runs the function through C-extensions from cython. * ``'numba'`` : Runs the function through JIT compiled code from numba. diff --git a/pandas/core/series.py b/pandas/core/series.py index fca1b6b08b434..2b972d33d7cdd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1458,7 +1458,7 @@ def to_markdown( be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 51bcb4acddd7e..ef684469dffbb 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -525,7 +525,7 @@ def read_json( be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". 
An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 655deb5ca3779..80baa6f78ddd7 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -49,7 +49,7 @@ def to_pickle( be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 @@ -146,7 +146,7 @@ def read_pickle( be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error will be raised if providing this argument with a local path or a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values + docs for the set of allowed keys and values. .. versionadded:: 1.2.0 diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d62480baed71e..d0ea327a65a1d 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -615,8 +615,8 @@ def keys(self, include: str = "pandas") -> List[str]: ---------- include : str, default 'pandas' - When kind equals 'pandas' return pandas objects - When kind equals 'native' return native HDF5 Table objects + When kind equals 'pandas' return pandas objects. + When kind equals 'native' return native HDF5 Table objects. .. 
versionadded:: 1.1.0 From 713dfb25959a944cfd1f41e9bead5e3dcea919f7 Mon Sep 17 00:00:00 2001 From: krajatcl <53620269+krajatcl@users.noreply.github.com> Date: Fri, 2 Oct 2020 23:38:58 +0530 Subject: [PATCH 0971/1025] TST: insert 'match' to bare pytest raises in pandas/tests/indexing/test_indexing.py (#36809) Co-authored-by: Rajat Bishnoi --- pandas/tests/indexing/test_indexing.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 7d5fea232817d..fd83f9ab29407 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -31,7 +31,11 @@ def test_setitem_ndarray_1d(self): df["bar"] = np.zeros(10, dtype=complex) # invalid - with pytest.raises(ValueError): + msg = ( + "cannot set using a multi-index selection " + "indexer with a different length than the value" + ) + with pytest.raises(ValueError, match=msg): df.loc[df.index[2:5], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0]) # valid @@ -48,7 +52,8 @@ def test_setitem_ndarray_1d(self): df["foo"] = np.zeros(10, dtype=np.float64) df["bar"] = np.zeros(10, dtype=complex) - with pytest.raises(ValueError): + msg = "Must have equal len keys and value when setting with an iterable" + with pytest.raises(ValueError, match=msg): df[2:5] = np.arange(1, 4) * 1j @pytest.mark.parametrize( @@ -1055,13 +1060,13 @@ def test_1tuple_without_multiindex(): def test_duplicate_index_mistyped_key_raises_keyerror(): # GH#29189 float_index.get_loc(None) should raise KeyError, not TypeError ser = pd.Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0]) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="None"): ser[None] - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="None"): ser.index.get_loc(None) - with pytest.raises(KeyError): + with pytest.raises(KeyError, match="None"): ser.index._engine.get_loc(None) From e225361bfd95c75d20df2d348b2cba42c7f73a75 Mon 
Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Fri, 2 Oct 2020 20:31:09 +0200 Subject: [PATCH 0972/1025] Add test for 32724 (#36789) --- pandas/tests/window/test_rolling.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 9ac4871ad24a1..10527649b728f 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -824,3 +824,14 @@ def test_rolling_axis_1_non_numeric_dtypes(value): result = df.rolling(window=2, min_periods=1, axis=1).sum() expected = pd.DataFrame({"a": [1.0, 2.0]}) tm.assert_frame_equal(result, expected) + + +def test_rolling_on_df_transposed(): + # GH: 32724 + df = pd.DataFrame({"A": [1, None], "B": [4, 5], "C": [7, 8]}) + expected = pd.DataFrame({"A": [1.0, np.nan], "B": [5.0, 5.0], "C": [11.0, 13.0]}) + result = df.rolling(min_periods=1, window=2, axis=1).sum() + tm.assert_frame_equal(result, expected) + + result = df.T.rolling(min_periods=1, window=2).sum().T + tm.assert_frame_equal(result, expected) From 62dd3df9aa038667359ac1e15e50b5bc91e122b2 Mon Sep 17 00:00:00 2001 From: John Karasinski Date: Fri, 2 Oct 2020 12:15:07 -0700 Subject: [PATCH 0973/1025] DOC: use blacken to fix code style in documentation #36777 (#36811) --- doc/source/user_guide/cookbook.rst | 582 +++++++++++++++++------------ 1 file changed, 350 insertions(+), 232 deletions(-) diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index e33e85d3d2224..0a30d865f3c23 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -33,9 +33,9 @@ These are some neat pandas ``idioms`` .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df if-then... @@ -45,30 +45,30 @@ An if-then on one column .. 
ipython:: python - df.loc[df.AAA >= 5, 'BBB'] = -1 + df.loc[df.AAA >= 5, "BBB"] = -1 df An if-then with assignment to 2 columns: .. ipython:: python - df.loc[df.AAA >= 5, ['BBB', 'CCC']] = 555 + df.loc[df.AAA >= 5, ["BBB", "CCC"]] = 555 df Add another line with different logic, to do the -else .. ipython:: python - df.loc[df.AAA < 5, ['BBB', 'CCC']] = 2000 + df.loc[df.AAA < 5, ["BBB", "CCC"]] = 2000 df Or use pandas where after you've set up a mask .. ipython:: python - df_mask = pd.DataFrame({'AAA': [True] * 4, - 'BBB': [False] * 4, - 'CCC': [True, False] * 2}) + df_mask = pd.DataFrame( + {"AAA": [True] * 4, "BBB": [False] * 4, "CCC": [True, False] * 2} + ) df.where(df_mask, -1000) `if-then-else using numpy's where() @@ -76,11 +76,11 @@ Or use pandas where after you've set up a mask .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df - df['logic'] = np.where(df['AAA'] > 5, 'high', 'low') + df["logic"] = np.where(df["AAA"] > 5, "high", "low") df Splitting @@ -91,9 +91,9 @@ Splitting .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df df[df.AAA <= 5] @@ -107,28 +107,28 @@ Building criteria .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df ...and (without assignment returns a Series) .. ipython:: python - df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA'] + df.loc[(df["BBB"] < 25) & (df["CCC"] >= -40), "AAA"] ...or (without assignment returns a Series) .. 
ipython:: python - df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA'] + df.loc[(df["BBB"] > 25) | (df["CCC"] >= -40), "AAA"] ...or (with assignment modifies the DataFrame.) .. ipython:: python - df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1 + df.loc[(df["BBB"] > 25) | (df["CCC"] >= 75), "AAA"] = 0.1 df `Select rows with data closest to certain value using argsort @@ -136,9 +136,9 @@ Building criteria .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df aValue = 43.0 df.loc[(df.CCC - aValue).abs().argsort()] @@ -148,9 +148,9 @@ Building criteria .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df Crit1 = df.AAA <= 5.5 @@ -189,9 +189,9 @@ The :ref:`indexing ` docs. .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df df[(df.AAA <= 6) & (df.index.isin([0, 2, 4]))] @@ -201,10 +201,10 @@ The :ref:`indexing ` docs. .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}, - index=['foo', 'bar', 'boo', 'kar']) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]}, + index=["foo", "bar", "boo", "kar"], + ) There are 2 explicit slicing methods, with a third general case @@ -216,19 +216,17 @@ There are 2 explicit slicing methods, with a third general case .. 
ipython:: python df.iloc[0:3] # Positional - df.loc['bar':'kar'] # Label + df.loc["bar":"kar"] # Label # Generic df[0:3] - df['bar':'kar'] + df["bar":"kar"] Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. .. ipython:: python - data = {'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]} + data = {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} df2 = pd.DataFrame(data=data, index=[1, 2, 3, 4]) # Note index starts at 1. df2.iloc[1:3] # Position-oriented df2.loc[1:3] # Label-oriented @@ -238,9 +236,9 @@ Ambiguity arises when an index consists of integers with a non-zero start or non .. ipython:: python - df = pd.DataFrame({'AAA': [4, 5, 6, 7], - 'BBB': [10, 20, 30, 40], - 'CCC': [100, 50, -30, -50]}) + df = pd.DataFrame( + {"AAA": [4, 5, 6, 7], "BBB": [10, 20, 30, 40], "CCC": [100, 50, -30, -50]} + ) df df[~((df.AAA <= 6) & (df.index.isin([0, 2, 4])))] @@ -253,14 +251,12 @@ New columns .. ipython:: python - df = pd.DataFrame({'AAA': [1, 2, 1, 3], - 'BBB': [1, 1, 2, 2], - 'CCC': [2, 1, 3, 1]}) + df = pd.DataFrame({"AAA": [1, 2, 1, 3], "BBB": [1, 1, 2, 2], "CCC": [2, 1, 3, 1]}) df - source_cols = df.columns # Or some subset would work too + source_cols = df.columns # Or some subset would work too new_cols = [str(x) + "_cat" for x in source_cols] - categories = {1: 'Alpha', 2: 'Beta', 3: 'Charlie'} + categories = {1: "Alpha", 2: "Beta", 3: "Charlie"} df[new_cols] = df[source_cols].applymap(categories.get) df @@ -270,8 +266,7 @@ New columns .. ipython:: python - df = pd.DataFrame({'AAA': [1, 1, 1, 2, 2, 2, 3, 3], - 'BBB': [2, 1, 3, 4, 5, 1, 2, 3]}) + df = pd.DataFrame({"AAA": [1, 1, 1, 2, 2, 2, 3, 3], "BBB": [2, 1, 3, 4, 5, 1, 2, 3]}) df Method 1 : idxmin() to get the index of the minimums @@ -300,25 +295,28 @@ The :ref:`multindexing ` docs. .. 
ipython:: python - df = pd.DataFrame({'row': [0, 1, 2], - 'One_X': [1.1, 1.1, 1.1], - 'One_Y': [1.2, 1.2, 1.2], - 'Two_X': [1.11, 1.11, 1.11], - 'Two_Y': [1.22, 1.22, 1.22]}) + df = pd.DataFrame( + { + "row": [0, 1, 2], + "One_X": [1.1, 1.1, 1.1], + "One_Y": [1.2, 1.2, 1.2], + "Two_X": [1.11, 1.11, 1.11], + "Two_Y": [1.22, 1.22, 1.22], + } + ) df # As Labelled Index - df = df.set_index('row') + df = df.set_index("row") df # With Hierarchical Columns - df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) - for c in df.columns]) + df.columns = pd.MultiIndex.from_tuples([tuple(c.split("_")) for c in df.columns]) df # Now stack & Reset df = df.stack(0).reset_index(1) df # And fix the labels (Notice the label 'level_1' got added automatically) - df.columns = ['Sample', 'All_X', 'All_Y'] + df.columns = ["Sample", "All_X", "All_Y"] df Arithmetic @@ -329,11 +327,10 @@ Arithmetic .. ipython:: python - cols = pd.MultiIndex.from_tuples([(x, y) for x in ['A', 'B', 'C'] - for y in ['O', 'I']]) - df = pd.DataFrame(np.random.randn(2, 6), index=['n', 'm'], columns=cols) + cols = pd.MultiIndex.from_tuples([(x, y) for x in ["A", "B", "C"] for y in ["O", "I"]]) + df = pd.DataFrame(np.random.randn(2, 6), index=["n", "m"], columns=cols) df - df = df.div(df['C'], level=1) + df = df.div(df["C"], level=1) df Slicing @@ -344,10 +341,9 @@ Slicing .. ipython:: python - coords = [('AA', 'one'), ('AA', 'six'), ('BB', 'one'), ('BB', 'two'), - ('BB', 'six')] + coords = [("AA", "one"), ("AA", "six"), ("BB", "one"), ("BB", "two"), ("BB", "six")] index = pd.MultiIndex.from_tuples(coords) - df = pd.DataFrame([11, 22, 33, 44, 55], index, ['MyData']) + df = pd.DataFrame([11, 22, 33, 44, 55], index, ["MyData"]) df To take the cross section of the 1st level and 1st axis the index: @@ -355,13 +351,13 @@ To take the cross section of the 1st level and 1st axis the index: .. 
ipython:: python # Note : level and axis are optional, and default to zero - df.xs('BB', level=0, axis=0) + df.xs("BB", level=0, axis=0) ...and now the 2nd level of the 1st axis. .. ipython:: python - df.xs('six', level=1, axis=0) + df.xs("six", level=1, axis=0) `Slicing a MultiIndex with xs, method #2 `__ @@ -370,21 +366,20 @@ To take the cross section of the 1st level and 1st axis the index: import itertools - index = list(itertools.product(['Ada', 'Quinn', 'Violet'], - ['Comp', 'Math', 'Sci'])) - headr = list(itertools.product(['Exams', 'Labs'], ['I', 'II'])) - indx = pd.MultiIndex.from_tuples(index, names=['Student', 'Course']) - cols = pd.MultiIndex.from_tuples(headr) # Notice these are un-named + index = list(itertools.product(["Ada", "Quinn", "Violet"], ["Comp", "Math", "Sci"])) + headr = list(itertools.product(["Exams", "Labs"], ["I", "II"])) + indx = pd.MultiIndex.from_tuples(index, names=["Student", "Course"]) + cols = pd.MultiIndex.from_tuples(headr) # Notice these are un-named data = [[70 + x + y + (x * y) % 3 for x in range(4)] for y in range(9)] df = pd.DataFrame(data, indx, cols) df All = slice(None) - df.loc['Violet'] - df.loc[(All, 'Math'), All] - df.loc[(slice('Ada', 'Quinn'), 'Math'), All] - df.loc[(All, 'Math'), ('Exams')] - df.loc[(All, 'Math'), (All, 'II')] + df.loc["Violet"] + df.loc[(All, "Math"), All] + df.loc[(slice("Ada", "Quinn"), "Math"), All] + df.loc[(All, "Math"), ("Exams")] + df.loc[(All, "Math"), (All, "II")] `Setting portions of a MultiIndex with xs `__ @@ -397,7 +392,7 @@ Sorting .. ipython:: python - df.sort_values(by=('Labs', 'II'), ascending=False) + df.sort_values(by=("Labs", "II"), ascending=False) `Partial selection, the need for sortedness; `__ @@ -422,10 +417,12 @@ Fill forward a reversed timeseries .. 
ipython:: python - df = pd.DataFrame(np.random.randn(6, 1), - index=pd.date_range('2013-08-01', periods=6, freq='B'), - columns=list('A')) - df.loc[df.index[3], 'A'] = np.nan + df = pd.DataFrame( + np.random.randn(6, 1), + index=pd.date_range("2013-08-01", periods=6, freq="B"), + columns=list("A"), + ) + df.loc[df.index[3], "A"] = np.nan df df.reindex(df.index[::-1]).ffill() @@ -452,22 +449,26 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(), - 'size': list('SSMMMLL'), - 'weight': [8, 10, 11, 1, 20, 12, 12], - 'adult': [False] * 5 + [True] * 2}) + df = pd.DataFrame( + { + "animal": "cat dog cat fish dog cat cat".split(), + "size": list("SSMMMLL"), + "weight": [8, 10, 11, 1, 20, 12, 12], + "adult": [False] * 5 + [True] * 2, + } + ) df # List the size of the animals with the highest weight. - df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()]) + df.groupby("animal").apply(lambda subf: subf["size"][subf["weight"].idxmax()]) `Using get_group `__ .. ipython:: python - gb = df.groupby(['animal']) - gb.get_group('cat') + gb = df.groupby(["animal"]) + gb.get_group("cat") `Apply to different items in a group `__ @@ -475,12 +476,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python def GrowUp(x): - avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) - avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) - avg_weight += sum(x[x['size'] == 'L'].weight) + avg_weight = sum(x[x["size"] == "S"].weight * 1.5) + avg_weight += sum(x[x["size"] == "M"].weight * 1.25) + avg_weight += sum(x[x["size"] == "L"].weight) avg_weight /= len(x) - return pd.Series(['L', avg_weight, True], - index=['size', 'weight', 'adult']) + return pd.Series(["L", avg_weight, True], index=["size", "weight", "adult"]) + expected_df = gb.apply(GrowUp) expected_df @@ -492,12 +493,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to S = pd.Series([i / 100.0 for i in range(1, 11)]) + def cum_ret(x, y): return x * (1 + y) + def red(x): return functools.reduce(cum_ret, x, 1.0) + S.expanding().apply(red, raw=True) @@ -506,13 +510,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, -1, 1, 2]}) - gb = df.groupby('A') + df = pd.DataFrame({"A": [1, 1, 2, 2], "B": [1, -1, 1, 2]}) + gb = df.groupby("A") + def replace(g): mask = g < 0 return g.where(mask, g[~mask].mean()) + gb.transform(replace) `Sort groups by aggregated data @@ -520,13 +526,17 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python - df = pd.DataFrame({'code': ['foo', 'bar', 'baz'] * 2, - 'data': [0.16, -0.21, 0.33, 0.45, -0.59, 0.62], - 'flag': [False, True] * 3}) + df = pd.DataFrame( + { + "code": ["foo", "bar", "baz"] * 2, + "data": [0.16, -0.21, 0.33, 0.45, -0.59, 0.62], + "flag": [False, True] * 3, + } + ) - code_groups = df.groupby('code') + code_groups = df.groupby("code") - agg_n_sort_order = code_groups[['data']].transform(sum).sort_values(by='data') + agg_n_sort_order = code_groups[["data"]].transform(sum).sort_values(by="data") sorted_df = df.loc[agg_n_sort_order.index] @@ -537,15 +547,17 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - rng = pd.date_range(start="2014-10-07", periods=10, freq='2min') + rng = pd.date_range(start="2014-10-07", periods=10, freq="2min") ts = pd.Series(data=list(range(10)), index=rng) + def MyCust(x): if len(x) > 2: return x[1] * 1.234 return pd.NaT - mhc = {'Mean': np.mean, 'Max': np.max, 'Custom': MyCust} + + mhc = {"Mean": np.mean, "Max": np.max, "Custom": MyCust} ts.resample("5min").apply(mhc) ts @@ -554,10 +566,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'Color': 'Red Red Red Blue'.split(), - 'Value': [100, 150, 50, 50]}) + df = pd.DataFrame({"Color": "Red Red Red Blue".split(), "Value": [100, 150, 50, 50]}) df - df['Counts'] = df.groupby(['Color']).transform(len) + df["Counts"] = df.groupby(["Color"]).transform(len) df `Shift groups of the values in a column based on the index @@ -565,13 +576,19 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python - df = pd.DataFrame({'line_race': [10, 10, 8, 10, 10, 8], - 'beyer': [99, 102, 103, 103, 88, 100]}, - index=['Last Gunfighter', 'Last Gunfighter', - 'Last Gunfighter', 'Paynter', 'Paynter', - 'Paynter']) + df = pd.DataFrame( + {"line_race": [10, 10, 8, 10, 10, 8], "beyer": [99, 102, 103, 103, 88, 100]}, + index=[ + "Last Gunfighter", + "Last Gunfighter", + "Last Gunfighter", + "Paynter", + "Paynter", + "Paynter", + ], + ) df - df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1) + df["beyer_shifted"] = df.groupby(level=0)["beyer"].shift(1) df `Select row with maximum value from each group @@ -579,11 +596,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'host': ['other', 'other', 'that', 'this', 'this'], - 'service': ['mail', 'web', 'mail', 'mail', 'web'], - 'no': [1, 2, 1, 2, 1]}).set_index(['host', 'service']) - mask = df.groupby(level=0).agg('idxmax') - df_count = df.loc[mask['no']].reset_index() + df = pd.DataFrame( + { + "host": ["other", "other", "that", "this", "this"], + "service": ["mail", "web", "mail", "mail", "web"], + "no": [1, 2, 1, 2, 1], + } + ).set_index(["host", "service"]) + mask = df.groupby(level=0).agg("idxmax") + df_count = df.loc[mask["no"]].reset_index() df_count `Grouping like Python's itertools.groupby @@ -591,9 +612,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=['A']) - df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).groups - df['A'].groupby((df['A'] != df['A'].shift()).cumsum()).cumsum() + df = pd.DataFrame([0, 1, 0, 1, 1, 1, 0, 1, 1], columns=["A"]) + df["A"].groupby((df["A"] != df["A"].shift()).cumsum()).groups + df["A"].groupby((df["A"] != df["A"].shift()).cumsum()).cumsum() Expanding data ************** @@ -617,12 +638,20 @@ Create a list of dataframes, split using a delineation based on logic included i .. 
ipython:: python - df = pd.DataFrame(data={'Case': ['A', 'A', 'A', 'B', 'A', 'A', 'B', 'A', - 'A'], - 'Data': np.random.randn(9)}) + df = pd.DataFrame( + data={ + "Case": ["A", "A", "A", "B", "A", "A", "B", "A", "A"], + "Data": np.random.randn(9), + } + ) - dfs = list(zip(*df.groupby((1 * (df['Case'] == 'B')).cumsum() - .rolling(window=3, min_periods=1).median())))[-1] + dfs = list( + zip( + *df.groupby( + (1 * (df["Case"] == "B")).cumsum().rolling(window=3, min_periods=1).median() + ) + ) + )[-1] dfs[0] dfs[1] @@ -639,14 +668,30 @@ The :ref:`Pivot ` docs. .. ipython:: python - df = pd.DataFrame(data={'Province': ['ON', 'QC', 'BC', 'AL', 'AL', 'MN', 'ON'], - 'City': ['Toronto', 'Montreal', 'Vancouver', - 'Calgary', 'Edmonton', 'Winnipeg', - 'Windsor'], - 'Sales': [13, 6, 16, 8, 4, 3, 1]}) - table = pd.pivot_table(df, values=['Sales'], index=['Province'], - columns=['City'], aggfunc=np.sum, margins=True) - table.stack('City') + df = pd.DataFrame( + data={ + "Province": ["ON", "QC", "BC", "AL", "AL", "MN", "ON"], + "City": [ + "Toronto", + "Montreal", + "Vancouver", + "Calgary", + "Edmonton", + "Winnipeg", + "Windsor", + ], + "Sales": [13, 6, 16, 8, 4, 3, 1], + } + ) + table = pd.pivot_table( + df, + values=["Sales"], + index=["Province"], + columns=["City"], + aggfunc=np.sum, + margins=True, + ) + table.stack("City") `Frequency table like plyr in R `__ @@ -654,25 +699,60 @@ The :ref:`Pivot ` docs. .. 
ipython:: python grades = [48, 99, 75, 80, 42, 80, 72, 68, 36, 78] - df = pd.DataFrame({'ID': ["x%d" % r for r in range(10)], - 'Gender': ['F', 'M', 'F', 'M', 'F', - 'M', 'F', 'M', 'M', 'M'], - 'ExamYear': ['2007', '2007', '2007', '2008', '2008', - '2008', '2008', '2009', '2009', '2009'], - 'Class': ['algebra', 'stats', 'bio', 'algebra', - 'algebra', 'stats', 'stats', 'algebra', - 'bio', 'bio'], - 'Participated': ['yes', 'yes', 'yes', 'yes', 'no', - 'yes', 'yes', 'yes', 'yes', 'yes'], - 'Passed': ['yes' if x > 50 else 'no' for x in grades], - 'Employed': [True, True, True, False, - False, False, False, True, True, False], - 'Grade': grades}) - - df.groupby('ExamYear').agg({'Participated': lambda x: x.value_counts()['yes'], - 'Passed': lambda x: sum(x == 'yes'), - 'Employed': lambda x: sum(x), - 'Grade': lambda x: sum(x) / len(x)}) + df = pd.DataFrame( + { + "ID": ["x%d" % r for r in range(10)], + "Gender": ["F", "M", "F", "M", "F", "M", "F", "M", "M", "M"], + "ExamYear": [ + "2007", + "2007", + "2007", + "2008", + "2008", + "2008", + "2008", + "2009", + "2009", + "2009", + ], + "Class": [ + "algebra", + "stats", + "bio", + "algebra", + "algebra", + "stats", + "stats", + "algebra", + "bio", + "bio", + ], + "Participated": [ + "yes", + "yes", + "yes", + "yes", + "no", + "yes", + "yes", + "yes", + "yes", + "yes", + ], + "Passed": ["yes" if x > 50 else "no" for x in grades], + "Employed": [True, True, True, False, False, False, False, True, True, False], + "Grade": grades, + } + ) + + df.groupby("ExamYear").agg( + { + "Participated": lambda x: x.value_counts()["yes"], + "Passed": lambda x: sum(x == "yes"), + "Employed": lambda x: sum(x), + "Grade": lambda x: sum(x) / len(x), + } + ) `Plot pandas DataFrame with year over year data `__ @@ -681,11 +761,14 @@ To create year and month cross tabulation: .. 
ipython:: python - df = pd.DataFrame({'value': np.random.randn(36)}, - index=pd.date_range('2011-01-01', freq='M', periods=36)) + df = pd.DataFrame( + {"value": np.random.randn(36)}, + index=pd.date_range("2011-01-01", freq="M", periods=36), + ) - pd.pivot_table(df, index=df.index.month, columns=df.index.year, - values='value', aggfunc='sum') + pd.pivot_table( + df, index=df.index.month, columns=df.index.year, values="value", aggfunc="sum" + ) Apply ***** @@ -695,15 +778,20 @@ Apply .. ipython:: python - df = pd.DataFrame(data={'A': [[2, 4, 8, 16], [100, 200], [10, 20, 30]], - 'B': [['a', 'b', 'c'], ['jj', 'kk'], ['ccc']]}, - index=['I', 'II', 'III']) + df = pd.DataFrame( + data={ + "A": [[2, 4, 8, 16], [100, 200], [10, 20, 30]], + "B": [["a", "b", "c"], ["jj", "kk"], ["ccc"]], + }, + index=["I", "II", "III"], + ) + def SeriesFromSubList(aList): return pd.Series(aList) - df_orgz = pd.concat({ind: row.apply(SeriesFromSubList) - for ind, row in df.iterrows()}) + + df_orgz = pd.concat({ind: row.apply(SeriesFromSubList) for ind, row in df.iterrows()}) df_orgz `Rolling apply with a DataFrame returning a Series @@ -713,17 +801,25 @@ Rolling Apply to multiple columns where function calculates a Series before a Sc .. 
ipython:: python - df = pd.DataFrame(data=np.random.randn(2000, 2) / 10000, - index=pd.date_range('2001-01-01', periods=2000), - columns=['A', 'B']) + df = pd.DataFrame( + data=np.random.randn(2000, 2) / 10000, + index=pd.date_range("2001-01-01", periods=2000), + columns=["A", "B"], + ) df + def gm(df, const): - v = ((((df['A'] + df['B']) + 1).cumprod()) - 1) * const + v = ((((df["A"] + df["B"]) + 1).cumprod()) - 1) * const return v.iloc[-1] - s = pd.Series({df.index[i]: gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) - for i in range(len(df) - 50)}) + + s = pd.Series( + { + df.index[i]: gm(df.iloc[i: min(i + 51, len(df) - 1)], 5) + for i in range(len(df) - 50) + } + ) s `Rolling apply with a DataFrame returning a Scalar @@ -733,20 +829,29 @@ Rolling Apply to multiple columns where function returns a Scalar (Volume Weight .. ipython:: python - rng = pd.date_range(start='2014-01-01', periods=100) - df = pd.DataFrame({'Open': np.random.randn(len(rng)), - 'Close': np.random.randn(len(rng)), - 'Volume': np.random.randint(100, 2000, len(rng))}, - index=rng) + rng = pd.date_range(start="2014-01-01", periods=100) + df = pd.DataFrame( + { + "Open": np.random.randn(len(rng)), + "Close": np.random.randn(len(rng)), + "Volume": np.random.randint(100, 2000, len(rng)), + }, + index=rng, + ) df + def vwap(bars): - return ((bars.Close * bars.Volume).sum() / bars.Volume.sum()) + return (bars.Close * bars.Volume).sum() / bars.Volume.sum() + window = 5 - s = pd.concat([(pd.Series(vwap(df.iloc[i:i + window]), - index=[df.index[i + window]])) - for i in range(len(df) - window)]) + s = pd.concat( + [ + (pd.Series(vwap(df.iloc[i: i + window]), index=[df.index[i + window]])) + for i in range(len(df) - window) + ] + ) s.round(2) Timeseries @@ -778,8 +883,8 @@ Calculate the first day of the month for each entry in a DatetimeIndex .. 
ipython:: python - dates = pd.date_range('2000-01-01', periods=5) - dates.to_period(freq='M').to_timestamp() + dates = pd.date_range("2000-01-01", periods=5) + dates.to_period(freq="M").to_timestamp() .. _cookbook.resample: @@ -825,8 +930,8 @@ The :ref:`Concat ` docs. The :ref:`Join ` d .. ipython:: python - rng = pd.date_range('2000-01-01', periods=6) - df1 = pd.DataFrame(np.random.randn(6, 3), index=rng, columns=['A', 'B', 'C']) + rng = pd.date_range("2000-01-01", periods=6) + df1 = pd.DataFrame(np.random.randn(6, 3), index=rng, columns=["A", "B", "C"]) df2 = df1.copy() Depending on df construction, ``ignore_index`` may be needed @@ -841,17 +946,25 @@ Depending on df construction, ``ignore_index`` may be needed .. ipython:: python - df = pd.DataFrame(data={'Area': ['A'] * 5 + ['C'] * 2, - 'Bins': [110] * 2 + [160] * 3 + [40] * 2, - 'Test_0': [0, 1, 0, 1, 2, 0, 1], - 'Data': np.random.randn(7)}) + df = pd.DataFrame( + data={ + "Area": ["A"] * 5 + ["C"] * 2, + "Bins": [110] * 2 + [160] * 3 + [40] * 2, + "Test_0": [0, 1, 0, 1, 2, 0, 1], + "Data": np.random.randn(7), + } + ) df - df['Test_1'] = df['Test_0'] - 1 + df["Test_1"] = df["Test_0"] - 1 - pd.merge(df, df, left_on=['Bins', 'Area', 'Test_0'], - right_on=['Bins', 'Area', 'Test_1'], - suffixes=('_L', '_R')) + pd.merge( + df, + df, + left_on=["Bins", "Area", "Test_0"], + right_on=["Bins", "Area", "Test_1"], + suffixes=("_L", "_R"), + ) `How to set the index and join `__ @@ -902,16 +1015,18 @@ The :ref:`Plotting ` docs. .. 
ipython:: python df = pd.DataFrame( - {'stratifying_var': np.random.uniform(0, 100, 20), - 'price': np.random.normal(100, 5, 20)}) + { + "stratifying_var": np.random.uniform(0, 100, 20), + "price": np.random.normal(100, 5, 20), + } + ) - df['quartiles'] = pd.qcut( - df['stratifying_var'], - 4, - labels=['0-25%', '25-50%', '50-75%', '75-100%']) + df["quartiles"] = pd.qcut( + df["stratifying_var"], 4, labels=["0-25%", "25-50%", "50-75%", "75-100%"] + ) @savefig quartile_boxplot.png - df.boxplot(column='price', by='quartiles') + df.boxplot(column="price", by="quartiles") Data in/out ----------- @@ -973,9 +1088,9 @@ of the individual frames into a list, and then combine the frames in the list us for i in range(3): data = pd.DataFrame(np.random.randn(10, 4)) - data.to_csv('file_{}.csv'.format(i)) + data.to_csv("file_{}.csv".format(i)) - files = ['file_0.csv', 'file_1.csv', 'file_2.csv'] + files = ["file_0.csv", "file_1.csv", "file_2.csv"] result = pd.concat([pd.read_csv(f) for f in files], ignore_index=True) You can use the same approach to read all files matching a pattern. Here is an example using ``glob``: @@ -985,7 +1100,7 @@ You can use the same approach to read all files matching a pattern. Here is an import glob import os - files = glob.glob('file_*.csv') + files = glob.glob("file_*.csv") result = pd.concat([pd.read_csv(f) for f in files], ignore_index=True) Finally, this strategy will work with the other ``pd.read_*(...)`` functions described in the :ref:`io docs`. @@ -994,7 +1109,7 @@ Finally, this strategy will work with the other ``pd.read_*(...)`` functions des :suppress: for i in range(3): - os.remove('file_{}.csv'.format(i)) + os.remove("file_{}.csv".format(i)) Parsing date components in multi-columns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1003,12 +1118,12 @@ Parsing date components in multi-columns is faster with a format .. 
ipython:: python - i = pd.date_range('20000101', periods=10000) - df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) + i = pd.date_range("20000101", periods=10000) + df = pd.DataFrame({"year": i.year, "month": i.month, "day": i.day}) df.head() + %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d') - ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], - x['month'], x['day']), axis=1) + ds = df.apply(lambda x: "%04d%02d%02d" % (x["year"], x["month"], x["day"]), axis=1) ds.head() %timeit pd.to_datetime(ds) @@ -1046,18 +1161,20 @@ Option 1: pass rows explicitly to skip rows from io import StringIO - pd.read_csv(StringIO(data), sep=';', skiprows=[11, 12], - index_col=0, parse_dates=True, header=10) + pd.read_csv( + StringIO(data), sep=";", skiprows=[11, 12], index_col=0, parse_dates=True, header=10 + ) Option 2: read column names and then data """"""""""""""""""""""""""""""""""""""""" .. ipython:: python - pd.read_csv(StringIO(data), sep=';', header=10, nrows=10).columns - columns = pd.read_csv(StringIO(data), sep=';', header=10, nrows=10).columns - pd.read_csv(StringIO(data), sep=';', index_col=0, - header=12, parse_dates=True, names=columns) + pd.read_csv(StringIO(data), sep=";", header=10, nrows=10).columns + columns = pd.read_csv(StringIO(data), sep=";", header=10, nrows=10).columns + pd.read_csv( + StringIO(data), sep=";", index_col=0, header=12, parse_dates=True, names=columns + ) .. _cookbook.sql: @@ -1153,18 +1270,18 @@ Storing Attributes to a group node .. ipython:: python df = pd.DataFrame(np.random.randn(8, 3)) - store = pd.HDFStore('test.h5') - store.put('df', df) + store = pd.HDFStore("test.h5") + store.put("df", df) # you can store an arbitrary Python object via pickle - store.get_storer('df').attrs.my_attribute = {'A': 10} - store.get_storer('df').attrs.my_attribute + store.get_storer("df").attrs.my_attribute = {"A": 10} + store.get_storer("df").attrs.my_attribute .. 
ipython:: python :suppress: store.close() - os.remove('test.h5') + os.remove("test.h5") You can create or load a HDFStore in-memory by passing the ``driver`` parameter to PyTables. Changes are only written to disk when the HDFStore @@ -1172,10 +1289,10 @@ is closed. .. ipython:: python - store = pd.HDFStore('test.h5', 'w', diver='H5FD_CORE') + store = pd.HDFStore("test.h5", "w", diver="H5FD_CORE") df = pd.DataFrame(np.random.randn(8, 3)) - store['test'] = df + store["test"] = df # only after closing the store, data is written to disk: store.close() @@ -1183,7 +1300,7 @@ is closed. .. ipython:: python :suppress: - os.remove('test.h5') + os.remove("test.h5") .. _cookbook.binary: @@ -1232,15 +1349,14 @@ in the frame: .. code-block:: python - names = 'count', 'avg', 'scale' + names = "count", "avg", "scale" # note that the offsets are larger than the size of the type because of # struct padding offsets = 0, 8, 16 - formats = 'i4', 'f8', 'f4' - dt = np.dtype({'names': names, 'offsets': offsets, 'formats': formats}, - align=True) - df = pd.DataFrame(np.fromfile('binary.dat', dt)) + formats = "i4", "f8", "f4" + dt = np.dtype({"names": names, "offsets": offsets, "formats": formats}, align=True) + df = pd.DataFrame(np.fromfile("binary.dat", dt)) .. note:: @@ -1289,10 +1405,11 @@ The ``method`` argument within ``DataFrame.corr`` can accept a callable in addit A = a - a_bar - a_bar.T + np.full(shape=(n, n), fill_value=a_bar.mean()) B = b - b_bar - b_bar.T + np.full(shape=(n, n), fill_value=b_bar.mean()) cov_ab = np.sqrt(np.nansum(A * B)) / n - std_a = np.sqrt(np.sqrt(np.nansum(A**2)) / n) - std_b = np.sqrt(np.sqrt(np.nansum(B**2)) / n) + std_a = np.sqrt(np.sqrt(np.nansum(A ** 2)) / n) + std_b = np.sqrt(np.sqrt(np.nansum(B ** 2)) / n) return cov_ab / std_a / std_b + df = pd.DataFrame(np.random.normal(size=(100, 3))) df.corr(method=distcorr) @@ -1308,7 +1425,7 @@ The :ref:`Timedeltas ` docs. 
import datetime - s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) + s = pd.Series(pd.date_range("2012-1-1", periods=3, freq="D")) s - s.max() @@ -1329,12 +1446,12 @@ The :ref:`Timedeltas ` docs. deltas = pd.Series([datetime.timedelta(days=i) for i in range(3)]) - df = pd.DataFrame({'A': s, 'B': deltas}) + df = pd.DataFrame({"A": s, "B": deltas}) df - df['New Dates'] = df['A'] + df['B'] + df["New Dates"] = df["A"] + df["B"] - df['Delta'] = df['A'] - df['New Dates'] + df["Delta"] = df["A"] - df["New Dates"] df df.dtypes @@ -1365,7 +1482,8 @@ of the data values: rows = itertools.product(*data_dict.values()) return pd.DataFrame.from_records(rows, columns=data_dict.keys()) - df = expand_grid({'height': [60, 70], - 'weight': [100, 140, 180], - 'sex': ['Male', 'Female']}) + + df = expand_grid( + {"height": [60, 70], "weight": [100, 140, 180], "sex": ["Male", "Female"]} + ) df From 8ba4535489f940cbbf5c3c0469f48135554b6736 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 2 Oct 2020 13:24:06 -0700 Subject: [PATCH 0974/1025] CLN: Use more pytest idioms in test_momemts_ewm.py (#36801) --- .../tests/window/moments/test_moments_ewm.py | 194 +++++++++++++----- 1 file changed, 141 insertions(+), 53 deletions(-) diff --git a/pandas/tests/window/moments/test_moments_ewm.py b/pandas/tests/window/moments/test_moments_ewm.py index a83bfabc4a048..287cd7ebba536 100644 --- a/pandas/tests/window/moments/test_moments_ewm.py +++ b/pandas/tests/window/moments/test_moments_ewm.py @@ -7,21 +7,19 @@ import pandas._testing as tm -def check_ew(name=None, preserve_nan=False, series=None, frame=None, nan_locs=None): +@pytest.mark.parametrize("name", ["var", "vol", "mean"]) +def test_ewma_series(series, name): series_result = getattr(series.ewm(com=10), name)() assert isinstance(series_result, Series) - frame_result = getattr(frame.ewm(com=10), name)() - assert type(frame_result) == DataFrame - - result = getattr(series.ewm(com=10), name)() - if preserve_nan: - assert 
result[nan_locs].isna().all() +@pytest.mark.parametrize("name", ["var", "vol", "mean"]) +def test_ewma_frame(frame, name): + frame_result = getattr(frame.ewm(com=10), name)() + assert isinstance(frame_result, DataFrame) -def test_ewma(series, frame, nan_locs): - check_ew(name="mean", frame=frame, series=series, nan_locs=nan_locs) +def test_ewma_adjust(): vals = pd.Series(np.zeros(1000)) vals[5] = 1 result = vals.ewm(span=100, adjust=False).mean().sum() @@ -53,63 +51,153 @@ def test_ewma_nan_handling(): result = s.ewm(com=5).mean() tm.assert_series_equal(result, Series([np.nan] * 2 + [1.0] * 4)) - # GH 7603 - s0 = Series([np.nan, 1.0, 101.0]) - s1 = Series([1.0, np.nan, 101.0]) - s2 = Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]) - s3 = Series([1.0, np.nan, 101.0, 50.0]) - com = 2.0 - alpha = 1.0 / (1.0 + com) - - def simple_wma(s, w): - return (s.multiply(w).cumsum() / w.cumsum()).fillna(method="ffill") - - for (s, adjust, ignore_na, w) in [ - (s0, True, False, [np.nan, (1.0 - alpha), 1.0]), - (s0, True, True, [np.nan, (1.0 - alpha), 1.0]), - (s0, False, False, [np.nan, (1.0 - alpha), alpha]), - (s0, False, True, [np.nan, (1.0 - alpha), alpha]), - (s1, True, False, [(1.0 - alpha) ** 2, np.nan, 1.0]), - (s1, True, True, [(1.0 - alpha), np.nan, 1.0]), - (s1, False, False, [(1.0 - alpha) ** 2, np.nan, alpha]), - (s1, False, True, [(1.0 - alpha), np.nan, alpha]), - (s2, True, False, [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, 1.0, np.nan]), - (s2, True, True, [np.nan, (1.0 - alpha), np.nan, np.nan, 1.0, np.nan]), + +@pytest.mark.parametrize( + "s, adjust, ignore_na, w", + [ + ( + Series([np.nan, 1.0, 101.0]), + True, + False, + [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), 1.0], + ), + ( + Series([np.nan, 1.0, 101.0]), + True, + True, + [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), 1.0], + ), + ( + Series([np.nan, 1.0, 101.0]), + False, + False, + [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), (1.0 / (1.0 + 2.0))], + ), + ( + Series([np.nan, 1.0, 101.0]), + False, + True, + [np.nan, 
(1.0 - (1.0 / (1.0 + 2.0))), (1.0 / (1.0 + 2.0))], + ), + ( + Series([1.0, np.nan, 101.0]), + True, + False, + [(1.0 - (1.0 / (1.0 + 2.0))) ** 2, np.nan, 1.0], + ), ( - s2, + Series([1.0, np.nan, 101.0]), + True, + True, + [(1.0 - (1.0 / (1.0 + 2.0))), np.nan, 1.0], + ), + ( + Series([1.0, np.nan, 101.0]), + False, False, + [(1.0 - (1.0 / (1.0 + 2.0))) ** 2, np.nan, (1.0 / (1.0 + 2.0))], + ), + ( + Series([1.0, np.nan, 101.0]), + False, + True, + [(1.0 - (1.0 / (1.0 + 2.0))), np.nan, (1.0 / (1.0 + 2.0))], + ), + ( + Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]), + True, False, - [np.nan, (1.0 - alpha) ** 3, np.nan, np.nan, alpha, np.nan], + [np.nan, (1.0 - (1.0 / (1.0 + 2.0))) ** 3, np.nan, np.nan, 1.0, np.nan], + ), + ( + Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]), + True, + True, + [np.nan, (1.0 - (1.0 / (1.0 + 2.0))), np.nan, np.nan, 1.0, np.nan], ), - (s2, False, True, [np.nan, (1.0 - alpha), np.nan, np.nan, alpha, np.nan]), - (s3, True, False, [(1.0 - alpha) ** 3, np.nan, (1.0 - alpha), 1.0]), - (s3, True, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha), 1.0]), ( - s3, + Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]), False, False, [ - (1.0 - alpha) ** 3, np.nan, - (1.0 - alpha) * alpha, - alpha * ((1.0 - alpha) ** 2 + alpha), + (1.0 - (1.0 / (1.0 + 2.0))) ** 3, + np.nan, + np.nan, + (1.0 / (1.0 + 2.0)), + np.nan, ], ), - (s3, False, True, [(1.0 - alpha) ** 2, np.nan, (1.0 - alpha) * alpha, alpha]), - ]: - expected = simple_wma(s, Series(w)) - result = s.ewm(com=com, adjust=adjust, ignore_na=ignore_na).mean() + ( + Series([np.nan, 1.0, np.nan, np.nan, 101.0, np.nan]), + False, + True, + [ + np.nan, + (1.0 - (1.0 / (1.0 + 2.0))), + np.nan, + np.nan, + (1.0 / (1.0 + 2.0)), + np.nan, + ], + ), + ( + Series([1.0, np.nan, 101.0, 50.0]), + True, + False, + [ + (1.0 - (1.0 / (1.0 + 2.0))) ** 3, + np.nan, + (1.0 - (1.0 / (1.0 + 2.0))), + 1.0, + ], + ), + ( + Series([1.0, np.nan, 101.0, 50.0]), + True, + True, + [ + (1.0 - (1.0 / (1.0 + 2.0))) 
** 2, + np.nan, + (1.0 - (1.0 / (1.0 + 2.0))), + 1.0, + ], + ), + ( + Series([1.0, np.nan, 101.0, 50.0]), + False, + False, + [ + (1.0 - (1.0 / (1.0 + 2.0))) ** 3, + np.nan, + (1.0 - (1.0 / (1.0 + 2.0))) * (1.0 / (1.0 + 2.0)), + (1.0 / (1.0 + 2.0)) + * ((1.0 - (1.0 / (1.0 + 2.0))) ** 2 + (1.0 / (1.0 + 2.0))), + ], + ), + ( + Series([1.0, np.nan, 101.0, 50.0]), + False, + True, + [ + (1.0 - (1.0 / (1.0 + 2.0))) ** 2, + np.nan, + (1.0 - (1.0 / (1.0 + 2.0))) * (1.0 / (1.0 + 2.0)), + (1.0 / (1.0 + 2.0)), + ], + ), + ], +) +def test_ewma_nan_handling_cases(s, adjust, ignore_na, w): + # GH 7603 + expected = (s.multiply(w).cumsum() / Series(w).cumsum()).fillna(method="ffill") + result = s.ewm(com=2.0, adjust=adjust, ignore_na=ignore_na).mean() + tm.assert_series_equal(result, expected) + if ignore_na is False: + # check that ignore_na defaults to False + result = s.ewm(com=2.0, adjust=adjust).mean() tm.assert_series_equal(result, expected) - if ignore_na is False: - # check that ignore_na defaults to False - result = s.ewm(com=com, adjust=adjust).mean() - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("name", ["var", "vol"]) -def test_ewmvar_ewmvol(series, frame, nan_locs, name): - check_ew(name=name, frame=frame, series=series, nan_locs=nan_locs) def test_ewma_span_com_args(series): From c1fefbed4ffe4defdaa8b3e5aa1be722ca7dda39 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 2 Oct 2020 13:35:11 -0700 Subject: [PATCH 0975/1025] DEPR: min_periods=None behavior for Rolling.count (#36649) --- doc/source/whatsnew/v1.2.0.rst | 2 + pandas/_libs/window/aggregations.pyx | 56 ---------- pandas/core/window/rolling.py | 66 +++++------ .../test_moments_consistency_rolling.py | 2 +- .../moments/test_moments_rolling_functions.py | 14 ++- pandas/tests/window/test_base_indexer.py | 10 ++ pandas/tests/window/test_dtypes.py | 103 +++++++++++------- pandas/tests/window/test_grouper.py | 18 ++- pandas/tests/window/test_rolling.py | 4 +- 
pandas/tests/window/test_timeseries_window.py | 5 +- 10 files changed, 139 insertions(+), 141 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 016e8d90e7d21..3bfb507d2e140 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -265,6 +265,7 @@ Deprecations - Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`) - Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) +- :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) .. --------------------------------------------------------------------------- @@ -404,6 +405,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` does not always maintain column index name for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`) - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) - Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`) +- Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in window (:issue:`35579`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 5f60b884c6ada..c6fd569247b90 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -89,62 +89,6 @@ cdef bint is_monotonic_start_end_bounds( # Physical description: 366 p. 
# Series: Prentice-Hall Series in Automatic Computation -# ---------------------------------------------------------------------- -# Rolling count -# this is only an impl for index not None, IOW, freq aware - - -def roll_count( - ndarray[float64_t] values, - ndarray[int64_t] start, - ndarray[int64_t] end, - int64_t minp, -): - cdef: - float64_t val, count_x = 0.0 - int64_t s, e, nobs, N = len(values) - Py_ssize_t i, j - ndarray[float64_t] output - - output = np.empty(N, dtype=float) - - with nogil: - - for i in range(0, N): - s = start[i] - e = end[i] - - if i == 0: - - # setup - count_x = 0.0 - for j in range(s, e): - val = values[j] - if notnan(val): - count_x += 1.0 - - else: - - # calculate deletes - for j in range(start[i - 1], s): - val = values[j] - if notnan(val): - count_x -= 1.0 - - # calculate adds - for j in range(end[i - 1], e): - val = values[j] - if notnan(val): - count_x += 1.0 - - if count_x >= minp: - output[i] = count_x - else: - output[i] = NaN - - return output - - # ---------------------------------------------------------------------- # Rolling sum diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 6ab42dda865e7..f207ea4cd67d4 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -17,6 +17,7 @@ Type, Union, ) +import warnings import numpy as np @@ -469,14 +470,18 @@ def _get_window_indexer(self, window: int) -> BaseIndexer: return VariableWindowIndexer(index_array=self._on.asi8, window_size=window) return FixedWindowIndexer(window_size=window) - def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series": + def _apply_series( + self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None + ) -> "Series": """ Series version of _apply_blockwise """ obj = self._create_data(self._selected_obj) try: - values = self._prep_values(obj.values) + # GH 12541: Special case for count where we support date-like types + input = obj.values if name != "count" else 
notna(obj.values).astype(int) + values = self._prep_values(input) except (TypeError, NotImplementedError) as err: raise DataError("No numeric types to aggregate") from err @@ -484,16 +489,20 @@ def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series": return obj._constructor(result, index=obj.index, name=obj.name) def _apply_blockwise( - self, homogeneous_func: Callable[..., ArrayLike] + self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None ) -> FrameOrSeriesUnion: """ Apply the given function to the DataFrame broken down into homogeneous sub-frames. """ if self._selected_obj.ndim == 1: - return self._apply_series(homogeneous_func) + return self._apply_series(homogeneous_func, name) obj = self._create_data(self._selected_obj) + if name == "count": + # GH 12541: Special case for count where we support date-like types + obj = notna(obj).astype(int) + obj._mgr = obj._mgr.consolidate() mgr = obj._mgr def hfunc(bvalues: ArrayLike) -> ArrayLike: @@ -606,7 +615,7 @@ def calc(x): return result - return self._apply_blockwise(homogeneous_func) + return self._apply_blockwise(homogeneous_func, name) def aggregate(self, func, *args, **kwargs): result, how = self._aggregate(func, *args, **kwargs) @@ -1265,33 +1274,8 @@ class RollingAndExpandingMixin(BaseWindow): ) def count(self): - # GH 32865. 
Using count with custom BaseIndexer subclass - # implementations shouldn't end up here - assert not isinstance(self.window, BaseIndexer) - - obj = self._create_data(self._selected_obj) - - def hfunc(values: np.ndarray) -> np.ndarray: - result = notna(values) - result = result.astype(int) - frame = type(obj)(result.T) - result = self._constructor( - frame, - window=self._get_window(), - min_periods=self.min_periods or 0, - center=self.center, - axis=self.axis, - closed=self.closed, - ).sum() - return result.values.T - - new_mgr = obj._mgr.apply(hfunc) - out = obj._constructor(new_mgr) - if obj.ndim == 1: - out.name = obj.name - else: - self._insert_on_column(out, obj) - return out + window_func = self._get_cython_func_type("roll_sum") + return self._apply(window_func, center=self.center, name="count") _shared_docs["apply"] = dedent( r""" @@ -2050,14 +2034,16 @@ def aggregate(self, func, *args, **kwargs): @Substitution(name="rolling") @Appender(_shared_docs["count"]) def count(self): - - # different impl for freq counting - # GH 32865. Use a custom count function implementation - # when using a BaseIndexer subclass as a window - if self.is_freq_type or isinstance(self.window, BaseIndexer): - window_func = self._get_roll_func("roll_count") - return self._apply(window_func, center=self.center, name="count") - + if self.min_periods is None: + warnings.warn( + ( + "min_periods=None will default to the size of window " + "consistent with other methods in a future version. " + "Specify min_periods=0 instead." 
+ ), + FutureWarning, + ) + self.min_periods = 0 return super().count() @Substitution(name="rolling") diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index dfcbdde466d44..99c2c4dd0045b 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -452,7 +452,7 @@ def test_moment_functions_zero_length(): df2_expected = df2 functions = [ - lambda x: x.rolling(window=10).count(), + lambda x: x.rolling(window=10, min_periods=0).count(), lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False), lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False), lambda x: x.rolling(window=10, min_periods=5).max(), diff --git a/pandas/tests/window/moments/test_moments_rolling_functions.py b/pandas/tests/window/moments/test_moments_rolling_functions.py index 98c7a0a055bd3..abe75c7289ed4 100644 --- a/pandas/tests/window/moments/test_moments_rolling_functions.py +++ b/pandas/tests/window/moments/test_moments_rolling_functions.py @@ -12,7 +12,12 @@ [ [np.mean, "mean", {}], [np.nansum, "sum", {}], - [lambda x: np.isfinite(x).astype(float).sum(), "count", {}], + pytest.param( + lambda x: np.isfinite(x).astype(float).sum(), + "count", + {}, + marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"), + ), [np.median, "median", {}], [np.min, "min", {}], [np.max, "max", {}], @@ -33,7 +38,12 @@ def test_series(series, compare_func, roll_func, kwargs): [ [np.mean, "mean", {}], [np.nansum, "sum", {}], - [lambda x: np.isfinite(x).astype(float).sum(), "count", {}], + pytest.param( + lambda x: np.isfinite(x).astype(float).sum(), + "count", + {}, + marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"), + ), [np.median, "median", {}], [np.min, "min", {}], [np.max, "max", {}], diff --git a/pandas/tests/window/test_base_indexer.py 
b/pandas/tests/window/test_base_indexer.py index f681b19d57600..7f2d58effe1ae 100644 --- a/pandas/tests/window/test_base_indexer.py +++ b/pandas/tests/window/test_base_indexer.py @@ -138,6 +138,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed): ), ], ) +@pytest.mark.filterwarnings("ignore:min_periods:FutureWarning") def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs): # GH 32865 values = np.arange(10.0) @@ -253,3 +254,12 @@ def test_non_fixed_variable_window_indexer(closed, expected_data): result = df.rolling(indexer, closed=closed).sum() expected = DataFrame(expected_data, index=index) tm.assert_frame_equal(result, expected) + + +def test_fixed_forward_indexer_count(): + # GH: 35579 + df = DataFrame({"b": [None, None, None, 7]}) + indexer = FixedForwardWindowIndexer(window_size=2) + result = df.rolling(window=indexer, min_periods=0).count() + expected = DataFrame({"b": [0.0, 0.0, 1.0, 1.0]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_dtypes.py b/pandas/tests/window/test_dtypes.py index 245b48b351684..fc7a51834780f 100644 --- a/pandas/tests/window/test_dtypes.py +++ b/pandas/tests/window/test_dtypes.py @@ -21,82 +21,111 @@ def get_dtype(dtype, coerce_int=None): @pytest.mark.parametrize( - "method, data, expected_data, coerce_int", + "method, data, expected_data, coerce_int, min_periods", [ - ("count", np.arange(5), [1, 2, 2, 2, 2], True), - ("count", np.arange(10, 0, -2), [1, 2, 2, 2, 2], True), - ("count", [0, 1, 2, np.nan, 4], [1, 2, 2, 1, 1], False), - ("max", np.arange(5), [np.nan, 1, 2, 3, 4], True), - ("max", np.arange(10, 0, -2), [np.nan, 10, 8, 6, 4], True), - ("max", [0, 1, 2, np.nan, 4], [np.nan, 1, 2, np.nan, np.nan], False), - ("min", np.arange(5), [np.nan, 0, 1, 2, 3], True), - ("min", np.arange(10, 0, -2), [np.nan, 8, 6, 4, 2], True), - ("min", [0, 1, 2, np.nan, 4], [np.nan, 0, 1, np.nan, np.nan], False), - ("sum", np.arange(5), [np.nan, 1, 3, 5, 7], True), - 
("sum", np.arange(10, 0, -2), [np.nan, 18, 14, 10, 6], True), - ("sum", [0, 1, 2, np.nan, 4], [np.nan, 1, 3, np.nan, np.nan], False), - ("mean", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True), - ("mean", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True), - ("mean", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False), - ("std", np.arange(5), [np.nan] + [np.sqrt(0.5)] * 4, True), - ("std", np.arange(10, 0, -2), [np.nan] + [np.sqrt(2)] * 4, True), + ("count", np.arange(5), [1, 2, 2, 2, 2], True, 0), + ("count", np.arange(10, 0, -2), [1, 2, 2, 2, 2], True, 0), + ("count", [0, 1, 2, np.nan, 4], [1, 2, 2, 1, 1], False, 0), + ("max", np.arange(5), [np.nan, 1, 2, 3, 4], True, None), + ("max", np.arange(10, 0, -2), [np.nan, 10, 8, 6, 4], True, None), + ("max", [0, 1, 2, np.nan, 4], [np.nan, 1, 2, np.nan, np.nan], False, None), + ("min", np.arange(5), [np.nan, 0, 1, 2, 3], True, None), + ("min", np.arange(10, 0, -2), [np.nan, 8, 6, 4, 2], True, None), + ("min", [0, 1, 2, np.nan, 4], [np.nan, 0, 1, np.nan, np.nan], False, None), + ("sum", np.arange(5), [np.nan, 1, 3, 5, 7], True, None), + ("sum", np.arange(10, 0, -2), [np.nan, 18, 14, 10, 6], True, None), + ("sum", [0, 1, 2, np.nan, 4], [np.nan, 1, 3, np.nan, np.nan], False, None), + ("mean", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True, None), + ("mean", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True, None), + ("mean", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False, None), + ("std", np.arange(5), [np.nan] + [np.sqrt(0.5)] * 4, True, None), + ("std", np.arange(10, 0, -2), [np.nan] + [np.sqrt(2)] * 4, True, None), ( "std", [0, 1, 2, np.nan, 4], [np.nan] + [np.sqrt(0.5)] * 2 + [np.nan] * 2, False, + None, + ), + ("var", np.arange(5), [np.nan, 0.5, 0.5, 0.5, 0.5], True, None), + ("var", np.arange(10, 0, -2), [np.nan, 2, 2, 2, 2], True, None), + ("var", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 0.5, np.nan, np.nan], False, None), + ("median", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True, 
None), + ("median", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True, None), + ( + "median", + [0, 1, 2, np.nan, 4], + [np.nan, 0.5, 1.5, np.nan, np.nan], + False, + None, ), - ("var", np.arange(5), [np.nan, 0.5, 0.5, 0.5, 0.5], True), - ("var", np.arange(10, 0, -2), [np.nan, 2, 2, 2, 2], True), - ("var", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 0.5, np.nan, np.nan], False), - ("median", np.arange(5), [np.nan, 0.5, 1.5, 2.5, 3.5], True), - ("median", np.arange(10, 0, -2), [np.nan, 9, 7, 5, 3], True), - ("median", [0, 1, 2, np.nan, 4], [np.nan, 0.5, 1.5, np.nan, np.nan], False), ], ) -def test_series_dtypes(method, data, expected_data, coerce_int, dtypes): +def test_series_dtypes(method, data, expected_data, coerce_int, dtypes, min_periods): s = Series(data, dtype=get_dtype(dtypes, coerce_int=coerce_int)) if dtypes in ("m8[ns]", "M8[ns]") and method != "count": msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): - getattr(s.rolling(2), method)() + getattr(s.rolling(2, min_periods=min_periods), method)() else: - result = getattr(s.rolling(2), method)() + result = getattr(s.rolling(2, min_periods=min_periods), method)() expected = Series(expected_data, dtype="float64") tm.assert_almost_equal(result, expected) @pytest.mark.parametrize( - "method, expected_data", + "method, expected_data, min_periods", [ - ("count", {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}), - ("max", {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}), - ("min", {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}), + ("count", {0: Series([1, 2, 2, 2, 2]), 1: Series([1, 2, 2, 2, 2])}, 0), + ( + "max", + {0: Series([np.nan, 2, 4, 6, 8]), 1: Series([np.nan, 3, 5, 7, 9])}, + None, + ), + ( + "min", + {0: Series([np.nan, 0, 2, 4, 6]), 1: Series([np.nan, 1, 3, 5, 7])}, + None, + ), ( "sum", {0: Series([np.nan, 2, 6, 10, 14]), 1: Series([np.nan, 4, 8, 12, 16])}, + None, + ), + ( + "mean", + {0: Series([np.nan, 1, 3, 5, 7]), 1: 
Series([np.nan, 2, 4, 6, 8])}, + None, ), - ("mean", {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}), ( "std", { 0: Series([np.nan] + [np.sqrt(2)] * 4), 1: Series([np.nan] + [np.sqrt(2)] * 4), }, + None, + ), + ( + "var", + {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}, + None, + ), + ( + "median", + {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}, + None, ), - ("var", {0: Series([np.nan, 2, 2, 2, 2]), 1: Series([np.nan, 2, 2, 2, 2])}), - ("median", {0: Series([np.nan, 1, 3, 5, 7]), 1: Series([np.nan, 2, 4, 6, 8])}), ], ) -def test_dataframe_dtypes(method, expected_data, dtypes): +def test_dataframe_dtypes(method, expected_data, dtypes, min_periods): if dtypes == "category": pytest.skip("Category dataframe testing not implemented.") df = DataFrame(np.arange(10).reshape((5, 2)), dtype=get_dtype(dtypes)) if dtypes in ("m8[ns]", "M8[ns]") and method != "count": msg = "No numeric types to aggregate" with pytest.raises(DataError, match=msg): - getattr(df.rolling(2), method)() + getattr(df.rolling(2, min_periods=min_periods), method)() else: - result = getattr(df.rolling(2), method)() + result = getattr(df.rolling(2, min_periods=min_periods), method)() expected = DataFrame(expected_data, dtype="float64") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/window/test_grouper.py b/pandas/tests/window/test_grouper.py index 0eebd657e97b7..7cfac7c6a752a 100644 --- a/pandas/tests/window/test_grouper.py +++ b/pandas/tests/window/test_grouper.py @@ -45,9 +45,9 @@ def test_getitem_multiple(self): # GH 13174 g = self.frame.groupby("A") - r = g.rolling(2) + r = g.rolling(2, min_periods=0) g_mutated = get_groupby(self.frame, by="A", mutated=True) - expected = g_mutated.B.apply(lambda x: x.rolling(2).count()) + expected = g_mutated.B.apply(lambda x: x.rolling(2, min_periods=0).count()) result = r.B.count() tm.assert_series_equal(result, expected) @@ -56,7 +56,19 @@ def test_getitem_multiple(self): 
tm.assert_series_equal(result, expected) @pytest.mark.parametrize( - "f", ["sum", "mean", "min", "max", "count", "kurt", "skew"] + "f", + [ + "sum", + "mean", + "min", + "max", + pytest.param( + "count", + marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"), + ), + "kurt", + "skew", + ], ) def test_rolling(self, f): g = self.frame.groupby("A") diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 10527649b728f..5ed5e99db8ab4 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -460,7 +460,9 @@ def test_rolling_count_default_min_periods_with_null_values(constructor): values = [1, 2, 3, np.nan, 4, 5, 6] expected_counts = [1.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0] - result = constructor(values).rolling(3).count() + # GH 31302 + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + result = constructor(values).rolling(3).count() expected = constructor(expected_counts) tm.assert_equal(result, expected) diff --git a/pandas/tests/window/test_timeseries_window.py b/pandas/tests/window/test_timeseries_window.py index ea4d7df6700e9..d9fcb538c97c1 100644 --- a/pandas/tests/window/test_timeseries_window.py +++ b/pandas/tests/window/test_timeseries_window.py @@ -593,7 +593,10 @@ def test_freqs_ops(self, freq, op, result_data): [ "sum", "mean", - "count", + pytest.param( + "count", + marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"), + ), "median", "std", "var", From 93768d5b4ccecf8e3126848dea44d7c72bb811cd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 2 Oct 2020 13:36:15 -0700 Subject: [PATCH 0976/1025] CLN: test_moments_rolling.py for quantile/kurt/skew (#36784) --- .../window/moments/test_moments_rolling.py | 226 +----------------- .../moments/test_moments_rolling_quantile.py | 166 +++++++++++++ .../moments/test_moments_rolling_skew_kurt.py | 163 +++++++++++++ 3 files changed, 330 insertions(+), 225 deletions(-) create mode 100644 
pandas/tests/window/moments/test_moments_rolling_quantile.py create mode 100644 pandas/tests/window/moments/test_moments_rolling_skew_kurt.py diff --git a/pandas/tests/window/moments/test_moments_rolling.py b/pandas/tests/window/moments/test_moments_rolling.py index 880316ec6111a..488306d0585c5 100644 --- a/pandas/tests/window/moments/test_moments_rolling.py +++ b/pandas/tests/window/moments/test_moments_rolling.py @@ -1,187 +1,12 @@ import numpy as np -from numpy.random import randn import pytest import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Series, isna, notna +from pandas import DataFrame, Series import pandas._testing as tm -import pandas.tseries.offsets as offsets - - -def _check_moment_func( - static_comp, - name, - raw, - has_min_periods=True, - has_center=True, - has_time_rule=True, - fill_value=None, - zero_min_periods_equal=True, - series=None, - frame=None, - **kwargs, -): - def get_result(obj, window, min_periods=None, center=False): - r = obj.rolling(window=window, min_periods=min_periods, center=center) - return getattr(r, name)(**kwargs) - - series_result = get_result(series, window=50) - assert isinstance(series_result, Series) - tm.assert_almost_equal(series_result.iloc[-1], static_comp(series[-50:])) - - frame_result = get_result(frame, window=50) - assert isinstance(frame_result, DataFrame) - tm.assert_series_equal( - frame_result.iloc[-1, :], - frame.iloc[-50:, :].apply(static_comp, axis=0, raw=raw), - check_names=False, - ) - - # check time_rule works - if has_time_rule: - win = 25 - minp = 10 - ser = series[::2].resample("B").mean() - frm = frame[::2].resample("B").mean() - - if has_min_periods: - series_result = get_result(ser, window=win, min_periods=minp) - frame_result = get_result(frm, window=win, min_periods=minp) - else: - series_result = get_result(ser, window=win, min_periods=0) - frame_result = get_result(frm, window=win, min_periods=0) - - last_date = series_result.index[-1] - prev_date 
= last_date - 24 * offsets.BDay() - - trunc_series = series[::2].truncate(prev_date, last_date) - trunc_frame = frame[::2].truncate(prev_date, last_date) - - tm.assert_almost_equal(series_result[-1], static_comp(trunc_series)) - - tm.assert_series_equal( - frame_result.xs(last_date), - trunc_frame.apply(static_comp, raw=raw), - check_names=False, - ) - - # excluding NaNs correctly - obj = Series(randn(50)) - obj[:10] = np.NaN - obj[-10:] = np.NaN - if has_min_periods: - result = get_result(obj, 50, min_periods=30) - tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) - - # min_periods is working correctly - result = get_result(obj, 20, min_periods=15) - assert isna(result.iloc[23]) - assert not isna(result.iloc[24]) - - assert not isna(result.iloc[-6]) - assert isna(result.iloc[-5]) - - obj2 = Series(randn(20)) - result = get_result(obj2, 10, min_periods=5) - assert isna(result.iloc[3]) - assert notna(result.iloc[4]) - - if zero_min_periods_equal: - # min_periods=0 may be equivalent to min_periods=1 - result0 = get_result(obj, 20, min_periods=0) - result1 = get_result(obj, 20, min_periods=1) - tm.assert_almost_equal(result0, result1) - else: - result = get_result(obj, 50) - tm.assert_almost_equal(result.iloc[-1], static_comp(obj[10:-10])) - - # window larger than series length (#7297) - if has_min_periods: - for minp in (0, len(series) - 1, len(series)): - result = get_result(series, len(series) + 1, min_periods=minp) - expected = get_result(series, len(series), min_periods=minp) - nan_mask = isna(result) - tm.assert_series_equal(nan_mask, isna(expected)) - - nan_mask = ~nan_mask - tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) - else: - result = get_result(series, len(series) + 1, min_periods=0) - expected = get_result(series, len(series), min_periods=0) - nan_mask = isna(result) - tm.assert_series_equal(nan_mask, isna(expected)) - - nan_mask = ~nan_mask - tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) - - # check 
center=True - if has_center: - if has_min_periods: - result = get_result(obj, 20, min_periods=15, center=True) - expected = get_result( - pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=15 - )[9:].reset_index(drop=True) - else: - result = get_result(obj, 20, min_periods=0, center=True) - print(result) - expected = get_result( - pd.concat([obj, Series([np.NaN] * 9)]), 20, min_periods=0 - )[9:].reset_index(drop=True) - - tm.assert_series_equal(result, expected) - - # shifter index - s = [f"x{x:d}" for x in range(12)] - - if has_min_periods: - minp = 10 - - series_xp = ( - get_result( - series.reindex(list(series.index) + s), window=25, min_periods=minp - ) - .shift(-12) - .reindex(series.index) - ) - frame_xp = ( - get_result( - frame.reindex(list(frame.index) + s), window=25, min_periods=minp - ) - .shift(-12) - .reindex(frame.index) - ) - - series_rs = get_result(series, window=25, min_periods=minp, center=True) - frame_rs = get_result(frame, window=25, min_periods=minp, center=True) - - else: - series_xp = ( - get_result( - series.reindex(list(series.index) + s), window=25, min_periods=0 - ) - .shift(-12) - .reindex(series.index) - ) - frame_xp = ( - get_result( - frame.reindex(list(frame.index) + s), window=25, min_periods=0 - ) - .shift(-12) - .reindex(frame.index) - ) - - series_rs = get_result(series, window=25, min_periods=0, center=True) - frame_rs = get_result(frame, window=25, min_periods=0, center=True) - - if fill_value is not None: - series_xp = series_xp.fillna(fill_value) - frame_xp = frame_xp.fillna(fill_value) - tm.assert_series_equal(series_xp, series_rs) - tm.assert_frame_equal(frame_xp, frame_rs) - def test_centered_axis_validation(): @@ -716,33 +541,6 @@ def test_rolling_max_min_periods(): pd.Series([1, 2, 3]).rolling(window=3, min_periods=5).max() -@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) -def test_rolling_quantile(q, raw, series, frame): - def scoreatpercentile(a, per): - values = np.sort(a, axis=0) - - idx = int(per / 
1.0 * (values.shape[0] - 1)) - - if idx == values.shape[0] - 1: - retval = values[-1] - - else: - qlow = float(idx) / float(values.shape[0] - 1) - qhig = float(idx + 1) / float(values.shape[0] - 1) - vlow = values[idx] - vhig = values[idx + 1] - retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow) - - return retval - - def quantile_func(x): - return scoreatpercentile(x, q) - - _check_moment_func( - quantile_func, name="quantile", quantile=q, raw=raw, series=series, frame=frame - ) - - def test_rolling_quantile_np_percentile(): # #9413: Tests that rolling window's quantile default behavior # is analogous to Numpy's percentile @@ -845,25 +643,3 @@ def test_rolling_std_neg_sqrt(): b = a.ewm(span=3).std() assert np.isfinite(b[2:]).all() - - -@td.skip_if_no_scipy -def test_rolling_skew(raw, series, frame): - from scipy.stats import skew - - _check_moment_func( - lambda x: skew(x, bias=False), name="skew", raw=raw, series=series, frame=frame - ) - - -@td.skip_if_no_scipy -def test_rolling_kurt(raw, series, frame): - from scipy.stats import kurtosis - - _check_moment_func( - lambda x: kurtosis(x, bias=False), - name="kurt", - raw=raw, - series=series, - frame=frame, - ) diff --git a/pandas/tests/window/moments/test_moments_rolling_quantile.py b/pandas/tests/window/moments/test_moments_rolling_quantile.py new file mode 100644 index 0000000000000..1b6d4a5c82164 --- /dev/null +++ b/pandas/tests/window/moments/test_moments_rolling_quantile.py @@ -0,0 +1,166 @@ +from functools import partial + +import numpy as np +import pytest + +from pandas import DataFrame, Series, concat, isna, notna +import pandas._testing as tm + +import pandas.tseries.offsets as offsets + + +def scoreatpercentile(a, per): + values = np.sort(a, axis=0) + + idx = int(per / 1.0 * (values.shape[0] - 1)) + + if idx == values.shape[0] - 1: + retval = values[-1] + + else: + qlow = float(idx) / float(values.shape[0] - 1) + qhig = float(idx + 1) / float(values.shape[0] - 1) + vlow = values[idx] + vhig = 
values[idx + 1] + retval = vlow + (vhig - vlow) * (per - qlow) / (qhig - qlow) + + return retval + + +@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) +def test_series(series, q): + compare_func = partial(scoreatpercentile, per=q) + result = series.rolling(50).quantile(q) + assert isinstance(result, Series) + tm.assert_almost_equal(result.iloc[-1], compare_func(series[-50:])) + + +@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) +def test_frame(raw, frame, q): + compare_func = partial(scoreatpercentile, per=q) + result = frame.rolling(50).quantile(q) + assert isinstance(result, DataFrame) + tm.assert_series_equal( + result.iloc[-1, :], + frame.iloc[-50:, :].apply(compare_func, axis=0, raw=raw), + check_names=False, + ) + + +@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) +def test_time_rule_series(series, q): + compare_func = partial(scoreatpercentile, per=q) + win = 25 + ser = series[::2].resample("B").mean() + series_result = ser.rolling(window=win, min_periods=10).quantile(q) + last_date = series_result.index[-1] + prev_date = last_date - 24 * offsets.BDay() + + trunc_series = series[::2].truncate(prev_date, last_date) + tm.assert_almost_equal(series_result[-1], compare_func(trunc_series)) + + +@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) +def test_time_rule_frame(raw, frame, q): + compare_func = partial(scoreatpercentile, per=q) + win = 25 + frm = frame[::2].resample("B").mean() + frame_result = frm.rolling(window=win, min_periods=10).quantile(q) + last_date = frame_result.index[-1] + prev_date = last_date - 24 * offsets.BDay() + + trunc_frame = frame[::2].truncate(prev_date, last_date) + tm.assert_series_equal( + frame_result.xs(last_date), + trunc_frame.apply(compare_func, raw=raw), + check_names=False, + ) + + +@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) +def test_nans(q): + compare_func = partial(scoreatpercentile, per=q) + obj = Series(np.random.randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + + result 
= obj.rolling(50, min_periods=30).quantile(q) + tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10])) + + # min_periods is working correctly + result = obj.rolling(20, min_periods=15).quantile(q) + assert isna(result.iloc[23]) + assert not isna(result.iloc[24]) + + assert not isna(result.iloc[-6]) + assert isna(result.iloc[-5]) + + obj2 = Series(np.random.randn(20)) + result = obj2.rolling(10, min_periods=5).quantile(q) + assert isna(result.iloc[3]) + assert notna(result.iloc[4]) + + result0 = obj.rolling(20, min_periods=0).quantile(q) + result1 = obj.rolling(20, min_periods=1).quantile(q) + tm.assert_almost_equal(result0, result1) + + +@pytest.mark.parametrize("minp", [0, 99, 100]) +@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) +def test_min_periods(series, minp, q): + result = series.rolling(len(series) + 1, min_periods=minp).quantile(q) + expected = series.rolling(len(series), min_periods=minp).quantile(q) + nan_mask = isna(result) + tm.assert_series_equal(nan_mask, isna(expected)) + + nan_mask = ~nan_mask + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) + + +@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) +def test_center(q): + obj = Series(np.random.randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + + result = obj.rolling(20, center=True).quantile(q) + expected = ( + concat([obj, Series([np.NaN] * 9)]) + .rolling(20) + .quantile(q)[9:] + .reset_index(drop=True) + ) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) +def test_center_reindex_series(series, q): + # shifter index + s = [f"x{x:d}" for x in range(12)] + + series_xp = ( + series.reindex(list(series.index) + s) + .rolling(window=25) + .quantile(q) + .shift(-12) + .reindex(series.index) + ) + + series_rs = series.rolling(window=25, center=True).quantile(q) + tm.assert_series_equal(series_xp, series_rs) + + +@pytest.mark.parametrize("q", [0.0, 0.1, 0.5, 0.9, 1.0]) +def test_center_reindex_frame(frame, 
q): + # shifter index + s = [f"x{x:d}" for x in range(12)] + + frame_xp = ( + frame.reindex(list(frame.index) + s) + .rolling(window=25) + .quantile(q) + .shift(-12) + .reindex(frame.index) + ) + frame_rs = frame.rolling(window=25, center=True).quantile(q) + tm.assert_frame_equal(frame_xp, frame_rs) diff --git a/pandas/tests/window/moments/test_moments_rolling_skew_kurt.py b/pandas/tests/window/moments/test_moments_rolling_skew_kurt.py new file mode 100644 index 0000000000000..cc67e602be12e --- /dev/null +++ b/pandas/tests/window/moments/test_moments_rolling_skew_kurt.py @@ -0,0 +1,163 @@ +from functools import partial + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame, Series, concat, isna, notna +import pandas._testing as tm + +import pandas.tseries.offsets as offsets + + +@td.skip_if_no_scipy +@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) +def test_series(series, sp_func, roll_func): + import scipy.stats + + compare_func = partial(getattr(scipy.stats, sp_func), bias=False) + result = getattr(series.rolling(50), roll_func)() + assert isinstance(result, Series) + tm.assert_almost_equal(result.iloc[-1], compare_func(series[-50:])) + + +@td.skip_if_no_scipy +@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) +def test_frame(raw, frame, sp_func, roll_func): + import scipy.stats + + compare_func = partial(getattr(scipy.stats, sp_func), bias=False) + result = getattr(frame.rolling(50), roll_func)() + assert isinstance(result, DataFrame) + tm.assert_series_equal( + result.iloc[-1, :], + frame.iloc[-50:, :].apply(compare_func, axis=0, raw=raw), + check_names=False, + ) + + +@td.skip_if_no_scipy +@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) +def test_time_rule_series(series, sp_func, roll_func): + import scipy.stats + + compare_func = partial(getattr(scipy.stats, sp_func), bias=False) + 
win = 25 + ser = series[::2].resample("B").mean() + series_result = getattr(ser.rolling(window=win, min_periods=10), roll_func)() + last_date = series_result.index[-1] + prev_date = last_date - 24 * offsets.BDay() + + trunc_series = series[::2].truncate(prev_date, last_date) + tm.assert_almost_equal(series_result[-1], compare_func(trunc_series)) + + +@td.skip_if_no_scipy +@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) +def test_time_rule_frame(raw, frame, sp_func, roll_func): + import scipy.stats + + compare_func = partial(getattr(scipy.stats, sp_func), bias=False) + win = 25 + frm = frame[::2].resample("B").mean() + frame_result = getattr(frm.rolling(window=win, min_periods=10), roll_func)() + last_date = frame_result.index[-1] + prev_date = last_date - 24 * offsets.BDay() + + trunc_frame = frame[::2].truncate(prev_date, last_date) + tm.assert_series_equal( + frame_result.xs(last_date), + trunc_frame.apply(compare_func, raw=raw), + check_names=False, + ) + + +@td.skip_if_no_scipy +@pytest.mark.parametrize("sp_func, roll_func", [["kurtosis", "kurt"], ["skew", "skew"]]) +def test_nans(sp_func, roll_func): + import scipy.stats + + compare_func = partial(getattr(scipy.stats, sp_func), bias=False) + obj = Series(np.random.randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + + result = getattr(obj.rolling(50, min_periods=30), roll_func)() + tm.assert_almost_equal(result.iloc[-1], compare_func(obj[10:-10])) + + # min_periods is working correctly + result = getattr(obj.rolling(20, min_periods=15), roll_func)() + assert isna(result.iloc[23]) + assert not isna(result.iloc[24]) + + assert not isna(result.iloc[-6]) + assert isna(result.iloc[-5]) + + obj2 = Series(np.random.randn(20)) + result = getattr(obj2.rolling(10, min_periods=5), roll_func)() + assert isna(result.iloc[3]) + assert notna(result.iloc[4]) + + result0 = getattr(obj.rolling(20, min_periods=0), roll_func)() + result1 = getattr(obj.rolling(20, min_periods=1), 
roll_func)() + tm.assert_almost_equal(result0, result1) + + +@pytest.mark.parametrize("minp", [0, 99, 100]) +@pytest.mark.parametrize("roll_func", ["kurt", "skew"]) +def test_min_periods(series, minp, roll_func): + result = getattr(series.rolling(len(series) + 1, min_periods=minp), roll_func)() + expected = getattr(series.rolling(len(series), min_periods=minp), roll_func)() + nan_mask = isna(result) + tm.assert_series_equal(nan_mask, isna(expected)) + + nan_mask = ~nan_mask + tm.assert_almost_equal(result[nan_mask], expected[nan_mask]) + + +@pytest.mark.parametrize("roll_func", ["kurt", "skew"]) +def test_center(roll_func): + obj = Series(np.random.randn(50)) + obj[:10] = np.NaN + obj[-10:] = np.NaN + + result = getattr(obj.rolling(20, center=True), roll_func)() + expected = getattr(concat([obj, Series([np.NaN] * 9)]).rolling(20), roll_func)()[ + 9: + ].reset_index(drop=True) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("roll_func", ["kurt", "skew"]) +def test_center_reindex_series(series, roll_func): + # shifter index + s = [f"x{x:d}" for x in range(12)] + + series_xp = ( + getattr( + series.reindex(list(series.index) + s).rolling(window=25), + roll_func, + )() + .shift(-12) + .reindex(series.index) + ) + series_rs = getattr(series.rolling(window=25, center=True), roll_func)() + tm.assert_series_equal(series_xp, series_rs) + + +@pytest.mark.parametrize("roll_func", ["kurt", "skew"]) +def test_center_reindex_frame(frame, roll_func): + # shifter index + s = [f"x{x:d}" for x in range(12)] + + frame_xp = ( + getattr( + frame.reindex(list(frame.index) + s).rolling(window=25), + roll_func, + )() + .shift(-12) + .reindex(frame.index) + ) + frame_rs = getattr(frame.rolling(window=25, center=True), roll_func)() + tm.assert_frame_equal(frame_xp, frame_rs) From 9f6a47bfe6dfcf0454f527dbcd2c00df50210fb8 Mon Sep 17 00:00:00 2001 From: Prayag Savsani Date: Sat, 3 Oct 2020 02:33:50 +0530 Subject: [PATCH 0977/1025] DOC: use black to fix code style in 
doc pandas-dev#36777 (#36813) --- .../comparison/comparison_with_r.rst | 147 +++++++++++------- .../comparison/comparison_with_sas.rst | 126 +++++++-------- .../comparison/comparison_with_sql.rst | 110 +++++++------ .../comparison/comparison_with_stata.rst | 120 +++++++------- 4 files changed, 284 insertions(+), 219 deletions(-) diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index e1a4cfe49b7d1..358bb6ad951f0 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -122,16 +122,16 @@ Selecting multiple columns by name in ``pandas`` is straightforward .. ipython:: python - df = pd.DataFrame(np.random.randn(10, 3), columns=list('abc')) - df[['a', 'c']] - df.loc[:, ['a', 'c']] + df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc")) + df[["a", "c"]] + df.loc[:, ["a", "c"]] Selecting multiple noncontiguous columns by integer location can be achieved with a combination of the ``iloc`` indexer attribute and ``numpy.r_``. .. ipython:: python - named = list('abcdefg') + named = list("abcdefg") n = 30 columns = named + np.arange(len(named), n).tolist() df = pd.DataFrame(np.random.randn(n, n), columns=columns) @@ -160,14 +160,29 @@ function. .. 
ipython:: python df = pd.DataFrame( - {'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], - 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, - np.nan]}) + { + "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + "by2": [ + "wet", + "dry", + 99, + 95, + np.nan, + "damp", + 95, + 99, + "red", + 99, + np.nan, + np.nan, + ], + } + ) - g = df.groupby(['by1', 'by2']) - g[['v1', 'v2']].mean() + g = df.groupby(["by1", "by2"]) + g[["v1", "v2"]].mean() For more details and examples see :ref:`the groupby documentation `. @@ -228,11 +243,14 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: import string baseball = pd.DataFrame( - {'team': ["team %d" % (x + 1) for x in range(5)] * 5, - 'player': random.sample(list(string.ascii_lowercase), 25), - 'batting avg': np.random.uniform(.200, .400, 25)}) + { + "team": ["team %d" % (x + 1) for x in range(5)] * 5, + "player": random.sample(list(string.ascii_lowercase), 25), + "batting avg": np.random.uniform(0.200, 0.400, 25), + } + ) - baseball.pivot_table(values='batting avg', columns='team', aggfunc=np.max) + baseball.pivot_table(values="batting avg", columns="team", aggfunc=np.max) For more details and examples see :ref:`the reshaping documentation `. @@ -256,10 +274,10 @@ index/slice as well as standard boolean indexing: .. ipython:: python - df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) - df.query('a <= b') - df[df['a'] <= df['b']] - df.loc[df['a'] <= df['b']] + df = pd.DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.query("a <= b") + df[df["a"] <= df["b"]] + df.loc[df["a"] <= df["b"]] For more details and examples see :ref:`the query documentation `. 
@@ -282,9 +300,9 @@ In ``pandas`` the equivalent expression, using the .. ipython:: python - df = pd.DataFrame({'a': np.random.randn(10), 'b': np.random.randn(10)}) - df.eval('a + b') - df['a'] + df['b'] # same as the previous expression + df = pd.DataFrame({"a": np.random.randn(10), "b": np.random.randn(10)}) + df.eval("a + b") + df["a"] + df["b"] # same as the previous expression In certain cases :meth:`~pandas.DataFrame.eval` will be much faster than evaluation in pure Python. For more details and examples see :ref:`the eval @@ -334,14 +352,18 @@ In ``pandas`` the equivalent expression, using the .. ipython:: python - df = pd.DataFrame({'x': np.random.uniform(1., 168., 120), - 'y': np.random.uniform(7., 334., 120), - 'z': np.random.uniform(1.7, 20.7, 120), - 'month': [5, 6, 7, 8] * 30, - 'week': np.random.randint(1, 4, 120)}) + df = pd.DataFrame( + { + "x": np.random.uniform(1.0, 168.0, 120), + "y": np.random.uniform(7.0, 334.0, 120), + "z": np.random.uniform(1.7, 20.7, 120), + "month": [5, 6, 7, 8] * 30, + "week": np.random.randint(1, 4, 120), + } + ) - grouped = df.groupby(['month', 'week']) - grouped['x'].agg([np.mean, np.std]) + grouped = df.groupby(["month", "week"]) + grouped["x"].agg([np.mean, np.std]) For more details and examples see :ref:`the groupby documentation @@ -410,13 +432,17 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: .. ipython:: python - cheese = pd.DataFrame({'first': ['John', 'Mary'], - 'last': ['Doe', 'Bo'], - 'height': [5.5, 6.0], - 'weight': [130, 150]}) + cheese = pd.DataFrame( + { + "first": ["John", "Mary"], + "last": ["Doe", "Bo"], + "height": [5.5, 6.0], + "weight": [130, 150], + } + ) - pd.melt(cheese, id_vars=['first', 'last']) - cheese.set_index(['first', 'last']).stack() # alternative way + pd.melt(cheese, id_vars=["first", "last"]) + cheese.set_index(["first", "last"]).stack() # alternative way For more details and examples see :ref:`the reshaping documentation `. 
@@ -444,15 +470,24 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: .. ipython:: python - df = pd.DataFrame({'x': np.random.uniform(1., 168., 12), - 'y': np.random.uniform(7., 334., 12), - 'z': np.random.uniform(1.7, 20.7, 12), - 'month': [5, 6, 7] * 4, - 'week': [1, 2] * 6}) + df = pd.DataFrame( + { + "x": np.random.uniform(1.0, 168.0, 12), + "y": np.random.uniform(7.0, 334.0, 12), + "z": np.random.uniform(1.7, 20.7, 12), + "month": [5, 6, 7] * 4, + "week": [1, 2] * 6, + } + ) - mdf = pd.melt(df, id_vars=['month', 'week']) - pd.pivot_table(mdf, values='value', index=['variable', 'week'], - columns=['month'], aggfunc=np.mean) + mdf = pd.melt(df, id_vars=["month", "week"]) + pd.pivot_table( + mdf, + values="value", + index=["variable", "week"], + columns=["month"], + aggfunc=np.mean, + ) Similarly for ``dcast`` which uses a data.frame called ``df`` in R to aggregate information based on ``Animal`` and ``FeedType``: @@ -475,21 +510,29 @@ using :meth:`~pandas.pivot_table`: .. ipython:: python - df = pd.DataFrame({ - 'Animal': ['Animal1', 'Animal2', 'Animal3', 'Animal2', 'Animal1', - 'Animal2', 'Animal3'], - 'FeedType': ['A', 'B', 'A', 'A', 'B', 'B', 'A'], - 'Amount': [10, 7, 4, 2, 5, 6, 2], - }) + df = pd.DataFrame( + { + "Animal": [ + "Animal1", + "Animal2", + "Animal3", + "Animal2", + "Animal1", + "Animal2", + "Animal3", + ], + "FeedType": ["A", "B", "A", "A", "B", "B", "A"], + "Amount": [10, 7, 4, 2, 5, 6, 2], + } + ) - df.pivot_table(values='Amount', index='Animal', columns='FeedType', - aggfunc='sum') + df.pivot_table(values="Amount", index="Animal", columns="FeedType", aggfunc="sum") The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: .. ipython:: python - df.groupby(['Animal', 'FeedType'])['Amount'].sum() + df.groupby(["Animal", "FeedType"])["Amount"].sum() For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. 
diff --git a/doc/source/getting_started/comparison/comparison_with_sas.rst b/doc/source/getting_started/comparison/comparison_with_sas.rst index 85c6ea2c31969..ae9f1caebd556 100644 --- a/doc/source/getting_started/comparison/comparison_with_sas.rst +++ b/doc/source/getting_started/comparison/comparison_with_sas.rst @@ -106,7 +106,7 @@ and the values are the data. .. ipython:: python - df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) + df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]}) df @@ -130,8 +130,10 @@ The pandas method is :func:`read_csv`, which works similarly. .. ipython:: python - url = ('https://raw.github.com/pandas-dev/' - 'pandas/master/pandas/tests/io/data/csv/tips.csv') + url = ( + "https://raw.github.com/pandas-dev/" + "pandas/master/pandas/tests/io/data/csv/tips.csv" + ) tips = pd.read_csv(url) tips.head() @@ -142,10 +144,10 @@ and did not have column names, the pandas command would be: .. code-block:: python - tips = pd.read_csv('tips.csv', sep='\t', header=None) + tips = pd.read_csv("tips.csv", sep="\t", header=None) # alternatively, read_table is an alias to read_csv with tab delimiter - tips = pd.read_table('tips.csv', header=None) + tips = pd.read_table("tips.csv", header=None) In addition to text/csv, pandas supports a variety of other data formats such as Excel, HDF5, and SQL databases. These are all read via a ``pd.read_*`` @@ -166,7 +168,7 @@ and other data formats follow a similar api. .. code-block:: python - tips.to_csv('tips2.csv') + tips.to_csv("tips2.csv") Data operations @@ -192,14 +194,14 @@ New columns can be assigned in the same way. .. ipython:: python - tips['total_bill'] = tips['total_bill'] - 2 - tips['new_bill'] = tips['total_bill'] / 2.0 + tips["total_bill"] = tips["total_bill"] - 2 + tips["new_bill"] = tips["total_bill"] / 2.0 tips.head() .. 
ipython:: python :suppress: - tips = tips.drop('new_bill', axis=1) + tips = tips.drop("new_bill", axis=1) Filtering ~~~~~~~~~ @@ -226,7 +228,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin .. ipython:: python - tips[tips['total_bill'] > 10].head() + tips[tips["total_bill"] > 10].head() If/then logic ~~~~~~~~~~~~~ @@ -248,13 +250,13 @@ the ``where`` method from ``numpy``. .. ipython:: python - tips['bucket'] = np.where(tips['total_bill'] < 10, 'low', 'high') + tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high") tips.head() .. ipython:: python :suppress: - tips = tips.drop('bucket', axis=1) + tips = tips.drop("bucket", axis=1) Date functionality ~~~~~~~~~~~~~~~~~~ @@ -284,22 +286,26 @@ see the :ref:`timeseries documentation` for more details. .. ipython:: python - tips['date1'] = pd.Timestamp('2013-01-15') - tips['date2'] = pd.Timestamp('2015-02-15') - tips['date1_year'] = tips['date1'].dt.year - tips['date2_month'] = tips['date2'].dt.month - tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = ( - tips['date2'].dt.to_period('M') - tips['date1'].dt.to_period('M')) + tips["date1"] = pd.Timestamp("2013-01-15") + tips["date2"] = pd.Timestamp("2015-02-15") + tips["date1_year"] = tips["date1"].dt.year + tips["date2_month"] = tips["date2"].dt.month + tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin() + tips["months_between"] = tips["date2"].dt.to_period("M") - tips[ + "date1" + ].dt.to_period("M") - tips[['date1', 'date2', 'date1_year', 'date2_month', - 'date1_next', 'months_between']].head() + tips[ + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"] + ].head() .. 
ipython:: python :suppress: - tips = tips.drop(['date1', 'date2', 'date1_year', - 'date2_month', 'date1_next', 'months_between'], axis=1) + tips = tips.drop( + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"], + axis=1, + ) Selection of columns ~~~~~~~~~~~~~~~~~~~~ @@ -329,13 +335,13 @@ The same operations are expressed in pandas below. .. ipython:: python # keep - tips[['sex', 'total_bill', 'tip']].head() + tips[["sex", "total_bill", "tip"]].head() # drop - tips.drop('sex', axis=1).head() + tips.drop("sex", axis=1).head() # rename - tips.rename(columns={'total_bill': 'total_bill_2'}).head() + tips.rename(columns={"total_bill": "total_bill_2"}).head() Sorting by values @@ -354,7 +360,7 @@ takes a list of columns to sort by. .. ipython:: python - tips = tips.sort_values(['sex', 'total_bill']) + tips = tips.sort_values(["sex", "total_bill"]) tips.head() @@ -383,8 +389,8 @@ trailing blanks. .. ipython:: python - tips['time'].str.len().head() - tips['time'].str.rstrip().str.len().head() + tips["time"].str.len().head() + tips["time"].str.rstrip().str.len().head() Find @@ -410,7 +416,7 @@ the function will return -1 if it fails to find the substring. .. ipython:: python - tips['sex'].str.find("ale").head() + tips["sex"].str.find("ale").head() Substring @@ -432,7 +438,7 @@ indexes are zero-based. .. ipython:: python - tips['sex'].str[0:1].head() + tips["sex"].str[0:1].head() Scan @@ -460,9 +466,9 @@ approaches, but this just shows a simple approach. .. 
ipython:: python - firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) - firstlast['First_Name'] = firstlast['String'].str.split(" ", expand=True)[0] - firstlast['Last_Name'] = firstlast['String'].str.rsplit(" ", expand=True)[0] + firstlast = pd.DataFrame({"String": ["John Smith", "Jane Cook"]}) + firstlast["First_Name"] = firstlast["String"].str.split(" ", expand=True)[0] + firstlast["Last_Name"] = firstlast["String"].str.rsplit(" ", expand=True)[0] firstlast @@ -491,10 +497,10 @@ The equivalent Python functions are ``upper``, ``lower``, and ``title``. .. ipython:: python - firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']}) - firstlast['string_up'] = firstlast['String'].str.upper() - firstlast['string_low'] = firstlast['String'].str.lower() - firstlast['string_prop'] = firstlast['String'].str.title() + firstlast = pd.DataFrame({"String": ["John Smith", "Jane Cook"]}) + firstlast["string_up"] = firstlast["String"].str.upper() + firstlast["string_low"] = firstlast["String"].str.lower() + firstlast["string_prop"] = firstlast["String"].str.title() firstlast Merging @@ -504,11 +510,9 @@ The following tables will be used in the merge examples .. ipython:: python - df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], - 'value': np.random.randn(4)}) + df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)}) df1 - df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)}) df2 In SAS, data must be explicitly sorted before merging. Different @@ -542,16 +546,16 @@ types are accomplished via the ``how`` keyword. .. 
ipython:: python - inner_join = df1.merge(df2, on=['key'], how='inner') + inner_join = df1.merge(df2, on=["key"], how="inner") inner_join - left_join = df1.merge(df2, on=['key'], how='left') + left_join = df1.merge(df2, on=["key"], how="left") left_join - right_join = df1.merge(df2, on=['key'], how='right') + right_join = df1.merge(df2, on=["key"], how="right") right_join - outer_join = df1.merge(df2, on=['key'], how='outer') + outer_join = df1.merge(df2, on=["key"], how="outer") outer_join @@ -566,8 +570,8 @@ operations, and is ignored by default for aggregations. .. ipython:: python outer_join - outer_join['value_x'] + outer_join['value_y'] - outer_join['value_x'].sum() + outer_join["value_x"] + outer_join["value_y"] + outer_join["value_x"].sum() One difference is that missing data cannot be compared to its sentinel value. For example, in SAS you could do this to filter missing values. @@ -589,8 +593,8 @@ should be used for comparisons. .. ipython:: python - outer_join[pd.isna(outer_join['value_x'])] - outer_join[pd.notna(outer_join['value_x'])] + outer_join[pd.isna(outer_join["value_x"])] + outer_join[pd.notna(outer_join["value_x"])] pandas also provides a variety of methods to work with missing data - some of which would be challenging to express in SAS. For example, there are methods to @@ -601,8 +605,8 @@ value, like the mean, or forward filling from previous rows. See the .. ipython:: python outer_join.dropna() - outer_join.fillna(method='ffill') - outer_join['value_x'].fillna(outer_join['value_x'].mean()) + outer_join.fillna(method="ffill") + outer_join["value_x"].fillna(outer_join["value_x"].mean()) GroupBy @@ -629,7 +633,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() + tips_summed = tips.groupby(["sex", "smoker"])[["total_bill", "tip"]].sum() tips_summed.head() @@ -666,8 +670,8 @@ operation. .. 
ipython:: python - gb = tips.groupby('smoker')['total_bill'] - tips['adj_total_bill'] = tips['total_bill'] - gb.transform('mean') + gb = tips.groupby("smoker")["total_bill"] + tips["adj_total_bill"] = tips["total_bill"] - gb.transform("mean") tips.head() @@ -695,7 +699,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex', 'smoker']).first() + tips.groupby(["sex", "smoker"]).first() Other considerations @@ -729,16 +733,16 @@ the XPORT or SAS7BDAT binary format. .. code-block:: python - df = pd.read_sas('transport-file.xpt') - df = pd.read_sas('binary-file.sas7bdat') + df = pd.read_sas("transport-file.xpt") + df = pd.read_sas("binary-file.sas7bdat") You can also specify the file format directly. By default, pandas will try to infer the file format based on its extension. .. code-block:: python - df = pd.read_sas('transport-file.xpt', format='xport') - df = pd.read_sas('binary-file.sas7bdat', format='sas7bdat') + df = pd.read_sas("transport-file.xpt", format="xport") + df = pd.read_sas("binary-file.sas7bdat", format="sas7bdat") XPORT is a relatively limited format and the parsing of it is not as optimized as some of the other pandas readers. An alternative way @@ -752,4 +756,4 @@ to interop data between SAS and pandas is to serialize to csv. Wall time: 14.6 s In [9]: %time df = pd.read_csv('big.csv') - Wall time: 4.86 s \ No newline at end of file + Wall time: 4.86 s diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst index 04f97a27cde39..6848d8df2e46b 100644 --- a/doc/source/getting_started/comparison/comparison_with_sql.rst +++ b/doc/source/getting_started/comparison/comparison_with_sql.rst @@ -24,8 +24,10 @@ structure. .. 
ipython:: python - url = ('https://raw.github.com/pandas-dev' - '/pandas/master/pandas/tests/io/data/csv/tips.csv') + url = ( + "https://raw.github.com/pandas-dev" + "/pandas/master/pandas/tests/io/data/csv/tips.csv" + ) tips = pd.read_csv(url) tips.head() @@ -44,7 +46,7 @@ With pandas, column selection is done by passing a list of column names to your .. ipython:: python - tips[['total_bill', 'tip', 'smoker', 'time']].head(5) + tips[["total_bill", "tip", "smoker", "time"]].head(5) Calling the DataFrame without the list of column names would display all columns (akin to SQL's ``*``). @@ -61,7 +63,7 @@ With pandas, you can use the :meth:`DataFrame.assign` method of a DataFrame to a .. ipython:: python - tips.assign(tip_rate=tips['tip'] / tips['total_bill']).head(5) + tips.assign(tip_rate=tips["tip"] / tips["total_bill"]).head(5) WHERE ----- @@ -79,14 +81,14 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin .. ipython:: python - tips[tips['time'] == 'Dinner'].head(5) + tips[tips["time"] == "Dinner"].head(5) The above statement is simply passing a ``Series`` of True/False objects to the DataFrame, returning all rows with True. .. ipython:: python - is_dinner = tips['time'] == 'Dinner' + is_dinner = tips["time"] == "Dinner" is_dinner.value_counts() tips[is_dinner].head(5) @@ -103,7 +105,7 @@ Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame usi .. ipython:: python # tips of more than $5.00 at Dinner meals - tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)] + tips[(tips["time"] == "Dinner") & (tips["tip"] > 5.00)] .. code-block:: sql @@ -115,15 +117,16 @@ Just like SQL's OR and AND, multiple conditions can be passed to a DataFrame usi .. 
ipython:: python # tips by parties of at least 5 diners OR bill total was more than $45 - tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)] + tips[(tips["size"] >= 5) | (tips["total_bill"] > 45)] NULL checking is done using the :meth:`~pandas.Series.notna` and :meth:`~pandas.Series.isna` methods. .. ipython:: python - frame = pd.DataFrame({'col1': ['A', 'B', np.NaN, 'C', 'D'], - 'col2': ['F', np.NaN, 'G', 'H', 'I']}) + frame = pd.DataFrame( + {"col1": ["A", "B", np.NaN, "C", "D"], "col2": ["F", np.NaN, "G", "H", "I"]} + ) frame Assume we have a table of the same structure as our DataFrame above. We can see only the records @@ -137,7 +140,7 @@ where ``col2`` IS NULL with the following query: .. ipython:: python - frame[frame['col2'].isna()] + frame[frame["col2"].isna()] Getting items where ``col1`` IS NOT NULL can be done with :meth:`~pandas.Series.notna`. @@ -149,7 +152,7 @@ Getting items where ``col1`` IS NOT NULL can be done with :meth:`~pandas.Series. .. ipython:: python - frame[frame['col1'].notna()] + frame[frame["col1"].notna()] GROUP BY @@ -177,7 +180,7 @@ The pandas equivalent would be: .. ipython:: python - tips.groupby('sex').size() + tips.groupby("sex").size() Notice that in the pandas code we used :meth:`~pandas.core.groupby.DataFrameGroupBy.size` and not :meth:`~pandas.core.groupby.DataFrameGroupBy.count`. This is because @@ -186,14 +189,14 @@ the number of ``not null`` records within each. .. ipython:: python - tips.groupby('sex').count() + tips.groupby("sex").count() Alternatively, we could have applied the :meth:`~pandas.core.groupby.DataFrameGroupBy.count` method to an individual column: .. ipython:: python - tips.groupby('sex')['total_bill'].count() + tips.groupby("sex")["total_bill"].count() Multiple functions can also be applied at once. 
For instance, say we'd like to see how tip amount differs by day of the week - :meth:`~pandas.core.groupby.DataFrameGroupBy.agg` allows you to pass a dictionary @@ -213,7 +216,7 @@ to your grouped DataFrame, indicating which functions to apply to specific colum .. ipython:: python - tips.groupby('day').agg({'tip': np.mean, 'day': np.size}) + tips.groupby("day").agg({"tip": np.mean, "day": np.size}) Grouping by more than one column is done by passing a list of columns to the :meth:`~pandas.DataFrame.groupby` method. @@ -237,7 +240,7 @@ Grouping by more than one column is done by passing a list of columns to the .. ipython:: python - tips.groupby(['smoker', 'day']).agg({'tip': [np.size, np.mean]}) + tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]}) .. _compare_with_sql.join: @@ -250,10 +253,8 @@ columns to join on (column names or indices). .. ipython:: python - df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], - 'value': np.random.randn(4)}) - df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)}) + df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)}) Assume we have two database tables of the same name and structure as our DataFrames. @@ -271,15 +272,15 @@ INNER JOIN .. ipython:: python # merge performs an INNER JOIN by default - pd.merge(df1, df2, on='key') + pd.merge(df1, df2, on="key") :meth:`~pandas.merge` also offers parameters for cases when you'd like to join one DataFrame's column with another DataFrame's index. .. ipython:: python - indexed_df2 = df2.set_index('key') - pd.merge(df1, indexed_df2, left_on='key', right_index=True) + indexed_df2 = df2.set_index("key") + pd.merge(df1, indexed_df2, left_on="key", right_index=True) LEFT OUTER JOIN ~~~~~~~~~~~~~~~ @@ -294,7 +295,7 @@ LEFT OUTER JOIN .. 
ipython:: python # show all records from df1 - pd.merge(df1, df2, on='key', how='left') + pd.merge(df1, df2, on="key", how="left") RIGHT JOIN ~~~~~~~~~~ @@ -309,7 +310,7 @@ RIGHT JOIN .. ipython:: python # show all records from df2 - pd.merge(df1, df2, on='key', how='right') + pd.merge(df1, df2, on="key", how="right") FULL JOIN ~~~~~~~~~ @@ -327,7 +328,7 @@ joined columns find a match. As of writing, FULL JOINs are not supported in all .. ipython:: python # show all records from both frames - pd.merge(df1, df2, on='key', how='outer') + pd.merge(df1, df2, on="key", how="outer") UNION @@ -336,10 +337,12 @@ UNION ALL can be performed using :meth:`~pandas.concat`. .. ipython:: python - df1 = pd.DataFrame({'city': ['Chicago', 'San Francisco', 'New York City'], - 'rank': range(1, 4)}) - df2 = pd.DataFrame({'city': ['Chicago', 'Boston', 'Los Angeles'], - 'rank': [1, 4, 5]}) + df1 = pd.DataFrame( + {"city": ["Chicago", "San Francisco", "New York City"], "rank": range(1, 4)} + ) + df2 = pd.DataFrame( + {"city": ["Chicago", "Boston", "Los Angeles"], "rank": [1, 4, 5]} + ) .. code-block:: sql @@ -403,7 +406,7 @@ Top n rows with offset .. ipython:: python - tips.nlargest(10 + 5, columns='tip').tail(10) + tips.nlargest(10 + 5, columns="tip").tail(10) Top n rows per group ~~~~~~~~~~~~~~~~~~~~ @@ -423,20 +426,30 @@ Top n rows per group .. ipython:: python - (tips.assign(rn=tips.sort_values(['total_bill'], ascending=False) - .groupby(['day']) - .cumcount() + 1) - .query('rn < 3') - .sort_values(['day', 'rn'])) + ( + tips.assign( + rn=tips.sort_values(["total_bill"], ascending=False) + .groupby(["day"]) + .cumcount() + + 1 + ) + .query("rn < 3") + .sort_values(["day", "rn"]) + ) the same using ``rank(method='first')`` function .. 
ipython:: python - (tips.assign(rnk=tips.groupby(['day'])['total_bill'] - .rank(method='first', ascending=False)) - .query('rnk < 3') - .sort_values(['day', 'rnk'])) + ( + tips.assign( + rnk=tips.groupby(["day"])["total_bill"].rank( + method="first", ascending=False + ) + ) + .query("rnk < 3") + .sort_values(["day", "rnk"]) + ) .. code-block:: sql @@ -458,11 +471,12 @@ Notice that when using ``rank(method='min')`` function .. ipython:: python - (tips[tips['tip'] < 2] - .assign(rnk_min=tips.groupby(['sex'])['tip'] - .rank(method='min')) - .query('rnk_min < 3') - .sort_values(['sex', 'rnk_min'])) + ( + tips[tips["tip"] < 2] + .assign(rnk_min=tips.groupby(["sex"])["tip"].rank(method="min")) + .query("rnk_min < 3") + .sort_values(["sex", "rnk_min"]) + ) UPDATE @@ -476,7 +490,7 @@ UPDATE .. ipython:: python - tips.loc[tips['tip'] < 2, 'tip'] *= 2 + tips.loc[tips["tip"] < 2, "tip"] *= 2 DELETE ------ @@ -490,4 +504,4 @@ In pandas we select the rows that should remain, instead of deleting them .. ipython:: python - tips = tips.loc[tips['tip'] <= 9] + tips = tips.loc[tips["tip"] <= 9] diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index 06f9e45466243..7b8d9c6be61db 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -103,7 +103,7 @@ and the values are the data. .. ipython:: python - df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) + df = pd.DataFrame({"x": [1, 3, 5], "y": [2, 4, 6]}) df @@ -127,8 +127,10 @@ the data set if presented with a url. .. ipython:: python - url = ('https://raw.github.com/pandas-dev' - '/pandas/master/pandas/tests/io/data/csv/tips.csv') + url = ( + "https://raw.github.com/pandas-dev" + "/pandas/master/pandas/tests/io/data/csv/tips.csv" + ) tips = pd.read_csv(url) tips.head() @@ -139,16 +141,16 @@ the pandas command would be: .. 
code-block:: python - tips = pd.read_csv('tips.csv', sep='\t', header=None) + tips = pd.read_csv("tips.csv", sep="\t", header=None) # alternatively, read_table is an alias to read_csv with tab delimiter - tips = pd.read_table('tips.csv', header=None) + tips = pd.read_table("tips.csv", header=None) Pandas can also read Stata data sets in ``.dta`` format with the :func:`read_stata` function. .. code-block:: python - df = pd.read_stata('data.dta') + df = pd.read_stata("data.dta") In addition to text/csv and Stata files, pandas supports a variety of other data formats such as Excel, SAS, HDF5, Parquet, and SQL databases. These are all read via a ``pd.read_*`` @@ -168,13 +170,13 @@ Similarly in pandas, the opposite of ``read_csv`` is :meth:`DataFrame.to_csv`. .. code-block:: python - tips.to_csv('tips2.csv') + tips.to_csv("tips2.csv") Pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` method. .. code-block:: python - tips.to_stata('tips2.dta') + tips.to_stata("tips2.dta") Data operations @@ -200,11 +202,11 @@ drops a column from the ``DataFrame``. .. ipython:: python - tips['total_bill'] = tips['total_bill'] - 2 - tips['new_bill'] = tips['total_bill'] / 2 + tips["total_bill"] = tips["total_bill"] - 2 + tips["new_bill"] = tips["total_bill"] / 2 tips.head() - tips = tips.drop('new_bill', axis=1) + tips = tips.drop("new_bill", axis=1) Filtering ~~~~~~~~~ @@ -220,7 +222,7 @@ DataFrames can be filtered in multiple ways; the most intuitive of which is usin .. ipython:: python - tips[tips['total_bill'] > 10].head() + tips[tips["total_bill"] > 10].head() If/then logic ~~~~~~~~~~~~~ @@ -237,13 +239,13 @@ the ``where`` method from ``numpy``. .. ipython:: python - tips['bucket'] = np.where(tips['total_bill'] < 10, 'low', 'high') + tips["bucket"] = np.where(tips["total_bill"] < 10, "low", "high") tips.head() .. 
ipython:: python :suppress: - tips = tips.drop('bucket', axis=1) + tips = tips.drop("bucket", axis=1) Date functionality ~~~~~~~~~~~~~~~~~~ @@ -273,22 +275,26 @@ see the :ref:`timeseries documentation` for more details. .. ipython:: python - tips['date1'] = pd.Timestamp('2013-01-15') - tips['date2'] = pd.Timestamp('2015-02-15') - tips['date1_year'] = tips['date1'].dt.year - tips['date2_month'] = tips['date2'].dt.month - tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = (tips['date2'].dt.to_period('M') - - tips['date1'].dt.to_period('M')) + tips["date1"] = pd.Timestamp("2013-01-15") + tips["date2"] = pd.Timestamp("2015-02-15") + tips["date1_year"] = tips["date1"].dt.year + tips["date2_month"] = tips["date2"].dt.month + tips["date1_next"] = tips["date1"] + pd.offsets.MonthBegin() + tips["months_between"] = tips["date2"].dt.to_period("M") - tips[ + "date1" + ].dt.to_period("M") - tips[['date1', 'date2', 'date1_year', 'date2_month', 'date1_next', - 'months_between']].head() + tips[ + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"] + ].head() .. ipython:: python :suppress: - tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month', - 'date1_next', 'months_between'], axis=1) + tips = tips.drop( + ["date1", "date2", "date1_year", "date2_month", "date1_next", "months_between"], + axis=1, + ) Selection of columns ~~~~~~~~~~~~~~~~~~~~ @@ -310,13 +316,13 @@ to a variable. .. ipython:: python # keep - tips[['sex', 'total_bill', 'tip']].head() + tips[["sex", "total_bill", "tip"]].head() # drop - tips.drop('sex', axis=1).head() + tips.drop("sex", axis=1).head() # rename - tips.rename(columns={'total_bill': 'total_bill_2'}).head() + tips.rename(columns={"total_bill": "total_bill_2"}).head() Sorting by values @@ -333,7 +339,7 @@ takes a list of columns to sort by. .. 
ipython:: python - tips = tips.sort_values(['sex', 'total_bill']) + tips = tips.sort_values(["sex", "total_bill"]) tips.head() @@ -357,8 +363,8 @@ Use ``len`` and ``rstrip`` to exclude trailing blanks. .. ipython:: python - tips['time'].str.len().head() - tips['time'].str.rstrip().str.len().head() + tips["time"].str.len().head() + tips["time"].str.rstrip().str.len().head() Finding position of substring @@ -380,7 +386,7 @@ the function will return -1 if it fails to find the substring. .. ipython:: python - tips['sex'].str.find("ale").head() + tips["sex"].str.find("ale").head() Extracting substring by position @@ -398,7 +404,7 @@ indexes are zero-based. .. ipython:: python - tips['sex'].str[0:1].head() + tips["sex"].str[0:1].head() Extracting nth word @@ -425,9 +431,9 @@ approaches, but this just shows a simple approach. .. ipython:: python - firstlast = pd.DataFrame({'string': ['John Smith', 'Jane Cook']}) - firstlast['First_Name'] = firstlast['string'].str.split(" ", expand=True)[0] - firstlast['Last_Name'] = firstlast['string'].str.rsplit(" ", expand=True)[0] + firstlast = pd.DataFrame({"string": ["John Smith", "Jane Cook"]}) + firstlast["First_Name"] = firstlast["string"].str.split(" ", expand=True)[0] + firstlast["Last_Name"] = firstlast["string"].str.rsplit(" ", expand=True)[0] firstlast @@ -455,10 +461,10 @@ The equivalent Python functions are ``upper``, ``lower``, and ``title``. .. ipython:: python - firstlast = pd.DataFrame({'string': ['John Smith', 'Jane Cook']}) - firstlast['upper'] = firstlast['string'].str.upper() - firstlast['lower'] = firstlast['string'].str.lower() - firstlast['title'] = firstlast['string'].str.title() + firstlast = pd.DataFrame({"string": ["John Smith", "Jane Cook"]}) + firstlast["upper"] = firstlast["string"].str.upper() + firstlast["lower"] = firstlast["string"].str.lower() + firstlast["title"] = firstlast["string"].str.title() firstlast Merging @@ -468,11 +474,9 @@ The following tables will be used in the merge examples .. 
ipython:: python - df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], - 'value': np.random.randn(4)}) + df1 = pd.DataFrame({"key": ["A", "B", "C", "D"], "value": np.random.randn(4)}) df1 - df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + df2 = pd.DataFrame({"key": ["B", "D", "D", "E"], "value": np.random.randn(4)}) df2 In Stata, to perform a merge, one data set must be in memory @@ -534,16 +538,16 @@ types are accomplished via the ``how`` keyword. .. ipython:: python - inner_join = df1.merge(df2, on=['key'], how='inner') + inner_join = df1.merge(df2, on=["key"], how="inner") inner_join - left_join = df1.merge(df2, on=['key'], how='left') + left_join = df1.merge(df2, on=["key"], how="left") left_join - right_join = df1.merge(df2, on=['key'], how='right') + right_join = df1.merge(df2, on=["key"], how="right") right_join - outer_join = df1.merge(df2, on=['key'], how='outer') + outer_join = df1.merge(df2, on=["key"], how="outer") outer_join @@ -558,8 +562,8 @@ operations, and is ignored by default for aggregations. .. ipython:: python outer_join - outer_join['value_x'] + outer_join['value_y'] - outer_join['value_x'].sum() + outer_join["value_x"] + outer_join["value_y"] + outer_join["value_x"].sum() One difference is that missing data cannot be compared to its sentinel value. For example, in Stata you could do this to filter missing values. @@ -576,8 +580,8 @@ should be used for comparisons. .. ipython:: python - outer_join[pd.isna(outer_join['value_x'])] - outer_join[pd.notna(outer_join['value_x'])] + outer_join[pd.isna(outer_join["value_x"])] + outer_join[pd.notna(outer_join["value_x"])] Pandas also provides a variety of methods to work with missing data -- some of which would be challenging to express in Stata. For example, there are methods to @@ -591,10 +595,10 @@ value, like the mean, or forward filling from previous rows. 
See the outer_join.dropna() # Fill forwards - outer_join.fillna(method='ffill') + outer_join.fillna(method="ffill") # Impute missing values with the mean - outer_join['value_x'].fillna(outer_join['value_x'].mean()) + outer_join["value_x"].fillna(outer_join["value_x"].mean()) GroupBy @@ -617,7 +621,7 @@ for more details and examples. .. ipython:: python - tips_summed = tips.groupby(['sex', 'smoker'])[['total_bill', 'tip']].sum() + tips_summed = tips.groupby(["sex", "smoker"])[["total_bill", "tip"]].sum() tips_summed.head() @@ -640,8 +644,8 @@ operation. .. ipython:: python - gb = tips.groupby('smoker')['total_bill'] - tips['adj_total_bill'] = tips['total_bill'] - gb.transform('mean') + gb = tips.groupby("smoker")["total_bill"] + tips["adj_total_bill"] = tips["total_bill"] - gb.transform("mean") tips.head() @@ -661,7 +665,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex', 'smoker']).first() + tips.groupby(["sex", "smoker"]).first() Other considerations From 5e1992e7339194511c73245c92ebe3ab6c110c13 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 14:12:35 -0700 Subject: [PATCH 0978/1025] ERR: error handling in DataFrame.__rmatmul__ (#36792) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/frame.py | 9 ++++++++- pandas/tests/frame/test_analytics.py | 14 ++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 3bfb507d2e140..8688b2ae81302 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -327,7 +327,7 @@ Numeric - Bug in :meth:`Series.equals` where a ``ValueError`` was raised when numpy arrays were compared to scalars (:issue:`35267`) - Bug in :class:`Series` where two :class:`Series` each have a :class:`DatetimeIndex` with different timezones having those indexes incorrectly changed when performing arithmetic operations (:issue:`33671`) - Bug in :meth:`pd._testing.assert_almost_equal` was 
incorrect for complex numeric types (:issue:`28235`) -- +- Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 75fdeb122a074..13443cc3befd3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1216,7 +1216,14 @@ def __rmatmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5. """ - return self.T.dot(np.transpose(other)).T + try: + return self.T.dot(np.transpose(other)).T + except ValueError as err: + if "shape mismatch" not in str(err): + raise + # GH#21581 give exception message for original shapes + msg = f"shapes {np.shape(other)} and {self.shape} not aligned" + raise ValueError(msg) from err # ---------------------------------------------------------------------- # IO methods (to / from other formats) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 4324b03ed13d6..ee136533b0775 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1177,6 +1177,20 @@ def test_matmul(self): with pytest.raises(ValueError, match="aligned"): operator.matmul(df, df2) + def test_matmul_message_shapes(self): + # GH#21581 exception message should reflect original shapes, + # not transposed shapes + a = np.random.rand(10, 4) + b = np.random.rand(5, 3) + + df = DataFrame(b) + + msg = r"shapes \(10, 4\) and \(5, 3\) not aligned" + with pytest.raises(ValueError, match=msg): + a @ df + with pytest.raises(ValueError, match=msg): + a.tolist() @ df + # --------------------------------------------------------------------- # Unsorted From 972651fb8181293c0e373401974baaaf86214954 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 2 Oct 2020 22:13:21 +0100 Subject: [PATCH 0979/1025] CLN: Remove param _set_identity in MultiIndex (#36786) --- pandas/core/indexes/multi.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) 
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1628b44be4096..a157fdfdde447 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -278,7 +278,6 @@ def __new__( copy=False, name=None, verify_integrity: bool = True, - _set_identity: bool = True, ): # compat with Index @@ -312,10 +311,7 @@ def __new__( new_codes = result._verify_integrity() result._codes = new_codes - if _set_identity: - result._reset_identity() - - return result + return result._reset_identity() def _validate_codes(self, level: List, code: List): """ @@ -1071,7 +1067,6 @@ def _shallow_copy( codes=None, sortorder=None, names=lib.no_default, - _set_identity: bool = True, ): if names is not lib.no_default and name is not lib.no_default: raise TypeError("Can only provide one of `names` and `name`") @@ -1091,7 +1086,6 @@ def _shallow_copy( sortorder=sortorder, names=names, verify_integrity=False, - _set_identity=_set_identity, ) result._cache = self._cache.copy() result._cache.pop("levels", None) # GH32669 @@ -1119,7 +1113,6 @@ def copy( codes=None, deep=False, name=None, - _set_identity=False, ): """ Make a copy of this object. 
Names, dtype, levels and codes can be @@ -1180,7 +1173,6 @@ def copy( codes=codes, names=names, sortorder=self.sortorder, - _set_identity=_set_identity, ) if dtype: From 8b68ecbc022d095d1a7b63af375a7f81e813f759 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 14:29:24 -0700 Subject: [PATCH 0980/1025] BUG: DTI/TDI.equals with i8 (#36744) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/indexes/datetimelike.py | 2 ++ pandas/core/ops/__init__.py | 4 +++- pandas/tests/computation/test_eval.py | 14 +++++++------- pandas/tests/indexes/datetimelike.py | 7 +++++++ 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 8688b2ae81302..af12dc90d5290 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -305,7 +305,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`, :issue:`36254`) - Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` setitem casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) - Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) - +- Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 23cc93b9ecb33..d2162d987ccd6 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -134,6 +134,8 @@ def equals(self, other: object) -> bool: if not isinstance(other, Index): return False + elif other.dtype.kind in ["f", "i", "u", "c"]: + return 
False elif not isinstance(other, type(self)): try: other = type(self)(other) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 2dc97a3583dfb..ab04cbfff3f9a 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -539,7 +539,9 @@ def _should_reindex_frame_op( if fill_value is None and level is None and axis is default_axis: # TODO: any other cases we should handle here? cols = left.columns.intersection(right.columns) - if not (cols.equals(left.columns) and cols.equals(right.columns)): + + if len(cols) and not (cols.equals(left.columns) and cols.equals(right.columns)): + # TODO: is there a shortcut available when len(cols) == 0? return True return False diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index cca64a6bf487c..b78c7775e8a37 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -1054,15 +1054,15 @@ def test_complex_series_frame_alignment(self, engine, parser): m2, n, data_gen_f=f, r_idx_type=r2, c_idx_type=c2 ) index = getattr(locals().get(obj_name), index_name) - s = Series(np.random.randn(n), index[:n]) + ser = Series(np.random.randn(n), index[:n]) if r2 == "dt" or c2 == "dt": if engine == "numexpr": - expected2 = df2.add(s) + expected2 = df2.add(ser) else: - expected2 = df2 + s + expected2 = df2 + ser else: - expected2 = df2 + s + expected2 = df2 + ser if r1 == "dt" or c1 == "dt": if engine == "numexpr": @@ -1072,11 +1072,11 @@ def test_complex_series_frame_alignment(self, engine, parser): else: expected = expected2 + df - if should_warn(df2.index, s.index, df.index): + if should_warn(df2.index, ser.index, df.index): with tm.assert_produces_warning(RuntimeWarning): - res = pd.eval("df2 + s + df", engine=engine, parser=parser) + res = pd.eval("df2 + ser + df", engine=engine, parser=parser) else: - res = pd.eval("df2 + s + df", engine=engine, parser=parser) + res = pd.eval("df2 + ser + df", engine=engine, parser=parser) 
assert res.shape == expected.shape tm.assert_frame_equal(res, expected) diff --git a/pandas/tests/indexes/datetimelike.py b/pandas/tests/indexes/datetimelike.py index f667e5a610419..71ae1d6bda9c7 100644 --- a/pandas/tests/indexes/datetimelike.py +++ b/pandas/tests/indexes/datetimelike.py @@ -108,3 +108,10 @@ def test_getitem_preserves_freq(self): result = index[:] assert result.freq == index.freq + + def test_not_equals_numeric(self): + index = self.create_index() + + assert not index.equals(pd.Index(index.asi8)) + assert not index.equals(pd.Index(index.asi8.astype("u8"))) + assert not index.equals(pd.Index(index.asi8).astype("f8")) From 497be4001b291a5c94c4a86574cb3468f30eae97 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 2 Oct 2020 14:39:26 -0700 Subject: [PATCH 0981/1025] CLN: test_moments_consistency_*.py (#36810) --- pandas/tests/window/common.py | 43 ------------------- .../moments/test_moments_consistency_ewm.py | 43 +++++++++++++++---- .../test_moments_consistency_rolling.py | 8 +++- 3 files changed, 40 insertions(+), 54 deletions(-) diff --git a/pandas/tests/window/common.py b/pandas/tests/window/common.py index 7e0be331ec8d5..7c8c9de40f7c5 100644 --- a/pandas/tests/window/common.py +++ b/pandas/tests/window/common.py @@ -4,49 +4,6 @@ import pandas._testing as tm -def check_pairwise_moment(frame, dispatch, name, **kwargs): - def get_result(obj, obj2=None): - return getattr(getattr(obj, dispatch)(**kwargs), name)(obj2) - - result = get_result(frame) - result = result.loc[(slice(None), 1), 5] - result.index = result.index.droplevel(1) - expected = get_result(frame[1], frame[5]) - expected.index = expected.index._with_freq(None) - tm.assert_series_equal(result, expected, check_names=False) - - -def ew_func(A, B, com, name, **kwargs): - return getattr(A.ewm(com, **kwargs), name)(B) - - -def check_binary_ew(name, A, B): - - result = ew_func(A=A, B=B, com=20, name=name, min_periods=5) - assert np.isnan(result.values[:14]).all() - assert not 
np.isnan(result.values[14:]).any() - - -def check_binary_ew_min_periods(name, min_periods, A, B): - # GH 7898 - result = ew_func(A, B, 20, name=name, min_periods=min_periods) - # binary functions (ewmcov, ewmcorr) with bias=False require at - # least two values - assert np.isnan(result.values[:11]).all() - assert not np.isnan(result.values[11:]).any() - - # check series of length 0 - empty = Series([], dtype=np.float64) - result = ew_func(empty, empty, 50, name=name, min_periods=min_periods) - tm.assert_series_equal(result, empty) - - # check series of length 1 - result = ew_func( - Series([1.0]), Series([1.0]), 50, name=name, min_periods=min_periods - ) - tm.assert_series_equal(result, Series([np.NaN])) - - def moments_consistency_mock_mean(x, mean, mock_mean): mean_x = mean(x) # check that correlation of a series with itself is either 1 or NaN diff --git a/pandas/tests/window/moments/test_moments_consistency_ewm.py b/pandas/tests/window/moments/test_moments_consistency_ewm.py index f143278e12ec5..089ec697b5b1c 100644 --- a/pandas/tests/window/moments/test_moments_consistency_ewm.py +++ b/pandas/tests/window/moments/test_moments_consistency_ewm.py @@ -3,11 +3,8 @@ import pytest from pandas import DataFrame, Series, concat +import pandas._testing as tm from pandas.tests.window.common import ( - check_binary_ew, - check_binary_ew_min_periods, - check_pairwise_moment, - ew_func, moments_consistency_cov_data, moments_consistency_is_constant, moments_consistency_mock_mean, @@ -20,15 +17,43 @@ @pytest.mark.parametrize("func", ["cov", "corr"]) def test_ewm_pairwise_cov_corr(func, frame): - check_pairwise_moment(frame, "ewm", func, span=10, min_periods=5) + result = getattr(frame.ewm(span=10, min_periods=5), func)() + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) + expected = getattr(frame[1].ewm(span=10, min_periods=5), func)(frame[5]) + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected, 
check_names=False) @pytest.mark.parametrize("name", ["cov", "corr"]) -def test_ewm_corr_cov(name, min_periods, binary_ew_data): +def test_ewm_corr_cov(name, binary_ew_data): A, B = binary_ew_data - check_binary_ew(name="corr", A=A, B=B) - check_binary_ew_min_periods("corr", min_periods, A, B) + result = getattr(A.ewm(com=20, min_periods=5), name)(B) + assert np.isnan(result.values[:14]).all() + assert not np.isnan(result.values[14:]).any() + + +@pytest.mark.parametrize("name", ["cov", "corr"]) +def test_ewm_corr_cov_min_periods(name, min_periods, binary_ew_data): + # GH 7898 + A, B = binary_ew_data + result = getattr(A.ewm(com=20, min_periods=min_periods), name)(B) + # binary functions (ewmcov, ewmcorr) with bias=False require at + # least two values + assert np.isnan(result.values[:11]).all() + assert not np.isnan(result.values[11:]).any() + + # check series of length 0 + empty = Series([], dtype=np.float64) + result = getattr(empty.ewm(com=50, min_periods=min_periods), name)(empty) + tm.assert_series_equal(result, empty) + + # check series of length 1 + result = getattr(Series([1.0]).ewm(com=50, min_periods=min_periods), name)( + Series([1.0]) + ) + tm.assert_series_equal(result, Series([np.NaN])) @pytest.mark.parametrize("name", ["cov", "corr"]) @@ -38,7 +63,7 @@ def test_different_input_array_raise_exception(name, binary_ew_data): msg = "Input arrays must be of the same type!" 
# exception raised is Exception with pytest.raises(Exception, match=msg): - ew_func(A, randn(50), 20, name=name, min_periods=5) + getattr(A.ewm(com=20, min_periods=5), name)(randn(50)) @pytest.mark.slow diff --git a/pandas/tests/window/moments/test_moments_consistency_rolling.py b/pandas/tests/window/moments/test_moments_consistency_rolling.py index 99c2c4dd0045b..96ad83f6b40b1 100644 --- a/pandas/tests/window/moments/test_moments_consistency_rolling.py +++ b/pandas/tests/window/moments/test_moments_consistency_rolling.py @@ -12,7 +12,6 @@ import pandas._testing as tm from pandas.core.window.common import flex_binary_moment from pandas.tests.window.common import ( - check_pairwise_moment, moments_consistency_cov_data, moments_consistency_is_constant, moments_consistency_mock_mean, @@ -60,7 +59,12 @@ def test_rolling_corr(series): @pytest.mark.parametrize("func", ["cov", "corr"]) def test_rolling_pairwise_cov_corr(func, frame): - check_pairwise_moment(frame, "rolling", func, window=10, min_periods=5) + result = getattr(frame.rolling(window=10, min_periods=5), func)() + result = result.loc[(slice(None), 1), 5] + result.index = result.index.droplevel(1) + expected = getattr(frame[1].rolling(window=10, min_periods=5), func)(frame[5]) + expected.index = expected.index._with_freq(None) + tm.assert_series_equal(result, expected, check_names=False) @pytest.mark.parametrize("method", ["corr", "cov"]) From c70fff48b7e2a45e7218426f052b5ad2f077733e Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 14:41:07 -0700 Subject: [PATCH 0982/1025] API: make DataFrame.__boolop__ default_axis match DataFrame.__arithop__ default_axis (#36793) --- pandas/core/ops/__init__.py | 30 ++---------------------------- pandas/tests/series/test_api.py | 2 +- 2 files changed, 3 insertions(+), 29 deletions(-) diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index ab04cbfff3f9a..f92f67e1d03d7 100644 --- a/pandas/core/ops/__init__.py +++ 
b/pandas/core/ops/__init__.py @@ -146,31 +146,6 @@ def _maybe_match_name(a, b): # ----------------------------------------------------------------------------- -def _get_frame_op_default_axis(name: str) -> Optional[str]: - """ - Only DataFrame cares about default_axis, specifically: - special methods have default_axis=None and flex methods - have default_axis='columns'. - - Parameters - ---------- - name : str - - Returns - ------- - default_axis: str or None - """ - if name.replace("__r", "__") in ["__and__", "__or__", "__xor__"]: - # bool methods - return "columns" - elif name.startswith("__"): - # __add__, __mul__, ... - return None - else: - # add, mul, ... - return "columns" - - def _get_op_name(op, special: bool) -> str: """ Find the name to attach to this method according to conventions @@ -619,7 +594,7 @@ def _maybe_align_series_as_frame(frame: "DataFrame", series: "Series", axis: int def arith_method_FRAME(cls: Type["DataFrame"], op, special: bool): # This is the only function where `special` can be either True or False op_name = _get_op_name(op, special) - default_axis = _get_frame_op_default_axis(op_name) + default_axis = None if special else "columns" na_op = get_array_op(op) @@ -671,8 +646,7 @@ def f(self, other, axis=default_axis, level=None, fill_value=None): def flex_comp_method_FRAME(cls: Type["DataFrame"], op, special: bool): assert not special # "special" also means "not flex" op_name = _get_op_name(op, special) - default_axis = _get_frame_op_default_axis(op_name) - assert default_axis == "columns", default_axis # because we are not "special" + default_axis = "columns" # because we are "flex" doc = _flex_comp_doc_FRAME.format( op_name=op_name, desc=_op_descriptions[op_name]["desc"] diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index a69c0ee75eaba..d92edb6fe149a 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -493,7 +493,7 @@ async def test_tab_complete_warning(self, ip): 
pytest.importorskip("IPython", minversion="6.0.0") from IPython.core.completer import provisionalcompleter - code = "import pandas as pd; s = pd.Series()" + code = "import pandas as pd; s = pd.Series(dtype=object)" await ip.run_code(code) # TODO: remove it when Ipython updates From 1ce959b1d72ce41c9a02241e17b0de91f4bf165c Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Fri, 2 Oct 2020 16:45:17 -0500 Subject: [PATCH 0983/1025] ENH: Implement IntegerArray reductions (#36761) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/arrays/integer.py | 18 ++++++++---- pandas/tests/arrays/integer/test_function.py | 30 ++++++++++++++++++-- 3 files changed, 42 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index af12dc90d5290..25fac48397c68 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -168,6 +168,7 @@ Other enhancements - ``Styler`` now allows direct CSS class name addition to individual data cells (:issue:`36159`) - :meth:`Rolling.mean()` and :meth:`Rolling.sum()` use Kahan summation to calculate the mean to avoid numerical problems (:issue:`10319`, :issue:`11645`, :issue:`13254`, :issue:`32761`, :issue:`36031`) - :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with datetimelike dtypes will now try to cast string arguments (listlike and scalar) to the matching datetimelike type (:issue:`36346`) +- Added methods :meth:`IntegerArray.prod`, :meth:`IntegerArray.min`, and :meth:`IntegerArray.max` (:issue:`33790`) .. 
_whatsnew_120.api_breaking.python: diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index 04c4c73954671..af521a8efacc7 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -25,7 +25,6 @@ from pandas.core.dtypes.missing import isna from pandas.core import ops -from pandas.core.array_algos import masked_reductions from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -550,10 +549,19 @@ def cmp_method(self, other): def sum(self, skipna=True, min_count=0, **kwargs): nv.validate_sum((), kwargs) - result = masked_reductions.sum( - values=self._data, mask=self._mask, skipna=skipna, min_count=min_count - ) - return result + return super()._reduce("sum", skipna=skipna, min_count=min_count) + + def prod(self, skipna=True, min_count=0, **kwargs): + nv.validate_prod((), kwargs) + return super()._reduce("prod", skipna=skipna, min_count=min_count) + + def min(self, skipna=True, **kwargs): + nv.validate_min((), kwargs) + return super()._reduce("min", skipna=skipna) + + def max(self, skipna=True, **kwargs): + nv.validate_max((), kwargs) + return super()._reduce("max", skipna=skipna) def _maybe_mask_result(self, result, mask, other, op_name: str): """ diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py index a81434339fdae..8f64c9c0900f1 100644 --- a/pandas/tests/arrays/integer/test_function.py +++ b/pandas/tests/arrays/integer/test_function.py @@ -115,8 +115,9 @@ def test_value_counts_empty(): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 4]) -def test_integer_array_sum(skipna, min_count): - arr = pd.array([1, 2, 3, None], dtype="Int64") +def test_integer_array_sum(skipna, min_count, any_nullable_int_dtype): + dtype = any_nullable_int_dtype + arr = pd.array([1, 2, 3, None], dtype=dtype) result = arr.sum(skipna=skipna, 
min_count=min_count) if skipna and min_count == 0: assert result == 6 @@ -124,6 +125,31 @@ def test_integer_array_sum(skipna, min_count): assert result is pd.NA +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("method", ["min", "max"]) +def test_integer_array_min_max(skipna, method, any_nullable_int_dtype): + dtype = any_nullable_int_dtype + arr = pd.array([0, 1, None], dtype=dtype) + func = getattr(arr, method) + result = func(skipna=skipna) + if skipna: + assert result == (0 if method == "min" else 1) + else: + assert result is pd.NA + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("min_count", [0, 9]) +def test_integer_array_prod(skipna, min_count, any_nullable_int_dtype): + dtype = any_nullable_int_dtype + arr = pd.array([1, 2, None], dtype=dtype) + result = arr.prod(skipna=skipna, min_count=min_count) + if skipna and min_count == 0: + assert result == 2 + else: + assert result is pd.NA + + @pytest.mark.parametrize( "values, expected", [([1, 2, 3], 6), ([1, 2, 3, None], 6), ([None], 0)] ) From d77f1b29ebff5aed3ce0e131d2f22dd1490167e9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 14:51:24 -0700 Subject: [PATCH 0984/1025] ENH: PandasArray ops use core.ops functions (#36484) --- pandas/core/arrays/numpy_.py | 22 ++- pandas/tests/arithmetic/common.py | 6 +- pandas/tests/arithmetic/conftest.py | 5 +- pandas/tests/arithmetic/test_datetime64.py | 22 ++- pandas/tests/arithmetic/test_numeric.py | 49 +++++-- pandas/tests/arithmetic/test_object.py | 12 +- pandas/tests/arithmetic/test_period.py | 20 +-- pandas/tests/arithmetic/test_timedelta64.py | 141 +++++++++++++------- 8 files changed, 186 insertions(+), 91 deletions(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 237d571507a3a..05139783456b9 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -362,19 +362,29 @@ def __invert__(self): @classmethod def 
_create_arithmetic_method(cls, op): + + pd_op = ops.get_array_op(op) + @ops.unpack_zerodim_and_defer(op.__name__) def arithmetic_method(self, other): if isinstance(other, cls): other = other._ndarray - with np.errstate(all="ignore"): - result = op(self._ndarray, other) + result = pd_op(self._ndarray, other) - if op is divmod: + if op is divmod or op is ops.rdivmod: a, b = result - return cls(a), cls(b) - - return cls(result) + if isinstance(a, np.ndarray): + # for e.g. op vs TimedeltaArray, we may already + # have an ExtensionArray, in which case we do not wrap + return cls(a), cls(b) + return a, b + + if isinstance(result, np.ndarray): + # for e.g. multiplication vs TimedeltaArray, we may already + # have an ExtensionArray, in which case we do not wrap + return cls(result) + return result return compat.set_function_name(arithmetic_method, f"__{op.__name__}__", cls) diff --git a/pandas/tests/arithmetic/common.py b/pandas/tests/arithmetic/common.py index cd8dd102dc27c..a663c2f3a0175 100644 --- a/pandas/tests/arithmetic/common.py +++ b/pandas/tests/arithmetic/common.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from pandas import DataFrame, Index, Series +from pandas import DataFrame, Index, Series, array as pd_array import pandas._testing as tm @@ -49,12 +49,12 @@ def assert_invalid_comparison(left, right, box): ---------- left : np.ndarray, ExtensionArray, Index, or Series right : object - box : {pd.DataFrame, pd.Series, pd.Index, tm.to_array} + box : {pd.DataFrame, pd.Series, pd.Index, pd.array, tm.to_array} """ # Not for tznaive-tzaware comparison # Note: not quite the same as how we do this for tm.box_expected - xbox = box if box is not Index else np.array + xbox = box if box not in [Index, pd_array] else np.array result = left == right expected = xbox(np.zeros(result.shape, dtype=np.bool_)) diff --git a/pandas/tests/arithmetic/conftest.py b/pandas/tests/arithmetic/conftest.py index 8b9e5cd371a90..6286711ac6113 100644 --- 
a/pandas/tests/arithmetic/conftest.py +++ b/pandas/tests/arithmetic/conftest.py @@ -2,7 +2,6 @@ import pytest import pandas as pd -import pandas._testing as tm # ------------------------------------------------------------------ # Helper Functions @@ -56,7 +55,7 @@ def one(request): zeros = [ box_cls([0] * 5, dtype=dtype) - for box_cls in [pd.Index, np.array] + for box_cls in [pd.Index, np.array, pd.array] for dtype in [np.int64, np.uint64, np.float64] ] zeros.extend( @@ -231,7 +230,7 @@ def box(request): return request.param -@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, tm.to_array], ids=id_func) +@pytest.fixture(params=[pd.Index, pd.Series, pd.DataFrame, pd.array], ids=id_func) def box_with_array(request): """ Fixture to test behavior for Index, Series, DataFrame, and pandas Array diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 0dd389ed516c7..626dd4f748e0b 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -48,7 +48,9 @@ def test_compare_zerodim(self, tz_naive_fixture, box_with_array): # Test comparison with zero-dimensional array is unboxed tz = tz_naive_fixture box = box_with_array - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + xbox = ( + box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray + ) dti = date_range("20130101", periods=3, tz=tz) other = np.array(dti.to_numpy()[0]) @@ -135,7 +137,7 @@ def test_dt64arr_nat_comparison(self, tz_naive_fixture, box_with_array): # GH#22242, GH#22163 DataFrame considered NaT == ts incorrectly tz = tz_naive_fixture box = box_with_array - xbox = box if box is not pd.Index else np.ndarray + xbox = box if box not in [pd.Index, pd.array] else np.ndarray ts = pd.Timestamp.now(tz) ser = pd.Series([ts, pd.NaT]) @@ -203,6 +205,8 @@ def test_nat_comparisons(self, dtype, index_or_series, reverse, pair): def test_comparison_invalid(self, tz_naive_fixture, 
box_with_array): # GH#4968 # invalid date/int comparisons + if box_with_array is pd.array: + pytest.xfail("assert_invalid_comparison doesnt handle BooleanArray yet") tz = tz_naive_fixture ser = Series(range(5)) ser2 = Series(pd.date_range("20010101", periods=5, tz=tz)) @@ -226,8 +230,12 @@ def test_nat_comparisons_scalar(self, dtype, data, box_with_array): # dont bother testing ndarray comparison methods as this fails # on older numpys (since they check object identity) return + if box_with_array is pd.array and dtype is object: + pytest.xfail("reversed comparisons give BooleanArray, not ndarray") - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + xbox = ( + box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray + ) left = Series(data, dtype=dtype) left = tm.box_expected(left, box_with_array) @@ -299,7 +307,9 @@ def test_timestamp_compare_series(self, left, right): def test_dt64arr_timestamp_equality(self, box_with_array): # GH#11034 - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + xbox = ( + box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray + ) ser = pd.Series([pd.Timestamp("2000-01-29 01:59:00"), "NaT"]) ser = tm.box_expected(ser, box_with_array) @@ -388,7 +398,9 @@ def test_dti_cmp_nat(self, dtype, box_with_array): # on older numpys (since they check object identity) return - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + xbox = ( + box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray + ) left = pd.DatetimeIndex( [pd.Timestamp("2011-01-01"), pd.NaT, pd.Timestamp("2011-01-03")] diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py index 139401bdf5806..df98b43e11f4a 100644 --- a/pandas/tests/arithmetic/test_numeric.py +++ b/pandas/tests/arithmetic/test_numeric.py @@ -89,8 +89,9 @@ def test_compare_invalid(self): b.name = pd.Timestamp("2000-01-01") tm.assert_series_equal(a / 
b, 1 / (b / a)) - def test_numeric_cmp_string_numexpr_path(self, box): + def test_numeric_cmp_string_numexpr_path(self, box_with_array): # GH#36377, GH#35700 + box = box_with_array xbox = box if box is not pd.Index else np.ndarray obj = pd.Series(np.random.randn(10 ** 5)) @@ -183,10 +184,14 @@ def test_ops_series(self): ], ids=lambda x: type(x).__name__, ) - def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box): + def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box_with_array): # GH#19333 + box = box_with_array + if box is pd.array: + pytest.xfail( + "we get a PandasArray[timedelta64[ns]] instead of TimedeltaArray" + ) index = numeric_idx - expected = pd.TimedeltaIndex([pd.Timedelta(days=n) for n in range(5)]) index = tm.box_expected(index, box) @@ -207,7 +212,11 @@ def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box): ], ids=lambda x: type(x).__name__, ) - def test_numeric_arr_mul_tdscalar_numexpr_path(self, scalar_td, box): + def test_numeric_arr_mul_tdscalar_numexpr_path(self, scalar_td, box_with_array): + box = box_with_array + if box is pd.array: + pytest.xfail("IntegerArray.__mul__ doesnt handle timedeltas") + arr = np.arange(2 * 10 ** 4).astype(np.int64) obj = tm.box_expected(arr, box, transpose=False) @@ -220,7 +229,11 @@ def test_numeric_arr_mul_tdscalar_numexpr_path(self, scalar_td, box): result = scalar_td * obj tm.assert_equal(result, expected) - def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box): + def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array): + box = box_with_array + if box is pd.array: + pytest.xfail("We get PandasArray[td64] instead of TimedeltaArray") + index = numeric_idx[1:3] expected = TimedeltaIndex(["3 Days", "36 Hours"]) @@ -248,7 +261,11 @@ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box): pd.offsets.Second(0), ], ) - def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box): + def 
test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box_with_array): + box = box_with_array + if box is pd.array: + pytest.xfail("PandasArray[int].__add__ doesnt raise on td64") + left = tm.box_expected(numeric_idx, box) msg = ( "unsupported operand type|" @@ -276,16 +293,21 @@ def test_add_sub_timedeltalike_invalid(self, numeric_idx, other, box): ], ) @pytest.mark.filterwarnings("ignore:elementwise comp:DeprecationWarning") - def test_add_sub_datetimelike_invalid(self, numeric_idx, other, box): + def test_add_sub_datetimelike_invalid(self, numeric_idx, other, box_with_array): # GH#28080 numeric+datetime64 should raise; Timestamp raises # NullFrequencyError instead of TypeError so is excluded. + box = box_with_array left = tm.box_expected(numeric_idx, box) - msg = ( - "unsupported operand type|" - "Cannot (add|subtract) NaT (to|from) ndarray|" - "Addition/subtraction of integers and integer-arrays|" - "Concatenation operation is not implemented for NumPy arrays" + msg = "|".join( + [ + "unsupported operand type", + "Cannot (add|subtract) NaT (to|from) ndarray", + "Addition/subtraction of integers and integer-arrays", + "Concatenation operation is not implemented for NumPy arrays", + # pd.array vs np.datetime64 case + r"operand type\(s\) all returned NotImplemented from __array_ufunc__", + ] ) with pytest.raises(TypeError, match=msg): left + other @@ -568,8 +590,9 @@ class TestMultiplicationDivision: # __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__ # for non-timestamp/timedelta/period dtypes - def test_divide_decimal(self, box): + def test_divide_decimal(self, box_with_array): # resolves issue GH#9787 + box = box_with_array ser = Series([Decimal(10)]) expected = Series([Decimal(5)]) diff --git a/pandas/tests/arithmetic/test_object.py b/pandas/tests/arithmetic/test_object.py index c0cb522b516ab..02cb4f4d7a606 100644 --- a/pandas/tests/arithmetic/test_object.py +++ b/pandas/tests/arithmetic/test_object.py @@ -104,22 +104,22 @@ def 
test_add_extension_scalar(self, other, box_with_array, op): result = op(arr, other) tm.assert_equal(result, expected) - def test_objarr_add_str(self, box): + def test_objarr_add_str(self, box_with_array): ser = pd.Series(["x", np.nan, "x"]) expected = pd.Series(["xa", np.nan, "xa"]) - ser = tm.box_expected(ser, box) - expected = tm.box_expected(expected, box) + ser = tm.box_expected(ser, box_with_array) + expected = tm.box_expected(expected, box_with_array) result = ser + "a" tm.assert_equal(result, expected) - def test_objarr_radd_str(self, box): + def test_objarr_radd_str(self, box_with_array): ser = pd.Series(["x", np.nan, "x"]) expected = pd.Series(["ax", np.nan, "ax"]) - ser = tm.box_expected(ser, box) - expected = tm.box_expected(expected, box) + ser = tm.box_expected(ser, box_with_array) + expected = tm.box_expected(expected, box_with_array) result = "a" + ser tm.assert_equal(result, expected) diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 930435074efc1..e78e696d00398 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -28,7 +28,9 @@ class TestPeriodArrayLikeComparisons: def test_compare_zerodim(self, box_with_array): # GH#26689 make sure we unbox zero-dimensional arrays - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + xbox = ( + box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray + ) pi = pd.period_range("2000", periods=4) other = np.array(pi.to_numpy()[0]) @@ -68,7 +70,7 @@ def test_compare_object_dtype(self, box_with_array, other_box): pi = pd.period_range("2000", periods=5) parr = tm.box_expected(pi, box_with_array) - xbox = np.ndarray if box_with_array is pd.Index else box_with_array + xbox = np.ndarray if box_with_array in [pd.Index, pd.array] else box_with_array other = other_box(pi) @@ -175,7 +177,9 @@ def test_pi_cmp_period(self): # TODO: moved from test_datetime64; de-duplicate with version below 
def test_parr_cmp_period_scalar2(self, box_with_array): - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + xbox = ( + box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray + ) pi = pd.period_range("2000-01-01", periods=10, freq="D") @@ -196,7 +200,7 @@ def test_parr_cmp_period_scalar2(self, box_with_array): @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_period_scalar(self, freq, box_with_array): # GH#13200 - xbox = np.ndarray if box_with_array is pd.Index else box_with_array + xbox = np.ndarray if box_with_array in [pd.Index, pd.array] else box_with_array base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) @@ -235,7 +239,7 @@ def test_parr_cmp_period_scalar(self, freq, box_with_array): @pytest.mark.parametrize("freq", ["M", "2M", "3M"]) def test_parr_cmp_pi(self, freq, box_with_array): # GH#13200 - xbox = np.ndarray if box_with_array is pd.Index else box_with_array + xbox = np.ndarray if box_with_array in [pd.Index, pd.array] else box_with_array base = PeriodIndex(["2011-01", "2011-02", "2011-03", "2011-04"], freq=freq) base = tm.box_expected(base, box_with_array) @@ -284,7 +288,7 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): # TODO: Could parametrize over boxes for idx? 
idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="A") rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=A-DEC\)" - idx_msg = rev_msg if box_with_array is tm.to_array else msg + idx_msg = rev_msg if box_with_array in [tm.to_array, pd.array] else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx @@ -298,7 +302,7 @@ def test_parr_cmp_pi_mismatched_freq_raises(self, freq, box_with_array): idx = PeriodIndex(["2011", "2012", "2013", "2014"], freq="4M") rev_msg = r"Input has different freq=(M|2M|3M) from PeriodArray\(freq=4M\)" - idx_msg = rev_msg if box_with_array is tm.to_array else msg + idx_msg = rev_msg if box_with_array in [tm.to_array, pd.array] else msg with pytest.raises(IncompatibleFrequency, match=idx_msg): base <= idx @@ -779,7 +783,7 @@ def test_pi_add_sub_td64_array_tick(self): @pytest.mark.parametrize("tdi_freq", [None, "H"]) def test_parr_sub_td64array(self, box_with_array, tdi_freq, pi_freq): box = box_with_array - xbox = box if box is not tm.to_array else pd.Index + xbox = box if box not in [pd.array, tm.to_array] else pd.Index tdi = TimedeltaIndex(["1 hours", "2 hours"], freq=tdi_freq) dti = Timestamp("2018-03-07 17:16:40") + tdi diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index 68bedcc099a91..b3dfb5d015ab4 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -50,7 +50,9 @@ class TestTimedelta64ArrayLikeComparisons: def test_compare_timedelta64_zerodim(self, box_with_array): # GH#26689 should unbox when comparing with zerodim array box = box_with_array - xbox = box_with_array if box_with_array is not pd.Index else np.ndarray + xbox = ( + box_with_array if box_with_array not in [pd.Index, pd.array] else np.ndarray + ) tdi = pd.timedelta_range("2H", periods=4) other = np.array(tdi.to_numpy()[0]) @@ -73,7 +75,8 @@ def test_compare_timedelta64_zerodim(self, box_with_array): def 
test_compare_timedeltalike_scalar(self, box_with_array, td_scalar): # regression test for GH#5963 box = box_with_array - xbox = box if box is not pd.Index else np.ndarray + xbox = box if box not in [pd.Index, pd.array] else np.ndarray + ser = pd.Series([timedelta(days=1), timedelta(days=2)]) ser = tm.box_expected(ser, box) actual = ser > td_scalar @@ -85,6 +88,7 @@ def test_compare_timedeltalike_scalar(self, box_with_array, td_scalar): def test_td64_comparisons_invalid(self, box_with_array, invalid): # GH#13624 for str box = box_with_array + rng = timedelta_range("1 days", periods=10) obj = tm.box_expected(rng, box) @@ -1142,19 +1146,24 @@ def test_td64arr_add_sub_integer_array(self, box_with_array): # GH#19959, deprecated GH#22535 # GH#22696 for DataFrame case, check that we don't dispatch to numpy # implementation, which treats int64 as m8[ns] + box = box_with_array + xbox = np.ndarray if box is pd.array else box rng = timedelta_range("1 days 09:00:00", freq="H", periods=3) - tdarr = tm.box_expected(rng, box_with_array) - other = tm.box_expected([4, 3, 2], box_with_array) + tdarr = tm.box_expected(rng, box) + other = tm.box_expected([4, 3, 2], xbox) msg = "Addition/subtraction of integers and integer-arrays" assert_invalid_addsub_type(tdarr, other, msg) def test_td64arr_addsub_integer_array_no_freq(self, box_with_array): # GH#19959 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + tdi = TimedeltaIndex(["1 Day", "NaT", "3 Hours"]) - tdarr = tm.box_expected(tdi, box_with_array) - other = tm.box_expected([14, -1, 16], box_with_array) + tdarr = tm.box_expected(tdi, box) + other = tm.box_expected([14, -1, 16], xbox) msg = "Addition/subtraction of integers" assert_invalid_addsub_type(tdarr, other, msg) @@ -1204,7 +1213,7 @@ def test_td64arr_add_sub_tdi(self, box_with_array, names): ) tdi = TimedeltaIndex(["0 days", "1 day"], name=names[0]) - tdi = np.array(tdi) if box is tm.to_array else tdi + tdi = np.array(tdi) if box in [tm.to_array, pd.array] 
else tdi ser = Series([Timedelta(hours=3), Timedelta(hours=4)], name=names[1]) expected = Series( [Timedelta(hours=3), Timedelta(days=1, hours=4)], name=names[2] @@ -1311,7 +1320,7 @@ def test_td64arr_add_offset_index(self, names, box_with_array): tdi = TimedeltaIndex(["1 days 00:00:00", "3 days 04:00:00"], name=names[0]) other = pd.Index([pd.offsets.Hour(n=1), pd.offsets.Minute(n=-2)], name=names[1]) - other = np.array(other) if box is tm.to_array else other + other = np.array(other) if box in [tm.to_array, pd.array] else other expected = TimedeltaIndex( [tdi[n] + other[n] for n in range(len(tdi))], freq="infer", name=names[2] @@ -1353,8 +1362,8 @@ def test_td64arr_add_offset_array(self, box_with_array): def test_td64arr_sub_offset_index(self, names, box_with_array): # GH#18824, GH#19744 box = box_with_array - xbox = box if box is not tm.to_array else pd.Index - exname = names[2] if box is not tm.to_array else names[1] + xbox = box if box not in [tm.to_array, pd.array] else pd.Index + exname = names[2] if box not in [tm.to_array, pd.array] else names[1] if box is pd.DataFrame and names[1] != names[0]: pytest.skip( @@ -1395,13 +1404,13 @@ def test_td64arr_sub_offset_array(self, box_with_array): def test_td64arr_with_offset_series(self, names, box_with_array): # GH#18849 box = box_with_array - box2 = Series if box in [pd.Index, tm.to_array] else box + box2 = Series if box in [pd.Index, tm.to_array, pd.array] else box if box is pd.DataFrame: # Since we are operating with a DataFrame and a non-DataFrame, # the non-DataFrame is cast to Series and its name ignored. 
exname = names[0] - elif box is tm.to_array: + elif box in [tm.to_array, pd.array]: exname = names[1] else: exname = names[2] @@ -1456,8 +1465,11 @@ def test_td64arr_addsub_anchored_offset_arraylike(self, obox, box_with_array): # Unsorted def test_td64arr_add_sub_object_array(self, box_with_array): + box = box_with_array + xbox = np.ndarray if box is pd.array else box + tdi = pd.timedelta_range("1 day", periods=3, freq="D") - tdarr = tm.box_expected(tdi, box_with_array) + tdarr = tm.box_expected(tdi, box) other = np.array( [pd.Timedelta(days=1), pd.offsets.Day(2), pd.Timestamp("2000-01-04")] @@ -1469,7 +1481,7 @@ def test_td64arr_add_sub_object_array(self, box_with_array): expected = pd.Index( [pd.Timedelta(days=2), pd.Timedelta(days=4), pd.Timestamp("2000-01-07")] ) - expected = tm.box_expected(expected, box_with_array) + expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) msg = "unsupported operand type|cannot subtract a datelike" @@ -1483,7 +1495,7 @@ def test_td64arr_add_sub_object_array(self, box_with_array): expected = pd.Index( [pd.Timedelta(0), pd.Timedelta(0), pd.Timestamp("2000-01-01")] ) - expected = tm.box_expected(expected, box_with_array) + expected = tm.box_expected(expected, xbox) tm.assert_equal(result, expected) @@ -1536,7 +1548,7 @@ def test_tdi_mul_int_array(self, box_with_array): def test_tdi_mul_int_series(self, box_with_array): box = box_with_array - xbox = pd.Series if box in [pd.Index, tm.to_array] else box + xbox = pd.Series if box in [pd.Index, tm.to_array, pd.array] else box idx = TimedeltaIndex(np.arange(5, dtype="int64")) expected = TimedeltaIndex(np.arange(5, dtype="int64") ** 2) @@ -1549,7 +1561,7 @@ def test_tdi_mul_int_series(self, box_with_array): def test_tdi_mul_float_series(self, box_with_array): box = box_with_array - xbox = pd.Series if box in [pd.Index, tm.to_array] else box + xbox = pd.Series if box in [pd.Index, tm.to_array, pd.array] else box idx = TimedeltaIndex(np.arange(5, dtype="int64")) idx = 
tm.box_expected(idx, box) @@ -1604,13 +1616,16 @@ def test_td64arr_div_nat_invalid(self, box_with_array): def test_td64arr_div_td64nat(self, box_with_array): # GH#23829 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + rng = timedelta_range("1 days", "10 days") - rng = tm.box_expected(rng, box_with_array) + rng = tm.box_expected(rng, box) other = np.timedelta64("NaT") expected = np.array([np.nan] * 10) - expected = tm.box_expected(expected, box_with_array) + expected = tm.box_expected(expected, xbox) result = rng / other tm.assert_equal(result, expected) @@ -1631,11 +1646,14 @@ def test_td64arr_div_int(self, box_with_array): def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array): # GH#20088, GH#22163 ensure DataFrame returns correct dtype + box = box_with_array + xbox = np.ndarray if box is pd.array else box + rng = timedelta_range("1 days", "10 days", name="foo") expected = pd.Float64Index((np.arange(10) + 1) * 12, name="foo") - rng = tm.box_expected(rng, box_with_array) - expected = tm.box_expected(expected, box_with_array) + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, xbox) result = rng / two_hours tm.assert_equal(result, expected) @@ -1647,32 +1665,38 @@ def test_td64arr_div_tdlike_scalar(self, two_hours, box_with_array): @pytest.mark.parametrize("m", [1, 3, 10]) @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"]) def test_td64arr_div_td64_scalar(self, m, unit, box_with_array): + box = box_with_array + xbox = np.ndarray if box is pd.array else box + startdate = Series(pd.date_range("2013-01-01", "2013-01-03")) enddate = Series(pd.date_range("2013-03-01", "2013-03-03")) ser = enddate - startdate ser[2] = np.nan flat = ser - ser = tm.box_expected(ser, box_with_array) + ser = tm.box_expected(ser, box) # op expected = Series([x / np.timedelta64(m, unit) for x in flat]) - expected = tm.box_expected(expected, box_with_array) + expected = tm.box_expected(expected, xbox) result = ser / 
np.timedelta64(m, unit) tm.assert_equal(result, expected) # reverse op expected = Series([Timedelta(np.timedelta64(m, unit)) / x for x in flat]) - expected = tm.box_expected(expected, box_with_array) + expected = tm.box_expected(expected, xbox) result = np.timedelta64(m, unit) / ser tm.assert_equal(result, expected) def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, box_with_array): + box = box_with_array + xbox = np.ndarray if box is pd.array else box + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"], name="foo") expected = pd.Float64Index([12, np.nan, 24], name="foo") - rng = tm.box_expected(rng, box_with_array) - expected = tm.box_expected(expected, box_with_array) + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, xbox) result = rng / two_hours tm.assert_equal(result, expected) @@ -1683,17 +1707,20 @@ def test_td64arr_div_tdlike_scalar_with_nat(self, two_hours, box_with_array): def test_td64arr_div_td64_ndarray(self, box_with_array): # GH#22631 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + rng = TimedeltaIndex(["1 days", pd.NaT, "2 days"]) expected = pd.Float64Index([12, np.nan, 24]) - rng = tm.box_expected(rng, box_with_array) - expected = tm.box_expected(expected, box_with_array) + rng = tm.box_expected(rng, box) + expected = tm.box_expected(expected, xbox) other = np.array([2, 4, 2], dtype="m8[h]") result = rng / other tm.assert_equal(result, expected) - result = rng / tm.box_expected(other, box_with_array) + result = rng / tm.box_expected(other, box) tm.assert_equal(result, expected) result = rng / other.astype(object) @@ -1707,7 +1734,7 @@ def test_td64arr_div_td64_ndarray(self, box_with_array): result = other / rng tm.assert_equal(result, expected) - result = tm.box_expected(other, box_with_array) / rng + result = tm.box_expected(other, box) / rng tm.assert_equal(result, expected) result = other.astype(object) / rng @@ -1736,6 +1763,7 @@ def test_tdarr_div_length_mismatch(self, box_with_array): 
def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array): # GH#35529 box = box_with_array + xbox = np.ndarray if box is pd.array else box left = pd.Series([1000, 222330, 30], dtype="timedelta64[ns]") right = pd.Series([1000, 222330, None], dtype="timedelta64[ns]") @@ -1744,7 +1772,7 @@ def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array): right = tm.box_expected(right, box) expected = np.array([1.0, 1.0, np.nan], dtype=np.float64) - expected = tm.box_expected(expected, box) + expected = tm.box_expected(expected, xbox) result = left // right @@ -1756,39 +1784,48 @@ def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array): def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td): # GH#18831 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td1.iloc[2] = np.nan expected = Series([0, 0, np.nan]) - td1 = tm.box_expected(td1, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + td1 = tm.box_expected(td1, box, transpose=False) + expected = tm.box_expected(expected, xbox, transpose=False) result = td1 // scalar_td tm.assert_equal(result, expected) def test_td64arr_rfloordiv_tdscalar(self, box_with_array, scalar_td): # GH#18831 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + td1 = Series([timedelta(minutes=5, seconds=3)] * 3) td1.iloc[2] = np.nan expected = Series([1, 1, np.nan]) - td1 = tm.box_expected(td1, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + td1 = tm.box_expected(td1, box, transpose=False) + expected = tm.box_expected(expected, xbox, transpose=False) result = scalar_td // td1 tm.assert_equal(result, expected) def test_td64arr_rfloordiv_tdscalar_explicit(self, box_with_array, scalar_td): # GH#18831 + box = box_with_array + xbox = np.ndarray if box is pd.array else box + td1 = Series([timedelta(minutes=5, 
seconds=3)] * 3) td1.iloc[2] = np.nan expected = Series([1, 1, np.nan]) - td1 = tm.box_expected(td1, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + td1 = tm.box_expected(td1, box, transpose=False) + expected = tm.box_expected(expected, xbox, transpose=False) # We can test __rfloordiv__ using this syntax, # see `test_timedelta_rfloordiv` @@ -1806,11 +1843,14 @@ def test_td64arr_floordiv_int(self, box_with_array): 1 // idx def test_td64arr_floordiv_tdlike_scalar(self, two_hours, box_with_array): + box = box_with_array + xbox = np.ndarray if box is pd.array else box + tdi = timedelta_range("1 days", "10 days", name="foo") expected = pd.Int64Index((np.arange(10) + 1) * 12, name="foo") - tdi = tm.box_expected(tdi, box_with_array) - expected = tm.box_expected(expected, box_with_array) + tdi = tm.box_expected(tdi, box) + expected = tm.box_expected(expected, xbox) result = tdi // two_hours tm.assert_equal(result, expected) @@ -1827,17 +1867,20 @@ def test_td64arr_floordiv_tdlike_scalar(self, two_hours, box_with_array): ) def test_td64arr_rfloordiv_tdlike_scalar(self, scalar_td, box_with_array): # GH#19125 + box = box_with_array + xbox = np.ndarray if box_with_array is pd.array else box_with_array + tdi = TimedeltaIndex(["00:05:03", "00:05:03", pd.NaT], freq=None) expected = pd.Index([2.0, 2.0, np.nan]) - tdi = tm.box_expected(tdi, box_with_array, transpose=False) - expected = tm.box_expected(expected, box_with_array, transpose=False) + tdi = tm.box_expected(tdi, box, transpose=False) + expected = tm.box_expected(expected, xbox, transpose=False) res = tdi.__rfloordiv__(scalar_td) tm.assert_equal(res, expected) expected = pd.Index([0.0, 0.0, np.nan]) - expected = tm.box_expected(expected, box_with_array, transpose=False) + expected = tm.box_expected(expected, xbox, transpose=False) res = tdi // (scalar_td) tm.assert_equal(res, expected) @@ -2059,7 +2102,7 @@ def test_td64arr_mul_int_series(self, box_with_array, 
names, request): reason = "broadcasts along wrong axis, but doesn't raise" request.node.add_marker(pytest.mark.xfail(reason=reason)) - exname = names[2] if box is not tm.to_array else names[1] + exname = names[2] if box not in [tm.to_array, pd.array] else names[1] tdi = TimedeltaIndex( ["0days", "1day", "2days", "3days", "4days"], name=names[0] @@ -2074,8 +2117,12 @@ def test_td64arr_mul_int_series(self, box_with_array, names, request): ) tdi = tm.box_expected(tdi, box) - box = Series if (box is pd.Index or box is tm.to_array) else box - expected = tm.box_expected(expected, box) + xbox = ( + Series + if (box is pd.Index or box is tm.to_array or box is pd.array) + else box + ) + expected = tm.box_expected(expected, xbox) result = ser * tdi tm.assert_equal(result, expected) @@ -2098,7 +2145,7 @@ def test_float_series_rdiv_td64arr(self, box_with_array, names): ) ser = Series([1.5, 3, 4.5, 6, 7.5], dtype=np.float64, name=names[1]) - xname = names[2] if box is not tm.to_array else names[1] + xname = names[2] if box not in [tm.to_array, pd.array] else names[1] expected = Series( [tdi[n] / ser[n] for n in range(len(ser))], dtype="timedelta64[ns]", @@ -2106,7 +2153,7 @@ def test_float_series_rdiv_td64arr(self, box_with_array, names): ) xbox = box - if box in [pd.Index, tm.to_array] and type(ser) is Series: + if box in [pd.Index, tm.to_array, pd.array] and type(ser) is Series: xbox = Series tdi = tm.box_expected(tdi, box) From 258a17d37414289e679b9f1eeee0fc60e23ea71f Mon Sep 17 00:00:00 2001 From: John Karasinski Date: Fri, 2 Oct 2020 14:58:10 -0700 Subject: [PATCH 0985/1025] DOC: update code style for remaining intro tutorial docs for #36777 (#36817) --- .../intro_tutorials/01_table_oriented.rst | 16 ++++++++----- .../intro_tutorials/02_read_write.rst | 7 +++--- .../intro_tutorials/04_plotting.rst | 23 ++++++++++--------- .../intro_tutorials/05_add_columns.rst | 15 +++++++----- 4 files changed, 35 insertions(+), 26 deletions(-) diff --git 
a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst index dc9bec2284aab..e8e0fef271a74 100644 --- a/doc/source/getting_started/intro_tutorials/01_table_oriented.rst +++ b/doc/source/getting_started/intro_tutorials/01_table_oriented.rst @@ -41,12 +41,16 @@ I want to store passenger data of the Titanic. For a number of passengers, I kno .. ipython:: python - df = pd.DataFrame({ - "Name": ["Braund, Mr. Owen Harris", - "Allen, Mr. William Henry", - "Bonnell, Miss. Elizabeth"], - "Age": [22, 35, 58], - "Sex": ["male", "male", "female"]} + df = pd.DataFrame( + { + "Name": [ + "Braund, Mr. Owen Harris", + "Allen, Mr. William Henry", + "Bonnell, Miss. Elizabeth", + ], + "Age": [22, 35, 58], + "Sex": ["male", "male", "female"], + } ) df diff --git a/doc/source/getting_started/intro_tutorials/02_read_write.rst b/doc/source/getting_started/intro_tutorials/02_read_write.rst index c6c6bfefc4303..c9b6a12904311 100644 --- a/doc/source/getting_started/intro_tutorials/02_read_write.rst +++ b/doc/source/getting_started/intro_tutorials/02_read_write.rst @@ -138,7 +138,7 @@ My colleague requested the Titanic data as a spreadsheet. .. ipython:: python - titanic.to_excel('titanic.xlsx', sheet_name='passengers', index=False) + titanic.to_excel("titanic.xlsx", sheet_name="passengers", index=False) Whereas ``read_*`` functions are used to read data to pandas, the ``to_*`` methods are used to store data. The :meth:`~DataFrame.to_excel` method stores @@ -156,7 +156,7 @@ The equivalent read function :meth:`~DataFrame.read_excel` will reload the data .. ipython:: python - titanic = pd.read_excel('titanic.xlsx', sheet_name='passengers') + titanic = pd.read_excel("titanic.xlsx", sheet_name="passengers") .. ipython:: python @@ -166,7 +166,8 @@ The equivalent read function :meth:`~DataFrame.read_excel` will reload the data :suppress: import os - os.remove('titanic.xlsx') + + os.remove("titanic.xlsx") .. 
raw:: html diff --git a/doc/source/getting_started/intro_tutorials/04_plotting.rst b/doc/source/getting_started/intro_tutorials/04_plotting.rst index f3d99ee56359a..ae33a6e1fcd9e 100644 --- a/doc/source/getting_started/intro_tutorials/04_plotting.rst +++ b/doc/source/getting_started/intro_tutorials/04_plotting.rst @@ -40,8 +40,7 @@ in respectively Paris, Antwerp and London. .. ipython:: python - air_quality = pd.read_csv("data/air_quality_no2.csv", - index_col=0, parse_dates=True) + air_quality = pd.read_csv("data/air_quality_no2.csv", index_col=0, parse_dates=True) air_quality.head() .. note:: @@ -112,9 +111,7 @@ I want to visually compare the :math:`N0_2` values measured in London versus Par .. ipython:: python @savefig 04_airqual_scatter.png - air_quality.plot.scatter(x="station_london", - y="station_paris", - alpha=0.5) + air_quality.plot.scatter(x="station_london", y="station_paris", alpha=0.5) .. raw:: html @@ -127,8 +124,11 @@ standard Python to get an overview of the available plot methods: .. ipython:: python - [method_name for method_name in dir(air_quality.plot) - if not method_name.startswith("_")] + [ + method_name + for method_name in dir(air_quality.plot) + if not method_name.startswith("_") + ] .. note:: In many development environments as well as ipython and @@ -196,17 +196,18 @@ I want to further customize, extend or save the resulting plot. .. ipython:: python - fig, axs = plt.subplots(figsize=(12, 4)); - air_quality.plot.area(ax=axs); + fig, axs = plt.subplots(figsize=(12, 4)) + air_quality.plot.area(ax=axs) @savefig 04_airqual_customized.png - axs.set_ylabel("NO$_2$ concentration"); + axs.set_ylabel("NO$_2$ concentration") fig.savefig("no2_concentrations.png") .. ipython:: python :suppress: import os - os.remove('no2_concentrations.png') + + os.remove("no2_concentrations.png") .. 
raw:: html diff --git a/doc/source/getting_started/intro_tutorials/05_add_columns.rst b/doc/source/getting_started/intro_tutorials/05_add_columns.rst index d4f6a8d6bb4a2..a99c2c49585c5 100644 --- a/doc/source/getting_started/intro_tutorials/05_add_columns.rst +++ b/doc/source/getting_started/intro_tutorials/05_add_columns.rst @@ -39,8 +39,7 @@ in respectively Paris, Antwerp and London. .. ipython:: python - air_quality = pd.read_csv("data/air_quality_no2.csv", - index_col=0, parse_dates=True) + air_quality = pd.read_csv("data/air_quality_no2.csv", index_col=0, parse_dates=True) air_quality.head() .. raw:: html @@ -95,8 +94,9 @@ I want to check the ratio of the values in Paris versus Antwerp and save the res .. ipython:: python - air_quality["ratio_paris_antwerp"] = \ + air_quality["ratio_paris_antwerp"] = ( air_quality["station_paris"] / air_quality["station_antwerp"] + ) air_quality.head() The calculation is again element-wise, so the ``/`` is applied *for the @@ -122,9 +122,12 @@ I want to rename the data columns to the corresponding station identifiers used .. ipython:: python air_quality_renamed = air_quality.rename( - columns={"station_antwerp": "BETR801", - "station_paris": "FR04014", - "station_london": "London Westminster"}) + columns={ + "station_antwerp": "BETR801", + "station_paris": "FR04014", + "station_london": "London Westminster", + } + ) .. 
ipython:: python From 79b5778ea49563004f3a274e47ad6e6b8f6538dc Mon Sep 17 00:00:00 2001 From: Brendan Wilby <39991145+BrendanWilby@users.noreply.github.com> Date: Fri, 2 Oct 2020 23:05:24 +0100 Subject: [PATCH 0986/1025] DOC: uses black to fix formatting #36777 (#36815) --- doc/source/user_guide/merging.rst | 464 +++++++++++++++++------------- 1 file changed, 257 insertions(+), 207 deletions(-) diff --git a/doc/source/user_guide/merging.rst b/doc/source/user_guide/merging.rst index aee56a2565310..8dbfc261e6fa8 100644 --- a/doc/source/user_guide/merging.rst +++ b/doc/source/user_guide/merging.rst @@ -7,6 +7,7 @@ from matplotlib import pyplot as plt import pandas.util._doctools as doctools + p = doctools.TablePlotter() @@ -38,23 +39,35 @@ a simple example: .. ipython:: python - df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=[0, 1, 2, 3]) + df1 = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + }, + index=[0, 1, 2, 3], + ) - df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'], - 'B': ['B4', 'B5', 'B6', 'B7'], - 'C': ['C4', 'C5', 'C6', 'C7'], - 'D': ['D4', 'D5', 'D6', 'D7']}, - index=[4, 5, 6, 7]) + df2 = pd.DataFrame( + { + "A": ["A4", "A5", "A6", "A7"], + "B": ["B4", "B5", "B6", "B7"], + "C": ["C4", "C5", "C6", "C7"], + "D": ["D4", "D5", "D6", "D7"], + }, + index=[4, 5, 6, 7], + ) - df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'], - 'B': ['B8', 'B9', 'B10', 'B11'], - 'C': ['C8', 'C9', 'C10', 'C11'], - 'D': ['D8', 'D9', 'D10', 'D11']}, - index=[8, 9, 10, 11]) + df3 = pd.DataFrame( + { + "A": ["A8", "A9", "A10", "A11"], + "B": ["B8", "B9", "B10", "B11"], + "C": ["C8", "C9", "C10", "C11"], + "D": ["D8", "D9", "D10", "D11"], + }, + index=[8, 9, 10, 11], + ) frames = [df1, df2, df3] result = pd.concat(frames) @@ -109,7 +122,7 @@ with each of the pieces of the 
chopped up DataFrame. We can do this using the .. ipython:: python - result = pd.concat(frames, keys=['x', 'y', 'z']) + result = pd.concat(frames, keys=["x", "y", "z"]) .. ipython:: python :suppress: @@ -125,7 +138,7 @@ means that we can now select out each chunk by key: .. ipython:: python - result.loc['y'] + result.loc["y"] It's not a stretch to see how this can be very useful. More detail on this functionality below. @@ -158,10 +171,14 @@ behavior: .. ipython:: python - df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'], - 'D': ['D2', 'D3', 'D6', 'D7'], - 'F': ['F2', 'F3', 'F6', 'F7']}, - index=[2, 3, 6, 7]) + df4 = pd.DataFrame( + { + "B": ["B2", "B3", "B6", "B7"], + "D": ["D2", "D3", "D6", "D7"], + "F": ["F2", "F3", "F6", "F7"], + }, + index=[2, 3, 6, 7], + ) result = pd.concat([df1, df4], axis=1, sort=False) @@ -184,7 +201,7 @@ Here is the same thing with ``join='inner'``: .. ipython:: python - result = pd.concat([df1, df4], axis=1, join='inner') + result = pd.concat([df1, df4], axis=1, join="inner") .. ipython:: python :suppress: @@ -316,7 +333,7 @@ the name of the ``Series``. .. ipython:: python - s1 = pd.Series(['X0', 'X1', 'X2', 'X3'], name='X') + s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X") result = pd.concat([df1, s1], axis=1) .. ipython:: python @@ -338,7 +355,7 @@ If unnamed ``Series`` are passed they will be numbered consecutively. .. ipython:: python - s2 = pd.Series(['_0', '_1', '_2', '_3']) + s2 = pd.Series(["_0", "_1", "_2", "_3"]) result = pd.concat([df1, s2, s2, s2], axis=1) .. ipython:: python @@ -373,7 +390,7 @@ inherit the parent ``Series``' name, when these existed. .. ipython:: python - s3 = pd.Series([0, 1, 2, 3], name='foo') + s3 = pd.Series([0, 1, 2, 3], name="foo") s4 = pd.Series([0, 1, 2, 3]) s5 = pd.Series([0, 1, 4, 5]) @@ -383,13 +400,13 @@ Through the ``keys`` argument we can override the existing column names. .. 
ipython:: python - pd.concat([s3, s4, s5], axis=1, keys=['red', 'blue', 'yellow']) + pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"]) Let's consider a variation of the very first example presented: .. ipython:: python - result = pd.concat(frames, keys=['x', 'y', 'z']) + result = pd.concat(frames, keys=["x", "y", "z"]) .. ipython:: python :suppress: @@ -404,7 +421,7 @@ for the ``keys`` argument (unless other keys are specified): .. ipython:: python - pieces = {'x': df1, 'y': df2, 'z': df3} + pieces = {"x": df1, "y": df2, "z": df3} result = pd.concat(pieces) .. ipython:: python @@ -417,7 +434,7 @@ for the ``keys`` argument (unless other keys are specified): .. ipython:: python - result = pd.concat(pieces, keys=['z', 'y']) + result = pd.concat(pieces, keys=["z", "y"]) .. ipython:: python :suppress: @@ -439,9 +456,9 @@ do so using the ``levels`` argument: .. ipython:: python - result = pd.concat(pieces, keys=['x', 'y', 'z'], - levels=[['z', 'y', 'x', 'w']], - names=['group_key']) + result = pd.concat( + pieces, keys=["x", "y", "z"], levels=[["z", "y", "x", "w"]], names=["group_key"] + ) .. ipython:: python :suppress: @@ -469,7 +486,7 @@ append a single row to a ``DataFrame`` by passing a ``Series`` or dict to .. ipython:: python - s2 = pd.Series(['X0', 'X1', 'X2', 'X3'], index=['A', 'B', 'C', 'D']) + s2 = pd.Series(["X0", "X1", "X2", "X3"], index=["A", "B", "C", "D"]) result = df1.append(s2, ignore_index=True) .. ipython:: python @@ -488,8 +505,7 @@ You can also pass a list of dicts or Series: .. ipython:: python - dicts = [{'A': 1, 'B': 2, 'C': 3, 'X': 4}, - {'A': 5, 'B': 6, 'C': 7, 'Y': 8}] + dicts = [{"A": 1, "B": 2, "C": 3, "X": 4}, {"A": 5, "B": 6, "C": 7, "Y": 8}] result = df1.append(dicts, ignore_index=True, sort=False) .. ipython:: python @@ -619,14 +635,22 @@ key combination: .. 
ipython:: python - left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], - 'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3']}) + left = pd.DataFrame( + { + "key": ["K0", "K1", "K2", "K3"], + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + } + ) - right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], - 'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}) - result = pd.merge(left, right, on='key') + right = pd.DataFrame( + { + "key": ["K0", "K1", "K2", "K3"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + } + ) + result = pd.merge(left, right, on="key") .. ipython:: python :suppress: @@ -642,17 +666,25 @@ appearing in ``left`` and ``right`` are present (the intersection), since .. ipython:: python - left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'], - 'key2': ['K0', 'K1', 'K0', 'K1'], - 'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3']}) + left = pd.DataFrame( + { + "key1": ["K0", "K0", "K1", "K2"], + "key2": ["K0", "K1", "K0", "K1"], + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + } + ) - right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'], - 'key2': ['K0', 'K0', 'K0', 'K0'], - 'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}) + right = pd.DataFrame( + { + "key1": ["K0", "K1", "K1", "K2"], + "key2": ["K0", "K0", "K0", "K0"], + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + } + ) - result = pd.merge(left, right, on=['key1', 'key2']) + result = pd.merge(left, right, on=["key1", "key2"]) .. ipython:: python :suppress: @@ -678,7 +710,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python - result = pd.merge(left, right, how='left', on=['key1', 'key2']) + result = pd.merge(left, right, how="left", on=["key1", "key2"]) .. ipython:: python :suppress: @@ -690,7 +722,7 @@ either the left or right tables, the values in the joined table will be .. 
ipython:: python - result = pd.merge(left, right, how='right', on=['key1', 'key2']) + result = pd.merge(left, right, how="right", on=["key1", "key2"]) .. ipython:: python :suppress: @@ -701,7 +733,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python - result = pd.merge(left, right, how='outer', on=['key1', 'key2']) + result = pd.merge(left, right, how="outer", on=["key1", "key2"]) .. ipython:: python :suppress: @@ -713,7 +745,7 @@ either the left or right tables, the values in the joined table will be .. ipython:: python - result = pd.merge(left, right, how='inner', on=['key1', 'key2']) + result = pd.merge(left, right, how="inner", on=["key1", "key2"]) .. ipython:: python :suppress: @@ -741,18 +773,18 @@ as shown in the following example. ) ser - pd.merge(df, ser.reset_index(), on=['Let', 'Num']) + pd.merge(df, ser.reset_index(), on=["Let", "Num"]) Here is another example with duplicate join keys in DataFrames: .. ipython:: python - left = pd.DataFrame({'A': [1, 2], 'B': [2, 2]}) + left = pd.DataFrame({"A": [1, 2], "B": [2, 2]}) - right = pd.DataFrame({'A': [4, 5, 6], 'B': [2, 2, 2]}) + right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) - result = pd.merge(left, right, on='B', how='outer') + result = pd.merge(left, right, on="B", how="outer") .. ipython:: python :suppress: @@ -784,8 +816,8 @@ In the following example, there are duplicate values of ``B`` in the right .. ipython:: python - left = pd.DataFrame({'A' : [1,2], 'B' : [1, 2]}) - right = pd.DataFrame({'A' : [4,5,6], 'B': [2, 2, 2]}) + left = pd.DataFrame({"A": [1, 2], "B": [1, 2]}) + right = pd.DataFrame({"A": [4, 5, 6], "B": [2, 2, 2]}) .. code-block:: ipython @@ -799,7 +831,7 @@ ensure there are no duplicates in the left DataFrame, one can use the .. ipython:: python - pd.merge(left, right, on='B', how='outer', validate="one_to_many") + pd.merge(left, right, on="B", how="outer", validate="one_to_many") .. 
_merging.indicator: @@ -821,15 +853,15 @@ that takes on values: .. ipython:: python - df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']}) - df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]}) - pd.merge(df1, df2, on='col1', how='outer', indicator=True) + df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]}) + df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]}) + pd.merge(df1, df2, on="col1", how="outer", indicator=True) The ``indicator`` argument will also accept string arguments, in which case the indicator function will use the value of the passed string as the name for the indicator column. .. ipython:: python - pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column') + pd.merge(df1, df2, on="col1", how="outer", indicator="indicator_column") .. _merging.dtypes: @@ -841,25 +873,25 @@ Merging will preserve the dtype of the join keys. .. ipython:: python - left = pd.DataFrame({'key': [1], 'v1': [10]}) + left = pd.DataFrame({"key": [1], "v1": [10]}) left - right = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + right = pd.DataFrame({"key": [1, 2], "v1": [20, 30]}) right We are able to preserve the join keys: .. ipython:: python - pd.merge(left, right, how='outer') - pd.merge(left, right, how='outer').dtypes + pd.merge(left, right, how="outer") + pd.merge(left, right, how="outer").dtypes Of course if you have missing values that are introduced, then the resulting dtype will be upcast. .. ipython:: python - pd.merge(left, right, how='outer', on='key') - pd.merge(left, right, how='outer', on='key').dtypes + pd.merge(left, right, how="outer", on="key") + pd.merge(left, right, how="outer", on="key").dtypes Merging will preserve ``category`` dtypes of the mergands. See also the section on :ref:`categoricals `. @@ -869,12 +901,12 @@ The left frame. 
from pandas.api.types import CategoricalDtype - X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) - X = X.astype(CategoricalDtype(categories=['foo', 'bar'])) + X = pd.Series(np.random.choice(["foo", "bar"], size=(10,))) + X = X.astype(CategoricalDtype(categories=["foo", "bar"])) - left = pd.DataFrame({'X': X, - 'Y': np.random.choice(['one', 'two', 'three'], - size=(10,))}) + left = pd.DataFrame( + {"X": X, "Y": np.random.choice(["one", "two", "three"], size=(10,))} + ) left left.dtypes @@ -882,9 +914,12 @@ The right frame. .. ipython:: python - right = pd.DataFrame({'X': pd.Series(['foo', 'bar'], - dtype=CategoricalDtype(['foo', 'bar'])), - 'Z': [1, 2]}) + right = pd.DataFrame( + { + "X": pd.Series(["foo", "bar"], dtype=CategoricalDtype(["foo", "bar"])), + "Z": [1, 2], + } + ) right right.dtypes @@ -892,7 +927,7 @@ The merged result: .. ipython:: python - result = pd.merge(left, right, how='outer') + result = pd.merge(left, right, how="outer") result result.dtypes @@ -916,13 +951,13 @@ potentially differently-indexed ``DataFrames`` into a single result .. ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, - index=['K0', 'K1', 'K2']) + left = pd.DataFrame( + {"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=["K0", "K1", "K2"] + ) - right = pd.DataFrame({'C': ['C0', 'C2', 'C3'], - 'D': ['D0', 'D2', 'D3']}, - index=['K0', 'K2', 'K3']) + right = pd.DataFrame( + {"C": ["C0", "C2", "C3"], "D": ["D0", "D2", "D3"]}, index=["K0", "K2", "K3"] + ) result = left.join(right) @@ -936,7 +971,7 @@ potentially differently-indexed ``DataFrames`` into a single result .. ipython:: python - result = left.join(right, how='outer') + result = left.join(right, how="outer") .. ipython:: python :suppress: @@ -950,7 +985,7 @@ The same as above, but with ``how='inner'``. .. ipython:: python - result = left.join(right, how='inner') + result = left.join(right, how="inner") .. ipython:: python :suppress: @@ -966,7 +1001,7 @@ indexes: .. 
ipython:: python - result = pd.merge(left, right, left_index=True, right_index=True, how='outer') + result = pd.merge(left, right, left_index=True, right_index=True, how="outer") .. ipython:: python :suppress: @@ -978,7 +1013,7 @@ indexes: .. ipython:: python - result = pd.merge(left, right, left_index=True, right_index=True, how='inner'); + result = pd.merge(left, right, left_index=True, right_index=True, how="inner") .. ipython:: python :suppress: @@ -1008,15 +1043,17 @@ join key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key': ['K0', 'K1', 'K0', 'K1']}) + left = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "key": ["K0", "K1", "K0", "K1"], + } + ) - right = pd.DataFrame({'C': ['C0', 'C1'], - 'D': ['D0', 'D1']}, - index=['K0', 'K1']) + right = pd.DataFrame({"C": ["C0", "C1"], "D": ["D0", "D1"]}, index=["K0", "K1"]) - result = left.join(right, on='key') + result = left.join(right, on="key") .. ipython:: python :suppress: @@ -1028,8 +1065,7 @@ join key), using ``join`` may be more convenient. Here is a simple example: .. ipython:: python - result = pd.merge(left, right, left_on='key', right_index=True, - how='left', sort=False); + result = pd.merge(left, right, left_on="key", right_index=True, how="left", sort=False) .. ipython:: python :suppress: @@ -1045,22 +1081,27 @@ To join on multiple keys, the passed DataFrame must have a ``MultiIndex``: .. 
ipython:: python - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key1': ['K0', 'K0', 'K1', 'K2'], - 'key2': ['K0', 'K1', 'K0', 'K1']}) + left = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "key1": ["K0", "K0", "K1", "K2"], + "key2": ["K0", "K1", "K0", "K1"], + } + ) - index = pd.MultiIndex.from_tuples([('K0', 'K0'), ('K1', 'K0'), - ('K2', 'K0'), ('K2', 'K1')]) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=index) + index = pd.MultiIndex.from_tuples( + [("K0", "K0"), ("K1", "K0"), ("K2", "K0"), ("K2", "K1")] + ) + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=index + ) Now this can be joined by passing the two key column names: .. ipython:: python - result = left.join(right, on=['key1', 'key2']) + result = left.join(right, on=["key1", "key2"]) .. ipython:: python :suppress: @@ -1079,7 +1120,7 @@ easily performed: .. ipython:: python - result = left.join(right, on=['key1', 'key2'], how='inner') + result = left.join(right, on=["key1", "key2"], how="inner") .. ipython:: python :suppress: @@ -1149,39 +1190,38 @@ the left argument, as in this example: .. 
ipython:: python - leftindex = pd.MultiIndex.from_product([list('abc'), list('xy'), [1, 2]], - names=['abc', 'xy', 'num']) - left = pd.DataFrame({'v1': range(12)}, index=leftindex) + leftindex = pd.MultiIndex.from_product( + [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"] + ) + left = pd.DataFrame({"v1": range(12)}, index=leftindex) left - rightindex = pd.MultiIndex.from_product([list('abc'), list('xy')], - names=['abc', 'xy']) - right = pd.DataFrame({'v2': [100 * i for i in range(1, 7)]}, index=rightindex) + rightindex = pd.MultiIndex.from_product([list("abc"), list("xy")], names=["abc", "xy"]) + right = pd.DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex) right - left.join(right, on=['abc', 'xy'], how='inner') + left.join(right, on=["abc", "xy"], how="inner") If that condition is not satisfied, a join with two multi-indexes can be done using the following code. .. ipython:: python - leftindex = pd.MultiIndex.from_tuples([('K0', 'X0'), ('K0', 'X1'), - ('K1', 'X2')], - names=['key', 'X']) - left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], - 'B': ['B0', 'B1', 'B2']}, - index=leftindex) + leftindex = pd.MultiIndex.from_tuples( + [("K0", "X0"), ("K0", "X1"), ("K1", "X2")], names=["key", "X"] + ) + left = pd.DataFrame({"A": ["A0", "A1", "A2"], "B": ["B0", "B1", "B2"]}, index=leftindex) - rightindex = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'), - ('K2', 'Y2'), ('K2', 'Y3')], - names=['key', 'Y']) - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3']}, - index=rightindex) + rightindex = pd.MultiIndex.from_tuples( + [("K0", "Y0"), ("K1", "Y1"), ("K2", "Y2"), ("K2", "Y3")], names=["key", "Y"] + ) + right = pd.DataFrame( + {"C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"]}, index=rightindex + ) - result = pd.merge(left.reset_index(), right.reset_index(), - on=['key'], how='inner').set_index(['key', 'X', 'Y']) + result = pd.merge( + left.reset_index(), right.reset_index(), on=["key"], 
how="inner" + ).set_index(["key", "X", "Y"]) .. ipython:: python :suppress: @@ -1203,21 +1243,29 @@ resetting indexes. .. ipython:: python - left_index = pd.Index(['K0', 'K0', 'K1', 'K2'], name='key1') + left_index = pd.Index(["K0", "K0", "K1", "K2"], name="key1") - left = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'], - 'B': ['B0', 'B1', 'B2', 'B3'], - 'key2': ['K0', 'K1', 'K0', 'K1']}, - index=left_index) + left = pd.DataFrame( + { + "A": ["A0", "A1", "A2", "A3"], + "B": ["B0", "B1", "B2", "B3"], + "key2": ["K0", "K1", "K0", "K1"], + }, + index=left_index, + ) - right_index = pd.Index(['K0', 'K1', 'K2', 'K2'], name='key1') + right_index = pd.Index(["K0", "K1", "K2", "K2"], name="key1") - right = pd.DataFrame({'C': ['C0', 'C1', 'C2', 'C3'], - 'D': ['D0', 'D1', 'D2', 'D3'], - 'key2': ['K0', 'K0', 'K0', 'K1']}, - index=right_index) + right = pd.DataFrame( + { + "C": ["C0", "C1", "C2", "C3"], + "D": ["D0", "D1", "D2", "D3"], + "key2": ["K0", "K0", "K0", "K1"], + }, + index=right_index, + ) - result = left.merge(right, on=['key1', 'key2']) + result = left.merge(right, on=["key1", "key2"]) .. ipython:: python :suppress: @@ -1254,10 +1302,10 @@ columns: .. ipython:: python - left = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'v': [1, 2, 3]}) - right = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'v': [4, 5, 6]}) + left = pd.DataFrame({"k": ["K0", "K1", "K2"], "v": [1, 2, 3]}) + right = pd.DataFrame({"k": ["K0", "K0", "K3"], "v": [4, 5, 6]}) - result = pd.merge(left, right, on='k') + result = pd.merge(left, right, on="k") .. ipython:: python :suppress: @@ -1269,7 +1317,7 @@ columns: .. ipython:: python - result = pd.merge(left, right, on='k', suffixes=('_l', '_r')) + result = pd.merge(left, right, on="k", suffixes=("_l", "_r")) .. ipython:: python :suppress: @@ -1284,9 +1332,9 @@ similarly. .. 
ipython:: python - left = left.set_index('k') - right = right.set_index('k') - result = left.join(right, lsuffix='_l', rsuffix='_r') + left = left.set_index("k") + right = right.set_index("k") + result = left.join(right, lsuffix="_l", rsuffix="_r") .. ipython:: python :suppress: @@ -1306,7 +1354,7 @@ to join them together on their indexes. .. ipython:: python - right2 = pd.DataFrame({'v': [7, 8, 9]}, index=['K1', 'K1', 'K2']) + right2 = pd.DataFrame({"v": [7, 8, 9]}, index=["K1", "K1", "K2"]) result = left.join([right, right2]) .. ipython:: python @@ -1328,10 +1376,8 @@ one object from values for matching indices in the other. Here is an example: .. ipython:: python - df1 = pd.DataFrame([[np.nan, 3., 5.], [-4.6, np.nan, np.nan], - [np.nan, 7., np.nan]]) - df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5., 1.6, 4]], - index=[1, 2]) + df1 = pd.DataFrame([[np.nan, 3.0, 5.0], [-4.6, np.nan, np.nan], [np.nan, 7.0, np.nan]]) + df2 = pd.DataFrame([[-42.6, np.nan, -8.2], [-5.0, 1.6, 4]], index=[1, 2]) For this, use the :meth:`~DataFrame.combine_first` method: @@ -1384,14 +1430,13 @@ fill/interpolate missing data: .. ipython:: python - left = pd.DataFrame({'k': ['K0', 'K1', 'K1', 'K2'], - 'lv': [1, 2, 3, 4], - 's': ['a', 'b', 'c', 'd']}) + left = pd.DataFrame( + {"k": ["K0", "K1", "K1", "K2"], "lv": [1, 2, 3, 4], "s": ["a", "b", "c", "d"]} + ) - right = pd.DataFrame({'k': ['K1', 'K2', 'K4'], - 'rv': [1, 2, 3]}) + right = pd.DataFrame({"k": ["K1", "K2", "K4"], "rv": [1, 2, 3]}) - pd.merge_ordered(left, right, fill_method='ffill', left_by='s') + pd.merge_ordered(left, right, fill_method="ffill", left_by="s") .. _merging.merge_asof: @@ -1411,37 +1456,44 @@ merge them. .. 
ipython:: python - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', - 'MSFT', 'GOOG', 'AAPL', 'GOOG', - 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ), + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + }, + columns=["time", "ticker", "bid", "ask"], + ) .. 
ipython:: python @@ -1452,18 +1504,13 @@ By default we are taking the asof of the quotes. .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker') + pd.merge_asof(trades, quotes, on="time", by="ticker") We only asof within ``2ms`` between the quote time and the trade time. .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('2ms')) + pd.merge_asof(trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")) We only asof within ``10ms`` between the quote time and the trade time and we exclude exact matches on time. Note that though we exclude the exact matches @@ -1471,11 +1518,14 @@ exclude exact matches on time. Note that though we exclude the exact matches .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker', - tolerance=pd.Timedelta('10ms'), - allow_exact_matches=False) + pd.merge_asof( + trades, + quotes, + on="time", + by="ticker", + tolerance=pd.Timedelta("10ms"), + allow_exact_matches=False, + ) .. _merging.compare: @@ -1496,7 +1546,7 @@ side by side. { "col1": ["a", "a", "b", "b", "a"], "col2": [1.0, 2.0, 3.0, np.nan, 5.0], - "col3": [1.0, 2.0, 3.0, 4.0, 5.0] + "col3": [1.0, 2.0, 3.0, 4.0, 5.0], }, columns=["col1", "col2", "col3"], ) @@ -1505,8 +1555,8 @@ side by side. .. ipython:: python df2 = df.copy() - df2.loc[0, 'col1'] = 'c' - df2.loc[2, 'col3'] = 4.0 + df2.loc[0, "col1"] = "c" + df2.loc[2, "col3"] = 4.0 df2 .. 
ipython:: python From ddf3bbad89268a833a497f173f4dfa00e8fcbb94 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 15:29:39 -0700 Subject: [PATCH 0987/1025] CLN: cleanups in DataFrame._reduce (#36674) --- pandas/core/frame.py | 50 ++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 30 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 13443cc3befd3..1f9987d9d3f5b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -638,10 +638,10 @@ def _can_fast_transpose(self) -> bool: """ Can we transpose this DataFrame without creating any new array objects. """ - if self._data.any_extension_types: + if self._mgr.any_extension_types: # TODO(EA2D) special case would be unnecessary with 2D EAs return False - return len(self._data.blocks) == 1 + return len(self._mgr.blocks) == 1 # ---------------------------------------------------------------------- # Rendering Methods @@ -2879,7 +2879,7 @@ def _get_column_array(self, i: int) -> ArrayLike: Get the values of the i'th column (ndarray or ExtensionArray, as stored in the Block) """ - return self._data.iget_values(i) + return self._mgr.iget_values(i) def _iter_column_arrays(self) -> Iterator[ArrayLike]: """ @@ -4911,7 +4911,7 @@ def _maybe_casted_values(index, labels=None): @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) def isna(self) -> DataFrame: - result = self._constructor(self._data.isna(func=isna)) + result = self._constructor(self._mgr.isna(func=isna)) return result.__finalize__(self, method="isna") @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) @@ -8575,6 +8575,7 @@ def _reduce( ): assert filter_type is None or filter_type == "bool", filter_type + out_dtype = "bool" if filter_type == "bool" else None dtype_is_dt = np.array( [ @@ -8594,10 +8595,9 @@ def _reduce( cols = self.columns[~dtype_is_dt] self = self[cols] - # TODO: Make other agg func handle axis=None properly + # TODO: Make other agg func handle axis=None properly GH#21597 axis 
= self._get_axis_number(axis) labels = self._get_agg_axis(axis) - constructor = self._constructor assert axis in [0, 1] def func(values): @@ -8606,18 +8606,19 @@ def func(values): else: return op(values, axis=axis, skipna=skipna, **kwds) + def blk_func(values): + if isinstance(values, ExtensionArray): + return values._reduce(name, skipna=skipna, **kwds) + else: + return op(values, axis=1, skipna=skipna, **kwds) + def _get_data() -> DataFrame: if filter_type is None: data = self._get_numeric_data() - elif filter_type == "bool": + else: # GH#25101, GH#24434 + assert filter_type == "bool" data = self._get_bool_data() - else: # pragma: no cover - msg = ( - f"Generating numeric_only data with filter_type {filter_type} " - "not supported." - ) - raise NotImplementedError(msg) return data if numeric_only is not None: @@ -8628,14 +8629,6 @@ def _get_data() -> DataFrame: df = df.T axis = 0 - out_dtype = "bool" if filter_type == "bool" else None - - def blk_func(values): - if isinstance(values, ExtensionArray): - return values._reduce(name, skipna=skipna, **kwds) - else: - return op(values, axis=1, skipna=skipna, **kwds) - # After possibly _get_data and transposing, we are now in the # simple case where we can use BlockManager.reduce res = df._mgr.reduce(blk_func) @@ -8651,11 +8644,10 @@ def blk_func(values): if not self._is_homogeneous_type or self._mgr.any_extension_types: # try to avoid self.values call - if filter_type is None and axis == 0 and len(self) > 0: + if filter_type is None and axis == 0: # operate column-wise # numeric_only must be None here, as other cases caught above - # require len(self) > 0 bc frame_apply messes up empty prod/sum # this can end up with a non-reduction # but not always. 
if the types are mixed @@ -8691,19 +8683,17 @@ def blk_func(values): with np.errstate(all="ignore"): result = func(values) - if is_object_dtype(result.dtype): + if filter_type == "bool" and notna(result).all(): + result = result.astype(np.bool_) + elif filter_type is None and is_object_dtype(result.dtype): try: - if filter_type is None: - result = result.astype(np.float64) - elif filter_type == "bool" and notna(result).all(): - result = result.astype(np.bool_) + result = result.astype(np.float64) except (ValueError, TypeError): # try to coerce to the original dtypes item by item if we can if axis == 0: result = coerce_to_dtypes(result, data.dtypes) - if constructor is not None: - result = self._constructor_sliced(result, index=labels) + result = self._constructor_sliced(result, index=labels) return result def nunique(self, axis=0, dropna=True) -> Series: From e470c34faf1c6e6a0c3fc296f961bfb6c80c0815 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 15:39:16 -0700 Subject: [PATCH 0988/1025] DOC: remove outdated doc closes #31487 (#36797) --- doc/source/user_guide/dsintro.rst | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 0e6767e88edc2..c27c73d439a0c 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -663,31 +663,6 @@ row-wise. For example: df - df.iloc[0] -In the special case of working with time series data, if the DataFrame index -contains dates, the broadcasting will be column-wise: - -.. ipython:: python - :okwarning: - - index = pd.date_range('1/1/2000', periods=8) - df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=list('ABC')) - df - type(df['A']) - df - df['A'] - -.. warning:: - - .. code-block:: python - - df - df['A'] - - is now deprecated and will be removed in a future release. The preferred way - to replicate this behavior is - - .. 
code-block:: python - - df.sub(df['A'], axis=0) - For explicit control over the matching and broadcasting behavior, see the section on :ref:`flexible binary operations `. From 8a1dc7ade6efcb691983254373dfa241e9e77fe7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 15:43:23 -0700 Subject: [PATCH 0989/1025] BUG: Categorical setitem, comparison with tuple category (#36623) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arrays/categorical.py | 11 +++++++---- pandas/tests/arrays/categorical/test_indexing.py | 10 +++++++++- pandas/tests/arrays/categorical/test_operators.py | 14 ++++++++++++++ 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 25fac48397c68..6d1196b783f74 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -293,7 +293,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - :meth:`Categorical.fillna` will always return a copy, will validate a passed fill value regardless of whether there are any NAs to fill, and will disallow a ``NaT`` as a fill value for numeric categories (:issue:`36530`) -- +- Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) - Datetimelike diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 9db22df20e66d..1a8861af10ed1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -28,6 +28,7 @@ is_dict_like, is_dtype_equal, is_extension_array_dtype, + is_hashable, is_integer_dtype, is_list_like, is_object_dtype, @@ -62,8 +63,9 @@ def _cat_compare_op(op): @unpack_zerodim_and_defer(opname) def func(self, other): - if is_list_like(other) and len(other) != len(self): - # TODO: Could this fail if the categories are listlike objects? 
+ hashable = is_hashable(other) + if is_list_like(other) and len(other) != len(self) and not hashable: + # in hashable case we may have a tuple that is itself a category raise ValueError("Lengths must match.") if not self.ordered: @@ -91,7 +93,7 @@ def func(self, other): ret[mask] = fill_value return ret - if is_scalar(other): + if hashable: if other in self.categories: i = self._unbox_scalar(other) ret = op(self._codes, i) @@ -1885,7 +1887,8 @@ def _validate_setitem_value(self, value): new_codes = self._validate_listlike(value) value = Categorical.from_codes(new_codes, dtype=self.dtype) - rvalue = value if is_list_like(value) else [value] + # wrap scalars and hashable-listlikes in list + rvalue = value if not is_hashable(value) else [value] from pandas import Index diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py index ab8606ef9258d..2c4dd8fe64057 100644 --- a/pandas/tests/arrays/categorical/test_indexing.py +++ b/pandas/tests/arrays/categorical/test_indexing.py @@ -75,7 +75,7 @@ def test_setitem_different_unordered_raises(self, other): pd.Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True), ], ) - def test_setitem_same_ordered_rasies(self, other): + def test_setitem_same_ordered_raises(self, other): # Gh-24142 target = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=True) mask = np.array([True, False]) @@ -83,6 +83,14 @@ def test_setitem_same_ordered_rasies(self, other): with pytest.raises(ValueError, match=msg): target[mask] = other[mask] + def test_setitem_tuple(self): + # GH#20439 + cat = pd.Categorical([(0, 1), (0, 2), (0, 1)]) + + # This should not raise + cat[1] = cat[0] + assert cat[1] == (0, 1) + class TestCategoricalIndexing: def test_getitem_slice(self): diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index 34194738bf4ab..ed70417523491 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ 
b/pandas/tests/arrays/categorical/test_operators.py @@ -179,6 +179,20 @@ def test_comparison_with_unknown_scalars(self): tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False])) tm.assert_numpy_array_equal(cat != 4, np.array([True, True, True])) + def test_comparison_with_tuple(self): + cat = pd.Categorical(np.array(["foo", (0, 1), 3, (0, 1)], dtype=object)) + + result = cat == "foo" + expected = np.array([True, False, False, False], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + result = cat == (0, 1) + expected = np.array([False, True, False, True], dtype=bool) + tm.assert_numpy_array_equal(result, expected) + + result = cat != (0, 1) + tm.assert_numpy_array_equal(result, ~expected) + def test_comparison_of_ordered_categorical_with_nan_to_scalar( self, compare_operators_no_eq_ne ): From 3ea466b70932c6fd9e67bb30e5b6e6324b190b9a Mon Sep 17 00:00:00 2001 From: Amanda Dsouza Date: Sat, 3 Oct 2020 04:29:01 +0530 Subject: [PATCH 0990/1025] [MRG] TST: Added test for groupby apply datetimeindex fix (#36671) * TST: Added test for groupby apply datetimeindex fix * TST: Moved test to similar tests --- pandas/tests/groupby/test_apply.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index db5c4af9c6f53..176efdb6204da 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -681,6 +681,23 @@ def test_apply_aggregating_timedelta_and_datetime(): tm.assert_frame_equal(result, expected) +def test_apply_groupby_datetimeindex(): + # GH 26182 + # groupby apply failed on dataframe with DatetimeIndex + + data = [["A", 10], ["B", 20], ["B", 30], ["C", 40], ["C", 50]] + df = pd.DataFrame( + data, columns=["Name", "Value"], index=pd.date_range("2020-09-01", "2020-09-05") + ) + + result = df.groupby("Name").sum() + + expected = pd.DataFrame({"Name": ["A", "B", "C"], "Value": [10, 50, 90]}) + expected.set_index("Name", inplace=True) 
+ + tm.assert_frame_equal(result, expected) + + def test_time_field_bug(): # Test a fix for the following error related to GH issue 11324 When # non-key fields in a group-by dataframe contained time-based fields From da4e889f2e73a2793f29de8d5da7756696d376f4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Fri, 2 Oct 2020 19:02:01 -0400 Subject: [PATCH 0991/1025] CLN/TYP: aggregation methods in core.base (#36677) --- pandas/core/base.py | 52 +++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index a50181c1be2f0..9e6f93b656af8 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -4,11 +4,12 @@ import builtins import textwrap -from typing import Any, Callable, Dict, FrozenSet, Optional, Union +from typing import Any, Callable, Dict, FrozenSet, List, Optional, Union, cast import numpy as np import pandas._libs.lib as lib +from pandas._typing import AggFuncType, AggFuncTypeBase, Label from pandas.compat import PYPY from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError @@ -278,7 +279,7 @@ def _try_aggregate_string_function(self, arg: str, *args, **kwargs): f"'{arg}' is not a valid function for '{type(self).__name__}' object" ) - def _aggregate(self, arg, *args, **kwargs): + def _aggregate(self, arg: AggFuncType, *args, **kwargs): """ provide an implementation for the aggregators @@ -311,13 +312,13 @@ def _aggregate(self, arg, *args, **kwargs): if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") - obj = self._selected_obj + selected_obj = self._selected_obj # if we have a dict of any non-scalars # eg. 
{'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in arg.values()): - new_arg = {} + new_arg: Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]] = {} for k, v in arg.items(): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] @@ -336,9 +337,12 @@ def _aggregate(self, arg, *args, **kwargs): # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): raise SpecificationError("nested renamer is not supported") - elif isinstance(obj, ABCSeries): + elif isinstance(selected_obj, ABCSeries): raise SpecificationError("nested renamer is not supported") - elif isinstance(obj, ABCDataFrame) and k not in obj.columns: + elif ( + isinstance(selected_obj, ABCDataFrame) + and k not in selected_obj.columns + ): raise KeyError(f"Column '{k}' does not exist!") arg = new_arg @@ -347,10 +351,12 @@ def _aggregate(self, arg, *args, **kwargs): # deprecation of renaming keys # GH 15931 keys = list(arg.keys()) - if isinstance(obj, ABCDataFrame) and len( - obj.columns.intersection(keys) + if isinstance(selected_obj, ABCDataFrame) and len( + selected_obj.columns.intersection(keys) ) != len(keys): - cols = sorted(set(keys) - set(obj.columns.intersection(keys))) + cols = sorted( + set(keys) - set(selected_obj.columns.intersection(keys)) + ) raise SpecificationError(f"Column(s) {cols} do not exist") from pandas.core.reshape.concat import concat @@ -370,7 +376,7 @@ def _agg_2dim(how): """ aggregate a 2-dim with how """ - colg = self._gotitem(self._selection, ndim=2, subset=obj) + colg = self._gotitem(self._selection, ndim=2, subset=selected_obj) return colg.aggregate(how) def _agg(arg, func): @@ -385,7 +391,6 @@ def _agg(arg, func): # set the final keys keys = list(arg.keys()) - result = {} if self._selection is not None: @@ -473,7 +478,11 @@ def is_any_frame() -> bool: # we have a dict of scalars # GH 36212 use name only if self is a series - name = self.name if (self.ndim == 1) else None + if self.ndim == 1: + self = cast("Series", self) + name = 
self.name + else: + name = None result = Series(result, name=name) @@ -484,9 +493,10 @@ def is_any_frame() -> bool: else: result = None - f = self._get_cython_func(arg) - if f and not args and not kwargs: - return getattr(self, f)(), None + if callable(arg): + f = self._get_cython_func(arg) + if f and not args and not kwargs: + return getattr(self, f)(), None # caller can react return result, True @@ -498,17 +508,17 @@ def _aggregate_multiple_funcs(self, arg, _axis): raise NotImplementedError("axis other than 0 is not supported") if self._selected_obj.ndim == 1: - obj = self._selected_obj + selected_obj = self._selected_obj else: - obj = self._obj_with_exclusions + selected_obj = self._obj_with_exclusions results = [] keys = [] # degenerate case - if obj.ndim == 1: + if selected_obj.ndim == 1: for a in arg: - colg = self._gotitem(obj.name, ndim=1, subset=obj) + colg = self._gotitem(selected_obj.name, ndim=1, subset=selected_obj) try: new_res = colg.aggregate(a) @@ -523,8 +533,8 @@ def _aggregate_multiple_funcs(self, arg, _axis): # multiples else: - for index, col in enumerate(obj): - colg = self._gotitem(col, ndim=1, subset=obj.iloc[:, index]) + for index, col in enumerate(selected_obj): + colg = self._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: new_res = colg.aggregate(arg) except (TypeError, DataError): From 25b8fbb02838de9b31f070f47422a6ee4779b999 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sat, 3 Oct 2020 00:02:45 +0100 Subject: [PATCH 0992/1025] REGR: Series.loc with a MultiIndex containing Timestamp raises InvalidIndexError (#36675) --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/indexing.py | 2 +- pandas/tests/indexing/multiindex/test_loc.py | 12 ++++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 15777abcb8084..acf1dafc59885 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -37,6 +37,7 @@ Fixed 
regressions - Fixed regression in modulo of :class:`Index`, :class:`Series` and :class:`DataFrame` using ``numexpr`` using C not Python semantics (:issue:`36047`, :issue:`36526`) - Fixed regression in :meth:`read_excel` with ``engine="odf"`` caused ``UnboundLocalError`` in some cases where cells had nested child nodes (:issue:`36122`, :issue:`35802`) - Fixed regression in :meth:`DataFrame.replace` inconsistent replace when using a float in the replace method (:issue:`35376`) +- Fixed regression in :meth:`Series.loc` on a :class:`Series` with a :class:`MultiIndex` containing :class:`Timestamp` raising ``InvalidIndexError`` (:issue:`35858`) - Fixed regression in :class:`DataFrame` and :class:`Series` comparisons between numeric arrays and strings (:issue:`35700`, :issue:`36377`) - Fixed regression in :meth:`DataFrame.apply` with ``raw=True`` and user-function returning string (:issue:`35940`) - Fixed regression when setting empty :class:`DataFrame` column to a :class:`Series` in preserving name of index in frame (:issue:`36527`) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index fc1b9bee9ba03..7b4b779e80481 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1076,7 +1076,7 @@ def _handle_lowerdim_multi_index_axis0(self, tup: Tuple): try: # fast path for series or for tup devoid of slices return self._get_label(tup, axis=axis) - except TypeError: + except (TypeError, InvalidIndexError): # slices are unhashable pass except KeyError as ek: diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py index 63983f45d7832..1b659bec0e9e8 100644 --- a/pandas/tests/indexing/multiindex/test_loc.py +++ b/pandas/tests/indexing/multiindex/test_loc.py @@ -493,6 +493,18 @@ def test_loc_datetime_mask_slicing(): tm.assert_series_equal(result, expected) +def test_loc_datetime_series_tuple_slicing(): + # https://github.com/pandas-dev/pandas/issues/35858 + date = pd.Timestamp("2000") + ser = pd.Series( + 1, 
+ index=pd.MultiIndex.from_tuples([("a", date)], names=["a", "b"]), + name="c", + ) + result = ser.loc[:, [date]] + tm.assert_series_equal(result, ser) + + def test_loc_with_mi_indexer(): # https://github.com/pandas-dev/pandas/issues/35351 df = DataFrame( From c5070e5b16bc968c07543b1cab59ebe908074f10 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Sat, 3 Oct 2020 00:06:53 +0100 Subject: [PATCH 0993/1025] CI, CLN remove unnecessary noqa statements, add CI check (#36707) --- .pre-commit-config.yaml | 4 ++++ asv_bench/benchmarks/pandas_vb_common.py | 2 +- asv_bench/benchmarks/tslibs/offsets.py | 2 +- doc/source/conf.py | 10 +++++----- pandas/_typing.py | 12 ++++++------ pandas/api/types/__init__.py | 2 +- pandas/conftest.py | 2 +- pandas/core/arrays/floating.py | 4 ++-- pandas/io/common.py | 2 +- pandas/io/formats/console.py | 4 ++-- pandas/io/formats/info.py | 2 +- pandas/io/json/_table_schema.py | 2 +- pandas/io/pytables.py | 2 +- pandas/io/sas/sasreader.py | 2 +- pandas/plotting/_matplotlib/__init__.py | 2 +- pandas/plotting/_matplotlib/timeseries.py | 2 +- pandas/plotting/_matplotlib/tools.py | 2 +- pandas/tests/arrays/categorical/test_constructors.py | 2 +- pandas/tests/arrays/categorical/test_repr.py | 6 +++--- pandas/tests/arrays/sparse/test_libsparse.py | 2 +- pandas/tests/computation/test_eval.py | 8 ++++---- pandas/tests/dtypes/test_inference.py | 2 +- pandas/tests/frame/indexing/test_indexing.py | 2 +- pandas/tests/frame/test_block_internals.py | 2 +- pandas/tests/frame/test_query_eval.py | 4 +--- pandas/tests/indexes/categorical/test_formats.py | 6 +++--- pandas/tests/indexes/multi/test_formats.py | 2 +- pandas/tests/indexes/multi/test_sorting.py | 2 +- pandas/tests/indexing/test_callable.py | 12 ++++++------ pandas/tests/io/formats/test_style.py | 2 +- pandas/tests/io/json/test_pandas.py | 2 +- pandas/tests/io/parser/test_index_col.py | 2 +- pandas/tests/io/pytables/test_store.py | 8 ++++---- pandas/tests/io/test_feather.py | 2 +- 
pandas/tests/io/test_gcs.py | 10 +++++----- pandas/tests/io/test_parquet.py | 10 +++++----- pandas/tests/plotting/test_frame.py | 2 +- pandas/tests/scalar/timestamp/test_constructors.py | 2 +- pandas/tests/series/test_repr.py | 2 +- pandas/tests/test_common.py | 2 +- pandas/tests/test_downstream.py | 8 ++++---- pandas/tests/test_errors.py | 2 +- scripts/validate_docstrings.py | 8 ++++---- setup.py | 4 ++-- 44 files changed, 88 insertions(+), 86 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7f669ee77c3eb..d0c9f12614d0d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -43,3 +43,7 @@ repos: entry: python -m scripts.generate_pip_deps_from_conda files: ^(environment.yml|requirements-dev.txt)$ pass_filenames: false +- repo: https://github.com/asottile/yesqa + rev: v1.2.2 + hooks: + - id: yesqa diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 23286343d7367..7bd4d639633b3 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -15,7 +15,7 @@ # Compatibility import for the testing module try: - import pandas._testing as tm # noqa + import pandas._testing as tm except ImportError: import pandas.util.testing as tm # noqa diff --git a/asv_bench/benchmarks/tslibs/offsets.py b/asv_bench/benchmarks/tslibs/offsets.py index fc1efe63307b2..0aea8332398b1 100644 --- a/asv_bench/benchmarks/tslibs/offsets.py +++ b/asv_bench/benchmarks/tslibs/offsets.py @@ -9,7 +9,7 @@ from pandas import offsets try: - import pandas.tseries.holiday # noqa + import pandas.tseries.holiday except ImportError: pass diff --git a/doc/source/conf.py b/doc/source/conf.py index 04540f7e6ec95..15e7a13ff5b72 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -146,7 +146,7 @@ # built documents. # # The short X.Y version. 
-import pandas # noqa: E402 isort:skip +import pandas # isort:skip # version = '%s r%s' % (pandas.__version__, svn_version()) version = str(pandas.__version__) @@ -441,14 +441,14 @@ # Add custom Documenter to handle attributes/methods of an AccessorProperty # eg pandas.Series.str and pandas.Series.dt (see GH9322) -import sphinx # noqa: E402 isort:skip -from sphinx.util import rpartition # noqa: E402 isort:skip -from sphinx.ext.autodoc import ( # noqa: E402 isort:skip +import sphinx # isort:skip +from sphinx.util import rpartition # isort:skip +from sphinx.ext.autodoc import ( # isort:skip AttributeDocumenter, Documenter, MethodDocumenter, ) -from sphinx.ext.autosummary import Autosummary # noqa: E402 isort:skip +from sphinx.ext.autosummary import Autosummary # isort:skip class AccessorDocumenter(MethodDocumenter): diff --git a/pandas/_typing.py b/pandas/_typing.py index 16d81c0d39cbe..7678d1bf12d8b 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -27,16 +27,16 @@ # and use a string literal forward reference to it in subsequent types # https://mypy.readthedocs.io/en/latest/common_issues.html#import-cycles if TYPE_CHECKING: - from pandas._libs import Period, Timedelta, Timestamp # noqa: F401 + from pandas._libs import Period, Timedelta, Timestamp - from pandas.core.dtypes.dtypes import ExtensionDtype # noqa: F401 + from pandas.core.dtypes.dtypes import ExtensionDtype - from pandas import Interval # noqa: F401 + from pandas import Interval from pandas.core.arrays.base import ExtensionArray # noqa: F401 - from pandas.core.frame import DataFrame # noqa: F401 + from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame # noqa: F401 - from pandas.core.indexes.base import Index # noqa: F401 - from pandas.core.series import Series # noqa: F401 + from pandas.core.indexes.base import Index + from pandas.core.series import Series # array-like diff --git a/pandas/api/types/__init__.py b/pandas/api/types/__init__.py index 
3495b493707c2..fb1abdd5b18ec 100644 --- a/pandas/api/types/__init__.py +++ b/pandas/api/types/__init__.py @@ -4,7 +4,7 @@ from pandas._libs.lib import infer_dtype -from pandas.core.dtypes.api import * # noqa: F403, F401 +from pandas.core.dtypes.api import * # noqa: F401, F403 from pandas.core.dtypes.concat import union_categoricals from pandas.core.dtypes.dtypes import ( CategoricalDtype, diff --git a/pandas/conftest.py b/pandas/conftest.py index 3865d287c6905..5ac5e3670f69f 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1226,7 +1226,7 @@ def ip(): from IPython.core.interactiveshell import InteractiveShell # GH#35711 make sure sqlite history file handle is not leaked - from traitlets.config import Config # noqa: F401 isort:skip + from traitlets.config import Config # isort:skip c = Config() c.HistoryManager.hist_file = ":memory:" diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index c3710196a8611..33659fe2f397d 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -34,7 +34,7 @@ from .masked import BaseMaskedArray, BaseMaskedDtype if TYPE_CHECKING: - import pyarrow # noqa: F401 + import pyarrow class FloatingDtype(BaseMaskedDtype): @@ -82,7 +82,7 @@ def __from_arrow__( """ Construct FloatingArray from pyarrow Array/ChunkedArray. 
""" - import pyarrow # noqa: F811 + import pyarrow from pandas.core.arrays._arrow_utils import pyarrow_array_to_numpy_and_mask diff --git a/pandas/io/common.py b/pandas/io/common.py index f177e08ac0089..c147ae9fd0aa8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -53,7 +53,7 @@ if TYPE_CHECKING: - from io import IOBase # noqa: F401 + from io import IOBase def is_url(url) -> bool: diff --git a/pandas/io/formats/console.py b/pandas/io/formats/console.py index bed29e1fd4792..50e69f7e8b435 100644 --- a/pandas/io/formats/console.py +++ b/pandas/io/formats/console.py @@ -69,7 +69,7 @@ def check_main(): return not hasattr(main, "__file__") or get_option("mode.sim_interactive") try: - return __IPYTHON__ or check_main() # noqa + return __IPYTHON__ or check_main() except NameError: return check_main() @@ -83,7 +83,7 @@ def in_ipython_frontend(): bool """ try: - ip = get_ipython() # noqa + ip = get_ipython() return "zmq" in str(type(ip)).lower() except NameError: pass diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index e8e41d4325103..5c6ce23707781 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -12,7 +12,7 @@ from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: - from pandas.core.series import Series # noqa: F401 + from pandas.core.series import Series def _put_str(s: Union[str, Dtype], space: int) -> str: diff --git a/pandas/io/json/_table_schema.py b/pandas/io/json/_table_schema.py index 84146a5d732e1..2b4c86b3c4406 100644 --- a/pandas/io/json/_table_schema.py +++ b/pandas/io/json/_table_schema.py @@ -26,7 +26,7 @@ import pandas.core.common as com if TYPE_CHECKING: - from pandas.core.indexes.multi import MultiIndex # noqa: F401 + from pandas.core.indexes.multi import MultiIndex loads = json.loads diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d0ea327a65a1d..a3d6975c00a95 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -57,7 +57,7 @@ from 
pandas.io.formats.printing import adjoin, pprint_thing if TYPE_CHECKING: - from tables import Col, File, Node # noqa:F401 + from tables import Col, File, Node # versioning attribute diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 893a6286f74d4..caf53b5be971a 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -9,7 +9,7 @@ from pandas.io.common import get_filepath_or_buffer, stringify_path if TYPE_CHECKING: - from pandas import DataFrame # noqa: F401 + from pandas import DataFrame # TODO(PY38): replace with Protocol in Python 3.8 diff --git a/pandas/plotting/_matplotlib/__init__.py b/pandas/plotting/_matplotlib/__init__.py index 27b1d55fe1bd6..33011e6a66cac 100644 --- a/pandas/plotting/_matplotlib/__init__.py +++ b/pandas/plotting/_matplotlib/__init__.py @@ -29,7 +29,7 @@ from pandas.plotting._matplotlib.tools import table if TYPE_CHECKING: - from pandas.plotting._matplotlib.core import MPLPlot # noqa: F401 + from pandas.plotting._matplotlib.core import MPLPlot PLOT_CLASSES: Dict[str, Type["MPLPlot"]] = { "line": LinePlot, diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py index f8faac6a6a026..64cd43c230f28 100644 --- a/pandas/plotting/_matplotlib/timeseries.py +++ b/pandas/plotting/_matplotlib/timeseries.py @@ -26,7 +26,7 @@ if TYPE_CHECKING: from matplotlib.axes import Axes - from pandas import Index, Series # noqa:F401 + from pandas import Index, Series # --------------------------------------------------------------------- # Plotting functions and monkey patches diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index aed0c360fc7ce..832957dd73ec7 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -17,7 +17,7 @@ if TYPE_CHECKING: from matplotlib.axes import Axes from matplotlib.axis import Axis - from matplotlib.lines import Line2D # noqa:F401 + from matplotlib.lines import Line2D 
from matplotlib.table import Table diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index e200f13652a84..1eef86980f974 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -213,7 +213,7 @@ def test_constructor(self): # - when the first is an integer dtype and the second is not # - when the resulting codes are all -1/NaN with tm.assert_produces_warning(None): - c_old = Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) # noqa + c_old = Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"]) with tm.assert_produces_warning(None): c_old = Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5]) # noqa diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index 735b062eae80e..e23fbb16190ea 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -320,7 +320,7 @@ def test_categorical_repr_timedelta(self): c = Categorical(idx.append(idx), categories=idx) exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] -Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" # noqa +Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" assert repr(c) == exp @@ -347,13 +347,13 @@ def test_categorical_repr_timedelta_ordered(self): idx = timedelta_range("1 days", periods=5) c = Categorical(idx, ordered=True) exp = """[1 days, 2 days, 3 days, 4 days, 5 days] -Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" assert repr(c) == exp c = Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[1 days, 2 days, 3 days, 4 days, 5 days, 1 days, 2 days, 3 days, 4 days, 5 days] -Categories (5, timedelta64[ns]): [1 days < 2 days < 
3 days < 4 days < 5 days]""" # noqa +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" assert repr(c) == exp diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py index 2d6e657debdb2..517dc4a2c3d8b 100644 --- a/pandas/tests/arrays/sparse/test_libsparse.py +++ b/pandas/tests/arrays/sparse/test_libsparse.py @@ -452,7 +452,7 @@ def test_check_integrity(self): # 0-length OK # TODO: index variables are not used...is that right? - index = BlockIndex(0, locs, lengths) # noqa + index = BlockIndex(0, locs, lengths) # also OK even though empty index = BlockIndex(1, locs, lengths) # noqa diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py index b78c7775e8a37..2c5846872c341 100644 --- a/pandas/tests/computation/test_eval.py +++ b/pandas/tests/computation/test_eval.py @@ -48,7 +48,7 @@ ) for engine in ENGINES ) -) # noqa +) def engine(request): return request.param @@ -1885,7 +1885,7 @@ def test_global_scope(self, engine, parser): ) def test_no_new_locals(self, engine, parser): - x = 1 # noqa + x = 1 lcls = locals().copy() pd.eval("x + 1", local_dict=lcls, engine=engine, parser=parser) lcls2 = locals().copy() @@ -1995,8 +1995,8 @@ def test_bool_ops_fails_on_scalars(lhs, cmp, rhs, engine, parser): gen = {int: lambda: np.random.randint(10), float: np.random.randn} mid = gen[lhs]() # noqa - lhs = gen[lhs]() # noqa - rhs = gen[rhs]() # noqa + lhs = gen[lhs]() + rhs = gen[rhs]() ex1 = f"lhs {cmp} mid {cmp} rhs" ex2 = f"lhs {cmp} mid and mid {cmp} rhs" diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index e40a12f7bc8d1..c6c54ccb357d5 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1495,7 +1495,7 @@ def test_nan_to_nat_conversions(): @td.skip_if_no_scipy @pytest.mark.filterwarnings("ignore::PendingDeprecationWarning") -def test_is_scipy_sparse(spmatrix): # noqa: F811 +def 
test_is_scipy_sparse(spmatrix): assert is_scipy_sparse(spmatrix([[0, 1]])) assert not is_scipy_sparse(np.array([1])) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index b947be705a329..507d01f5b900c 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1091,7 +1091,7 @@ def test_getitem_setitem_float_labels(self): cp.iloc[1.0:5] = 0 with pytest.raises(TypeError, match=msg): - result = cp.iloc[1.0:5] == 0 # noqa + result = cp.iloc[1.0:5] == 0 assert result.values.all() assert (cp.iloc[0:1] == df.iloc[0:1]).values.all() diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 4a85da72bc8b1..1e404c572dd51 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -78,7 +78,7 @@ def test_consolidate_inplace(self, float_frame): def test_values_consolidate(self, float_frame): float_frame["E"] = 7.0 assert not float_frame._mgr.is_consolidated() - _ = float_frame.values # noqa + _ = float_frame.values assert float_frame._mgr.is_consolidated() def test_modify_values(self, float_frame): diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index 2994482fa5139..024403189409c 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -633,9 +633,7 @@ def test_chained_cmp_and_in(self): res = df.query( "a < b < c and a not in b not in c", engine=engine, parser=parser ) - ind = ( - (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) - ) # noqa + ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b) expec = df[ind] tm.assert_frame_equal(res, expec) diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index a5607224f6448..45089fd876ffc 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ 
b/pandas/tests/indexes/categorical/test_formats.py @@ -18,7 +18,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" assert repr(idx) == expected @@ -49,7 +49,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" assert repr(idx) == expected @@ -84,7 +84,7 @@ def test_string_categorical_index_repr(self): 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" assert repr(idx) == expected diff --git a/pandas/tests/indexes/multi/test_formats.py b/pandas/tests/indexes/multi/test_formats.py index 792dcf4c535e3..c1de7f79c2d2e 100644 --- a/pandas/tests/indexes/multi/test_formats.py +++ b/pandas/tests/indexes/multi/test_formats.py @@ -206,5 +206,5 @@ def test_tuple_width(self, wide_multi_index): ('abc', 10, '2000-01-01 00:33:17', '2000-01-01 00:33:17', ...), ('abc', 10, '2000-01-01 00:33:18', '2000-01-01 00:33:18', ...), ('abc', 10, '2000-01-01 00:33:19', '2000-01-01 00:33:19', ...)], - names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)""" # noqa + names=['a', 'b', 'dti_1', 'dti_2', 'dti_3'], length=2000)""" assert result == expected diff --git 
a/pandas/tests/indexes/multi/test_sorting.py b/pandas/tests/indexes/multi/test_sorting.py index 423bbed831b87..a1e5cc33ef2f6 100644 --- a/pandas/tests/indexes/multi/test_sorting.py +++ b/pandas/tests/indexes/multi/test_sorting.py @@ -119,7 +119,7 @@ def test_unsortedindex(): def test_unsortedindex_doc_examples(): - # https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex # noqa + # https://pandas.pydata.org/pandas-docs/stable/advanced.html#sorting-a-multiindex dfm = DataFrame( {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)} ) diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py index bf51c3e5d1695..b98c9a3df0438 100644 --- a/pandas/tests/indexing/test_callable.py +++ b/pandas/tests/indexing/test_callable.py @@ -17,11 +17,11 @@ def test_frame_loc_callable(self): res = df.loc[lambda x: x.A > 2] tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[lambda x: x.A > 2] # noqa: E231 - tm.assert_frame_equal(res, df.loc[df.A > 2]) # noqa: E231 + res = df.loc[lambda x: x.A > 2] + tm.assert_frame_equal(res, df.loc[df.A > 2]) - res = df.loc[lambda x: x.A > 2] # noqa: E231 - tm.assert_frame_equal(res, df.loc[df.A > 2]) # noqa: E231 + res = df.loc[lambda x: x.A > 2] + tm.assert_frame_equal(res, df.loc[df.A > 2]) res = df.loc[lambda x: x.B == "b", :] tm.assert_frame_equal(res, df.loc[df.B == "b", :]) @@ -90,8 +90,8 @@ def test_frame_loc_callable_labels(self): res = df.loc[lambda x: ["A", "C"]] tm.assert_frame_equal(res, df.loc[["A", "C"]]) - res = df.loc[lambda x: ["A", "C"]] # noqa: E231 - tm.assert_frame_equal(res, df.loc[["A", "C"]]) # noqa: E231 + res = df.loc[lambda x: ["A", "C"]] + tm.assert_frame_equal(res, df.loc[["A", "C"]]) res = df.loc[lambda x: ["A", "C"], :] tm.assert_frame_equal(res, df.loc[["A", "C"], :]) diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py index 8d66a16fc2b7a..476d75f7d239d 100644 --- 
a/pandas/tests/io/formats/test_style.py +++ b/pandas/tests/io/formats/test_style.py @@ -12,7 +12,7 @@ import pandas._testing as tm jinja2 = pytest.importorskip("jinja2") -from pandas.io.formats.style import Styler, _get_level_lengths # noqa # isort:skip +from pandas.io.formats.style import Styler, _get_level_lengths # isort:skip class TestStyler: diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 9278e64cc911f..822342113f62a 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -987,7 +987,7 @@ def test_round_trip_exception_(self): ], ) def test_url(self, field, dtype): - url = "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5" # noqa + url = "https://api.github.com/repos/pandas-dev/pandas/issues?per_page=5" result = read_json(url, convert_dates=True) assert result[field].dtype == dtype diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index 9f425168540ba..4d64f2bf411bd 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -21,7 +21,7 @@ def test_index_col_named(all_parsers, with_header): KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa +KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" header = "ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir\n" if with_header: diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index ccb2efbd2c630..c1938db12a0bc 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -52,8 +52,8 @@ read_hdf, ) -from pandas.io import pytables as pytables # 
noqa: E402 isort:skip -from pandas.io.pytables import TableIterator # noqa: E402 isort:skip +from pandas.io import pytables as pytables # isort:skip +from pandas.io.pytables import TableIterator # isort:skip _default_compressor = "blosc" @@ -512,7 +512,7 @@ def check(mode): # context if mode in ["r", "r+"]: with pytest.raises(IOError): - with HDFStore(path, mode=mode) as store: # noqa + with HDFStore(path, mode=mode) as store: pass else: with HDFStore(path, mode=mode) as store: @@ -2350,7 +2350,7 @@ def test_same_name_scoping(self, setup_path): store.put("df", df, format="table") expected = df[df.index > pd.Timestamp("20130105")] - import datetime # noqa + import datetime result = store.select("df", "index>datetime.datetime(2013,1,5)") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index c1e63f512b53e..cef5d28b8ccf0 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -9,7 +9,7 @@ import pandas as pd import pandas._testing as tm -from pandas.io.feather_format import read_feather, to_feather # noqa: E402 isort:skip +from pandas.io.feather_format import read_feather, to_feather # isort:skip pyarrow = pytest.importorskip("pyarrow") diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 9d179d983ceeb..65e174cd32e22 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -14,7 +14,7 @@ def gcs_buffer(monkeypatch): """Emulate GCS using a binary buffer.""" from fsspec import AbstractFileSystem, registry - registry.target.clear() # noqa # remove state + registry.target.clear() # remove state gcs_buffer = BytesIO() gcs_buffer.close = lambda: True @@ -33,7 +33,7 @@ def open(*args, **kwargs): def test_read_csv_gcs(gcs_buffer): from fsspec import registry - registry.target.clear() # noqa # remove state + registry.target.clear() # remove state df1 = DataFrame( { @@ -55,7 +55,7 @@ def test_read_csv_gcs(gcs_buffer): def 
test_to_csv_gcs(gcs_buffer): from fsspec import registry - registry.target.clear() # noqa # remove state + registry.target.clear() # remove state df1 = DataFrame( { @@ -84,7 +84,7 @@ def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding) """ from fsspec import registry - registry.target.clear() # noqa # remove state + registry.target.clear() # remove state df = tm.makeDataFrame() # reference of compressed and encoded file @@ -120,7 +120,7 @@ def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" from fsspec import AbstractFileSystem, registry - registry.target.clear() # noqa # remove state + registry.target.clear() # remove state df1 = DataFrame( { "int": [1, 3], diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b7c8ca7e0c49f..f7b25f8c0eeac 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -24,14 +24,14 @@ ) try: - import pyarrow # noqa + import pyarrow _HAVE_PYARROW = True except ImportError: _HAVE_PYARROW = False try: - import fastparquet # noqa + import fastparquet _HAVE_FASTPARQUET = True except ImportError: @@ -818,7 +818,7 @@ def test_partition_cols_supported(self, fp, df_full): compression=None, ) assert os.path.exists(path) - import fastparquet # noqa: F811 + import fastparquet actual_partition_cols = fastparquet.ParquetFile(path, False).cats assert len(actual_partition_cols) == 2 @@ -835,7 +835,7 @@ def test_partition_cols_string(self, fp, df_full): compression=None, ) assert os.path.exists(path) - import fastparquet # noqa: F811 + import fastparquet actual_partition_cols = fastparquet.ParquetFile(path, False).cats assert len(actual_partition_cols) == 1 @@ -852,7 +852,7 @@ def test_partition_on_supported(self, fp, df_full): partition_on=partition_cols, ) assert os.path.exists(path) - import fastparquet # noqa: F811 + import fastparquet actual_partition_cols = fastparquet.ParquetFile(path, 
False).cats assert len(actual_partition_cols) == 2 diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index ca4c2bdcc2fe1..bdb86d2dd846f 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -3448,7 +3448,7 @@ def test_xlabel_ylabel_dataframe_subplots( def _generate_4_axes_via_gridspec(): import matplotlib as mpl - import matplotlib.gridspec # noqa + import matplotlib.gridspec import matplotlib.pyplot as plt gs = mpl.gridspec.GridSpec(2, 2) diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py index d1c3ad508d877..583110cc4ba70 100644 --- a/pandas/tests/scalar/timestamp/test_constructors.py +++ b/pandas/tests/scalar/timestamp/test_constructors.py @@ -135,7 +135,7 @@ def test_constructor_with_stringoffset(self): # converted to Chicago tz result = Timestamp("2013-11-01 00:00:00-0500", tz="America/Chicago") assert result.value == Timestamp("2013-11-01 05:00").value - expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" # noqa + expected = "Timestamp('2013-11-01 00:00:00-0500', tz='America/Chicago')" assert repr(result) == expected assert result == eval(repr(result)) diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 3aaecc37df56c..32e1220f83f40 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -486,7 +486,7 @@ def test_categorical_series_repr_timedelta_ordered(self): 3 4 days 4 5 days dtype: category -Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" # noqa +Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" assert repr(s) == exp diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index f7f3f1fa0c13d..17d7527a2b687 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -21,7 +21,7 @@ def test_get_callable_name(): def fn(x): return x - 
lambda_ = lambda x: x # noqa: E731 + lambda_ = lambda x: x part1 = partial(fn) part2 = partial(part1) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index b32c5e91af295..c03e8e26952e5 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -20,7 +20,7 @@ def import_module(name): try: return importlib.import_module(name) - except ModuleNotFoundError: # noqa + except ModuleNotFoundError: pytest.skip(f"skipping as {name} not available") @@ -117,7 +117,7 @@ def test_pandas_gbq(df): @tm.network def test_pandas_datareader(): - pandas_datareader = import_module("pandas_datareader") # noqa + pandas_datareader = import_module("pandas_datareader") pandas_datareader.DataReader("F", "quandl", "2017-01-01", "2017-02-01") @@ -125,7 +125,7 @@ def test_pandas_datareader(): @pytest.mark.filterwarnings("ignore:can't resolve:ImportWarning") def test_geopandas(): - geopandas = import_module("geopandas") # noqa + geopandas = import_module("geopandas") fp = geopandas.datasets.get_path("naturalearth_lowres") assert geopandas.read_file(fp) is not None @@ -135,7 +135,7 @@ def test_geopandas(): @pytest.mark.filterwarnings("ignore:RangeIndex.* is deprecated:DeprecationWarning") def test_pyarrow(df): - pyarrow = import_module("pyarrow") # noqa + pyarrow = import_module("pyarrow") table = pyarrow.Table.from_pandas(df) result = table.to_pandas() tm.assert_frame_equal(result, df) diff --git a/pandas/tests/test_errors.py b/pandas/tests/test_errors.py index 6a1a74c73288f..6207b886b95c7 100644 --- a/pandas/tests/test_errors.py +++ b/pandas/tests/test_errors.py @@ -2,7 +2,7 @@ from pandas.errors import AbstractMethodError -import pandas as pd # noqa +import pandas as pd @pytest.mark.parametrize( diff --git a/scripts/validate_docstrings.py b/scripts/validate_docstrings.py index 7971379ca60c1..8b15358834066 100755 --- a/scripts/validate_docstrings.py +++ b/scripts/validate_docstrings.py @@ -35,19 +35,19 @@ # script. 
Setting here before matplotlib is loaded. # We don't warn for the number of open plots, as none is actually being opened os.environ["MPLBACKEND"] = "Template" -import matplotlib # noqa: E402 isort:skip +import matplotlib # isort:skip matplotlib.rc("figure", max_open_warning=10000) -import numpy # noqa: E402 isort:skip +import numpy # isort:skip BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.join(BASE_PATH)) -import pandas # noqa: E402 isort:skip +import pandas # isort:skip sys.path.insert(1, os.path.join(BASE_PATH, "doc", "sphinxext")) -from numpydoc.validate import validate, Docstring # noqa: E402 isort:skip +from numpydoc.validate import validate, Docstring # isort:skip PRIVATE_CLASSES = ["NDFrame", "IndexOpsMixin"] diff --git a/setup.py b/setup.py index 8e25705c1f4c3..9a9d12ce4d2ba 100755 --- a/setup.py +++ b/setup.py @@ -51,8 +51,8 @@ def is_platform_mac(): # The import of Extension must be after the import of Cython, otherwise # we do not get the appropriately patched class. 
# See https://cython.readthedocs.io/en/latest/src/userguide/source_files_and_compilation.html # noqa -from distutils.extension import Extension # noqa: E402 isort:skip -from distutils.command.build import build # noqa: E402 isort:skip +from distutils.extension import Extension # isort:skip +from distutils.command.build import build # isort:skip if _CYTHON_INSTALLED: from Cython.Distutils.old_build_ext import old_build_ext as _build_ext From a2189298c4aafca1d376569231f1f10e8f37515c Mon Sep 17 00:00:00 2001 From: hardikpnsp Date: Sat, 3 Oct 2020 04:38:39 +0530 Subject: [PATCH 0994/1025] ASV: used integer ndarray as indexer for Series.take indexing benchmark (#36656) --- asv_bench/benchmarks/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 836d3ca8602ec..74e0a3a434cde 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -191,7 +191,7 @@ def setup(self, index): } index = indexes[index] self.s = Series(np.random.rand(N), index=index) - self.indexer = [True, False, True, True, False] * 20000 + self.indexer = np.random.randint(0, N, size=N) def time_take(self, index): self.s.take(self.indexer) From f6d53ad27f0f1103f8d12bcee9d5b7fdc43d7193 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 16:14:03 -0700 Subject: [PATCH 0995/1025] EA: tighten TimedeltaArray._from_sequence signature (#36731) --- pandas/core/arrays/timedeltas.py | 13 +++++++++++++ pandas/core/indexes/timedeltas.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 6ca57e7872910..c97c7da375fd4 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -214,6 +214,19 @@ def _simple_new( @classmethod def _from_sequence( + cls, data, dtype=TD64NS_DTYPE, copy: bool = False + ) -> "TimedeltaArray": + if dtype: + _validate_td64_dtype(dtype) + + 
data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=None) + freq, _ = dtl.validate_inferred_freq(None, inferred_freq, False) + + result = cls._simple_new(data, freq=freq) + return result + + @classmethod + def _from_sequence_not_strict( cls, data, dtype=TD64NS_DTYPE, diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 854c4e33eca01..858387f2e1600 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -153,7 +153,7 @@ def __new__( # - Cases checked above all return/raise before reaching here - # - tdarr = TimedeltaArray._from_sequence( + tdarr = TimedeltaArray._from_sequence_not_strict( data, freq=freq, unit=unit, dtype=dtype, copy=copy ) return cls._simple_new(tdarr, name=name) From 0617797b38e5e4bd514df68439f258644b7d05ac Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 16:14:19 -0700 Subject: [PATCH 0996/1025] EA: Tighten signature on DatetimeArray._from_sequence (#36718) --- pandas/core/arrays/datetimes.py | 6 ++++- pandas/core/indexes/datetimes.py | 2 +- pandas/core/nanops.py | 4 ++- pandas/tests/arrays/test_array.py | 4 ++- pandas/tests/arrays/test_datetimes.py | 27 +++++++++++++------ pandas/tests/extension/test_datetime.py | 4 ++- .../indexes/datetimes/test_constructors.py | 4 ++- pandas/tests/scalar/test_nat.py | 5 +++- 8 files changed, 41 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index cd5449058fb33..db73c84b39cf9 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -299,7 +299,11 @@ def _simple_new( return result @classmethod - def _from_sequence( + def _from_sequence(cls, scalars, dtype=None, copy: bool = False): + return cls._from_sequence_not_strict(scalars, dtype=dtype, copy=copy) + + @classmethod + def _from_sequence_not_strict( cls, data, dtype=None, diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 
da78f8ff5d603..06405995f7685 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -295,7 +295,7 @@ def __new__( name = maybe_extract_name(name, data, cls) - dtarr = DatetimeArray._from_sequence( + dtarr = DatetimeArray._from_sequence_not_strict( data, dtype=dtype, copy=copy, diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 64470da2fb910..f2354f649b1e3 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1616,7 +1616,9 @@ def na_accum_func(values: ArrayLike, accum_func, skipna: bool) -> ArrayLike: result = result.view(orig_dtype) else: # DatetimeArray - result = type(values)._from_sequence(result, dtype=orig_dtype) + result = type(values)._simple_new( # type: ignore[attr-defined] + result, dtype=orig_dtype + ) elif skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)): vals = values.copy() diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index ff2573a51c3e7..72deada4eaf43 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -210,7 +210,9 @@ def test_array_copy(): datetime.datetime(2000, 1, 1, tzinfo=cet), datetime.datetime(2001, 1, 1, tzinfo=cet), ], - DatetimeArray._from_sequence(["2000", "2001"], tz=cet), + DatetimeArray._from_sequence( + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet) + ), ), # timedelta ( diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 53f26de09f94e..e7605125e7420 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -71,7 +71,7 @@ def test_mixing_naive_tzaware_raises(self, meth): def test_from_pandas_array(self): arr = pd.array(np.arange(5, dtype=np.int64)) * 3600 * 10 ** 9 - result = DatetimeArray._from_sequence(arr, freq="infer") + result = DatetimeArray._from_sequence(arr)._with_freq("infer") expected = pd.date_range("1970-01-01", periods=5, freq="H")._data tm.assert_datetime_array_equal(result, 
expected) @@ -162,7 +162,9 @@ def test_cmp_dt64_arraylike_tznaive(self, all_compare_operators): class TestDatetimeArray: def test_astype_to_same(self): - arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + arr = DatetimeArray._from_sequence( + ["2000"], dtype=DatetimeTZDtype(tz="US/Central") + ) result = arr.astype(DatetimeTZDtype(tz="US/Central"), copy=False) assert result is arr @@ -193,7 +195,9 @@ def test_astype_int(self, dtype): tm.assert_numpy_array_equal(result, expected) def test_tz_setter_raises(self): - arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + arr = DatetimeArray._from_sequence( + ["2000"], dtype=DatetimeTZDtype(tz="US/Central") + ) with pytest.raises(AttributeError, match="tz_localize"): arr.tz = "UTC" @@ -282,7 +286,8 @@ def test_fillna_preserves_tz(self, method): fill_val = dti[1] if method == "pad" else dti[3] expected = DatetimeArray._from_sequence( - [dti[0], dti[1], fill_val, dti[3], dti[4]], freq=None, tz="US/Central" + [dti[0], dti[1], fill_val, dti[3], dti[4]], + dtype=DatetimeTZDtype(tz="US/Central"), ) result = arr.fillna(method=method) @@ -434,12 +439,16 @@ def test_shift_value_tzawareness_mismatch(self): class TestSequenceToDT64NS: def test_tz_dtype_mismatch_raises(self): - arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + arr = DatetimeArray._from_sequence( + ["2000"], dtype=DatetimeTZDtype(tz="US/Central") + ) with pytest.raises(TypeError, match="data is already tz-aware"): sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="UTC")) def test_tz_dtype_matches(self): - arr = DatetimeArray._from_sequence(["2000"], tz="US/Central") + arr = DatetimeArray._from_sequence( + ["2000"], dtype=DatetimeTZDtype(tz="US/Central") + ) result, _, _ = sequence_to_dt64ns(arr, dtype=DatetimeTZDtype(tz="US/Central")) tm.assert_numpy_array_equal(arr._data, result) @@ -447,6 +456,7 @@ def test_tz_dtype_matches(self): class TestReductions: @pytest.mark.parametrize("tz", [None, "US/Central"]) def test_min_max(self, 
tz): + dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]") arr = DatetimeArray._from_sequence( [ "2000-01-03", @@ -456,7 +466,7 @@ def test_min_max(self, tz): "2000-01-05", "2000-01-04", ], - tz=tz, + dtype=dtype, ) result = arr.min() @@ -476,7 +486,8 @@ def test_min_max(self, tz): @pytest.mark.parametrize("tz", [None, "US/Central"]) @pytest.mark.parametrize("skipna", [True, False]) def test_min_max_empty(self, skipna, tz): - arr = DatetimeArray._from_sequence([], tz=tz) + dtype = DatetimeTZDtype(tz=tz) if tz is not None else np.dtype("M8[ns]") + arr = DatetimeArray._from_sequence([], dtype=dtype) result = arr.min(skipna=skipna) assert result is pd.NaT diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index e026809f7e611..0fde1e8a2fdb8 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -181,8 +181,10 @@ def test_concat_mixed_dtypes(self, data): @pytest.mark.parametrize("obj", ["series", "frame"]) def test_unstack(self, obj): # GH-13287: can't use base test, since building the expected fails. 
+ dtype = DatetimeTZDtype(tz="US/Central") data = DatetimeArray._from_sequence( - ["2000", "2001", "2002", "2003"], tz="US/Central" + ["2000", "2001", "2002", "2003"], + dtype=dtype, ) index = pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]) diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py index 9a855a1624520..d3c79f231449a 100644 --- a/pandas/tests/indexes/datetimes/test_constructors.py +++ b/pandas/tests/indexes/datetimes/test_constructors.py @@ -16,7 +16,9 @@ class TestDatetimeIndex: - @pytest.mark.parametrize("dt_cls", [DatetimeIndex, DatetimeArray._from_sequence]) + @pytest.mark.parametrize( + "dt_cls", [DatetimeIndex, DatetimeArray._from_sequence_not_strict] + ) def test_freq_validation_with_nat(self, dt_cls): # GH#11587 make sure we get a useful error message when generate_range # raises diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index 09d5d9c1677d0..2ea7602b00206 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -12,6 +12,7 @@ from pandas import ( DatetimeIndex, + DatetimeTZDtype, Index, NaT, Period, @@ -440,7 +441,9 @@ def test_nat_rfloordiv_timedelta(val, expected): DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), DatetimeIndex(["2011-01-01", "2011-01-02"], tz="US/Eastern", name="x"), DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"]), - DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"], tz="US/Pacific"), + DatetimeArray._from_sequence( + ["2011-01-01", "2011-01-02"], dtype=DatetimeTZDtype(tz="US/Pacific") + ), TimedeltaIndex(["1 day", "2 day"], name="x"), ], ) From 6b02357bd54a78a6845da092dd6666cdcb4476dd Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 16:34:10 -0700 Subject: [PATCH 0997/1025] REF: Back IntervalArray by array instead of Index (#36310) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_testing.py | 20 ++- pandas/core/arrays/interval.py | 
126 ++++++++++-------- pandas/core/indexes/interval.py | 30 +++-- pandas/tests/extension/test_interval.py | 4 +- .../indexes/interval/test_constructors.py | 6 + pandas/tests/series/indexing/test_getitem.py | 2 +- .../util/test_assert_interval_array_equal.py | 14 +- 8 files changed, 123 insertions(+), 80 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6d1196b783f74..1236c672a1fa1 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -282,6 +282,7 @@ Performance improvements - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) - Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) +- Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/_testing.py b/pandas/_testing.py index 78b6b3c4f9072..cf6272edc4c05 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -977,8 +977,14 @@ def assert_interval_array_equal(left, right, exact="equiv", obj="IntervalArray") """ _check_isinstance(left, right, IntervalArray) - assert_index_equal(left.left, right.left, exact=exact, obj=f"{obj}.left") - assert_index_equal(left.right, right.right, exact=exact, obj=f"{obj}.left") + kwargs = {} + if left._left.dtype.kind in ["m", "M"]: + # We have a DatetimeArray or TimedeltaArray + kwargs["check_freq"] = False + + assert_equal(left._left, right._left, obj=f"{obj}.left", **kwargs) + assert_equal(left._right, right._right, obj=f"{obj}.left", **kwargs) + assert_attr_equal("closed", left, right, obj=obj) @@ -989,20 +995,22 @@ def assert_period_array_equal(left, right, obj="PeriodArray"): assert_attr_equal("freq", left, right, obj=obj) -def assert_datetime_array_equal(left, right, obj="DatetimeArray"): +def assert_datetime_array_equal(left, right, obj="DatetimeArray", check_freq=True): __tracebackhide__ = True _check_isinstance(left, right, DatetimeArray) assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - assert_attr_equal("freq", left, right, obj=obj) + if check_freq: + assert_attr_equal("freq", left, right, obj=obj) assert_attr_equal("tz", left, right, obj=obj) -def assert_timedelta_array_equal(left, right, obj="TimedeltaArray"): +def assert_timedelta_array_equal(left, right, obj="TimedeltaArray", check_freq=True): __tracebackhide__ = True _check_isinstance(left, right, TimedeltaArray) assert_numpy_array_equal(left._data, right._data, obj=f"{obj}._data") - assert_attr_equal("freq", left, right, obj=obj) + if check_freq: + assert_attr_equal("freq", left, right, obj=obj) def raise_assert_detail(obj, message, left, right, diff=None, index_values=None): diff --git a/pandas/core/arrays/interval.py 
b/pandas/core/arrays/interval.py index 5105b5b9cc57b..413430942575d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -31,7 +31,6 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.generic import ( ABCDatetimeIndex, - ABCIndexClass, ABCIntervalIndex, ABCPeriodIndex, ABCSeries, @@ -42,7 +41,7 @@ from pandas.core.arrays.base import ExtensionArray, _extension_array_shared_docs from pandas.core.arrays.categorical import Categorical import pandas.core.common as com -from pandas.core.construction import array +from pandas.core.construction import array, extract_array from pandas.core.indexers import check_array_indexer from pandas.core.indexes.base import ensure_index @@ -161,12 +160,14 @@ def __new__( verify_integrity: bool = True, ): - if isinstance(data, ABCSeries) and is_interval_dtype(data.dtype): - data = data._values + if isinstance(data, (ABCSeries, ABCIntervalIndex)) and is_interval_dtype( + data.dtype + ): + data = data._values # TODO: extract_array? 
- if isinstance(data, (cls, ABCIntervalIndex)): - left = data.left - right = data.right + if isinstance(data, cls): + left = data._left + right = data._right closed = closed or data.closed else: @@ -243,6 +244,20 @@ def _simple_new( ) raise ValueError(msg) + # For dt64/td64 we want DatetimeArray/TimedeltaArray instead of ndarray + from pandas.core.ops.array_ops import maybe_upcast_datetimelike_array + + left = maybe_upcast_datetimelike_array(left) + left = extract_array(left, extract_numpy=True) + right = maybe_upcast_datetimelike_array(right) + right = extract_array(right, extract_numpy=True) + + lbase = getattr(left, "_ndarray", left).base + rbase = getattr(right, "_ndarray", right).base + if lbase is not None and lbase is rbase: + # If these share data, then setitem could corrupt our IA + right = right.copy() + result._left = left result._right = right result._closed = closed @@ -476,18 +491,18 @@ def _validate(self): if self.closed not in VALID_CLOSED: msg = f"invalid option for 'closed': {self.closed}" raise ValueError(msg) - if len(self.left) != len(self.right): + if len(self._left) != len(self._right): msg = "left and right must have the same length" raise ValueError(msg) - left_mask = notna(self.left) - right_mask = notna(self.right) + left_mask = notna(self._left) + right_mask = notna(self._right) if not (left_mask == right_mask).all(): msg = ( "missing values must be missing in the same " "location both left and right sides" ) raise ValueError(msg) - if not (self.left[left_mask] <= self.right[left_mask]).all(): + if not (self._left[left_mask] <= self._right[left_mask]).all(): msg = "left side of interval must be <= right side" raise ValueError(msg) @@ -527,37 +542,29 @@ def __iter__(self): return iter(np.asarray(self)) def __len__(self) -> int: - return len(self.left) + return len(self._left) def __getitem__(self, value): value = check_array_indexer(self, value) - left = self.left[value] - right = self.right[value] + left = self._left[value] + right = 
self._right[value] - # scalar - if not isinstance(left, ABCIndexClass): + if not isinstance(left, (np.ndarray, ExtensionArray)): + # scalar if is_scalar(left) and isna(left): return self._fill_value - if np.ndim(left) > 1: - # GH#30588 multi-dimensional indexer disallowed - raise ValueError("multi-dimensional indexing not allowed") return Interval(left, right, self.closed) - + if np.ndim(left) > 1: + # GH#30588 multi-dimensional indexer disallowed + raise ValueError("multi-dimensional indexing not allowed") return self._shallow_copy(left, right) def __setitem__(self, key, value): value_left, value_right = self._validate_setitem_value(value) key = check_array_indexer(self, key) - # Need to ensure that left and right are updated atomically, so we're - # forced to copy, update the copy, and swap in the new values. - left = self.left.copy(deep=True) - left._values[key] = value_left - self._left = left - - right = self.right.copy(deep=True) - right._values[key] = value_right - self._right = right + self._left[key] = value_left + self._right[key] = value_right def __eq__(self, other): # ensure pandas array for list-like and eliminate non-interval scalars @@ -588,7 +595,7 @@ def __eq__(self, other): if is_interval_dtype(other_dtype): if self.closed != other.closed: return np.zeros(len(self), dtype=bool) - return (self.left == other.left) & (self.right == other.right) + return (self._left == other.left) & (self._right == other.right) # non-interval/non-object dtype -> no matches if not is_object_dtype(other_dtype): @@ -601,8 +608,8 @@ def __eq__(self, other): if ( isinstance(obj, Interval) and self.closed == obj.closed - and self.left[i] == obj.left - and self.right[i] == obj.right + and self._left[i] == obj.left + and self._right[i] == obj.right ): result[i] = True @@ -665,6 +672,7 @@ def astype(self, dtype, copy=True): array : ExtensionArray or ndarray ExtensionArray or NumPy ndarray with 'dtype' for its dtype. 
""" + from pandas import Index from pandas.core.arrays.string_ import StringDtype if dtype is not None: @@ -676,8 +684,10 @@ def astype(self, dtype, copy=True): # need to cast to different subtype try: - new_left = self.left.astype(dtype.subtype) - new_right = self.right.astype(dtype.subtype) + # We need to use Index rules for astype to prevent casting + # np.nan entries to int subtypes + new_left = Index(self._left, copy=False).astype(dtype.subtype) + new_right = Index(self._right, copy=False).astype(dtype.subtype) except TypeError as err: msg = ( f"Cannot convert {self.dtype} to {dtype}; subtypes are incompatible" @@ -726,14 +736,14 @@ def copy(self): ------- IntervalArray """ - left = self.left.copy(deep=True) - right = self.right.copy(deep=True) + left = self._left.copy() + right = self._right.copy() closed = self.closed # TODO: Could skip verify_integrity here. return type(self).from_arrays(left, right, closed=closed) - def isna(self): - return isna(self.left) + def isna(self) -> np.ndarray: + return isna(self._left) def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": if not len(self) or periods == 0: @@ -749,7 +759,9 @@ def shift(self, periods: int = 1, fill_value: object = None) -> "IntervalArray": empty_len = min(abs(periods), len(self)) if isna(fill_value): - fill_value = self.left._na_value + from pandas import Index + + fill_value = Index(self._left, copy=False)._na_value empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) else: empty = self._from_sequence([fill_value] * empty_len) @@ -815,10 +827,10 @@ def take(self, indices, allow_fill=False, fill_value=None, axis=None, **kwargs): fill_left, fill_right = self._validate_fill_value(fill_value) left_take = take( - self.left, indices, allow_fill=allow_fill, fill_value=fill_left + self._left, indices, allow_fill=allow_fill, fill_value=fill_left ) right_take = take( - self.right, indices, allow_fill=allow_fill, fill_value=fill_right + self._right, indices, 
allow_fill=allow_fill, fill_value=fill_right ) return self._shallow_copy(left_take, right_take) @@ -977,7 +989,9 @@ def left(self): Return the left endpoints of each Interval in the IntervalArray as an Index. """ - return self._left + from pandas import Index + + return Index(self._left, copy=False) @property def right(self): @@ -985,7 +999,9 @@ def right(self): Return the right endpoints of each Interval in the IntervalArray as an Index. """ - return self._right + from pandas import Index + + return Index(self._right, copy=False) @property def length(self): @@ -1146,7 +1162,7 @@ def set_closed(self, closed): raise ValueError(msg) return type(self)._simple_new( - left=self.left, right=self.right, closed=closed, verify_integrity=False + left=self._left, right=self._right, closed=closed, verify_integrity=False ) _interval_shared_docs[ @@ -1172,15 +1188,15 @@ def is_non_overlapping_monotonic(self): # at a point when both sides of intervals are included if self.closed == "both": return bool( - (self.right[:-1] < self.left[1:]).all() - or (self.left[:-1] > self.right[1:]).all() + (self._right[:-1] < self._left[1:]).all() + or (self._left[:-1] > self._right[1:]).all() ) # non-strict inequality when closed != 'both'; at least one side is # not included in the intervals, so equality does not imply overlapping return bool( - (self.right[:-1] <= self.left[1:]).all() - or (self.left[:-1] >= self.right[1:]).all() + (self._right[:-1] <= self._left[1:]).all() + or (self._left[:-1] >= self._right[1:]).all() ) # --------------------------------------------------------------------- @@ -1191,8 +1207,8 @@ def __array__(self, dtype=None) -> np.ndarray: Return the IntervalArray's data as a numpy array of Interval objects (with dtype='object') """ - left = self.left - right = self.right + left = self._left + right = self._right mask = self.isna() closed = self._closed @@ -1222,8 +1238,8 @@ def __arrow_array__(self, type=None): interval_type = ArrowIntervalType(subtype, self.closed) 
storage_array = pyarrow.StructArray.from_arrays( [ - pyarrow.array(self.left, type=subtype, from_pandas=True), - pyarrow.array(self.right, type=subtype, from_pandas=True), + pyarrow.array(self._left, type=subtype, from_pandas=True), + pyarrow.array(self._right, type=subtype, from_pandas=True), ], names=["left", "right"], ) @@ -1277,7 +1293,7 @@ def __arrow_array__(self, type=None): _interval_shared_docs["to_tuples"] % dict(return_type="ndarray", examples="") ) def to_tuples(self, na_tuple=True): - tuples = com.asarray_tuplesafe(zip(self.left, self.right)) + tuples = com.asarray_tuplesafe(zip(self._left, self._right)) if not na_tuple: # GH 18756 tuples = np.where(~self.isna(), tuples, np.nan) @@ -1343,8 +1359,8 @@ def contains(self, other): if isinstance(other, Interval): raise NotImplementedError("contains not implemented for two intervals") - return (self.left < other if self.open_left else self.left <= other) & ( - other < self.right if self.open_right else other <= self.right + return (self._left < other if self.open_left else self._left <= other) & ( + other < self._right if self.open_right else other <= self._right ) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 8855d987af745..a56f6a5bb0340 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -183,12 +183,8 @@ def func(intvidx_self, other, sort=False): ) ) @inherit_names(["set_closed", "to_tuples"], IntervalArray, wrap=True) -@inherit_names( - ["__array__", "overlaps", "contains", "left", "right", "length"], IntervalArray -) -@inherit_names( - ["is_non_overlapping_monotonic", "mid", "closed"], IntervalArray, cache=True -) +@inherit_names(["__array__", "overlaps", "contains"], IntervalArray) +@inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True) class IntervalIndex(IntervalMixin, ExtensionIndex): _typ = "intervalindex" _comparables = ["name"] @@ -201,6 +197,8 @@ class IntervalIndex(IntervalMixin, 
ExtensionIndex): _mask = None _data: IntervalArray + _values: IntervalArray + # -------------------------------------------------------------------- # Constructors @@ -409,7 +407,7 @@ def __reduce__(self): return _new_IntervalIndex, (type(self), d), None @Appender(Index.astype.__doc__) - def astype(self, dtype, copy=True): + def astype(self, dtype, copy: bool = True): with rewrite_exception("IntervalArray", type(self).__name__): new_values = self._values.astype(dtype, copy=copy) if is_interval_dtype(new_values.dtype): @@ -438,7 +436,7 @@ def is_monotonic_decreasing(self) -> bool: return self[::-1].is_monotonic_increasing @cache_readonly - def is_unique(self): + def is_unique(self) -> bool: """ Return True if the IntervalIndex contains unique elements, else False. """ @@ -865,6 +863,22 @@ def _convert_list_indexer(self, keyarr): # -------------------------------------------------------------------- + @cache_readonly + def left(self) -> Index: + return Index(self._data.left, copy=False) + + @cache_readonly + def right(self) -> Index: + return Index(self._data.right, copy=False) + + @cache_readonly + def mid(self): + return Index(self._data.mid, copy=False) + + @property + def length(self): + return Index(self._data.length, copy=False) + @Appender(Index.where.__doc__) def where(self, cond, other=None): if other is None: diff --git a/pandas/tests/extension/test_interval.py b/pandas/tests/extension/test_interval.py index 2411f6cfbd936..4fdcf930d224f 100644 --- a/pandas/tests/extension/test_interval.py +++ b/pandas/tests/extension/test_interval.py @@ -147,9 +147,7 @@ class TestReshaping(BaseInterval, base.BaseReshapingTests): class TestSetitem(BaseInterval, base.BaseSetitemTests): - @pytest.mark.xfail(reason="GH#27147 setitem changes underlying index") - def test_setitem_preserves_views(self, data): - super().test_setitem_preserves_views(data) + pass class TestPrinting(BaseInterval, base.BasePrintingTests): diff --git a/pandas/tests/indexes/interval/test_constructors.py 
b/pandas/tests/indexes/interval/test_constructors.py index fa881df8139c6..aec7de549744f 100644 --- a/pandas/tests/indexes/interval/test_constructors.py +++ b/pandas/tests/indexes/interval/test_constructors.py @@ -262,6 +262,12 @@ def test_length_one(self): expected = IntervalIndex.from_breaks([]) tm.assert_index_equal(result, expected) + def test_left_right_dont_share_data(self): + # GH#36310 + breaks = np.arange(5) + result = IntervalIndex.from_breaks(breaks)._data + assert result._left.base is None or result._left.base is not result._right.base + class TestFromTuples(Base): """Tests specific to IntervalIndex.from_tuples""" diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py index 6b7cda89a4714..5b585e8802752 100644 --- a/pandas/tests/series/indexing/test_getitem.py +++ b/pandas/tests/series/indexing/test_getitem.py @@ -101,7 +101,7 @@ def test_getitem_intlist_intindex_periodvalues(self): @pytest.mark.parametrize("box", [list, np.array, pd.Index]) def test_getitem_intlist_intervalindex_non_int(self, box): # GH#33404 fall back to positional since ints are unambiguous - dti = date_range("2000-01-03", periods=3) + dti = date_range("2000-01-03", periods=3)._with_freq(None) ii = pd.IntervalIndex.from_breaks(dti) ser = Series(range(len(ii)), index=ii) diff --git a/pandas/tests/util/test_assert_interval_array_equal.py b/pandas/tests/util/test_assert_interval_array_equal.py index 96f2973a1528c..2e8699536c72a 100644 --- a/pandas/tests/util/test_assert_interval_array_equal.py +++ b/pandas/tests/util/test_assert_interval_array_equal.py @@ -41,9 +41,9 @@ def test_interval_array_equal_periods_mismatch(): msg = """\ IntervalArray.left are different -IntervalArray.left length are different -\\[left\\]: 5, Int64Index\\(\\[0, 1, 2, 3, 4\\], dtype='int64'\\) -\\[right\\]: 6, Int64Index\\(\\[0, 1, 2, 3, 4, 5\\], dtype='int64'\\)""" +IntervalArray.left shapes are different +\\[left\\]: \\(5,\\) +\\[right\\]: \\(6,\\)""" with 
pytest.raises(AssertionError, match=msg): tm.assert_interval_array_equal(arr1, arr2) @@ -58,8 +58,8 @@ def test_interval_array_equal_end_mismatch(): IntervalArray.left are different IntervalArray.left values are different \\(80.0 %\\) -\\[left\\]: Int64Index\\(\\[0, 2, 4, 6, 8\\], dtype='int64'\\) -\\[right\\]: Int64Index\\(\\[0, 4, 8, 12, 16\\], dtype='int64'\\)""" +\\[left\\]: \\[0, 2, 4, 6, 8\\] +\\[right\\]: \\[0, 4, 8, 12, 16\\]""" with pytest.raises(AssertionError, match=msg): tm.assert_interval_array_equal(arr1, arr2) @@ -74,8 +74,8 @@ def test_interval_array_equal_start_mismatch(): IntervalArray.left are different IntervalArray.left values are different \\(100.0 %\\) -\\[left\\]: Int64Index\\(\\[0, 1, 2, 3\\], dtype='int64'\\) -\\[right\\]: Int64Index\\(\\[1, 2, 3, 4\\], dtype='int64'\\)""" +\\[left\\]: \\[0, 1, 2, 3\\] +\\[right\\]: \\[1, 2, 3, 4\\]""" with pytest.raises(AssertionError, match=msg): tm.assert_interval_array_equal(arr1, arr2) From ec5fe47bac9047ba5665eb64d16afa347dcafb16 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 2 Oct 2020 17:55:36 -0700 Subject: [PATCH 0998/1025] DEPR: automatic alignment on frame.__cmp__(series) (#36795) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/ops/__init__.py | 13 +++++++++++++ pandas/tests/arithmetic/test_datetime64.py | 17 +++++++++++++---- pandas/tests/frame/test_arithmetic.py | 8 ++++++-- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 1236c672a1fa1..c854f995bd2ea 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -266,6 +266,7 @@ Deprecations - Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`) - Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`) - Deprecated :meth:`Index.is_all_dates` (:issue:`27744`) 
+- Deprecated automatic alignment on comparison operations between :class:`DataFrame` and :class:`Series`, do ``frame, ser = frame.align(ser, axis=1, copy=False)`` before e.g. ``frame == ser`` (:issue:`28759`) - :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index f92f67e1d03d7..36e3a0e37c1ae 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -5,6 +5,7 @@ """ import operator from typing import TYPE_CHECKING, Optional, Set, Type +import warnings import numpy as np @@ -488,6 +489,18 @@ def to_series(right): elif isinstance(right, ABCSeries): # axis=1 is default for DataFrame-with-Series op axis = left._get_axis_number(axis) if axis is not None else 1 + + if not flex: + if not left.axes[axis].equals(right.index): + warnings.warn( + "Automatic reindexing on DataFrame vs Series comparisons " + "is deprecated and will raise ValueError in a future version. " + "Do `left, right = left.align(right, axis=1, copy=False)` " + "before e.g. 
`left == right`", + FutureWarning, + stacklevel=3, + ) + left, right = left.align( right, join="outer", axis=axis, level=level, copy=False ) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 626dd4f748e0b..46be296759088 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -318,11 +318,16 @@ def test_dt64arr_timestamp_equality(self, box_with_array): expected = tm.box_expected([False, True], xbox) tm.assert_equal(result, expected) - result = ser != ser[0] + warn = FutureWarning if box_with_array is pd.DataFrame else None + with tm.assert_produces_warning(warn): + # alignment for frame vs series comparisons deprecated + result = ser != ser[0] expected = tm.box_expected([False, True], xbox) tm.assert_equal(result, expected) - result = ser != ser[1] + with tm.assert_produces_warning(warn): + # alignment for frame vs series comparisons deprecated + result = ser != ser[1] expected = tm.box_expected([True, True], xbox) tm.assert_equal(result, expected) @@ -330,11 +335,15 @@ def test_dt64arr_timestamp_equality(self, box_with_array): expected = tm.box_expected([True, False], xbox) tm.assert_equal(result, expected) - result = ser == ser[0] + with tm.assert_produces_warning(warn): + # alignment for frame vs series comparisons deprecated + result = ser == ser[0] expected = tm.box_expected([True, False], xbox) tm.assert_equal(result, expected) - result = ser == ser[1] + with tm.assert_produces_warning(warn): + # alignment for frame vs series comparisons deprecated + result = ser == ser[1] expected = tm.box_expected([False, False], xbox) tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index b3aa5e403e795..d9ef19e174700 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -795,13 +795,17 @@ def test_frame_with_zero_len_series_corner_cases(): expected 
= pd.DataFrame(df.values * np.nan, columns=df.columns) tm.assert_frame_equal(result, expected) - result = df == ser + with tm.assert_produces_warning(FutureWarning): + # Automatic alignment for comparisons deprecated + result = df == ser expected = pd.DataFrame(False, index=df.index, columns=df.columns) tm.assert_frame_equal(result, expected) # non-float case should not raise on comparison df2 = pd.DataFrame(df.values.view("M8[ns]"), columns=df.columns) - result = df2 == ser + with tm.assert_produces_warning(FutureWarning): + # Automatic alignment for comparisons deprecated + result = df2 == ser expected = pd.DataFrame(False, index=df.index, columns=df.columns) tm.assert_frame_equal(result, expected) From a3f3f9fb8c2dd0d57b8bb40882689449022a2a33 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Fri, 2 Oct 2020 19:58:28 -0500 Subject: [PATCH 0999/1025] ENH: Implement FloatingArray reductions (#36778) --- pandas/core/arrays/floating.py | 39 +++++++------------ pandas/tests/arrays/floating/test_function.py | 27 ++++++++++++- 2 files changed, 38 insertions(+), 28 deletions(-) diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 33659fe2f397d..bbb5467d42d53 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -25,8 +25,7 @@ from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna -from pandas.core import nanops, ops -from pandas.core.array_algos import masked_reductions +from pandas.core import ops from pandas.core.ops import invalid_comparison from pandas.core.ops.common import unpack_zerodim_and_defer from pandas.core.tools.numeric import to_numeric @@ -452,33 +451,21 @@ def cmp_method(self, other): name = f"__{op.__name__}__" return set_function_name(cmp_method, name, cls) - def _reduce(self, name: str, skipna: bool = True, **kwargs): - data = self._data - mask = self._mask - - if name in {"sum", "prod", "min", 
"max"}: - op = getattr(masked_reductions, name) - return op(data, mask, skipna=skipna, **kwargs) - - # coerce to a nan-aware float if needed - # (we explicitly use NaN within reductions) - if self._hasna: - data = self.to_numpy("float64", na_value=np.nan) - - op = getattr(nanops, "nan" + name) - result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs) + def sum(self, skipna=True, min_count=0, **kwargs): + nv.validate_sum((), kwargs) + return super()._reduce("sum", skipna=skipna, min_count=min_count) - if np.isnan(result): - return libmissing.NA + def prod(self, skipna=True, min_count=0, **kwargs): + nv.validate_prod((), kwargs) + return super()._reduce("prod", skipna=skipna, min_count=min_count) - return result + def min(self, skipna=True, **kwargs): + nv.validate_min((), kwargs) + return super()._reduce("min", skipna=skipna) - def sum(self, skipna=True, min_count=0, **kwargs): - nv.validate_sum((), kwargs) - result = masked_reductions.sum( - values=self._data, mask=self._mask, skipna=skipna, min_count=min_count - ) - return result + def max(self, skipna=True, **kwargs): + nv.validate_max((), kwargs) + return super()._reduce("max", skipna=skipna) def _maybe_mask_result(self, result, mask, other, op_name: str): """ diff --git a/pandas/tests/arrays/floating/test_function.py b/pandas/tests/arrays/floating/test_function.py index 84c650f880541..2767d93741d4c 100644 --- a/pandas/tests/arrays/floating/test_function.py +++ b/pandas/tests/arrays/floating/test_function.py @@ -112,8 +112,8 @@ def test_value_counts_empty(): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("min_count", [0, 4]) -def test_floating_array_sum(skipna, min_count): - arr = pd.array([1, 2, 3, None], dtype="Float64") +def test_floating_array_sum(skipna, min_count, dtype): + arr = pd.array([1, 2, 3, None], dtype=dtype) result = arr.sum(skipna=skipna, min_count=min_count) if skipna and min_count == 0: assert result == 6.0 @@ -152,3 +152,26 @@ def test_preserve_dtypes(op): 
index=pd.Index(["a", "b"], name="A"), ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("method", ["min", "max"]) +def test_floating_array_min_max(skipna, method, dtype): + arr = pd.array([0.0, 1.0, None], dtype=dtype) + func = getattr(arr, method) + result = func(skipna=skipna) + if skipna: + assert result == (0 if method == "min" else 1) + else: + assert result is pd.NA + + +@pytest.mark.parametrize("skipna", [True, False]) +@pytest.mark.parametrize("min_count", [0, 9]) +def test_floating_array_prod(skipna, min_count, dtype): + arr = pd.array([1.0, 2.0, None], dtype=dtype) + result = arr.prod(skipna=skipna, min_count=min_count) + if skipna and min_count == 0: + assert result == 2 + else: + assert result is pd.NA From ed477e665697070743e492252aaec738d6d3830b Mon Sep 17 00:00:00 2001 From: patrick <61934744+phofl@users.noreply.github.com> Date: Sat, 3 Oct 2020 02:59:33 +0200 Subject: [PATCH 1000/1025] [BUG]: Rolling selected too large windows with PeriodIndex (#36730) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/window/rolling.py | 13 ++++++++++-- pandas/tests/window/test_rolling.py | 31 +++++++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index c854f995bd2ea..cb0858fd678f8 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -409,6 +409,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) - Bug in :meth:`Rolling.sum()` returned wrong values when dtypes where mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`) - Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in window (:issue:`35579`) +- Bug where 
:class:`pandas.core.window.Rolling` produces incorrect window sizes when using a ``PeriodIndex`` (:issue:`34225`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index f207ea4cd67d4..39f1839ba559d 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -1932,7 +1932,6 @@ def validate(self): ): self._validate_monotonic() - freq = self._validate_freq() # we don't allow center if self.center: @@ -1943,7 +1942,7 @@ def validate(self): # this will raise ValueError on non-fixed freqs self.win_freq = self.window - self.window = freq.nanos + self.window = self._determine_window_length() self.win_type = "freq" # min_periods must be an integer @@ -1963,6 +1962,16 @@ def validate(self): "closed only implemented for datetimelike and offset based windows" ) + def _determine_window_length(self) -> Union[int, float]: + """ + Calculate freq for PeriodIndexes based on Index freq. Can not use + nanos, because asi8 of PeriodIndex is not in nanos + """ + freq = self._validate_freq() + if isinstance(self._on, ABCPeriodIndex): + return freq.nanos / (self._on.freq.nanos / self._on.freq.n) + return freq.nanos + def _validate_monotonic(self): """ Validate monotonic (increasing or decreasing). 
diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 5ed5e99db8ab4..eaee276c7a388 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -837,3 +837,34 @@ def test_rolling_on_df_transposed(): result = df.T.rolling(min_periods=1, window=2).sum().T tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + ("index", "window"), + [ + ( + pd.period_range(start="2020-01-01 08:00", end="2020-01-01 08:08", freq="T"), + "2T", + ), + ( + pd.period_range( + start="2020-01-01 08:00", end="2020-01-01 12:00", freq="30T" + ), + "1h", + ), + ], +) +@pytest.mark.parametrize( + ("func", "values"), + [ + ("min", [np.nan, 0, 0, 1, 2, 3, 4, 5, 6]), + ("max", [np.nan, 0, 1, 2, 3, 4, 5, 6, 7]), + ("sum", [np.nan, 0, 1, 3, 5, 7, 9, 11, 13]), + ], +) +def test_rolling_period_index(index, window, func, values): + # GH: 34225 + ds = pd.Series([0, 1, 2, 3, 4, 5, 6, 7, 8], index=index) + result = getattr(ds.rolling(window, closed="left"), func)() + expected = pd.Series(values, index=index) + tm.assert_series_equal(result, expected) From 7cdf0ef67cf7e6f00b40ae5b24eff3d3df58e470 Mon Sep 17 00:00:00 2001 From: John Karasinski Date: Sat, 3 Oct 2020 07:25:28 -0700 Subject: [PATCH 1001/1025] DOC: update code style for development doc and user guide #36777 (#36821) --- doc/source/development/extending.rst | 28 +- doc/source/user_guide/computation.rst | 186 +++++------ doc/source/user_guide/dsintro.rst | 170 +++++----- doc/source/user_guide/visualization.rst | 395 ++++++++++++------------ 4 files changed, 409 insertions(+), 370 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index c708ebb361ed1..46960140d3a8c 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -34,7 +34,7 @@ decorate a class, providing the name of attribute to add. 
The class's @staticmethod def _validate(obj): # verify there is a column latitude and a column longitude - if 'latitude' not in obj.columns or 'longitude' not in obj.columns: + if "latitude" not in obj.columns or "longitude" not in obj.columns: raise AttributeError("Must have 'latitude' and 'longitude'.") @property @@ -176,6 +176,7 @@ your ``MyExtensionArray`` class, as follows: from pandas.api.extensions import ExtensionArray, ExtensionScalarOpsMixin + class MyExtensionArray(ExtensionArray, ExtensionScalarOpsMixin): pass @@ -271,6 +272,7 @@ included as a column in a pandas DataFrame): def __arrow_array__(self, type=None): # convert the underlying array values to a pyarrow Array import pyarrow + return pyarrow.array(..., type=type) The ``ExtensionDtype.__from_arrow__`` method then controls the conversion @@ -347,7 +349,6 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame .. code-block:: python class SubclassedSeries(pd.Series): - @property def _constructor(self): return SubclassedSeries @@ -358,7 +359,6 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame class SubclassedDataFrame(pd.DataFrame): - @property def _constructor(self): return SubclassedDataFrame @@ -377,7 +377,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(to_framed) - >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 @@ -387,7 +387,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(df) - >>> sliced1 = df[['A', 'B']] + >>> sliced1 = df[["A", "B"]] >>> sliced1 A B 0 1 4 @@ -397,7 +397,7 @@ Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame >>> type(sliced1) - >>> sliced2 = df['A'] + >>> sliced2 = df["A"] >>> sliced2 0 1 1 2 @@ -422,11 +422,11 @@ Below is an example to define two original 
properties, "internal_cache" as a tem class SubclassedDataFrame2(pd.DataFrame): # temporary properties - _internal_names = pd.DataFrame._internal_names + ['internal_cache'] + _internal_names = pd.DataFrame._internal_names + ["internal_cache"] _internal_names_set = set(_internal_names) # normal properties - _metadata = ['added_property'] + _metadata = ["added_property"] @property def _constructor(self): @@ -434,15 +434,15 @@ Below is an example to define two original properties, "internal_cache" as a tem .. code-block:: python - >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) + >>> df = SubclassedDataFrame2({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) >>> df A B C 0 1 4 7 1 2 5 8 2 3 6 9 - >>> df.internal_cache = 'cached' - >>> df.added_property = 'property' + >>> df.internal_cache = "cached" + >>> df.added_property = "property" >>> df.internal_cache cached @@ -450,11 +450,11 @@ Below is an example to define two original properties, "internal_cache" as a tem property # properties defined in _internal_names is reset after manipulation - >>> df[['A', 'B']].internal_cache + >>> df[["A", "B"]].internal_cache AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' # properties defined in _metadata are retained - >>> df[['A', 'B']].added_property + >>> df[["A", "B"]].added_property property .. _extending.plotting-backends: @@ -468,7 +468,7 @@ one based on Matplotlib. For example: .. code-block:: python - >>> pd.set_option('plotting.backend', 'backend.module') + >>> pd.set_option("plotting.backend", "backend.module") >>> pd.Series([1, 2, 3]).plot() This would be more or less equivalent to: diff --git a/doc/source/user_guide/computation.rst b/doc/source/user_guide/computation.rst index e7edda90610b5..2f6ac6b06d85e 100644 --- a/doc/source/user_guide/computation.rst +++ b/doc/source/user_guide/computation.rst @@ -63,8 +63,7 @@ series in the DataFrame, also excluding NA/null values. .. 
ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.cov() ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that @@ -73,9 +72,9 @@ in order to have a valid result. .. ipython:: python - frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.loc[frame.index[:5], 'a'] = np.nan - frame.loc[frame.index[5:10], 'b'] = np.nan + frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + frame.loc[frame.index[:5], "a"] = np.nan + frame.loc[frame.index[5:10], "b"] = np.nan frame.cov() @@ -116,13 +115,12 @@ Wikipedia has articles covering the above correlation coefficients: .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.iloc[::2] = np.nan # Series with Series - frame['a'].corr(frame['b']) - frame['a'].corr(frame['b'], method='spearman') + frame["a"].corr(frame["b"]) + frame["a"].corr(frame["b"], method="spearman") # Pairwise correlation of DataFrame columns frame.corr() @@ -134,9 +132,9 @@ Like ``cov``, ``corr`` also supports the optional ``min_periods`` keyword: .. 
ipython:: python - frame = pd.DataFrame(np.random.randn(20, 3), columns=['a', 'b', 'c']) - frame.loc[frame.index[:5], 'a'] = np.nan - frame.loc[frame.index[5:10], 'b'] = np.nan + frame = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + frame.loc[frame.index[:5], "a"] = np.nan + frame.loc[frame.index[5:10], "b"] = np.nan frame.corr() @@ -154,8 +152,8 @@ compute the correlation based on histogram intersection: # histogram intersection def histogram_intersection(a, b): - return np.minimum(np.true_divide(a, a.sum()), - np.true_divide(b, b.sum())).sum() + return np.minimum(np.true_divide(a, a.sum()), np.true_divide(b, b.sum())).sum() + frame.corr(method=histogram_intersection) @@ -165,8 +163,8 @@ DataFrame objects. .. ipython:: python - index = ['a', 'b', 'c', 'd', 'e'] - columns = ['one', 'two', 'three', 'four'] + index = ["a", "b", "c", "d", "e"] + columns = ["one", "two", "three", "four"] df1 = pd.DataFrame(np.random.randn(5, 4), index=index, columns=columns) df2 = pd.DataFrame(np.random.randn(4, 4), index=index[:4], columns=columns) df1.corrwith(df2) @@ -182,8 +180,8 @@ assigned the mean of the ranks (by default) for the group: .. ipython:: python - s = pd.Series(np.random.randn(5), index=list('abcde')) - s['d'] = s['b'] # so there's a tie + s = pd.Series(np.random.randn(5), index=list("abcde")) + s["d"] = s["b"] # so there's a tie s.rank() :meth:`~DataFrame.rank` is also a DataFrame method and can rank either the rows @@ -243,8 +241,7 @@ objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expan .. ipython:: python - s = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + s = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) s = s.cumsum() s @@ -279,24 +276,26 @@ We can then call methods on these ``rolling`` objects. These return like-indexed .. 
ipython:: python - s.plot(style='k--') + s.plot(style="k--") @savefig rolling_mean_ex.png - r.mean().plot(style='k') + r.mean().plot(style="k") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") They can also be applied to DataFrame objects. This is really just syntactic sugar for applying the moving window operator to all of the DataFrame's columns: .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame( + np.random.randn(1000, 4), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C", "D"], + ) df = df.cumsum() @savefig rolling_mean_frame.png @@ -368,7 +367,7 @@ compute the mean absolute deviation on a rolling basis: return np.fabs(x - x.mean()).mean() @savefig rolling_apply_ex.png - s.rolling(window=60).apply(mad, raw=True).plot(style='k') + s.rolling(window=60).apply(mad, raw=True).plot(style="k") Using the Numba engine ~~~~~~~~~~~~~~~~~~~~~~ @@ -453,23 +452,22 @@ The list of recognized types are the `scipy.signal window functions .. ipython:: python - ser = pd.Series(np.random.randn(10), - index=pd.date_range('1/1/2000', periods=10)) + ser = pd.Series(np.random.randn(10), index=pd.date_range("1/1/2000", periods=10)) - ser.rolling(window=5, win_type='triang').mean() + ser.rolling(window=5, win_type="triang").mean() Note that the ``boxcar`` window is equivalent to :meth:`~Rolling.mean`. .. ipython:: python - ser.rolling(window=5, win_type='boxcar').mean() + ser.rolling(window=5, win_type="boxcar").mean() ser.rolling(window=5).mean() For some windowing functions, additional parameters must be specified: .. ipython:: python - ser.rolling(window=5, win_type='gaussian').mean(std=0.1) + ser.rolling(window=5, win_type="gaussian").mean(std=0.1) .. _stats.moments.normalization: @@ -498,10 +496,10 @@ This can be particularly useful for a non-regular time frequency index. .. 
ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', - periods=5, - freq='s')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.date_range("20130101 09:00:00", periods=5, freq="s"), + ) dft This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. @@ -515,20 +513,26 @@ Specifying an offset allows a more intuitive specification of the rolling freque .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. .. ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.Index( + [ + pd.Timestamp("20130101 09:00:00"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:05"), + pd.Timestamp("20130101 09:00:06"), + ], + name="foo", + ), + ) dft dft.rolling(2).sum() @@ -537,7 +541,7 @@ Using the time-specification generates variable windows for this sparse data. .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the default of the index) in a DataFrame. @@ -546,7 +550,7 @@ default of the index) in a DataFrame. dft = dft.reset_index() dft - dft.rolling('2s', on='foo').sum() + dft.rolling("2s", on="foo").sum() .. 
_stats.custom_rolling_window: @@ -569,7 +573,7 @@ For example, if we have the following ``DataFrame``: use_expanding = [True, False, True, False, True] use_expanding - df = pd.DataFrame({'values': range(5)}) + df = pd.DataFrame({"values": range(5)}) df and we want to use an expanding window where ``use_expanding`` is ``True`` otherwise a window of size @@ -615,7 +619,8 @@ rolling operations over a non-fixed offset like a ``BusinessDay``. .. ipython:: python from pandas.api.indexers import VariableOffsetWindowIndexer - df = pd.DataFrame(range(10), index=pd.date_range('2020', periods=10)) + + df = pd.DataFrame(range(10), index=pd.date_range("2020", periods=10)) offset = pd.offsets.BDay(1) indexer = VariableOffsetWindowIndexer(index=df.index, offset=offset) df @@ -657,17 +662,21 @@ from present information back to past information. This allows the rolling windo .. ipython:: python - df = pd.DataFrame({'x': 1}, - index=[pd.Timestamp('20130101 09:00:01'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:04'), - pd.Timestamp('20130101 09:00:06')]) - - df["right"] = df.rolling('2s', closed='right').x.sum() # default - df["both"] = df.rolling('2s', closed='both').x.sum() - df["left"] = df.rolling('2s', closed='left').x.sum() - df["neither"] = df.rolling('2s', closed='neither').x.sum() + df = pd.DataFrame( + {"x": 1}, + index=[ + pd.Timestamp("20130101 09:00:01"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:04"), + pd.Timestamp("20130101 09:00:06"), + ], + ) + + df["right"] = df.rolling("2s", closed="right").x.sum() # default + df["both"] = df.rolling("2s", closed="both").x.sum() + df["left"] = df.rolling("2s", closed="left").x.sum() + df["neither"] = df.rolling("2s", closed="neither").x.sum() df @@ -745,13 +754,15 @@ For example: .. 
ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C', 'D']) + df = pd.DataFrame( + np.random.randn(1000, 4), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C", "D"], + ) df = df.cumsum() df2 = df[:20] - df2.rolling(window=5).corr(df2['B']) + df2.rolling(window=5).corr(df2["B"]) .. _stats.moments.corr_pairwise: @@ -776,14 +787,13 @@ can even be omitted: .. ipython:: python - covs = (df[['B', 'C', 'D']].rolling(window=50) - .cov(df[['A', 'B', 'C']], pairwise=True)) - covs.loc['2002-09-22':] + covs = df[["B", "C", "D"]].rolling(window=50).cov(df[["A", "B", "C"]], pairwise=True) + covs.loc["2002-09-22":] .. ipython:: python correls = df.rolling(window=50).corr() - correls.loc['2002-09-22':] + correls.loc["2002-09-22":] You can efficiently retrieve the time series of correlations between two columns by reshaping and indexing: @@ -791,12 +801,12 @@ columns by reshaping and indexing: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. ipython:: python @savefig rolling_corr_pairwise_ex.png - correls.unstack(1)[('A', 'C')].plot() + correls.unstack(1)[("A", "C")].plot() .. _stats.aggregate: @@ -810,9 +820,11 @@ perform multiple computations on the data. These operations are similar to the : .. ipython:: python - dfa = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C']) + dfa = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C"], + ) r = dfa.rolling(window=60, min_periods=1) r @@ -823,9 +835,9 @@ Series (or multiple Series) via standard ``__getitem__``. r.aggregate(np.sum) - r['A'].aggregate(np.sum) + r["A"].aggregate(np.sum) - r[['A', 'B']].aggregate(np.sum) + r[["A", "B"]].aggregate(np.sum) As you can see, the result of the aggregation will have the selected columns, or all columns if none are selected. 
@@ -840,7 +852,7 @@ aggregation with, outputting a DataFrame: .. ipython:: python - r['A'].agg([np.sum, np.mean, np.std]) + r["A"].agg([np.sum, np.mean, np.std]) On a windowed DataFrame, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -860,20 +872,20 @@ columns of a ``DataFrame``: .. ipython:: python - r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) + r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the windowed object .. ipython:: python - r.agg({'A': 'sum', 'B': 'std'}) + r.agg({"A": "sum", "B": "std"}) Furthermore you can pass a nested dict to indicate different aggregations on different columns. .. ipython:: python - r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + r.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) .. _stats.moments.expanding: @@ -967,7 +979,7 @@ all accept are: sn.expanding().sum() sn.cumsum() - sn.cumsum().fillna(method='ffill') + sn.cumsum().fillna(method="ffill") An expanding window statistic will be more stable (and less responsive) than @@ -978,14 +990,14 @@ relative impact of an individual data point. As an example, here is the .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. ipython:: python - s.plot(style='k--') + s.plot(style="k--") @savefig expanding_mean_frame.png - s.expanding().mean().plot(style='k') + s.expanding().mean().plot(style="k") .. _stats.moments.exponentially_weighted: @@ -1115,10 +1127,10 @@ of ``times``. .. 
ipython:: python - df = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}) + df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]}) df - times = ['2020-01-01', '2020-01-03', '2020-01-10', '2020-01-15', '2020-01-17'] - df.ewm(halflife='4 days', times=pd.DatetimeIndex(times)).mean() + times = ["2020-01-01", "2020-01-03", "2020-01-10", "2020-01-15", "2020-01-17"] + df.ewm(halflife="4 days", times=pd.DatetimeIndex(times)).mean() The following formula is used to compute exponentially weighted mean with an input vector of times: @@ -1130,10 +1142,10 @@ Here is an example for a univariate time series: .. ipython:: python - s.plot(style='k--') + s.plot(style="k--") @savefig ewma_ex.png - s.ewm(span=20).mean().plot(style='k') + s.ewm(span=20).mean().plot(style="k") ExponentialMovingWindow has a ``min_periods`` argument, which has the same meaning it does for all the ``.expanding`` and ``.rolling`` methods: diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index c27c73d439a0c..d698b316d321e 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -51,7 +51,7 @@ index is passed, one will be created having values ``[0, ..., len(data) - 1]``. .. ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s s.index @@ -71,7 +71,7 @@ Series can be instantiated from dicts: .. ipython:: python - d = {'b': 1, 'a': 0, 'c': 2} + d = {"b": 1, "a": 0, "c": 2} pd.Series(d) .. note:: @@ -92,9 +92,9 @@ index will be pulled out. .. ipython:: python - d = {'a': 0., 'b': 1., 'c': 2.} + d = {"a": 0.0, "b": 1.0, "c": 2.0} pd.Series(d) - pd.Series(d, index=['b', 'c', 'd', 'a']) + pd.Series(d, index=["b", "c", "d", "a"]) .. note:: @@ -107,7 +107,7 @@ provided. The value will be repeated to match the length of **index**. .. 
ipython:: python - pd.Series(5., index=['a', 'b', 'c', 'd', 'e']) + pd.Series(5.0, index=["a", "b", "c", "d", "e"]) Series is ndarray-like ~~~~~~~~~~~~~~~~~~~~~~ @@ -173,26 +173,26 @@ label: .. ipython:: python - s['a'] - s['e'] = 12. + s["a"] + s["e"] = 12.0 s - 'e' in s - 'f' in s + "e" in s + "f" in s If a label is not contained, an exception is raised: .. code-block:: python - >>> s['f'] + >>> s["f"] KeyError: 'f' Using the ``get`` method, a missing label will return None or specified default: .. ipython:: python - s.get('f') + s.get("f") - s.get('f', np.nan) + s.get("f", np.nan) See also the :ref:`section on attribute access`. @@ -244,7 +244,7 @@ Series can also have a ``name`` attribute: .. ipython:: python - s = pd.Series(np.random.randn(5), name='something') + s = pd.Series(np.random.randn(5), name="something") s s.name @@ -306,13 +306,15 @@ keys. .. ipython:: python - d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']), - 'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])} + d = { + "one": pd.Series([1.0, 2.0, 3.0], index=["a", "b", "c"]), + "two": pd.Series([1.0, 2.0, 3.0, 4.0], index=["a", "b", "c", "d"]), + } df = pd.DataFrame(d) df - pd.DataFrame(d, index=['d', 'b', 'a']) - pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three']) + pd.DataFrame(d, index=["d", "b", "a"]) + pd.DataFrame(d, index=["d", "b", "a"], columns=["two", "three"]) The row and column labels can be accessed respectively by accessing the **index** and **columns** attributes: @@ -336,10 +338,9 @@ result will be ``range(n)``, where ``n`` is the array length. .. ipython:: python - d = {'one': [1., 2., 3., 4.], - 'two': [4., 3., 2., 1.]} + d = {"one": [1.0, 2.0, 3.0, 4.0], "two": [4.0, 3.0, 2.0, 1.0]} pd.DataFrame(d) - pd.DataFrame(d, index=['a', 'b', 'c', 'd']) + pd.DataFrame(d, index=["a", "b", "c", "d"]) From structured or record array ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -348,12 +349,12 @@ This case is handled identically to a dict of arrays. .. 
ipython:: python - data = np.zeros((2, ), dtype=[('A', 'i4'), ('B', 'f4'), ('C', 'a10')]) - data[:] = [(1, 2., 'Hello'), (2, 3., "World")] + data = np.zeros((2,), dtype=[("A", "i4"), ("B", "f4"), ("C", "a10")]) + data[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")] pd.DataFrame(data) - pd.DataFrame(data, index=['first', 'second']) - pd.DataFrame(data, columns=['C', 'A', 'B']) + pd.DataFrame(data, index=["first", "second"]) + pd.DataFrame(data, columns=["C", "A", "B"]) .. note:: @@ -367,10 +368,10 @@ From a list of dicts .. ipython:: python - data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}] + data2 = [{"a": 1, "b": 2}, {"a": 5, "b": 10, "c": 20}] pd.DataFrame(data2) - pd.DataFrame(data2, index=['first', 'second']) - pd.DataFrame(data2, columns=['a', 'b']) + pd.DataFrame(data2, index=["first", "second"]) + pd.DataFrame(data2, columns=["a", "b"]) .. _basics.dataframe.from_dict_of_tuples: @@ -382,11 +383,15 @@ dictionary. .. ipython:: python - pd.DataFrame({('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2}, - ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4}, - ('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6}, - ('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8}, - ('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}}) + pd.DataFrame( + { + ("a", "b"): {("A", "B"): 1, ("A", "C"): 2}, + ("a", "a"): {("A", "C"): 3, ("A", "B"): 4}, + ("a", "c"): {("A", "B"): 5, ("A", "C"): 6}, + ("b", "a"): {("A", "C"): 7, ("A", "B"): 8}, + ("b", "b"): {("A", "D"): 9, ("A", "B"): 10}, + } + ) .. _basics.dataframe.from_series: @@ -414,11 +419,11 @@ first ``namedtuple``, a ``ValueError`` is raised. from collections import namedtuple - Point = namedtuple('Point', 'x y') + Point = namedtuple("Point", "x y") pd.DataFrame([Point(0, 0), Point(0, 3), (2, 3)]) - Point3D = namedtuple('Point3D', 'x y z') + Point3D = namedtuple("Point3D", "x y z") pd.DataFrame([Point3D(0, 0, 0), Point3D(0, 3, 5), Point(2, 3)]) @@ -468,15 +473,18 @@ set to ``'index'`` in order to use the dict keys as row labels. .. 
ipython:: python - pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])])) + pd.DataFrame.from_dict(dict([("A", [1, 2, 3]), ("B", [4, 5, 6])])) If you pass ``orient='index'``, the keys will be the row labels. In this case, you can also pass the desired column names: .. ipython:: python - pd.DataFrame.from_dict(dict([('A', [1, 2, 3]), ('B', [4, 5, 6])]), - orient='index', columns=['one', 'two', 'three']) + pd.DataFrame.from_dict( + dict([("A", [1, 2, 3]), ("B", [4, 5, 6])]), + orient="index", + columns=["one", "two", "three"], + ) .. _basics.dataframe.from_records: @@ -490,7 +498,7 @@ dtype. For example: .. ipython:: python data - pd.DataFrame.from_records(data, index='C') + pd.DataFrame.from_records(data, index="C") .. _basics.dataframe.sel_add_del: @@ -503,17 +511,17 @@ the analogous dict operations: .. ipython:: python - df['one'] - df['three'] = df['one'] * df['two'] - df['flag'] = df['one'] > 2 + df["one"] + df["three"] = df["one"] * df["two"] + df["flag"] = df["one"] > 2 df Columns can be deleted or popped like with a dict: .. ipython:: python - del df['two'] - three = df.pop('three') + del df["two"] + three = df.pop("three") df When inserting a scalar value, it will naturally be propagated to fill the @@ -521,7 +529,7 @@ column: .. ipython:: python - df['foo'] = 'bar' + df["foo"] = "bar" df When inserting a Series that does not have the same index as the DataFrame, it @@ -529,7 +537,7 @@ will be conformed to the DataFrame's index: .. ipython:: python - df['one_trunc'] = df['one'][:2] + df["one_trunc"] = df["one"][:2] df You can insert raw ndarrays but their length must match the length of the @@ -540,7 +548,7 @@ available to insert at a particular location in the columns: .. ipython:: python - df.insert(1, 'bar', df['one']) + df.insert(1, "bar", df["one"]) df .. _dsintro.chained_assignment: @@ -556,17 +564,16 @@ derived from existing columns. .. 
ipython:: python - iris = pd.read_csv('data/iris.data') + iris = pd.read_csv("data/iris.data") iris.head() - (iris.assign(sepal_ratio=iris['SepalWidth'] / iris['SepalLength']) - .head()) + iris.assign(sepal_ratio=iris["SepalWidth"] / iris["SepalLength"]).head() In the example above, we inserted a precomputed value. We can also pass in a function of one argument to be evaluated on the DataFrame being assigned to. .. ipython:: python - iris.assign(sepal_ratio=lambda x: (x['SepalWidth'] / x['SepalLength'])).head() + iris.assign(sepal_ratio=lambda x: (x["SepalWidth"] / x["SepalLength"])).head() ``assign`` **always** returns a copy of the data, leaving the original DataFrame untouched. @@ -580,10 +587,14 @@ greater than 5, calculate the ratio, and plot: .. ipython:: python @savefig basics_assign.png - (iris.query('SepalLength > 5') - .assign(SepalRatio=lambda x: x.SepalWidth / x.SepalLength, - PetalRatio=lambda x: x.PetalWidth / x.PetalLength) - .plot(kind='scatter', x='SepalRatio', y='PetalRatio')) + ( + iris.query("SepalLength > 5") + .assign( + SepalRatio=lambda x: x.SepalWidth / x.SepalLength, + PetalRatio=lambda x: x.PetalWidth / x.PetalLength, + ) + .plot(kind="scatter", x="SepalRatio", y="PetalRatio") + ) Since a function is passed in, the function is computed on the DataFrame being assigned to. Importantly, this is the DataFrame that's been filtered @@ -603,10 +614,8 @@ to a column created earlier in the same :meth:`~DataFrame.assign`. .. ipython:: python - dfa = pd.DataFrame({"A": [1, 2, 3], - "B": [4, 5, 6]}) - dfa.assign(C=lambda x: x['A'] + x['B'], - D=lambda x: x['A'] + x['C']) + dfa = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + dfa.assign(C=lambda x: x["A"] + x["B"], D=lambda x: x["A"] + x["C"]) In the second expression, ``x['C']`` will refer to the newly created column, that's equal to ``dfa['A'] + dfa['B']``. @@ -631,7 +640,7 @@ DataFrame: .. 
ipython:: python - df.loc['b'] + df.loc["b"] df.iloc[2] For a more exhaustive treatment of sophisticated label-based indexing and @@ -650,8 +659,8 @@ union of the column and row labels. .. ipython:: python - df = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D']) - df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C']) + df = pd.DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"]) + df2 = pd.DataFrame(np.random.randn(7, 3), columns=["A", "B", "C"]) df + df2 When doing an operation between DataFrame and Series, the default behavior is @@ -680,8 +689,8 @@ Boolean operators work as well: .. ipython:: python - df1 = pd.DataFrame({'a': [1, 0, 1], 'b': [0, 1, 1]}, dtype=bool) - df2 = pd.DataFrame({'a': [0, 1, 1], 'b': [1, 1, 0]}, dtype=bool) + df1 = pd.DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]}, dtype=bool) + df2 = pd.DataFrame({"a": [0, 1, 1], "b": [1, 1, 0]}, dtype=bool) df1 & df2 df1 | df2 df1 ^ df2 @@ -737,8 +746,8 @@ on two :class:`Series` with differently ordered labels will align before the ope .. ipython:: python - ser1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) - ser2 = pd.Series([1, 3, 5], index=['b', 'a', 'c']) + ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + ser2 = pd.Series([1, 3, 5], index=["b", "a", "c"]) ser1 ser2 np.remainder(ser1, ser2) @@ -748,7 +757,7 @@ with missing values. .. ipython:: python - ser3 = pd.Series([2, 4, 6], index=['b', 'c', 'd']) + ser3 = pd.Series([2, 4, 6], index=["b", "c", "d"]) ser3 np.remainder(ser1, ser3) @@ -778,11 +787,11 @@ R package): :suppress: # force a summary to be printed - pd.set_option('display.max_rows', 5) + pd.set_option("display.max_rows", 5) .. 
ipython:: python - baseball = pd.read_csv('data/baseball.csv') + baseball = pd.read_csv("data/baseball.csv") print(baseball) baseball.info() @@ -791,7 +800,7 @@ R package): :okwarning: # restore GlobalPrintConfig - pd.reset_option(r'^display\.') + pd.reset_option(r"^display\.") However, using ``to_string`` will return a string representation of the DataFrame in tabular form, though it won't always fit the console width: @@ -812,7 +821,7 @@ option: .. ipython:: python - pd.set_option('display.width', 40) # default is 80 + pd.set_option("display.width", 40) # default is 80 pd.DataFrame(np.random.randn(3, 12)) @@ -820,21 +829,25 @@ You can adjust the max width of the individual columns by setting ``display.max_ .. ipython:: python - datafile = {'filename': ['filename_01', 'filename_02'], - 'path': ["media/user_name/storage/folder_01/filename_01", - "media/user_name/storage/folder_02/filename_02"]} + datafile = { + "filename": ["filename_01", "filename_02"], + "path": [ + "media/user_name/storage/folder_01/filename_01", + "media/user_name/storage/folder_02/filename_02", + ], + } - pd.set_option('display.max_colwidth', 30) + pd.set_option("display.max_colwidth", 30) pd.DataFrame(datafile) - pd.set_option('display.max_colwidth', 100) + pd.set_option("display.max_colwidth", 100) pd.DataFrame(datafile) .. ipython:: python :suppress: - pd.reset_option('display.width') - pd.reset_option('display.max_colwidth') + pd.reset_option("display.width") + pd.reset_option("display.max_colwidth") You can also disable this feature via the ``expand_frame_repr`` option. This will print the table in one block. @@ -847,8 +860,7 @@ accessed like an attribute: .. 
ipython:: python - df = pd.DataFrame({'foo1': np.random.randn(5), - 'foo2': np.random.randn(5)}) + df = pd.DataFrame({"foo1": np.random.randn(5), "foo2": np.random.randn(5)}) df df.foo1 diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index f41912445455d..46ab29a52747a 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -11,7 +11,8 @@ We use the standard convention for referencing the matplotlib API: .. ipython:: python import matplotlib.pyplot as plt - plt.close('all') + + plt.close("all") We provide the basics in pandas to easily create decent looking plots. See the :ref:`ecosystem ` section for visualization @@ -39,8 +40,7 @@ The ``plot`` method on Series and DataFrame is just a simple wrapper around .. ipython:: python - ts = pd.Series(np.random.randn(1000), - index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), index=pd.date_range("1/1/2000", periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -54,18 +54,17 @@ On DataFrame, :meth:`~DataFrame.plot` is a convenience to plot all of the column .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 4), - index=ts.index, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=list("ABCD")) df = df.cumsum() plt.figure(); @savefig frame_plot_basic.png - df.plot(); + df.plot() You can plot one column versus another using the ``x`` and ``y`` keywords in :meth:`~DataFrame.plot`: @@ -73,17 +72,17 @@ You can plot one column versus another using the ``x`` and ``y`` keywords in .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. 
ipython:: python - df3 = pd.DataFrame(np.random.randn(1000, 2), columns=['B', 'C']).cumsum() - df3['A'] = pd.Series(list(range(len(df)))) + df3 = pd.DataFrame(np.random.randn(1000, 2), columns=["B", "C"]).cumsum() + df3["A"] = pd.Series(list(range(len(df)))) @savefig df_plot_xy.png - df3.plot(x='A', y='B') + df3.plot(x="A", y="B") .. note:: @@ -93,7 +92,7 @@ You can plot one column versus another using the ``x`` and ``y`` keywords in .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.other: @@ -120,7 +119,7 @@ For example, a bar plot can be created the following way: plt.figure(); @savefig bar_plot_ex.png - df.iloc[5].plot(kind='bar'); + df.iloc[5].plot(kind="bar") You can also create these other plots using the methods ``DataFrame.plot.`` instead of providing the ``kind`` keyword argument. This makes it easier to discover plot methods and the specific arguments they use: @@ -164,7 +163,7 @@ For labeled, non-time series data, you may wish to produce a bar plot: @savefig bar_plot_ex.png df.iloc[5].plot.bar() - plt.axhline(0, color='k'); + plt.axhline(0, color="k") Calling a DataFrame's :meth:`plot.bar() ` method produces a multiple bar plot: @@ -172,42 +171,42 @@ bar plot: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. ipython:: python - df2 = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + df2 = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig bar_plot_multi_ex.png - df2.plot.bar(); + df2.plot.bar() To produce a stacked bar plot, pass ``stacked=True``: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @savefig bar_plot_stacked_ex.png - df2.plot.bar(stacked=True); + df2.plot.bar(stacked=True) To get horizontal bar plots, use the ``barh`` method: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. 
ipython:: python @savefig barh_plot_stacked_ex.png - df2.plot.barh(stacked=True); + df2.plot.barh(stacked=True) .. _visualization.hist: @@ -218,8 +217,14 @@ Histograms can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Seri .. ipython:: python - df4 = pd.DataFrame({'a': np.random.randn(1000) + 1, 'b': np.random.randn(1000), - 'c': np.random.randn(1000) - 1}, columns=['a', 'b', 'c']) + df4 = pd.DataFrame( + { + "a": np.random.randn(1000) + 1, + "b": np.random.randn(1000), + "c": np.random.randn(1000) - 1, + }, + columns=["a", "b", "c"], + ) plt.figure(); @@ -230,7 +235,7 @@ Histograms can be drawn by using the :meth:`DataFrame.plot.hist` and :meth:`Seri .. ipython:: python :suppress: - plt.close('all') + plt.close("all") A histogram can be stacked using ``stacked=True``. Bin size can be changed using the ``bins`` keyword. @@ -245,7 +250,7 @@ using the ``bins`` keyword. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can pass other keywords supported by matplotlib ``hist``. For example, horizontal and cumulative histograms can be drawn by @@ -256,12 +261,12 @@ horizontal and cumulative histograms can be drawn by plt.figure(); @savefig hist_new_kwargs.png - df4['a'].plot.hist(orientation='horizontal', cumulative=True) + df4["a"].plot.hist(orientation="horizontal", cumulative=True) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`hist ` method and the `matplotlib hist documentation `__ for more. @@ -274,12 +279,12 @@ The existing interface ``DataFrame.hist`` to plot histogram still can be used. plt.figure(); @savefig hist_plot_ex.png - df['A'].diff().hist() + df["A"].diff().hist() .. 
ipython:: python :suppress: - plt.close('all') + plt.close("all") :meth:`DataFrame.hist` plots the histograms of the columns on multiple subplots: @@ -289,7 +294,7 @@ subplots: plt.figure() @savefig frame_hist_ex.png - df.diff().hist(color='k', alpha=0.5, bins=50) + df.diff().hist(color="k", alpha=0.5, bins=50) The ``by`` keyword can be specified to plot grouped histograms: @@ -297,7 +302,7 @@ The ``by`` keyword can be specified to plot grouped histograms: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) @@ -323,12 +328,12 @@ a uniform random variable on [0,1). .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python - df = pd.DataFrame(np.random.rand(10, 5), columns=['A', 'B', 'C', 'D', 'E']) + df = pd.DataFrame(np.random.rand(10, 5), columns=["A", "B", "C", "D", "E"]) @savefig box_plot_new.png df.plot.box() @@ -348,16 +353,20 @@ more complicated colorization, you can get each drawn artists by passing .. ipython:: python - color = {'boxes': 'DarkGreen', 'whiskers': 'DarkOrange', - 'medians': 'DarkBlue', 'caps': 'Gray'} + color = { + "boxes": "DarkGreen", + "whiskers": "DarkOrange", + "medians": "DarkBlue", + "caps": "Gray", + } @savefig box_new_colorize.png - df.plot.box(color=color, sym='r+') + df.plot.box(color=color, sym="r+") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Also, you can pass other keywords supported by matplotlib ``boxplot``. For example, horizontal and custom-positioned boxplot can be drawn by @@ -378,7 +387,7 @@ The existing interface ``DataFrame.boxplot`` to plot boxplot still can be used. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python @@ -396,19 +405,19 @@ groupings. For instance, .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. 
ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2']) - df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) + df = pd.DataFrame(np.random.rand(10, 2), columns=["Col1", "Col2"]) + df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) - plt.figure(); + plt.figure() @savefig box_plot_ex2.png - bp = df.boxplot(by='X') + bp = df.boxplot(by="X") You can also pass a subset of columns to plot, as well as group by multiple columns: @@ -416,25 +425,25 @@ columns: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") np.random.seed(123456) .. ipython:: python :okwarning: - df = pd.DataFrame(np.random.rand(10, 3), columns=['Col1', 'Col2', 'Col3']) - df['X'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B']) - df['Y'] = pd.Series(['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B']) + df = pd.DataFrame(np.random.rand(10, 3), columns=["Col1", "Col2", "Col3"]) + df["X"] = pd.Series(["A", "A", "A", "A", "A", "B", "B", "B", "B", "B"]) + df["Y"] = pd.Series(["A", "B", "A", "B", "A", "B", "A", "B", "A", "B"]) plt.figure(); @savefig box_plot_ex3.png - bp = df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y']) + bp = df.boxplot(column=["Col1", "Col2"], by=["X", "Y"]) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.box.return: @@ -462,16 +471,16 @@ keyword, will affect the output type as well: np.random.seed(1234) df_box = pd.DataFrame(np.random.randn(50, 2)) - df_box['g'] = np.random.choice(['A', 'B'], size=50) - df_box.loc[df_box['g'] == 'B', 1] += 3 + df_box["g"] = np.random.choice(["A", "B"], size=50) + df_box.loc[df_box["g"] == "B", 1] += 3 @savefig boxplot_groupby.png - bp = df_box.boxplot(by='g') + bp = df_box.boxplot(by="g") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") The subplots above are split by the numeric columns first, then the value of the ``g`` column. 
Below the subplots are first split by the value of ``g``, @@ -481,12 +490,12 @@ then by the numeric columns. :okwarning: @savefig groupby_boxplot_vis.png - bp = df_box.groupby('g').boxplot() + bp = df_box.groupby("g").boxplot() .. ipython:: python :suppress: - plt.close('all') + plt.close("all") .. _visualization.area_plot: @@ -506,23 +515,23 @@ When input data contains ``NaN``, it will be automatically filled by 0. If you w .. ipython:: python - df = pd.DataFrame(np.random.rand(10, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(10, 4), columns=["a", "b", "c", "d"]) @savefig area_plot_stacked.png - df.plot.area(); + df.plot.area() To produce an unstacked plot, pass ``stacked=False``. Alpha value is set to 0.5 unless otherwise specified: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python @savefig area_plot_unstacked.png - df.plot.area(stacked=False); + df.plot.area(stacked=False) .. _visualization.scatter: @@ -537,29 +546,29 @@ These can be specified by the ``x`` and ``y`` keywords. :suppress: np.random.seed(123456) - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python - df = pd.DataFrame(np.random.rand(50, 4), columns=['a', 'b', 'c', 'd']) + df = pd.DataFrame(np.random.rand(50, 4), columns=["a", "b", "c", "d"]) @savefig scatter_plot.png - df.plot.scatter(x='a', y='b'); + df.plot.scatter(x="a", y="b") To plot multiple column groups in a single axes, repeat ``plot`` method specifying target ``ax``. It is recommended to specify ``color`` and ``label`` keywords to distinguish each groups. .. ipython:: python - ax = df.plot.scatter(x='a', y='b', color='DarkBlue', label='Group 1'); + ax = df.plot.scatter(x="a", y="b", color="DarkBlue", label="Group 1") @savefig scatter_plot_repeated.png - df.plot.scatter(x='c', y='d', color='DarkGreen', label='Group 2', ax=ax); + df.plot.scatter(x="c", y="d", color="DarkGreen", label="Group 2", ax=ax) .. 
ipython:: python :suppress: - plt.close('all') + plt.close("all") The keyword ``c`` may be given as the name of a column to provide colors for each point: @@ -567,13 +576,13 @@ each point: .. ipython:: python @savefig scatter_plot_colored.png - df.plot.scatter(x='a', y='b', c='c', s=50); + df.plot.scatter(x="a", y="b", c="c", s=50) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can pass other keywords supported by matplotlib :meth:`scatter `. The example below shows a @@ -582,12 +591,12 @@ bubble chart using a column of the ``DataFrame`` as the bubble size. .. ipython:: python @savefig scatter_plot_bubble.png - df.plot.scatter(x='a', y='b', s=df['c'] * 200); + df.plot.scatter(x="a", y="b", s=df["c"] * 200) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`scatter ` method and the `matplotlib scatter documentation `__ for more. @@ -609,11 +618,11 @@ too dense to plot each point individually. .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) - df['b'] = df['b'] + np.arange(1000) + df = pd.DataFrame(np.random.randn(1000, 2), columns=["a", "b"]) + df["b"] = df["b"] + np.arange(1000) @savefig hexbin_plot.png - df.plot.hexbin(x='a', y='b', gridsize=25) + df.plot.hexbin(x="a", y="b", gridsize=25) A useful keyword argument is ``gridsize``; it controls the number of hexagons @@ -631,23 +640,23 @@ given by column ``z``. The bins are aggregated with NumPy's ``max`` function. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() np.random.seed(123456) .. 
ipython:: python - df = pd.DataFrame(np.random.randn(1000, 2), columns=['a', 'b']) - df['b'] = df['b'] = df['b'] + np.arange(1000) - df['z'] = np.random.uniform(0, 3, 1000) + df = pd.DataFrame(np.random.randn(1000, 2), columns=["a", "b"]) + df["b"] = df["b"] = df["b"] + np.arange(1000) + df["z"] = np.random.uniform(0, 3, 1000) @savefig hexbin_plot_agg.png - df.plot.hexbin(x='a', y='b', C='z', reduce_C_function=np.max, gridsize=25) + df.plot.hexbin(x="a", y="b", C="z", reduce_C_function=np.max, gridsize=25) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") See the :meth:`hexbin ` method and the `matplotlib hexbin documentation `__ for more. @@ -670,8 +679,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. .. ipython:: python :okwarning: - series = pd.Series(3 * np.random.rand(4), - index=['a', 'b', 'c', 'd'], name='series') + series = pd.Series(3 * np.random.rand(4), index=["a", "b", "c", "d"], name="series") @savefig series_pie_plot.png series.plot.pie(figsize=(6, 6)) @@ -679,7 +687,7 @@ A ``ValueError`` will be raised if there are any negative values in your data. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") For pie plots it's best to use square figures, i.e. a figure aspect ratio 1. You can create the figure with equal width and height, or force the aspect ratio @@ -700,8 +708,9 @@ drawn in each pie plots by default; specify ``legend=False`` to hide it. .. ipython:: python - df = pd.DataFrame(3 * np.random.rand(4, 2), - index=['a', 'b', 'c', 'd'], columns=['x', 'y']) + df = pd.DataFrame( + 3 * np.random.rand(4, 2), index=["a", "b", "c", "d"], columns=["x", "y"] + ) @savefig df_pie_plot.png df.plot.pie(subplots=True, figsize=(8, 4)) @@ -709,7 +718,7 @@ drawn in each pie plots by default; specify ``legend=False`` to hide it. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") You can use the ``labels`` and ``colors`` keywords to specify the labels and colors of each wedge. 
@@ -731,21 +740,26 @@ Also, other keywords supported by :func:`matplotlib.pyplot.pie` can be used. .. ipython:: python @savefig series_pie_plot_options.png - series.plot.pie(labels=['AA', 'BB', 'CC', 'DD'], colors=['r', 'g', 'b', 'c'], - autopct='%.2f', fontsize=20, figsize=(6, 6)) + series.plot.pie( + labels=["AA", "BB", "CC", "DD"], + colors=["r", "g", "b", "c"], + autopct="%.2f", + fontsize=20, + figsize=(6, 6), + ) If you pass values whose sum total is less than 1.0, matplotlib draws a semicircle. .. ipython:: python :suppress: - plt.close('all') + plt.close("all") plt.figure() .. ipython:: python :okwarning: - series = pd.Series([0.1] * 4, index=['a', 'b', 'c', 'd'], name='series2') + series = pd.Series([0.1] * 4, index=["a", "b", "c", "d"], name="series2") @savefig series_pie_plot_semi.png series.plot.pie(figsize=(6, 6)) @@ -755,7 +769,7 @@ See the `matplotlib pie documentation `__ for more. @@ -1560,12 +1574,12 @@ To use the cubehelix colormap, we can pass ``colormap='cubehelix'``. plt.figure() @savefig cubehelix.png - df.plot(colormap='cubehelix') + df.plot(colormap="cubehelix") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Alternatively, we can pass the colormap itself: @@ -1581,7 +1595,7 @@ Alternatively, we can pass the colormap itself: .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Colormaps can also be used other plot types, like bar charts: @@ -1598,12 +1612,12 @@ Colormaps can also be used other plot types, like bar charts: plt.figure() @savefig greens.png - dd.plot.bar(colormap='Greens') + dd.plot.bar(colormap="Greens") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Parallel coordinates charts: @@ -1612,12 +1626,12 @@ Parallel coordinates charts: plt.figure() @savefig parallel_gist_rainbow.png - parallel_coordinates(data, 'Name', colormap='gist_rainbow') + parallel_coordinates(data, "Name", colormap="gist_rainbow") .. 
ipython:: python :suppress: - plt.close('all') + plt.close("all") Andrews curves charts: @@ -1626,12 +1640,12 @@ Andrews curves charts: plt.figure() @savefig andrews_curve_winter.png - andrews_curves(data, 'Name', colormap='winter') + andrews_curves(data, "Name", colormap="winter") .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Plotting directly with matplotlib --------------------------------- @@ -1655,23 +1669,24 @@ when plotting a large number of points. .. ipython:: python - price = pd.Series(np.random.randn(150).cumsum(), - index=pd.date_range('2000-1-1', periods=150, freq='B')) + price = pd.Series( + np.random.randn(150).cumsum(), + index=pd.date_range("2000-1-1", periods=150, freq="B"), + ) ma = price.rolling(20).mean() mstd = price.rolling(20).std() plt.figure() - plt.plot(price.index, price, 'k') - plt.plot(ma.index, ma, 'b') + plt.plot(price.index, price, "k") + plt.plot(ma.index, ma, "b") @savefig bollinger.png - plt.fill_between(mstd.index, ma - 2 * mstd, ma + 2 * mstd, - color='b', alpha=0.2) + plt.fill_between(mstd.index, ma - 2 * mstd, ma + 2 * mstd, color="b", alpha=0.2) .. ipython:: python :suppress: - plt.close('all') + plt.close("all") Plotting backends ----------------- @@ -1685,21 +1700,21 @@ function. For example: .. code-block:: python - >>> Series([1, 2, 3]).plot(backend='backend.module') + >>> Series([1, 2, 3]).plot(backend="backend.module") Alternatively, you can also set this option globally, do you don't need to specify the keyword in each ``plot`` call. For example: .. code-block:: python - >>> pd.set_option('plotting.backend', 'backend.module') + >>> pd.set_option("plotting.backend", "backend.module") >>> pd.Series([1, 2, 3]).plot() Or: .. 
code-block:: python - >>> pd.options.plotting.backend = 'backend.module' + >>> pd.options.plotting.backend = "backend.module" >>> pd.Series([1, 2, 3]).plot() This would be more or less equivalent to: From b9e720454d327595679a2f12aba8c42f9a847261 Mon Sep 17 00:00:00 2001 From: John Karasinski Date: Sat, 3 Oct 2020 07:30:49 -0700 Subject: [PATCH 1002/1025] DOC: update code style for user guide for #36777 (#36823) --- doc/source/user_guide/groupby.rst | 463 ++++++++-------- doc/source/user_guide/io.rst | 12 +- doc/source/user_guide/missing_data.rst | 137 ++--- doc/source/user_guide/scale.rst | 35 +- doc/source/user_guide/timeseries.rst | 713 +++++++++++++------------ 5 files changed, 717 insertions(+), 643 deletions(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 52342de98de79..9696f14f03b56 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -68,19 +68,23 @@ object (more on what the GroupBy object is later), you may do the following: .. 
ipython:: python - df = pd.DataFrame([('bird', 'Falconiformes', 389.0), - ('bird', 'Psittaciformes', 24.0), - ('mammal', 'Carnivora', 80.2), - ('mammal', 'Primates', np.nan), - ('mammal', 'Carnivora', 58)], - index=['falcon', 'parrot', 'lion', 'monkey', 'leopard'], - columns=('class', 'order', 'max_speed')) + df = pd.DataFrame( + [ + ("bird", "Falconiformes", 389.0), + ("bird", "Psittaciformes", 24.0), + ("mammal", "Carnivora", 80.2), + ("mammal", "Primates", np.nan), + ("mammal", "Carnivora", 58), + ], + index=["falcon", "parrot", "lion", "monkey", "leopard"], + columns=("class", "order", "max_speed"), + ) df # default is axis=0 - grouped = df.groupby('class') - grouped = df.groupby('order', axis='columns') - grouped = df.groupby(['class', 'order']) + grouped = df.groupby("class") + grouped = df.groupby("order", axis="columns") + grouped = df.groupby(["class", "order"]) The mapping can be specified many different ways: @@ -103,12 +107,14 @@ consider the following ``DataFrame``: .. ipython:: python - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) df On a DataFrame, we obtain a GroupBy object by calling :meth:`~DataFrame.groupby`. @@ -116,8 +122,8 @@ We could naturally group by either the ``A`` or ``B`` columns, or both: .. ipython:: python - grouped = df.groupby('A') - grouped = df.groupby(['A', 'B']) + grouped = df.groupby("A") + grouped = df.groupby(["A", "B"]) .. versionadded:: 0.24 @@ -126,8 +132,8 @@ but the specified columns .. 
ipython:: python - df2 = df.set_index(['A', 'B']) - grouped = df2.groupby(level=df2.index.names.difference(['B'])) + df2 = df.set_index(["A", "B"]) + grouped = df2.groupby(level=df2.index.names.difference(["B"])) grouped.sum() These will split the DataFrame on its index (rows). We could also split by the @@ -181,9 +187,9 @@ By default the group keys are sorted during the ``groupby`` operation. You may h .. ipython:: python - df2 = pd.DataFrame({'X': ['B', 'B', 'A', 'A'], 'Y': [1, 2, 3, 4]}) - df2.groupby(['X']).sum() - df2.groupby(['X'], sort=False).sum() + df2 = pd.DataFrame({"X": ["B", "B", "A", "A"], "Y": [1, 2, 3, 4]}) + df2.groupby(["X"]).sum() + df2.groupby(["X"], sort=False).sum() Note that ``groupby`` will preserve the order in which *observations* are sorted *within* each group. @@ -191,10 +197,10 @@ For example, the groups created by ``groupby()`` below are in the order they app .. ipython:: python - df3 = pd.DataFrame({'X': ['A', 'B', 'A', 'B'], 'Y': [1, 4, 3, 2]}) - df3.groupby(['X']).get_group('A') + df3 = pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}) + df3.groupby(["X"]).get_group("A") - df3.groupby(['X']).get_group('B') + df3.groupby(["X"]).get_group("B") .. _groupby.dropna: @@ -236,7 +242,7 @@ above example we have: .. ipython:: python - df.groupby('A').groups + df.groupby("A").groups df.groupby(get_letter_type, axis=1).groups Calling the standard Python ``len`` function on the GroupBy object just returns @@ -244,7 +250,7 @@ the length of the ``groups`` dict, so it is largely just a convenience: .. 
ipython:: python - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) grouped.groups len(grouped) @@ -259,15 +265,14 @@ the length of the ``groups`` dict, so it is largely just a convenience: n = 10 weight = np.random.normal(166, 20, size=n) height = np.random.normal(60, 10, size=n) - time = pd.date_range('1/1/2000', periods=n) - gender = np.random.choice(['male', 'female'], size=n) - df = pd.DataFrame({'height': height, 'weight': weight, - 'gender': gender}, index=time) + time = pd.date_range("1/1/2000", periods=n) + gender = np.random.choice(["male", "female"], size=n) + df = pd.DataFrame({"height": height, "weight": weight, "gender": gender}, index=time) .. ipython:: python df - gb = df.groupby('gender') + gb = df.groupby("gender") .. ipython:: @@ -291,9 +296,11 @@ Let's create a Series with a two-level ``MultiIndex``. .. ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] - index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) s = pd.Series(np.random.randn(8), index=index) s @@ -309,7 +316,7 @@ number: .. ipython:: python - s.groupby(level='second').sum() + s.groupby(level="second").sum() The aggregation functions such as ``sum`` will take the level parameter directly. Additionally, the resulting index will be named according to the @@ -317,30 +324,32 @@ chosen level: .. ipython:: python - s.sum(level='second') + s.sum(level="second") Grouping with multiple levels is supported. .. 
ipython:: python :suppress: - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['doo', 'doo', 'bee', 'bee', 'bop', 'bop', 'bop', 'bop'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["doo", "doo", "bee", "bee", "bop", "bop", "bop", "bop"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = list(zip(*arrays)) - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second', 'third']) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second", "third"]) s = pd.Series(np.random.randn(8), index=index) .. ipython:: python s - s.groupby(level=['first', 'second']).sum() + s.groupby(level=["first", "second"]).sum() Index level names may be supplied as keys. .. ipython:: python - s.groupby(['first', 'second']).sum() + s.groupby(["first", "second"]).sum() More on the ``sum`` function and aggregation later. @@ -352,14 +361,14 @@ objects. .. ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] - index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second']) + index = pd.MultiIndex.from_arrays(arrays, names=["first", "second"]) - df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 3, 3], - 'B': np.arange(8)}, - index=index) + df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 3, 3], "B": np.arange(8)}, index=index) df @@ -368,19 +377,19 @@ the ``A`` column. .. ipython:: python - df.groupby([pd.Grouper(level=1), 'A']).sum() + df.groupby([pd.Grouper(level=1), "A"]).sum() Index levels may also be specified by name. .. ipython:: python - df.groupby([pd.Grouper(level='second'), 'A']).sum() + df.groupby([pd.Grouper(level="second"), "A"]).sum() Index level names may be specified as keys directly to ``groupby``. .. 
ipython:: python - df.groupby(['second', 'A']).sum() + df.groupby(["second", "A"]).sum() DataFrame column selection in GroupBy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -392,24 +401,26 @@ getting a column from a DataFrame, you can do: .. ipython:: python :suppress: - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), - 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) .. ipython:: python - grouped = df.groupby(['A']) - grouped_C = grouped['C'] - grouped_D = grouped['D'] + grouped = df.groupby(["A"]) + grouped_C = grouped["C"] + grouped_D = grouped["D"] This is mainly syntactic sugar for the alternative and much more verbose: .. ipython:: python - df['C'].groupby(df['A']) + df["C"].groupby(df["A"]) Additionally this method avoids recomputing the internal grouping information derived from the passed key. @@ -450,13 +461,13 @@ A single group can be selected using .. ipython:: python - grouped.get_group('bar') + grouped.get_group("bar") Or for an object grouped on multiple columns: .. ipython:: python - df.groupby(['A', 'B']).get_group(('bar', 'one')) + df.groupby(["A", "B"]).get_group(("bar", "one")) .. _groupby.aggregate: @@ -474,10 +485,10 @@ An obvious one is aggregation via the .. ipython:: python - grouped = df.groupby('A') + grouped = df.groupby("A") grouped.aggregate(np.sum) - grouped = df.groupby(['A', 'B']) + grouped = df.groupby(["A", "B"]) grouped.aggregate(np.sum) As you can see, the result of the aggregation will have the group names as the @@ -487,17 +498,17 @@ changed by using the ``as_index`` option: .. 
ipython:: python - grouped = df.groupby(['A', 'B'], as_index=False) + grouped = df.groupby(["A", "B"], as_index=False) grouped.aggregate(np.sum) - df.groupby('A', as_index=False).sum() + df.groupby("A", as_index=False).sum() Note that you could use the ``reset_index`` DataFrame function to achieve the same result as the column names are stored in the resulting ``MultiIndex``: .. ipython:: python - df.groupby(['A', 'B']).sum().reset_index() + df.groupby(["A", "B"]).sum().reset_index() Another simple aggregation example is to compute the size of each group. This is included in GroupBy as the ``size`` method. It returns a Series whose @@ -559,8 +570,8 @@ aggregation with, outputting a DataFrame: .. ipython:: python - grouped = df.groupby('A') - grouped['C'].agg([np.sum, np.mean, np.std]) + grouped = df.groupby("A") + grouped["C"].agg([np.sum, np.mean, np.std]) On a grouped ``DataFrame``, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -575,19 +586,21 @@ need to rename, then you can add in a chained operation for a ``Series`` like th .. ipython:: python - (grouped['C'].agg([np.sum, np.mean, np.std]) - .rename(columns={'sum': 'foo', - 'mean': 'bar', - 'std': 'baz'})) + ( + grouped["C"] + .agg([np.sum, np.mean, np.std]) + .rename(columns={"sum": "foo", "mean": "bar", "std": "baz"}) + ) For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python - (grouped.agg([np.sum, np.mean, np.std]) - .rename(columns={'sum': 'foo', - 'mean': 'bar', - 'std': 'baz'})) + ( + grouped.agg([np.sum, np.mean, np.std]).rename( + columns={"sum": "foo", "mean": "bar", "std": "baz"} + ) + ) .. note:: @@ -598,7 +611,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python :okexcept: - grouped['C'].agg(['sum', 'sum']) + grouped["C"].agg(["sum", "sum"]) Pandas *does* allow you to provide multiple lambdas. 
In this case, pandas @@ -607,8 +620,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: .. ipython:: python - grouped['C'].agg([lambda x: x.max() - x.min(), - lambda x: x.median() - x.mean()]) + grouped["C"].agg([lambda x: x.max() - x.min(), lambda x: x.median() - x.mean()]) @@ -631,15 +643,19 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", .. ipython:: python - animals = pd.DataFrame({'kind': ['cat', 'dog', 'cat', 'dog'], - 'height': [9.1, 6.0, 9.5, 34.0], - 'weight': [7.9, 7.5, 9.9, 198.0]}) + animals = pd.DataFrame( + { + "kind": ["cat", "dog", "cat", "dog"], + "height": [9.1, 6.0, 9.5, 34.0], + "weight": [7.9, 7.5, 9.9, 198.0], + } + ) animals animals.groupby("kind").agg( - min_height=pd.NamedAgg(column='height', aggfunc='min'), - max_height=pd.NamedAgg(column='height', aggfunc='max'), - average_weight=pd.NamedAgg(column='weight', aggfunc=np.mean), + min_height=pd.NamedAgg(column="height", aggfunc="min"), + max_height=pd.NamedAgg(column="height", aggfunc="max"), + average_weight=pd.NamedAgg(column="weight", aggfunc=np.mean), ) @@ -648,9 +664,9 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", .. ipython:: python animals.groupby("kind").agg( - min_height=('height', 'min'), - max_height=('height', 'max'), - average_weight=('weight', np.mean), + min_height=("height", "min"), + max_height=("height", "max"), + average_weight=("weight", np.mean), ) @@ -659,9 +675,11 @@ and unpack the keyword arguments .. ipython:: python - animals.groupby("kind").agg(**{ - 'total weight': pd.NamedAgg(column='weight', aggfunc=sum), - }) + animals.groupby("kind").agg( + **{ + "total weight": pd.NamedAgg(column="weight", aggfunc=sum), + } + ) Additional keyword arguments are not passed through to the aggregation functions. Only pairs of ``(column, aggfunc)`` should be passed as ``**kwargs``. If your aggregation functions @@ -680,8 +698,8 @@ no column selection, so the values are just the functions. 
.. ipython:: python animals.groupby("kind").height.agg( - min_height='min', - max_height='max', + min_height="min", + max_height="max", ) Applying different functions to DataFrame columns @@ -692,8 +710,7 @@ columns of a DataFrame: .. ipython:: python - grouped.agg({'C': np.sum, - 'D': lambda x: np.std(x, ddof=1)}) + grouped.agg({"C": np.sum, "D": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be either implemented on GroupBy or available via :ref:`dispatching @@ -701,7 +718,7 @@ must be either implemented on GroupBy or available via :ref:`dispatching .. ipython:: python - grouped.agg({'C': 'sum', 'D': 'std'}) + grouped.agg({"C": "sum", "D": "std"}) .. _groupby.aggregate.cython: @@ -713,8 +730,8 @@ optimized Cython implementations: .. ipython:: python - df.groupby('A').sum() - df.groupby(['A', 'B']).mean() + df.groupby("A").sum() + df.groupby(["A", "B"]).mean() Of course ``sum`` and ``mean`` are implemented on pandas objects, so the above code would work even without the special versions via dispatching (see below). @@ -743,15 +760,14 @@ For example, suppose we wished to standardize the data within each group: .. ipython:: python - index = pd.date_range('10/1/1999', periods=1100) + index = pd.date_range("10/1/1999", periods=1100) ts = pd.Series(np.random.normal(0.5, 2, 1100), index) ts = ts.rolling(window=100, min_periods=100).mean().dropna() ts.head() ts.tail() - transformed = (ts.groupby(lambda x: x.year) - .transform(lambda x: (x - x.mean()) / x.std())) + transformed = ts.groupby(lambda x: x.year).transform(lambda x: (x - x.mean()) / x.std()) We would expect the result to now have mean 0 and standard deviation 1 within each group, which we can easily check: @@ -772,7 +788,7 @@ We can also visually compare the original and transformed data sets. .. 
ipython:: python - compare = pd.DataFrame({'Original': ts, 'Transformed': transformed}) + compare = pd.DataFrame({"Original": ts, "Transformed": transformed}) @savefig groupby_transform_plot.png compare.plot() @@ -788,8 +804,8 @@ Alternatively, the built-in methods could be used to produce the same outputs. .. ipython:: python - max = ts.groupby(lambda x: x.year).transform('max') - min = ts.groupby(lambda x: x.year).transform('min') + max = ts.groupby(lambda x: x.year).transform("max") + min = ts.groupby(lambda x: x.year).transform("min") max - min @@ -798,7 +814,7 @@ Another common data transform is to replace missing data with the group mean. .. ipython:: python :suppress: - cols = ['A', 'B', 'C'] + cols = ["A", "B", "C"] values = np.random.randn(1000, 3) values[np.random.randint(0, 1000, 100), 0] = np.nan values[np.random.randint(0, 1000, 50), 1] = np.nan @@ -809,7 +825,7 @@ Another common data transform is to replace missing data with the group mean. data_df - countries = np.array(['US', 'UK', 'GR', 'JP']) + countries = np.array(["US", "UK", "GR", "JP"]) key = countries[np.random.randint(0, 4, 1000)] grouped = data_df.groupby(key) @@ -859,11 +875,10 @@ the column B based on the groups of column A. .. ipython:: python - df_re = pd.DataFrame({'A': [1] * 10 + [5] * 10, - 'B': np.arange(20)}) + df_re = pd.DataFrame({"A": [1] * 10 + [5] * 10, "B": np.arange(20)}) df_re - df_re.groupby('A').rolling(4).B.mean() + df_re.groupby("A").rolling(4).B.mean() The ``expanding()`` method will accumulate a given operation @@ -872,7 +887,7 @@ group. .. ipython:: python - df_re.groupby('A').expanding().sum() + df_re.groupby("A").expanding().sum() Suppose you want to use the ``resample()`` method to get a daily @@ -881,13 +896,16 @@ missing values with the ``ffill()`` method. .. 
ipython:: python - df_re = pd.DataFrame({'date': pd.date_range(start='2016-01-01', periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df_re = pd.DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") df_re - df_re.groupby('group').resample('1D').ffill() + df_re.groupby("group").resample("1D").ffill() .. _groupby.filter: @@ -911,8 +929,8 @@ with only a couple members. .. ipython:: python - dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) - dff.groupby('B').filter(lambda x: len(x) > 2) + dff = pd.DataFrame({"A": np.arange(8), "B": list("aabbbbcc")}) + dff.groupby("B").filter(lambda x: len(x) > 2) Alternatively, instead of dropping the offending groups, we can return a like-indexed objects where the groups that do not pass the filter are filled @@ -920,14 +938,14 @@ with NaNs. .. ipython:: python - dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + dff.groupby("B").filter(lambda x: len(x) > 2, dropna=False) For DataFrames with multiple columns, filters should explicitly specify a column as the filter criterion. .. ipython:: python - dff['C'] = np.arange(8) - dff.groupby('B').filter(lambda x: len(x['C']) > 2) + dff["C"] = np.arange(8) + dff.groupby("B").filter(lambda x: len(x["C"]) > 2) .. note:: @@ -939,7 +957,7 @@ For DataFrames with multiple columns, filters should explicitly specify a column .. ipython:: python - dff.groupby('B').head(2) + dff.groupby("B").head(2) .. _groupby.dispatch: @@ -953,7 +971,7 @@ functions: .. ipython:: python - grouped = df.groupby('A') + grouped = df.groupby("A") grouped.agg(lambda x: x.std()) But, it's rather verbose and can be untidy if you need to pass additional @@ -973,12 +991,14 @@ next). This enables some operations to be carried out rather succinctly: .. 
ipython:: python - tsdf = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2000', periods=1000), - columns=['A', 'B', 'C']) + tsdf = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2000", periods=1000), + columns=["A", "B", "C"], + ) tsdf.iloc[::2] = np.nan grouped = tsdf.groupby(lambda x: x.year) - grouped.fillna(method='pad') + grouped.fillna(method="pad") In this example, we chopped the collection of time series into yearly chunks then independently called :ref:`fillna <missing_data.fillna>` on the @@ -989,7 +1009,7 @@ The ``nlargest`` and ``nsmallest`` methods work on ``Series`` style groupbys: .. ipython:: python s = pd.Series([9, 8, 7, 5, 19, 1, 4.2, 3.3]) - g = pd.Series(list('abababab')) + g = pd.Series(list("abababab")) gb = s.groupby(g) gb.nlargest(3) gb.nsmallest(3) @@ -1008,10 +1028,10 @@ for both ``aggregate`` and ``transform`` in many standard use cases. However, .. ipython:: python df - grouped = df.groupby('A') + grouped = df.groupby("A") # could also just call .describe() - grouped['C'].apply(lambda x: x.describe()) + grouped["C"].apply(lambda x: x.describe()) The dimension of the returned result can also change: @@ -1032,7 +1052,8 @@ that is itself a series, and possibly upcast the result to a DataFrame: .. ipython:: python def f(x): - return pd.Series([x, x ** 2], index=['x', 'x^2']) + return pd.Series([x, x ** 2], index=["x", "x^2"]) + s = pd.Series(np.random.rand(5)) s @@ -1133,7 +1154,7 @@ will be (silently) dropped. Thus, this does not pose any problems: .. ipython:: python - df.groupby('A').std() + df.groupby("A").std() Note that ``df.groupby('A').colname.std().`` is more efficient than ``df.groupby('A').std().colname``, so if the result of an aggregation function @@ -1151,23 +1172,29 @@ is only interesting over one column (here ``colname``), it may be filtered .. 
ipython:: python from decimal import Decimal + df_dec = pd.DataFrame( - {'id': [1, 2, 1, 2], - 'int_column': [1, 2, 3, 4], - 'dec_column': [Decimal('0.50'), Decimal('0.15'), - Decimal('0.25'), Decimal('0.40')] - } + { + "id": [1, 2, 1, 2], + "int_column": [1, 2, 3, 4], + "dec_column": [ + Decimal("0.50"), + Decimal("0.15"), + Decimal("0.25"), + Decimal("0.40"), + ], + } ) # Decimal columns can be sum'd explicitly by themselves... - df_dec.groupby(['id'])[['dec_column']].sum() + df_dec.groupby(["id"])[["dec_column"]].sum() # ...but cannot be combined with standard data types or they will be excluded - df_dec.groupby(['id'])[['int_column', 'dec_column']].sum() + df_dec.groupby(["id"])[["int_column", "dec_column"]].sum() # Use .agg function to aggregate over standard and "nuisance" data types # at the same time - df_dec.groupby(['id']).agg({'int_column': 'sum', 'dec_column': 'sum'}) + df_dec.groupby(["id"]).agg({"int_column": "sum", "dec_column": "sum"}) .. _groupby.observed: @@ -1182,25 +1209,27 @@ Show all values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=False).count() + pd.Series([1, 1, 1]).groupby( + pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False + ).count() Show only the observed values: .. ipython:: python - pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=True).count() + pd.Series([1, 1, 1]).groupby( + pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=True + ).count() The returned dtype of the grouped will *always* include *all* of the categories that were grouped. .. ipython:: python - s = pd.Series([1, 1, 1]).groupby(pd.Categorical(['a', 'a', 'a'], - categories=['a', 'b']), - observed=False).count() + s = ( + pd.Series([1, 1, 1]) + .groupby(pd.Categorical(["a", "a", "a"], categories=["a", "b"]), observed=False) + .count() + ) s.index.dtype .. 
_groupby.missing: @@ -1224,7 +1253,7 @@ can be used as group keys. If so, the order of the levels will be preserved: data = pd.Series(np.random.randn(100)) - factor = pd.qcut(data, [0, .25, .5, .75, 1.]) + factor = pd.qcut(data, [0, 0.25, 0.5, 0.75, 1.0]) data.groupby(factor).mean() @@ -1240,19 +1269,23 @@ use the ``pd.Grouper`` to provide this local control. import datetime - df = pd.DataFrame({'Branch': 'A A A A A A A B'.split(), - 'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(), - 'Quantity': [1, 3, 5, 1, 8, 1, 9, 3], - 'Date': [ - datetime.datetime(2013, 1, 1, 13, 0), - datetime.datetime(2013, 1, 1, 13, 5), - datetime.datetime(2013, 10, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0), - datetime.datetime(2013, 10, 1, 20, 0), - datetime.datetime(2013, 10, 2, 10, 0), - datetime.datetime(2013, 12, 2, 12, 0), - datetime.datetime(2013, 12, 2, 14, 0)] - }) + df = pd.DataFrame( + { + "Branch": "A A A A A A A B".split(), + "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(), + "Quantity": [1, 3, 5, 1, 8, 1, 9, 3], + "Date": [ + datetime.datetime(2013, 1, 1, 13, 0), + datetime.datetime(2013, 1, 1, 13, 5), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 10, 1, 20, 0), + datetime.datetime(2013, 10, 2, 10, 0), + datetime.datetime(2013, 12, 2, 12, 0), + datetime.datetime(2013, 12, 2, 14, 0), + ], + } + ) df @@ -1260,18 +1293,18 @@ Groupby a specific column with the desired frequency. This is like resampling. .. ipython:: python - df.groupby([pd.Grouper(freq='1M', key='Date'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="1M", key="Date"), "Buyer"]).sum() You have an ambiguous specification in that you have a named index and a column that could be potential groupers. .. 
ipython:: python - df = df.set_index('Date') - df['Date'] = df.index + pd.offsets.MonthEnd(2) - df.groupby([pd.Grouper(freq='6M', key='Date'), 'Buyer']).sum() + df = df.set_index("Date") + df["Date"] = df.index + pd.offsets.MonthEnd(2) + df.groupby([pd.Grouper(freq="6M", key="Date"), "Buyer"]).sum() - df.groupby([pd.Grouper(freq='6M', level='Date'), 'Buyer']).sum() + df.groupby([pd.Grouper(freq="6M", level="Date"), "Buyer"]).sum() Taking the first rows of each group @@ -1281,10 +1314,10 @@ Just like for a DataFrame or Series you can call head and tail on a groupby: .. ipython:: python - df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B']) + df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"]) df - g = df.groupby('A') + g = df.groupby("A") g.head(1) g.tail(1) @@ -1302,8 +1335,8 @@ will return a single row (or no row) per group if you pass an int for n: .. ipython:: python - df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A") g.nth(0) g.nth(-1) @@ -1314,21 +1347,21 @@ If you want to select the nth not-null item, use the ``dropna`` kwarg. For a Dat .. ipython:: python # nth(0) is the same as g.first() - g.nth(0, dropna='any') + g.nth(0, dropna="any") g.first() # nth(-1) is the same as g.last() - g.nth(-1, dropna='any') # NaNs denote group exhausted when using dropna + g.nth(-1, dropna="any") # NaNs denote group exhausted when using dropna g.last() - g.B.nth(0, dropna='all') + g.B.nth(0, dropna="all") As with other methods, passing ``as_index=False``, will achieve a filtration, which returns the grouped row. .. 
ipython:: python - df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) - g = df.groupby('A', as_index=False) + df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"]) + g = df.groupby("A", as_index=False) g.nth(0) g.nth(-1) @@ -1337,8 +1370,8 @@ You can also select multiple rows from each group by specifying multiple nth val .. ipython:: python - business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B') - df = pd.DataFrame(1, index=business_dates, columns=['a', 'b']) + business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B") + df = pd.DataFrame(1, index=business_dates, columns=["a", "b"]) # get the first, 4th, and last date index for each month df.groupby([df.index.year, df.index.month]).nth([0, 3, -1]) @@ -1350,12 +1383,12 @@ To see the order in which each row appears within its group, use the .. ipython:: python - dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg = pd.DataFrame(list("aaabba"), columns=["A"]) dfg - dfg.groupby('A').cumcount() + dfg.groupby("A").cumcount() - dfg.groupby('A').cumcount(ascending=False) + dfg.groupby("A").cumcount(ascending=False) .. _groupby.ngroup: @@ -1374,12 +1407,12 @@ order they are first observed. .. ipython:: python - dfg = pd.DataFrame(list('aaabba'), columns=['A']) + dfg = pd.DataFrame(list("aaabba"), columns=["A"]) dfg - dfg.groupby('A').ngroup() + dfg.groupby("A").ngroup() - dfg.groupby('A').ngroup(ascending=False) + dfg.groupby("A").ngroup(ascending=False) Plotting ~~~~~~~~ @@ -1392,8 +1425,8 @@ the values in column 1 where the group is "B" are 3 higher on average. 
np.random.seed(1234) df = pd.DataFrame(np.random.randn(50, 2)) - df['g'] = np.random.choice(['A', 'B'], size=50) - df.loc[df['g'] == 'B', 1] += 3 + df["g"] = np.random.choice(["A", "B"], size=50) + df.loc[df["g"] == "B", 1] += 3 We can easily visualize this with a boxplot: @@ -1401,7 +1434,7 @@ We can easily visualize this with a boxplot: :okwarning: @savefig groupby_boxplot.png - df.groupby('g').boxplot() + df.groupby("g").boxplot() The result of calling ``boxplot`` is a dictionary whose keys are the values of our grouping column ``g`` ("A" and "B"). The values of the resulting dictionary @@ -1436,20 +1469,26 @@ code more readable. First we set the data: .. ipython:: python n = 1000 - df = pd.DataFrame({'Store': np.random.choice(['Store_1', 'Store_2'], n), - 'Product': np.random.choice(['Product_1', - 'Product_2'], n), - 'Revenue': (np.random.random(n) * 50 + 10).round(2), - 'Quantity': np.random.randint(1, 10, size=n)}) + df = pd.DataFrame( + { + "Store": np.random.choice(["Store_1", "Store_2"], n), + "Product": np.random.choice(["Product_1", "Product_2"], n), + "Revenue": (np.random.random(n) * 50 + 10).round(2), + "Quantity": np.random.randint(1, 10, size=n), + } + ) df.head(2) Now, to find prices per store/product, we can simply do: .. ipython:: python - (df.groupby(['Store', 'Product']) - .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) - .unstack().round(2)) + ( + df.groupby(["Store", "Product"]) + .pipe(lambda grp: grp.Revenue.sum() / grp.Quantity.sum()) + .unstack() + .round(2) + ) Piping can also be expressive when you want to deliver a grouped object to some arbitrary function, for example: @@ -1459,7 +1498,8 @@ arbitrary function, for example: def mean(groupby): return groupby.mean() - df.groupby(['Store', 'Product']).pipe(mean) + + df.groupby(["Store", "Product"]).pipe(mean) where ``mean`` takes a GroupBy object and finds the mean of the Revenue and Quantity columns respectively for each Store-Product combination. 
The ``mean`` function can @@ -1476,8 +1516,7 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on .. ipython:: python - df = pd.DataFrame({'a': [1, 0, 0], 'b': [0, 1, 0], - 'c': [1, 0, 0], 'd': [2, 3, 4]}) + df = pd.DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "c": [1, 0, 0], "d": [2, 3, 4]}) df df.groupby(df.sum(), axis=1).sum() @@ -1536,16 +1575,22 @@ column index name will be used as the name of the inserted column: .. ipython:: python - df = pd.DataFrame({'a': [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], - 'b': [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], - 'c': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], - 'd': [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]}) + df = pd.DataFrame( + { + "a": [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], + "b": [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1], + "c": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0], + "d": [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1], + } + ) + def compute_metrics(x): - result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()} - return pd.Series(result, name='metrics') + result = {"b_sum": x["b"].sum(), "c_mean": x["c"].mean()} + return pd.Series(result, name="metrics") + - result = df.groupby('a').apply(compute_metrics) + result = df.groupby("a").apply(compute_metrics) result diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index e483cebf71614..184894bbafe28 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -3310,10 +3310,10 @@ applications (CTRL-V on many operating systems). Here we illustrate writing a .. code-block:: python - >>> df = pd.DataFrame({'A': [1, 2, 3], - ... 'B': [4, 5, 6], - ... 'C': ['p', 'q', 'r']}, - ... index=['x', 'y', 'z']) + >>> df = pd.DataFrame( + ... {"A": [1, 2, 3], "B": [4, 5, 6], "C": ["p", "q", "r"]}, index=["x", "y", "z"] + ... ) + >>> df A B C x 1 4 p @@ -3607,8 +3607,8 @@ This format is specified by default when using ``put`` or ``to_hdf`` or by ``for .. 
code-block:: python - >>> pd.DataFrame(np.random.randn(10, 2)).to_hdf('test_fixed.h5', 'df') - >>> pd.read_hdf('test_fixed.h5', 'df', where='index>5') + >>> pd.DataFrame(np.random.randn(10, 2)).to_hdf("test_fixed.h5", "df") + >>> pd.read_hdf("test_fixed.h5", "df", where="index>5") TypeError: cannot pass a where specification when reading a fixed format. this store must be selected in its entirety diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 9294897686d46..3c97cc7da6edb 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -38,12 +38,15 @@ arise and we wish to also consider that "missing" or "not available" or "NA". .. ipython:: python - df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], - columns=['one', 'two', 'three']) - df['four'] = 'bar' - df['five'] = df['one'] > 0 + df = pd.DataFrame( + np.random.randn(5, 3), + index=["a", "c", "e", "f", "h"], + columns=["one", "two", "three"], + ) + df["four"] = "bar" + df["five"] = df["one"] > 0 df - df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']) + df2 = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"]) df2 To make detecting missing values easier (and across different array dtypes), @@ -53,9 +56,9 @@ Series and DataFrame objects: .. ipython:: python - df2['one'] - pd.isna(df2['one']) - df2['four'].notna() + df2["one"] + pd.isna(df2["one"]) + df2["four"].notna() df2.isna() .. warning:: @@ -65,14 +68,14 @@ Series and DataFrame objects: .. ipython:: python - None == None # noqa: E711 + None == None # noqa: E711 np.nan == np.nan So as compared to above, a scalar equality comparison versus a ``None/np.nan`` doesn't provide useful information. .. ipython:: python - df2['one'] == np.nan + df2["one"] == np.nan Integer dtypes and missing data ------------------------------- @@ -101,9 +104,9 @@ pandas objects provide compatibility between ``NaT`` and ``NaN``. .. 
ipython:: python df2 = df.copy() - df2['timestamp'] = pd.Timestamp('20120101') + df2["timestamp"] = pd.Timestamp("20120101") df2 - df2.loc[['a', 'c', 'h'], ['one', 'timestamp']] = np.nan + df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan df2 df2.dtypes.value_counts() @@ -146,9 +149,9 @@ objects. .. ipython:: python :suppress: - df = df2.loc[:, ['one', 'two', 'three']] - a = df2.loc[df2.index[:5], ['one', 'two']].fillna(method='pad') - b = df2.loc[df2.index[:5], ['one', 'two', 'three']] + df = df2.loc[:, ["one", "two", "three"]] + a = df2.loc[df2.index[:5], ["one", "two"]].fillna(method="pad") + b = df2.loc[df2.index[:5], ["one", "two", "three"]] .. ipython:: python @@ -168,7 +171,7 @@ account for missing data. For example: .. ipython:: python df - df['one'].sum() + df["one"].sum() df.mean(1) df.cumsum() df.cumsum(skipna=False) @@ -210,7 +213,7 @@ with R, for example: .. ipython:: python df - df.groupby('one').mean() + df.groupby("one").mean() See the groupby section :ref:`here <groupby.missing>` for more information. @@ -234,7 +237,7 @@ of ways, which we illustrate: df2 df2.fillna(0) - df2['one'].fillna('missing') + df2["one"].fillna("missing") **Fill gaps forward or backward** @@ -244,7 +247,7 @@ can propagate non-NA values forward or backward: .. ipython:: python df - df.fillna(method='pad') + df.fillna(method="pad") .. _missing_data.fillna.limit: @@ -261,7 +264,7 @@ we can use the ``limit`` keyword: .. ipython:: python df - df.fillna(method='pad', limit=1) + df.fillna(method="pad", limit=1) To remind you, these are the available filling methods: @@ -289,21 +292,21 @@ use case of this is to fill a DataFrame with the mean of that column. .. 
ipython:: python - dff = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC')) + dff = pd.DataFrame(np.random.randn(10, 3), columns=list("ABC")) dff.iloc[3:5, 0] = np.nan dff.iloc[4:6, 1] = np.nan dff.iloc[5:8, 2] = np.nan dff dff.fillna(dff.mean()) - dff.fillna(dff.mean()['B':'C']) + dff.fillna(dff.mean()["B":"C"]) Same result as above, but is aligning the 'fill' value which is a Series in this case. .. ipython:: python - dff.where(pd.notna(dff), dff.mean(), axis='columns') + dff.where(pd.notna(dff), dff.mean(), axis="columns") .. _missing_data.dropna: @@ -317,15 +320,15 @@ data. To do this, use :meth:`~DataFrame.dropna`: .. ipython:: python :suppress: - df['two'] = df['two'].fillna(0) - df['three'] = df['three'].fillna(0) + df["two"] = df["two"].fillna(0) + df["three"] = df["three"].fillna(0) .. ipython:: python df df.dropna(axis=0) df.dropna(axis=1) - df['one'].dropna() + df["one"].dropna() An equivalent :meth:`~Series.dropna` is available for Series. DataFrame.dropna has considerably more options than Series.dropna, which can be @@ -343,7 +346,7 @@ that, by default, performs linear interpolation at missing data points. :suppress: np.random.seed(123456) - idx = pd.date_range('1/1/2000', periods=100, freq='BM') + idx = pd.date_range("1/1/2000", periods=100, freq="BM") ts = pd.Series(np.random.randn(100), index=idx) ts[1:5] = np.nan ts[20:30] = np.nan @@ -376,28 +379,29 @@ Index aware interpolation is available via the ``method`` keyword: ts2 ts2.interpolate() - ts2.interpolate(method='time') + ts2.interpolate(method="time") For a floating-point index, use ``method='values'``: .. ipython:: python :suppress: - idx = [0., 1., 10.] - ser = pd.Series([0., np.nan, 10.], idx) + idx = [0.0, 1.0, 10.0] + ser = pd.Series([0.0, np.nan, 10.0], idx) .. ipython:: python ser ser.interpolate() - ser.interpolate(method='values') + ser.interpolate(method="values") You can also interpolate with a DataFrame: .. 
ipython:: python - df = pd.DataFrame({'A': [1, 2.1, np.nan, 4.7, 5.6, 6.8], - 'B': [.25, np.nan, np.nan, 4, 12.2, 14.4]}) + df = pd.DataFrame( + {"A": [1, 2.1, np.nan, 4.7, 5.6, 6.8], "B": [0.25, np.nan, np.nan, 4, 12.2, 14.4]} + ) df df.interpolate() @@ -418,20 +422,20 @@ The appropriate interpolation method will depend on the type of data you are wor .. ipython:: python - df.interpolate(method='barycentric') + df.interpolate(method="barycentric") - df.interpolate(method='pchip') + df.interpolate(method="pchip") - df.interpolate(method='akima') + df.interpolate(method="akima") When interpolating via a polynomial or spline approximation, you must also specify the degree or order of the approximation: .. ipython:: python - df.interpolate(method='spline', order=2) + df.interpolate(method="spline", order=2) - df.interpolate(method='polynomial', order=2) + df.interpolate(method="polynomial", order=2) Compare several methods: @@ -439,10 +443,10 @@ Compare several methods: np.random.seed(2) - ser = pd.Series(np.arange(1, 10.1, .25) ** 2 + np.random.randn(37)) + ser = pd.Series(np.arange(1, 10.1, 0.25) ** 2 + np.random.randn(37)) missing = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29]) ser[missing] = np.nan - methods = ['linear', 'quadratic', 'cubic'] + methods = ["linear", "quadratic", "cubic"] df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods}) @savefig compare_interpolations.png @@ -460,7 +464,7 @@ at the new values. # interpolate at new_index new_index = ser.index | pd.Index([49.25, 49.5, 49.75, 50.25, 50.5, 50.75]) - interp_s = ser.reindex(new_index).interpolate(method='pchip') + interp_s = ser.reindex(new_index).interpolate(method="pchip") interp_s[49:51] .. _scipy: https://www.scipy.org @@ -478,8 +482,7 @@ filled since the last valid observation: .. 
ipython:: python - ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, - np.nan, 13, np.nan, np.nan]) + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) ser # fill all consecutive values in a forward direction @@ -494,13 +497,13 @@ By default, ``NaN`` values are filled in a ``forward`` direction. Use .. ipython:: python # fill one consecutive value backwards - ser.interpolate(limit=1, limit_direction='backward') + ser.interpolate(limit=1, limit_direction="backward") # fill one consecutive value in both directions - ser.interpolate(limit=1, limit_direction='both') + ser.interpolate(limit=1, limit_direction="both") # fill all consecutive values in both directions - ser.interpolate(limit_direction='both') + ser.interpolate(limit_direction="both") By default, ``NaN`` values are filled whether they are inside (surrounded by) existing valid values, or outside existing valid values. The ``limit_area`` @@ -509,13 +512,13 @@ parameter restricts filling to either inside or outside values. .. ipython:: python # fill one consecutive inside value in both directions - ser.interpolate(limit_direction='both', limit_area='inside', limit=1) + ser.interpolate(limit_direction="both", limit_area="inside", limit=1) # fill all consecutive outside values backward - ser.interpolate(limit_direction='backward', limit_area='outside') + ser.interpolate(limit_direction="backward", limit_area="outside") # fill all consecutive outside values in both directions - ser.interpolate(limit_direction='both', limit_area='outside') + ser.interpolate(limit_direction="both", limit_area="outside") .. _missing_data.replace: @@ -531,7 +534,7 @@ value: .. ipython:: python - ser = pd.Series([0., 1., 2., 3., 4.]) + ser = pd.Series([0.0, 1.0, 2.0, 3.0, 4.0]) ser.replace(0, 5) @@ -551,16 +554,16 @@ For a DataFrame, you can specify individual values by column: .. 
ipython:: python - df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]}) + df = pd.DataFrame({"a": [0, 1, 2, 3, 4], "b": [5, 6, 7, 8, 9]}) - df.replace({'a': 0, 'b': 5}, 100) + df.replace({"a": 0, "b": 5}, 100) Instead of replacing with specified values, you can treat all given values as missing and interpolate over them: .. ipython:: python - ser.replace([1, 2, 3], method='pad') + ser.replace([1, 2, 3], method="pad") .. _missing_data.replace_expression: @@ -581,67 +584,67 @@ Replace the '.' with ``NaN`` (str -> str): .. ipython:: python - d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']} + d = {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} df = pd.DataFrame(d) - df.replace('.', np.nan) + df.replace(".", np.nan) Now do it with a regular expression that removes surrounding whitespace (regex -> regex): .. ipython:: python - df.replace(r'\s*\.\s*', np.nan, regex=True) + df.replace(r"\s*\.\s*", np.nan, regex=True) Replace a few different values (list -> list): .. ipython:: python - df.replace(['a', '.'], ['b', np.nan]) + df.replace(["a", "."], ["b", np.nan]) list of regex -> list of regex: .. ipython:: python - df.replace([r'\.', r'(a)'], ['dot', r'\1stuff'], regex=True) + df.replace([r"\.", r"(a)"], ["dot", r"\1stuff"], regex=True) Only search in column ``'b'`` (dict -> dict): .. ipython:: python - df.replace({'b': '.'}, {'b': np.nan}) + df.replace({"b": "."}, {"b": np.nan}) Same as the previous example, but use a regular expression for searching instead (dict of regex -> dict): .. ipython:: python - df.replace({'b': r'\s*\.\s*'}, {'b': np.nan}, regex=True) + df.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True) You can pass nested dictionaries of regular expressions that use ``regex=True``: .. ipython:: python - df.replace({'b': {'b': r''}}, regex=True) + df.replace({"b": {"b": r""}}, regex=True) Alternatively, you can pass the nested dictionary like so: .. 
ipython:: python - df.replace(regex={'b': {r'\s*\.\s*': np.nan}}) + df.replace(regex={"b": {r"\s*\.\s*": np.nan}}) You can also use the group of a regular expression match when replacing (dict of regex -> dict of regex), this works for lists as well. .. ipython:: python - df.replace({'b': r'\s*(\.)\s*'}, {'b': r'\1ty'}, regex=True) + df.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True) You can pass a list of regular expressions, of which those that match will be replaced with a scalar (list of regex -> regex). .. ipython:: python - df.replace([r'\s*\.\s*', r'a|b'], np.nan, regex=True) + df.replace([r"\s*\.\s*", r"a|b"], np.nan, regex=True) All of the regular expression examples can also be passed with the ``to_replace`` argument as the ``regex`` argument. In this case the ``value`` @@ -650,7 +653,7 @@ dictionary. The previous example, in this case, would then be: .. ipython:: python - df.replace(regex=[r'\s*\.\s*', r'a|b'], value=np.nan) + df.replace(regex=[r"\s*\.\s*", r"a|b"], value=np.nan) This can be convenient if you do not want to pass ``regex=True`` every time you want to use a regular expression. @@ -676,7 +679,7 @@ Replacing more than one value is possible by passing a list. .. ipython:: python df00 = df.iloc[0, 0] - df.replace([1.5, df00], [np.nan, 'a']) + df.replace([1.5, df00], [np.nan, "a"]) df[1].dtype You can also operate on the DataFrame in place: @@ -932,7 +935,7 @@ the first 10 columns. .. ipython:: python - bb = pd.read_csv('data/baseball.csv', index_col='id') + bb = pd.read_csv("data/baseball.csv", index_col="id") bb[bb.columns[:10]].dtypes .. ipython:: python diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index 206d8dd0f4739..f36f27269a996 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -72,7 +72,7 @@ Option 1 loads in all the data and then filters to what we need. .. 
ipython:: python - columns = ['id_0', 'name_0', 'x_0', 'y_0'] + columns = ["id_0", "name_0", "x_0", "y_0"] pd.read_parquet("timeseries_wide.parquet")[columns] @@ -123,7 +123,7 @@ space-efficient integers to know which specific name is used in each row. .. ipython:: python ts2 = ts.copy() - ts2['name'] = ts2['name'].astype('category') + ts2["name"] = ts2["name"].astype("category") ts2.memory_usage(deep=True) We can go a bit further and downcast the numeric columns to their smallest types @@ -131,8 +131,8 @@ using :func:`pandas.to_numeric`. .. ipython:: python - ts2['id'] = pd.to_numeric(ts2['id'], downcast='unsigned') - ts2[['x', 'y']] = ts2[['x', 'y']].apply(pd.to_numeric, downcast='float') + ts2["id"] = pd.to_numeric(ts2["id"], downcast="unsigned") + ts2[["x", "y"]] = ts2[["x", "y"]].apply(pd.to_numeric, downcast="float") ts2.dtypes .. ipython:: python @@ -141,8 +141,7 @@ using :func:`pandas.to_numeric`. .. ipython:: python - reduction = (ts2.memory_usage(deep=True).sum() - / ts.memory_usage(deep=True).sum()) + reduction = ts2.memory_usage(deep=True).sum() / ts.memory_usage(deep=True).sum() print(f"{reduction:0.2f}") In all, we've reduced the in-memory footprint of this dataset to 1/5 of its @@ -174,13 +173,13 @@ files. Each file in the directory represents a different year of the entire data import pathlib N = 12 - starts = [f'20{i:>02d}-01-01' for i in range(N)] - ends = [f'20{i:>02d}-12-13' for i in range(N)] + starts = [f"20{i:>02d}-01-01" for i in range(N)] + ends = [f"20{i:>02d}-12-13" for i in range(N)] pathlib.Path("data/timeseries").mkdir(exist_ok=True) for i, (start, end) in enumerate(zip(starts, ends)): - ts = _make_timeseries(start=start, end=end, freq='1T', seed=i) + ts = _make_timeseries(start=start, end=end, freq="1T", seed=i) ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet") @@ -215,7 +214,7 @@ work for arbitrary-sized datasets. # Only one dataframe is in memory at a time... df = pd.read_parquet(path) # ... 
plus a small Series ``counts``, which is updated. - counts = counts.add(df['name'].value_counts(), fill_value=0) + counts = counts.add(df["name"].value_counts(), fill_value=0) counts.astype(int) Some readers, like :meth:`pandas.read_csv`, offer parameters to control the @@ -278,8 +277,8 @@ Rather than executing immediately, doing operations build up a **task graph**. .. ipython:: python ddf - ddf['name'] - ddf['name'].value_counts() + ddf["name"] + ddf["name"].value_counts() Each of these calls is instant because the result isn't being computed yet. We're just building up a list of computation to do when someone needs the @@ -291,7 +290,7 @@ To get the actual result you can call ``.compute()``. .. ipython:: python - %time ddf['name'].value_counts().compute() + %time ddf["name"].value_counts().compute() At that point, you get back the same thing you'd get with pandas, in this case a concrete pandas Series with the count of each ``name``. @@ -324,7 +323,7 @@ a familiar groupby aggregation. .. ipython:: python - %time ddf.groupby('name')[['x', 'y']].mean().compute().head() + %time ddf.groupby("name")[["x", "y"]].mean().compute().head() The grouping and aggregation is done out-of-core and in parallel. @@ -336,8 +335,8 @@ we need to supply the divisions manually. .. ipython:: python N = 12 - starts = [f'20{i:>02d}-01-01' for i in range(N)] - ends = [f'20{i:>02d}-12-13' for i in range(N)] + starts = [f"20{i:>02d}-01-01" for i in range(N)] + ends = [f"20{i:>02d}-12-13" for i in range(N)] divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),) ddf.divisions = divisions @@ -347,7 +346,7 @@ Now we can do things like fast random access with ``.loc``. .. ipython:: python - ddf.loc['2002-01-01 12:01':'2002-01-01 12:05'].compute() + ddf.loc["2002-01-01 12:01":"2002-01-01 12:05"].compute() Dask knows to just look in the 3rd partition for selecting values in 2002. It doesn't need to look at any other data. @@ -362,7 +361,7 @@ out of memory. 
At that point it's just a regular pandas object. :okwarning: @savefig dask_resample.png - ddf[['x', 'y']].resample("1D").mean().cumsum().compute().plot() + ddf[["x", "y"]].resample("1D").mean().cumsum().compute().plot() These Dask examples have all been done using multiple processes on a single machine. Dask can be `deployed on a cluster diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 61902b4a41b7c..11ec90085d9bf 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -19,42 +19,43 @@ Parsing time series information from various sources and formats import datetime - dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), - datetime.datetime(2018, 1, 1)]) + dti = pd.to_datetime( + ["1/1/2018", np.datetime64("2018-01-01"), datetime.datetime(2018, 1, 1)] + ) dti Generate sequences of fixed-frequency dates and time spans .. ipython:: python - dti = pd.date_range('2018-01-01', periods=3, freq='H') + dti = pd.date_range("2018-01-01", periods=3, freq="H") dti Manipulating and converting date times with timezone information .. ipython:: python - dti = dti.tz_localize('UTC') + dti = dti.tz_localize("UTC") dti - dti.tz_convert('US/Pacific') + dti.tz_convert("US/Pacific") Resampling or converting a time series to a particular frequency .. ipython:: python - idx = pd.date_range('2018-01-01', periods=5, freq='H') + idx = pd.date_range("2018-01-01", periods=5, freq="H") ts = pd.Series(range(len(idx)), index=idx) ts - ts.resample('2H').mean() + ts.resample("2H").mean() Performing date and time arithmetic with absolute or relative time increments .. 
ipython:: python - friday = pd.Timestamp('2018-01-05') + friday = pd.Timestamp("2018-01-05") friday.day_name() # Add 1 day - saturday = friday + pd.Timedelta('1 day') + saturday = friday + pd.Timedelta("1 day") saturday.day_name() # Add 1 business day (Friday --> Monday) monday = friday + pd.offsets.BDay() @@ -90,13 +91,13 @@ so manipulations can be performed with respect to the time element. .. ipython:: python - pd.Series(range(3), index=pd.date_range('2000', freq='D', periods=3)) + pd.Series(range(3), index=pd.date_range("2000", freq="D", periods=3)) However, :class:`Series` and :class:`DataFrame` can directly also support the time component as data itself. .. ipython:: python - pd.Series(pd.date_range('2000', freq='D', periods=3)) + pd.Series(pd.date_range("2000", freq="D", periods=3)) :class:`Series` and :class:`DataFrame` have extended data type support and functionality for ``datetime``, ``timedelta`` and ``Period`` data when passed into those constructors. ``DateOffset`` @@ -104,9 +105,9 @@ data however will be stored as ``object`` data. .. ipython:: python - pd.Series(pd.period_range('1/1/2011', freq='M', periods=3)) + pd.Series(pd.period_range("1/1/2011", freq="M", periods=3)) pd.Series([pd.DateOffset(1), pd.DateOffset(2)]) - pd.Series(pd.date_range('1/1/2011', freq='M', periods=3)) + pd.Series(pd.date_range("1/1/2011", freq="M", periods=3)) Lastly, pandas represents null date times, time deltas, and time spans as ``NaT`` which is useful for representing missing or null date like values and behaves similar @@ -132,7 +133,7 @@ time. .. ipython:: python pd.Timestamp(datetime.datetime(2012, 5, 1)) - pd.Timestamp('2012-05-01') + pd.Timestamp("2012-05-01") pd.Timestamp(2012, 5, 1) However, in many cases it is more natural to associate things like change @@ -143,9 +144,9 @@ For example: .. 
ipython:: python - pd.Period('2011-01') + pd.Period("2011-01") - pd.Period('2012-05', freq='D') + pd.Period("2012-05", freq="D") :class:`Timestamp` and :class:`Period` can serve as an index. Lists of ``Timestamp`` and ``Period`` are automatically coerced to :class:`DatetimeIndex` @@ -153,9 +154,11 @@ and :class:`PeriodIndex` respectively. .. ipython:: python - dates = [pd.Timestamp('2012-05-01'), - pd.Timestamp('2012-05-02'), - pd.Timestamp('2012-05-03')] + dates = [ + pd.Timestamp("2012-05-01"), + pd.Timestamp("2012-05-02"), + pd.Timestamp("2012-05-03"), + ] ts = pd.Series(np.random.randn(3), dates) type(ts.index) @@ -163,7 +166,7 @@ and :class:`PeriodIndex` respectively. ts - periods = [pd.Period('2012-01'), pd.Period('2012-02'), pd.Period('2012-03')] + periods = [pd.Period("2012-01"), pd.Period("2012-02"), pd.Period("2012-03")] ts = pd.Series(np.random.randn(3), periods) @@ -193,18 +196,18 @@ is converted to a ``DatetimeIndex``: .. ipython:: python - pd.to_datetime(pd.Series(['Jul 31, 2009', '2010-01-10', None])) + pd.to_datetime(pd.Series(["Jul 31, 2009", "2010-01-10", None])) - pd.to_datetime(['2005/11/23', '2010.12.31']) + pd.to_datetime(["2005/11/23", "2010.12.31"]) If you use dates which start with the day first (i.e. European style), you can pass the ``dayfirst`` flag: .. ipython:: python - pd.to_datetime(['04-01-2012 10:00'], dayfirst=True) + pd.to_datetime(["04-01-2012 10:00"], dayfirst=True) - pd.to_datetime(['14-01-2012', '01-14-2012'], dayfirst=True) + pd.to_datetime(["14-01-2012", "01-14-2012"], dayfirst=True) .. warning:: @@ -218,22 +221,22 @@ options like ``dayfirst`` or ``format``, so use ``to_datetime`` if these are req .. ipython:: python - pd.to_datetime('2010/11/12') + pd.to_datetime("2010/11/12") - pd.Timestamp('2010/11/12') + pd.Timestamp("2010/11/12") You can also use the ``DatetimeIndex`` constructor directly: .. 
ipython:: python - pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05']) + pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"]) The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation: .. ipython:: python - pd.DatetimeIndex(['2018-01-01', '2018-01-03', '2018-01-05'], freq='infer') + pd.DatetimeIndex(["2018-01-01", "2018-01-03", "2018-01-05"], freq="infer") .. _timeseries.converting.format: @@ -245,9 +248,9 @@ This could also potentially speed up the conversion considerably. .. ipython:: python - pd.to_datetime('2010/11/12', format='%Y/%m/%d') + pd.to_datetime("2010/11/12", format="%Y/%m/%d") - pd.to_datetime('12-11-2010 00:00', format='%d-%m-%Y %H:%M') + pd.to_datetime("12-11-2010 00:00", format="%d-%m-%Y %H:%M") For more information on the choices available when specifying the ``format`` option, see the Python `datetime documentation`_. @@ -261,10 +264,9 @@ You can also pass a ``DataFrame`` of integer or string columns to assemble into .. ipython:: python - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [2, 3]}) + df = pd.DataFrame( + {"year": [2015, 2016], "month": [2, 3], "day": [4, 5], "hour": [2, 3]} + ) pd.to_datetime(df) @@ -272,7 +274,7 @@ You can pass only the columns that you need to assemble. .. ipython:: python - pd.to_datetime(df[['year', 'month', 'day']]) + pd.to_datetime(df[["year", "month", "day"]]) ``pd.to_datetime`` looks for standard designations of the datetime component in the column names, including: @@ -293,13 +295,13 @@ Pass ``errors='ignore'`` to return the original input when unparsable: .. ipython:: python - pd.to_datetime(['2009/07/31', 'asd'], errors='ignore') + pd.to_datetime(["2009/07/31", "asd"], errors="ignore") Pass ``errors='coerce'`` to convert unparsable data to ``NaT`` (not a time): .. 
ipython:: python - pd.to_datetime(['2009/07/31', 'asd'], errors='coerce') + pd.to_datetime(["2009/07/31", "asd"], errors="coerce") .. _timeseries.converting.epoch: @@ -315,11 +317,12 @@ which can be specified. These are computed from the starting point specified by .. ipython:: python - pd.to_datetime([1349720105, 1349806505, 1349892905, - 1349979305, 1350065705], unit='s') + pd.to_datetime([1349720105, 1349806505, 1349892905, 1349979305, 1350065705], unit="s") - pd.to_datetime([1349720105100, 1349720105200, 1349720105300, - 1349720105400, 1349720105500], unit='ms') + pd.to_datetime( + [1349720105100, 1349720105200, 1349720105300, 1349720105400, 1349720105500], + unit="ms", + ) .. note:: @@ -336,8 +339,8 @@ as timezone-naive timestamps and then localize to the appropriate timezone: .. ipython:: python - pd.Timestamp(1262347200000000000).tz_localize('US/Pacific') - pd.DatetimeIndex([1262347200000000000]).tz_localize('US/Pacific') + pd.Timestamp(1262347200000000000).tz_localize("US/Pacific") + pd.DatetimeIndex([1262347200000000000]).tz_localize("US/Pacific") .. note:: @@ -353,8 +356,8 @@ as timezone-naive timestamps and then localize to the appropriate timezone: .. ipython:: python - pd.to_datetime([1490195805.433, 1490195805.433502912], unit='s') - pd.to_datetime(1490195805433502912, unit='ns') + pd.to_datetime([1490195805.433, 1490195805.433502912], unit="s") + pd.to_datetime(1490195805433502912, unit="ns") .. seealso:: @@ -369,7 +372,7 @@ To invert the operation from above, namely, to convert from a ``Timestamp`` to a .. ipython:: python - stamps = pd.date_range('2012-10-08 18:15:05', periods=4, freq='D') + stamps = pd.date_range("2012-10-08 18:15:05", periods=4, freq="D") stamps We subtract the epoch (midnight at January 1, 1970 UTC) and then floor divide by the @@ -377,7 +380,7 @@ We subtract the epoch (midnight at January 1, 1970 UTC) and then floor divide by .. 
ipython:: python - (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s') + (stamps - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s") .. _timeseries.origin: @@ -389,14 +392,14 @@ of a ``DatetimeIndex``. For example, to use 1960-01-01 as the starting date: .. ipython:: python - pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')) + pd.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")) The default is set at ``origin='unix'``, which defaults to ``1970-01-01 00:00:00``. Commonly called 'unix epoch' or POSIX time. .. ipython:: python - pd.to_datetime([1, 2, 3], unit='D') + pd.to_datetime([1, 2, 3], unit="D") .. _timeseries.daterange: @@ -408,9 +411,11 @@ To generate an index with timestamps, you can use either the ``DatetimeIndex`` o .. ipython:: python - dates = [datetime.datetime(2012, 5, 1), - datetime.datetime(2012, 5, 2), - datetime.datetime(2012, 5, 3)] + dates = [ + datetime.datetime(2012, 5, 1), + datetime.datetime(2012, 5, 2), + datetime.datetime(2012, 5, 3), + ] # Note the frequency information index = pd.DatetimeIndex(dates) @@ -442,9 +447,9 @@ variety of :ref:`frequency aliases `: .. ipython:: python - pd.date_range(start, periods=1000, freq='M') + pd.date_range(start, periods=1000, freq="M") - pd.bdate_range(start, periods=250, freq='BQS') + pd.bdate_range(start, periods=250, freq="BQS") ``date_range`` and ``bdate_range`` make it easy to generate a range of dates using various combinations of parameters like ``start``, ``end``, ``periods``, @@ -453,9 +458,9 @@ of those specified will not be generated: .. ipython:: python - pd.date_range(start, end, freq='BM') + pd.date_range(start, end, freq="BM") - pd.date_range(start, end, freq='W') + pd.date_range(start, end, freq="W") pd.bdate_range(end=end, periods=20) @@ -467,9 +472,9 @@ resulting ``DatetimeIndex``: .. 
ipython:: python - pd.date_range('2018-01-01', '2018-01-05', periods=5) + pd.date_range("2018-01-01", "2018-01-05", periods=5) - pd.date_range('2018-01-01', '2018-01-05', periods=10) + pd.date_range("2018-01-01", "2018-01-05", periods=10) .. _timeseries.custom-freq-ranges: @@ -482,13 +487,13 @@ used if a custom frequency string is passed. .. ipython:: python - weekmask = 'Mon Wed Fri' + weekmask = "Mon Wed Fri" holidays = [datetime.datetime(2011, 1, 5), datetime.datetime(2011, 3, 14)] - pd.bdate_range(start, end, freq='C', weekmask=weekmask, holidays=holidays) + pd.bdate_range(start, end, freq="C", weekmask=weekmask, holidays=holidays) - pd.bdate_range(start, end, freq='CBMS', weekmask=weekmask) + pd.bdate_range(start, end, freq="CBMS", weekmask=weekmask) .. seealso:: @@ -545,7 +550,7 @@ intelligent functionality like selection, slicing, etc. .. ipython:: python - rng = pd.date_range(start, end, freq='BM') + rng = pd.date_range(start, end, freq="BM") ts = pd.Series(np.random.randn(len(rng)), index=rng) ts.index ts[:5].index @@ -560,20 +565,20 @@ Dates and strings that parse to timestamps can be passed as indexing parameters: .. ipython:: python - ts['1/31/2011'] + ts["1/31/2011"] ts[datetime.datetime(2011, 12, 25):] - ts['10/31/2011':'12/31/2011'] + ts["10/31/2011":"12/31/2011"] To provide convenience for accessing longer time series, you can also pass in the year or year and month as strings: .. ipython:: python - ts['2011'] + ts["2011"] - ts['2011-6'] + ts["2011-6"] This type of slicing will work on a ``DataFrame`` with a ``DatetimeIndex`` as well. Since the partial string selection is a form of label slicing, the endpoints **will be** included. This @@ -586,10 +591,13 @@ would include matching times on an included date: .. 
ipython:: python :okwarning: - dft = pd.DataFrame(np.random.randn(100000, 1), columns=['A'], - index=pd.date_range('20130101', periods=100000, freq='T')) + dft = pd.DataFrame( + np.random.randn(100000, 1), + columns=["A"], + index=pd.date_range("20130101", periods=100000, freq="T"), + ) dft - dft['2013'] + dft["2013"] This starts on the very first time in the month, and includes the last date and time for the month: @@ -597,43 +605,45 @@ time for the month: .. ipython:: python :okwarning: - dft['2013-1':'2013-2'] + dft["2013-1":"2013-2"] This specifies a stop time **that includes all of the times on the last day**: .. ipython:: python :okwarning: - dft['2013-1':'2013-2-28'] + dft["2013-1":"2013-2-28"] This specifies an **exact** stop time (and is not the same as the above): .. ipython:: python :okwarning: - dft['2013-1':'2013-2-28 00:00:00'] + dft["2013-1":"2013-2-28 00:00:00"] We are stopping on the included end-point as it is part of the index: .. ipython:: python :okwarning: - dft['2013-1-15':'2013-1-15 12:30:00'] + dft["2013-1-15":"2013-1-15 12:30:00"] ``DatetimeIndex`` partial string indexing also works on a ``DataFrame`` with a ``MultiIndex``: .. ipython:: python - dft2 = pd.DataFrame(np.random.randn(20, 1), - columns=['A'], - index=pd.MultiIndex.from_product( - [pd.date_range('20130101', periods=10, freq='12H'), - ['a', 'b']])) + dft2 = pd.DataFrame( + np.random.randn(20, 1), + columns=["A"], + index=pd.MultiIndex.from_product( + [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + ), + ) dft2 - dft2.loc['2013-01-05'] + dft2.loc["2013-01-05"] idx = pd.IndexSlice dft2 = dft2.swaplevel(0, 1).sort_index() - dft2.loc[idx[:, '2013-01-05'], :] + dft2.loc[idx[:, "2013-01-05"], :] .. versionadded:: 0.25.0 @@ -642,9 +652,9 @@ Slicing with string indexing also honors UTC offset. .. 
ipython:: python :okwarning: - df = pd.DataFrame([0], index=pd.DatetimeIndex(['2019-01-01'], tz='US/Pacific')) + df = pd.DataFrame([0], index=pd.DatetimeIndex(["2019-01-01"], tz="US/Pacific")) df - df['2019-01-01 12:00:00+04:00':'2019-01-01 13:00:00+04:00'] + df["2019-01-01 12:00:00+04:00":"2019-01-01 13:00:00+04:00"] .. _timeseries.slice_vs_exact_match: @@ -657,45 +667,48 @@ Consider a ``Series`` object with a minute resolution index: .. ipython:: python - series_minute = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12-31 23:59:00', - '2012-01-01 00:00:00', - '2012-01-01 00:02:00'])) + series_minute = pd.Series( + [1, 2, 3], + pd.DatetimeIndex( + ["2011-12-31 23:59:00", "2012-01-01 00:00:00", "2012-01-01 00:02:00"] + ), + ) series_minute.index.resolution A timestamp string less accurate than a minute gives a ``Series`` object. .. ipython:: python - series_minute['2011-12-31 23'] + series_minute["2011-12-31 23"] A timestamp string with minute resolution (or more accurate), gives a scalar instead, i.e. it is not casted to a slice. .. ipython:: python - series_minute['2011-12-31 23:59'] - series_minute['2011-12-31 23:59:00'] + series_minute["2011-12-31 23:59"] + series_minute["2011-12-31 23:59:00"] If index resolution is second, then the minute-accurate timestamp gives a ``Series``. .. ipython:: python - series_second = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12-31 23:59:59', - '2012-01-01 00:00:00', - '2012-01-01 00:00:01'])) + series_second = pd.Series( + [1, 2, 3], + pd.DatetimeIndex( + ["2011-12-31 23:59:59", "2012-01-01 00:00:00", "2012-01-01 00:00:01"] + ), + ) series_second.index.resolution - series_second['2011-12-31 23:59'] + series_second["2011-12-31 23:59"] If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``[]`` as well. .. 
ipython:: python :okwarning: - dft_minute = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, - index=series_minute.index) - dft_minute['2011-12-31 23'] + dft_minute = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=series_minute.index) + dft_minute["2011-12-31 23"] .. warning:: @@ -706,16 +719,17 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python - dft_minute.loc['2011-12-31 23:59'] + dft_minute.loc["2011-12-31 23:59"] Note also that ``DatetimeIndex`` resolution cannot be less precise than day. .. ipython:: python - series_monthly = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12', '2012-01', '2012-02'])) + series_monthly = pd.Series( + [1, 2, 3], pd.DatetimeIndex(["2011-12", "2012-01", "2012-02"]) + ) series_monthly.index.resolution - series_monthly['2011-12'] # returns Series + series_monthly["2011-12"] # returns Series Exact indexing @@ -727,14 +741,15 @@ These ``Timestamp`` and ``datetime`` objects have exact ``hours, minutes,`` and .. ipython:: python - dft[datetime.datetime(2013, 1, 1):datetime.datetime(2013, 2, 28)] + dft[datetime.datetime(2013, 1, 1): datetime.datetime(2013, 2, 28)] With no defaults. .. ipython:: python - dft[datetime.datetime(2013, 1, 1, 10, 12, 0): - datetime.datetime(2013, 2, 28, 10, 12, 0)] + dft[ + datetime.datetime(2013, 1, 1, 10, 12, 0): datetime.datetime(2013, 2, 28, 10, 12, 0) + ] Truncating & fancy indexing @@ -747,11 +762,11 @@ partially matching dates: .. 
ipython:: python - rng2 = pd.date_range('2011-01-01', '2012-01-01', freq='W') + rng2 = pd.date_range("2011-01-01", "2012-01-01", freq="W") ts2 = pd.Series(np.random.randn(len(rng2)), index=rng2) - ts2.truncate(before='2011-11', after='2011-12') - ts2['2011-11':'2011-12'] + ts2.truncate(before="2011-11", after="2011-12") + ts2["2011-11":"2011-12"] Even complicated fancy indexing that breaks the ``DatetimeIndex`` frequency regularity will result in a ``DatetimeIndex``, although frequency is lost: @@ -807,7 +822,7 @@ You may obtain the year, week and day components of the ISO year from the ISO 86 .. ipython:: python - idx = pd.date_range(start='2019-12-29', freq='D', periods=4) + idx = pd.date_range(start="2019-12-29", freq="D", periods=4) idx.isocalendar() idx.to_series().dt.isocalendar() @@ -837,12 +852,12 @@ arithmetic operator (``+``) or the ``apply`` method can be used to perform the s .. ipython:: python # This particular day contains a day light savings time transition - ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + ts = pd.Timestamp("2016-10-30 00:00:00", tz="Europe/Helsinki") # Respects absolute time ts + pd.Timedelta(days=1) # Respects calendar time ts + pd.DateOffset(days=1) - friday = pd.Timestamp('2018-01-05') + friday = pd.Timestamp("2018-01-05") friday.day_name() # Add 2 business days (Friday --> Tuesday) two_business_days = 2 * pd.offsets.BDay() @@ -900,10 +915,10 @@ business offsets operate on the weekdays. .. ipython:: python - ts = pd.Timestamp('2018-01-06 00:00:00') + ts = pd.Timestamp("2018-01-06 00:00:00") ts.day_name() # BusinessHour's valid offset dates are Monday through Friday - offset = pd.offsets.BusinessHour(start='09:00') + offset = pd.offsets.BusinessHour(start="09:00") # Bring the date to the closest offset date (Monday) offset.rollforward(ts) # Date is brought to the closest offset date first and then the hour is added @@ -916,12 +931,12 @@ in the operation). .. 
ipython:: python - ts = pd.Timestamp('2014-01-01 09:00') + ts = pd.Timestamp("2014-01-01 09:00") day = pd.offsets.Day() day.apply(ts) day.apply(ts).normalize() - ts = pd.Timestamp('2014-01-01 22:00') + ts = pd.Timestamp("2014-01-01 22:00") hour = pd.offsets.Hour() hour.apply(ts) hour.apply(ts).normalize() @@ -974,7 +989,7 @@ apply the offset to each element. .. ipython:: python - rng = pd.date_range('2012-01-01', '2012-01-03') + rng = pd.date_range("2012-01-01", "2012-01-03") s = pd.Series(rng) rng rng + pd.DateOffset(months=2) @@ -989,7 +1004,7 @@ used exactly like a ``Timedelta`` - see the .. ipython:: python s - pd.offsets.Day(2) - td = s - pd.Series(pd.date_range('2011-12-29', '2011-12-31')) + td = s - pd.Series(pd.date_range("2011-12-29", "2011-12-31")) td td + pd.offsets.Minute(15) @@ -1016,16 +1031,13 @@ As an interesting example, let's look at Egypt where a Friday-Saturday weekend i .. ipython:: python - weekmask_egypt = 'Sun Mon Tue Wed Thu' + weekmask_egypt = "Sun Mon Tue Wed Thu" # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ['2012-05-01', - datetime.datetime(2013, 5, 1), - np.datetime64('2014-05-01')] - bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, - weekmask=weekmask_egypt) + holidays = ["2012-05-01", datetime.datetime(2013, 5, 1), np.datetime64("2014-05-01")] + bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) dt = datetime.datetime(2013, 4, 30) dt + 2 * bday_egypt @@ -1035,8 +1047,7 @@ Let's map to the weekday names: dts = pd.date_range(dt, periods=5, freq=bday_egypt) - pd.Series(dts.weekday, dts).map( - pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) + pd.Series(dts.weekday, dts).map(pd.Series("Mon Tue Wed Thu Fri Sat Sun".split())) Holiday calendars can be used to provide the list of holidays. See the :ref:`holiday calendar` section for more information. @@ -1058,15 +1069,14 @@ in the usual way. .. 
ipython:: python - bmth_us = pd.offsets.CustomBusinessMonthBegin( - calendar=USFederalHolidayCalendar()) + bmth_us = pd.offsets.CustomBusinessMonthBegin(calendar=USFederalHolidayCalendar()) # Skip new years dt = datetime.datetime(2013, 12, 17) dt + bmth_us # Define date index with custom offset - pd.date_range(start='20100101', end='20120101', freq=bmth_us) + pd.date_range(start="20100101", end="20120101", freq=bmth_us) .. note:: @@ -1097,23 +1107,23 @@ hours are added to the next business day. bh # 2014-08-01 is Friday - pd.Timestamp('2014-08-01 10:00').weekday() - pd.Timestamp('2014-08-01 10:00') + bh + pd.Timestamp("2014-08-01 10:00").weekday() + pd.Timestamp("2014-08-01 10:00") + bh # Below example is the same as: pd.Timestamp('2014-08-01 09:00') + bh - pd.Timestamp('2014-08-01 08:00') + bh + pd.Timestamp("2014-08-01 08:00") + bh # If the results is on the end time, move to the next business day - pd.Timestamp('2014-08-01 16:00') + bh + pd.Timestamp("2014-08-01 16:00") + bh # Remainings are added to the next day - pd.Timestamp('2014-08-01 16:30') + bh + pd.Timestamp("2014-08-01 16:30") + bh # Adding 2 business hours - pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(2) + pd.Timestamp("2014-08-01 10:00") + pd.offsets.BusinessHour(2) # Subtracting 3 business hours - pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(-3) + pd.Timestamp("2014-08-01 10:00") + pd.offsets.BusinessHour(-3) You can also specify ``start`` and ``end`` time by keywords. The argument must be a ``str`` with an ``hour:minute`` representation or a ``datetime.time`` @@ -1122,12 +1132,12 @@ results in ``ValueError``. .. 
ipython:: python - bh = pd.offsets.BusinessHour(start='11:00', end=datetime.time(20, 0)) + bh = pd.offsets.BusinessHour(start="11:00", end=datetime.time(20, 0)) bh - pd.Timestamp('2014-08-01 13:00') + bh - pd.Timestamp('2014-08-01 09:00') + bh - pd.Timestamp('2014-08-01 18:00') + bh + pd.Timestamp("2014-08-01 13:00") + bh + pd.Timestamp("2014-08-01 09:00") + bh + pd.Timestamp("2014-08-01 18:00") + bh Passing ``start`` time later than ``end`` represents midnight business hour. In this case, business hour exceeds midnight and overlap to the next day. @@ -1135,19 +1145,19 @@ Valid business hours are distinguished by whether it started from valid ``Busine .. ipython:: python - bh = pd.offsets.BusinessHour(start='17:00', end='09:00') + bh = pd.offsets.BusinessHour(start="17:00", end="09:00") bh - pd.Timestamp('2014-08-01 17:00') + bh - pd.Timestamp('2014-08-01 23:00') + bh + pd.Timestamp("2014-08-01 17:00") + bh + pd.Timestamp("2014-08-01 23:00") + bh # Although 2014-08-02 is Saturday, # it is valid because it starts from 08-01 (Friday). - pd.Timestamp('2014-08-02 04:00') + bh + pd.Timestamp("2014-08-02 04:00") + bh # Although 2014-08-04 is Monday, # it is out of business hours because it starts from 08-03 (Sunday). - pd.Timestamp('2014-08-04 04:00') + bh + pd.Timestamp("2014-08-04 04:00") + bh Applying ``BusinessHour.rollforward`` and ``rollback`` to out of business hours results in the next business hour start or previous day's end. Different from other offsets, ``BusinessHour.rollforward`` @@ -1160,19 +1170,19 @@ under the default business hours (9:00 - 17:00), there is no gap (0 minutes) bet .. 
ipython:: python # This adjusts a Timestamp to business hour edge - pd.offsets.BusinessHour().rollback(pd.Timestamp('2014-08-02 15:00')) - pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().rollback(pd.Timestamp("2014-08-02 15:00")) + pd.offsets.BusinessHour().rollforward(pd.Timestamp("2014-08-02 15:00")) # It is the same as BusinessHour().apply(pd.Timestamp('2014-08-01 17:00')). # And it is the same as BusinessHour().apply(pd.Timestamp('2014-08-04 09:00')) - pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().apply(pd.Timestamp("2014-08-02 15:00")) # BusinessDay results (for reference) - pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().rollforward(pd.Timestamp("2014-08-02")) # It is the same as BusinessDay().apply(pd.Timestamp('2014-08-01')) # The result is the same as rollworward because BusinessDay never overlap. - pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().apply(pd.Timestamp("2014-08-02")) ``BusinessHour`` regards Saturday and Sunday as holidays. To use arbitrary holidays, you can use ``CustomBusinessHour`` offset, as explained in the @@ -1190,6 +1200,7 @@ as ``BusinessHour`` except that it skips specified custom holidays. .. ipython:: python from pandas.tseries.holiday import USFederalHolidayCalendar + bhour_us = pd.offsets.CustomBusinessHour(calendar=USFederalHolidayCalendar()) # Friday before MLK Day dt = datetime.datetime(2014, 1, 17, 15) @@ -1203,8 +1214,7 @@ You can use keyword arguments supported by either ``BusinessHour`` and ``CustomB .. ipython:: python - bhour_mon = pd.offsets.CustomBusinessHour(start='10:00', - weekmask='Tue Wed Thu Fri') + bhour_mon = pd.offsets.CustomBusinessHour(start="10:00", weekmask="Tue Wed Thu Fri") # Monday is skipped because it's a holiday, business hour starts from 10:00 dt + bhour_mon * 2 @@ -1257,7 +1267,7 @@ most functions: .. 
ipython:: python - pd.date_range(start, periods=5, freq='B') + pd.date_range(start, periods=5, freq="B") pd.date_range(start, periods=5, freq=pd.offsets.BDay()) @@ -1265,9 +1275,9 @@ You can combine together day and intraday offsets: .. ipython:: python - pd.date_range(start, periods=10, freq='2h20min') + pd.date_range(start, periods=10, freq="2h20min") - pd.date_range(start, periods=10, freq='1D10U') + pd.date_range(start, periods=10, freq="1D10U") Anchored offsets ~~~~~~~~~~~~~~~~ @@ -1326,39 +1336,39 @@ anchor point, and moved ``|n|-1`` additional steps forwards or backwards. .. ipython:: python - pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=1) - pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=1) + pd.Timestamp("2014-01-02") + pd.offsets.MonthBegin(n=1) + pd.Timestamp("2014-01-02") + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=1) - pd.Timestamp('2014-01-02') - pd.offsets.MonthEnd(n=1) + pd.Timestamp("2014-01-02") - pd.offsets.MonthBegin(n=1) + pd.Timestamp("2014-01-02") - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=4) - pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=4) + pd.Timestamp("2014-01-02") + pd.offsets.MonthBegin(n=4) + pd.Timestamp("2014-01-02") - pd.offsets.MonthBegin(n=4) If the given date *is* on an anchor point, it is moved ``|n|`` points forwards or backwards. .. 
ipython:: python - pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=1) - pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=1) + pd.Timestamp("2014-01-01") + pd.offsets.MonthBegin(n=1) + pd.Timestamp("2014-01-31") + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') - pd.offsets.MonthBegin(n=1) - pd.Timestamp('2014-01-31') - pd.offsets.MonthEnd(n=1) + pd.Timestamp("2014-01-01") - pd.offsets.MonthBegin(n=1) + pd.Timestamp("2014-01-31") - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=4) - pd.Timestamp('2014-01-31') - pd.offsets.MonthBegin(n=4) + pd.Timestamp("2014-01-01") + pd.offsets.MonthBegin(n=4) + pd.Timestamp("2014-01-31") - pd.offsets.MonthBegin(n=4) For the case when ``n=0``, the date is not moved if on an anchor point, otherwise it is rolled forward to the next anchor point. .. ipython:: python - pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=0) - pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=0) + pd.Timestamp("2014-01-02") + pd.offsets.MonthBegin(n=0) + pd.Timestamp("2014-01-02") + pd.offsets.MonthEnd(n=0) - pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=0) - pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=0) + pd.Timestamp("2014-01-01") + pd.offsets.MonthBegin(n=0) + pd.Timestamp("2014-01-31") + pd.offsets.MonthEnd(n=0) .. _timeseries.holiday: @@ -1394,14 +1404,22 @@ An example of how holidays and holiday calendars are defined: .. 
ipython:: python - from pandas.tseries.holiday import Holiday, USMemorialDay,\ - AbstractHolidayCalendar, nearest_workday, MO + from pandas.tseries.holiday import ( + Holiday, + USMemorialDay, + AbstractHolidayCalendar, + nearest_workday, + MO, + ) + + class ExampleCalendar(AbstractHolidayCalendar): rules = [ USMemorialDay, - Holiday('July 4th', month=7, day=4, observance=nearest_workday), - Holiday('Columbus Day', month=10, day=1, - offset=pd.DateOffset(weekday=MO(2)))] + Holiday("July 4th", month=7, day=4, observance=nearest_workday), + Holiday("Columbus Day", month=10, day=1, offset=pd.DateOffset(weekday=MO(2))), + ] + cal = ExampleCalendar() cal.holidays(datetime.datetime(2012, 1, 1), datetime.datetime(2012, 12, 31)) @@ -1417,8 +1435,9 @@ or ``Timestamp`` objects. .. ipython:: python - pd.date_range(start='7/1/2012', end='7/10/2012', - freq=pd.offsets.CDay(calendar=cal)).to_pydatetime() + pd.date_range( + start="7/1/2012", end="7/10/2012", freq=pd.offsets.CDay(calendar=cal) + ).to_pydatetime() offset = pd.offsets.CustomBusinessDay(calendar=cal) datetime.datetime(2012, 5, 25) + offset datetime.datetime(2012, 7, 3) + offset @@ -1450,11 +1469,11 @@ or calendars with additional rules. .. ipython:: python - from pandas.tseries.holiday import get_calendar, HolidayCalendarFactory,\ - USLaborDay - cal = get_calendar('ExampleCalendar') + from pandas.tseries.holiday import get_calendar, HolidayCalendarFactory, USLaborDay + + cal = get_calendar("ExampleCalendar") cal.rules - new_cal = HolidayCalendarFactory('NewExampleCalendar', cal, USLaborDay) + new_cal = HolidayCalendarFactory("NewExampleCalendar", cal, USLaborDay) new_cal.rules .. _timeseries.advanced_datetime: @@ -1484,9 +1503,9 @@ rather than changing the alignment of the data and the index: .. 
ipython:: python - ts.shift(5, freq='D') + ts.shift(5, freq="D") ts.shift(5, freq=pd.offsets.BDay()) - ts.shift(5, freq='BM') + ts.shift(5, freq="BM") Note that with when ``freq`` is specified, the leading entry is no longer NaN because the data is not being realigned. @@ -1501,7 +1520,7 @@ calls ``reindex``. .. ipython:: python - dr = pd.date_range('1/1/2010', periods=3, freq=3 * pd.offsets.BDay()) + dr = pd.date_range("1/1/2010", periods=3, freq=3 * pd.offsets.BDay()) ts = pd.Series(np.random.randn(3), index=dr) ts ts.asfreq(pd.offsets.BDay()) @@ -1511,7 +1530,7 @@ method for any gaps that may appear after the frequency conversion. .. ipython:: python - ts.asfreq(pd.offsets.BDay(), method='pad') + ts.asfreq(pd.offsets.BDay(), method="pad") Filling forward / backward ~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1552,11 +1571,11 @@ Basics .. ipython:: python - rng = pd.date_range('1/1/2012', periods=100, freq='S') + rng = pd.date_range("1/1/2012", periods=100, freq="S") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) - ts.resample('5Min').sum() + ts.resample("5Min").sum() The ``resample`` function is very flexible and allows you to specify many different parameters to control the frequency conversion and resampling @@ -1568,11 +1587,11 @@ a method of the returned object, including ``sum``, ``mean``, ``std``, ``sem``, .. ipython:: python - ts.resample('5Min').mean() + ts.resample("5Min").mean() - ts.resample('5Min').ohlc() + ts.resample("5Min").ohlc() - ts.resample('5Min').max() + ts.resample("5Min").max() For downsampling, ``closed`` can be set to 'left' or 'right' to specify which @@ -1580,9 +1599,9 @@ end of the interval is closed: .. ipython:: python - ts.resample('5Min', closed='right').mean() + ts.resample("5Min", closed="right").mean() - ts.resample('5Min', closed='left').mean() + ts.resample("5Min", closed="left").mean() Parameters like ``label`` are used to manipulate the resulting labels. 
``label`` specifies whether the result is labeled with the beginning or @@ -1590,9 +1609,9 @@ the end of the interval. .. ipython:: python - ts.resample('5Min').mean() # by default label='left' + ts.resample("5Min").mean() # by default label='left' - ts.resample('5Min', label='left').mean() + ts.resample("5Min", label="left").mean() .. warning:: @@ -1606,12 +1625,12 @@ the end of the interval. .. ipython:: python - s = pd.date_range('2000-01-01', '2000-01-05').to_series() + s = pd.date_range("2000-01-01", "2000-01-05").to_series() s.iloc[2] = pd.NaT s.dt.day_name() # default: label='left', closed='left' - s.resample('B').last().dt.day_name() + s.resample("B").last().dt.day_name() Notice how the value for Sunday got pulled back to the previous Friday. To get the behavior where the value for Sunday is pushed to Monday, use @@ -1619,7 +1638,7 @@ the end of the interval. .. ipython:: python - s.resample('B', label='right', closed='right').last().dt.day_name() + s.resample("B", label="right", closed="right").last().dt.day_name() The ``axis`` parameter can be set to 0 or 1 and allows you to resample the specified axis for a ``DataFrame``. @@ -1642,11 +1661,11 @@ For upsampling, you can specify a way to upsample and the ``limit`` parameter to # from secondly to every 250 milliseconds - ts[:2].resample('250L').asfreq() + ts[:2].resample("250L").asfreq() - ts[:2].resample('250L').ffill() + ts[:2].resample("250L").ffill() - ts[:2].resample('250L').ffill(limit=2) + ts[:2].resample("250L").ffill(limit=2) Sparse resampling ~~~~~~~~~~~~~~~~~ @@ -1662,14 +1681,14 @@ resample only the groups that are not all ``NaN``. .. ipython:: python - rng = pd.date_range('2014-1-1', periods=100, freq='D') + pd.Timedelta('1s') + rng = pd.date_range("2014-1-1", periods=100, freq="D") + pd.Timedelta("1s") ts = pd.Series(range(100), index=rng) If we want to resample to the full range of the series: .. 
ipython:: python - ts.resample('3T').sum() + ts.resample("3T").sum() We can instead only resample those groups where we have points as follows: @@ -1678,12 +1697,14 @@ We can instead only resample those groups where we have points as follows: from functools import partial from pandas.tseries.frequencies import to_offset + def round(t, freq): # round a Timestamp to a specified freq freq = to_offset(freq) return pd.Timestamp((t.value // freq.delta.value) * freq.delta.value) - ts.groupby(partial(round, freq='3T')).sum() + + ts.groupby(partial(round, freq="3T")).sum() .. _timeseries.aggregate: @@ -1697,25 +1718,27 @@ Resampling a ``DataFrame``, the default will be to act on all columns with the s .. ipython:: python - df = pd.DataFrame(np.random.randn(1000, 3), - index=pd.date_range('1/1/2012', freq='S', periods=1000), - columns=['A', 'B', 'C']) - r = df.resample('3T') + df = pd.DataFrame( + np.random.randn(1000, 3), + index=pd.date_range("1/1/2012", freq="S", periods=1000), + columns=["A", "B", "C"], + ) + r = df.resample("3T") r.mean() We can select a specific column or columns using standard getitem. .. ipython:: python - r['A'].mean() + r["A"].mean() - r[['A', 'B']].mean() + r[["A", "B"]].mean() You can pass a list or dict of functions to do aggregation with, outputting a ``DataFrame``: .. ipython:: python - r['A'].agg([np.sum, np.mean, np.std]) + r["A"].agg([np.sum, np.mean, np.std]) On a resampled ``DataFrame``, you can pass a list of functions to apply to each column, which produces an aggregated result with a hierarchical index: @@ -1730,21 +1753,20 @@ columns of a ``DataFrame``: .. ipython:: python :okexcept: - r.agg({'A': np.sum, - 'B': lambda x: np.std(x, ddof=1)}) + r.agg({"A": np.sum, "B": lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the resampled object: .. 
ipython:: python - r.agg({'A': 'sum', 'B': 'std'}) + r.agg({"A": "sum", "B": "std"}) Furthermore, you can also specify multiple aggregation functions for each column separately. .. ipython:: python - r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) + r.agg({"A": ["sum", "std"], "B": ["mean", "std"]}) If a ``DataFrame`` does not have a datetimelike index, but instead you want @@ -1753,14 +1775,15 @@ to resample based on datetimelike column in the frame, it can passed to the .. ipython:: python - df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), - 'a': np.arange(5)}, - index=pd.MultiIndex.from_arrays([ - [1, 2, 3, 4, 5], - pd.date_range('2015-01-01', freq='W', periods=5)], - names=['v', 'd'])) + df = pd.DataFrame( + {"date": pd.date_range("2015-01-01", freq="W", periods=5), "a": np.arange(5)}, + index=pd.MultiIndex.from_arrays( + [[1, 2, 3, 4, 5], pd.date_range("2015-01-01", freq="W", periods=5)], + names=["v", "d"], + ), + ) df - df.resample('M', on='date').sum() + df.resample("M", on="date").sum() Similarly, if you instead want to resample by a datetimelike level of ``MultiIndex``, its name or location can be passed to the @@ -1768,7 +1791,7 @@ level of ``MultiIndex``, its name or location can be passed to the .. ipython:: python - df.resample('M', level='d').sum() + df.resample("M", level="d").sum() .. 
_timeseries.iterating-label: @@ -1782,14 +1805,18 @@ natural and functions similarly to :py:func:`itertools.groupby`: small = pd.Series( range(6), - index=pd.to_datetime(['2017-01-01T00:00:00', - '2017-01-01T00:30:00', - '2017-01-01T00:31:00', - '2017-01-01T01:00:00', - '2017-01-01T03:00:00', - '2017-01-01T03:05:00']) + index=pd.to_datetime( + [ + "2017-01-01T00:00:00", + "2017-01-01T00:30:00", + "2017-01-01T00:31:00", + "2017-01-01T01:00:00", + "2017-01-01T03:00:00", + "2017-01-01T03:05:00", + ] + ), ) - resampled = small.resample('H') + resampled = small.resample("H") for name, group in resampled: print("Group: ", name) @@ -1811,9 +1838,9 @@ For example: .. ipython:: python - start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00' - middle = '2000-10-02 00:00:00' - rng = pd.date_range(start, end, freq='7min') + start, end = "2000-10-01 23:30:00", "2000-10-02 00:30:00" + middle = "2000-10-02 00:00:00" + rng = pd.date_range(start, end, freq="7min") ts = pd.Series(np.arange(len(rng)) * 3, index=rng) ts @@ -1821,32 +1848,32 @@ Here we can see that, when using ``origin`` with its default value (``'start_day .. ipython:: python - ts.resample('17min', origin='start_day').sum() - ts[middle:end].resample('17min', origin='start_day').sum() + ts.resample("17min", origin="start_day").sum() + ts[middle:end].resample("17min", origin="start_day").sum() Here we can see that, when setting ``origin`` to ``'epoch'``, the result after ``'2000-10-02 00:00:00'`` are identical depending on the start of time series: .. ipython:: python - ts.resample('17min', origin='epoch').sum() - ts[middle:end].resample('17min', origin='epoch').sum() + ts.resample("17min", origin="epoch").sum() + ts[middle:end].resample("17min", origin="epoch").sum() If needed you can use a custom timestamp for ``origin``: .. 
ipython:: python - ts.resample('17min', origin='2001-01-01').sum() - ts[middle:end].resample('17min', origin=pd.Timestamp('2001-01-01')).sum() + ts.resample("17min", origin="2001-01-01").sum() + ts[middle:end].resample("17min", origin=pd.Timestamp("2001-01-01")).sum() If needed you can just adjust the bins with an ``offset`` Timedelta that would be added to the default ``origin``. Those two examples are equivalent for this time series: .. ipython:: python - ts.resample('17min', origin='start').sum() - ts.resample('17min', offset='23h30min').sum() + ts.resample("17min", origin="start").sum() + ts.resample("17min", offset="23h30min").sum() Note the use of ``'start'`` for ``origin`` on the last example. In that case, ``origin`` will be set to the first value of the timeseries. @@ -1869,37 +1896,37 @@ Because ``freq`` represents a span of ``Period``, it cannot be negative like "-3 .. ipython:: python - pd.Period('2012', freq='A-DEC') + pd.Period("2012", freq="A-DEC") - pd.Period('2012-1-1', freq='D') + pd.Period("2012-1-1", freq="D") - pd.Period('2012-1-1 19:00', freq='H') + pd.Period("2012-1-1 19:00", freq="H") - pd.Period('2012-1-1 19:00', freq='5H') + pd.Period("2012-1-1 19:00", freq="5H") Adding and subtracting integers from periods shifts the period by its own frequency. Arithmetic is not allowed between ``Period`` with different ``freq`` (span). .. ipython:: python - p = pd.Period('2012', freq='A-DEC') + p = pd.Period("2012", freq="A-DEC") p + 1 p - 3 - p = pd.Period('2012-01', freq='2M') + p = pd.Period("2012-01", freq="2M") p + 2 p - 1 @okexcept - p == pd.Period('2012-01', freq='3M') + p == pd.Period("2012-01", freq="3M") If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, ``N``), ``offsets`` and ``timedelta``-like can be added if the result can have the same freq. Otherwise, ``ValueError`` will be raised. .. 
ipython:: python - p = pd.Period('2014-07-01 09:00', freq='H') + p = pd.Period("2014-07-01 09:00", freq="H") p + pd.offsets.Hour(2) p + datetime.timedelta(minutes=120) - p + np.timedelta64(7200, 's') + p + np.timedelta64(7200, "s") .. code-block:: ipython @@ -1912,7 +1939,7 @@ If ``Period`` has other frequencies, only the same ``offsets`` can be added. Oth .. ipython:: python - p = pd.Period('2014-07', freq='M') + p = pd.Period("2014-07", freq="M") p + pd.offsets.MonthEnd(3) .. code-block:: ipython @@ -1927,7 +1954,7 @@ return the number of frequency units between them: .. ipython:: python - pd.Period('2012', freq='A-DEC') - pd.Period('2002', freq='A-DEC') + pd.Period("2012", freq="A-DEC") - pd.Period("2002", freq="A-DEC") PeriodIndex and period_range ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1936,21 +1963,21 @@ which can be constructed using the ``period_range`` convenience function: .. ipython:: python - prng = pd.period_range('1/1/2011', '1/1/2012', freq='M') + prng = pd.period_range("1/1/2011", "1/1/2012", freq="M") prng The ``PeriodIndex`` constructor can also be used directly: .. ipython:: python - pd.PeriodIndex(['2011-1', '2011-2', '2011-3'], freq='M') + pd.PeriodIndex(["2011-1", "2011-2", "2011-3"], freq="M") Passing multiplied frequency outputs a sequence of ``Period`` which has multiplied span. .. ipython:: python - pd.period_range(start='2014-01', freq='3M', periods=4) + pd.period_range(start="2014-01", freq="3M", periods=4) If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor endpoints for a ``PeriodIndex`` with frequency matching that of the @@ -1958,8 +1985,9 @@ endpoints for a ``PeriodIndex`` with frequency matching that of the .. 
ipython:: python - pd.period_range(start=pd.Period('2017Q1', freq='Q'), - end=pd.Period('2017Q2', freq='Q'), freq='M') + pd.period_range( + start=pd.Period("2017Q1", freq="Q"), end=pd.Period("2017Q2", freq="Q"), freq="M" + ) Just like ``DatetimeIndex``, a ``PeriodIndex`` can also be used to index pandas objects: @@ -1973,11 +2001,11 @@ objects: .. ipython:: python - idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') + idx = pd.period_range("2014-07-01 09:00", periods=5, freq="H") idx idx + pd.offsets.Hour(2) - idx = pd.period_range('2014-07', periods=5, freq='M') + idx = pd.period_range("2014-07", periods=5, freq="M") idx idx + pd.offsets.MonthEnd(3) @@ -1996,7 +2024,7 @@ The ``period`` dtype holds the ``freq`` attribute and is represented with .. ipython:: python - pi = pd.period_range('2016-01-01', periods=3, freq='M') + pi = pd.period_range("2016-01-01", periods=3, freq="M") pi pi.dtype @@ -2007,15 +2035,15 @@ The ``period`` dtype can be used in ``.astype(...)``. It allows one to change th .. ipython:: python # change monthly freq to daily freq - pi.astype('period[D]') + pi.astype("period[D]") # convert to DatetimeIndex - pi.astype('datetime64[ns]') + pi.astype("datetime64[ns]") # convert to PeriodIndex - dti = pd.date_range('2011-01-01', freq='M', periods=3) + dti = pd.date_range("2011-01-01", freq="M", periods=3) dti - dti.astype('period[M]') + dti.astype("period[M]") PeriodIndex partial string indexing @@ -2029,32 +2057,32 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI .. ipython:: python - ps['2011-01'] + ps["2011-01"] ps[datetime.datetime(2011, 12, 25):] - ps['10/31/2011':'12/31/2011'] + ps["10/31/2011":"12/31/2011"] Passing a string representing a lower frequency than ``PeriodIndex`` returns partial sliced data. .. 
ipython:: python :okwarning: - ps['2011'] + ps["2011"] - dfp = pd.DataFrame(np.random.randn(600, 1), - columns=['A'], - index=pd.period_range('2013-01-01 9:00', - periods=600, - freq='T')) + dfp = pd.DataFrame( + np.random.randn(600, 1), + columns=["A"], + index=pd.period_range("2013-01-01 9:00", periods=600, freq="T"), + ) dfp - dfp['2013-01-01 10H'] + dfp["2013-01-01 10H"] As with ``DatetimeIndex``, the endpoints will be included in the result. The example below slices data starting from 10:00 to 11:59. .. ipython:: python - dfp['2013-01-01 10H':'2013-01-01 11H'] + dfp["2013-01-01 10H":"2013-01-01 11H"] Frequency conversion and resampling with PeriodIndex @@ -2064,7 +2092,7 @@ method. Let's start with the fiscal year 2011, ending in December: .. ipython:: python - p = pd.Period('2011', freq='A-DEC') + p = pd.Period("2011", freq="A-DEC") p We can convert it to a monthly frequency. Using the ``how`` parameter, we can @@ -2072,16 +2100,16 @@ specify whether to return the starting or ending month: .. ipython:: python - p.asfreq('M', how='start') + p.asfreq("M", how="start") - p.asfreq('M', how='end') + p.asfreq("M", how="end") The shorthands 's' and 'e' are provided for convenience: .. ipython:: python - p.asfreq('M', 's') - p.asfreq('M', 'e') + p.asfreq("M", "s") + p.asfreq("M", "e") Converting to a "super-period" (e.g., annual frequency is a super-period of quarterly frequency) automatically returns the super-period that includes the @@ -2089,9 +2117,9 @@ input period: .. ipython:: python - p = pd.Period('2011-12', freq='M') + p = pd.Period("2011-12", freq="M") - p.asfreq('A-NOV') + p.asfreq("A-NOV") Note that since we converted to an annual frequency that ends the year in November, the monthly period of December 2011 is actually in the 2012 A-NOV @@ -2110,21 +2138,21 @@ frequencies ``Q-JAN`` through ``Q-DEC``. .. 
ipython:: python - p = pd.Period('2012Q1', freq='Q-DEC') + p = pd.Period("2012Q1", freq="Q-DEC") - p.asfreq('D', 's') + p.asfreq("D", "s") - p.asfreq('D', 'e') + p.asfreq("D", "e") ``Q-MAR`` defines fiscal year end in March: .. ipython:: python - p = pd.Period('2011Q4', freq='Q-MAR') + p = pd.Period("2011Q4", freq="Q-MAR") - p.asfreq('D', 's') + p.asfreq("D", "s") - p.asfreq('D', 'e') + p.asfreq("D", "e") .. _timeseries.interchange: @@ -2136,7 +2164,7 @@ and vice-versa using ``to_timestamp``: .. ipython:: python - rng = pd.date_range('1/1/2012', periods=5, freq='M') + rng = pd.date_range("1/1/2012", periods=5, freq="M") ts = pd.Series(np.random.randn(len(rng)), index=rng) @@ -2153,7 +2181,7 @@ end of the period: .. ipython:: python - ps.to_timestamp('D', how='s') + ps.to_timestamp("D", how="s") Converting between period and timestamp enables some convenient arithmetic functions to be used. In the following example, we convert a quarterly @@ -2162,11 +2190,11 @@ the quarter end: .. ipython:: python - prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV') + prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV") ts = pd.Series(np.random.randn(len(prng)), prng) - ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 + ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9 ts.head() @@ -2180,7 +2208,7 @@ then you can use a ``PeriodIndex`` and/or ``Series`` of ``Periods`` to do comput .. ipython:: python - span = pd.period_range('1215-01-01', '1381-01-01', freq='D') + span = pd.period_range("1215-01-01", "1381-01-01", freq="D") span To convert from an ``int64`` based YYYYMMDD representation. @@ -2190,9 +2218,10 @@ To convert from an ``int64`` based YYYYMMDD representation. 
s = pd.Series([20121231, 20141130, 99991231]) s + def conv(x): - return pd.Period(year=x // 10000, month=x // 100 % 100, - day=x % 100, freq='D') + return pd.Period(year=x // 10000, month=x // 100 % 100, day=x % 100, freq="D") + s.apply(conv) s.apply(conv)[2] @@ -2221,7 +2250,7 @@ By default, pandas objects are time zone unaware: .. ipython:: python - rng = pd.date_range('3/6/2012 00:00', periods=15, freq='D') + rng = pd.date_range("3/6/2012 00:00", periods=15, freq="D") rng.tz is None To localize these dates to a time zone (assign a particular time zone to a naive date), @@ -2241,18 +2270,16 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string import dateutil # pytz - rng_pytz = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz='Europe/London') + rng_pytz = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz="Europe/London") rng_pytz.tz # dateutil - rng_dateutil = pd.date_range('3/6/2012 00:00', periods=3, freq='D') - rng_dateutil = rng_dateutil.tz_localize('dateutil/Europe/London') + rng_dateutil = pd.date_range("3/6/2012 00:00", periods=3, freq="D") + rng_dateutil = rng_dateutil.tz_localize("dateutil/Europe/London") rng_dateutil.tz # dateutil - utc special case - rng_utc = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz=dateutil.tz.tzutc()) + rng_utc = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=dateutil.tz.tzutc()) rng_utc.tz .. versionadded:: 0.25.0 @@ -2260,8 +2287,7 @@ To return ``dateutil`` time zone objects, append ``dateutil/`` before the string .. ipython:: python # datetime.timezone - rng_utc = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz=datetime.timezone.utc) + rng_utc = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=datetime.timezone.utc) rng_utc.tz Note that the ``UTC`` time zone is a special case in ``dateutil`` and should be constructed explicitly @@ -2273,15 +2299,14 @@ zones objects explicitly first. 
import pytz # pytz - tz_pytz = pytz.timezone('Europe/London') - rng_pytz = pd.date_range('3/6/2012 00:00', periods=3, freq='D') + tz_pytz = pytz.timezone("Europe/London") + rng_pytz = pd.date_range("3/6/2012 00:00", periods=3, freq="D") rng_pytz = rng_pytz.tz_localize(tz_pytz) rng_pytz.tz == tz_pytz # dateutil - tz_dateutil = dateutil.tz.gettz('Europe/London') - rng_dateutil = pd.date_range('3/6/2012 00:00', periods=3, freq='D', - tz=tz_dateutil) + tz_dateutil = dateutil.tz.gettz("Europe/London") + rng_dateutil = pd.date_range("3/6/2012 00:00", periods=3, freq="D", tz=tz_dateutil) rng_dateutil.tz == tz_dateutil To convert a time zone aware pandas object from one time zone to another, @@ -2289,7 +2314,7 @@ you can use the ``tz_convert`` method. .. ipython:: python - rng_pytz.tz_convert('US/Eastern') + rng_pytz.tz_convert("US/Eastern") .. note:: @@ -2301,9 +2326,9 @@ you can use the ``tz_convert`` method. .. ipython:: python - dti = pd.date_range('2019-01-01', periods=3, freq='D', tz='US/Pacific') + dti = pd.date_range("2019-01-01", periods=3, freq="D", tz="US/Pacific") dti.tz - ts = pd.Timestamp('2019-01-01', tz='US/Pacific') + ts = pd.Timestamp("2019-01-01", tz="US/Pacific") ts.tz .. warning:: @@ -2344,11 +2369,11 @@ you can use the ``tz_convert`` method. .. ipython:: python - d_2037 = '2037-03-31T010101' - d_2038 = '2038-03-31T010101' - DST = 'Europe/London' - assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz='GMT') - assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz='GMT') + d_2037 = "2037-03-31T010101" + d_2038 = "2038-03-31T010101" + DST = "Europe/London" + assert pd.Timestamp(d_2037, tz=DST) != pd.Timestamp(d_2037, tz="GMT") + assert pd.Timestamp(d_2038, tz=DST) == pd.Timestamp(d_2038, tz="GMT") Under the hood, all timestamps are stored in UTC. Values from a time zone aware :class:`DatetimeIndex` or :class:`Timestamp` will have their fields (day, hour, minute, etc.) 
@@ -2357,8 +2382,8 @@ still considered to be equal even if they are in different time zones: .. ipython:: python - rng_eastern = rng_utc.tz_convert('US/Eastern') - rng_berlin = rng_utc.tz_convert('Europe/Berlin') + rng_eastern = rng_utc.tz_convert("US/Eastern") + rng_berlin = rng_utc.tz_convert("Europe/Berlin") rng_eastern[2] rng_berlin[2] @@ -2369,9 +2394,9 @@ Operations between :class:`Series` in different time zones will yield UTC .. ipython:: python - ts_utc = pd.Series(range(3), pd.date_range('20130101', periods=3, tz='UTC')) - eastern = ts_utc.tz_convert('US/Eastern') - berlin = ts_utc.tz_convert('Europe/Berlin') + ts_utc = pd.Series(range(3), pd.date_range("20130101", periods=3, tz="UTC")) + eastern = ts_utc.tz_convert("US/Eastern") + berlin = ts_utc.tz_convert("Europe/Berlin") result = eastern + berlin result result.index @@ -2382,14 +2407,13 @@ To remove time zone information, use ``tz_localize(None)`` or ``tz_convert(None) .. ipython:: python - didx = pd.date_range(start='2014-08-01 09:00', freq='H', - periods=3, tz='US/Eastern') + didx = pd.date_range(start="2014-08-01 09:00", freq="H", periods=3, tz="US/Eastern") didx didx.tz_localize(None) didx.tz_convert(None) # tz_convert(None) is identical to tz_convert('UTC').tz_localize(None) - didx.tz_convert('UTC').tz_localize(None) + didx.tz_convert("UTC").tz_localize(None) .. _timeseries.fold: @@ -2415,10 +2439,12 @@ control over how they are handled. .. ipython:: python - pd.Timestamp(datetime.datetime(2019, 10, 27, 1, 30, 0, 0), - tz='dateutil/Europe/London', fold=0) - pd.Timestamp(year=2019, month=10, day=27, hour=1, minute=30, - tz='dateutil/Europe/London', fold=1) + pd.Timestamp( + datetime.datetime(2019, 10, 27, 1, 30, 0, 0), tz="dateutil/Europe/London", fold=0 + ) + pd.Timestamp( + year=2019, month=10, day=27, hour=1, minute=30, tz="dateutil/Europe/London", fold=1 + ) .. _timeseries.timezone_ambiguous: @@ -2436,8 +2462,9 @@ twice within one day ("clocks fall back"). 
The following options are available: .. ipython:: python - rng_hourly = pd.DatetimeIndex(['11/06/2011 00:00', '11/06/2011 01:00', - '11/06/2011 01:00', '11/06/2011 02:00']) + rng_hourly = pd.DatetimeIndex( + ["11/06/2011 00:00", "11/06/2011 01:00", "11/06/2011 01:00", "11/06/2011 02:00"] + ) This will fail as there are ambiguous times (``'11/06/2011 01:00'``) @@ -2450,9 +2477,9 @@ Handle these ambiguous times by specifying the following. .. ipython:: python - rng_hourly.tz_localize('US/Eastern', ambiguous='infer') - rng_hourly.tz_localize('US/Eastern', ambiguous='NaT') - rng_hourly.tz_localize('US/Eastern', ambiguous=[True, True, False, False]) + rng_hourly.tz_localize("US/Eastern", ambiguous="infer") + rng_hourly.tz_localize("US/Eastern", ambiguous="NaT") + rng_hourly.tz_localize("US/Eastern", ambiguous=[True, True, False, False]) .. _timeseries.timezone_nonexistent: @@ -2471,7 +2498,7 @@ can be controlled by the ``nonexistent`` argument. The following options are ava .. ipython:: python - dti = pd.date_range(start='2015-03-29 02:30:00', periods=3, freq='H') + dti = pd.date_range(start="2015-03-29 02:30:00", periods=3, freq="H") # 2:30 is a nonexistent time Localization of nonexistent times will raise an error by default. @@ -2486,10 +2513,10 @@ Transform nonexistent times to ``NaT`` or shift the times. .. ipython:: python dti - dti.tz_localize('Europe/Warsaw', nonexistent='shift_forward') - dti.tz_localize('Europe/Warsaw', nonexistent='shift_backward') - dti.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta(1, unit='H')) - dti.tz_localize('Europe/Warsaw', nonexistent='NaT') + dti.tz_localize("Europe/Warsaw", nonexistent="shift_forward") + dti.tz_localize("Europe/Warsaw", nonexistent="shift_backward") + dti.tz_localize("Europe/Warsaw", nonexistent=pd.Timedelta(1, unit="H")) + dti.tz_localize("Europe/Warsaw", nonexistent="NaT") .. _timeseries.timezone_series: @@ -2502,7 +2529,7 @@ represented with a dtype of ``datetime64[ns]``. .. 
ipython:: python - s_naive = pd.Series(pd.date_range('20130101', periods=3)) + s_naive = pd.Series(pd.date_range("20130101", periods=3)) s_naive A :class:`Series` with a time zone **aware** values is @@ -2510,7 +2537,7 @@ represented with a dtype of ``datetime64[ns, tz]`` where ``tz`` is the time zone .. ipython:: python - s_aware = pd.Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) + s_aware = pd.Series(pd.date_range("20130101", periods=3, tz="US/Eastern")) s_aware Both of these :class:`Series` time zone information @@ -2520,7 +2547,7 @@ For example, to localize and convert a naive stamp to time zone aware. .. ipython:: python - s_naive.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + s_naive.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") Time zone information can also be manipulated using the ``astype`` method. This method can localize and convert time zone naive timestamps or @@ -2529,13 +2556,13 @@ convert time zone aware timestamps. .. ipython:: python # localize and convert a naive time zone - s_naive.astype('datetime64[ns, US/Eastern]') + s_naive.astype("datetime64[ns, US/Eastern]") # make an aware tz naive - s_aware.astype('datetime64[ns]') + s_aware.astype("datetime64[ns]") # convert to a new time zone - s_aware.astype('datetime64[ns, CET]') + s_aware.astype("datetime64[ns, CET]") .. note:: @@ -2561,4 +2588,4 @@ convert time zone aware timestamps. .. 
ipython:: python - s_aware.to_numpy(dtype='datetime64[ns]') + s_aware.to_numpy(dtype="datetime64[ns]") From 5ca19f81a6cecd38d157d2726f763574ee56c10f Mon Sep 17 00:00:00 2001 From: Prayag Savsani Date: Sat, 3 Oct 2020 20:05:02 +0530 Subject: [PATCH 1003/1025] DOC: use black to fix code style in doc pandas-dev#36777 (#36824) --- doc/source/development/extending.rst | 7 +++--- doc/source/user_guide/duplicates.rst | 35 ++++++++++++---------------- doc/source/user_guide/gotchas.rst | 28 +++++++++++++--------- 3 files changed, 36 insertions(+), 34 deletions(-) diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst index 46960140d3a8c..77fe930cf21e3 100644 --- a/doc/source/development/extending.rst +++ b/doc/source/development/extending.rst @@ -50,8 +50,9 @@ decorate a class, providing the name of attribute to add. The class's Now users can access your methods using the ``geo`` namespace: - >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10), - ... 'latitude': np.linspace(0, 20)}) + >>> ds = pd.DataFrame( + ... {"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)} + ... ) >>> ds.geo.center (5.0, 10.0) >>> ds.geo.plot() @@ -499,4 +500,4 @@ registers the default "matplotlib" backend as follows. More information on how to implement a third-party plotting backend can be found at -https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. \ No newline at end of file +https://github.com/pandas-dev/pandas/blob/master/pandas/plotting/__init__.py#L1. diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index b65822fab2b23..2993ca7799510 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -29,8 +29,8 @@ duplicates present. The output can't be determined, and so pandas raises. ..
ipython:: python :okexcept: - s1 = pd.Series([0, 1, 2], index=['a', 'b', 'b']) - s1.reindex(['a', 'b', 'c']) + s1 = pd.Series([0, 1, 2], index=["a", "b", "b"]) + s1.reindex(["a", "b", "c"]) Other methods, like indexing, can give very surprising results. Typically indexing with a scalar will *reduce dimensionality*. Slicing a ``DataFrame`` @@ -39,30 +39,30 @@ return a scalar. But with duplicates, this isn't the case. .. ipython:: python - df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'A', 'B']) + df1 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "A", "B"]) df1 We have duplicates in the columns. If we slice ``'B'``, we get back a ``Series`` .. ipython:: python - df1['B'] # a series + df1["B"] # a series But slicing ``'A'`` returns a ``DataFrame`` .. ipython:: python - df1['A'] # a DataFrame + df1["A"] # a DataFrame This applies to row labels as well .. ipython:: python - df2 = pd.DataFrame({"A": [0, 1, 2]}, index=['a', 'a', 'b']) + df2 = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "a", "b"]) df2 - df2.loc['b', 'A'] # a scalar - df2.loc['a', 'A'] # a Series + df2.loc["b", "A"] # a scalar + df2.loc["a", "A"] # a Series Duplicate Label Detection ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -121,29 +121,24 @@ will be raised. .. ipython:: python :okexcept: - pd.Series( - [0, 1, 2], - index=['a', 'b', 'b'] - ).set_flags(allows_duplicate_labels=False) + pd.Series([0, 1, 2], index=["a", "b", "b"]).set_flags(allows_duplicate_labels=False) This applies to both row and column labels for a :class:`DataFrame` .. ipython:: python :okexcept: - pd.DataFrame( - [[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"], - ).set_flags(allows_duplicate_labels=False) + pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=["A", "B", "C"],).set_flags( + allows_duplicate_labels=False + ) This attribute can be checked or set with :attr:`~DataFrame.flags.allows_duplicate_labels`, which indicates whether that object can have duplicate labels. .. 
ipython:: python - df = ( - pd.DataFrame({"A": [0, 1, 2, 3]}, - index=['x', 'y', 'X', 'Y']) - .set_flags(allows_duplicate_labels=False) + df = pd.DataFrame({"A": [0, 1, 2, 3]}, index=["x", "y", "X", "Y"]).set_flags( + allows_duplicate_labels=False ) df df.flags.allows_duplicate_labels @@ -198,7 +193,7 @@ operations. .. ipython:: python :okexcept: - s1 = pd.Series(0, index=['a', 'b']).set_flags(allows_duplicate_labels=False) + s1 = pd.Series(0, index=["a", "b"]).set_flags(allows_duplicate_labels=False) s1 s1.head().rename({"a": "b"}) diff --git a/doc/source/user_guide/gotchas.rst b/doc/source/user_guide/gotchas.rst index a96c70405d859..07c856c96426d 100644 --- a/doc/source/user_guide/gotchas.rst +++ b/doc/source/user_guide/gotchas.rst @@ -21,12 +21,19 @@ when calling :meth:`~DataFrame.info`: .. ipython:: python - dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]', - 'complex128', 'object', 'bool'] + dtypes = [ + "int64", + "float64", + "datetime64[ns]", + "timedelta64[ns]", + "complex128", + "object", + "bool", + ] n = 5000 data = {t: np.random.randint(100, size=n).astype(t) for t in dtypes} df = pd.DataFrame(data) - df['categorical'] = df['object'].astype('category') + df["categorical"] = df["object"].astype("category") df.info() @@ -40,7 +47,7 @@ as it can be expensive to do this deeper introspection. .. ipython:: python - df.info(memory_usage='deep') + df.info(memory_usage="deep") By default the display option is set to ``True`` but can be explicitly overridden by passing the ``memory_usage`` argument when invoking ``df.info()``. @@ -155,7 +162,7 @@ index, not membership among the values. .. ipython:: python - s = pd.Series(range(5), index=list('abcde')) + s = pd.Series(range(5), index=list("abcde")) 2 in s 'b' in s @@ -206,11 +213,11 @@ arrays. For example: .. 
ipython:: python - s = pd.Series([1, 2, 3, 4, 5], index=list('abcde')) + s = pd.Series([1, 2, 3, 4, 5], index=list("abcde")) s s.dtype - s2 = s.reindex(['a', 'b', 'c', 'f', 'u']) + s2 = s.reindex(["a", "b", "c", "f", "u"]) s2 s2.dtype @@ -227,12 +234,11 @@ the nullable-integer extension dtypes provided by pandas .. ipython:: python - s_int = pd.Series([1, 2, 3, 4, 5], index=list('abcde'), - dtype=pd.Int64Dtype()) + s_int = pd.Series([1, 2, 3, 4, 5], index=list("abcde"), dtype=pd.Int64Dtype()) s_int s_int.dtype - s2_int = s_int.reindex(['a', 'b', 'c', 'f', 'u']) + s2_int = s_int.reindex(["a", "b", "c", "f", "u"]) s2_int s2_int.dtype @@ -334,7 +340,7 @@ constructors using something similar to the following: .. ipython:: python - x = np.array(list(range(10)), '>i4') # big endian + x = np.array(list(range(10)), ">i4") # big endian newx = x.byteswap().newbyteorder() # force native byteorder s = pd.Series(newx) From 3408b0d88df9fe52cbf33160b5a341076a46bc8c Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Sat, 3 Oct 2020 16:58:46 +0200 Subject: [PATCH 1004/1025] CI: silence codecov for unrelated lines (#36600) --- codecov.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/codecov.yml b/codecov.yml index 1644bf315e0ac..6dd1e33a7a671 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1,7 +1,7 @@ codecov: branch: master -comment: off +comment: false coverage: status: @@ -11,3 +11,6 @@ coverage: patch: default: target: '50' + +github_checks: + annotations: false From a684ec16f06ab5001d0c8752415e582c2c37adec Mon Sep 17 00:00:00 2001 From: Erfan Nariman <34067903+erfannariman@users.noreply.github.com> Date: Sat, 3 Oct 2020 20:09:21 +0200 Subject: [PATCH 1005/1025] DOC: reformat doc groupby.rst (#36831) --- doc/source/user_guide/groupby.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 9696f14f03b56..ec64442319a84 
100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -677,7 +677,7 @@ and unpack the keyword arguments animals.groupby("kind").agg( **{ - "total weight": pd.NamedAgg(column="weight", aggfunc=sum), + "total weight": pd.NamedAgg(column="weight", aggfunc=sum) } ) From 9fa1417d73da0f1a0580f6ab5d89f68e6d57ceab Mon Sep 17 00:00:00 2001 From: Alex Thorne Date: Sat, 3 Oct 2020 19:17:07 +0100 Subject: [PATCH 1006/1025] TST: GH23452 test reorder_categories() on categorical index (#36648) --- pandas/tests/indexing/test_categorical.py | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py index 9f3ee81fac2eb..fae229aecc3d4 100644 --- a/pandas/tests/indexing/test_categorical.py +++ b/pandas/tests/indexing/test_categorical.py @@ -807,3 +807,31 @@ def test_loc_with_non_string_categories(self, idx_values, ordered): result.loc[sl, "A"] = ["qux", "qux2"] expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "categories", + [ + pytest.param(["a", "b", "c"], id="str"), + pytest.param( + [pd.Interval(0, 1), pd.Interval(1, 2), pd.Interval(2, 3)], + id="pd.Interval", + ), + ], + ) + def test_reorder_index_with_categories(self, categories): + # GH23452 + df = DataFrame( + {"foo": range(len(categories))}, + index=CategoricalIndex( + data=categories, categories=categories, ordered=True + ), + ) + df.index = df.index.reorder_categories(df.index.categories[::-1]) + result = df.sort_index() + expected = DataFrame( + {"foo": reversed(range(len(categories)))}, + index=CategoricalIndex( + data=categories[::-1], categories=categories[::-1], ordered=True + ), + ) + tm.assert_frame_equal(result, expected) From ded52b0db28625f31cc7b39351b38ebf52b8c4b7 Mon Sep 17 00:00:00 2001 From: lrjball <50599110+lrjball@users.noreply.github.com> Date: Sun, 4 Oct 2020 04:27:50 +0100 
Subject: [PATCH 1007/1025] Typo fix (#36844) Noticed a minor typo when using the docs --- doc/source/user_guide/style.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 77a1fef28f373..12dd72f761408 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -793,7 +793,7 @@ "source": [ "The next option you have are \"table styles\".\n", "These are styles that apply to the table as a whole, but don't look at the data.\n", - "Certain sytlings, including pseudo-selectors like `:hover` can only be used this way." + "Certain stylings, including pseudo-selectors like `:hover` can only be used this way." ] }, { From d4b98de41edc0f12840a9e7720e4b26575eacce1 Mon Sep 17 00:00:00 2001 From: Maxim Ivanov <41443370+ivanovmg@users.noreply.github.com> Date: Sun, 4 Oct 2020 11:39:42 +0700 Subject: [PATCH 1008/1025] CLN: private funcs in concat.py (#36726) * REF: extract func _select_upcast_cls_from_dtype * REF: extract function _get_upcast_classes * CLN: rename g -> common_dtype * TYP: type extracted functions * DOC: add docstrings to extracted methods * TYP: cast instead of ignoring mypy error * CLN: import SparseDtype only for type checking --- pandas/core/internals/concat.py | 107 ++++++++++++++++++-------------- 1 file changed, 62 insertions(+), 45 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index f5d0c921e1006..7ad058cfeb83c 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -1,6 +1,6 @@ from collections import defaultdict import copy -from typing import Dict, List +from typing import TYPE_CHECKING, Any, Dict, List, Sequence, Tuple, cast import numpy as np @@ -28,6 +28,9 @@ from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager +if TYPE_CHECKING: + from pandas.core.arrays.sparse.dtype import SparseDtype + def 
concatenate_block_managers( mgrs_indexers, axes, concat_axis: int, copy: bool @@ -344,7 +347,7 @@ def _concatenate_join_units(join_units, concat_axis, copy): return concat_values -def _get_empty_dtype_and_na(join_units): +def _get_empty_dtype_and_na(join_units: Sequence[JoinUnit]) -> Tuple[DtypeObj, Any]: """ Return dtype and N/A values to use when concatenating specified units. @@ -374,45 +377,8 @@ def _get_empty_dtype_and_na(join_units): else: dtypes[i] = unit.dtype - upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): - if dtype is None: - continue - - if is_categorical_dtype(dtype): - upcast_cls = "category" - elif is_datetime64tz_dtype(dtype): - upcast_cls = "datetimetz" - - elif is_extension_array_dtype(dtype): - upcast_cls = "extension" - - elif issubclass(dtype.type, np.bool_): - upcast_cls = "bool" - elif issubclass(dtype.type, np.object_): - upcast_cls = "object" - elif is_datetime64_dtype(dtype): - upcast_cls = "datetime" - elif is_timedelta64_dtype(dtype): - upcast_cls = "timedelta" - elif is_sparse(dtype): - upcast_cls = dtype.subtype.name - elif is_float_dtype(dtype) or is_numeric_dtype(dtype): - upcast_cls = dtype.name - else: - upcast_cls = "float" + upcast_classes = _get_upcast_classes(join_units, dtypes) - # Null blocks should not influence upcast class selection, unless there - # are only null blocks, when same upcasting rules must be applied to - # null upcast classes. - if unit.is_na: - null_upcast_classes[upcast_cls].append(dtype) - else: - upcast_classes[upcast_cls].append(dtype) - - if not upcast_classes: - upcast_classes = null_upcast_classes # TODO: de-duplicate with maybe_promote? 
# create the result if "extension" in upcast_classes: @@ -441,23 +407,74 @@ def _get_empty_dtype_and_na(join_units): return np.dtype("m8[ns]"), np.timedelta64("NaT", "ns") else: # pragma try: - g = np.find_common_type(upcast_classes, []) + common_dtype = np.find_common_type(upcast_classes, []) except TypeError: # At least one is an ExtensionArray return np.dtype(np.object_), np.nan else: - if is_float_dtype(g): - return g, g.type(np.nan) - elif is_numeric_dtype(g): + if is_float_dtype(common_dtype): + return common_dtype, common_dtype.type(np.nan) + elif is_numeric_dtype(common_dtype): if has_none_blocks: return np.dtype(np.float64), np.nan else: - return g, None + return common_dtype, None msg = "invalid dtype determination in get_concat_dtype" raise AssertionError(msg) +def _get_upcast_classes( + join_units: Sequence[JoinUnit], + dtypes: Sequence[DtypeObj], +) -> Dict[str, List[DtypeObj]]: + """Create mapping between upcast class names and lists of dtypes.""" + upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + null_upcast_classes: Dict[str, List[DtypeObj]] = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): + if dtype is None: + continue + + upcast_cls = _select_upcast_cls_from_dtype(dtype) + # Null blocks should not influence upcast class selection, unless there + # are only null blocks, when same upcasting rules must be applied to + # null upcast classes. 
+ if unit.is_na: + null_upcast_classes[upcast_cls].append(dtype) + else: + upcast_classes[upcast_cls].append(dtype) + + if not upcast_classes: + upcast_classes = null_upcast_classes + + return upcast_classes + + +def _select_upcast_cls_from_dtype(dtype: DtypeObj) -> str: + """Select upcast class name based on dtype.""" + if is_categorical_dtype(dtype): + return "category" + elif is_datetime64tz_dtype(dtype): + return "datetimetz" + elif is_extension_array_dtype(dtype): + return "extension" + elif issubclass(dtype.type, np.bool_): + return "bool" + elif issubclass(dtype.type, np.object_): + return "object" + elif is_datetime64_dtype(dtype): + return "datetime" + elif is_timedelta64_dtype(dtype): + return "timedelta" + elif is_sparse(dtype): + dtype = cast("SparseDtype", dtype) + return dtype.subtype.name + elif is_float_dtype(dtype) or is_numeric_dtype(dtype): + return dtype.name + else: + return "float" + + def _is_uniform_join_units(join_units: List[JoinUnit]) -> bool: """ Check if the join units consist of blocks of uniform type that can From 2f24cc2a89d3f4012f1167935a61f16e89cfc2cc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 3 Oct 2020 21:47:46 -0700 Subject: [PATCH 1009/1025] ENH: match stdlib behavior for datetimelike comparisons (#36647) * ENH: match stdlib behavior for datetimelike comparisons * update test Co-authored-by: Jeff Reback --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/_libs/lib.pyx | 7 -- pandas/_libs/tslibs/timestamps.pxd | 2 +- pandas/_libs/tslibs/timestamps.pyx | 26 ++++-- pandas/core/arrays/datetimelike.py | 13 ++- pandas/tests/arithmetic/test_datetime64.py | 83 ++++++++++++------- pandas/tests/reductions/test_reductions.py | 10 +-- .../scalar/timestamp/test_comparisons.py | 27 +++--- pandas/tests/series/indexing/test_datetime.py | 2 +- 9 files changed, 104 insertions(+), 67 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index cb0858fd678f8..b30e4177270b8 100644 --- 
a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -308,6 +308,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.searchsorted`, :meth:`TimedeltaIndex.searchsorted`, :meth:`PeriodIndex.searchsorted`, and :meth:`Series.searchsorted` with ``datetime64``, ``timedelta64`` or ``Period`` dtype placement of ``NaT`` values being inconsistent with ``NumPy`` (:issue:`36176`, :issue:`36254`) - Inconsistency in :class:`DatetimeArray`, :class:`TimedeltaArray`, and :class:`PeriodArray` setitem casting arrays of strings to datetimelike scalars but not scalar strings (:issue:`36261`) - Bug in :class:`DatetimeIndex.shift` incorrectly raising when shifting empty indexes (:issue:`14811`) +- :class:`Timestamp` and :class:`DatetimeIndex` comparisons between timezone-aware and timezone-naive objects now follow the standard library ``datetime`` behavior, returning ``True``/``False`` for ``!=``/``==`` and raising for inequality comparisons (:issue:`28507`) - Bug in :meth:`DatetimeIndex.equals` and :meth:`TimedeltaIndex.equals` incorrectly considering ``int64`` indexes as equal (:issue:`36744`) Timedelta diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 61a9634b00211..922dcd7e74aa0 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -584,13 +584,6 @@ def array_equivalent_object(left: object[:], right: object[:]) -> bool: elif not (PyObject_RichCompareBool(x, y, Py_EQ) or (x is None or is_nan(x)) and (y is None or is_nan(y))): return False - except TypeError as err: - # Avoid raising TypeError on tzawareness mismatch - # TODO: This try/except can be removed if/when Timestamp - # comparisons are changed to match datetime, see GH#28507 - if "tz-naive and tz-aware" in str(err): - return False - raise except ValueError: # Avoid raising ValueError when comparing Numpy arrays to other types if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y): diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index 
307b6dfc90715..6fb7b1ea8f520 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -19,8 +19,8 @@ cdef class _Timestamp(ABCTimestamp): cdef bint _get_start_end_field(self, str field) cdef _get_date_name_field(self, str field, object locale) cdef int64_t _maybe_convert_value_to_local(self) + cdef bint _can_compare(self, datetime other) cpdef to_datetime64(self) - cdef _assert_tzawareness_compat(_Timestamp self, datetime other) cpdef datetime to_pydatetime(_Timestamp self, bint warn=*) cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1 diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 78f7b2150f720..a8f6c60bcb300 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -260,6 +260,10 @@ cdef class _Timestamp(ABCTimestamp): if other.dtype.kind == "M": if self.tz is None: return PyObject_RichCompare(self.asm8, other, op) + elif op == Py_NE: + return np.ones(other.shape, dtype=np.bool_) + elif op == Py_EQ: + return np.zeros(other.shape, dtype=np.bool_) raise TypeError( "Cannot compare tz-naive and tz-aware timestamps" ) @@ -278,7 +282,12 @@ cdef class _Timestamp(ABCTimestamp): else: return NotImplemented - self._assert_tzawareness_compat(ots) + if not self._can_compare(ots): + if op == Py_NE or op == Py_EQ: + return NotImplemented + raise TypeError( + "Cannot compare tz-naive and tz-aware timestamps" + ) return cmp_scalar(self.value, ots.value, op) cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, @@ -286,16 +295,15 @@ cdef class _Timestamp(ABCTimestamp): cdef: datetime dtval = self.to_pydatetime() - self._assert_tzawareness_compat(other) + if not self._can_compare(other): + return NotImplemented + return PyObject_RichCompareBool(dtval, other, op) - cdef _assert_tzawareness_compat(_Timestamp self, datetime other): - if self.tzinfo is None: - if other.tzinfo is not None: - raise TypeError('Cannot 
compare tz-naive and tz-aware ' - 'timestamps') - elif other.tzinfo is None: - raise TypeError('Cannot compare tz-naive and tz-aware timestamps') + cdef bint _can_compare(self, datetime other): + if self.tzinfo is not None: + return other.tzinfo is not None + return other.tzinfo is None def __add__(self, other): cdef: diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 83a9c0ba61c2d..8b6f49cc7d589 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -685,7 +685,11 @@ def _validate_comparison_value(self, other, opname: str): if isinstance(other, self._recognized_scalars) or other is NaT: other = self._scalar_type(other) # type: ignore[call-arg] - self._check_compatible_with(other) + try: + self._check_compatible_with(other) + except TypeError as err: + # e.g. tzawareness mismatch + raise InvalidComparison(other) from err elif not is_list_like(other): raise InvalidComparison(other) @@ -696,8 +700,13 @@ def _validate_comparison_value(self, other, opname: str): else: try: other = self._validate_listlike(other, opname, allow_object=True) + self._check_compatible_with(other) except TypeError as err: - raise InvalidComparison(other) from err + if is_object_dtype(getattr(other, "dtype", None)): + # We will have to operate element-wise + pass + else: + raise InvalidComparison(other) from err return other diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 46be296759088..e9dc83d106651 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -558,26 +558,30 @@ def test_comparison_tzawareness_compat(self, op, box_with_array): dr = tm.box_expected(dr, box) dz = tm.box_expected(dz, box) - msg = "Cannot compare tz-naive and tz-aware" - with pytest.raises(TypeError, match=msg): - op(dr, dz) - if box is pd.DataFrame: tolist = lambda x: x.astype(object).values.tolist()[0] else: tolist = list - with 
pytest.raises(TypeError, match=msg): - op(dr, tolist(dz)) - with pytest.raises(TypeError, match=msg): - op(dr, np.array(tolist(dz), dtype=object)) - with pytest.raises(TypeError, match=msg): - op(dz, dr) + if op not in [operator.eq, operator.ne]: + msg = ( + r"Invalid comparison between dtype=datetime64\[ns.*\] " + "and (Timestamp|DatetimeArray|list|ndarray)" + ) + with pytest.raises(TypeError, match=msg): + op(dr, dz) - with pytest.raises(TypeError, match=msg): - op(dz, tolist(dr)) - with pytest.raises(TypeError, match=msg): - op(dz, np.array(tolist(dr), dtype=object)) + with pytest.raises(TypeError, match=msg): + op(dr, tolist(dz)) + with pytest.raises(TypeError, match=msg): + op(dr, np.array(tolist(dz), dtype=object)) + with pytest.raises(TypeError, match=msg): + op(dz, dr) + + with pytest.raises(TypeError, match=msg): + op(dz, tolist(dr)) + with pytest.raises(TypeError, match=msg): + op(dz, np.array(tolist(dr), dtype=object)) # The aware==aware and naive==naive comparisons should *not* raise assert np.all(dr == dr) @@ -609,17 +613,20 @@ def test_comparison_tzawareness_compat_scalars(self, op, box_with_array): ts_tz = pd.Timestamp("2000-03-14 01:59", tz="Europe/Amsterdam") assert np.all(dr > ts) - msg = "Cannot compare tz-naive and tz-aware" - with pytest.raises(TypeError, match=msg): - op(dr, ts_tz) + msg = r"Invalid comparison between dtype=datetime64\[ns.*\] and Timestamp" + if op not in [operator.eq, operator.ne]: + with pytest.raises(TypeError, match=msg): + op(dr, ts_tz) assert np.all(dz > ts_tz) - with pytest.raises(TypeError, match=msg): - op(dz, ts) + if op not in [operator.eq, operator.ne]: + with pytest.raises(TypeError, match=msg): + op(dz, ts) - # GH#12601: Check comparison against Timestamps and DatetimeIndex - with pytest.raises(TypeError, match=msg): - op(ts, dz) + if op not in [operator.eq, operator.ne]: + # GH#12601: Check comparison against Timestamps and DatetimeIndex + with pytest.raises(TypeError, match=msg): + op(ts, dz) 
@pytest.mark.parametrize( "op", @@ -637,15 +644,31 @@ def test_comparison_tzawareness_compat_scalars(self, op, box_with_array): def test_scalar_comparison_tzawareness( self, op, other, tz_aware_fixture, box_with_array ): + box = box_with_array tz = tz_aware_fixture dti = pd.date_range("2016-01-01", periods=2, tz=tz) + xbox = box if box not in [pd.Index, pd.array] else np.ndarray dtarr = tm.box_expected(dti, box_with_array) - msg = "Cannot compare tz-naive and tz-aware" - with pytest.raises(TypeError, match=msg): - op(dtarr, other) - with pytest.raises(TypeError, match=msg): - op(other, dtarr) + if op in [operator.eq, operator.ne]: + exbool = op is operator.ne + expected = np.array([exbool, exbool], dtype=bool) + expected = tm.box_expected(expected, xbox) + + result = op(dtarr, other) + tm.assert_equal(result, expected) + + result = op(other, dtarr) + tm.assert_equal(result, expected) + else: + msg = ( + r"Invalid comparison between dtype=datetime64\[ns, .*\] " + f"and {type(other).__name__}" + ) + with pytest.raises(TypeError, match=msg): + op(dtarr, other) + with pytest.raises(TypeError, match=msg): + op(other, dtarr) @pytest.mark.parametrize( "op", @@ -745,10 +768,8 @@ def test_dti_cmp_object_dtype(self): tm.assert_numpy_array_equal(result, expected) other = dti.tz_localize(None) - msg = "Cannot compare tz-naive and tz-aware" - with pytest.raises(TypeError, match=msg): - # tzawareness failure - dti != other + result = dti != other + tm.assert_numpy_array_equal(result, expected) other = np.array(list(dti[:5]) + [Timedelta(days=1)] * 5) result = dti == other diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index db7cd54d23a2b..fe97925c2bb74 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -56,13 +56,13 @@ def test_ops(self, opname, obj): expected = getattr(obj.values, opname)() else: expected = pd.Period(ordinal=getattr(obj.asi8, opname)(), freq=obj.freq) 
- try: - assert result == expected - except TypeError: - # comparing tz-aware series with np.array results in - # TypeError + + if getattr(obj, "tz", None) is not None: + # We need to de-localize before comparing to the numpy-produced result expected = expected.astype("M8[ns]").astype("int64") assert result.value == expected + else: + assert result == expected @pytest.mark.parametrize("opname", ["max", "min"]) @pytest.mark.parametrize( diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index 71693a9ca61ce..3d1f71def5836 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -56,9 +56,18 @@ def test_comparison_dt64_ndarray_tzaware(self, reverse, all_compare_operators): if reverse: left, right = arr, ts - msg = "Cannot compare tz-naive and tz-aware timestamps" - with pytest.raises(TypeError, match=msg): - op(left, right) + if op is operator.eq: + expected = np.array([False, False], dtype=bool) + result = op(left, right) + tm.assert_numpy_array_equal(result, expected) + elif op is operator.ne: + expected = np.array([True, True], dtype=bool) + result = op(left, right) + tm.assert_numpy_array_equal(result, expected) + else: + msg = "Cannot compare tz-naive and tz-aware timestamps" + with pytest.raises(TypeError, match=msg): + op(left, right) def test_comparison_object_array(self): # GH#15183 @@ -139,10 +148,8 @@ def test_cant_compare_tz_naive_w_aware(self, utc_fixture): b = Timestamp("3/12/2012", tz=utc_fixture) msg = "Cannot compare tz-naive and tz-aware timestamps" - with pytest.raises(TypeError, match=msg): - a == b - with pytest.raises(TypeError, match=msg): - a != b + assert not a == b + assert a != b with pytest.raises(TypeError, match=msg): a < b with pytest.raises(TypeError, match=msg): @@ -152,10 +159,8 @@ def test_cant_compare_tz_naive_w_aware(self, utc_fixture): with pytest.raises(TypeError, match=msg): a >= b - with 
pytest.raises(TypeError, match=msg): - b == a - with pytest.raises(TypeError, match=msg): - b != a + assert not b == a + assert b != a with pytest.raises(TypeError, match=msg): b < a with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py index b7fbed2b325b3..0389099a195d0 100644 --- a/pandas/tests/series/indexing/test_datetime.py +++ b/pandas/tests/series/indexing/test_datetime.py @@ -258,7 +258,7 @@ def test_getitem_setitem_datetimeindex(): lb = datetime(1990, 1, 1, 4) rb = datetime(1990, 1, 1, 7) - msg = "Cannot compare tz-naive and tz-aware datetime-like objects" + msg = r"Invalid comparison between dtype=datetime64\[ns, US/Eastern\] and datetime" with pytest.raises(TypeError, match=msg): # tznaive vs tzaware comparison is invalid # see GH#18376, GH#18162 From 2745a4e9828d596c05bd18f2f16cfd2ad61e5a62 Mon Sep 17 00:00:00 2001 From: Meghana Varanasi Date: Sun, 4 Oct 2020 17:15:42 +0530 Subject: [PATCH 1010/1025] doc/source/ecosystem.rst (#36856) --- doc/source/ecosystem.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index ed6ce7e9759b6..4086f64817568 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -170,7 +170,9 @@ invoked with the following command .. code:: python - import dtale; dtale.show(df) + import dtale + + dtale.show(df) D-Tale integrates seamlessly with jupyter notebooks, python terminals, kaggle & Google Colab. 
Here are some demos of the `grid `__ From 3b0b51b319cbed603a67b377c1f6b7035a45d74d Mon Sep 17 00:00:00 2001 From: beanan Date: Mon, 5 Oct 2020 01:39:23 +0800 Subject: [PATCH 1011/1025] DOC: black enhancingperf.rst and 10min.rst code style (#36849) --- doc/source/user_guide/10min.rst | 101 ++++++++--------- doc/source/user_guide/enhancingperf.rst | 141 ++++++++++++------------ setup.cfg | 3 +- 3 files changed, 124 insertions(+), 121 deletions(-) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 673f8689736f1..8270b2ee49bd8 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -34,9 +34,9 @@ and labeled columns: .. ipython:: python - dates = pd.date_range('20130101', periods=6) + dates = pd.date_range("20130101", periods=6) dates - df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD")) df Creating a :class:`DataFrame` by passing a dict of objects that can be converted to series-like. @@ -156,7 +156,7 @@ Sorting by values: .. ipython:: python - df.sort_values(by='B') + df.sort_values(by="B") Selection --------- @@ -178,14 +178,14 @@ equivalent to ``df.A``: .. ipython:: python - df['A'] + df["A"] Selecting via ``[]``, which slices the rows. .. ipython:: python df[0:3] - df['20130102':'20130104'] + df["20130102":"20130104"] Selection by label ~~~~~~~~~~~~~~~~~~ @@ -202,31 +202,31 @@ Selecting on a multi-axis by label: .. ipython:: python - df.loc[:, ['A', 'B']] + df.loc[:, ["A", "B"]] Showing label slicing, both endpoints are *included*: .. ipython:: python - df.loc['20130102':'20130104', ['A', 'B']] + df.loc["20130102":"20130104", ["A", "B"]] Reduction in the dimensions of the returned object: .. ipython:: python - df.loc['20130102', ['A', 'B']] + df.loc["20130102", ["A", "B"]] For getting a scalar value: .. 
ipython:: python - df.loc[dates[0], 'A'] + df.loc[dates[0], "A"] For getting fast access to a scalar (equivalent to the prior method): .. ipython:: python - df.at[dates[0], 'A'] + df.at[dates[0], "A"] Selection by position ~~~~~~~~~~~~~~~~~~~~~ @@ -282,7 +282,7 @@ Using a single column's values to select data. .. ipython:: python - df[df['A'] > 0] + df[df["A"] > 0] Selecting values from a DataFrame where a boolean condition is met. @@ -295,9 +295,9 @@ Using the :func:`~Series.isin` method for filtering: .. ipython:: python df2 = df.copy() - df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three'] + df2["E"] = ["one", "one", "two", "three", "four", "three"] df2 - df2[df2['E'].isin(['two', 'four'])] + df2[df2["E"].isin(["two", "four"])] Setting ~~~~~~~ @@ -307,15 +307,15 @@ by the indexes. .. ipython:: python - s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20130102', periods=6)) + s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6)) s1 - df['F'] = s1 + df["F"] = s1 Setting values by label: .. ipython:: python - df.at[dates[0], 'A'] = 0 + df.at[dates[0], "A"] = 0 Setting values by position: @@ -327,7 +327,7 @@ Setting by assigning with a NumPy array: .. ipython:: python - df.loc[:, 'D'] = np.array([5] * len(df)) + df.loc[:, "D"] = np.array([5] * len(df)) The result of the prior setting operations. @@ -356,15 +356,15 @@ returns a copy of the data. .. ipython:: python - df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) - df1.loc[dates[0]:dates[1], 'E'] = 1 + df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ["E"]) + df1.loc[dates[0] : dates[1], "E"] = 1 df1 To drop any rows that have missing data. .. ipython:: python - df1.dropna(how='any') + df1.dropna(how="any") Filling missing data. @@ -408,7 +408,7 @@ In addition, pandas automatically broadcasts along the specified dimension. 
s = pd.Series([1, 3, 5, np.nan, 6, 8], index=dates).shift(2) s - df.sub(s, axis='index') + df.sub(s, axis="index") Apply @@ -444,7 +444,7 @@ some cases always uses them). See more at :ref:`Vectorized String Methods .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']) + s = pd.Series(["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"]) s.str.lower() Merge @@ -486,21 +486,21 @@ SQL style merges. See the :ref:`Database style joining ` section. .. ipython:: python - left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]}) - right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]}) + left = pd.DataFrame({"key": ["foo", "foo"], "lval": [1, 2]}) + right = pd.DataFrame({"key": ["foo", "foo"], "rval": [4, 5]}) left right - pd.merge(left, right, on='key') + pd.merge(left, right, on="key") Another example that can be given is: .. ipython:: python - left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]}) - right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]}) + left = pd.DataFrame({"key": ["foo", "bar"], "lval": [1, 2]}) + right = pd.DataFrame({"key": ["foo", "bar"], "rval": [4, 5]}) left right - pd.merge(left, right, on='key') + pd.merge(left, right, on="key") Grouping -------- @@ -531,14 +531,14 @@ groups. .. ipython:: python - df.groupby('A').sum() + df.groupby("A").sum() Grouping by multiple columns forms a hierarchical index, and again we can apply the :meth:`~pandas.core.groupby.GroupBy.sum` function. .. 
ipython:: python - df.groupby(['A', 'B']).sum() + df.groupby(["A", "B"]).sum() Reshaping --------- @@ -559,8 +559,8 @@ Stack ] ) ) - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) - df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B']) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) + df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"]) df2 = df[:4] df2 @@ -603,7 +603,7 @@ We can produce pivot tables from this data very easily: .. ipython:: python - pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C']) + pd.pivot_table(df, values="D", index=["A", "B"], columns=["C"]) Time series @@ -616,31 +616,31 @@ financial applications. See the :ref:`Time Series section `. .. ipython:: python - rng = pd.date_range('1/1/2012', periods=100, freq='S') + rng = pd.date_range("1/1/2012", periods=100, freq="S") ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng) - ts.resample('5Min').sum() + ts.resample("5Min").sum() Time zone representation: .. ipython:: python - rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D') + rng = pd.date_range("3/6/2012 00:00", periods=5, freq="D") ts = pd.Series(np.random.randn(len(rng)), rng) ts - ts_utc = ts.tz_localize('UTC') + ts_utc = ts.tz_localize("UTC") ts_utc Converting to another time zone: .. ipython:: python - ts_utc.tz_convert('US/Eastern') + ts_utc.tz_convert("US/Eastern") Converting between time span representations: .. ipython:: python - rng = pd.date_range('1/1/2012', periods=5, freq='M') + rng = pd.date_range("1/1/2012", periods=5, freq="M") ts = pd.Series(np.random.randn(len(rng)), index=rng) ts ps = ts.to_period() @@ -654,9 +654,9 @@ the quarter end: .. 
ipython:: python - prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV') + prng = pd.period_range("1990Q1", "2000Q4", freq="Q-NOV") ts = pd.Series(np.random.randn(len(prng)), prng) - ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9 + ts.index = (prng.asfreq("M", "e") + 1).asfreq("H", "s") + 9 ts.head() Categoricals @@ -754,19 +754,20 @@ CSV .. ipython:: python - df.to_csv('foo.csv') + df.to_csv("foo.csv") :ref:`Reading from a csv file. ` .. ipython:: python - pd.read_csv('foo.csv') + pd.read_csv("foo.csv") .. ipython:: python :suppress: import os - os.remove('foo.csv') + + os.remove("foo.csv") HDF5 ~~~~ @@ -777,18 +778,18 @@ Writing to a HDF5 Store. .. ipython:: python - df.to_hdf('foo.h5', 'df') + df.to_hdf("foo.h5", "df") Reading from a HDF5 Store. .. ipython:: python - pd.read_hdf('foo.h5', 'df') + pd.read_hdf("foo.h5", "df") .. ipython:: python :suppress: - os.remove('foo.h5') + os.remove("foo.h5") Excel ~~~~~ @@ -799,18 +800,18 @@ Writing to an excel file. .. ipython:: python - df.to_excel('foo.xlsx', sheet_name='Sheet1') + df.to_excel("foo.xlsx", sheet_name="Sheet1") Reading from an excel file. .. ipython:: python - pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA']) + pd.read_excel("foo.xlsx", "Sheet1", index_col=None, na_values=["NA"]) .. ipython:: python :suppress: - os.remove('foo.xlsx') + os.remove("foo.xlsx") Gotchas ------- diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index ce9db0a5279c3..d30554986607d 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -48,10 +48,14 @@ We have a ``DataFrame`` to which we want to apply a function row-wise. .. 
ipython:: python - df = pd.DataFrame({'a': np.random.randn(1000), - 'b': np.random.randn(1000), - 'N': np.random.randint(100, 1000, (1000)), - 'x': 'x'}) + df = pd.DataFrame( + { + "a": np.random.randn(1000), + "b": np.random.randn(1000), + "N": np.random.randint(100, 1000, (1000)), + "x": "x", + } + ) df Here's the function in pure Python: @@ -61,6 +65,7 @@ Here's the function in pure Python: def f(x): return x * (x - 1) + def integrate_f(a, b, N): s = 0 dx = (b - a) / N @@ -72,7 +77,7 @@ We achieve our result by using ``apply`` (row-wise): .. code-block:: ipython - In [7]: %timeit df.apply(lambda x: integrate_f(x['a'], x['b'], x['N']), axis=1) + In [7]: %timeit df.apply(lambda x: integrate_f(x["a"], x["b"], x["N"]), axis=1) 10 loops, best of 3: 174 ms per loop But clearly this isn't fast enough for us. Let's take a look and see where the @@ -81,7 +86,7 @@ four calls) using the `prun ipython magic function 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)') + %timeit pd.eval("(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)") :func:`~pandas.eval` also works with unaligned pandas objects: @@ -560,7 +557,7 @@ Now let's do the same thing but with comparisons: .. ipython:: python - %timeit pd.eval('df1 + df2 + df3 + df4 + s') + %timeit pd.eval("df1 + df2 + df3 + df4 + s") .. note:: @@ -587,19 +584,19 @@ evaluate an expression in the "context" of a :class:`~pandas.DataFrame`. :suppress: try: - del a + del a except NameError: - pass + pass try: - del b + del b except NameError: - pass + pass .. ipython:: python - df = pd.DataFrame(np.random.randn(5, 2), columns=['a', 'b']) - df.eval('a + b') + df = pd.DataFrame(np.random.randn(5, 2), columns=["a", "b"]) + df.eval("a + b") Any expression that is a valid :func:`pandas.eval` expression is also a valid :meth:`DataFrame.eval` expression, with the added benefit that you don't have to @@ -617,9 +614,9 @@ on the original ``DataFrame`` or return a copy with the new column. .. 
ipython:: python df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df.eval('c = a + b', inplace=True) - df.eval('d = a + b + c', inplace=True) - df.eval('a = 1', inplace=True) + df.eval("c = a + b", inplace=True) + df.eval("d = a + b + c", inplace=True) + df.eval("a = 1", inplace=True) df When ``inplace`` is set to ``False``, the default, a copy of the ``DataFrame`` with the @@ -628,7 +625,7 @@ new or modified columns is returned and the original frame is unchanged. .. ipython:: python df - df.eval('e = a - c', inplace=False) + df.eval("e = a - c", inplace=False) df As a convenience, multiple assignments can be performed by using a @@ -636,19 +633,22 @@ multi-line string. .. ipython:: python - df.eval(""" + df.eval( + """ c = a + b d = a + b + c - a = 1""", inplace=False) + a = 1""", + inplace=False, + ) The equivalent in standard Python would be .. ipython:: python df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df['c'] = df['a'] + df['b'] - df['d'] = df['a'] + df['b'] + df['c'] - df['a'] = 1 + df["c"] = df["a"] + df["b"] + df["d"] = df["a"] + df["b"] + df["c"] + df["a"] = 1 df The ``query`` method has a ``inplace`` keyword which determines @@ -657,8 +657,8 @@ whether the query modifies the original frame. .. ipython:: python df = pd.DataFrame(dict(a=range(5), b=range(5, 10))) - df.query('a > 2') - df.query('a > 2', inplace=True) + df.query("a > 2") + df.query("a > 2", inplace=True) df Local variables @@ -669,10 +669,10 @@ expression by placing the ``@`` character in front of the name. For example, .. ipython:: python - df = pd.DataFrame(np.random.randn(5, 2), columns=list('ab')) + df = pd.DataFrame(np.random.randn(5, 2), columns=list("ab")) newcol = np.random.randn(len(df)) - df.eval('b + @newcol') - df.query('b < @newcol') + df.eval("b + @newcol") + df.query("b < @newcol") If you don't prefix the local variable with ``@``, pandas will raise an exception telling you the variable is undefined. @@ -685,8 +685,8 @@ name in an expression. .. 
ipython:: python a = np.random.randn() - df.query('@a < a') - df.loc[a < df['a']] # same as the previous expression + df.query("@a < a") + df.loc[a < df["a"]] # same as the previous expression With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it isn't defined in that context. ``pandas`` will let you know this if you try to @@ -696,14 +696,14 @@ use ``@`` in a top-level call to :func:`pandas.eval`. For example, :okexcept: a, b = 1, 2 - pd.eval('@a + b') + pd.eval("@a + b") In this case, you should simply refer to the variables like you would in standard Python. .. ipython:: python - pd.eval('a + b') + pd.eval("a + b") :func:`pandas.eval` parsers @@ -723,10 +723,10 @@ semantics. .. ipython:: python - expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)' - x = pd.eval(expr, parser='python') - expr_no_parens = 'df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0' - y = pd.eval(expr_no_parens, parser='pandas') + expr = "(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)" + x = pd.eval(expr, parser="python") + expr_no_parens = "df1 > 0 & df2 > 0 & df3 > 0 & df4 > 0" + y = pd.eval(expr_no_parens, parser="pandas") np.all(x == y) @@ -735,10 +735,10 @@ well: .. ipython:: python - expr = '(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)' - x = pd.eval(expr, parser='python') - expr_with_ands = 'df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0' - y = pd.eval(expr_with_ands, parser='pandas') + expr = "(df1 > 0) & (df2 > 0) & (df3 > 0) & (df4 > 0)" + x = pd.eval(expr, parser="python") + expr_with_ands = "df1 > 0 and df2 > 0 and df3 > 0 and df4 > 0" + y = pd.eval(expr_with_ands, parser="pandas") np.all(x == y) @@ -768,7 +768,7 @@ is a bit slower (not by much) than evaluating the same expression in Python .. ipython:: python - %timeit pd.eval('df1 + df2 + df3 + df4', engine='python') + %timeit pd.eval("df1 + df2 + df3 + df4", engine="python") :func:`pandas.eval` performance @@ -812,10 +812,11 @@ you have an expression--for example .. 
ipython:: python - df = pd.DataFrame({'strings': np.repeat(list('cba'), 3), - 'nums': np.repeat(range(3), 3)}) + df = pd.DataFrame( + {"strings": np.repeat(list("cba"), 3), "nums": np.repeat(range(3), 3)} + ) df - df.query('strings == "a" and nums == 1') + df.query("strings == 'a' and nums == 1") the numeric part of the comparison (``nums == 1``) will be evaluated by ``numexpr``. diff --git a/setup.cfg b/setup.cfg index 73986f692b6cd..8702e903d825b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,7 +39,8 @@ bootstrap = import pandas as pd np # avoiding error when importing again numpy or pandas pd # (in some cases we want to do it to show users) -ignore = E402, # module level import not at top of file +ignore = E203, # space before : (needed for how black formats slicing) + E402, # module level import not at top of file W503, # line break before binary operator # Classes/functions in different blocks can generate those errors E302, # expected 2 blank lines, found 0 From 08c312bc07c2a5e3cfd527e11a6f46e2379f3c47 Mon Sep 17 00:00:00 2001 From: Levi Matus Date: Sun, 4 Oct 2020 17:13:31 -0300 Subject: [PATCH 1012/1025] DOC: normalize usage of word "pandas" (#36845) --- doc/source/development/code_style.rst | 2 +- doc/source/development/contributing.rst | 6 ++--- doc/source/development/maintaining.rst | 2 +- doc/source/ecosystem.rst | 22 +++++++++---------- .../comparison/comparison_with_r.rst | 16 +++++++------- .../comparison/comparison_with_stata.rst | 8 +++---- doc/source/getting_started/install.rst | 4 ++-- doc/source/getting_started/overview.rst | 2 +- doc/source/reference/arrays.rst | 14 ++++++------ doc/source/reference/series.rst | 2 +- doc/source/user_guide/basics.rst | 18 +++++++-------- doc/source/user_guide/boolean.rst | 2 +- doc/source/user_guide/categorical.rst | 4 ++-- doc/source/user_guide/cookbook.rst | 2 +- doc/source/user_guide/dsintro.rst | 12 +++++----- doc/source/user_guide/duplicates.rst | 2 +- doc/source/user_guide/enhancingperf.rst | 2 +- 
doc/source/user_guide/groupby.rst | 4 ++-- doc/source/user_guide/indexing.rst | 10 ++++----- doc/source/user_guide/integer_na.rst | 2 +- doc/source/user_guide/io.rst | 20 ++++++++--------- doc/source/user_guide/missing_data.rst | 4 ++-- doc/source/user_guide/scale.rst | 8 +++---- doc/source/user_guide/sparse.rst | 4 ++-- doc/source/user_guide/timedeltas.rst | 2 +- doc/source/user_guide/timeseries.rst | 2 +- doc/source/user_guide/visualization.rst | 4 ++-- doc/source/whatsnew/v0.11.0.rst | 2 +- doc/source/whatsnew/v0.13.0.rst | 2 +- doc/source/whatsnew/v0.17.0.rst | 4 ++-- doc/source/whatsnew/v0.19.0.rst | 6 ++--- doc/source/whatsnew/v0.20.0.rst | 8 +++---- doc/source/whatsnew/v0.21.0.rst | 4 ++-- doc/source/whatsnew/v0.21.1.rst | 2 +- doc/source/whatsnew/v0.22.0.rst | 2 +- doc/source/whatsnew/v0.23.0.rst | 12 +++++----- doc/source/whatsnew/v0.23.2.rst | 2 +- doc/source/whatsnew/v0.24.0.rst | 10 ++++----- doc/source/whatsnew/v0.25.0.rst | 16 +++++++------- doc/source/whatsnew/v0.25.1.rst | 2 +- doc/source/whatsnew/v0.25.2.rst | 2 +- doc/source/whatsnew/v1.0.0.rst | 8 +++---- doc/source/whatsnew/v1.1.3.rst | 2 +- doc/source/whatsnew/v1.2.0.rst | 4 ++-- 44 files changed, 134 insertions(+), 134 deletions(-) diff --git a/doc/source/development/code_style.rst b/doc/source/development/code_style.rst index 387f65ea583a0..5aa1c1099d6e0 100644 --- a/doc/source/development/code_style.rst +++ b/doc/source/development/code_style.rst @@ -9,7 +9,7 @@ pandas code style guide .. contents:: Table of contents: :local: -*pandas* follows the `PEP8 `_ +pandas follows the `PEP8 `_ standard and uses `Black `_ and `Flake8 `_ to ensure a consistent code format throughout the project. 
For details see the diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index d6955c5d4b8d2..17eba825d1c29 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -155,7 +155,7 @@ Using a Docker container Instead of manually setting up a development environment, you can use `Docker `_ to automatically create the environment with just several -commands. Pandas provides a ``DockerFile`` in the root directory to build a Docker image +commands. pandas provides a ``DockerFile`` in the root directory to build a Docker image with a full pandas development environment. **Docker Commands** @@ -190,7 +190,7 @@ Note that you might need to rebuild the C extensions if/when you merge with upst Installing a C compiler ~~~~~~~~~~~~~~~~~~~~~~~ -Pandas uses C extensions (mostly written using Cython) to speed up certain +pandas uses C extensions (mostly written using Cython) to speed up certain operations. To install pandas from source, you need to compile these C extensions, which means you need a C compiler. This process depends on which platform you're using. @@ -1219,7 +1219,7 @@ This test shows off several useful features of Hypothesis, as well as demonstrating a good use-case: checking properties that should hold over a large or complicated domain of inputs. -To keep the Pandas test suite running quickly, parametrized tests are +To keep the pandas test suite running quickly, parametrized tests are preferred if the inputs or logic are simple, with Hypothesis tests reserved for cases with complex logic or where there are too many combinations of options or subtle interactions to test (or think of!) all of them. 
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst index cd084ab263477..2a21704c27005 100644 --- a/doc/source/development/maintaining.rst +++ b/doc/source/development/maintaining.rst @@ -207,7 +207,7 @@ Only core team members can merge pull requests. We have a few guidelines. 1. You should typically not self-merge your own pull requests. Exceptions include things like small changes to fix CI (e.g. pinning a package version). 2. You should not merge pull requests that have an active discussion, or pull - requests that has any ``-1`` votes from a core maintainer. Pandas operates + requests that has any ``-1`` votes from a core maintainer. pandas operates by consensus. 3. For larger changes, it's good to have a +1 from at least two core team members. diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index 4086f64817568..8f04d05cfcb04 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -98,7 +98,7 @@ With Altair, you can spend more time understanding your data and its meaning. Altair's API is simple, friendly and consistent and built on top of the powerful Vega-Lite JSON specification. This elegant simplicity produces beautiful and effective visualizations with a -minimal amount of code. Altair works with Pandas DataFrames. +minimal amount of code. Altair works with pandas DataFrames. `Bokeh `__ @@ -110,7 +110,7 @@ graphics in the style of Protovis/D3, while delivering high-performance interact large data to thin clients. `Pandas-Bokeh `__ provides a high level API -for Bokeh that can be loaded as a native Pandas plotting backend via +for Bokeh that can be loaded as a native pandas plotting backend via .. code:: python @@ -187,7 +187,7 @@ IDE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IPython is an interactive command shell and distributed computing -environment. IPython tab completion works with Pandas methods and also +environment. 
IPython tab completion works with pandas methods and also attributes like DataFrame columns. `Jupyter Notebook / Jupyter Lab `__ @@ -201,7 +201,7 @@ Jupyter notebooks can be converted to a number of open standard output formats Python) through 'Download As' in the web interface and ``jupyter convert`` in a shell. -Pandas DataFrames implement ``_repr_html_``and ``_repr_latex`` methods +pandas DataFrames implement ``_repr_html_``and ``_repr_latex`` methods which are utilized by Jupyter Notebook for displaying (abbreviated) HTML or LaTeX tables. LaTeX output is properly escaped. (Note: HTML tables may or may not be @@ -229,7 +229,7 @@ Its `Variable Explorer `__ allows users to view, manipulate and edit pandas ``Index``, ``Series``, and ``DataFrame`` objects like a "spreadsheet", including copying and modifying values, sorting, displaying a "heatmap", converting data types and more. -Pandas objects can also be renamed, duplicated, new columns added, +pandas objects can also be renamed, duplicated, new columns added, copyed/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file. Spyder can also import data from a variety of plain text and binary files or the clipboard into a new pandas DataFrame via a sophisticated import wizard. @@ -276,13 +276,13 @@ The following data feeds are available: `Quandl/Python `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Quandl API for Python wraps the Quandl REST API to return -Pandas DataFrames with timeseries indexes. +pandas DataFrames with timeseries indexes. `Pydatastream `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PyDatastream is a Python interface to the `Refinitiv Datastream (DWS) `__ -REST API to return indexed Pandas DataFrames with financial data. +REST API to return indexed pandas DataFrames with financial data. This package requires valid credentials for this API (non free). 
`pandaSDMX `__ @@ -357,7 +357,7 @@ Out-of-core ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Blaze provides a standard API for doing computations with various -in-memory and on-disk backends: NumPy, Pandas, SQLAlchemy, MongoDB, PyTables, +in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables, PySpark. `Dask `__ @@ -403,7 +403,7 @@ If also displays progress bars. `Ray `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas on Ray is an early stage DataFrame library that wraps Pandas and transparently distributes the data and computation. The user does not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can continue using their previous Pandas notebooks while experiencing a considerable speedup from Pandas on Ray, even on a single machine. Only a modification of the import statement is needed, as we demonstrate below. Once you’ve changed your import statement, you’re ready to use Pandas on Ray just like you would Pandas. +pandas on Ray is an early stage DataFrame library that wraps pandas and transparently distributes the data and computation. The user does not need to know how many cores their system has, nor do they need to specify how to distribute the data. In fact, users can continue using their previous pandas notebooks while experiencing a considerable speedup from pandas on Ray, even on a single machine. Only a modification of the import statement is needed, as we demonstrate below. Once you’ve changed your import statement, you’re ready to use pandas on Ray just like you would pandas. .. code:: python @@ -414,7 +414,7 @@ Pandas on Ray is an early stage DataFrame library that wraps Pandas and transpar `Vaex `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. 
Vaex is a python library for Out-of-Core DataFrames (similar to Pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). +Increasingly, packages are being built on top of pandas to address specific needs in data preparation, analysis and visualization. Vaex is a python library for Out-of-Core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation etc, on an N-dimensional grid up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, zero memory copy policy and lazy computations for best performance (no memory wasted). * vaex.from_pandas * vaex.to_pandas_df @@ -424,7 +424,7 @@ Increasingly, packages are being built on top of pandas to address specific need Extension data types -------------------- -Pandas provides an interface for defining +pandas provides an interface for defining :ref:`extension types ` to extend NumPy's type system. The following libraries implement that interface to provide types not found in NumPy or pandas, which work well with pandas' data containers. 
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst index 358bb6ad951f0..864081002086b 100644 --- a/doc/source/getting_started/comparison/comparison_with_r.rst +++ b/doc/source/getting_started/comparison/comparison_with_r.rst @@ -5,11 +5,11 @@ Comparison with R / R libraries ******************************* -Since ``pandas`` aims to provide a lot of the data manipulation and analysis +Since pandas aims to provide a lot of the data manipulation and analysis functionality that people use `R `__ for, this page was started to provide a more detailed look at the `R language `__ and its many third -party libraries as they relate to ``pandas``. In comparisons with R and CRAN +party libraries as they relate to pandas. In comparisons with R and CRAN libraries, we care about the following things: * **Functionality / flexibility**: what can/cannot be done with each tool @@ -21,7 +21,7 @@ libraries, we care about the following things: This page is also here to offer a bit of a translation guide for users of these R packages. -For transfer of ``DataFrame`` objects from ``pandas`` to R, one option is to +For transfer of ``DataFrame`` objects from pandas to R, one option is to use HDF5 files, see :ref:`io.external_compatibility` for an example. @@ -118,7 +118,7 @@ or by integer location df <- data.frame(matrix(rnorm(1000), ncol=100)) df[, c(1:10, 25:30, 40, 50:100)] -Selecting multiple columns by name in ``pandas`` is straightforward +Selecting multiple columns by name in pandas is straightforward .. ipython:: python @@ -235,7 +235,7 @@ since the subclass sizes are possibly irregular. Using a data.frame called tapply(baseball$batting.average, baseball.example$team, max) -In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: +In pandas we may use :meth:`~pandas.pivot_table` method to handle this: .. 
ipython:: python @@ -268,7 +268,7 @@ column's values are less than another column's values: subset(df, a <= b) df[df$a <= df$b,] # note the comma -In ``pandas``, there are a few ways to perform subsetting. You can use +In pandas, there are a few ways to perform subsetting. You can use :meth:`~pandas.DataFrame.query` or pass an expression as if it were an index/slice as well as standard boolean indexing: @@ -295,7 +295,7 @@ An expression using a data.frame called ``df`` in R with the columns ``a`` and with(df, a + b) df$a + df$b # same as the previous expression -In ``pandas`` the equivalent expression, using the +In pandas the equivalent expression, using the :meth:`~pandas.DataFrame.eval` method, would be: .. ipython:: python @@ -347,7 +347,7 @@ summarize ``x`` by ``month``: mean = round(mean(x), 2), sd = round(sd(x), 2)) -In ``pandas`` the equivalent expression, using the +In pandas the equivalent expression, using the :meth:`~pandas.DataFrame.groupby` method, would be: .. ipython:: python diff --git a/doc/source/getting_started/comparison/comparison_with_stata.rst b/doc/source/getting_started/comparison/comparison_with_stata.rst index 7b8d9c6be61db..014506cc18327 100644 --- a/doc/source/getting_started/comparison/comparison_with_stata.rst +++ b/doc/source/getting_started/comparison/comparison_with_stata.rst @@ -146,7 +146,7 @@ the pandas command would be: # alternatively, read_table is an alias to read_csv with tab delimiter tips = pd.read_table("tips.csv", header=None) -Pandas can also read Stata data sets in ``.dta`` format with the :func:`read_stata` function. +pandas can also read Stata data sets in ``.dta`` format with the :func:`read_stata` function. .. code-block:: python @@ -172,7 +172,7 @@ Similarly in pandas, the opposite of ``read_csv`` is :meth:`DataFrame.to_csv`. tips.to_csv("tips2.csv") -Pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` method. 
+pandas can also export to Stata file format with the :meth:`DataFrame.to_stata` method. .. code-block:: python @@ -583,7 +583,7 @@ should be used for comparisons. outer_join[pd.isna(outer_join["value_x"])] outer_join[pd.notna(outer_join["value_x"])] -Pandas also provides a variety of methods to work with missing data -- some of +pandas also provides a variety of methods to work with missing data -- some of which would be challenging to express in Stata. For example, there are methods to drop all rows with any missing values, replacing missing values with a specified value, like the mean, or forward filling from previous rows. See the @@ -674,7 +674,7 @@ Other considerations Disk vs memory ~~~~~~~~~~~~~~ -Pandas and Stata both operate exclusively in memory. This means that the size of +pandas and Stata both operate exclusively in memory. This means that the size of data able to be loaded in pandas is limited by your machine's memory. If out of core processing is needed, one possibility is the `dask.dataframe `_ diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index a6341451b1b80..70d145c54e919 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -184,7 +184,7 @@ You can find simple installation instructions for pandas in this document: ``ins Installing from source ~~~~~~~~~~~~~~~~~~~~~~ -See the :ref:`contributing guide ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a *pandas* development environment. +See the :ref:`contributing guide ` for complete instructions on building from the git source tree. Further, see :ref:`creating a development environment ` if you wish to create a pandas development environment. 
Running the test suite ---------------------- @@ -249,7 +249,7 @@ Recommended dependencies Optional dependencies ~~~~~~~~~~~~~~~~~~~~~ -Pandas has many optional dependencies that are only used for specific methods. +pandas has many optional dependencies that are only used for specific methods. For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while :meth:`DataFrame.to_markdown` requires the ``tabulate`` package. If the optional dependency is not installed, pandas will raise an ``ImportError`` when diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst index 57d87d4ec8a91..3043cf25c5312 100644 --- a/doc/source/getting_started/overview.rst +++ b/doc/source/getting_started/overview.rst @@ -6,7 +6,7 @@ Package overview **************** -**pandas** is a `Python `__ package providing fast, +pandas is a `Python `__ package providing fast, flexible, and expressive data structures designed to make working with "relational" or "labeled" data both easy and intuitive. It aims to be the fundamental high-level building block for doing practical, **real-world** data diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst index 1725c415fa020..5c068d8404cd6 100644 --- a/doc/source/reference/arrays.rst +++ b/doc/source/reference/arrays.rst @@ -16,7 +16,7 @@ For some data types, pandas extends NumPy's type system. String aliases for thes can be found at :ref:`basics.dtypes`. 
=================== ========================= ================== ============================= -Kind of Data Pandas Data Type Scalar Array +Kind of Data pandas Data Type Scalar Array =================== ========================= ================== ============================= TZ-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :ref:`api.arrays.datetime` Timedeltas (none) :class:`Timedelta` :ref:`api.arrays.timedelta` @@ -29,7 +29,7 @@ Strings :class:`StringDtype` :class:`str` :ref:`api.array Boolean (with NA) :class:`BooleanDtype` :class:`bool` :ref:`api.arrays.bool` =================== ========================= ================== ============================= -Pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). +pandas and third-party libraries can extend NumPy's type system (see :ref:`extending.extension-types`). The top-level :meth:`array` method can be used to create a new array, which may be stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFrame`. @@ -43,7 +43,7 @@ stored in a :class:`Series`, :class:`Index`, or as a column in a :class:`DataFra Datetime data ------------- -NumPy cannot natively represent timezone-aware datetimes. Pandas supports this +NumPy cannot natively represent timezone-aware datetimes. pandas supports this with the :class:`arrays.DatetimeArray` extension array, which can hold timezone-naive or timezone-aware values. @@ -162,7 +162,7 @@ If the data are tz-aware, then every value in the array must have the same timez Timedelta data -------------- -NumPy can natively represent timedeltas. Pandas provides :class:`Timedelta` +NumPy can natively represent timedeltas. pandas provides :class:`Timedelta` for symmetry with :class:`Timestamp`. .. autosummary:: @@ -217,7 +217,7 @@ A collection of timedeltas may be stored in a :class:`TimedeltaArray`. Timespan data ------------- -Pandas represents spans of times as :class:`Period` objects. 
+pandas represents spans of times as :class:`Period` objects. Period ------ @@ -352,7 +352,7 @@ Nullable integer ---------------- :class:`numpy.ndarray` cannot natively represent integer-data with missing values. -Pandas provides this through :class:`arrays.IntegerArray`. +pandas provides this through :class:`arrays.IntegerArray`. .. autosummary:: :toctree: api/ @@ -378,7 +378,7 @@ Pandas provides this through :class:`arrays.IntegerArray`. Categorical data ---------------- -Pandas defines a custom data type for representing data that can take only a +pandas defines a custom data type for representing data that can take only a limited, fixed set of values. The dtype of a ``Categorical`` can be described by a :class:`pandas.api.types.CategoricalDtype`. diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 5131d35334693..f1069e46b56cc 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -280,7 +280,7 @@ Time Series-related Accessors --------- -Pandas provides dtype-specific methods under various accessors. +pandas provides dtype-specific methods under various accessors. These are separate namespaces within :class:`Series` that only apply to specific data types. diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index e348111fe7881..5fa214d2ed389 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -52,7 +52,7 @@ Note, **these attributes can be safely assigned to**! df.columns = [x.lower() for x in df.columns] df -Pandas objects (:class:`Index`, :class:`Series`, :class:`DataFrame`) can be +pandas objects (:class:`Index`, :class:`Series`, :class:`DataFrame`) can be thought of as containers for arrays, which hold the actual data and do the actual computation. For many types, the underlying array is a :class:`numpy.ndarray`. 
However, pandas and 3rd party libraries may *extend* @@ -410,7 +410,7 @@ data structure with a scalar value: pd.Series(['foo', 'bar', 'baz']) == 'foo' pd.Index(['foo', 'bar', 'baz']) == 'foo' -Pandas also handles element-wise comparisons between different array-like +pandas also handles element-wise comparisons between different array-like objects of the same length: .. ipython:: python @@ -804,7 +804,7 @@ Is equivalent to: (df_p.pipe(extract_city_name) .pipe(add_country_name, country_name="US")) -Pandas encourages the second style, which is known as method chaining. +pandas encourages the second style, which is known as method chaining. ``pipe`` makes it easy to use your own or another library's functions in method chains, alongside pandas' methods. @@ -1498,7 +1498,7 @@ Thus, for example, iterating over a DataFrame gives you the column names: print(col) -Pandas objects also have the dict-like :meth:`~DataFrame.items` method to +pandas objects also have the dict-like :meth:`~DataFrame.items` method to iterate over the (key, value) pairs. To iterate over the rows of a DataFrame, you can use the following methods: @@ -1741,7 +1741,7 @@ always uses them). .. note:: Prior to pandas 1.0, string methods were only available on ``object`` -dtype - ``Series``. Pandas 1.0 added the :class:`StringDtype` which is dedicated + ``Series``. pandas 1.0 added the :class:`StringDtype` which is dedicated to strings. See :ref:`text.types` for more. Please see :ref:`Vectorized String Methods ` for a complete @@ -1752,7 +1752,7 @@ description. Sorting ------- -Pandas supports three kinds of sorting: sorting by index labels, +pandas supports three kinds of sorting: sorting by index labels, sorting by column values, and sorting by a combination of both. .. _basics.sort_index: @@ -1995,7 +1995,7 @@ columns of a DataFrame. NumPy provides support for ``float``, ``int``, ``bool``, ``timedelta64[ns]`` and ``datetime64[ns]`` (note that NumPy does not support timezone-aware datetimes). 
-Pandas and third-party libraries *extend* NumPy's type system in a few places. +pandas and third-party libraries *extend* NumPy's type system in a few places. This section describes the extensions pandas has made internally. See :ref:`extending.extension-types` for how to write your own extension that works with pandas. See :ref:`ecosystem.extensions` for a list of third-party @@ -2032,7 +2032,7 @@ documentation sections for more on each type. | Boolean (with NA) | :class:`BooleanDtype` | :class:`bool` | :class:`arrays.BooleanArray` | ``'boolean'`` | :ref:`api.arrays.bool` | +-------------------+---------------------------+--------------------+-------------------------------+-----------------------------------------+-------------------------------+ -Pandas has two ways to store strings. +pandas has two ways to store strings. 1. ``object`` dtype, which can hold any Python object, including strings. 2. :class:`StringDtype`, which is dedicated to strings. @@ -2424,5 +2424,5 @@ All NumPy dtypes are subclasses of ``numpy.generic``: .. note:: - Pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal + pandas also defines the types ``category``, and ``datetime64[ns, tz]``, which are not integrated into the normal NumPy hierarchy and won't show up with the above function. diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index d690c1093399a..76c922fcef638 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -82,7 +82,7 @@ the ``NA`` really is ``True`` or ``False``, since ``True & True`` is ``True``, but ``True & False`` is ``False``, so we can't determine the output. -This differs from how ``np.nan`` behaves in logical operations. Pandas treated +This differs from how ``np.nan`` behaves in logical operations. pandas treated ``np.nan`` is *always false in the output*. 
In ``or`` diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 6a8e1767ef7e8..67f11bbb45b02 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1011,7 +1011,7 @@ The following differences to R's factor functions can be observed: * In contrast to R's ``factor`` function, using categorical data as the sole input to create a new categorical series will *not* remove unused categories but create a new categorical series which is equal to the passed in one! -* R allows for missing values to be included in its ``levels`` (pandas' ``categories``). Pandas +* R allows for missing values to be included in its ``levels`` (pandas' ``categories``). pandas does not allow ``NaN`` categories, but missing values can still be in the ``values``. @@ -1107,7 +1107,7 @@ are not numeric data (even in the case that ``.categories`` is numeric). dtype in apply ~~~~~~~~~~~~~~ -Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get +pandas currently does not preserve the dtype in apply functions: If you apply along rows you get a ``Series`` of ``object`` ``dtype`` (same as getting a row -> getting one element will return a basic type) and applying along columns will also convert to object. ``NaN`` values are unaffected. You can use ``fillna`` to handle missing values before applying a function. diff --git a/doc/source/user_guide/cookbook.rst b/doc/source/user_guide/cookbook.rst index 0a30d865f3c23..214b8a680fa7e 100644 --- a/doc/source/user_guide/cookbook.rst +++ b/doc/source/user_guide/cookbook.rst @@ -15,7 +15,7 @@ Simplified, condensed, new-user friendly, in-line examples have been inserted wh augment the Stack-Overflow and GitHub links. Many of the links contain expanded information, above what the in-line examples offer. -Pandas (pd) and Numpy (np) are the only two abbreviated imported modules. 
The rest are kept +pandas (pd) and NumPy (np) are the only two abbreviated imported modules. The rest are kept explicitly imported for newer users. These examples are written for Python 3. Minor tweaks might be necessary for earlier python diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index d698b316d321e..905877cca61db 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -78,13 +78,13 @@ Series can be instantiated from dicts: When the data is a dict, and an index is not passed, the ``Series`` index will be ordered by the dict's insertion order, if you're using Python - version >= 3.6 and Pandas version >= 0.23. + version >= 3.6 and pandas version >= 0.23. - If you're using Python < 3.6 or Pandas < 0.23, and an index is not passed, + If you're using Python < 3.6 or pandas < 0.23, and an index is not passed, the ``Series`` index will be the lexically ordered list of dict keys. In the example above, if you were on a Python version lower than 3.6 or a -Pandas version lower than 0.23, the ``Series`` would be ordered by the lexical +pandas version lower than 0.23, the ``Series`` would be ordered by the lexical order of the dict keys (i.e. ``['a', 'b', 'c']`` rather than ``['b', 'a', 'c']``). If an index is passed, the values in data corresponding to the labels in the @@ -151,7 +151,7 @@ index (to disable :ref:`automatic alignment `, for example). :attr:`Series.array` will always be an :class:`~pandas.api.extensions.ExtensionArray`. Briefly, an ExtensionArray is a thin wrapper around one or more *concrete* arrays like a -:class:`numpy.ndarray`. Pandas knows how to take an ``ExtensionArray`` and +:class:`numpy.ndarray`. pandas knows how to take an ``ExtensionArray`` and store it in a ``Series`` or a column of a ``DataFrame``. See :ref:`basics.dtypes` for more. @@ -290,9 +290,9 @@ based on common sense rules.
When the data is a dict, and ``columns`` is not specified, the ``DataFrame`` columns will be ordered by the dict's insertion order, if you are using - Python version >= 3.6 and Pandas >= 0.23. + Python version >= 3.6 and pandas >= 0.23. - If you are using Python < 3.6 or Pandas < 0.23, and ``columns`` is not + If you are using Python < 3.6 or pandas < 0.23, and ``columns`` is not specified, the ``DataFrame`` columns will be the lexically ordered list of dict keys. diff --git a/doc/source/user_guide/duplicates.rst b/doc/source/user_guide/duplicates.rst index 2993ca7799510..7cda067fb24ad 100644 --- a/doc/source/user_guide/duplicates.rst +++ b/doc/source/user_guide/duplicates.rst @@ -79,7 +79,7 @@ unique with :attr:`Index.is_unique`: .. note:: Checking whether an index is unique is somewhat expensive for large datasets. - Pandas does cache this result, so re-checking on the same index is very fast. + pandas does cache this result, so re-checking on the same index is very fast. :meth:`Index.duplicated` will return a boolean ndarray indicating whether a label is repeated. diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index d30554986607d..cc8de98165fac 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -689,7 +689,7 @@ name in an expression. df.loc[a < df["a"]] # same as the previous expression With :func:`pandas.eval` you cannot use the ``@`` prefix *at all*, because it -isn't defined in that context. ``pandas`` will let you know this if you try to +isn't defined in that context. pandas will let you know this if you try to use ``@`` in a top-level call to :func:`pandas.eval`. For example, .. 
ipython:: python diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index ec64442319a84..6427cea6fa510 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -614,7 +614,7 @@ For a grouped ``DataFrame``, you can rename in a similar manner: grouped["C"].agg(["sum", "sum"]) - Pandas *does* allow you to provide multiple lambdas. In this case, pandas + pandas *does* allow you to provide multiple lambdas. In this case, pandas will mangle the name of the (nameless) lambda functions, appending ``_`` to each subsequent lambda. @@ -636,7 +636,7 @@ accepts the special syntax in :meth:`GroupBy.agg`, known as "named aggregation", - The keywords are the *output* column names - The values are tuples whose first element is the column to select - and the second element is the aggregation to apply to that column. Pandas + and the second element is the aggregation to apply to that column. pandas provides the ``pandas.NamedAgg`` namedtuple with the fields ``['column', 'aggfunc']`` to make it clearer what the arguments are. As usual, the aggregation can be a callable or a string alias. diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index b11baad1e3eb5..530fdfba7d12c 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -46,7 +46,7 @@ Different choices for indexing ------------------------------ Object selection has had a number of user-requested additions in order to -support more explicit location based indexing. Pandas now supports three types +support more explicit location based indexing. pandas now supports three types of multi-axis indexing. * ``.loc`` is primarily label based, but may also be used with a boolean array. ``.loc`` will raise ``KeyError`` when the items are not found. Allowed inputs are: @@ -315,7 +315,7 @@ Selection by label .. 
versionchanged:: 1.0.0 - Pandas will raise a ``KeyError`` if indexing with a list with missing labels. See :ref:`list-like Using loc with + pandas will raise a ``KeyError`` if indexing with a list with missing labels. See :ref:`list-like Using loc with missing keys in a list is Deprecated `. pandas provides a suite of methods in order to have **purely label based indexing**. This is a strict inclusion based protocol. @@ -433,7 +433,7 @@ Selection by position This is sometimes called ``chained assignment`` and should be avoided. See :ref:`Returning a View versus Copy `. -Pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are ``0-based`` indexing. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. +pandas provides a suite of methods in order to get **purely integer based indexing**. The semantics follow closely Python and NumPy slicing. These are ``0-based`` indexing. When slicing, the start bound is *included*, while the upper bound is *excluded*. Trying to use a non-integer, even a **valid** label will raise an ``IndexError``. The ``.iloc`` attribute is the primary access method. The following are valid inputs: @@ -1812,7 +1812,7 @@ about! Sometimes a ``SettingWithCopy`` warning will arise at times when there's no obvious chained indexing going on. **These** are the bugs that -``SettingWithCopy`` is designed to catch! Pandas is probably trying to warn you +``SettingWithCopy`` is designed to catch! pandas is probably trying to warn you that you've done this: .. code-block:: python @@ -1835,7 +1835,7 @@ When you use chained indexing, the order and type of the indexing operation partially determine whether the result is a slice into the original object, or a copy of the slice. 
-Pandas has the ``SettingWithCopyWarning`` because assigning to a copy of a +pandas has the ``SettingWithCopyWarning`` because assigning to a copy of a slice is frequently not intentional, but a mistake caused by chained indexing returning a copy where a slice was expected. diff --git a/doc/source/user_guide/integer_na.rst b/doc/source/user_guide/integer_na.rst index acee1638570f7..be38736f493b5 100644 --- a/doc/source/user_guide/integer_na.rst +++ b/doc/source/user_guide/integer_na.rst @@ -30,7 +30,7 @@ numbers. Construction ------------ -Pandas can represent integer data with possibly missing values using +pandas can represent integer data with possibly missing values using :class:`arrays.IntegerArray`. This is an :ref:`extension types ` implemented within pandas. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 184894bbafe28..ae22ee836cd8c 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -894,7 +894,7 @@ take full advantage of the flexibility of the date parsing API: ) df -Pandas will try to call the ``date_parser`` function in three different ways. If +pandas will try to call the ``date_parser`` function in three different ways. If an exception is raised, the next one is tried: 1. ``date_parser`` is first called with one or more arrays as arguments, @@ -926,7 +926,7 @@ Note that performance-wise, you should try these methods of parsing dates in ord Parsing a CSV with mixed timezones ++++++++++++++++++++++++++++++++++ -Pandas cannot natively represent a column or index with mixed timezones. If your CSV +pandas cannot natively represent a column or index with mixed timezones. If your CSV file contains columns with a mixture of timezones, the default result will be an object-dtype column with strings, even with ``parse_dates``. @@ -1602,7 +1602,7 @@ python engine is selected explicitly using ``engine='python'``. 
Reading/writing remote files '''''''''''''''''''''''''''' -You can pass in a URL to read or write remote files to many of Pandas' IO +You can pass in a URL to read or write remote files to many of pandas' IO functions - the following example shows reading a CSV file: .. code-block:: python @@ -2265,7 +2265,7 @@ The full list of types supported are described in the Table Schema spec. This table shows the mapping from pandas types: =============== ================= -Pandas type Table Schema type +pandas type Table Schema type =============== ================= int64 integer float64 number @@ -2661,7 +2661,7 @@ that contain URLs. url_df = pd.DataFrame( { - "name": ["Python", "Pandas"], + "name": ["Python", "pandas"], "url": ["https://www.python.org/", "https://pandas.pydata.org"], } ) @@ -3143,7 +3143,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. Writing Excel files to memory +++++++++++++++++++++++++++++ -Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or +pandas supports writing Excel files to buffer-like objects such as ``StringIO`` or ``BytesIO`` using :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python @@ -3177,7 +3177,7 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` Excel writer engines '''''''''''''''''''' -Pandas chooses an Excel writer via two methods: +pandas chooses an Excel writer via two methods: 1. the ``engine`` keyword argument 2. the filename extension (via the default specified in config options) @@ -3474,7 +3474,7 @@ for some advanced strategies .. warning:: - Pandas uses PyTables for reading and writing HDF5 files, which allows + pandas uses PyTables for reading and writing HDF5 files, which allows serializing object-dtype data with pickle. Loading pickled data received from untrusted sources can be unsafe. @@ -4734,7 +4734,7 @@ Several caveats. * Duplicate column names and non-string columns names are not supported. 
* The ``pyarrow`` engine always writes the index to the output, but ``fastparquet`` only writes non-default - indexes. This extra column can cause problems for non-Pandas consumers that are not expecting it. You can + indexes. This extra column can cause problems for non-pandas consumers that are not expecting it. You can force including or omitting indexes with the ``index`` argument, regardless of the underlying engine. * Index level names, if specified, must be strings. * In the ``pyarrow`` engine, categorical dtypes for non-string types can be serialized to parquet, but will de-serialize as their primitive dtype. @@ -4894,7 +4894,7 @@ ORC .. versionadded:: 1.0.0 Similar to the :ref:`parquet ` format, the `ORC Format `__ is a binary columnar serialization -for data frames. It is designed to make reading data frames efficient. Pandas provides *only* a reader for the +for data frames. It is designed to make reading data frames efficient. pandas provides *only* a reader for the ORC format, :func:`~pandas.read_orc`. This requires the `pyarrow `__ library. .. _io.sql: diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 3c97cc7da6edb..7eb377694910b 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -81,7 +81,7 @@ Integer dtypes and missing data ------------------------------- Because ``NaN`` is a float, a column of integers with even one missing values -is cast to floating-point dtype (see :ref:`gotchas.intna` for more). Pandas +is cast to floating-point dtype (see :ref:`gotchas.intna` for more). 
pandas provides a nullable integer array, which can be used by explicitly requesting the dtype: @@ -735,7 +735,7 @@ However, these can be filled in using :meth:`~DataFrame.fillna` and it will work reindexed[crit.fillna(False)] reindexed[crit.fillna(True)] -Pandas provides a nullable integer dtype, but you must explicitly request it +pandas provides a nullable integer dtype, but you must explicitly request it when creating the series or column. Notice that we use a capital "I" in the ``dtype="Int64"``. diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst index f36f27269a996..7f2419bc7f19d 100644 --- a/doc/source/user_guide/scale.rst +++ b/doc/source/user_guide/scale.rst @@ -4,7 +4,7 @@ Scaling to large datasets ************************* -Pandas provides data structures for in-memory analytics, which makes using pandas +pandas provides data structures for in-memory analytics, which makes using pandas to analyze datasets that are larger than memory datasets somewhat tricky. Even datasets that are a sizable fraction of memory become unwieldy, as some pandas operations need to make intermediate copies. @@ -13,7 +13,7 @@ This document provides a few recommendations for scaling your analysis to larger It's a complement to :ref:`enhancingperf`, which focuses on speeding up analysis for datasets that fit in memory. -But first, it's worth considering *not using pandas*. Pandas isn't the right +But first, it's worth considering *not using pandas*. pandas isn't the right tool for all situations. If you're working with very large datasets and a tool like PostgreSQL fits your needs, then you should probably be using that. Assuming you want or need the expressiveness and power of pandas, let's carry on. @@ -230,7 +230,7 @@ different library that implements these out-of-core algorithms for you. Use other libraries ------------------- -Pandas is just one library offering a DataFrame API. 
Because of its popularity, +pandas is just one library offering a DataFrame API. Because of its popularity, pandas' API has become something of a standard that other libraries implement. The pandas documentation maintains a list of libraries implementing a DataFrame API in :ref:`our ecosystem page `. @@ -259,7 +259,7 @@ Inspecting the ``ddf`` object, we see a few things * There are new attributes like ``.npartitions`` and ``.divisions`` The partitions and divisions are how Dask parallelizes computation. A **Dask** -DataFrame is made up of many **Pandas** DataFrames. A single method call on a +DataFrame is made up of many pandas DataFrames. A single method call on a Dask DataFrame ends up making many pandas method calls, and Dask knows how to coordinate everything to get the result. diff --git a/doc/source/user_guide/sparse.rst b/doc/source/user_guide/sparse.rst index 62e35cb994faf..3156e3088d860 100644 --- a/doc/source/user_guide/sparse.rst +++ b/doc/source/user_guide/sparse.rst @@ -6,7 +6,7 @@ Sparse data structures ********************** -Pandas provides data structures for efficiently storing sparse data. +pandas provides data structures for efficiently storing sparse data. These are not necessarily sparse in the typical "mostly 0". Rather, you can view these objects as being "compressed" where any data matching a specific value (``NaN`` / missing value, though any value can be chosen, including 0) is omitted. The compressed values are not actually stored in the array. @@ -116,7 +116,7 @@ Sparse accessor .. versionadded:: 0.24.0 -Pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` +pandas provides a ``.sparse`` accessor, similar to ``.str`` for string data, ``.cat`` for categorical data, and ``.dt`` for datetime-like data. This namespace provides attributes and methods that are specific to sparse data. 
diff --git a/doc/source/user_guide/timedeltas.rst b/doc/source/user_guide/timedeltas.rst index 971a415088220..cb265d34229dd 100644 --- a/doc/source/user_guide/timedeltas.rst +++ b/doc/source/user_guide/timedeltas.rst @@ -100,7 +100,7 @@ The ``unit`` keyword argument specifies the unit of the Timedelta: Timedelta limitations ~~~~~~~~~~~~~~~~~~~~~ -Pandas represents ``Timedeltas`` in nanosecond resolution using +pandas represents ``Timedeltas`` in nanosecond resolution using 64 bit integers. As such, the 64 bit integer limits determine the ``Timedelta`` limits. diff --git a/doc/source/user_guide/timeseries.rst b/doc/source/user_guide/timeseries.rst index 11ec90085d9bf..be2c67521dc5d 100644 --- a/doc/source/user_guide/timeseries.rst +++ b/doc/source/user_guide/timeseries.rst @@ -1549,7 +1549,7 @@ Converting to Python datetimes Resampling ---------- -Pandas has a simple, powerful, and efficient functionality for performing +pandas has a simple, powerful, and efficient functionality for performing resampling operations during frequency conversion (e.g., converting secondly data into 5-minutely data). This is extremely common in, but not limited to, financial applications. diff --git a/doc/source/user_guide/visualization.rst b/doc/source/user_guide/visualization.rst index 46ab29a52747a..a6c3d9814b03d 100644 --- a/doc/source/user_guide/visualization.rst +++ b/doc/source/user_guide/visualization.rst @@ -776,7 +776,7 @@ See the `matplotlib pie documentation `__ around the source of the ``RuntimeWarning`` to control how these conditions are handled. @@ -1372,7 +1372,7 @@ Deprecations - ``Timestamp.offset`` property (and named arg in the constructor), has been deprecated in favor of ``freq`` (:issue:`12160`) - ``pd.tseries.util.pivot_annual`` is deprecated. Use ``pivot_table`` as alternative, an example is :ref:`here ` (:issue:`736`) - ``pd.tseries.util.isleapyear`` has been deprecated and will be removed in a subsequent release. 
Datetime-likes now have a ``.is_leap_year`` property (:issue:`13727`) -- ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. Pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion (:issue:`13564`). +- ``Panel4D`` and ``PanelND`` constructors are deprecated and will be removed in a future version. The recommended way to represent these types of n-dimensional data are with the `xarray package `__. pandas provides a :meth:`~Panel4D.to_xarray` method to automate this conversion (:issue:`13564`). - ``pandas.tseries.frequencies.get_standard_freq`` is deprecated. Use ``pandas.tseries.frequencies.to_offset(freq).rule_code`` instead (:issue:`13874`) - ``pandas.tseries.frequencies.to_offset``'s ``freqstr`` keyword is deprecated in favor of ``freq`` (:issue:`13874`) - ``Categorical.from_array`` has been deprecated and will be removed in a future version (:issue:`13854`) diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 3f7a89112958b..a9e57f0039735 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -26,7 +26,7 @@ Highlights include: .. warning:: - Pandas has changed the internal structure and layout of the code base. + pandas has changed the internal structure and layout of the code base. This can affect imports that are not from the top-level ``pandas.*`` namespace, please see the changes :ref:`here `. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. @@ -243,7 +243,7 @@ The default is to infer the compression type from the extension (``compression=' UInt64 support improved ^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has significantly improved support for operations involving unsigned, +pandas has significantly improved support for operations involving unsigned, or purely non-negative, integers. 
Previously, handling these integers would result in improper rounding or data-type casting, leading to incorrect results. Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937`) @@ -333,7 +333,7 @@ You must enable this by setting the ``display.html.table_schema`` option to ``Tr SciPy sparse matrix from/to SparseDataFrame ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances. +pandas now supports creating sparse dataframes directly from ``scipy.sparse.spmatrix`` instances. See the :ref:`documentation ` for more information. (:issue:`4343`) All sparse formats are supported, but matrices that are not in :mod:`COOrdinate ` format will be converted, copying data as needed. @@ -1355,7 +1355,7 @@ Deprecate Panel ^^^^^^^^^^^^^^^ ``Panel`` is deprecated and will be removed in a future version. The recommended way to represent 3-D data are -with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas +with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. pandas provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:`13563`). .. code-block:: ipython diff --git a/doc/source/whatsnew/v0.21.0.rst b/doc/source/whatsnew/v0.21.0.rst index 926bcaa21ac3a..6035b89aa8643 100644 --- a/doc/source/whatsnew/v0.21.0.rst +++ b/doc/source/whatsnew/v0.21.0.rst @@ -900,13 +900,13 @@ New behavior: No automatic Matplotlib converters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas no longer registers our ``date``, ``time``, ``datetime``, +pandas no longer registers our ``date``, ``time``, ``datetime``, ``datetime64``, and ``Period`` converters with matplotlib when pandas is imported. Matplotlib plot methods (``plt.plot``, ``ax.plot``, ...), will not nicely format the x-axis for ``DatetimeIndex`` or ``PeriodIndex`` values. 
You must explicitly register these methods: -Pandas built-in ``Series.plot`` and ``DataFrame.plot`` *will* register these +pandas built-in ``Series.plot`` and ``DataFrame.plot`` *will* register these converters on first-use (:issue:`17710`). .. note:: diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst index f930dfac869cd..2d72f6470fc81 100644 --- a/doc/source/whatsnew/v0.21.1.rst +++ b/doc/source/whatsnew/v0.21.1.rst @@ -34,7 +34,7 @@ Highlights include: Restore Matplotlib datetime converter registration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas implements some matplotlib converters for nicely formatting the axis +pandas implements some matplotlib converters for nicely formatting the axis labels on plots with ``datetime`` or ``Period`` values. Prior to pandas 0.21.0, these were implicitly registered with matplotlib, as a side effect of ``import pandas``. diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index 66d3ab3305565..92b514ce59660 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -20,7 +20,7 @@ release note (singular!). Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. The +pandas 0.22.0 changes the handling of empty and all-*NA* sums and products. The summary is that * The sum of an empty or all-*NA* ``Series`` is now ``0`` diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index cb811fd83d90d..f4caea9d363eb 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -189,7 +189,7 @@ resetting indexes. 
See the :ref:`Sorting by Indexes and Values Extending pandas with custom types (experimental) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas now supports storing array-like objects that aren't necessarily 1-D NumPy +pandas now supports storing array-like objects that aren't necessarily 1-D NumPy arrays as columns in a DataFrame or values in a Series. This allows third-party libraries to implement extensions to NumPy's types, similar to how pandas implemented categoricals, datetimes with timezones, periods, and intervals. @@ -553,7 +553,7 @@ Other enhancements - :class:`~pandas.tseries.offsets.WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`). - :class:`DataFrame` and :class:`Series` now support matrix multiplication (``@``) operator (:issue:`10259`) for Python>=3.5 - Updated :meth:`DataFrame.to_gbq` and :meth:`pandas.read_gbq` signature and documentation to reflect changes from - the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ + the pandas-gbq library version 0.4.0. Adds intersphinx mapping to pandas-gbq library. (:issue:`20564`) - Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`) - :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`) @@ -593,7 +593,7 @@ Instantiation from dicts preserves dict insertion order for Python 3.6+ Until Python 3.6, dicts in Python had no formally defined ordering. For Python version 3.6 and later, dicts are ordered by insertion order, see `PEP 468 `_. -Pandas will use the dict's insertion order, when creating a ``Series`` or +pandas will use the dict's insertion order, when creating a ``Series`` or ``DataFrame`` from a dict and you're using Python version 3.6 or higher. 
(:issue:`19884`) @@ -643,7 +643,7 @@ Deprecate Panel ^^^^^^^^^^^^^^^ ``Panel`` was deprecated in the 0.20.x release, showing as a ``DeprecationWarning``. Using ``Panel`` will now show a ``FutureWarning``. The recommended way to represent 3-D data are -with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. Pandas +with a ``MultiIndex`` on a ``DataFrame`` via the :meth:`~Panel.to_frame` or with the `xarray package `__. pandas provides a :meth:`~Panel.to_xarray` method to automate this conversion (:issue:`13563`, :issue:`18324`). .. code-block:: ipython @@ -884,7 +884,7 @@ Extraction of matching patterns from strings By default, extracting matching patterns from strings with :func:`str.extract` used to return a ``Series`` if a single group was being extracted (a ``DataFrame`` if more than one group was -extracted). As of Pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless +extracted). As of pandas 0.23.0 :func:`str.extract` always returns a ``DataFrame``, unless ``expand`` is set to ``False``. Finally, ``None`` was an accepted value for the ``expand`` parameter (which was equivalent to ``False``), but now raises a ``ValueError``. (:issue:`11386`) @@ -1175,7 +1175,7 @@ Performance improvements Documentation changes ~~~~~~~~~~~~~~~~~~~~~ -Thanks to all of the contributors who participated in the Pandas Documentation +Thanks to all of the contributors who participated in the pandas Documentation Sprint, which took place on March 10th. We had about 500 participants from over 30 locations across the world. You should notice that many of the :ref:`API docstrings ` have greatly improved. diff --git a/doc/source/whatsnew/v0.23.2.rst b/doc/source/whatsnew/v0.23.2.rst index 9f24092d1d4ae..99650e8291d3d 100644 --- a/doc/source/whatsnew/v0.23.2.rst +++ b/doc/source/whatsnew/v0.23.2.rst @@ -11,7 +11,7 @@ and bug fixes. We recommend that all users upgrade to this version. .. 
note:: - Pandas 0.23.2 is first pandas release that's compatible with + pandas 0.23.2 is first pandas release that's compatible with Python 3.7 (:issue:`20552`) .. warning:: diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 9a2e96f717d9b..9ef50045d5b5e 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -38,7 +38,7 @@ Enhancements Optional integer NA support ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. +pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. .. note:: @@ -384,7 +384,7 @@ Other enhancements - :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) - Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) - :func:`DataFrame.to_gbq` and :func:`read_gbq` signature and documentation updated to - reflect changes from the `Pandas-GBQ library version 0.8.0 + reflect changes from the `pandas-gbq library version 0.8.0 `__. Adds a ``credentials`` argument, which enables the use of any kind of `google-auth credentials @@ -432,7 +432,7 @@ Other enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Pandas 0.24.0 includes a number of API breaking changes. +pandas 0.24.0 includes a number of API breaking changes. .. _whatsnew_0240.api_breaking.deps: @@ -1217,7 +1217,7 @@ Extension type changes **Equality and hashability** -Pandas now requires that extension dtypes be hashable (i.e. the respective +pandas now requires that extension dtypes be hashable (i.e. the respective ``ExtensionDtype`` objects; hashability is not a requirement for the values of the corresponding ``ExtensionArray``). 
The base class implements a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should @@ -1925,7 +1925,7 @@ Build changes Other ^^^^^ -- Bug where C variables were declared with external linkage causing import errors if certain other C libraries were imported before Pandas. (:issue:`24113`) +- Bug where C variables were declared with external linkage causing import errors if certain other C libraries were imported before pandas. (:issue:`24113`) .. _whatsnew_0.24.0.contributors: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 7b4440148677b..43b42c5cb5648 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -36,7 +36,7 @@ Enhancements Groupby aggregation with relabeling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has added special groupby behavior, known as "named aggregation", for naming the +pandas has added special groupby behavior, known as "named aggregation", for naming the output columns when applying multiple aggregation functions to specific columns (:issue:`18366`, :issue:`26512`). .. ipython:: python @@ -53,7 +53,7 @@ output columns when applying multiple aggregation functions to specific columns Pass the desired columns names as the ``**kwargs`` to ``.agg``. The values of ``**kwargs`` should be tuples where the first element is the column selection, and the second element is the -aggregation function to apply. Pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer +aggregation function to apply. pandas provides the ``pandas.NamedAgg`` namedtuple to make it clearer what the arguments to the function are, but plain tuples are accepted as well. .. ipython:: python @@ -425,7 +425,7 @@ of ``object`` dtype. 
:attr:`Series.str` will now infer the dtype data *within* t Categorical dtypes are preserved during groupby ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. Pandas now will preserve these dtypes. (:issue:`18502`) +Previously, columns that were categorical, but not the groupby key(s) would be converted to ``object`` dtype during groupby operations. pandas now will preserve these dtypes. (:issue:`18502`) .. ipython:: python @@ -545,14 +545,14 @@ with :attr:`numpy.nan` in the case of an empty :class:`DataFrame` (:issue:`26397 ``__str__`` methods now call ``__repr__`` rather than vice versa ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas has until now mostly defined string representations in a Pandas objects's +pandas has until now mostly defined string representations in a pandas objects' ``__str__``/``__unicode__``/``__bytes__`` methods, and called ``__str__`` from the ``__repr__`` method, if a specific ``__repr__`` method is not found. This is not needed for Python3. -In Pandas 0.25, the string representations of Pandas objects are now generally +In pandas 0.25, the string representations of pandas objects are now generally defined in ``__repr__``, and calls to ``__str__`` in general now pass the call on to the ``__repr__``, if a specific ``__str__`` method doesn't exist, as is standard for Python. -This change is backward compatible for direct usage of Pandas, but if you subclass -Pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, +This change is backward compatible for direct usage of pandas, but if you subclass +pandas objects *and* give your subclasses specific ``__str__``/``__repr__`` methods, you may have to adjust your ``__str__``/``__repr__`` methods (:issue:`26495`). .. 
_whatsnew_0250.api_breaking.interval_indexing: @@ -881,7 +881,7 @@ Other API changes - Bug in :meth:`DatetimeIndex.snap` which didn't preserving the ``name`` of the input :class:`Index` (:issue:`25575`) - The ``arg`` argument in :meth:`pandas.core.groupby.DataFrameGroupBy.agg` has been renamed to ``func`` (:issue:`26089`) - The ``arg`` argument in :meth:`pandas.core.window._Window.aggregate` has been renamed to ``func`` (:issue:`26372`) -- Most Pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) +- Most pandas classes had a ``__bytes__`` method, which was used for getting a python2-style bytestring representation of the object. This method has been removed as a part of dropping Python2 (:issue:`26447`) - The ``.str``-accessor has been disabled for 1-level :class:`MultiIndex`, use :meth:`MultiIndex.to_flat_index` if necessary (:issue:`23679`) - Removed support of gtk package for clipboards (:issue:`26563`) - Using an unsupported version of Beautiful Soup 4 will now raise an ``ImportError`` instead of a ``ValueError`` (:issue:`27063`) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 2a2b511356a69..8a16bab63f1bf 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -10,7 +10,7 @@ I/O and LZMA ~~~~~~~~~~~~ Some users may unknowingly have an incomplete Python installation lacking the ``lzma`` module from the standard library. In this case, ``import pandas`` failed due to an ``ImportError`` (:issue:`27575`). -Pandas will now warn, rather than raising an ``ImportError`` if the ``lzma`` module is not present. Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``. +pandas will now warn, rather than raising an ``ImportError`` if the ``lzma`` module is not present. 
Any subsequent attempt to use ``lzma`` methods will raise a ``RuntimeError``. A possible fix for the lack of the ``lzma`` module is to ensure you have the necessary libraries and then re-install Python. For example, on MacOS installing Python with ``pyenv`` may lead to an incomplete Python installation due to unmet system dependencies at compilation time (like ``xz``). Compilation will succeed, but Python might fail at run time. The issue can be solved by installing the necessary dependencies and then re-installing Python. diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index c0c68ce4b1f44..a5ea8933762ab 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -8,7 +8,7 @@ including other versions of pandas. .. note:: - Pandas 0.25.2 adds compatibility for Python 3.8 (:issue:`28147`). + pandas 0.25.2 adds compatibility for Python 3.8 (:issue:`28147`). .. _whatsnew_0252.bug_fixes: diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 32175d344c320..ddc40d6d40594 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -18,7 +18,7 @@ including other versions of pandas. New deprecation policy ~~~~~~~~~~~~~~~~~~~~~~ -Starting with Pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to +Starting with pandas 1.0.0, pandas will adopt a variant of `SemVer`_ to version releases. Briefly, * Deprecations will be introduced in minor releases (e.g. 1.1.0, 1.2.0, 2.1.0, ...) @@ -676,7 +676,7 @@ depending on how the results are cast back to the original dtype. Increased minimum version for Python ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas 1.0.0 supports Python 3.6.1 and higher (:issue:`29212`). +pandas 1.0.0 supports Python 3.6.1 and higher (:issue:`29212`). .. 
_whatsnew_100.api_breaking.deps: @@ -749,7 +749,7 @@ See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for mor Build changes ^^^^^^^^^^^^^ -Pandas has added a `pyproject.toml `_ file and will no longer include +pandas has added a `pyproject.toml `_ file and will no longer include cythonized files in the source distribution uploaded to PyPI (:issue:`28341`, :issue:`20775`). If you're installing a built distribution (wheel) or via conda, this shouldn't have any effect on you. If you're building pandas from source, you should no longer need to install Cython into your build environment before calling ``pip install pandas``. @@ -763,7 +763,7 @@ Other API changes - :class:`core.groupby.GroupBy.transform` now raises on invalid operation names (:issue:`27489`) - :meth:`pandas.api.types.infer_dtype` will now return "integer-na" for integer and ``np.nan`` mix (:issue:`27283`) - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`) -- In order to improve tab-completion, Pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). +- In order to improve tab-completion, pandas does not include most deprecated attributes when introspecting a pandas object using ``dir`` (e.g. ``dir(df)``). To see which attributes are excluded, see an object's ``_deprecations`` attribute, for example ``pd.DataFrame._deprecations`` (:issue:`28805`). - The returned dtype of :func:`unique` now matches the input dtype. (:issue:`27874`) - Changed the default configuration value for ``options.matplotlib.register_converters`` from ``True`` to ``"auto"`` (:issue:`18720`). 
diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index acf1dafc59885..af714b1bb2ab1 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -16,7 +16,7 @@ Enhancements Added support for new Python version ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas 1.1.3 now supports Python 3.9 (:issue:`36296`). +pandas 1.1.3 now supports Python 3.9 (:issue:`36296`). Development Changes ^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index b30e4177270b8..5c2d099ed3119 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -33,7 +33,7 @@ By default, duplicates continue to be allowed pd.Series([1, 2], index=['a', 'a']).set_flags(allows_duplicate_labels=False) -Pandas will propagate the ``allows_duplicate_labels`` property through many operations. +pandas will propagate the ``allows_duplicate_labels`` property through many operations. .. ipython:: python :okexcept: @@ -175,7 +175,7 @@ Other enhancements Increased minimum version for Python ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Pandas 1.2.0 supports Python 3.7.1 and higher (:issue:`35214`). +pandas 1.2.0 supports Python 3.7.1 and higher (:issue:`35214`). .. 
_whatsnew_120.api_breaking.deps: From 40f81a8dea3bc3d5bfbe3fc3c395c91aabdd3c17 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Sun, 4 Oct 2020 23:39:45 +0100 Subject: [PATCH 1013/1025] TYP: update setup.cfg (#36854) --- setup.cfg | 6 ------ 1 file changed, 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8702e903d825b..3279a485c9bf3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -208,9 +208,6 @@ check_untyped_defs=False [mypy-pandas.core.indexes.multi] check_untyped_defs=False -[mypy-pandas.core.indexes.period] -check_untyped_defs=False - [mypy-pandas.core.indexes.range] check_untyped_defs=False @@ -244,9 +241,6 @@ check_untyped_defs=False [mypy-pandas.core.series] check_untyped_defs=False -[mypy-pandas.core.strings] -check_untyped_defs=False - [mypy-pandas.core.window.common] check_untyped_defs=False From 7c3fac4fff3532105d27aceb36b1b6cb01ac2d74 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Sun, 4 Oct 2020 17:41:01 -0500 Subject: [PATCH 1014/1025] CI: Update error message for np_dev (#36864) * CI: Update error message for np_dev * Comma * Fix --- pandas/tests/series/indexing/test_indexing.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 1fafdf00393e1..fbdac2bb2d8e8 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -367,14 +367,17 @@ def test_2d_to_1d_assignment_raises(): x = np.random.randn(2, 2) y = pd.Series(range(2)) - msg = ( - r"shape mismatch: value array of shape \(2,2\) could not be " - r"broadcast to indexing result of shape \(2,\)" + msg = "|".join( + [ + r"shape mismatch: value array of shape \(2,2\) could not be " + r"broadcast to indexing result of shape \(2,\)", + r"cannot reshape array of size 4 into shape \(2,\)", + ] ) with pytest.raises(ValueError, match=msg): y.loc[range(2)] = x - msg = r"could not 
broadcast input array from shape \(2,2\) into shape \(2\)" + msg = r"could not broadcast input array from shape \(2,2\) into shape \(2,?\)" with pytest.raises(ValueError, match=msg): y.loc[:] = x From 32624424f793de1d834ee857b0c1da9ee960e8e9 Mon Sep 17 00:00:00 2001 From: Maria-Alexandra Ilie <30919494+maria-ilie@users.noreply.github.com> Date: Sun, 4 Oct 2020 19:07:28 -0700 Subject: [PATCH 1015/1025] DOC: ran blacken docs tool and checked output to improve formatting #36777 (#36802) --- doc/source/user_guide/10min.rst | 10 +- doc/source/user_guide/advanced.rst | 251 +++++++------ doc/source/user_guide/basics.rst | 564 ++++++++++++++++------------- setup.cfg | 1 + 4 files changed, 444 insertions(+), 382 deletions(-) diff --git a/doc/source/user_guide/10min.rst b/doc/source/user_guide/10min.rst index 8270b2ee49bd8..08f83a4674ada 100644 --- a/doc/source/user_guide/10min.rst +++ b/doc/source/user_guide/10min.rst @@ -667,9 +667,10 @@ pandas can include categorical data in a :class:`DataFrame`. For full docs, see .. ipython:: python - df = pd.DataFrame( - {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} - ) + df = pd.DataFrame( + {"id": [1, 2, 3, 4, 5, 6], "raw_grade": ["a", "b", "b", "a", "a", "e"]} + ) + Convert the raw grades to a categorical data type. @@ -718,7 +719,8 @@ We use the standard convention for referencing the matplotlib API: .. ipython:: python import matplotlib.pyplot as plt - plt.close('all') + + plt.close("all") .. ipython:: python diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 8cd35e94ae743..cec777e0f021e 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -62,12 +62,14 @@ demonstrate different ways to initialize MultiIndexes. .. 
ipython:: python - arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']] + arrays = [ + ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], + ["one", "two", "one", "two", "one", "two", "one", "two"], + ] tuples = list(zip(*arrays)) tuples - index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second']) + index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"]) index s = pd.Series(np.random.randn(8), index=index) @@ -78,8 +80,8 @@ to use the :meth:`MultiIndex.from_product` method: .. ipython:: python - iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] - pd.MultiIndex.from_product(iterables, names=['first', 'second']) + iterables = [["bar", "baz", "foo", "qux"], ["one", "two"]] + pd.MultiIndex.from_product(iterables, names=["first", "second"]) You can also construct a ``MultiIndex`` from a ``DataFrame`` directly, using the method :meth:`MultiIndex.from_frame`. This is a complementary method to @@ -89,9 +91,10 @@ the method :meth:`MultiIndex.from_frame`. This is a complementary method to .. ipython:: python - df = pd.DataFrame([['bar', 'one'], ['bar', 'two'], - ['foo', 'one'], ['foo', 'two']], - columns=['first', 'second']) + df = pd.DataFrame( + [["bar", "one"], ["bar", "two"], ["foo", "one"], ["foo", "two"]], + columns=["first", "second"], + ) pd.MultiIndex.from_frame(df) As a convenience, you can pass a list of arrays directly into ``Series`` or @@ -99,8 +102,10 @@ As a convenience, you can pass a list of arrays directly into ``Series`` or .. 
ipython:: python - arrays = [np.array(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']), - np.array(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'])] + arrays = [ + np.array(["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]), + np.array(["one", "two", "one", "two", "one", "two", "one", "two"]), + ] s = pd.Series(np.random.randn(8), index=arrays) s df = pd.DataFrame(np.random.randn(8, 4), index=arrays) @@ -119,7 +124,7 @@ of the index is up to you: .. ipython:: python - df = pd.DataFrame(np.random.randn(3, 8), index=['A', 'B', 'C'], columns=index) + df = pd.DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index) df pd.DataFrame(np.random.randn(6, 6), index=index[:6], columns=index[:6]) @@ -129,7 +134,7 @@ bit easier on the eyes. Note that how the index is displayed can be controlled u .. ipython:: python - with pd.option_context('display.multi_sparse', False): + with pd.option_context("display.multi_sparse", False): df It's worth keeping in mind that there's nothing preventing you from using @@ -157,7 +162,7 @@ location at a particular level: .. ipython:: python index.get_level_values(0) - index.get_level_values('second') + index.get_level_values("second") Basic indexing on axis with MultiIndex ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -169,10 +174,10 @@ completely analogous way to selecting a column in a regular DataFrame: .. ipython:: python - df['bar'] - df['bar', 'one'] - df['bar']['one'] - s['qux'] + df["bar"] + df["bar", "one"] + df["bar"]["one"] + s["qux"] See :ref:`Cross-section with hierarchical index ` for how to select on a deeper level. @@ -190,7 +195,7 @@ For example:   df.columns.levels # original MultiIndex - df[['foo','qux']].columns.levels # sliced + df[["foo","qux"]].columns.levels # sliced This is done to avoid a recomputation of the levels in order to make slicing highly performant. If you want to see only the used levels, you can use the @@ -198,17 +203,17 @@ highly performant. 
If you want to see only the used levels, you can use the .. ipython:: python - df[['foo', 'qux']].columns.to_numpy() + df[["foo", "qux"]].columns.to_numpy() # for a specific level - df[['foo', 'qux']].columns.get_level_values(0) + df[["foo", "qux"]].columns.get_level_values(0) To reconstruct the ``MultiIndex`` with only the used levels, the :meth:`~MultiIndex.remove_unused_levels` method may be used. .. ipython:: python - new_mi = df[['foo', 'qux']].columns.remove_unused_levels() + new_mi = df[["foo", "qux"]].columns.remove_unused_levels() new_mi.levels Data alignment and using ``reindex`` @@ -229,7 +234,7 @@ called with another ``MultiIndex``, or even a list or array of tuples: .. ipython:: python s.reindex(index[:3]) - s.reindex([('foo', 'two'), ('bar', 'one'), ('qux', 'one'), ('baz', 'one')]) + s.reindex([("foo", "two"), ("bar", "one"), ("qux", "one"), ("baz", "one")]) .. _advanced.advanced_hierarchical: @@ -244,7 +249,7 @@ keys take the form of tuples. For example, the following works as you would expe df = df.T df - df.loc[('bar', 'two')] + df.loc[("bar", "two")] Note that ``df.loc['bar', 'two']`` would also work in this example, but this shorthand notation can lead to ambiguity in general. @@ -254,7 +259,7 @@ like this: .. ipython:: python - df.loc[('bar', 'two'), 'A'] + df.loc[("bar", "two"), "A"] You don't have to specify all levels of the ``MultiIndex`` by passing only the first elements of the tuple. For example, you can use "partial" indexing to @@ -262,7 +267,7 @@ get all elements with ``bar`` in the first level as follows: .. ipython:: python - df.loc['bar'] + df.loc["bar"] This is a shortcut for the slightly more verbose notation ``df.loc[('bar',),]`` (equivalent to ``df.loc['bar',]`` in this example). @@ -271,20 +276,20 @@ to ``df.loc['bar',]`` in this example). .. ipython:: python - df.loc['baz':'foo'] + df.loc["baz":"foo"] You can slice with a 'range' of values, by providing a slice of tuples. .. 
ipython:: python - df.loc[('baz', 'two'):('qux', 'one')] - df.loc[('baz', 'two'):'foo'] + df.loc[("baz", "two"):("qux", "one")] + df.loc[("baz", "two"):"foo"] Passing a list of labels or tuples works similar to reindexing: .. ipython:: python - df.loc[[('bar', 'two'), ('qux', 'one')]] + df.loc[[("bar", "two"), ("qux", "one")]] .. note:: @@ -298,8 +303,9 @@ whereas a tuple of lists refer to several values within a level: .. ipython:: python - s = pd.Series([1, 2, 3, 4, 5, 6], - index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]])) + s = pd.Series( + [1, 2, 3, 4, 5, 6], index=pd.MultiIndex.from_product([["A", "B"], ["c", "d", "e"]]) + ) s.loc[[("A", "c"), ("B", "d")]] # list of tuples s.loc[(["A", "B"], ["c", "d"])] # tuple of lists @@ -329,37 +335,44 @@ As usual, **both sides** of the slicers are included as this is label indexing. .. code-block:: python - df.loc[(slice('A1', 'A3'), ...), :] # noqa: E999 + df.loc[(slice("A1", "A3"), ...), :] # noqa: E999   You should **not** do this:   .. code-block:: python - df.loc[(slice('A1', 'A3'), ...)] # noqa: E999 + df.loc[(slice("A1", "A3"), ...)] # noqa: E999 .. 
ipython:: python def mklbl(prefix, n): return ["%s%s" % (prefix, i) for i in range(n)] - miindex = pd.MultiIndex.from_product([mklbl('A', 4), - mklbl('B', 2), - mklbl('C', 4), - mklbl('D', 2)]) - micolumns = pd.MultiIndex.from_tuples([('a', 'foo'), ('a', 'bar'), - ('b', 'foo'), ('b', 'bah')], - names=['lvl0', 'lvl1']) - dfmi = pd.DataFrame(np.arange(len(miindex) * len(micolumns)) - .reshape((len(miindex), len(micolumns))), - index=miindex, - columns=micolumns).sort_index().sort_index(axis=1) + + miindex = pd.MultiIndex.from_product( + [mklbl("A", 4), mklbl("B", 2), mklbl("C", 4), mklbl("D", 2)] + ) + micolumns = pd.MultiIndex.from_tuples( + [("a", "foo"), ("a", "bar"), ("b", "foo"), ("b", "bah")], names=["lvl0", "lvl1"] + ) + dfmi = ( + pd.DataFrame( + np.arange(len(miindex) * len(micolumns)).reshape( + (len(miindex), len(micolumns)) + ), + index=miindex, + columns=micolumns, + ) + .sort_index() + .sort_index(axis=1) + ) dfmi Basic MultiIndex slicing using slices, lists, and labels. .. ipython:: python - dfmi.loc[(slice('A1', 'A3'), slice(None), ['C1', 'C3']), :] + dfmi.loc[(slice("A1", "A3"), slice(None), ["C1", "C3"]), :] You can use :class:`pandas.IndexSlice` to facilitate a more natural syntax @@ -368,36 +381,36 @@ using ``:``, rather than using ``slice(None)``. .. ipython:: python idx = pd.IndexSlice - dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] + dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]] It is possible to perform quite complicated selections using this method on multiple axes at the same time. .. ipython:: python - dfmi.loc['A1', (slice(None), 'foo')] - dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']] + dfmi.loc["A1", (slice(None), "foo")] + dfmi.loc[idx[:, :, ["C1", "C3"]], idx[:, "foo"]] Using a boolean indexer you can provide selection related to the *values*. .. 
ipython:: python - mask = dfmi[('a', 'foo')] > 200 - dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']] + mask = dfmi[("a", "foo")] > 200 + dfmi.loc[idx[mask, :, ["C1", "C3"]], idx[:, "foo"]] You can also specify the ``axis`` argument to ``.loc`` to interpret the passed slicers on a single axis. .. ipython:: python - dfmi.loc(axis=0)[:, :, ['C1', 'C3']] + dfmi.loc(axis=0)[:, :, ["C1", "C3"]] Furthermore, you can *set* the values using the following methods. .. ipython:: python df2 = dfmi.copy() - df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10 + df2.loc(axis=0)[:, :, ["C1", "C3"]] = -10 df2 You can use a right-hand-side of an alignable object as well. @@ -405,7 +418,7 @@ You can use a right-hand-side of an alignable object as well. .. ipython:: python df2 = dfmi.copy() - df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000 + df2.loc[idx[:, :, ["C1", "C3"]], :] = df2 * 1000 df2 .. _advanced.xs: @@ -419,12 +432,12 @@ selecting data at a particular level of a ``MultiIndex`` easier. .. ipython:: python df - df.xs('one', level='second') + df.xs("one", level="second") .. ipython:: python # using the slicers - df.loc[(slice(None), 'one'), :] + df.loc[(slice(None), "one"), :] You can also select on the columns with ``xs``, by providing the axis argument. @@ -432,36 +445,36 @@ providing the axis argument. .. ipython:: python df = df.T - df.xs('one', level='second', axis=1) + df.xs("one", level="second", axis=1) .. ipython:: python # using the slicers - df.loc[:, (slice(None), 'one')] + df.loc[:, (slice(None), "one")] ``xs`` also allows selection with multiple keys. .. ipython:: python - df.xs(('one', 'bar'), level=('second', 'first'), axis=1) + df.xs(("one", "bar"), level=("second", "first"), axis=1) .. ipython:: python # using the slicers - df.loc[:, ('bar', 'one')] + df.loc[:, ("bar", "one")] You can pass ``drop_level=False`` to ``xs`` to retain the level that was selected. .. 
ipython:: python - df.xs('one', level='second', axis=1, drop_level=False) + df.xs("one", level="second", axis=1, drop_level=False) Compare the above with the result using ``drop_level=True`` (the default value). .. ipython:: python - df.xs('one', level='second', axis=1, drop_level=True) + df.xs("one", level="second", axis=1, drop_level=True) .. ipython:: python :suppress: @@ -479,8 +492,9 @@ values across a level. For instance: .. ipython:: python - midx = pd.MultiIndex(levels=[['zero', 'one'], ['x', 'y']], - codes=[[1, 1, 0, 0], [1, 0, 1, 0]]) + midx = pd.MultiIndex( + levels=[["zero", "one"], ["x", "y"]], codes=[[1, 1, 0, 0], [1, 0, 1, 0]] + ) df = pd.DataFrame(np.random.randn(4, 2), index=midx) df df2 = df.mean(level=0) @@ -543,7 +557,7 @@ used to move the values from the ``MultiIndex`` to a column. .. ipython:: python - df.rename_axis(index=['abc', 'def']) + df.rename_axis(index=["abc", "def"]) Note that the columns of a ``DataFrame`` are an index, so that using ``rename_axis`` with the ``columns`` argument will change the name of that @@ -561,7 +575,7 @@ When working with an ``Index`` object directly, rather than via a ``DataFrame``, .. ipython:: python - mi = pd.MultiIndex.from_product([[1, 2], ['a', 'b']], names=['x', 'y']) + mi = pd.MultiIndex.from_product([[1, 2], ["a", "b"]], names=["x", "y"]) mi.names mi2 = mi.rename("new name", level=0) @@ -586,6 +600,7 @@ they need to be sorted. As with any index, you can use :meth:`~DataFrame.sort_in .. ipython:: python import random + random.shuffle(tuples) s = pd.Series(np.random.randn(8), index=pd.MultiIndex.from_tuples(tuples)) s @@ -600,9 +615,9 @@ are named. .. 
ipython:: python - s.index.set_names(['L1', 'L2'], inplace=True) - s.sort_index(level='L1') - s.sort_index(level='L2') + s.index.set_names(["L1", "L2"], inplace=True) + s.sort_index(level="L1") + s.sort_index(level="L2") On higher dimensional objects, you can sort any of the other axes by level if they have a ``MultiIndex``: @@ -617,10 +632,10 @@ return a copy of the data rather than a view: .. ipython:: python - dfm = pd.DataFrame({'jim': [0, 0, 1, 1], - 'joe': ['x', 'x', 'z', 'y'], - 'jolie': np.random.rand(4)}) - dfm = dfm.set_index(['jim', 'joe']) + dfm = pd.DataFrame( + {"jim": [0, 0, 1, 1], "joe": ["x", "x", "z", "y"], "jolie": np.random.rand(4)} + ) + dfm = dfm.set_index(["jim", "joe"]) dfm .. code-block:: ipython @@ -661,7 +676,7 @@ And now selection works as expected. .. ipython:: python - dfm.loc[(0, 'y'):(1, 'z')] + dfm.loc[(0, "y"):(1, "z")] Take methods ------------ @@ -754,18 +769,18 @@ and allows efficient indexing and storage of an index with a large number of dup .. ipython:: python from pandas.api.types import CategoricalDtype - df = pd.DataFrame({'A': np.arange(6), - 'B': list('aabbca')}) - df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) + + df = pd.DataFrame({"A": np.arange(6), "B": list("aabbca")}) + df["B"] = df["B"].astype(CategoricalDtype(list("cab"))) df df.dtypes - df['B'].cat.categories + df["B"].cat.categories Setting the index will create a ``CategoricalIndex``. .. ipython:: python - df2 = df.set_index('B') + df2 = df.set_index("B") df2.index Indexing with ``__getitem__/.iloc/.loc`` works similarly to an ``Index`` with duplicates. @@ -773,13 +788,13 @@ The indexers **must** be in the category or the operation will raise a ``KeyErro .. ipython:: python - df2.loc['a'] + df2.loc["a"] The ``CategoricalIndex`` is **preserved** after indexing: .. 
ipython:: python - df2.loc['a'].index + df2.loc["a"].index Sorting the index will sort by the order of the categories (recall that we created the index with ``CategoricalDtype(list('cab'))``, so the sorted @@ -804,17 +819,16 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df3 = pd.DataFrame({'A': np.arange(3), - 'B': pd.Series(list('abc')).astype('category')}) - df3 = df3.set_index('B') + df3 = pd.DataFrame({"A": np.arange(3), "B": pd.Series(list("abc")).astype("category")}) + df3 = df3.set_index("B") df3 .. ipython:: python - df3.reindex(['a', 'e']) - df3.reindex(['a', 'e']).index - df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))) - df3.reindex(pd.Categorical(['a', 'e'], categories=list('abe'))).index + df3.reindex(["a", "e"]) + df3.reindex(["a", "e"]).index + df3.reindex(pd.Categorical(["a", "e"], categories=list("abe"))) + df3.reindex(pd.Categorical(["a", "e"], categories=list("abe"))).index .. warning:: @@ -823,16 +837,14 @@ values **not** in the categories, similarly to how you can reindex **any** panda .. ipython:: python - df4 = pd.DataFrame({'A': np.arange(2), - 'B': list('ba')}) - df4['B'] = df4['B'].astype(CategoricalDtype(list('ab'))) - df4 = df4.set_index('B') + df4 = pd.DataFrame({"A": np.arange(2), "B": list("ba")}) + df4["B"] = df4["B"].astype(CategoricalDtype(list("ab"))) + df4 = df4.set_index("B") df4.index - df5 = pd.DataFrame({'A': np.arange(2), - 'B': list('bc')}) - df5['B'] = df5['B'].astype(CategoricalDtype(list('bc'))) - df5 = df5.set_index('B') + df5 = pd.DataFrame({"A": np.arange(2), "B": list("bc")}) + df5["B"] = df5["B"].astype(CategoricalDtype(list("bc"))) + df5 = df5.set_index("B") df5.index .. code-block:: ipython @@ -916,12 +928,16 @@ example, be millisecond offsets. .. 
ipython:: python - dfir = pd.concat([pd.DataFrame(np.random.randn(5, 2), - index=np.arange(5) * 250.0, - columns=list('AB')), - pd.DataFrame(np.random.randn(6, 2), - index=np.arange(4, 10) * 250.1, - columns=list('AB'))]) + dfir = pd.concat( + [ + pd.DataFrame( + np.random.randn(5, 2), index=np.arange(5) * 250.0, columns=list("AB") + ), + pd.DataFrame( + np.random.randn(6, 2), index=np.arange(4, 10) * 250.1, columns=list("AB") + ), + ] + ) dfir Selection operations then will always work on a value basis, for all selection operators. @@ -929,7 +945,7 @@ Selection operations then will always work on a value basis, for all selection o .. ipython:: python dfir[0:1000.4] - dfir.loc[0:1001, 'A'] + dfir.loc[0:1001, "A"] dfir.loc[1000.4] You could retrieve the first 1 second (1000 ms) of data as such: @@ -963,8 +979,9 @@ An ``IntervalIndex`` can be used in ``Series`` and in ``DataFrame`` as the index .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3, 4]}, - index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])) + df = pd.DataFrame( + {"A": [1, 2, 3, 4]}, index=pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4]) + ) df Label based indexing via ``.loc`` along the edges of an interval works as you would expect, @@ -1041,9 +1058,9 @@ datetime-like intervals: pd.interval_range(start=0, end=5) - pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4) + pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4) - pd.interval_range(end=pd.Timedelta('3 days'), periods=3) + pd.interval_range(end=pd.Timedelta("3 days"), periods=3) The ``freq`` parameter can used to specify non-default frequencies, and can utilize a variety of :ref:`frequency aliases ` with datetime-like intervals: @@ -1052,18 +1069,18 @@ of :ref:`frequency aliases ` with datetime-like inter pd.interval_range(start=0, periods=5, freq=1.5) - pd.interval_range(start=pd.Timestamp('2017-01-01'), periods=4, freq='W') + pd.interval_range(start=pd.Timestamp("2017-01-01"), periods=4, freq="W") - 
pd.interval_range(start=pd.Timedelta('0 days'), periods=3, freq='9H') + pd.interval_range(start=pd.Timedelta("0 days"), periods=3, freq="9H") Additionally, the ``closed`` parameter can be used to specify which side(s) the intervals are closed on. Intervals are closed on the right side by default. .. ipython:: python - pd.interval_range(start=0, end=4, closed='both') + pd.interval_range(start=0, end=4, closed="both") - pd.interval_range(start=0, end=4, closed='neither') + pd.interval_range(start=0, end=4, closed="neither") Specifying ``start``, ``end``, and ``periods`` will generate a range of evenly spaced intervals from ``start`` to ``end`` inclusively, with ``periods`` number of elements @@ -1073,8 +1090,7 @@ in the resulting ``IntervalIndex``: pd.interval_range(start=0, end=6, periods=4) - pd.interval_range(pd.Timestamp('2018-01-01'), - pd.Timestamp('2018-02-28'), periods=3) + pd.interval_range(pd.Timestamp("2018-01-01"), pd.Timestamp("2018-02-28"), periods=3) Miscellaneous indexing FAQ -------------------------- @@ -1112,7 +1128,7 @@ normal Python ``list``. Monotonicity of an index can be tested with the :meth:`~ .. ipython:: python - df = pd.DataFrame(index=[2, 3, 3, 4, 5], columns=['data'], data=list(range(5))) + df = pd.DataFrame(index=[2, 3, 3, 4, 5], columns=["data"], data=list(range(5))) df.index.is_monotonic_increasing # no rows 0 or 1, but still returns rows 2, 3 (both of them), and 4: @@ -1126,8 +1142,7 @@ On the other hand, if the index is not monotonic, then both slice bounds must be .. ipython:: python - df = pd.DataFrame(index=[2, 3, 1, 4, 3, 5], - columns=['data'], data=list(range(6))) + df = pd.DataFrame(index=[2, 3, 1, 4, 3, 5], columns=["data"], data=list(range(6))) df.index.is_monotonic_increasing # OK because 2 and 4 are in the index @@ -1149,7 +1164,7 @@ the :meth:`~Index.is_unique` attribute. .. 
ipython:: python - weakly_monotonic = pd.Index(['a', 'b', 'c', 'c']) + weakly_monotonic = pd.Index(["a", "b", "c", "c"]) weakly_monotonic weakly_monotonic.is_monotonic_increasing weakly_monotonic.is_monotonic_increasing & weakly_monotonic.is_unique @@ -1167,7 +1182,7 @@ consider the following ``Series``: .. ipython:: python - s = pd.Series(np.random.randn(6), index=list('abcdef')) + s = pd.Series(np.random.randn(6), index=list("abcdef")) s Suppose we wished to slice from ``c`` to ``e``, using integers this would be @@ -1190,7 +1205,7 @@ slicing include both endpoints: .. ipython:: python - s.loc['c':'e'] + s.loc["c":"e"] This is most definitely a "practicality beats purity" sort of thing, but it is something to watch out for if you expect label-based slicing to behave exactly diff --git a/doc/source/user_guide/basics.rst b/doc/source/user_guide/basics.rst index 5fa214d2ed389..8c01913e55318 100644 --- a/doc/source/user_guide/basics.rst +++ b/doc/source/user_guide/basics.rst @@ -12,10 +12,9 @@ the :ref:`10 minutes to pandas <10min>` section: .. ipython:: python - index = pd.date_range('1/1/2000', periods=8) - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) - df = pd.DataFrame(np.random.randn(8, 3), index=index, - columns=['A', 'B', 'C']) + index = pd.date_range("1/1/2000", periods=8) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) + df = pd.DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"]) .. _basics.head_tail: @@ -97,7 +96,7 @@ Timezones may be preserved with ``dtype=object`` .. ipython:: python - ser = pd.Series(pd.date_range('2000', periods=2, tz="CET")) + ser = pd.Series(pd.date_range("2000", periods=2, tz="CET")) ser.to_numpy(dtype=object) Or thrown away with ``dtype='datetime64[ns]'`` @@ -174,8 +173,8 @@ These are both enabled to be used by default, you can control this by setting th .. 
code-block:: python - pd.set_option('compute.use_bottleneck', False) - pd.set_option('compute.use_numexpr', False) + pd.set_option("compute.use_bottleneck", False) + pd.set_option("compute.use_numexpr", False) .. _basics.binop: @@ -204,18 +203,21 @@ either match on the *index* or *columns* via the **axis** keyword: .. ipython:: python - df = pd.DataFrame({ - 'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']), - 'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), - 'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame( + { + "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]), + "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]), + "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]), + } + ) df row = df.iloc[1] - column = df['two'] + column = df["two"] - df.sub(row, axis='columns') + df.sub(row, axis="columns") df.sub(row, axis=1) - df.sub(column, axis='index') + df.sub(column, axis="index") df.sub(column, axis=0) .. ipython:: python @@ -228,10 +230,10 @@ Furthermore you can align a level of a MultiIndexed DataFrame with a Series. .. ipython:: python dfmi = df.copy() - dfmi.index = pd.MultiIndex.from_tuples([(1, 'a'), (1, 'b'), - (1, 'c'), (2, 'a')], - names=['first', 'second']) - dfmi.sub(column, axis=0, level='second') + dfmi.index = pd.MultiIndex.from_tuples( + [(1, "a"), (1, "b"), (1, "c"), (2, "a")], names=["first", "second"] + ) + dfmi.sub(column, axis=0, level="second") Series and Index also support the :func:`divmod` builtin. This function takes the floor division and modulo operation at the same time returning a two-tuple @@ -273,7 +275,7 @@ using ``fillna`` if you wish). :suppress: df2 = df.copy() - df2['three']['a'] = 1. + df2["three"]["a"] = 1.0 .. ipython:: python @@ -325,7 +327,7 @@ You can test if a pandas object is empty, via the :attr:`~DataFrame.empty` prope .. 
ipython:: python df.empty - pd.DataFrame(columns=list('ABC')).empty + pd.DataFrame(columns=list("ABC")).empty To evaluate single-element pandas objects in a boolean context, use the method :meth:`~DataFrame.bool`: @@ -394,8 +396,8 @@ equality to be True: .. ipython:: python - df1 = pd.DataFrame({'col': ['foo', 0, np.nan]}) - df2 = pd.DataFrame({'col': [np.nan, 0, 'foo']}, index=[2, 1, 0]) + df1 = pd.DataFrame({"col": ["foo", 0, np.nan]}) + df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0]) df1.equals(df2) df1.equals(df2.sort_index()) @@ -407,16 +409,16 @@ data structure with a scalar value: .. ipython:: python - pd.Series(['foo', 'bar', 'baz']) == 'foo' - pd.Index(['foo', 'bar', 'baz']) == 'foo' + pd.Series(["foo", "bar", "baz"]) == "foo" + pd.Index(["foo", "bar", "baz"]) == "foo" pandas also handles element-wise comparisons between different array-like objects of the same length: .. ipython:: python - pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux']) - pd.Series(['foo', 'bar', 'baz']) == np.array(['foo', 'bar', 'qux']) + pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"]) + pd.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"]) Trying to compare ``Index`` or ``Series`` objects of different lengths will raise a ValueError: @@ -458,10 +460,12 @@ which we illustrate: .. ipython:: python - df1 = pd.DataFrame({'A': [1., np.nan, 3., 5., np.nan], - 'B': [np.nan, 2., 3., np.nan, 6.]}) - df2 = pd.DataFrame({'A': [5., 2., 4., np.nan, 3., 7.], - 'B': [np.nan, np.nan, 3., 4., 6., 8.]}) + df1 = pd.DataFrame( + {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]} + ) + df2 = pd.DataFrame( + {"A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0], "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0]} + ) df1 df2 df1.combine_first(df2) @@ -480,6 +484,8 @@ So, for instance, to reproduce :meth:`~DataFrame.combine_first` as above: def combiner(x, y): return np.where(pd.isna(x), y, x) + + df1.combine(df2, combiner) .. 
_basics.stats: @@ -570,8 +576,8 @@ will exclude NAs on Series input by default: .. ipython:: python - np.mean(df['one']) - np.mean(df['one'].to_numpy()) + np.mean(df["one"]) + np.mean(df["one"].to_numpy()) :meth:`Series.nunique` will return the number of unique non-NA values in a Series: @@ -597,8 +603,7 @@ course): series = pd.Series(np.random.randn(1000)) series[::2] = np.nan series.describe() - frame = pd.DataFrame(np.random.randn(1000, 5), - columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"]) frame.iloc[::2] = np.nan frame.describe() @@ -606,7 +611,7 @@ You can select specific percentiles to include in the output: .. ipython:: python - series.describe(percentiles=[.05, .25, .75, .95]) + series.describe(percentiles=[0.05, 0.25, 0.75, 0.95]) By default, the median is always included. @@ -615,7 +620,7 @@ summary of the number of unique values and most frequently occurring values: .. ipython:: python - s = pd.Series(['a', 'a', 'b', 'b', 'a', 'a', np.nan, 'c', 'd', 'a']) + s = pd.Series(["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"]) s.describe() Note that on a mixed-type DataFrame object, :meth:`~DataFrame.describe` will @@ -624,7 +629,7 @@ categorical columns: .. ipython:: python - frame = pd.DataFrame({'a': ['Yes', 'Yes', 'No', 'No'], 'b': range(4)}) + frame = pd.DataFrame({"a": ["Yes", "Yes", "No", "No"], "b": range(4)}) frame.describe() This behavior can be controlled by providing a list of types as ``include``/``exclude`` @@ -632,9 +637,9 @@ arguments. The special value ``all`` can also be used: .. ipython:: python - frame.describe(include=['object']) - frame.describe(include=['number']) - frame.describe(include='all') + frame.describe(include=["object"]) + frame.describe(include=["number"]) + frame.describe(include="all") That feature relies on :ref:`select_dtypes `. Refer to there for details about accepted inputs. 
@@ -654,7 +659,7 @@ corresponding values: s1 s1.idxmin(), s1.idxmax() - df1 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C']) + df1 = pd.DataFrame(np.random.randn(5, 3), columns=["A", "B", "C"]) df1 df1.idxmin(axis=0) df1.idxmax(axis=1) @@ -665,9 +670,9 @@ matching index: .. ipython:: python - df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=['A'], index=list('edcba')) + df3 = pd.DataFrame([2, 1, 1, 3, np.nan], columns=["A"], index=list("edcba")) df3 - df3['A'].idxmin() + df3["A"].idxmin() .. note:: @@ -706,8 +711,9 @@ Similarly, you can get the most frequently occurring value(s), i.e. the mode, of s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7]) s5.mode() - df5 = pd.DataFrame({"A": np.random.randint(0, 7, size=50), - "B": np.random.randint(-10, 15, size=50)}) + df5 = pd.DataFrame( + {"A": np.random.randint(0, 7, size=50), "B": np.random.randint(-10, 15, size=50)} + ) df5.mode() @@ -732,7 +738,7 @@ normally distributed data into equal-size quartiles like so: .. ipython:: python arr = np.random.randn(30) - factor = pd.qcut(arr, [0, .25, .5, .75, 1]) + factor = pd.qcut(arr, [0, 0.25, 0.5, 0.75, 1]) factor pd.value_counts(factor) @@ -775,18 +781,20 @@ First some setup: """ Chicago, IL -> Chicago for city_name column """ - df['city_name'] = df['city_and_code'].str.split(",").str.get(0) + df["city_name"] = df["city_and_code"].str.split(",").str.get(0) return df + def add_country_name(df, country_name=None): """ Chicago -> Chicago-US for city_name column """ - col = 'city_name' - df['city_and_country'] = df[col] + country_name + col = "city_name" + df["city_and_country"] = df[col] + country_name return df - df_p = pd.DataFrame({'city_and_code': ['Chicago, IL']}) + + df_p = pd.DataFrame({"city_and_code": ["Chicago, IL"]}) ``extract_city_name`` and ``add_country_name`` are functions taking and returning ``DataFrames``. @@ -795,14 +803,13 @@ Now compare the following: .. 
ipython:: python - add_country_name(extract_city_name(df_p), country_name='US') + add_country_name(extract_city_name(df_p), country_name="US") Is equivalent to: .. ipython:: python - (df_p.pipe(extract_city_name) - .pipe(add_country_name, country_name="US")) + df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US") pandas encourages the second style, which is known as method chaining. ``pipe`` makes it easy to use your own or another library's functions @@ -820,14 +827,15 @@ For example, we can fit a regression using statsmodels. Their API expects a form import statsmodels.formula.api as sm - bb = pd.read_csv('data/baseball.csv', index_col='id') + bb = pd.read_csv("data/baseball.csv", index_col="id") - (bb.query('h > 0') - .assign(ln_h=lambda df: np.log(df.h)) - .pipe((sm.ols, 'data'), 'hr ~ ln_h + year + g + C(lg)') - .fit() - .summary() - ) + ( + bb.query("h > 0") + .assign(ln_h=lambda df: np.log(df.h)) + .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") + .fit() + .summary() + ) The pipe method is inspired by unix pipes and more recently dplyr_ and magrittr_, which have introduced the popular ``(%>%)`` (read pipe) operator for R_. @@ -858,8 +866,8 @@ The :meth:`~DataFrame.apply` method will also dispatch on a string method name. .. ipython:: python - df.apply('mean') - df.apply('mean', axis=1) + df.apply("mean") + df.apply("mean", axis=1) The return type of the function passed to :meth:`~DataFrame.apply` affects the type of the final output from ``DataFrame.apply`` for the default behaviour: @@ -878,8 +886,11 @@ maximum value for each column occurred: .. 
ipython:: python - tsdf = pd.DataFrame(np.random.randn(1000, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=1000)) + tsdf = pd.DataFrame( + np.random.randn(1000, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=1000), + ) tsdf.apply(lambda x: x.idxmax()) You may also pass additional arguments and keyword arguments to the :meth:`~DataFrame.apply` @@ -902,8 +913,11 @@ Series operation on each column or row: .. ipython:: python :suppress: - tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) tsdf.iloc[3:7] = np.nan .. ipython:: python @@ -933,8 +947,11 @@ We will use a similar starting frame from above: .. ipython:: python - tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) tsdf.iloc[3:7] = np.nan tsdf @@ -946,7 +963,7 @@ output: tsdf.agg(np.sum) - tsdf.agg('sum') + tsdf.agg("sum") # these are equivalent to a ``.sum()`` because we are aggregating # on a single function @@ -956,7 +973,7 @@ Single aggregations on a ``Series`` this will return a scalar value: .. ipython:: python - tsdf['A'].agg('sum') + tsdf["A"].agg("sum") Aggregating with multiple functions @@ -968,25 +985,25 @@ These are naturally named from the aggregation function. .. ipython:: python - tsdf.agg(['sum']) + tsdf.agg(["sum"]) Multiple functions yield multiple rows: .. ipython:: python - tsdf.agg(['sum', 'mean']) + tsdf.agg(["sum", "mean"]) On a ``Series``, multiple functions return a ``Series``, indexed by the function names: .. ipython:: python - tsdf['A'].agg(['sum', 'mean']) + tsdf["A"].agg(["sum", "mean"]) Passing a ``lambda`` function will yield a ```` named row: .. 
ipython:: python - tsdf['A'].agg(['sum', lambda x: x.mean()]) + tsdf["A"].agg(["sum", lambda x: x.mean()]) Passing a named function will yield that name for the row: @@ -995,7 +1012,8 @@ Passing a named function will yield that name for the row: def mymean(x): return x.mean() - tsdf['A'].agg(['sum', mymean]) + + tsdf["A"].agg(["sum", mymean]) Aggregating with a dict +++++++++++++++++++++++ @@ -1006,7 +1024,7 @@ are not in any particular order, you can use an ``OrderedDict`` instead to guara .. ipython:: python - tsdf.agg({'A': 'mean', 'B': 'sum'}) + tsdf.agg({"A": "mean", "B": "sum"}) Passing a list-like will generate a ``DataFrame`` output. You will get a matrix-like output of all of the aggregators. The output will consist of all unique functions. Those that are @@ -1014,7 +1032,7 @@ not noted for a particular column will be ``NaN``: .. ipython:: python - tsdf.agg({'A': ['mean', 'min'], 'B': 'sum'}) + tsdf.agg({"A": ["mean", "min"], "B": "sum"}) .. _basics.aggregation.mixed_string: @@ -1026,15 +1044,19 @@ aggregations. This is similar to how ``.groupby.agg`` works. .. ipython:: python - mdf = pd.DataFrame({'A': [1, 2, 3], - 'B': [1., 2., 3.], - 'C': ['foo', 'bar', 'baz'], - 'D': pd.date_range('20130101', periods=3)}) + mdf = pd.DataFrame( + { + "A": [1, 2, 3], + "B": [1.0, 2.0, 3.0], + "C": ["foo", "bar", "baz"], + "D": pd.date_range("20130101", periods=3), + } + ) mdf.dtypes .. ipython:: python - mdf.agg(['min', 'sum']) + mdf.agg(["min", "sum"]) .. _basics.aggregation.custom_describe: @@ -1049,11 +1071,11 @@ to the built in :ref:`describe function `. from functools import partial q_25 = partial(pd.Series.quantile, q=0.25) - q_25.__name__ = '25%' + q_25.__name__ = "25%" q_75 = partial(pd.Series.quantile, q=0.75) - q_75.__name__ = '75%' + q_75.__name__ = "75%" - tsdf.agg(['count', 'mean', 'std', 'min', q_25, 'median', q_75, 'max']) + tsdf.agg(["count", "mean", "std", "min", q_25, "median", q_75, "max"]) .. 
_basics.transform: @@ -1068,8 +1090,11 @@ We create a frame similar to the one used in the above sections. .. ipython:: python - tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], - index=pd.date_range('1/1/2000', periods=10)) + tsdf = pd.DataFrame( + np.random.randn(10, 3), + columns=["A", "B", "C"], + index=pd.date_range("1/1/2000", periods=10), + ) tsdf.iloc[3:7] = np.nan tsdf @@ -1080,7 +1105,7 @@ function name or a user defined function. :okwarning: tsdf.transform(np.abs) - tsdf.transform('abs') + tsdf.transform("abs") tsdf.transform(lambda x: x.abs()) Here :meth:`~DataFrame.transform` received a single function; this is equivalent to a `ufunc @@ -1094,7 +1119,7 @@ Passing a single function to ``.transform()`` with a ``Series`` will yield a sin .. ipython:: python - tsdf['A'].transform(np.abs) + tsdf["A"].transform(np.abs) Transform with multiple functions @@ -1113,7 +1138,7 @@ resulting column names will be the transforming functions. .. ipython:: python - tsdf['A'].transform([np.abs, lambda x: x + 1]) + tsdf["A"].transform([np.abs, lambda x: x + 1]) Transforming with a dict @@ -1124,7 +1149,7 @@ Passing a dict of functions will allow selective transforming per column. .. ipython:: python - tsdf.transform({'A': np.abs, 'B': lambda x: x + 1}) + tsdf.transform({"A": np.abs, "B": lambda x: x + 1}) Passing a dict of lists will generate a MultiIndexed DataFrame with these selective transforms. @@ -1132,7 +1157,7 @@ selective transforms. .. ipython:: python :okwarning: - tsdf.transform({'A': np.abs, 'B': [lambda x: x + 1, 'sqrt']}) + tsdf.transform({"A": np.abs, "B": [lambda x: x + 1, "sqrt"]}) .. _basics.elementwise: @@ -1153,10 +1178,12 @@ a single value and returning a single value. For example: df4 + def f(x): return len(str(x)) - df4['one'].map(f) + + df4["one"].map(f) df4.applymap(f) :meth:`Series.map` has an additional feature; it can be used to easily @@ -1165,9 +1192,8 @@ to :ref:`merging/joining functionality `: .. 
ipython:: python - s = pd.Series(['six', 'seven', 'six', 'seven', 'six'], - index=['a', 'b', 'c', 'd', 'e']) - t = pd.Series({'six': 6., 'seven': 7.}) + s = pd.Series(["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"]) + t = pd.Series({"six": 6.0, "seven": 7.0}) s s.map(t) @@ -1192,9 +1218,9 @@ Here is a simple example: .. ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s - s.reindex(['e', 'b', 'f', 'd']) + s.reindex(["e", "b", "f", "d"]) Here, the ``f`` label was not contained in the Series and hence appears as ``NaN`` in the result. @@ -1204,13 +1230,13 @@ With a DataFrame, you can simultaneously reindex the index and columns: .. ipython:: python df - df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one']) + df.reindex(index=["c", "f", "b"], columns=["three", "two", "one"]) You may also use ``reindex`` with an ``axis`` keyword: .. ipython:: python - df.reindex(['c', 'f', 'b'], axis='index') + df.reindex(["c", "f", "b"], axis="index") Note that the ``Index`` objects containing the actual axis labels can be **shared** between objects. So if we have a Series and a DataFrame, the @@ -1230,8 +1256,8 @@ where you specify a single ``labels`` argument and the ``axis`` it applies to. .. ipython:: python - df.reindex(['c', 'f', 'b'], axis='index') - df.reindex(['three', 'two', 'one'], axis='columns') + df.reindex(["c", "f", "b"], axis="index") + df.reindex(["three", "two", "one"], axis="columns") .. seealso:: @@ -1261,7 +1287,7 @@ available to make this simpler: .. ipython:: python :suppress: - df2 = df.reindex(['a', 'b', 'c'], columns=['one', 'two']) + df2 = df.reindex(["a", "b", "c"], columns=["one", "two"]) df3 = df2 - df2.mean() @@ -1288,12 +1314,12 @@ It returns a tuple with both of the reindexed Series: .. 
ipython:: python - s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) + s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"]) s1 = s[:4] s2 = s[1:] s1.align(s2) - s1.align(s2, join='inner') - s1.align(s2, join='left') + s1.align(s2, join="inner") + s1.align(s2, join="left") .. _basics.df_join: @@ -1302,13 +1328,13 @@ columns by default: .. ipython:: python - df.align(df2, join='inner') + df.align(df2, join="inner") You can also pass an ``axis`` option to only align on the specified axis: .. ipython:: python - df.align(df2, join='inner', axis=0) + df.align(df2, join="inner", axis=0) .. _basics.align.frame.series: @@ -1339,16 +1365,16 @@ We illustrate these fill methods on a simple Series: .. ipython:: python - rng = pd.date_range('1/3/2000', periods=8) + rng = pd.date_range("1/3/2000", periods=8) ts = pd.Series(np.random.randn(8), index=rng) ts2 = ts[[0, 3, 6]] ts ts2 ts2.reindex(ts.index) - ts2.reindex(ts.index, method='ffill') - ts2.reindex(ts.index, method='bfill') - ts2.reindex(ts.index, method='nearest') + ts2.reindex(ts.index, method="ffill") + ts2.reindex(ts.index, method="bfill") + ts2.reindex(ts.index, method="nearest") These methods require that the indexes are **ordered** increasing or decreasing. @@ -1359,7 +1385,7 @@ Note that the same result could have been achieved using .. ipython:: python - ts2.reindex(ts.index).fillna(method='ffill') + ts2.reindex(ts.index).fillna(method="ffill") :meth:`~Series.reindex` will raise a ValueError if the index is not monotonically increasing or decreasing. :meth:`~Series.fillna` and :meth:`~Series.interpolate` @@ -1376,14 +1402,14 @@ matches: .. ipython:: python - ts2.reindex(ts.index, method='ffill', limit=1) + ts2.reindex(ts.index, method="ffill", limit=1) In contrast, tolerance specifies the maximum distance between the index and indexer values: .. 
ipython:: python - ts2.reindex(ts.index, method='ffill', tolerance='1 day') + ts2.reindex(ts.index, method="ffill", tolerance="1 day") Notice that when used on a ``DatetimeIndex``, ``TimedeltaIndex`` or ``PeriodIndex``, ``tolerance`` will coerced into a ``Timedelta`` if possible. @@ -1400,14 +1426,14 @@ It removes a set of labels from an axis: .. ipython:: python df - df.drop(['a', 'd'], axis=0) - df.drop(['one'], axis=1) + df.drop(["a", "d"], axis=0) + df.drop(["one"], axis=1) Note that the following also works, but is a bit less obvious / clean: .. ipython:: python - df.reindex(df.index.difference(['a', 'd'])) + df.reindex(df.index.difference(["a", "d"])) .. _basics.rename: @@ -1428,8 +1454,10 @@ Series can also be used: .. ipython:: python - df.rename(columns={'one': 'foo', 'two': 'bar'}, - index={'a': 'apple', 'b': 'banana', 'd': 'durian'}) + df.rename( + columns={"one": "foo", "two": "bar"}, + index={"a": "apple", "b": "banana", "d": "durian"}, + ) If the mapping doesn't include a column/index label, it isn't renamed. Note that extra labels in the mapping don't throw an error. @@ -1439,8 +1467,8 @@ you specify a single ``mapper`` and the ``axis`` to apply that mapping to. .. ipython:: python - df.rename({'one': 'foo', 'two': 'bar'}, axis='columns') - df.rename({'a': 'apple', 'b': 'banana', 'd': 'durian'}, axis='index') + df.rename({"one": "foo", "two": "bar"}, axis="columns") + df.rename({"a": "apple", "b": "banana", "d": "durian"}, axis="index") The :meth:`~DataFrame.rename` method also provides an ``inplace`` named @@ -1464,12 +1492,12 @@ labels). .. 
ipython:: python - df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6], - 'y': [10, 20, 30, 40, 50, 60]}, - index=pd.MultiIndex.from_product([['a', 'b', 'c'], [1, 2]], - names=['let', 'num'])) + df = pd.DataFrame( + {"x": [1, 2, 3, 4, 5, 6], "y": [10, 20, 30, 40, 50, 60]}, + index=pd.MultiIndex.from_product([["a", "b", "c"], [1, 2]], names=["let", "num"]), + ) df - df.rename_axis(index={'let': 'abc'}) + df.rename_axis(index={"let": "abc"}) df.rename_axis(index=str.upper) .. _basics.iteration: @@ -1491,8 +1519,9 @@ Thus, for example, iterating over a DataFrame gives you the column names: .. ipython:: python - df = pd.DataFrame({'col1': np.random.randn(3), - 'col2': np.random.randn(3)}, index=['a', 'b', 'c']) + df = pd.DataFrame( + {"col1": np.random.randn(3), "col2": np.random.randn(3)}, index=["a", "b", "c"] + ) for col in df: print(col) @@ -1540,10 +1569,10 @@ To iterate over the rows of a DataFrame, you can use the following methods: .. ipython:: python - df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']}) + df = pd.DataFrame({"a": [1, 2, 3], "b": ["a", "b", "c"]}) for index, row in df.iterrows(): - row['a'] = 10 + row["a"] = 10 df @@ -1576,7 +1605,7 @@ index value along with a Series containing the data in each row: .. ipython:: python for row_index, row in df.iterrows(): - print(row_index, row, sep='\n') + print(row_index, row, sep="\n") .. note:: @@ -1586,7 +1615,7 @@ index value along with a Series containing the data in each row: .. ipython:: python - df_orig = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) + df_orig = pd.DataFrame([[1, 1.5]], columns=["int", "float"]) df_orig.dtypes row = next(df_orig.iterrows())[1] row @@ -1596,8 +1625,8 @@ index value along with a Series containing the data in each row: .. 
ipython:: python - row['int'].dtype - df_orig['int'].dtype + row["int"].dtype + df_orig["int"].dtype To preserve dtypes while iterating over the rows, it is better to use :meth:`~DataFrame.itertuples` which returns namedtuples of the values @@ -1607,7 +1636,7 @@ For instance, a contrived way to transpose the DataFrame would be: .. ipython:: python - df2 = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]}) + df2 = pd.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]}) print(df2) print(df2.T) @@ -1652,7 +1681,7 @@ This will return a Series, indexed like the existing Series. .. ipython:: python # datetime - s = pd.Series(pd.date_range('20130101 09:10:12', periods=4)) + s = pd.Series(pd.date_range("20130101 09:10:12", periods=4)) s s.dt.hour s.dt.second @@ -1668,7 +1697,7 @@ You can easily produces tz aware transformations: .. ipython:: python - stz = s.dt.tz_localize('US/Eastern') + stz = s.dt.tz_localize("US/Eastern") stz stz.dt.tz @@ -1676,7 +1705,7 @@ You can also chain these types of operations: .. ipython:: python - s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + s.dt.tz_localize("UTC").dt.tz_convert("US/Eastern") You can also format datetime values as strings with :meth:`Series.dt.strftime` which supports the same format as the standard :meth:`~datetime.datetime.strftime`. @@ -1684,23 +1713,23 @@ supports the same format as the standard :meth:`~datetime.datetime.strftime`. .. ipython:: python # DatetimeIndex - s = pd.Series(pd.date_range('20130101', periods=4)) + s = pd.Series(pd.date_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") .. ipython:: python # PeriodIndex - s = pd.Series(pd.period_range('20130101', periods=4)) + s = pd.Series(pd.period_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") The ``.dt`` accessor works for period and timedelta dtypes. .. 
ipython:: python # period - s = pd.Series(pd.period_range('20130101', periods=4, freq='D')) + s = pd.Series(pd.period_range("20130101", periods=4, freq="D")) s s.dt.year s.dt.day @@ -1708,7 +1737,7 @@ The ``.dt`` accessor works for period and timedelta dtypes. .. ipython:: python # timedelta - s = pd.Series(pd.timedelta_range('1 day 00:00:05', periods=4, freq='s')) + s = pd.Series(pd.timedelta_range("1 day 00:00:05", periods=4, freq="s")) s s.dt.days s.dt.seconds @@ -1729,8 +1758,9 @@ built-in string methods. For example: .. ipython:: python - s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'], - dtype="string") + s = pd.Series( + ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"], dtype="string" + ) s.str.lower() Powerful pattern-matching methods are provided as well, but note that @@ -1765,13 +1795,15 @@ used to sort a pandas object by its index levels. .. ipython:: python - df = pd.DataFrame({ - 'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']), - 'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']), - 'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])}) + df = pd.DataFrame( + { + "one": pd.Series(np.random.randn(3), index=["a", "b", "c"]), + "two": pd.Series(np.random.randn(4), index=["a", "b", "c", "d"]), + "three": pd.Series(np.random.randn(3), index=["b", "c", "d"]), + } + ) - unsorted_df = df.reindex(index=['a', 'd', 'c', 'b'], - columns=['three', 'two', 'one']) + unsorted_df = df.reindex(index=["a", "d", "c", "b"], columns=["three", "two", "one"]) unsorted_df # DataFrame @@ -1780,7 +1812,7 @@ used to sort a pandas object by its index levels. unsorted_df.sort_index(axis=1) # Series - unsorted_df['three'].sort_index() + unsorted_df["three"].sort_index() .. _basics.sort_index_key: @@ -1792,11 +1824,9 @@ the key is applied per-level to the levels specified by ``level``. .. 
ipython:: python - s1 = pd.DataFrame({ - "a": ['B', 'a', 'C'], - "b": [1, 2, 3], - "c": [2, 3, 4] - }).set_index(list("ab")) + s1 = pd.DataFrame({"a": ["B", "a", "C"], "b": [1, 2, 3], "c": [2, 3, 4]}).set_index( + list("ab") + ) s1 .. ipython:: python @@ -1819,16 +1849,14 @@ to use to determine the sorted order. .. ipython:: python - df1 = pd.DataFrame({'one': [2, 1, 1, 1], - 'two': [1, 3, 2, 4], - 'three': [5, 4, 3, 2]}) - df1.sort_values(by='two') + df1 = pd.DataFrame({"one": [2, 1, 1, 1], "two": [1, 3, 2, 4], "three": [5, 4, 3, 2]}) + df1.sort_values(by="two") The ``by`` parameter can take a list of column names, e.g.: .. ipython:: python - df1[['one', 'two', 'three']].sort_values(by=['one', 'two']) + df1[["one", "two", "three"]].sort_values(by=["one", "two"]) These methods have special treatment of NA values via the ``na_position`` argument: @@ -1837,7 +1865,7 @@ argument: s[2] = np.nan s.sort_values() - s.sort_values(na_position='first') + s.sort_values(na_position="first") .. _basics.sort_value_key: @@ -1848,7 +1876,7 @@ to apply to the values being sorted. .. ipython:: python - s1 = pd.Series(['B', 'a', 'C']) + s1 = pd.Series(["B", "a", "C"]) .. ipython:: python @@ -1862,12 +1890,12 @@ a Series, e.g. .. ipython:: python - df = pd.DataFrame({"a": ['B', 'a', 'C'], "b": [1, 2, 3]}) + df = pd.DataFrame({"a": ["B", "a", "C"], "b": [1, 2, 3]}) .. ipython:: python - df.sort_values(by='a') - df.sort_values(by='a', key=lambda col: col.str.lower()) + df.sort_values(by="a") + df.sort_values(by="a", key=lambda col: col.str.lower()) The name or type of each column can be used to apply different functions to different columns. @@ -1883,20 +1911,20 @@ refer to either columns or index level names. .. 
ipython:: python # Build MultiIndex - idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 2), - ('b', 2), ('b', 1), ('b', 1)]) - idx.names = ['first', 'second'] + idx = pd.MultiIndex.from_tuples( + [("a", 1), ("a", 2), ("a", 2), ("b", 2), ("b", 1), ("b", 1)] + ) + idx.names = ["first", "second"] # Build DataFrame - df_multi = pd.DataFrame({'A': np.arange(6, 0, -1)}, - index=idx) + df_multi = pd.DataFrame({"A": np.arange(6, 0, -1)}, index=idx) df_multi Sort by 'second' (index) and 'A' (column) .. ipython:: python - df_multi.sort_values(by=['second', 'A']) + df_multi.sort_values(by=["second", "A"]) .. note:: @@ -1917,8 +1945,8 @@ Series has the :meth:`~Series.searchsorted` method, which works similarly to ser = pd.Series([1, 2, 3]) ser.searchsorted([0, 3]) ser.searchsorted([0, 4]) - ser.searchsorted([1, 3], side='right') - ser.searchsorted([1, 3], side='left') + ser.searchsorted([1, 3], side="right") + ser.searchsorted([1, 3], side="left") ser = pd.Series([3, 1, 2]) ser.searchsorted([0, 3], sorter=np.argsort(ser)) @@ -1943,13 +1971,17 @@ faster than sorting the entire Series and calling ``head(n)`` on the result. .. ipython:: python - df = pd.DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1], - 'b': list('abdceff'), - 'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]}) - df.nlargest(3, 'a') - df.nlargest(5, ['a', 'c']) - df.nsmallest(3, 'a') - df.nsmallest(5, ['a', 'c']) + df = pd.DataFrame( + { + "a": [-2, -1, 1, 10, 8, 11, -1], + "b": list("abdceff"), + "c": [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0], + } + ) + df.nlargest(3, "a") + df.nlargest(5, ["a", "c"]) + df.nsmallest(3, "a") + df.nsmallest(5, ["a", "c"]) .. _basics.multiindex_sorting: @@ -1962,10 +1994,8 @@ all levels to ``by``. .. 
ipython:: python - df1.columns = pd.MultiIndex.from_tuples([('a', 'one'), - ('a', 'two'), - ('b', 'three')]) - df1.sort_values(by=('a', 'two')) + df1.columns = pd.MultiIndex.from_tuples([("a", "one"), ("a", "two"), ("b", "three")]) + df1.sort_values(by=("a", "two")) Copying @@ -2048,13 +2078,17 @@ with the data type of each column. .. ipython:: python - dft = pd.DataFrame({'A': np.random.rand(3), - 'B': 1, - 'C': 'foo', - 'D': pd.Timestamp('20010102'), - 'E': pd.Series([1.0] * 3).astype('float32'), - 'F': False, - 'G': pd.Series([1] * 3, dtype='int8')}) + dft = pd.DataFrame( + { + "A": np.random.rand(3), + "B": 1, + "C": "foo", + "D": pd.Timestamp("20010102"), + "E": pd.Series([1.0] * 3).astype("float32"), + "F": False, + "G": pd.Series([1] * 3, dtype="int8"), + } + ) dft dft.dtypes @@ -2062,7 +2096,7 @@ On a ``Series`` object, use the :attr:`~Series.dtype` attribute. .. ipython:: python - dft['A'].dtype + dft["A"].dtype If a pandas object contains data with multiple dtypes *in a single column*, the dtype of the column will be chosen to accommodate all of the data types @@ -2071,10 +2105,10 @@ dtype of the column will be chosen to accommodate all of the data types .. ipython:: python # these ints are coerced to floats - pd.Series([1, 2, 3, 4, 5, 6.]) + pd.Series([1, 2, 3, 4, 5, 6.0]) # string data forces an ``object`` dtype - pd.Series([1, 2, 3, 6., 'foo']) + pd.Series([1, 2, 3, 6.0, "foo"]) The number of columns of each type in a ``DataFrame`` can be found by calling ``DataFrame.dtypes.value_counts()``. @@ -2090,13 +2124,16 @@ different numeric dtypes will **NOT** be combined. The following example will gi .. 
ipython:: python - df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32') + df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32") df1 df1.dtypes - df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'), - 'B': pd.Series(np.random.randn(8)), - 'C': pd.Series(np.array(np.random.randn(8), - dtype='uint8'))}) + df2 = pd.DataFrame( + { + "A": pd.Series(np.random.randn(8), dtype="float16"), + "B": pd.Series(np.random.randn(8)), + "C": pd.Series(np.array(np.random.randn(8), dtype="uint8")), + } + ) df2 df2.dtypes @@ -2109,9 +2146,9 @@ The following will all result in ``int64`` dtypes. .. ipython:: python - pd.DataFrame([1, 2], columns=['a']).dtypes - pd.DataFrame({'a': [1, 2]}).dtypes - pd.DataFrame({'a': 1}, index=list(range(2))).dtypes + pd.DataFrame([1, 2], columns=["a"]).dtypes + pd.DataFrame({"a": [1, 2]}).dtypes + pd.DataFrame({"a": 1}, index=list(range(2))).dtypes Note that Numpy will choose *platform-dependent* types when creating arrays. The following **WILL** result in ``int32`` on 32-bit platform. @@ -2159,15 +2196,15 @@ then the more *general* one will be used as the result of the operation. df3.dtypes # conversion of dtypes - df3.astype('float32').dtypes + df3.astype("float32").dtypes Convert a subset of columns to a specified type using :meth:`~DataFrame.astype`. .. ipython:: python - dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - dft[['a', 'b']] = dft[['a', 'b']].astype(np.uint8) + dft = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + dft[["a", "b"]] = dft[["a", "b"]].astype(np.uint8) dft dft.dtypes @@ -2175,8 +2212,8 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. 
ipython:: python - dft1 = pd.DataFrame({'a': [1, 0, 1], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - dft1 = dft1.astype({'a': np.bool, 'c': np.float64}) + dft1 = pd.DataFrame({"a": [1, 0, 1], "b": [4, 5, 6], "c": [7, 8, 9]}) + dft1 = dft1.astype({"a": np.bool, "c": np.float64}) dft1 dft1.dtypes @@ -2188,9 +2225,9 @@ Convert certain columns to a specific dtype by passing a dict to :meth:`~DataFra .. ipython:: python - dft = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}) - dft.loc[:, ['a', 'b']].astype(np.uint8).dtypes - dft.loc[:, ['a', 'b']] = dft.loc[:, ['a', 'b']].astype(np.uint8) + dft = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + dft.loc[:, ["a", "b"]].astype(np.uint8).dtypes + dft.loc[:, ["a", "b"]] = dft.loc[:, ["a", "b"]].astype(np.uint8) dft.dtypes .. _basics.object_conversion: @@ -2206,10 +2243,10 @@ to the correct type. .. ipython:: python import datetime - df = pd.DataFrame([[1, 2], - ['a', 'b'], - [datetime.datetime(2016, 3, 2), - datetime.datetime(2016, 3, 2)]]) + + df = pd.DataFrame( + [[1, 2], ["a", "b"], [datetime.datetime(2016, 3, 2), datetime.datetime(2016, 3, 2)]] + ) df = df.T df df.dtypes @@ -2228,7 +2265,7 @@ hard conversion of objects to a specified type: .. ipython:: python - m = ['1.1', 2, 3] + m = ["1.1", 2, 3] pd.to_numeric(m) * :meth:`~pandas.to_datetime` (conversion to datetime objects) @@ -2236,14 +2273,15 @@ hard conversion of objects to a specified type: .. ipython:: python import datetime - m = ['2016-07-09', datetime.datetime(2016, 3, 2)] + + m = ["2016-07-09", datetime.datetime(2016, 3, 2)] pd.to_datetime(m) * :meth:`~pandas.to_timedelta` (conversion to timedelta objects) .. ipython:: python - m = ['5us', pd.Timedelta('1day')] + m = ["5us", pd.Timedelta("1day")] pd.to_timedelta(m) To force a conversion, we can pass in an ``errors`` argument, which specifies how pandas should deal with elements @@ -2256,14 +2294,15 @@ non-conforming elements intermixed that you want to represent as missing: .. 
ipython:: python import datetime - m = ['apple', datetime.datetime(2016, 3, 2)] - pd.to_datetime(m, errors='coerce') - m = ['apple', 2, 3] - pd.to_numeric(m, errors='coerce') + m = ["apple", datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors="coerce") - m = ['apple', pd.Timedelta('1day')] - pd.to_timedelta(m, errors='coerce') + m = ["apple", 2, 3] + pd.to_numeric(m, errors="coerce") + + m = ["apple", pd.Timedelta("1day")] + pd.to_timedelta(m, errors="coerce") The ``errors`` parameter has a third option of ``errors='ignore'``, which will simply return the passed in data if it encounters any errors with the conversion to a desired data type: @@ -2271,25 +2310,26 @@ encounters any errors with the conversion to a desired data type: .. ipython:: python import datetime - m = ['apple', datetime.datetime(2016, 3, 2)] - pd.to_datetime(m, errors='ignore') - m = ['apple', 2, 3] - pd.to_numeric(m, errors='ignore') + m = ["apple", datetime.datetime(2016, 3, 2)] + pd.to_datetime(m, errors="ignore") + + m = ["apple", 2, 3] + pd.to_numeric(m, errors="ignore") - m = ['apple', pd.Timedelta('1day')] - pd.to_timedelta(m, errors='ignore') + m = ["apple", pd.Timedelta("1day")] + pd.to_timedelta(m, errors="ignore") In addition to object conversion, :meth:`~pandas.to_numeric` provides another argument ``downcast``, which gives the option of downcasting the newly (or already) numeric data to a smaller dtype, which can conserve memory: .. 
ipython:: python - m = ['1', 2, 3] - pd.to_numeric(m, downcast='integer') # smallest signed int dtype - pd.to_numeric(m, downcast='signed') # same as 'integer' - pd.to_numeric(m, downcast='unsigned') # smallest unsigned int dtype - pd.to_numeric(m, downcast='float') # smallest float dtype + m = ["1", 2, 3] + pd.to_numeric(m, downcast="integer") # smallest signed int dtype + pd.to_numeric(m, downcast="signed") # same as 'integer' + pd.to_numeric(m, downcast="unsigned") # smallest unsigned int dtype + pd.to_numeric(m, downcast="float") # smallest float dtype As these methods apply only to one-dimensional arrays, lists or scalars; they cannot be used directly on multi-dimensional objects such as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the function over each column efficiently: @@ -2297,16 +2337,16 @@ as DataFrames. However, with :meth:`~pandas.DataFrame.apply`, we can "apply" the .. ipython:: python import datetime - df = pd.DataFrame([ - ['2016-07-09', datetime.datetime(2016, 3, 2)]] * 2, dtype='O') + + df = pd.DataFrame([["2016-07-09", datetime.datetime(2016, 3, 2)]] * 2, dtype="O") df df.apply(pd.to_datetime) - df = pd.DataFrame([['1.1', 2, 3]] * 2, dtype='O') + df = pd.DataFrame([["1.1", 2, 3]] * 2, dtype="O") df df.apply(pd.to_numeric) - df = pd.DataFrame([['5us', pd.Timedelta('1day')]] * 2, dtype='O') + df = pd.DataFrame([["5us", pd.Timedelta("1day")]] * 2, dtype="O") df df.apply(pd.to_timedelta) @@ -2319,8 +2359,8 @@ See also :ref:`Support for integer NA `. .. ipython:: python - dfi = df3.astype('int32') - dfi['E'] = 1 + dfi = df3.astype("int32") + dfi["E"] = 1 dfi dfi.dtypes @@ -2333,7 +2373,7 @@ While float dtypes are unchanged. .. ipython:: python dfa = df3.copy() - dfa['A'] = dfa['A'].astype('float32') + dfa["A"] = dfa["A"].astype("float32") dfa.dtypes casted = dfa[df2 > 0] @@ -2353,18 +2393,22 @@ dtypes: .. 
ipython:: python - df = pd.DataFrame({'string': list('abc'), - 'int64': list(range(1, 4)), - 'uint8': np.arange(3, 6).astype('u1'), - 'float64': np.arange(4.0, 7.0), - 'bool1': [True, False, True], - 'bool2': [False, True, False], - 'dates': pd.date_range('now', periods=3), - 'category': pd.Series(list("ABC")).astype('category')}) - df['tdeltas'] = df.dates.diff() - df['uint64'] = np.arange(3, 6).astype('u8') - df['other_dates'] = pd.date_range('20130101', periods=3) - df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern') + df = pd.DataFrame( + { + "string": list("abc"), + "int64": list(range(1, 4)), + "uint8": np.arange(3, 6).astype("u1"), + "float64": np.arange(4.0, 7.0), + "bool1": [True, False, True], + "bool2": [False, True, False], + "dates": pd.date_range("now", periods=3), + "category": pd.Series(list("ABC")).astype("category"), + } + ) + df["tdeltas"] = df.dates.diff() + df["uint64"] = np.arange(3, 6).astype("u8") + df["other_dates"] = pd.date_range("20130101", periods=3) + df["tz_aware_dates"] = pd.date_range("20130101", periods=3, tz="US/Eastern") df And the dtypes: @@ -2388,7 +2432,7 @@ You can also pass the name of a dtype in the `NumPy dtype hierarchy .. ipython:: python - df.select_dtypes(include=['bool']) + df.select_dtypes(include=["bool"]) :meth:`~pandas.DataFrame.select_dtypes` also works with generic dtypes as well. @@ -2397,13 +2441,13 @@ integers: .. ipython:: python - df.select_dtypes(include=['number', 'bool'], exclude=['unsignedinteger']) + df.select_dtypes(include=["number", "bool"], exclude=["unsignedinteger"]) To select string columns you must use the ``object`` dtype: .. 
ipython:: python - df.select_dtypes(include=['object']) + df.select_dtypes(include=["object"]) To see all the child dtypes of a generic ``dtype`` like ``numpy.number`` you can define a function that returns a tree of child dtypes: diff --git a/setup.cfg b/setup.cfg index 3279a485c9bf3..a7c0f3484517f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,6 +42,7 @@ bootstrap = ignore = E203, # space before : (needed for how black formats slicing) E402, # module level import not at top of file W503, # line break before binary operator + E203, # space before : (needed for how black formats slicing) # Classes/functions in different blocks can generate those errors E302, # expected 2 blank lines, found 0 E305, # expected 2 blank lines after class or function definition, found 0 From cd4a558f96ce4be5fda39ec42d818fc694533094 Mon Sep 17 00:00:00 2001 From: beanan Date: Mon, 5 Oct 2020 16:05:51 +0800 Subject: [PATCH 1016/1025] CLN: Remove the duplicate configuration of flake8-rst in setup.cfg (#36877) --- setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index a7c0f3484517f..3279a485c9bf3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,7 +42,6 @@ bootstrap = ignore = E203, # space before : (needed for how black formats slicing) E402, # module level import not at top of file W503, # line break before binary operator - E203, # space before : (needed for how black formats slicing) # Classes/functions in different blocks can generate those errors E302, # expected 2 blank lines, found 0 E305, # expected 2 blank lines after class or function definition, found 0 From 51bf61617f92b7c1f1397bca0f98052b203ca70d Mon Sep 17 00:00:00 2001 From: "T. 
JEGHAM" <41241424+Tazminia@users.noreply.github.com> Date: Mon, 5 Oct 2020 12:15:10 +0200 Subject: [PATCH 1017/1025] upgrade flake8 to 3.8.4 (#36882) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d0c9f12614d0d..6a311c6f702e8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: black - repo: https://gitlab.com/pycqa/flake8 - rev: 3.8.3 + rev: 3.8.4 hooks: - id: flake8 additional_dependencies: [flake8-comprehensions>=3.1.0] From c2a40bf38a430495c5607ffd9425212b03144d14 Mon Sep 17 00:00:00 2001 From: Meghana Varanasi Date: Mon, 5 Oct 2020 18:28:09 +0530 Subject: [PATCH 1018/1025] DOC: doc/source/whatsnew (#36857) --- doc/source/whatsnew/v0.10.0.rst | 29 ++-- doc/source/whatsnew/v0.10.1.rst | 64 ++++---- doc/source/whatsnew/v0.12.0.rst | 50 +++--- doc/source/whatsnew/v0.13.1.rst | 53 +++---- doc/source/whatsnew/v0.14.1.rst | 10 +- doc/source/whatsnew/v0.15.1.rst | 17 ++- doc/source/whatsnew/v0.16.1.rst | 58 ++++--- doc/source/whatsnew/v0.16.2.rst | 22 +-- doc/source/whatsnew/v0.17.0.rst | 156 +++++++++---------- doc/source/whatsnew/v0.17.1.rst | 14 +- doc/source/whatsnew/v0.18.1.rst | 88 ++++++----- doc/source/whatsnew/v0.19.0.rst | 261 +++++++++++++++++--------------- doc/source/whatsnew/v0.19.1.rst | 2 +- doc/source/whatsnew/v0.19.2.rst | 2 +- doc/source/whatsnew/v0.20.2.rst | 2 +- doc/source/whatsnew/v0.20.3.rst | 2 +- doc/source/whatsnew/v0.21.1.rst | 2 +- doc/source/whatsnew/v0.22.0.rst | 13 +- doc/source/whatsnew/v0.5.0.rst | 2 +- doc/source/whatsnew/v0.6.0.rst | 2 +- doc/source/whatsnew/v0.7.3.rst | 32 ++-- doc/source/whatsnew/v0.8.0.rst | 17 ++- doc/source/whatsnew/v0.9.0.rst | 10 +- 23 files changed, 472 insertions(+), 436 deletions(-) diff --git a/doc/source/whatsnew/v0.10.0.rst b/doc/source/whatsnew/v0.10.0.rst index 443250592a4a7..aa2749c85a232 100644 --- a/doc/source/whatsnew/v0.10.0.rst +++ 
b/doc/source/whatsnew/v0.10.0.rst @@ -49,8 +49,8 @@ talking about: :okwarning: import pandas as pd - df = pd.DataFrame(np.random.randn(6, 4), - index=pd.date_range('1/1/2000', periods=6)) + + df = pd.DataFrame(np.random.randn(6, 4), index=pd.date_range("1/1/2000", periods=6)) df # deprecated now df - df[0] @@ -184,12 +184,14 @@ labeled the aggregated group with the end of the interval: the next day). import io - data = ('a,b,c\n' - '1,Yes,2\n' - '3,No,4') + data = """ + a,b,c + 1,Yes,2 + 3,No,4 + """ print(data) pd.read_csv(io.StringIO(data), header=None) - pd.read_csv(io.StringIO(data), header=None, prefix='X') + pd.read_csv(io.StringIO(data), header=None, prefix="X") - Values like ``'Yes'`` and ``'No'`` are not interpreted as boolean by default, though this can be controlled by new ``true_values`` and ``false_values`` @@ -199,7 +201,7 @@ labeled the aggregated group with the end of the interval: the next day). print(data) pd.read_csv(io.StringIO(data)) - pd.read_csv(io.StringIO(data), true_values=['Yes'], false_values=['No']) + pd.read_csv(io.StringIO(data), true_values=["Yes"], false_values=["No"]) - The file parsers will not recognize non-string values arising from a converter function as NA if passed in the ``na_values`` argument. It's better @@ -210,10 +212,10 @@ labeled the aggregated group with the end of the interval: the next day). .. ipython:: python - s = pd.Series([np.nan, 1., 2., np.nan, 4]) + s = pd.Series([np.nan, 1.0, 2.0, np.nan, 4]) s s.fillna(0) - s.fillna(method='pad') + s.fillna(method="pad") Convenience methods ``ffill`` and ``bfill`` have been added: @@ -229,7 +231,8 @@ Convenience methods ``ffill`` and ``bfill`` have been added: .. ipython:: python def f(x): - return pd.Series([x, x**2], index=['x', 'x^2']) + return pd.Series([x, x ** 2], index=["x", "x^2"]) + s = pd.Series(np.random.rand(5)) s @@ -272,20 +275,20 @@ The old behavior of printing out summary information can be achieved via the .. 
ipython:: python - pd.set_option('expand_frame_repr', False) + pd.set_option("expand_frame_repr", False) wide_frame .. ipython:: python :suppress: - pd.reset_option('expand_frame_repr') + pd.reset_option("expand_frame_repr") The width of each line can be changed via 'line_width' (80 by default): .. code-block:: python - pd.set_option('line_width', 40) + pd.set_option("line_width", 40) wide_frame diff --git a/doc/source/whatsnew/v0.10.1.rst b/doc/source/whatsnew/v0.10.1.rst index 3dc680c46a4d9..d71a0d5ca68cd 100644 --- a/doc/source/whatsnew/v0.10.1.rst +++ b/doc/source/whatsnew/v0.10.1.rst @@ -45,29 +45,31 @@ You may need to upgrade your existing data files. Please visit the import os - os.remove('store.h5') + os.remove("store.h5") You can designate (and index) certain columns that you want to be able to perform queries on a table, by passing a list to ``data_columns`` .. ipython:: python - store = pd.HDFStore('store.h5') - df = pd.DataFrame(np.random.randn(8, 3), - index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C']) - df['string'] = 'foo' - df.loc[df.index[4:6], 'string'] = np.nan - df.loc[df.index[7:9], 'string'] = 'bar' - df['string2'] = 'cool' + store = pd.HDFStore("store.h5") + df = pd.DataFrame( + np.random.randn(8, 3), + index=pd.date_range("1/1/2000", periods=8), + columns=["A", "B", "C"], + ) + df["string"] = "foo" + df.loc[df.index[4:6], "string"] = np.nan + df.loc[df.index[7:9], "string"] = "bar" + df["string2"] = "cool" df # on-disk operations - store.append('df', df, data_columns=['B', 'C', 'string', 'string2']) - store.select('df', "B>0 and string=='foo'") + store.append("df", df, data_columns=["B", "C", "string", "string2"]) + store.select("df", "B>0 and string=='foo'") # this is in-memory version of this type of selection - df[(df.B > 0) & (df.string == 'foo')] + df[(df.B > 0) & (df.string == "foo")] Retrieving unique values in an indexable or data column. @@ -75,19 +77,19 @@ Retrieving unique values in an indexable or data column. 
# note that this is deprecated as of 0.14.0 # can be replicated by: store.select_column('df','index').unique() - store.unique('df', 'index') - store.unique('df', 'string') + store.unique("df", "index") + store.unique("df", "string") You can now store ``datetime64`` in data columns .. ipython:: python df_mixed = df.copy() - df_mixed['datetime64'] = pd.Timestamp('20010102') - df_mixed.loc[df_mixed.index[3:4], ['A', 'B']] = np.nan + df_mixed["datetime64"] = pd.Timestamp("20010102") + df_mixed.loc[df_mixed.index[3:4], ["A", "B"]] = np.nan - store.append('df_mixed', df_mixed) - df_mixed1 = store.select('df_mixed') + store.append("df_mixed", df_mixed) + df_mixed1 = store.select("df_mixed") df_mixed1 df_mixed1.dtypes.value_counts() @@ -97,7 +99,7 @@ columns, this is equivalent to passing a .. ipython:: python - store.select('df', columns=['A', 'B']) + store.select("df", columns=["A", "B"]) ``HDFStore`` now serializes MultiIndex dataframes when appending tables. @@ -160,29 +162,31 @@ combined result, by using ``where`` on a selector table. .. ipython:: python - df_mt = pd.DataFrame(np.random.randn(8, 6), - index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) - df_mt['foo'] = 'bar' + df_mt = pd.DataFrame( + np.random.randn(8, 6), + index=pd.date_range("1/1/2000", periods=8), + columns=["A", "B", "C", "D", "E", "F"], + ) + df_mt["foo"] = "bar" # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, - df_mt, selector='df1_mt') + store.append_to_multiple( + {"df1_mt": ["A", "B"], "df2_mt": None}, df_mt, selector="df1_mt" + ) store # individual tables were created - store.select('df1_mt') - store.select('df2_mt') + store.select("df1_mt") + store.select("df2_mt") # as a multiple - store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector='df1_mt') + store.select_as_multiple(["df1_mt", "df2_mt"], where=["A>0", "B>0"], selector="df1_mt") .. 
ipython:: python :suppress: store.close() - os.remove('store.h5') + os.remove("store.h5") **Enhancements** diff --git a/doc/source/whatsnew/v0.12.0.rst b/doc/source/whatsnew/v0.12.0.rst index 9971ae22822f6..4de76510c6bc1 100644 --- a/doc/source/whatsnew/v0.12.0.rst +++ b/doc/source/whatsnew/v0.12.0.rst @@ -47,7 +47,7 @@ API changes .. ipython:: python - p = pd.DataFrame({'first': [4, 5, 8], 'second': [0, 0, 3]}) + p = pd.DataFrame({"first": [4, 5, 8], "second": [0, 0, 3]}) p % 0 p % p p / p @@ -95,8 +95,8 @@ API changes .. ipython:: python - df = pd.DataFrame(range(5), index=list('ABCDE'), columns=['a']) - mask = (df.a % 2 == 0) + df = pd.DataFrame(range(5), index=list("ABCDE"), columns=["a"]) + mask = df.a % 2 == 0 mask # this is what you should use @@ -141,21 +141,24 @@ API changes .. code-block:: python from pandas.io.parsers import ExcelFile - xls = ExcelFile('path_to_file.xls') - xls.parse('Sheet1', index_col=None, na_values=['NA']) + + xls = ExcelFile("path_to_file.xls") + xls.parse("Sheet1", index_col=None, na_values=["NA"]) With .. code-block:: python import pandas as pd - pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + + pd.read_excel("path_to_file.xls", "Sheet1", index_col=None, na_values=["NA"]) - added top-level function ``read_sql`` that is equivalent to the following .. code-block:: python from pandas.io.sql import read_frame + read_frame(...) - ``DataFrame.to_html`` and ``DataFrame.to_latex`` now accept a path for @@ -200,7 +203,7 @@ IO enhancements .. ipython:: python :okwarning: - df = pd.DataFrame({'a': range(3), 'b': list('abc')}) + df = pd.DataFrame({"a": range(3), "b": list("abc")}) print(df) html = df.to_html() alist = pd.read_html(html, index_col=0) @@ -248,16 +251,18 @@ IO enhancements .. 
ipython:: python from pandas._testing import makeCustomDataframe as mkdf + df = mkdf(5, 3, r_idx_nlevels=2, c_idx_nlevels=4) - df.to_csv('mi.csv') - print(open('mi.csv').read()) - pd.read_csv('mi.csv', header=[0, 1, 2, 3], index_col=[0, 1]) + df.to_csv("mi.csv") + print(open("mi.csv").read()) + pd.read_csv("mi.csv", header=[0, 1, 2, 3], index_col=[0, 1]) .. ipython:: python :suppress: import os - os.remove('mi.csv') + + os.remove("mi.csv") - Support for ``HDFStore`` (via ``PyTables 3.0.0``) on Python3 @@ -304,8 +309,8 @@ Other enhancements .. ipython:: python - df = pd.DataFrame({'a': list('ab..'), 'b': [1, 2, 3, 4]}) - df.replace(regex=r'\s*\.\s*', value=np.nan) + df = pd.DataFrame({"a": list("ab.."), "b": [1, 2, 3, 4]}) + df.replace(regex=r"\s*\.\s*", value=np.nan) to replace all occurrences of the string ``'.'`` with zero or more instances of surrounding white space with ``NaN``. @@ -314,7 +319,7 @@ Other enhancements .. ipython:: python - df.replace('.', np.nan) + df.replace(".", np.nan) to replace all occurrences of the string ``'.'`` with ``NaN``. @@ -359,8 +364,8 @@ Other enhancements .. ipython:: python - dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')}) - dff.groupby('B').filter(lambda x: len(x) > 2) + dff = pd.DataFrame({"A": np.arange(8), "B": list("aabbbbcc")}) + dff.groupby("B").filter(lambda x: len(x) > 2) Alternatively, instead of dropping the offending groups, we can return a like-indexed objects where the groups that do not pass the filter are @@ -368,7 +373,7 @@ Other enhancements .. ipython:: python - dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False) + dff.groupby("B").filter(lambda x: len(x) > 2, dropna=False) - Series and DataFrame hist methods now take a ``figsize`` argument (:issue:`3834`) @@ -397,17 +402,18 @@ Experimental features from pandas.tseries.offsets import CustomBusinessDay from datetime import datetime + # As an interesting example, let's look at Egypt where # a Friday-Saturday weekend is observed. 
- weekmask_egypt = 'Sun Mon Tue Wed Thu' + weekmask_egypt = "Sun Mon Tue Wed Thu" # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ['2012-05-01', datetime(2013, 5, 1), np.datetime64('2014-05-01')] + holidays = ["2012-05-01", datetime(2013, 5, 1), np.datetime64("2014-05-01")] bday_egypt = CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) dt = datetime(2013, 4, 30) print(dt + 2 * bday_egypt) dts = pd.date_range(dt, periods=5, freq=bday_egypt) - print(pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split()))) + print(pd.Series(dts.weekday, dts).map(pd.Series("Mon Tue Wed Thu Fri Sat Sun".split()))) Bug fixes ~~~~~~~~~ @@ -430,14 +436,14 @@ Bug fixes .. ipython:: python :okwarning: - strs = 'go', 'bow', 'joe', 'slow' + strs = "go", "bow", "joe", "slow" ds = pd.Series(strs) for s in ds.str: print(s) s - s.dropna().values.item() == 'w' + s.dropna().values.item() == "w" The last element yielded by the iterator will be a ``Series`` containing the last element of the longest string in the ``Series`` with all other diff --git a/doc/source/whatsnew/v0.13.1.rst b/doc/source/whatsnew/v0.13.1.rst index 9e416f8eeb3f1..1215786b4cccc 100644 --- a/doc/source/whatsnew/v0.13.1.rst +++ b/doc/source/whatsnew/v0.13.1.rst @@ -31,16 +31,16 @@ Highlights include: .. ipython:: python - df = pd.DataFrame({'A': np.array(['foo', 'bar', 'bah', 'foo', 'bar'])}) - df['A'].iloc[0] = np.nan + df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) + df["A"].iloc[0] = np.nan df The recommended way to do this type of assignment is: .. ipython:: python - df = pd.DataFrame({'A': np.array(['foo', 'bar', 'bah', 'foo', 'bar'])}) - df.loc[0, 'A'] = np.nan + df = pd.DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) + df.loc[0, "A"] = np.nan df Output formatting enhancements @@ -52,24 +52,27 @@ Output formatting enhancements .. 
ipython:: python - max_info_rows = pd.get_option('max_info_rows') + max_info_rows = pd.get_option("max_info_rows") - df = pd.DataFrame({'A': np.random.randn(10), - 'B': np.random.randn(10), - 'C': pd.date_range('20130101', periods=10) - }) + df = pd.DataFrame( + { + "A": np.random.randn(10), + "B": np.random.randn(10), + "C": pd.date_range("20130101", periods=10), + } + ) df.iloc[3:6, [0, 2]] = np.nan .. ipython:: python # set to not display the null counts - pd.set_option('max_info_rows', 0) + pd.set_option("max_info_rows", 0) df.info() .. ipython:: python # this is the default (same as in 0.13.0) - pd.set_option('max_info_rows', max_info_rows) + pd.set_option("max_info_rows", max_info_rows) df.info() - Add ``show_dimensions`` display option for the new DataFrame repr to control whether the dimensions print. @@ -77,10 +80,10 @@ Output formatting enhancements .. ipython:: python df = pd.DataFrame([[1, 2], [3, 4]]) - pd.set_option('show_dimensions', False) + pd.set_option("show_dimensions", False) df - pd.set_option('show_dimensions', True) + pd.set_option("show_dimensions", True) df - The ``ArrayFormatter`` for ``datetime`` and ``timedelta64`` now intelligently @@ -98,10 +101,9 @@ Output formatting enhancements .. ipython:: python - df = pd.DataFrame([pd.Timestamp('20010101'), - pd.Timestamp('20040601')], columns=['age']) - df['today'] = pd.Timestamp('20130419') - df['diff'] = df['today'] - df['age'] + df = pd.DataFrame([pd.Timestamp("20010101"), pd.Timestamp("20040601")], columns=["age"]) + df["today"] = pd.Timestamp("20130419") + df["diff"] = df["today"] - df["age"] df API changes @@ -115,8 +117,8 @@ API changes .. ipython:: python - s = pd.Series(['a', 'a|b', np.nan, 'a|c']) - s.str.get_dummies(sep='|') + s = pd.Series(["a", "a|b", np.nan, "a|c"]) + s.str.get_dummies(sep="|") - Added the ``NDFrame.equals()`` method to compare if two NDFrames are equal have equal axes, dtypes, and values. Added the @@ -126,8 +128,8 @@ API changes .. 
code-block:: python - df = pd.DataFrame({'col': ['foo', 0, np.nan]}) - df2 = pd.DataFrame({'col': [np.nan, 0, 'foo']}, index=[2, 1, 0]) + df = pd.DataFrame({"col": ["foo", 0, np.nan]}) + df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0]) df.equals(df2) df.equals(df2.sort_index()) @@ -204,8 +206,7 @@ Enhancements .. code-block:: python # Try to infer the format for the index column - df = pd.read_csv('foo.csv', index_col=0, parse_dates=True, - infer_datetime_format=True) + df = pd.read_csv("foo.csv", index_col=0, parse_dates=True, infer_datetime_format=True) - ``date_format`` and ``datetime_format`` keywords can now be specified when writing to ``excel`` files (:issue:`4133`) @@ -215,10 +216,10 @@ Enhancements .. ipython:: python - shades = ['light', 'dark'] - colors = ['red', 'green', 'blue'] + shades = ["light", "dark"] + colors = ["red", "green", "blue"] - pd.MultiIndex.from_product([shades, colors], names=['shade', 'color']) + pd.MultiIndex.from_product([shades, colors], names=["shade", "color"]) - Panel :meth:`~pandas.Panel.apply` will work on non-ufuncs. See :ref:`the docs`. diff --git a/doc/source/whatsnew/v0.14.1.rst b/doc/source/whatsnew/v0.14.1.rst index 354d67a525d0e..78fd182ea86c3 100644 --- a/doc/source/whatsnew/v0.14.1.rst +++ b/doc/source/whatsnew/v0.14.1.rst @@ -68,7 +68,8 @@ API changes :suppress: import pandas.tseries.offsets as offsets - d = pd.Timestamp('2014-01-01 09:00') + + d = pd.Timestamp("2014-01-01 09:00") .. ipython:: python @@ -100,10 +101,10 @@ Enhancements import pandas.tseries.offsets as offsets day = offsets.Day() - day.apply(pd.Timestamp('2014-01-01 09:00')) + day.apply(pd.Timestamp("2014-01-01 09:00")) day = offsets.Day(normalize=True) - day.apply(pd.Timestamp('2014-01-01 09:00')) + day.apply(pd.Timestamp("2014-01-01 09:00")) - ``PeriodIndex`` is represented as the same format as ``DatetimeIndex`` (:issue:`7601`) - ``StringMethods`` now work on empty Series (:issue:`7242`) @@ -123,8 +124,7 @@ Enhancements .. 
ipython:: python - rng = pd.date_range('3/6/2012 00:00', periods=10, freq='D', - tz='dateutil/Europe/London') + rng = pd.date_range("3/6/2012 00:00", periods=10, freq="D", tz="dateutil/Europe/London") rng.tz See :ref:`the docs `. diff --git a/doc/source/whatsnew/v0.15.1.rst b/doc/source/whatsnew/v0.15.1.rst index da56f07e84d9f..a1d4f9d14a905 100644 --- a/doc/source/whatsnew/v0.15.1.rst +++ b/doc/source/whatsnew/v0.15.1.rst @@ -23,7 +23,7 @@ API changes .. ipython:: python - s = pd.Series(pd.date_range('20130101', periods=5, freq='D')) + s = pd.Series(pd.date_range("20130101", periods=5, freq="D")) s.iloc[2] = np.nan s @@ -52,8 +52,7 @@ API changes .. ipython:: python np.random.seed(2718281) - df = pd.DataFrame(np.random.randint(0, 100, (10, 2)), - columns=['jim', 'joe']) + df = pd.DataFrame(np.random.randint(0, 100, (10, 2)), columns=["jim", "joe"]) df.head() ts = pd.Series(5 * np.random.randint(0, 3, 10)) @@ -80,9 +79,9 @@ API changes .. ipython:: python - df = pd.DataFrame({'jim': range(5), 'joe': range(5, 10)}) + df = pd.DataFrame({"jim": range(5), "joe": range(5, 10)}) df - gr = df.groupby(df['jim'] < 2) + gr = df.groupby(df["jim"] < 2) previous behavior (excludes 1st column from output): @@ -106,7 +105,7 @@ API changes .. ipython:: python - s = pd.Series(['a', 'b', 'c', 'd'], [4, 3, 2, 1]) + s = pd.Series(["a", "b", "c", "d"], [4, 3, 2, 1]) s previous behavior: @@ -208,6 +207,7 @@ Enhancements .. ipython:: python from collections import deque + df1 = pd.DataFrame([1, 2, 3]) df2 = pd.DataFrame([4, 5, 6]) @@ -228,8 +228,9 @@ Enhancements .. 
ipython:: python - dfi = pd.DataFrame(1, index=pd.MultiIndex.from_product([['a'], - range(1000)]), columns=['A']) + dfi = pd.DataFrame( + 1, index=pd.MultiIndex.from_product([["a"], range(1000)]), columns=["A"] + ) previous behavior: diff --git a/doc/source/whatsnew/v0.16.1.rst b/doc/source/whatsnew/v0.16.1.rst index a89ede8f024a0..39767684c01d0 100644 --- a/doc/source/whatsnew/v0.16.1.rst +++ b/doc/source/whatsnew/v0.16.1.rst @@ -209,9 +209,8 @@ when sampling from rows. .. ipython:: python - df = pd.DataFrame({'col1': [9, 8, 7, 6], - 'weight_column': [0.5, 0.4, 0.1, 0]}) - df.sample(n=3, weights='weight_column') + df = pd.DataFrame({"col1": [9, 8, 7, 6], "weight_column": [0.5, 0.4, 0.1, 0]}) + df.sample(n=3, weights="weight_column") .. _whatsnew_0161.enhancements.string: @@ -229,7 +228,7 @@ enhancements make string operations easier and more consistent with standard pyt .. ipython:: python - idx = pd.Index([' jack', 'jill ', ' jesse ', 'frank']) + idx = pd.Index([" jack", "jill ", " jesse ", "frank"]) idx.str.strip() One special case for the ``.str`` accessor on ``Index`` is that if a string method returns ``bool``, the ``.str`` accessor @@ -238,11 +237,11 @@ enhancements make string operations easier and more consistent with standard pyt .. ipython:: python - idx = pd.Index(['a1', 'a2', 'b1', 'b2']) + idx = pd.Index(["a1", "a2", "b1", "b2"]) s = pd.Series(range(4), index=idx) s - idx.str.startswith('a') - s[s.index.str.startswith('a')] + idx.str.startswith("a") + s[s.index.str.startswith("a")] - The following new methods are accessible via ``.str`` accessor to apply the function to each values. (:issue:`9766`, :issue:`9773`, :issue:`10031`, :issue:`10045`, :issue:`10052`) @@ -257,21 +256,21 @@ enhancements make string operations easier and more consistent with standard pyt .. 
ipython:: python - s = pd.Series(['a,b', 'a,c', 'b,c']) + s = pd.Series(["a,b", "a,c", "b,c"]) # return Series - s.str.split(',') + s.str.split(",") # return DataFrame - s.str.split(',', expand=True) + s.str.split(",", expand=True) - idx = pd.Index(['a,b', 'a,c', 'b,c']) + idx = pd.Index(["a,b", "a,c", "b,c"]) # return Index - idx.str.split(',') + idx.str.split(",") # return MultiIndex - idx.str.split(',', expand=True) + idx.str.split(",", expand=True) - Improved ``extract`` and ``get_dummies`` methods for ``Index.str`` (:issue:`9980`) @@ -286,9 +285,9 @@ Other enhancements .. ipython:: python - pd.Timestamp('2014-08-01 09:00') + pd.tseries.offsets.BusinessHour() - pd.Timestamp('2014-08-01 07:00') + pd.tseries.offsets.BusinessHour() - pd.Timestamp('2014-08-01 16:30') + pd.tseries.offsets.BusinessHour() + pd.Timestamp("2014-08-01 09:00") + pd.tseries.offsets.BusinessHour() + pd.Timestamp("2014-08-01 07:00") + pd.tseries.offsets.BusinessHour() + pd.Timestamp("2014-08-01 16:30") + pd.tseries.offsets.BusinessHour() - ``DataFrame.diff`` now takes an ``axis`` parameter that determines the direction of differencing (:issue:`9727`) @@ -300,8 +299,8 @@ Other enhancements .. ipython:: python - df = pd.DataFrame(np.random.randn(3, 3), columns=['A', 'B', 'C']) - df.drop(['A', 'X'], axis=1, errors='ignore') + df = pd.DataFrame(np.random.randn(3, 3), columns=["A", "B", "C"]) + df.drop(["A", "X"], axis=1, errors="ignore") - Add support for separating years and quarters using dashes, for example 2014-Q1. (:issue:`9688`) @@ -382,19 +381,16 @@ New behavior .. 
ipython:: python - pd.set_option('display.width', 80) - pd.Index(range(4), name='foo') - pd.Index(range(30), name='foo') - pd.Index(range(104), name='foo') - pd.CategoricalIndex(['a', 'bb', 'ccc', 'dddd'], - ordered=True, name='foobar') - pd.CategoricalIndex(['a', 'bb', 'ccc', 'dddd'] * 10, - ordered=True, name='foobar') - pd.CategoricalIndex(['a', 'bb', 'ccc', 'dddd'] * 100, - ordered=True, name='foobar') - pd.date_range('20130101', periods=4, name='foo', tz='US/Eastern') - pd.date_range('20130101', periods=25, freq='D') - pd.date_range('20130101', periods=104, name='foo', tz='US/Eastern') + pd.set_option("display.width", 80) + pd.Index(range(4), name="foo") + pd.Index(range(30), name="foo") + pd.Index(range(104), name="foo") + pd.CategoricalIndex(["a", "bb", "ccc", "dddd"], ordered=True, name="foobar") + pd.CategoricalIndex(["a", "bb", "ccc", "dddd"] * 10, ordered=True, name="foobar") + pd.CategoricalIndex(["a", "bb", "ccc", "dddd"] * 100, ordered=True, name="foobar") + pd.date_range("20130101", periods=4, name="foo", tz="US/Eastern") + pd.date_range("20130101", periods=25, freq="D") + pd.date_range("20130101", periods=104, name="foo", tz="US/Eastern") .. _whatsnew_0161.performance: diff --git a/doc/source/whatsnew/v0.16.2.rst b/doc/source/whatsnew/v0.16.2.rst index 2cb0cbec68eff..bb2aa166419b4 100644 --- a/doc/source/whatsnew/v0.16.2.rst +++ b/doc/source/whatsnew/v0.16.2.rst @@ -48,9 +48,10 @@ This can be rewritten as .. code-block:: python - (df.pipe(h) # noqa F821 - .pipe(g, arg1=1) # noqa F821 - .pipe(f, arg2=2, arg3=3) # noqa F821 + ( + df.pipe(h) # noqa F821 + .pipe(g, arg1=1) # noqa F821 + .pipe(f, arg2=2, arg3=3) # noqa F821 ) Now both the code and the logic flow from top to bottom. Keyword arguments are next to @@ -64,15 +65,16 @@ of ``(function, keyword)`` indicating where the DataFrame should flow. 
For examp import statsmodels.formula.api as sm - bb = pd.read_csv('data/baseball.csv', index_col='id') + bb = pd.read_csv("data/baseball.csv", index_col="id") # sm.ols takes (formula, data) - (bb.query('h > 0') - .assign(ln_h=lambda df: np.log(df.h)) - .pipe((sm.ols, 'data'), 'hr ~ ln_h + year + g + C(lg)') - .fit() - .summary() - ) + ( + bb.query("h > 0") + .assign(ln_h=lambda df: np.log(df.h)) + .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)") + .fit() + .summary() + ) The pipe method is inspired by unix pipes, which stream text through processes. More recently dplyr_ and magrittr_ have introduced the diff --git a/doc/source/whatsnew/v0.17.0.rst b/doc/source/whatsnew/v0.17.0.rst index 3e49bb30401a3..1658f877f5523 100644 --- a/doc/source/whatsnew/v0.17.0.rst +++ b/doc/source/whatsnew/v0.17.0.rst @@ -80,9 +80,13 @@ The new implementation allows for having a single-timezone across all rows, with .. ipython:: python - df = pd.DataFrame({'A': pd.date_range('20130101', periods=3), - 'B': pd.date_range('20130101', periods=3, tz='US/Eastern'), - 'C': pd.date_range('20130101', periods=3, tz='CET')}) + df = pd.DataFrame( + { + "A": pd.date_range("20130101", periods=3), + "B": pd.date_range("20130101", periods=3, tz="US/Eastern"), + "C": pd.date_range("20130101", periods=3, tz="CET"), + } + ) df df.dtypes @@ -95,8 +99,8 @@ This uses a new-dtype representation as well, that is very similar in look-and-f .. ipython:: python - df['B'].dtype - type(df['B'].dtype) + df["B"].dtype + type(df["B"].dtype) .. note:: @@ -119,8 +123,8 @@ This uses a new-dtype representation as well, that is very similar in look-and-f .. ipython:: python - pd.date_range('20130101', periods=3, tz='US/Eastern') - pd.date_range('20130101', periods=3, tz='US/Eastern').dtype + pd.date_range("20130101", periods=3, tz="US/Eastern") + pd.date_range("20130101", periods=3, tz="US/Eastern").dtype .. _whatsnew_0170.gil: @@ -138,9 +142,10 @@ as well as the ``.sum()`` operation. 
N = 1000000 ngroups = 10 - df = DataFrame({'key': np.random.randint(0, ngroups, size=N), - 'data': np.random.randn(N)}) - df.groupby('key')['data'].sum() + df = DataFrame( + {"key": np.random.randint(0, ngroups, size=N), "data": np.random.randn(N)} + ) + df.groupby("key")["data"].sum() Releasing of the GIL could benefit an application that uses threads for user interactions (e.g. QT_), or performing multi-threaded computations. A nice example of a library that can handle these types of computation-in-parallel is the dask_ library. @@ -189,16 +194,16 @@ We are now supporting a ``Series.dt.strftime`` method for datetime-likes to gene .. ipython:: python # DatetimeIndex - s = pd.Series(pd.date_range('20130101', periods=4)) + s = pd.Series(pd.date_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") .. ipython:: python # PeriodIndex - s = pd.Series(pd.period_range('20130101', periods=4)) + s = pd.Series(pd.period_range("20130101", periods=4)) s - s.dt.strftime('%Y/%m/%d') + s.dt.strftime("%Y/%m/%d") The string format is as the python standard library and details can be found `here `_ @@ -210,7 +215,7 @@ Series.dt.total_seconds .. ipython:: python # TimedeltaIndex - s = pd.Series(pd.timedelta_range('1 minutes', periods=4)) + s = pd.Series(pd.timedelta_range("1 minutes", periods=4)) s s.dt.total_seconds() @@ -225,18 +230,18 @@ A multiplied freq represents a span of corresponding length. The example below c .. ipython:: python - p = pd.Period('2015-08-01', freq='3D') + p = pd.Period("2015-08-01", freq="3D") p p + 1 p - 2 p.to_timestamp() - p.to_timestamp(how='E') + p.to_timestamp(how="E") You can use the multiplied freq in ``PeriodIndex`` and ``period_range``. .. ipython:: python - idx = pd.period_range('2015-08-01', periods=4, freq='2D') + idx = pd.period_range("2015-08-01", periods=4, freq="2D") idx idx + 1 @@ -249,14 +254,14 @@ Support for SAS XPORT files .. 
code-block:: python - df = pd.read_sas('sas_xport.xpt') + df = pd.read_sas("sas_xport.xpt") It is also possible to obtain an iterator and read an XPORT file incrementally. .. code-block:: python - for df in pd.read_sas('sas_xport.xpt', chunksize=10000): + for df in pd.read_sas("sas_xport.xpt", chunksize=10000): do_something(df) See the :ref:`docs ` for more details. @@ -270,7 +275,7 @@ Support for math functions in .eval() .. code-block:: python - df = pd.DataFrame({'a': np.random.randn(10)}) + df = pd.DataFrame({"a": np.random.randn(10)}) df.eval("b = sin(a)") The support math functions are ``sin``, ``cos``, ``exp``, ``log``, ``expm1``, ``log1p``, @@ -292,23 +297,26 @@ See the :ref:`documentation ` for more details. .. ipython:: python - df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=pd.MultiIndex.from_product( - [['foo', 'bar'], ['a', 'b']], names=['col1', 'col2']), - index=pd.MultiIndex.from_product([['j'], ['l', 'k']], - names=['i1', 'i2'])) + df = pd.DataFrame( + [[1, 2, 3, 4], [5, 6, 7, 8]], + columns=pd.MultiIndex.from_product( + [["foo", "bar"], ["a", "b"]], names=["col1", "col2"] + ), + index=pd.MultiIndex.from_product([["j"], ["l", "k"]], names=["i1", "i2"]), + ) df - df.to_excel('test.xlsx') + df.to_excel("test.xlsx") - df = pd.read_excel('test.xlsx', header=[0, 1], index_col=[0, 1]) + df = pd.read_excel("test.xlsx", header=[0, 1], index_col=[0, 1]) df .. ipython:: python :suppress: import os - os.remove('test.xlsx') + + os.remove("test.xlsx") Previously, it was necessary to specify the ``has_index_names`` argument in ``read_excel``, if the serialized data had index names. For version 0.17.0 the output format of ``to_excel`` @@ -354,14 +362,14 @@ Some East Asian countries use Unicode characters its width is corresponding to 2 .. ipython:: python - df = pd.DataFrame({u'国籍': ['UK', u'日本'], u'名前': ['Alice', u'しのぶ']}) + df = pd.DataFrame({u"国籍": ["UK", u"日本"], u"名前": ["Alice", u"しのぶ"]}) df; .. image:: ../_static/option_unicode01.png .. 
ipython:: python - pd.set_option('display.unicode.east_asian_width', True) + pd.set_option("display.unicode.east_asian_width", True) df; .. image:: ../_static/option_unicode02.png @@ -371,7 +379,7 @@ For further details, see :ref:`here ` .. ipython:: python :suppress: - pd.set_option('display.unicode.east_asian_width', False) + pd.set_option("display.unicode.east_asian_width", False) .. _whatsnew_0170.enhancements.other: @@ -391,9 +399,9 @@ Other enhancements .. ipython:: python - df1 = pd.DataFrame({'col1':[0,1], 'col_left':['a','b']}) - df2 = pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]}) - pd.merge(df1, df2, on='col1', how='outer', indicator=True) + df1 = pd.DataFrame({"col1": [0, 1], "col_left": ["a", "b"]}) + df2 = pd.DataFrame({"col1": [1, 2, 2], "col_right": [2, 2, 2]}) + pd.merge(df1, df2, on="col1", how="outer", indicator=True) For more, see the :ref:`updated docs ` @@ -407,7 +415,7 @@ Other enhancements .. ipython:: python - foo = pd.Series([1, 2], name='foo') + foo = pd.Series([1, 2], name="foo") bar = pd.Series([1, 2]) baz = pd.Series([4, 5]) @@ -434,46 +442,43 @@ Other enhancements .. ipython:: python ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13]) - ser.interpolate(limit=1, limit_direction='both') + ser.interpolate(limit=1, limit_direction="both") - Added a ``DataFrame.round`` method to round the values to a variable number of decimal places (:issue:`10568`). .. ipython:: python - df = pd.DataFrame(np.random.random([3, 3]), - columns=['A', 'B', 'C'], - index=['first', 'second', 'third']) + df = pd.DataFrame( + np.random.random([3, 3]), + columns=["A", "B", "C"], + index=["first", "second", "third"], + ) df df.round(2) - df.round({'A': 0, 'C': 2}) + df.round({"A": 0, "C": 2}) - ``drop_duplicates`` and ``duplicated`` now accept a ``keep`` keyword to target first, last, and all duplicates. The ``take_last`` keyword is deprecated, see :ref:`here ` (:issue:`6511`, :issue:`8505`) .. 
ipython:: python - s = pd.Series(['A', 'B', 'C', 'A', 'B', 'D']) + s = pd.Series(["A", "B", "C", "A", "B", "D"]) s.drop_duplicates() - s.drop_duplicates(keep='last') + s.drop_duplicates(keep="last") s.drop_duplicates(keep=False) - Reindex now has a ``tolerance`` argument that allows for finer control of :ref:`basics.limits_on_reindex_fill` (:issue:`10411`): .. ipython:: python - df = pd.DataFrame({'x': range(5), - 't': pd.date_range('2000-01-01', periods=5)}) - df.reindex([0.1, 1.9, 3.5], - method='nearest', - tolerance=0.2) + df = pd.DataFrame({"x": range(5), "t": pd.date_range("2000-01-01", periods=5)}) + df.reindex([0.1, 1.9, 3.5], method="nearest", tolerance=0.2) When used on a ``DatetimeIndex``, ``TimedeltaIndex`` or ``PeriodIndex``, ``tolerance`` will coerced into a ``Timedelta`` if possible. This allows you to specify tolerance with a string: .. ipython:: python - df = df.set_index('t') - df.reindex(pd.to_datetime(['1999-12-31']), - method='nearest', - tolerance='1 day') + df = df.set_index("t") + df.reindex(pd.to_datetime(["1999-12-31"]), method="nearest", tolerance="1 day") ``tolerance`` is also exposed by the lower level ``Index.get_indexer`` and ``Index.get_loc`` methods. @@ -627,13 +632,13 @@ Of course you can coerce this as well. .. ipython:: python - pd.to_datetime(['2009-07-31', 'asd'], errors='coerce') + pd.to_datetime(["2009-07-31", "asd"], errors="coerce") To keep the previous behavior, you can use ``errors='ignore'``: .. ipython:: python - pd.to_datetime(['2009-07-31', 'asd'], errors='ignore') + pd.to_datetime(["2009-07-31", "asd"], errors="ignore") Furthermore, ``pd.to_timedelta`` has gained a similar API, of ``errors='raise'|'ignore'|'coerce'``, and the ``coerce`` keyword has been deprecated in favor of ``errors='coerce'``. @@ -667,9 +672,9 @@ New behavior: .. 
ipython:: python - pd.Timestamp('2012Q2') - pd.Timestamp('2014') - pd.DatetimeIndex(['2012Q2', '2014']) + pd.Timestamp("2012Q2") + pd.Timestamp("2014") + pd.DatetimeIndex(["2012Q2", "2014"]) .. note:: @@ -678,6 +683,7 @@ New behavior: .. ipython:: python import pandas.tseries.offsets as offsets + pd.Timestamp.now() pd.Timestamp.now() + offsets.DateOffset(years=1) @@ -780,8 +786,7 @@ Previous behavior: .. ipython:: python - df_with_missing = pd.DataFrame({'col1': [0, np.nan, 2], - 'col2': [1, np.nan, np.nan]}) + df_with_missing = pd.DataFrame({"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]}) df_with_missing @@ -806,18 +811,16 @@ New behavior: .. ipython:: python - df_with_missing.to_hdf('file.h5', - 'df_with_missing', - format='table', - mode='w') + df_with_missing.to_hdf("file.h5", "df_with_missing", format="table", mode="w") - pd.read_hdf('file.h5', 'df_with_missing') + pd.read_hdf("file.h5", "df_with_missing") .. ipython:: python :suppress: import os - os.remove('file.h5') + + os.remove("file.h5") See the :ref:`docs ` for more details. @@ -848,8 +851,8 @@ regular formatting as well as scientific notation, similar to how numpy's ``prec .. ipython:: python - pd.set_option('display.precision', 2) - pd.DataFrame({'x': [123.456789]}) + pd.set_option("display.precision", 2) + pd.DataFrame({"x": [123.456789]}) To preserve output behavior with prior versions the default value of ``display.precision`` has been reduced to ``6`` from ``7``. @@ -857,7 +860,7 @@ from ``7``. .. ipython:: python :suppress: - pd.set_option('display.precision', 6) + pd.set_option("display.precision", 6) .. _whatsnew_0170.api_breaking.categorical_unique: @@ -871,14 +874,11 @@ Changes to ``Categorical.unique`` .. 
ipython:: python - cat = pd.Categorical(['C', 'A', 'B', 'C'], - categories=['A', 'B', 'C'], - ordered=True) + cat = pd.Categorical(["C", "A", "B", "C"], categories=["A", "B", "C"], ordered=True) cat cat.unique() - cat = pd.Categorical(['C', 'A', 'B', 'C'], - categories=['A', 'B', 'C']) + cat = pd.Categorical(["C", "A", "B", "C"], categories=["A", "B", "C"]) cat cat.unique() @@ -980,9 +980,11 @@ Removal of prior version deprecations/changes .. ipython:: python np.random.seed(1234) - df = pd.DataFrame(np.random.randn(5, 2), - columns=list('AB'), - index=pd.date_range('2013-01-01', periods=5)) + df = pd.DataFrame( + np.random.randn(5, 2), + columns=list("AB"), + index=pd.date_range("2013-01-01", periods=5), + ) df Previously @@ -1005,7 +1007,7 @@ Removal of prior version deprecations/changes .. ipython:: python - df.add(df.A, axis='index') + df.add(df.A, axis="index") - Remove ``table`` keyword in ``HDFStore.put/append``, in favor of using ``format=`` (:issue:`4645`) diff --git a/doc/source/whatsnew/v0.17.1.rst b/doc/source/whatsnew/v0.17.1.rst index 5d15a01aee5a0..6b0a28ec47568 100644 --- a/doc/source/whatsnew/v0.17.1.rst +++ b/doc/source/whatsnew/v0.17.1.rst @@ -52,8 +52,8 @@ Here's a quick example: .. ipython:: python np.random.seed(123) - df = pd.DataFrame(np.random.randn(10, 5), columns=list('abcde')) - html = df.style.background_gradient(cmap='viridis', low=.5) + df = pd.DataFrame(np.random.randn(10, 5), columns=list("abcde")) + html = df.style.background_gradient(cmap="viridis", low=0.5) We can render the HTML to get the following table. @@ -80,14 +80,14 @@ Enhancements .. 
ipython:: python - df = pd.DataFrame({'A': ['foo'] * 1000}) # noqa: F821 - df['B'] = df['A'].astype('category') + df = pd.DataFrame({"A": ["foo"] * 1000}) # noqa: F821 + df["B"] = df["A"].astype("category") # shows the '+' as we have object dtypes df.info() # we have an accurate memory assessment (but can be expensive to compute this) - df.info(memory_usage='deep') + df.info(memory_usage="deep") - ``Index`` now has a ``fillna`` method (:issue:`10089`) @@ -99,11 +99,11 @@ Enhancements .. ipython:: python - s = pd.Series(list('aabb')).astype('category') + s = pd.Series(list("aabb")).astype("category") s s.str.contains("a") - date = pd.Series(pd.date_range('1/1/2015', periods=5)).astype('category') + date = pd.Series(pd.date_range("1/1/2015", periods=5)).astype("category") date date.dt.day diff --git a/doc/source/whatsnew/v0.18.1.rst b/doc/source/whatsnew/v0.18.1.rst index 13ed6bc38163b..3db00f686d62c 100644 --- a/doc/source/whatsnew/v0.18.1.rst +++ b/doc/source/whatsnew/v0.18.1.rst @@ -42,6 +42,7 @@ see :ref:`Custom Business Hour ` (:issue:`11514`) from pandas.tseries.offsets import CustomBusinessHour from pandas.tseries.holiday import USFederalHolidayCalendar + bhour_us = CustomBusinessHour(calendar=USFederalHolidayCalendar()) Friday before MLK Day @@ -49,6 +50,7 @@ Friday before MLK Day .. ipython:: python import datetime + dt = datetime.datetime(2014, 1, 17, 15) dt + bhour_us @@ -72,41 +74,42 @@ Previously you would have to do this to get a rolling window mean per-group: .. ipython:: python - df = pd.DataFrame({'A': [1] * 20 + [2] * 12 + [3] * 8, - 'B': np.arange(40)}) + df = pd.DataFrame({"A": [1] * 20 + [2] * 12 + [3] * 8, "B": np.arange(40)}) df .. ipython:: python - df.groupby('A').apply(lambda x: x.rolling(4).B.mean()) + df.groupby("A").apply(lambda x: x.rolling(4).B.mean()) Now you can do: .. ipython:: python - df.groupby('A').rolling(4).B.mean() + df.groupby("A").rolling(4).B.mean() For ``.resample(..)`` type of operations, previously you would have to: .. 
ipython:: python - df = pd.DataFrame({'date': pd.date_range(start='2016-01-01', - periods=4, - freq='W'), - 'group': [1, 1, 2, 2], - 'val': [5, 6, 7, 8]}).set_index('date') + df = pd.DataFrame( + { + "date": pd.date_range(start="2016-01-01", periods=4, freq="W"), + "group": [1, 1, 2, 2], + "val": [5, 6, 7, 8], + } + ).set_index("date") df .. ipython:: python - df.groupby('group').apply(lambda x: x.resample('1D').ffill()) + df.groupby("group").apply(lambda x: x.resample("1D").ffill()) Now you can do: .. ipython:: python - df.groupby('group').resample('1D').ffill() + df.groupby("group").resample("1D").ffill() .. _whatsnew_0181.enhancements.method_chain: @@ -129,9 +132,7 @@ arguments. .. ipython:: python - df = pd.DataFrame({'A': [1, 2, 3], - 'B': [4, 5, 6], - 'C': [7, 8, 9]}) + df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) df.where(lambda x: x > 4, lambda x: x + 10) Methods ``.loc[]``, ``.iloc[]``, ``.ix[]`` @@ -146,7 +147,7 @@ can return a valid boolean indexer or anything which is valid for these indexer' df.loc[lambda x: x.A >= 2, lambda x: x.sum() > 10] # callable returns list of labels - df.loc[lambda x: [1, 2], lambda x: ['A', 'B']] + df.loc[lambda x: [1, 2], lambda x: ["A", "B"]] Indexing with``[]`` """"""""""""""""""" @@ -157,17 +158,15 @@ class and index type. .. ipython:: python - df[lambda x: 'A'] + df[lambda x: "A"] Using these methods / indexers, you can chain data selection operations without using temporary variable. .. ipython:: python - bb = pd.read_csv('data/baseball.csv', index_col='id') - (bb.groupby(['year', 'team']) - .sum() - .loc[lambda df: df.r > 100]) + bb = pd.read_csv("data/baseball.csv", index_col="id") + (bb.groupby(["year", "team"]).sum().loc[lambda df: df.r > 100]) .. 
_whatsnew_0181.partial_string_indexing: @@ -180,13 +179,13 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI dft2 = pd.DataFrame( np.random.randn(20, 1), - columns=['A'], - index=pd.MultiIndex.from_product([pd.date_range('20130101', - periods=10, - freq='12H'), - ['a', 'b']])) + columns=["A"], + index=pd.MultiIndex.from_product( + [pd.date_range("20130101", periods=10, freq="12H"), ["a", "b"]] + ), + ) dft2 - dft2.loc['2013-01-05'] + dft2.loc["2013-01-05"] On other levels @@ -195,7 +194,7 @@ On other levels idx = pd.IndexSlice dft2 = dft2.swaplevel(0, 1).sort_index() dft2 - dft2.loc[idx[:, '2013-01-05'], :] + dft2.loc[idx[:, "2013-01-05"], :] .. _whatsnew_0181.enhancements.assembling: @@ -206,10 +205,9 @@ Assembling datetimes .. ipython:: python - df = pd.DataFrame({'year': [2015, 2016], - 'month': [2, 3], - 'day': [4, 5], - 'hour': [2, 3]}) + df = pd.DataFrame( + {"year": [2015, 2016], "month": [2, 3], "day": [4, 5], "hour": [2, 3]} + ) df Assembling using the passed frame. @@ -222,7 +220,7 @@ You can pass only the columns that you need to assemble. .. ipython:: python - pd.to_datetime(df[['year', 'month', 'day']]) + pd.to_datetime(df[["year", "month", "day"]]) .. _whatsnew_0181.other: @@ -243,7 +241,7 @@ Other enhancements .. ipython:: python - idx = pd.Index([1., 2., 3., 4.], dtype='float') + idx = pd.Index([1.0, 2.0, 3.0, 4.0], dtype="float") # default, allow_fill=True, fill_value=None idx.take([2, -1]) @@ -253,8 +251,8 @@ Other enhancements .. ipython:: python - idx = pd.Index(['a|b', 'a|c', 'b|c']) - idx.str.get_dummies('|') + idx = pd.Index(["a|b", "a|c", "b|c"]) + idx.str.get_dummies("|") - ``pd.crosstab()`` has gained a ``normalize`` argument for normalizing frequency tables (:issue:`12569`). Examples in the updated docs :ref:`here `. @@ -313,8 +311,7 @@ The index in ``.groupby(..).nth()`` output is now more consistent when the ``as_ .. 
ipython:: python - df = pd.DataFrame({'A': ['a', 'b', 'a'], - 'B': [1, 2, 3]}) + df = pd.DataFrame({"A": ["a", "b", "a"], "B": [1, 2, 3]}) df Previous behavior: @@ -337,16 +334,16 @@ New behavior: .. ipython:: python - df.groupby('A', as_index=True)['B'].nth(0) - df.groupby('A', as_index=False)['B'].nth(0) + df.groupby("A", as_index=True)["B"].nth(0) + df.groupby("A", as_index=False)["B"].nth(0) Furthermore, previously, a ``.groupby`` would always sort, regardless if ``sort=False`` was passed with ``.nth()``. .. ipython:: python np.random.seed(1234) - df = pd.DataFrame(np.random.randn(100, 2), columns=['a', 'b']) - df['c'] = np.random.randint(0, 4, 100) + df = pd.DataFrame(np.random.randn(100, 2), columns=["a", "b"]) + df["c"] = np.random.randint(0, 4, 100) Previous behavior: @@ -374,8 +371,8 @@ New behavior: .. ipython:: python - df.groupby('c', sort=True).nth(1) - df.groupby('c', sort=False).nth(1) + df.groupby("c", sort=True).nth(1) + df.groupby("c", sort=False).nth(1) .. _whatsnew_0181.numpy_compatibility: @@ -421,8 +418,9 @@ Using ``apply`` on resampling groupby operations (using a ``pd.TimeGrouper``) no .. ipython:: python - df = pd.DataFrame({'date': pd.to_datetime(['10/10/2000', '11/10/2000']), - 'value': [10, 13]}) + df = pd.DataFrame( + {"date": pd.to_datetime(["10/10/2000", "11/10/2000"]), "value": [10, 13]} + ) df Previous behavior: diff --git a/doc/source/whatsnew/v0.19.0.rst b/doc/source/whatsnew/v0.19.0.rst index 5732367a69af2..08ccc1565125f 100644 --- a/doc/source/whatsnew/v0.19.0.rst +++ b/doc/source/whatsnew/v0.19.0.rst @@ -49,10 +49,8 @@ except that we match on nearest key rather than equal keys. .. 
ipython:: python - left = pd.DataFrame({'a': [1, 5, 10], - 'left_val': ['a', 'b', 'c']}) - right = pd.DataFrame({'a': [1, 2, 3, 6, 7], - 'right_val': [1, 2, 3, 6, 7]}) + left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) + right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) left right @@ -62,13 +60,13 @@ recent value otherwise. .. ipython:: python - pd.merge_asof(left, right, on='a') + pd.merge_asof(left, right, on="a") We can also match rows ONLY with prior data, and not an exact match. .. ipython:: python - pd.merge_asof(left, right, on='a', allow_exact_matches=False) + pd.merge_asof(left, right, on="a", allow_exact_matches=False) In a typical time-series example, we have ``trades`` and ``quotes`` and we want to ``asof-join`` them. @@ -76,36 +74,44 @@ This also illustrates using the ``by`` parameter to group data before merging. .. ipython:: python - trades = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.038', - '20160525 13:30:00.048', - '20160525 13:30:00.048', - '20160525 13:30:00.048']), - 'ticker': ['MSFT', 'MSFT', - 'GOOG', 'GOOG', 'AAPL'], - 'price': [51.95, 51.95, - 720.77, 720.92, 98.00], - 'quantity': [75, 155, - 100, 100, 100]}, - columns=['time', 'ticker', 'price', 'quantity']) - - quotes = pd.DataFrame({ - 'time': pd.to_datetime(['20160525 13:30:00.023', - '20160525 13:30:00.023', - '20160525 13:30:00.030', - '20160525 13:30:00.041', - '20160525 13:30:00.048', - '20160525 13:30:00.049', - '20160525 13:30:00.072', - '20160525 13:30:00.075']), - 'ticker': ['GOOG', 'MSFT', 'MSFT', 'MSFT', - 'GOOG', 'AAPL', 'GOOG', 'MSFT'], - 'bid': [720.50, 51.95, 51.97, 51.99, - 720.50, 97.99, 720.50, 52.01], - 'ask': [720.93, 51.96, 51.98, 52.00, - 720.93, 98.01, 720.88, 52.03]}, - columns=['time', 'ticker', 'bid', 'ask']) + trades = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.038", + "20160525 13:30:00.048", + "20160525 13:30:00.048", + 
"20160525 13:30:00.048", + ] + ), + "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], + "price": [51.95, 51.95, 720.77, 720.92, 98.00], + "quantity": [75, 155, 100, 100, 100], + }, + columns=["time", "ticker", "price", "quantity"], + ) + + quotes = pd.DataFrame( + { + "time": pd.to_datetime( + [ + "20160525 13:30:00.023", + "20160525 13:30:00.023", + "20160525 13:30:00.030", + "20160525 13:30:00.041", + "20160525 13:30:00.048", + "20160525 13:30:00.049", + "20160525 13:30:00.072", + "20160525 13:30:00.075", + ] + ), + "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], + "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], + "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], + }, + columns=["time", "ticker", "bid", "ask"], + ) .. ipython:: python @@ -118,9 +124,7 @@ that forward filling happens automatically taking the most recent non-NaN value. .. ipython:: python - pd.merge_asof(trades, quotes, - on='time', - by='ticker') + pd.merge_asof(trades, quotes, on="time", by="ticker") This returns a merged DataFrame with the entries in the same order as the original left passed DataFrame (``trades`` in this case), with the fields of the ``quotes`` merged. @@ -135,9 +139,10 @@ See the full documentation :ref:`here `. .. ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', - periods=5, freq='s')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.date_range("20130101 09:00:00", periods=5, freq="s"), + ) dft This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. @@ -151,20 +156,26 @@ Specifying an offset allows a more intuitive specification of the rolling freque .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Using a non-regular, but still monotonic index, rolling with an integer window does not impart any special calculation. .. 
ipython:: python - dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) + dft = pd.DataFrame( + {"B": [0, 1, 2, np.nan, 4]}, + index=pd.Index( + [ + pd.Timestamp("20130101 09:00:00"), + pd.Timestamp("20130101 09:00:02"), + pd.Timestamp("20130101 09:00:03"), + pd.Timestamp("20130101 09:00:05"), + pd.Timestamp("20130101 09:00:06"), + ], + name="foo", + ), + ) dft dft.rolling(2).sum() @@ -173,7 +184,7 @@ Using the time-specification generates variable windows for this sparse data. .. ipython:: python - dft.rolling('2s').sum() + dft.rolling("2s").sum() Furthermore, we now allow an optional ``on`` parameter to specify a column (rather than the default of the index) in a DataFrame. @@ -182,7 +193,7 @@ default of the index) in a DataFrame. dft = dft.reset_index() dft - dft.rolling('2s', on='foo').sum() + dft.rolling("2s", on="foo").sum() .. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: @@ -199,8 +210,8 @@ they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :is .. ipython:: python - data = '0,1,2\n3,4,5' - names = ['a', 'b', 'a'] + data = "0,1,2\n3,4,5" + names = ["a", "b", "a"] **Previous behavior**: @@ -235,17 +246,22 @@ converting to ``Categorical`` after parsing. See the io :ref:`docs here ` (:issue:`10008`, :issue:`13156`) @@ -415,7 +431,7 @@ The ``pd.get_dummies`` function now returns dummy-encoded columns as small integ .. ipython:: python - pd.get_dummies(['a', 'b', 'a', 'c']).dtypes + pd.get_dummies(["a", "b", "a", "c"]).dtypes .. _whatsnew_0190.enhancements.to_numeric_downcast: @@ -427,9 +443,9 @@ Downcast values to smallest possible dtype in ``to_numeric`` .. 
ipython:: python - s = ['1', 2, 3] - pd.to_numeric(s, downcast='unsigned') - pd.to_numeric(s, downcast='integer') + s = ["1", 2, 3] + pd.to_numeric(s, downcast="unsigned") + pd.to_numeric(s, downcast="integer") .. _whatsnew_0190.dev_api: @@ -447,7 +463,8 @@ The following are now part of this API: import pprint from pandas.api import types - funcs = [f for f in dir(types) if not f.startswith('_')] + + funcs = [f for f in dir(types) if not f.startswith("_")] pprint.pprint(funcs) .. note:: @@ -472,16 +489,16 @@ Other enhancements .. ipython:: python - df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), - 'a': np.arange(5)}, - index=pd.MultiIndex.from_arrays([[1, 2, 3, 4, 5], - pd.date_range('2015-01-01', - freq='W', - periods=5) - ], names=['v', 'd'])) + df = pd.DataFrame( + {"date": pd.date_range("2015-01-01", freq="W", periods=5), "a": np.arange(5)}, + index=pd.MultiIndex.from_arrays( + [[1, 2, 3, 4, 5], pd.date_range("2015-01-01", freq="W", periods=5)], + names=["v", "d"], + ), + ) df - df.resample('M', on='date').sum() - df.resample('M', level='d').sum() + df.resample("M", on="date").sum() + df.resample("M", level="d").sum() - The ``.get_credentials()`` method of ``GbqConnector`` can now first try to fetch `the application default credentials `__. See the docs for more details (:issue:`13577`). - The ``.tz_localize()`` method of ``DatetimeIndex`` and ``Timestamp`` has gained the ``errors`` keyword, so you can potentially coerce nonexistent timestamps to ``NaT``. The default behavior remains to raising a ``NonExistentTimeError`` (:issue:`13057`) @@ -507,10 +524,9 @@ Other enhancements .. 
ipython:: python - df = pd.DataFrame({'A': [2, 7], 'B': [3, 5], 'C': [4, 8]}, - index=['row1', 'row2']) + df = pd.DataFrame({"A": [2, 7], "B": [3, 5], "C": [4, 8]}, index=["row1", "row2"]) df - df.sort_values(by='row2', axis=1) + df.sort_values(by="row2", axis=1) - Added documentation to :ref:`I/O` regarding the perils of reading in columns with mixed dtypes and how to handle it (:issue:`13746`) - :meth:`~DataFrame.to_html` now has a ``border`` argument to control the value in the opening ```` tag. The default is the value of the ``html.border`` option, which defaults to 1. This also affects the notebook HTML repr, but since Jupyter's CSS includes a border-width attribute, the visual effect is the same. (:issue:`11563`). @@ -583,12 +599,12 @@ Arithmetic operators align both ``index`` (no changes). .. ipython:: python - s1 = pd.Series([1, 2, 3], index=list('ABC')) - s2 = pd.Series([2, 2, 2], index=list('ABD')) + s1 = pd.Series([1, 2, 3], index=list("ABC")) + s2 = pd.Series([2, 2, 2], index=list("ABD")) s1 + s2 - df1 = pd.DataFrame([1, 2, 3], index=list('ABC')) - df2 = pd.DataFrame([2, 2, 2], index=list('ABD')) + df1 = pd.DataFrame([1, 2, 3], index=list("ABC")) + df2 = pd.DataFrame([2, 2, 2], index=list("ABD")) df1 + df2 Comparison operators @@ -661,8 +677,8 @@ Logical operators align both ``.index`` of left and right hand side. .. ipython:: python - s1 = pd.Series([True, False, True], index=list('ABC')) - s2 = pd.Series([True, True, True], index=list('ABD')) + s1 = pd.Series([True, False, True], index=list("ABC")) + s2 = pd.Series([True, True, True], index=list("ABD")) s1 & s2 .. note:: @@ -679,8 +695,8 @@ Logical operators align both ``.index`` of left and right hand side. .. 
ipython:: python - df1 = pd.DataFrame([True, False, True], index=list('ABC')) - df2 = pd.DataFrame([True, True, True], index=list('ABD')) + df1 = pd.DataFrame([True, False, True], index=list("ABC")) + df2 = pd.DataFrame([True, True, True], index=list("ABD")) df1 & df2 Flexible comparison methods @@ -691,8 +707,8 @@ which has the different ``index``. .. ipython:: python - s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c']) - s2 = pd.Series([2, 2, 2], index=['b', 'c', 'd']) + s1 = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s2 = pd.Series([2, 2, 2], index=["b", "c", "d"]) s1.eq(s2) s1.ge(s2) @@ -749,7 +765,7 @@ This will now convert integers/floats with the default unit of ``ns``. .. ipython:: python - pd.to_datetime([1, 'foo'], errors='coerce') + pd.to_datetime([1, "foo"], errors="coerce") Bug fixes related to ``.to_datetime()``: @@ -768,9 +784,9 @@ Merging will now preserve the dtype of the join keys (:issue:`8596`) .. ipython:: python - df1 = pd.DataFrame({'key': [1], 'v1': [10]}) + df1 = pd.DataFrame({"key": [1], "v1": [10]}) df1 - df2 = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]}) + df2 = pd.DataFrame({"key": [1, 2], "v1": [20, 30]}) df2 **Previous behavior**: @@ -796,16 +812,16 @@ We are able to preserve the join keys .. ipython:: python - pd.merge(df1, df2, how='outer') - pd.merge(df1, df2, how='outer').dtypes + pd.merge(df1, df2, how="outer") + pd.merge(df1, df2, how="outer").dtypes Of course if you have missing values that are introduced, then the resulting dtype will be upcast, which is unchanged from previous. .. ipython:: python - pd.merge(df1, df2, how='outer', on='key') - pd.merge(df1, df2, how='outer', on='key').dtypes + pd.merge(df1, df2, how="outer", on="key") + pd.merge(df1, df2, how="outer", on="key").dtypes .. _whatsnew_0190.api.describe: @@ -889,7 +905,7 @@ As a consequence of this change, ``PeriodIndex`` no longer has an integer dtype: .. 
ipython:: python - pi = pd.PeriodIndex(['2016-08-01'], freq='D') + pi = pd.PeriodIndex(["2016-08-01"], freq="D") pi pd.api.types.is_integer_dtype(pi) pd.api.types.is_period_dtype(pi) @@ -916,7 +932,7 @@ These result in ``pd.NaT`` without providing ``freq`` option. .. ipython:: python - pd.Period('NaT') + pd.Period("NaT") pd.Period(None) @@ -955,7 +971,7 @@ of integers (:issue:`13988`). .. ipython:: python - pi = pd.PeriodIndex(['2011-01', '2011-02'], freq='M') + pi = pd.PeriodIndex(["2011-01", "2011-02"], freq="M") pi.values @@ -985,7 +1001,7 @@ Previous behavior: .. ipython:: python - pd.Index(['a', 'b']) + pd.Index(['a', 'c']) + pd.Index(["a", "b"]) + pd.Index(["a", "c"]) Note that numeric Index objects already performed element-wise operations. For example, the behavior of adding two integer Indexes is unchanged. @@ -1011,8 +1027,10 @@ DatetimeIndex objects resulting in a TimedeltaIndex: .. ipython:: python - (pd.DatetimeIndex(['2016-01-01', '2016-01-02']) - - pd.DatetimeIndex(['2016-01-02', '2016-01-03'])) + ( + pd.DatetimeIndex(["2016-01-01", "2016-01-02"]) + - pd.DatetimeIndex(["2016-01-02", "2016-01-03"]) + ) .. _whatsnew_0190.api.difference: @@ -1073,8 +1091,7 @@ Previously, most ``Index`` classes returned ``np.ndarray``, and ``DatetimeIndex` .. ipython:: python pd.Index([1, 2, 3]).unique() - pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], - tz='Asia/Tokyo').unique() + pd.DatetimeIndex(["2011-01-01", "2011-01-02", "2011-01-03"], tz="Asia/Tokyo").unique() .. _whatsnew_0190.api.multiindex: @@ -1086,8 +1103,8 @@ in ``MultiIndex`` levels (:issue:`13743`, :issue:`13854`). .. ipython:: python - cat = pd.Categorical(['a', 'b'], categories=list("bac")) - lvl1 = ['foo', 'bar'] + cat = pd.Categorical(["a", "b"], categories=list("bac")) + lvl1 = ["foo", "bar"] midx = pd.MultiIndex.from_arrays([cat, lvl1]) midx @@ -1113,9 +1130,9 @@ As a consequence, ``groupby`` and ``set_index`` also preserve categorical dtypes .. 
ipython:: python - df = pd.DataFrame({'A': [0, 1], 'B': [10, 11], 'C': cat}) - df_grouped = df.groupby(by=['A', 'C']).first() - df_set_idx = df.set_index(['A', 'C']) + df = pd.DataFrame({"A": [0, 1], "B": [10, 11], "C": cat}) + df_grouped = df.groupby(by=["A", "C"]).first() + df_set_idx = df.set_index(["A", "C"]) **Previous behavior**: @@ -1163,7 +1180,7 @@ the result of calling :func:`read_csv` without the ``chunksize=`` argument .. ipython:: python - data = 'A,B\n0,1\n2,3\n4,5\n6,7' + data = "A,B\n0,1\n2,3\n4,5\n6,7" **Previous behavior**: @@ -1248,7 +1265,7 @@ Operators now preserve dtypes .. code-block:: python - s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) + s = pd.SparseSeries([1.0, 0.0, 2.0, 0.0], fill_value=0) s s.astype(np.int64) diff --git a/doc/source/whatsnew/v0.19.1.rst b/doc/source/whatsnew/v0.19.1.rst index f8b60f457b33f..6ff3fb6900a99 100644 --- a/doc/source/whatsnew/v0.19.1.rst +++ b/doc/source/whatsnew/v0.19.1.rst @@ -8,7 +8,7 @@ Version 0.19.1 (November 3, 2016) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release from 0.19.0 and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.19.2.rst b/doc/source/whatsnew/v0.19.2.rst index 924c95f21ceff..bba89d78be869 100644 --- a/doc/source/whatsnew/v0.19.2.rst +++ b/doc/source/whatsnew/v0.19.2.rst @@ -8,7 +8,7 @@ Version 0.19.2 (December 24, 2016) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.19.x series and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.20.2.rst b/doc/source/whatsnew/v0.20.2.rst index 7f84c6b3f17bd..430a39d2d2e97 100644 --- a/doc/source/whatsnew/v0.20.2.rst +++ b/doc/source/whatsnew/v0.20.2.rst @@ -8,7 +8,7 @@ Version 0.20.2 (June 4, 2017) .. 
ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.20.3.rst b/doc/source/whatsnew/v0.20.3.rst index 888d0048ca9f3..ff28f6830783e 100644 --- a/doc/source/whatsnew/v0.20.3.rst +++ b/doc/source/whatsnew/v0.20.3.rst @@ -8,7 +8,7 @@ Version 0.20.3 (July 7, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes diff --git a/doc/source/whatsnew/v0.21.1.rst b/doc/source/whatsnew/v0.21.1.rst index 2d72f6470fc81..090a988d6406a 100644 --- a/doc/source/whatsnew/v0.21.1.rst +++ b/doc/source/whatsnew/v0.21.1.rst @@ -8,7 +8,7 @@ Version 0.21.1 (December 12, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a minor bug-fix release in the 0.21.x series and includes some small regression fixes, diff --git a/doc/source/whatsnew/v0.22.0.rst b/doc/source/whatsnew/v0.22.0.rst index 92b514ce59660..ec9769c22e76b 100644 --- a/doc/source/whatsnew/v0.22.0.rst +++ b/doc/source/whatsnew/v0.22.0.rst @@ -8,7 +8,7 @@ Version 0.22.0 (December 29, 2017) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 This is a major release from 0.21.1 and includes a single, API-breaking change. @@ -119,7 +119,7 @@ instead of ``NaN``. .. ipython:: python - grouper = pd.Categorical(['a', 'a'], categories=['a', 'b']) + grouper = pd.Categorical(["a", "a"], categories=["a", "b"]) pd.Series([1, 2]).groupby(grouper).sum() To restore the 0.21 behavior of returning ``NaN`` for unobserved groups, @@ -159,15 +159,14 @@ sum and ``1`` for product. .. 
ipython:: python - s = pd.Series([1, 1, np.nan, np.nan], - index=pd.date_range('2017', periods=4)) - s.resample('2d').sum() + s = pd.Series([1, 1, np.nan, np.nan], index=pd.date_range("2017", periods=4)) + s.resample("2d").sum() To restore the 0.21 behavior of returning ``NaN``, use ``min_count>=1``. .. ipython:: python - s.resample('2d').sum(min_count=1) + s.resample("2d").sum(min_count=1) In particular, upsampling and taking the sum or product is affected, as upsampling introduces missing values even if the original series was @@ -190,7 +189,7 @@ entirely valid. .. ipython:: python - idx = pd.DatetimeIndex(['2017-01-01', '2017-01-02']) + idx = pd.DatetimeIndex(["2017-01-01", "2017-01-02"]) pd.Series([1, 2], index=idx).resample("12H").sum() Once again, the ``min_count`` keyword is available to restore the 0.21 behavior. diff --git a/doc/source/whatsnew/v0.5.0.rst b/doc/source/whatsnew/v0.5.0.rst index 7ccb141260f18..7447a10fa1d6b 100644 --- a/doc/source/whatsnew/v0.5.0.rst +++ b/doc/source/whatsnew/v0.5.0.rst @@ -9,7 +9,7 @@ Version 0.5.0 (October 24, 2011) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 New features diff --git a/doc/source/whatsnew/v0.6.0.rst b/doc/source/whatsnew/v0.6.0.rst index 1cb9dcbe159aa..8ff688eaa91e7 100644 --- a/doc/source/whatsnew/v0.6.0.rst +++ b/doc/source/whatsnew/v0.6.0.rst @@ -8,7 +8,7 @@ Version 0.6.0 (November 25, 2011) .. ipython:: python :suppress: - from pandas import * # noqa F401, F403 + from pandas import * # noqa F401, F403 New features diff --git a/doc/source/whatsnew/v0.7.3.rst b/doc/source/whatsnew/v0.7.3.rst index 5ed48c0d8d6d9..4ca31baf560bb 100644 --- a/doc/source/whatsnew/v0.7.3.rst +++ b/doc/source/whatsnew/v0.7.3.rst @@ -23,7 +23,8 @@ New features .. 
code-block:: python from pandas.tools.plotting import scatter_matrix - scatter_matrix(df, alpha=0.2) # noqa F821 + + scatter_matrix(df, alpha=0.2) # noqa F821 - Add ``stacked`` argument to Series and DataFrame's ``plot`` method for @@ -31,12 +32,12 @@ New features .. code-block:: python - df.plot(kind='bar', stacked=True) # noqa F821 + df.plot(kind="bar", stacked=True) # noqa F821 .. code-block:: python - df.plot(kind='barh', stacked=True) # noqa F821 + df.plot(kind="barh", stacked=True) # noqa F821 - Add log x and y :ref:`scaling options ` to @@ -52,9 +53,9 @@ Reverted some changes to how NA values (represented typically as ``NaN`` or .. ipython:: python - series = pd.Series(['Steve', np.nan, 'Joe']) - series == 'Steve' - series != 'Steve' + series = pd.Series(["Steve", np.nan, "Joe"]) + series == "Steve" + series != "Steve" In comparisons, NA / NaN will always come through as ``False`` except with ``!=`` which is ``True``. *Be very careful* with boolean arithmetic, especially @@ -63,7 +64,7 @@ filter into boolean array operations if you are worried about this: .. ipython:: python - mask = series == 'Steve' + mask = series == "Steve" series[mask & series.notnull()] While propagating NA in comparisons may seem like the right behavior to some @@ -82,15 +83,18 @@ Series, to be more consistent with the ``groupby`` behavior with DataFrame: .. 
ipython:: python :okwarning: - df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'three', - 'two', 'two', 'one', 'three'], - 'C': np.random.randn(8), 'D': np.random.randn(8)}) + df = pd.DataFrame( + { + "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], + "B": ["one", "one", "two", "three", "two", "two", "one", "three"], + "C": np.random.randn(8), + "D": np.random.randn(8), + } + ) df - grouped = df.groupby('A')['C'] + grouped = df.groupby("A")["C"] grouped.describe() - grouped.apply(lambda x: x.sort_values()[-2:]) # top 2 values + grouped.apply(lambda x: x.sort_values()[-2:]) # top 2 values .. _whatsnew_0.7.3.contributors: diff --git a/doc/source/whatsnew/v0.8.0.rst b/doc/source/whatsnew/v0.8.0.rst index 9bba68d8c331d..8a84630a28b34 100644 --- a/doc/source/whatsnew/v0.8.0.rst +++ b/doc/source/whatsnew/v0.8.0.rst @@ -159,7 +159,8 @@ New plotting methods .. code-block:: python import pandas as pd - fx = pd.read_pickle('data/fx_prices') + + fx = pd.read_pickle("data/fx_prices") import matplotlib.pyplot as plt ``Series.plot`` now supports a ``secondary_y`` option: @@ -168,20 +169,19 @@ New plotting methods plt.figure() - fx['FR'].plot(style='g') + fx["FR"].plot(style="g") - fx['IT'].plot(style='k--', secondary_y=True) + fx["IT"].plot(style="k--", secondary_y=True) Vytautas Jancauskas, the 2012 GSOC participant, has added many new plot types. For example, ``'kde'`` is a new option: .. ipython:: python - s = pd.Series(np.concatenate((np.random.randn(1000), - np.random.randn(1000) * 0.5 + 3))) + s = pd.Series(np.concatenate((np.random.randn(1000), np.random.randn(1000) * 0.5 + 3))) plt.figure() s.hist(density=True, alpha=0.2) - s.plot(kind='kde') + s.plot(kind="kde") See :ref:`the plotting page ` for much more. @@ -205,7 +205,8 @@ with code using scalar values because you are handing control over to NumPy: .. 
ipython:: python import datetime - rng = pd.date_range('1/1/2000', periods=10) + + rng = pd.date_range("1/1/2000", periods=10) rng[5] isinstance(rng[5], datetime.datetime) rng_asarray = np.asarray(rng) @@ -251,7 +252,7 @@ type. See `matplotlib documentation .. ipython:: python - rng = pd.date_range('1/1/2000', periods=10) + rng = pd.date_range("1/1/2000", periods=10) rng np.asarray(rng) converted = np.asarray(rng, dtype=object) diff --git a/doc/source/whatsnew/v0.9.0.rst b/doc/source/whatsnew/v0.9.0.rst index 5172b1989765d..44ded51e31fda 100644 --- a/doc/source/whatsnew/v0.9.0.rst +++ b/doc/source/whatsnew/v0.9.0.rst @@ -41,9 +41,11 @@ API changes import io - data = ('0,0,1\n' - '1,1,0\n' - '0,1,0') + data = """ + 0,0,1 + 1,1,0 + 0,1,0 + """ df = pd.read_csv(io.StringIO(data), header=None) df @@ -59,7 +61,7 @@ API changes s1 = pd.Series([1, 2, 3]) s1 - s2 = pd.Series(s1, index=['foo', 'bar', 'baz']) + s2 = pd.Series(s1, index=["foo", "bar", "baz"]) s2 - Deprecated ``day_of_year`` API removed from PeriodIndex, use ``dayofyear`` From ee917d03f93da394f5750abcadc2b60bb00f919d Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 5 Oct 2020 13:58:39 +0100 Subject: [PATCH 1019/1025] TYP: check_untyped_defs core.arrays.base (#36885) --- pandas/core/arrays/base.py | 56 +++++++++++++++++++------------------- setup.cfg | 3 -- 2 files changed, 28 insertions(+), 31 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c2fc72ff753a8..94d6428b44043 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1176,22 +1176,22 @@ def _create_arithmetic_method(cls, op): @classmethod def _add_arithmetic_ops(cls): - cls.__add__ = cls._create_arithmetic_method(operator.add) - cls.__radd__ = cls._create_arithmetic_method(ops.radd) - cls.__sub__ = cls._create_arithmetic_method(operator.sub) - cls.__rsub__ = cls._create_arithmetic_method(ops.rsub) - cls.__mul__ = cls._create_arithmetic_method(operator.mul) - cls.__rmul__ = 
cls._create_arithmetic_method(ops.rmul) - cls.__pow__ = cls._create_arithmetic_method(operator.pow) - cls.__rpow__ = cls._create_arithmetic_method(ops.rpow) - cls.__mod__ = cls._create_arithmetic_method(operator.mod) - cls.__rmod__ = cls._create_arithmetic_method(ops.rmod) - cls.__floordiv__ = cls._create_arithmetic_method(operator.floordiv) - cls.__rfloordiv__ = cls._create_arithmetic_method(ops.rfloordiv) - cls.__truediv__ = cls._create_arithmetic_method(operator.truediv) - cls.__rtruediv__ = cls._create_arithmetic_method(ops.rtruediv) - cls.__divmod__ = cls._create_arithmetic_method(divmod) - cls.__rdivmod__ = cls._create_arithmetic_method(ops.rdivmod) + setattr(cls, "__add__", cls._create_arithmetic_method(operator.add)) + setattr(cls, "__radd__", cls._create_arithmetic_method(ops.radd)) + setattr(cls, "__sub__", cls._create_arithmetic_method(operator.sub)) + setattr(cls, "__rsub__", cls._create_arithmetic_method(ops.rsub)) + setattr(cls, "__mul__", cls._create_arithmetic_method(operator.mul)) + setattr(cls, "__rmul__", cls._create_arithmetic_method(ops.rmul)) + setattr(cls, "__pow__", cls._create_arithmetic_method(operator.pow)) + setattr(cls, "__rpow__", cls._create_arithmetic_method(ops.rpow)) + setattr(cls, "__mod__", cls._create_arithmetic_method(operator.mod)) + setattr(cls, "__rmod__", cls._create_arithmetic_method(ops.rmod)) + setattr(cls, "__floordiv__", cls._create_arithmetic_method(operator.floordiv)) + setattr(cls, "__rfloordiv__", cls._create_arithmetic_method(ops.rfloordiv)) + setattr(cls, "__truediv__", cls._create_arithmetic_method(operator.truediv)) + setattr(cls, "__rtruediv__", cls._create_arithmetic_method(ops.rtruediv)) + setattr(cls, "__divmod__", cls._create_arithmetic_method(divmod)) + setattr(cls, "__rdivmod__", cls._create_arithmetic_method(ops.rdivmod)) @classmethod def _create_comparison_method(cls, op): @@ -1199,12 +1199,12 @@ def _create_comparison_method(cls, op): @classmethod def _add_comparison_ops(cls): - cls.__eq__ = 
cls._create_comparison_method(operator.eq) - cls.__ne__ = cls._create_comparison_method(operator.ne) - cls.__lt__ = cls._create_comparison_method(operator.lt) - cls.__gt__ = cls._create_comparison_method(operator.gt) - cls.__le__ = cls._create_comparison_method(operator.le) - cls.__ge__ = cls._create_comparison_method(operator.ge) + setattr(cls, "__eq__", cls._create_comparison_method(operator.eq)) + setattr(cls, "__ne__", cls._create_comparison_method(operator.ne)) + setattr(cls, "__lt__", cls._create_comparison_method(operator.lt)) + setattr(cls, "__gt__", cls._create_comparison_method(operator.gt)) + setattr(cls, "__le__", cls._create_comparison_method(operator.le)) + setattr(cls, "__ge__", cls._create_comparison_method(operator.ge)) @classmethod def _create_logical_method(cls, op): @@ -1212,12 +1212,12 @@ def _create_logical_method(cls, op): @classmethod def _add_logical_ops(cls): - cls.__and__ = cls._create_logical_method(operator.and_) - cls.__rand__ = cls._create_logical_method(ops.rand_) - cls.__or__ = cls._create_logical_method(operator.or_) - cls.__ror__ = cls._create_logical_method(ops.ror_) - cls.__xor__ = cls._create_logical_method(operator.xor) - cls.__rxor__ = cls._create_logical_method(ops.rxor) + setattr(cls, "__and__", cls._create_logical_method(operator.and_)) + setattr(cls, "__rand__", cls._create_logical_method(ops.rand_)) + setattr(cls, "__or__", cls._create_logical_method(operator.or_)) + setattr(cls, "__ror__", cls._create_logical_method(ops.ror_)) + setattr(cls, "__xor__", cls._create_logical_method(operator.xor)) + setattr(cls, "__rxor__", cls._create_logical_method(ops.rxor)) class ExtensionScalarOpsMixin(ExtensionOpsMixin): diff --git a/setup.cfg b/setup.cfg index 3279a485c9bf3..75722f2a7809f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -142,9 +142,6 @@ check_untyped_defs=False [mypy-pandas.core.apply] check_untyped_defs=False -[mypy-pandas.core.arrays.base] -check_untyped_defs=False - [mypy-pandas.core.arrays.datetimelike] 
check_untyped_defs=False From 345aa61ecd99b0eef6e2a865fb5de5470bbcd2df Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 5 Oct 2020 13:59:08 +0100 Subject: [PATCH 1020/1025] TYP: check_untyped_defs compat.pickle_compat (#36884) --- pandas/compat/pickle_compat.py | 4 ++-- setup.cfg | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index ef9f36705a7ee..80ee1f2e20154 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -274,7 +274,7 @@ def patch_pickle(): """ orig_loads = pkl.loads try: - pkl.loads = loads + setattr(pkl, "loads", loads) yield finally: - pkl.loads = orig_loads + setattr(pkl, "loads", orig_loads) diff --git a/setup.cfg b/setup.cfg index 75722f2a7809f..e125eea226b10 100644 --- a/setup.cfg +++ b/setup.cfg @@ -136,9 +136,6 @@ check_untyped_defs=False [mypy-pandas._version] check_untyped_defs=False -[mypy-pandas.compat.pickle_compat] -check_untyped_defs=False - [mypy-pandas.core.apply] check_untyped_defs=False From 5e25a21a973a71ea869a0269804f67b7a6e9a4be Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 5 Oct 2020 06:01:36 -0700 Subject: [PATCH 1021/1025] PERF: Improve RollingGroupby.count (#36872) --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/window/common.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 5c2d099ed3119..a269580bc4453 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -284,6 +284,7 @@ Performance improvements - ``Styler`` uuid method altered to compress data transmission over web whilst maintaining reasonably low table collision probability (:issue:`36345`) - Performance improvement in :meth:`pd.to_datetime` with non-ns time unit for ``float`` ``dtype`` columns (:issue:`20445`) - Performance improvement in setting values on a :class:`IntervalArray` (:issue:`36310`) +- Performance 
improvement in :meth:`RollingGroupby.count` (:issue:`35625`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index 6452eb8c6b3a9..2e7e7cd47c336 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -58,7 +58,6 @@ def __init__(self, obj, *args, **kwargs): self._groupby.grouper.mutated = True super().__init__(obj, *args, **kwargs) - count = _dispatch("count") corr = _dispatch("corr", other=None, pairwise=None) cov = _dispatch("cov", other=None, pairwise=None) From ee436ada0ab047b12654879477c7ef8d098415c7 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Mon, 5 Oct 2020 15:03:29 +0100 Subject: [PATCH 1022/1025] DOC: 1.1.3 release date (#36887) --- doc/source/whatsnew/v1.1.3.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index af714b1bb2ab1..2323afbe00e5d 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -1,7 +1,7 @@ .. _whatsnew_113: -What's new in 1.1.3 (??) ------------------------- +What's new in 1.1.3 (October 5, 2020) +------------------------------------- These are the changes in pandas 1.1.3. See :ref:`release` for a full changelog including other versions of pandas. 
From b884267c83c1e41f58345b10f6cc1d6e644b969b Mon Sep 17 00:00:00 2001 From: Avinash Pancham <44933366+avinashpancham@users.noreply.github.com> Date: Mon, 5 Oct 2020 19:41:29 +0200 Subject: [PATCH 1023/1025] Add asv benchmarks for select_dtypes (#36839) --- asv_bench/benchmarks/dtypes.py | 57 ++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index bd17b710b108d..a5ed5c389fee4 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -1,5 +1,9 @@ +import string + import numpy as np +from pandas import DataFrame +import pandas._testing as tm from pandas.api.types import pandas_dtype from .pandas_vb_common import ( @@ -62,4 +66,57 @@ def time_infer(self, dtype): lib.infer_dtype(self.data_dict[dtype], skipna=False) +class SelectDtypes: + + params = [ + tm.ALL_INT_DTYPES + + tm.ALL_EA_INT_DTYPES + + tm.FLOAT_DTYPES + + tm.COMPLEX_DTYPES + + tm.DATETIME64_DTYPES + + tm.TIMEDELTA64_DTYPES + + tm.BOOL_DTYPES + ] + param_names = ["dtype"] + + def setup(self, dtype): + N, K = 5000, 50 + self.index = tm.makeStringIndex(N) + self.columns = tm.makeStringIndex(K) + + def create_df(data): + return DataFrame(data, index=self.index, columns=self.columns) + + self.df_int = create_df(np.random.randint(low=100, size=(N, K))) + self.df_float = create_df(np.random.randn(N, K)) + self.df_bool = create_df(np.random.choice([True, False], size=(N, K))) + self.df_string = create_df( + np.random.choice(list(string.ascii_letters), size=(N, K)) + ) + + def time_select_dtype_int_include(self, dtype): + self.df_int.select_dtypes(include=dtype) + + def time_select_dtype_int_exclude(self, dtype): + self.df_int.select_dtypes(exclude=dtype) + + def time_select_dtype_float_include(self, dtype): + self.df_float.select_dtypes(include=dtype) + + def time_select_dtype_float_exclude(self, dtype): + self.df_float.select_dtypes(exclude=dtype) + + def time_select_dtype_bool_include(self, 
dtype): + self.df_bool.select_dtypes(include=dtype) + + def time_select_dtype_bool_exclude(self, dtype): + self.df_bool.select_dtypes(exclude=dtype) + + def time_select_dtype_string_include(self, dtype): + self.df_string.select_dtypes(include=dtype) + + def time_select_dtype_string_exclude(self, dtype): + self.df_string.select_dtypes(exclude=dtype) + + from .pandas_vb_common import setup # noqa: F401 isort:skip From 5f14ef70d3e21dfd8b2685ff46dfaa84edb023e0 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <2658661+dsaxton@users.noreply.github.com> Date: Mon, 5 Oct 2020 15:05:44 -0500 Subject: [PATCH 1024/1025] DOC: Start v1.1.4 release notes (#36689) --- doc/source/whatsnew/index.rst | 1 + doc/source/whatsnew/v1.1.3.rst | 2 +- doc/source/whatsnew/v1.1.4.rst | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v1.1.4.rst diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 933ed3cb8babf..848121f822383 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -24,6 +24,7 @@ Version 1.1 .. toctree:: :maxdepth: 2 + v1.1.4 v1.1.3 v1.1.2 v1.1.1 diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 2323afbe00e5d..e752eb54d0c15 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -75,4 +75,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.1.2..v1.1.3|HEAD +.. contributors:: v1.1.2..v1.1.3 diff --git a/doc/source/whatsnew/v1.1.4.rst b/doc/source/whatsnew/v1.1.4.rst new file mode 100644 index 0000000000000..e63912ebc8fee --- /dev/null +++ b/doc/source/whatsnew/v1.1.4.rst @@ -0,0 +1,42 @@ +.. _whatsnew_114: + +What's new in 1.1.4 (??) +------------------------ + +These are the changes in pandas 1.1.4. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_114.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_114.bug_fixes: + +Bug fixes +~~~~~~~~~ +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_114.other: + +Other +~~~~~ +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_114.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.1.3..v1.1.4|HEAD From 6ebc186179f6de282b503f8595949973332f4454 Mon Sep 17 00:00:00 2001 From: tr4umatic4i Date: Wed, 3 Jun 2020 20:20:05 -0700 Subject: [PATCH 1025/1025] add support for na_rep when using pd.NA in _format_strings See #33950 --- pandas/io/formats/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index acc6c47efd236..f60b1269cab87 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1335,7 +1335,7 @@ def _format(x): elif x is NA and self.na_rep != "NaN": return self.na_rep elif x is NA: - return str(NA) + return self.na_rep elif x is NaT or np.isnat(x): return "NaT" except (TypeError, ValueError):