From f3d45aa02719e5a93347b02fb6c90b64b14f58b9 Mon Sep 17 00:00:00 2001 From: Josiah Baker Date: Sun, 29 Sep 2019 15:41:05 -0400 Subject: [PATCH 01/22] DOC: fix PR09 doc string errors in Timestamp class This fixes Parameter {} description should finish with "." in the Timestamp class closes #28673 --- pandas/_libs/tslibs/timestamps.pyx | 209 ++++++++++++++++------------- 1 file changed, 113 insertions(+), 96 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 6ca39d83afd25..5fa18cdca0aa5 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -251,11 +251,11 @@ class Timestamp(_Timestamp): Parameters ---------- ordinal : int - date corresponding to a proleptic Gregorian ordinal + Date corresponding to a proleptic Gregorian ordinal. freq : str, DateOffset - Offset which Timestamp will have + Offset to apply to the Timestamp. tz : str, pytz.timezone, dateutil.tz.tzfile or None - Time zone for time which Timestamp will have. + Time zone for the Timestamp. """ return cls(datetime.fromordinal(ordinal), freq=freq, tz=tz) @@ -271,7 +271,7 @@ class Timestamp(_Timestamp): Parameters ---------- tz : str or timezone object, default None - Timezone to localize to + Timezone to localize to. """ if isinstance(tz, str): tz = maybe_get_tz(tz) @@ -289,7 +289,7 @@ class Timestamp(_Timestamp): Parameters ---------- tz : str or timezone object, default None - Timezone to localize to + Timezone to localize to. """ return cls.now(tz) @@ -445,29 +445,32 @@ class Timestamp(_Timestamp): Parameters ---------- - freq : a freq string indicating the rounding resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times - - .. versionadded:: 0.24.0 + freq : str + Frequency string indicating the ceiling resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' + A nonexistent time does not exist in a particular timezone where + clocks moved forward due to DST. + + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. 
+ * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0. Returns ------- @@ -487,33 +490,36 @@ default 'raise' Parameters ---------- - freq : a freq string indicating the flooring resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times - - .. versionadded:: 0.24.0 + freq : str + Frequency string indicating the ceiling resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' + A nonexistent time does not exist in a particular timezone where + clocks moved forward due to DST. + + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0. Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """ return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) @@ -523,33 +529,36 @@ default 'raise' Parameters ---------- - freq : a freq string indicating the ceiling resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times - - .. versionadded:: 0.24.0 + freq : str + Frequency string indicating the ceiling resolution. 
+ ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' + A nonexistent time does not exist in a particular timezone where + clocks moved forward due to DST. + + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0. Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """ return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) @@ -606,7 +615,7 @@ default 'raise' Parameters ---------- locale : string, default None (English locale) - locale determining the language in which to return the day name + Locale determining the language in which to return the day name. Returns ------- @@ -623,7 +632,7 @@ default 'raise' Parameters ---------- locale : string, default None (English locale) - locale determining the language in which to return the month name + Locale determining the language in which to return the month name. Returns ------- @@ -779,35 +788,43 @@ default 'raise' `ambiguous` parameter dictates how ambiguous times should be handled. - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + The behavior is as follows: - .. versionadded:: 0.24.0 + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0. errors : 'raise', 'coerce', default None - - 'raise' will raise a NonExistentTimeError if a timestamp is not - valid in the specified timezone (e.g. 
due to a transition from - or to DST time). Use ``nonexistent='raise'`` instead. - - 'coerce' will return NaT if the timestamp can not be converted + Determine how errors should be handled. + + The behavior is as follows: + + * 'raise' will raise a NonExistentTimeError if a timestamp is not + valid in the specified timezone (e.g. due to a transition from + or to DST time). Use ``nonexistent='raise'`` instead. + * 'coerce' will return NaT if the timestamp can not be converted into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. deprecated:: 0.24.0 + .. deprecated:: 0.24.0. Returns ------- From a4ee42bb02707de49a578bfbe5d781182f7dd722 Mon Sep 17 00:00:00 2001 From: Josiah Baker Date: Sun, 29 Sep 2019 19:10:56 -0400 Subject: [PATCH 02/22] fix docstrings for Timestamp, NaT classes docstrings for Timestamp, NaT and Timedelta classes must match for overlapping methods. verified they now pass the test_nat.py test --- pandas/_libs/tslibs/nattype.pyx | 186 ++++++++++++++++------------- pandas/_libs/tslibs/timedeltas.pyx | 9 +- pandas/_libs/tslibs/timestamps.pyx | 19 ++- 3 files changed, 116 insertions(+), 98 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 328fc26e4fef6..c6ff48e755545 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -396,7 +396,7 @@ class NaTType(_NaT): Parameters ---------- locale : string, default None (English locale) - locale determining the language in which to return the month name + Locale determining the language in which to return the month name. Returns ------- @@ -411,7 +411,7 @@ class NaTType(_NaT): Parameters ---------- locale : string, default None (English locale) - locale determining the language in which to return the day name + Locale determining the language in which to return the day name. Returns ------- @@ -509,11 +509,11 @@ class NaTType(_NaT): Parameters ---------- ordinal : int - date corresponding to a proleptic Gregorian ordinal + Date corresponding to a proleptic Gregorian ordinal. freq : str, DateOffset - Offset which Timestamp will have + Offset to apply to the Timestamp. tz : str, pytz.timezone, dateutil.tz.tzfile or None - Time zone for time which Timestamp will have. + Time zone for the Timestamp. """) # _nat_methods @@ -534,7 +534,7 @@ class NaTType(_NaT): Parameters ---------- tz : str or timezone object, default None - Timezone to localize to + Timezone to localize to. """) today = _make_nat_func('today', # noqa:E128 """ @@ -547,7 +547,7 @@ class NaTType(_NaT): Parameters ---------- tz : str or timezone object, default None - Timezone to localize to + Timezone to localize to. """) round = _make_nat_func('round', # noqa:E128 """ @@ -555,29 +555,32 @@ class NaTType(_NaT): Parameters ---------- - freq : a freq string indicating the rounding resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' + freq : str + Frequency string indicating the rounding resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). 
+ * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. - .. versionadded:: 0.24.0 + .. versionadded:: 0.24.0. Returns ------- @@ -585,7 +588,7 @@ default 'raise' Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """) floor = _make_nat_func('floor', # noqa:E128 """ @@ -593,33 +596,36 @@ default 'raise' Parameters ---------- - freq : a freq string indicating the flooring resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' + freq : str + Frequency string indicating the flooring resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. - .. versionadded:: 0.24.0 + .. versionadded:: 0.24.0. Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. 
""") ceil = _make_nat_func('ceil', # noqa:E128 """ @@ -627,33 +633,36 @@ default 'raise' Parameters ---------- - freq : a freq string indicating the ceiling resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' + freq : str + Frequency string indicating the ceiling resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. - .. versionadded:: 0.24.0 + .. versionadded:: 0.24.0. Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """) tz_convert = _make_nat_func('tz_convert', # noqa:E128 @@ -694,35 +703,42 @@ default 'raise' `ambiguous` parameter dictates how ambiguous times should be handled. - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + The behavior is as follows: - .. 
versionadded:: 0.24.0 + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + .. versionadded:: 0.24.0. errors : 'raise', 'coerce', default None - - 'raise' will raise a NonExistentTimeError if a timestamp is not - valid in the specified timezone (e.g. due to a transition from - or to DST time). Use ``nonexistent='raise'`` instead. - - 'coerce' will return NaT if the timestamp can not be converted + Determine how errors should be handled. + + The behavior is as follows: + + * 'raise' will raise a NonExistentTimeError if a timestamp is not + valid in the specified timezone (e.g. due to a transition from + or to DST time). Use ``nonexistent='raise'`` instead. + * 'coerce' will return NaT if the timestamp can not be converted into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. deprecated:: 0.24.0 + .. deprecated:: 0.24.0. Returns ------- diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index b232042c70eac..82fa37bd9448a 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1323,7 +1323,8 @@ class Timedelta(_Timedelta): Parameters ---------- - freq : a freq string indicating the rounding resolution + freq : str + Frequency string indicating the rounding resolution. Returns ------- @@ -1341,7 +1342,8 @@ class Timedelta(_Timedelta): Parameters ---------- - freq : a freq string indicating the flooring resolution + freq : str + Frequency string indicating the flooring resolution. """ return self._round(freq, np.floor) @@ -1351,7 +1353,8 @@ class Timedelta(_Timedelta): Parameters ---------- - freq : a freq string indicating the ceiling resolution + freq : str + Frequency string indicating the ceiling resolution. """ return self._round(freq, np.ceil) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 5fa18cdca0aa5..197bd7ec6bb9b 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -446,7 +446,7 @@ class Timestamp(_Timestamp): Parameters ---------- freq : str - Frequency string indicating the ceiling resolution. + Frequency string indicating the rounding resolution. ambiguous : {'raise', 'NaT', bool}, default 'raise' The behavior is as follows: @@ -458,8 +458,8 @@ class Timestamp(_Timestamp): .. versionadded:: 0.24.0. nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' - A nonexistent time does not exist in a particular timezone where - clocks moved forward due to DST. + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. * 'shift_forward' will shift the nonexistent time forward to the closest existing time. @@ -478,7 +478,7 @@ timedelta}, default 'raise' Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """ return self._round( freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent @@ -491,7 +491,7 @@ timedelta}, default 'raise' Parameters ---------- freq : str - Frequency string indicating the ceiling resolution. + Frequency string indicating the flooring resolution. 
    ambiguous : {'raise', 'NaT', bool}, default 'raise'
        The behavior is as follows:
@@ -503,8 +503,8 @@ timedelta}, default 'raise'

        .. versionadded:: 0.24.0.
    nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
 timedelta}, default 'raise'
-        A nonexistent time does not exist in a particular timezone where
-        clocks moved forward due to DST.
+        A nonexistent time does not exist in a particular timezone
+        where clocks moved forward due to DST.

        * 'shift_forward' will shift the nonexistent time forward to the
          closest existing time.
@@ -542,8 +542,8 @@ timedelta}, default 'raise'

        .. versionadded:: 0.24.0.
    nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
 timedelta}, default 'raise'
-        A nonexistent time does not exist in a particular timezone where
-        clocks moved forward due to DST.
+        A nonexistent time does not exist in a particular timezone
+        where clocks moved forward due to DST.

        * 'shift_forward' will shift the nonexistent time forward to the
          closest existing time.
@@ -812,7 +812,6 @@ default 'raise'
          nonexistent times.

        .. versionadded:: 0.24.0.
-
    errors : 'raise', 'coerce', default None
        Determine how errors should be handled.

From 70703e1c7343e6bd02d9d155d945928fabc11f56 Mon Sep 17 00:00:00 2001
From: Josiah Baker
Date: Sun, 29 Sep 2019 19:38:56 -0400
Subject: [PATCH 03/22] fix unit description in Timedelta

---
 pandas/_libs/tslibs/timedeltas.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 82fa37bd9448a..5181fff2e589f 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1222,7 +1222,7 @@ class Timedelta(_Timedelta):
        'm', 'minute', 'min', 'minutes', 'T', 'S', 'seconds', 'sec', 'second',
        'ms', 'milliseconds', 'millisecond', 'milli', 'millis', 'L',
        'us', 'microseconds', 'microsecond', 'micro', 'micros', 'U',
-        'ns', 'nanoseconds', 'nano', 'nanos', 'nanosecond', 'N'}
+        'ns', 'nanoseconds', 'nano', 'nanos', 'nanosecond', 'N'}.
    **kwargs
        Available kwargs: {days, seconds, microseconds, milliseconds,
        minutes, hours, weeks}.
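The DST-handling semantics spelled out in the docstrings above are easiest to see with concrete values. A minimal sketch of the documented behavior (illustrative only, not part of the patch series; it assumes pandas >= 0.24, where the `ambiguous` and `nonexistent` keywords exist, and uses the 2018 US/Eastern transitions):

    import pandas as pd

    # Spring forward: 02:00-02:59 on 2018-03-11 does not exist in US/Eastern.
    ts = pd.Timestamp("2018-03-11 01:59:00", tz="US/Eastern")
    ts.ceil("H", nonexistent="shift_forward")  # moves to 03:00, the closest existing time
    ts.ceil("H", nonexistent="NaT")            # returns NaT instead of raising
    # ts.ceil("H")                             # default 'raise' -> NonExistentTimeError

    # Fall back: 01:00-01:59 on 2018-11-04 occurs twice in US/Eastern.
    ts = pd.Timestamp("2018-11-04 00:30:00", tz="US/Eastern")
    ts.ceil("H", ambiguous=True)               # True flags the result as the DST (first) 01:00
    ts.ceil("H", ambiguous="NaT")              # returns NaT for the ambiguous time
    # ts.ceil("H")                             # default 'raise' -> AmbiguousTimeError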
""" if isinstance(tz, str): tz = maybe_get_tz(tz) @@ -289,7 +289,7 @@ class Timestamp(_Timestamp): Parameters ---------- tz : str or timezone object, default None - Timezone to localize to + Timezone to localize to. """ return cls.now(tz) @@ -445,29 +445,32 @@ class Timestamp(_Timestamp): Parameters ---------- - freq : a freq string indicating the rounding resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times - - .. versionadded:: 0.24.0 + freq : str + Frequency string indicating the ceiling resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' + A nonexistent time does not exist in a particular timezone where + clocks moved forward due to DST. + + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0. Returns ------- @@ -487,33 +490,36 @@ default 'raise' Parameters ---------- - freq : a freq string indicating the flooring resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times - - .. versionadded:: 0.24.0 + freq : str + Frequency string indicating the ceiling resolution. 
+ ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' + A nonexistent time does not exist in a particular timezone where + clocks moved forward due to DST. + + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0. Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """ return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) @@ -523,33 +529,36 @@ default 'raise' Parameters ---------- - freq : a freq string indicating the ceiling resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times - - .. versionadded:: 0.24.0 + freq : str + Frequency string indicating the ceiling resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' + A nonexistent time does not exist in a particular timezone where + clocks moved forward due to DST. + + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0. Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. 
""" return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) @@ -606,7 +615,7 @@ default 'raise' Parameters ---------- locale : string, default None (English locale) - locale determining the language in which to return the day name + Locale determining the language in which to return the day name. Returns ------- @@ -623,7 +632,7 @@ default 'raise' Parameters ---------- locale : string, default None (English locale) - locale determining the language in which to return the month name + Locale determining the language in which to return the month name. Returns ------- @@ -779,35 +788,43 @@ default 'raise' `ambiguous` parameter dictates how ambiguous times should be handled. - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + The behavior is as follows: - .. versionadded:: 0.24.0 + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + + .. versionadded:: 0.24.0. errors : 'raise', 'coerce', default None - - 'raise' will raise a NonExistentTimeError if a timestamp is not - valid in the specified timezone (e.g. due to a transition from - or to DST time). Use ``nonexistent='raise'`` instead. - - 'coerce' will return NaT if the timestamp can not be converted + Determine how errors should be handled. + + The behavior is as follows: + + * 'raise' will raise a NonExistentTimeError if a timestamp is not + valid in the specified timezone (e.g. due to a transition from + or to DST time). Use ``nonexistent='raise'`` instead. + * 'coerce' will return NaT if the timestamp can not be converted into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. deprecated:: 0.24.0 + .. deprecated:: 0.24.0. Returns ------- From b58af1ae979b6ec86dfc03e5ce6363774af02ca2 Mon Sep 17 00:00:00 2001 From: Josiah Baker Date: Sun, 29 Sep 2019 19:10:56 -0400 Subject: [PATCH 05/22] fix docstrings for Timestamp, NaT classes docstrings for Timestamp, NaT and Timedelta classes must match for overlapping methods. 
verified they now pass the test_nat.py test --- pandas/_libs/tslibs/nattype.pyx | 186 ++++++++++++++++------------- pandas/_libs/tslibs/timedeltas.pyx | 9 +- pandas/_libs/tslibs/timestamps.pyx | 19 ++- 3 files changed, 116 insertions(+), 98 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 328fc26e4fef6..c6ff48e755545 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -396,7 +396,7 @@ class NaTType(_NaT): Parameters ---------- locale : string, default None (English locale) - locale determining the language in which to return the month name + Locale determining the language in which to return the month name. Returns ------- @@ -411,7 +411,7 @@ class NaTType(_NaT): Parameters ---------- locale : string, default None (English locale) - locale determining the language in which to return the day name + Locale determining the language in which to return the day name. Returns ------- @@ -509,11 +509,11 @@ class NaTType(_NaT): Parameters ---------- ordinal : int - date corresponding to a proleptic Gregorian ordinal + Date corresponding to a proleptic Gregorian ordinal. freq : str, DateOffset - Offset which Timestamp will have + Offset to apply to the Timestamp. tz : str, pytz.timezone, dateutil.tz.tzfile or None - Time zone for time which Timestamp will have. + Time zone for the Timestamp. """) # _nat_methods @@ -534,7 +534,7 @@ class NaTType(_NaT): Parameters ---------- tz : str or timezone object, default None - Timezone to localize to + Timezone to localize to. """) today = _make_nat_func('today', # noqa:E128 """ @@ -547,7 +547,7 @@ class NaTType(_NaT): Parameters ---------- tz : str or timezone object, default None - Timezone to localize to + Timezone to localize to. """) round = _make_nat_func('round', # noqa:E128 """ @@ -555,29 +555,32 @@ class NaTType(_NaT): Parameters ---------- - freq : a freq string indicating the rounding resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' + freq : str + Frequency string indicating the rounding resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. 
+ * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. - .. versionadded:: 0.24.0 + .. versionadded:: 0.24.0. Returns ------- @@ -585,7 +588,7 @@ default 'raise' Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """) floor = _make_nat_func('floor', # noqa:E128 """ @@ -593,33 +596,36 @@ default 'raise' Parameters ---------- - freq : a freq string indicating the flooring resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' + freq : str + Frequency string indicating the flooring resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. - .. versionadded:: 0.24.0 + .. versionadded:: 0.24.0. Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """) ceil = _make_nat_func('ceil', # noqa:E128 """ @@ -627,33 +633,36 @@ default 'raise' Parameters ---------- - freq : a freq string indicating the ceiling resolution - ambiguous : bool, 'NaT', default 'raise' - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time - - .. versionadded:: 0.24.0 - nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ -default 'raise' + freq : str + Frequency string indicating the ceiling resolution. + ambiguous : {'raise', 'NaT', bool}, default 'raise' + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). 
+ * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. + + .. versionadded:: 0.24.0. + nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ +timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. - .. versionadded:: 0.24.0 + .. versionadded:: 0.24.0. Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """) tz_convert = _make_nat_func('tz_convert', # noqa:E128 @@ -694,35 +703,42 @@ default 'raise' `ambiguous` parameter dictates how ambiguous times should be handled. - - bool contains flags to determine if time is dst or not (note - that this flag is only applicable for ambiguous fall dst dates) - - 'NaT' will return NaT for an ambiguous time - - 'raise' will raise an AmbiguousTimeError for an ambiguous time + The behavior is as follows: + + * bool contains flags to determine if time is dst or not (note + that this flag is only applicable for ambiguous fall dst dates). + * 'NaT' will return NaT for an ambiguous time. + * 'raise' will raise an AmbiguousTimeError for an ambiguous time. nonexistent : 'shift_forward', 'shift_backward, 'NaT', timedelta, \ default 'raise' A nonexistent time does not exist in a particular timezone where clocks moved forward due to DST. - - 'shift_forward' will shift the nonexistent time forward to the - closest existing time - - 'shift_backward' will shift the nonexistent time backward to the - closest existing time - - 'NaT' will return NaT where there are nonexistent times - - timedelta objects will shift nonexistent times by the timedelta - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times + The behavior is as follows: - .. versionadded:: 0.24.0 + * 'shift_forward' will shift the nonexistent time forward to the + closest existing time. + * 'shift_backward' will shift the nonexistent time backward to the + closest existing time. + * 'NaT' will return NaT where there are nonexistent times. + * timedelta objects will shift nonexistent times by the timedelta. + * 'raise' will raise an NonExistentTimeError if there are + nonexistent times. + .. versionadded:: 0.24.0. errors : 'raise', 'coerce', default None - - 'raise' will raise a NonExistentTimeError if a timestamp is not - valid in the specified timezone (e.g. due to a transition from - or to DST time). Use ``nonexistent='raise'`` instead. - - 'coerce' will return NaT if the timestamp can not be converted + Determine how errors should be handled. + + The behavior is as follows: + + * 'raise' will raise a NonExistentTimeError if a timestamp is not + valid in the specified timezone (e.g. 
due to a transition from + or to DST time). Use ``nonexistent='raise'`` instead. + * 'coerce' will return NaT if the timestamp can not be converted into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. deprecated:: 0.24.0 + .. deprecated:: 0.24.0. Returns ------- diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index b232042c70eac..82fa37bd9448a 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1323,7 +1323,8 @@ class Timedelta(_Timedelta): Parameters ---------- - freq : a freq string indicating the rounding resolution + freq : str + Frequency string indicating the rounding resolution. Returns ------- @@ -1341,7 +1342,8 @@ class Timedelta(_Timedelta): Parameters ---------- - freq : a freq string indicating the flooring resolution + freq : str + Frequency string indicating the flooring resolution. """ return self._round(freq, np.floor) @@ -1351,7 +1353,8 @@ class Timedelta(_Timedelta): Parameters ---------- - freq : a freq string indicating the ceiling resolution + freq : str + Frequency string indicating the ceiling resolution. """ return self._round(freq, np.ceil) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 5fa18cdca0aa5..197bd7ec6bb9b 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -446,7 +446,7 @@ class Timestamp(_Timestamp): Parameters ---------- freq : str - Frequency string indicating the ceiling resolution. + Frequency string indicating the rounding resolution. ambiguous : {'raise', 'NaT', bool}, default 'raise' The behavior is as follows: @@ -458,8 +458,8 @@ class Timestamp(_Timestamp): .. versionadded:: 0.24.0. nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' - A nonexistent time does not exist in a particular timezone where - clocks moved forward due to DST. + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. * 'shift_forward' will shift the nonexistent time forward to the closest existing time. @@ -478,7 +478,7 @@ timedelta}, default 'raise' Raises ------ - ValueError if the freq cannot be converted + ValueError if the freq cannot be converted. """ return self._round( freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent @@ -491,7 +491,7 @@ timedelta}, default 'raise' Parameters ---------- freq : str - Frequency string indicating the ceiling resolution. + Frequency string indicating the flooring resolution. ambiguous : {'raise', 'NaT', bool}, default 'raise' The behavior is as follows: @@ -503,8 +503,8 @@ timedelta}, default 'raise' .. versionadded:: 0.24.0. nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' - A nonexistent time does not exist in a particular timezone where - clocks moved forward due to DST. + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. * 'shift_forward' will shift the nonexistent time forward to the closest existing time. @@ -542,8 +542,8 @@ timedelta}, default 'raise' .. versionadded:: 0.24.0. nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' - A nonexistent time does not exist in a particular timezone where - clocks moved forward due to DST. + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. * 'shift_forward' will shift the nonexistent time forward to the closest existing time. 
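Timedelta rounding, touched by the unit fix above, takes the same kind of frequency string but has no timezone concerns. A quick sketch under the same caveat (illustrative, not from the patches):

    import pandas as pd

    td = pd.Timedelta("2 hours 17 min")
    td.round("15T")              # Timedelta('0 days 02:15:00'); '15T' = 15-minute resolution
    td.floor("H")                # Timedelta('0 days 02:00:00')
    td.ceil("H")                 # Timedelta('0 days 03:00:00')
    pd.Timedelta(90, unit="sec") # Timedelta('0 days 00:01:30'), using a unit alias from the list above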
From e3eb09efd05b9c39228b0da68af43765ccfea43d Mon Sep 17 00:00:00 2001
From: Josiah Baker
Date: Mon, 30 Sep 2019 20:53:36 -0400
Subject: [PATCH 07/22] change ambiguous param args and remove period

change ambiguous parameter to be more readable
remove the period after versionadd

---
 pandas/_libs/tslibs/nattype.pyx    | 22 +++++++++++-----------
 pandas/_libs/tslibs/timestamps.pyx | 22 +++++++++++-----------
 2 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx
index c6ff48e755545..75462b4b0a914 100644
--- a/pandas/_libs/tslibs/nattype.pyx
+++ b/pandas/_libs/tslibs/nattype.pyx
@@ -557,7 +557,7 @@ class NaTType(_NaT):
    ----------
    freq : str
        Frequency string indicating the rounding resolution.
-    ambiguous : {'raise', 'NaT', bool}, default 'raise'
+    ambiguous : bool or {'raise', 'NaT'}, default 'raise'
        The behavior is as follows:

        * bool contains flags to determine if time is dst or not (note
@@ -565,7 +565,7 @@ class NaTType(_NaT):
        * 'NaT' will return NaT for an ambiguous time.
        * 'raise' will raise an AmbiguousTimeError for an ambiguous time.

-        .. versionadded:: 0.24.0.
+        .. versionadded:: 0.24.0
    nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
 timedelta}, default 'raise'
        A nonexistent time does not exist in a particular timezone
@@ -580,7 +580,7 @@ timedelta}, default 'raise'
        * 'raise' will raise an NonExistentTimeError if there are
          nonexistent times.

-        .. versionadded:: 0.24.0.
+        .. versionadded:: 0.24.0

    Returns
    -------
@@ -598,7 +598,7 @@ timedelta}, default 'raise'
    ----------
    freq : str
        Frequency string indicating the flooring resolution.
-    ambiguous : {'raise', 'NaT', bool}, default 'raise'
+    ambiguous : bool or {'raise', 'NaT'}, default 'raise'
        The behavior is as follows:

        * bool contains flags to determine if time is dst or not (note
@@ -606,7 +606,7 @@ timedelta}, default 'raise'
        * 'NaT' will return NaT for an ambiguous time.
        * 'raise' will raise an AmbiguousTimeError for an ambiguous time.

-        .. versionadded:: 0.24.0.
+        .. versionadded:: 0.24.0
    nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \
 timedelta}, default 'raise'
        A nonexistent time does not exist in a particular timezone
@@ -621,7 +621,7 @@ timedelta}, default 'raise'
        * 'raise' will raise an NonExistentTimeError if there are
          nonexistent times.

-        .. versionadded:: 0.24.0.
+        ..
versionadded:: 0.24.0 Raises ------ @@ -635,7 +635,7 @@ timedelta}, default 'raise' ---------- freq : str Frequency string indicating the ceiling resolution. - ambiguous : {'raise', 'NaT', bool}, default 'raise' + ambiguous : bool or {'raise', 'NaT'}, default 'raise' The behavior is as follows: * bool contains flags to determine if time is dst or not (note @@ -643,7 +643,7 @@ timedelta}, default 'raise' * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -658,7 +658,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 Raises ------ @@ -726,7 +726,7 @@ default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 errors : 'raise', 'coerce', default None Determine how errors should be handled. @@ -738,7 +738,7 @@ default 'raise' * 'coerce' will return NaT if the timestamp can not be converted into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. deprecated:: 0.24.0. + .. deprecated:: 0.24.0 Returns ------- diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 197bd7ec6bb9b..d30ab7b854309 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -447,7 +447,7 @@ class Timestamp(_Timestamp): ---------- freq : str Frequency string indicating the rounding resolution. - ambiguous : {'raise', 'NaT', bool}, default 'raise' + ambiguous : bool or {'raise', 'NaT'}, default 'raise' The behavior is as follows: * bool contains flags to determine if time is dst or not (note @@ -455,7 +455,7 @@ class Timestamp(_Timestamp): * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -470,7 +470,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 Returns ------- @@ -492,7 +492,7 @@ timedelta}, default 'raise' ---------- freq : str Frequency string indicating the flooring resolution. - ambiguous : {'raise', 'NaT', bool}, default 'raise' + ambiguous : bool or {'raise', 'NaT'}, default 'raise' The behavior is as follows: * bool contains flags to determine if time is dst or not (note @@ -500,7 +500,7 @@ timedelta}, default 'raise' * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -515,7 +515,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 Raises ------ @@ -531,7 +531,7 @@ timedelta}, default 'raise' ---------- freq : str Frequency string indicating the ceiling resolution. 
- ambiguous : {'raise', 'NaT', bool}, default 'raise' + ambiguous : bool or {'raise', 'NaT'}, default 'raise' The behavior is as follows: * bool contains flags to determine if time is dst or not (note @@ -539,7 +539,7 @@ timedelta}, default 'raise' * 'NaT' will return NaT for an ambiguous time. * 'raise' will raise an AmbiguousTimeError for an ambiguous time. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 nonexistent : {'raise', 'shift_forward', 'shift_backward, 'NaT', \ timedelta}, default 'raise' A nonexistent time does not exist in a particular timezone @@ -554,7 +554,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 Raises ------ @@ -811,7 +811,7 @@ default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 errors : 'raise', 'coerce', default None Determine how errors should be handled. @@ -823,7 +823,7 @@ default 'raise' * 'coerce' will return NaT if the timestamp can not be converted into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. deprecated:: 0.24.0. + .. deprecated:: 0.24.0 Returns ------- From 776451e8414bfd0c6f554f6f9b389a7f1aeb9b19 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 30 Sep 2019 21:00:18 -0700 Subject: [PATCH 08/22] CLN: Assorted typings (#28604) --- pandas/core/algorithms.py | 24 ++++++++++++------------ pandas/core/util/hashing.py | 21 ++++++++++++++------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6e73e1636a75b..002bbcc63d04f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -176,7 +176,6 @@ def _reconstruct_data(values, dtype, original): ------- Index for extension types, otherwise ndarray casted to dtype """ - from pandas import Index if is_extension_array_dtype(dtype): values = dtype.construct_array_type()._from_sequence(values) @@ -184,7 +183,7 @@ def _reconstruct_data(values, dtype, original): values = values.astype(dtype) # we only support object dtypes bool Index - if isinstance(original, Index): + if isinstance(original, ABCIndexClass): values = values.astype(object) elif dtype is not None: values = values.astype(dtype) @@ -833,7 +832,7 @@ def duplicated(values, keep="first"): return f(values, keep=keep) -def mode(values, dropna=True): +def mode(values, dropna: bool = True): """ Returns the mode(s) of an array. @@ -1888,7 +1887,7 @@ def searchsorted(arr, value, side="left", sorter=None): } -def diff(arr, n, axis=0): +def diff(arr, n: int, axis: int = 0): """ difference of n between self, analogous to s-s.shift(n) @@ -1904,7 +1903,6 @@ def diff(arr, n, axis=0): Returns ------- shifted - """ n = int(n) @@ -1935,13 +1933,15 @@ def diff(arr, n, axis=0): f = _diff_special[arr.dtype.name] f(arr, out_arr, n, axis) else: - res_indexer = [slice(None)] * arr.ndim - res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n) - res_indexer = tuple(res_indexer) - - lag_indexer = [slice(None)] * arr.ndim - lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None) - lag_indexer = tuple(lag_indexer) + # To keep mypy happy, _res_indexer is a list while res_indexer is + # a tuple, ditto for lag_indexer. 
+ _res_indexer = [slice(None)] * arr.ndim + _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n) + res_indexer = tuple(_res_indexer) + + _lag_indexer = [slice(None)] * arr.ndim + _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None) + lag_indexer = tuple(_lag_indexer) # need to make sure that we account for na for datelike/timedelta # we don't actually want to subtract these i8 numbers diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index bcdbf0855cbb4..4bcc53606aeca 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -26,7 +26,7 @@ _default_hash_key = "0123456789123456" -def _combine_hash_arrays(arrays, num_items): +def _combine_hash_arrays(arrays, num_items: int): """ Parameters ---------- @@ -55,7 +55,11 @@ def _combine_hash_arrays(arrays, num_items): def hash_pandas_object( - obj, index=True, encoding="utf8", hash_key=None, categorize=True + obj, + index: bool = True, + encoding: str = "utf8", + hash_key=None, + categorize: bool = True, ): """ Return a data hash of the Index/Series/DataFrame. @@ -125,7 +129,10 @@ def hash_pandas_object( for _ in [None] ) num_items += 1 - hashes = itertools.chain(hashes, index_hash_generator) + + # keep `hashes` specifically a generator to keep mypy happy + _hashes = itertools.chain(hashes, index_hash_generator) + hashes = (x for x in _hashes) h = _combine_hash_arrays(hashes, num_items) h = Series(h, index=obj.index, dtype="uint64", copy=False) @@ -179,7 +186,7 @@ def hash_tuples(vals, encoding="utf8", hash_key=None): return h -def hash_tuple(val, encoding="utf8", hash_key=None): +def hash_tuple(val, encoding: str = "utf8", hash_key=None): """ Hash a single tuple efficiently @@ -201,7 +208,7 @@ def hash_tuple(val, encoding="utf8", hash_key=None): return h -def _hash_categorical(c, encoding, hash_key): +def _hash_categorical(c, encoding: str, hash_key: str): """ Hash a Categorical by hashing its categories, and then mapping the codes to the hashes @@ -239,7 +246,7 @@ def _hash_categorical(c, encoding, hash_key): return result -def hash_array(vals, encoding="utf8", hash_key=None, categorize=True): +def hash_array(vals, encoding: str = "utf8", hash_key=None, categorize: bool = True): """ Given a 1d array, return an array of deterministic integers. @@ -317,7 +324,7 @@ def hash_array(vals, encoding="utf8", hash_key=None, categorize=True): return vals -def _hash_scalar(val, encoding="utf8", hash_key=None): +def _hash_scalar(val, encoding: str = "utf8", hash_key=None): """ Hash scalar value From 962140f39fea80f33ef2d22dd8566cd6045eb6cb Mon Sep 17 00:00:00 2001 From: Josiah Baker Date: Tue, 1 Oct 2019 00:40:56 -0400 Subject: [PATCH 09/22] remove periods after versionadd and fix deprecated directive missed a few periods from the last commit also incorrectly moved a sphinx directive earlier --- pandas/_libs/tslibs/nattype.pyx | 8 ++++---- pandas/_libs/tslibs/timestamps.pyx | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index c00fac48d51f0..b17c6079d81fd 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -580,7 +580,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 Returns ------- @@ -621,7 +621,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. 
versionadded:: 0.24.0 Raises ------ @@ -658,7 +658,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 Raises ------ @@ -738,7 +738,7 @@ default 'raise' * 'coerce' will return NaT if the timestamp can not be converted into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. deprecated:: 0.24.0 + .. deprecated:: 0.24.0 Returns ------- diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 8bb8c543159e1..261fd7d8068aa 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -470,7 +470,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 Returns ------- @@ -515,7 +515,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 Raises ------ @@ -554,7 +554,7 @@ timedelta}, default 'raise' * 'raise' will raise an NonExistentTimeError if there are nonexistent times. - .. versionadded:: 0.24.0. + .. versionadded:: 0.24.0 Raises ------ @@ -823,7 +823,7 @@ default 'raise' * 'coerce' will return NaT if the timestamp can not be converted into the specified timezone. Use ``nonexistent='NaT'`` instead. - .. deprecated:: 0.24.0 + .. deprecated:: 0.24.0 Returns ------- From 225d5e2bb5ac2443bf34af1f57d1ad42982a4680 Mon Sep 17 00:00:00 2001 From: Jack Bicknell Date: Tue, 1 Oct 2019 05:08:12 +0100 Subject: [PATCH 10/22] DOC: Fixed PR08 docstring errors in pandas.tseries (#28571) --- pandas/tseries/offsets.py | 116 +++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 46 deletions(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 82cbfa831bf32..4ebb4f353a8fd 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -1007,9 +1007,9 @@ class CustomBusinessDay(_CustomMixin, BusinessDay): normalize : bool, default False Normalize start/end dates to midnight before generating date range weekmask : str, Default 'Mon Tue Wed Thu Fri' - weekmask of valid business days, passed to ``numpy.busdaycalendar`` + Weekmask of valid business days, passed to ``numpy.busdaycalendar`` holidays : list - list/array of dates to exclude from the set of valid business days, + List/array of dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar`` calendar : pd.HolidayCalendar or np.busdaycalendar offset : timedelta, default timedelta(0) @@ -1671,16 +1671,19 @@ class WeekOfMonth(_WeekOfMonthMixin, DateOffset): Parameters ---------- n : int - week : {0, 1, 2, 3, ...}, default 0 - 0 is 1st week of month, 1 2nd week, etc. - weekday : {0, 1, ..., 6}, default 0 - 0: Mondays - 1: Tuesdays - 2: Wednesdays - 3: Thursdays - 4: Fridays - 5: Saturdays - 6: Sundays + week : int {0, 1, 2, 3, ...}, default 0 + A specific integer for the week of the month. + e.g. 0 is 1st week of month, 1 is the 2nd week, etc. + weekday : int {0, 1, ..., 6}, default 0 + A specific integer for the day of the week. 
+ + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday """ _prefix = "WOM" @@ -1747,14 +1750,16 @@ class LastWeekOfMonth(_WeekOfMonthMixin, DateOffset): Parameters ---------- n : int, default 1 - weekday : {0, 1, ..., 6}, default 0 - 0: Mondays - 1: Tuesdays - 2: Wednesdays - 3: Thursdays - 4: Fridays - 5: Saturdays - 6: Sundays + weekday : int {0, 1, ..., 6}, default 0 + A specific integer for the day of the week. + + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday """ _prefix = "LWOM" @@ -2055,6 +2060,7 @@ class FY5253(DateOffset): http://en.wikipedia.org/wiki/4-4-5_calendar The year may either: + - end on the last X day of the Y month. - end on the last X day closest to the last day of the Y month. @@ -2064,17 +2070,25 @@ class FY5253(DateOffset): Parameters ---------- n : int - weekday : {0, 1, ..., 6} - 0: Mondays - 1: Tuesdays - 2: Wednesdays - 3: Thursdays - 4: Fridays - 5: Saturdays - 6: Sundays - startingMonth : The month in which fiscal years end. {1, 2, ... 12} - variation : str - {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth" + weekday : int {0, 1, ..., 6}, default 0 + A specific integer for the day of the week. + + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday + + startingMonth : int {1, 2, ... 12}, default 1 + The month in which the fiscal year ends. + + variation : str, default "nearest" + Method of employing 4-4-5 calendar. There are two options: + + - "nearest" means year end is **weekday** closest to last day of month in year. + - "last" means year end is final **weekday** of the final month in fiscal year. """ _prefix = "RE" @@ -2258,6 +2272,7 @@ class FY5253Quarter(DateOffset): http://en.wikipedia.org/wiki/4-4-5_calendar The year may either: + - end on the last X day of the Y month. - end on the last X day closest to the last day of the Y month. @@ -2271,19 +2286,28 @@ class FY5253Quarter(DateOffset): Parameters ---------- n : int - weekday : {0, 1, ..., 6} - 0: Mondays - 1: Tuesdays - 2: Wednesdays - 3: Thursdays - 4: Fridays - 5: Saturdays - 6: Sundays - startingMonth : The month in which fiscal years end. {1, 2, ... 12} - qtr_with_extra_week : The quarter number that has the leap - or 14 week when needed. {1,2,3,4} - variation : str - {"nearest", "last"} for "LastOfMonth" or "NearestEndMonth" + weekday : int {0, 1, ..., 6}, default 0 + A specific integer for the day of the week. + + - 0 is Monday + - 1 is Tuesday + - 2 is Wednesday + - 3 is Thursday + - 4 is Friday + - 5 is Saturday + - 6 is Sunday + + startingMonth : int {1, 2, ..., 12}, default 1 + The month in which fiscal years end. + + qtr_with_extra_week : int {1, 2, 3, 4}, default 1 + The quarter number that has the leap or 14 week when needed. + + variation : str, default "nearest" + Method of employing 4-4-5 calendar. There are two options: + + - "nearest" means year end is **weekday** closest to last day of month in year. + - "last" means year end is final **weekday** of the final month in fiscal year. 
""" _prefix = "REQ" @@ -2707,8 +2731,8 @@ def generate_range(start=None, end=None, periods=None, offset=BDay()): Parameters ---------- - start : datetime (default None) - end : datetime (default None) + start : datetime, (default None) + end : datetime, (default None) periods : int, (default None) offset : DateOffset, (default BDay()) From 4661d77bbf3ae97d21be8a2a5bdfa4f900a20e2f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 1 Oct 2019 06:59:03 -0500 Subject: [PATCH 11/22] DOC: Add scaling to large datasets section (#28577) * DOC: Add scaling to large datasets section Closes https://github.com/pandas-dev/pandas/issues/28315 --- doc/.gitignore | 4 + doc/source/index.rst.template | 1 + doc/source/user_guide/index.rst | 1 + doc/source/user_guide/scale.rst | 373 ++++++++++++++++++++++++ doc/source/whatsnew/v1.0.0.rst | 7 + environment.yml | 8 +- pandas/util/testing.py | 81 +++++ requirements-dev.txt | 7 +- scripts/generate_pip_deps_from_conda.py | 2 +- 9 files changed, 481 insertions(+), 3 deletions(-) create mode 100644 doc/.gitignore create mode 100644 doc/source/user_guide/scale.rst diff --git a/doc/.gitignore b/doc/.gitignore new file mode 100644 index 0000000000000..e23892d6100e8 --- /dev/null +++ b/doc/.gitignore @@ -0,0 +1,4 @@ +data/ +timeseries.csv +timeseries.parquet +timeseries_wide.parquet diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index f5669626aa2b3..6ff42eee9dad2 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -83,6 +83,7 @@ See the :ref:`overview` for more detail about what's in the library. * :doc:`user_guide/style` * :doc:`user_guide/options` * :doc:`user_guide/enhancingperf` + * :doc:`user_guide/scale` * :doc:`user_guide/sparse` * :doc:`user_guide/gotchas` * :doc:`user_guide/cookbook` diff --git a/doc/source/user_guide/index.rst b/doc/source/user_guide/index.rst index 05df83decbd7e..b86961a71433b 100644 --- a/doc/source/user_guide/index.rst +++ b/doc/source/user_guide/index.rst @@ -38,6 +38,7 @@ Further information on any specific method can be obtained in the style options enhancingperf + scale sparse gotchas cookbook diff --git a/doc/source/user_guide/scale.rst b/doc/source/user_guide/scale.rst new file mode 100644 index 0000000000000..7b590a3a1fcc8 --- /dev/null +++ b/doc/source/user_guide/scale.rst @@ -0,0 +1,373 @@ +.. _scale: + +************************* +Scaling to large datasets +************************* + +Pandas provides data structures for in-memory analytics, which makes using pandas +to analyze datasets that are larger than memory datasets somewhat tricky. Even datasets +that are a sizable fraction of memory become unwieldy, as some pandas operations need +to make intermediate copies. + +This document provides a few recommendations for scaling your analysis to larger datasets. +It's a complement to :ref:`enhancingperf`, which focuses on speeding up analysis +for datasets that fit in memory. + +But first, it's worth considering *not using pandas*. Pandas isn't the right +tool for all situations. If you're working with very large datasets and a tool +like PostgreSQL fits your needs, then you should probably be using that. +Assuming you want or need the expressiveness and power of pandas, let's carry on. + +.. ipython:: python + + import pandas as pd + import numpy as np + +.. 
ipython:: python
+    :suppress:
+
+    from pandas.util.testing import _make_timeseries
+
+    # Make a random in-memory dataset
+    ts = _make_timeseries(freq="30S", seed=0)
+    ts.to_csv("timeseries.csv")
+    ts.to_parquet("timeseries.parquet")
+
+
+Load less data
+--------------
+
+.. ipython:: python
+    :suppress:
+
+    # make a similar dataset with many columns
+    timeseries = [
+        _make_timeseries(freq="1T", seed=i).rename(columns=lambda x: f"{x}_{i}")
+        for i in range(10)
+    ]
+    ts_wide = pd.concat(timeseries, axis=1)
+    ts_wide.to_parquet("timeseries_wide.parquet")
+
+Suppose our raw dataset on disk has many columns::
+
+    id_0 name_0 x_0 y_0 id_1 name_1 x_1 ... name_8 x_8 y_8 id_9 name_9 x_9 y_9
+    timestamp ...
+    2000-01-01 00:00:00 1015 Michael -0.399453 0.095427 994 Frank -0.176842 ... Dan -0.315310 0.713892 1025 Victor -0.135779 0.346801
+    2000-01-01 00:01:00 969 Patricia 0.650773 -0.874275 1003 Laura 0.459153 ... Ursula 0.913244 -0.630308 1047 Wendy -0.886285 0.035852
+    2000-01-01 00:02:00 1016 Victor -0.721465 -0.584710 1046 Michael 0.524994 ... Ray -0.656593 0.692568 1064 Yvonne 0.070426 0.432047
+    2000-01-01 00:03:00 939 Alice -0.746004 -0.908008 996 Ingrid -0.414523 ... Jerry -0.958994 0.608210 978 Wendy 0.855949 -0.648988
+    2000-01-01 00:04:00 1017 Dan 0.919451 -0.803504 1048 Jerry -0.569235 ... Frank -0.577022 -0.409088 994 Bob -0.270132 0.335176
+    ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
+    2000-12-30 23:56:00 999 Tim 0.162578 0.512817 973 Kevin -0.403352 ... Tim -0.380415 0.008097 1041 Charlie 0.191477 -0.599519
+    2000-12-30 23:57:00 970 Laura -0.433586 -0.600289 958 Oliver -0.966577 ... Zelda 0.971274 0.402032 1038 Ursula 0.574016 -0.930992
+    2000-12-30 23:58:00 1065 Edith 0.232211 -0.454540 971 Tim 0.158484 ... Alice -0.222079 -0.919274 1022 Dan 0.031345 -0.657755
+    2000-12-30 23:59:00 1019 Ingrid 0.322208 -0.615974 981 Hannah 0.607517 ... Sarah -0.424440 -0.117274 990 George -0.375530 0.563312
+    2000-12-31 00:00:00 937 Ursula -0.906523 0.943178 1018 Alice -0.564513 ... Jerry 0.236837 0.807650 985 Oliver 0.777642 0.783392
+
+    [525601 rows x 40 columns]
+
+
+To load the columns we want, we have two options.
+Option 1 loads in all the data and then filters to what we need.
+
+.. ipython:: python
+
+    columns = ['id_0', 'name_0', 'x_0', 'y_0']
+
+    pd.read_parquet("timeseries_wide.parquet")[columns]
+
+Option 2 only loads the columns we request.
+
+.. ipython:: python
+
+    pd.read_parquet("timeseries_wide.parquet", columns=columns)
+
+If we were to measure the memory usage of the two calls, we'd see that specifying
+``columns`` uses about 1/10th the memory in this case.
+
+With :func:`pandas.read_csv`, you can specify ``usecols`` to limit the columns
+read into memory. Not all file formats that can be read by pandas provide an option
+to read a subset of columns.
+
+Use efficient datatypes
+-----------------------
+
+The default pandas data types are not the most memory efficient. This is
+especially true for low-cardinality text data (columns with relatively few
+unique values). By using more efficient data types you can store larger datasets
+in memory.
+
+.. ipython:: python
+
+    ts = pd.read_parquet("timeseries.parquet")
+    ts
+
+Now, let's inspect the data types and memory usage to see where we should focus our
+attention.
+
+.. ipython:: python
+
+    ts.dtypes
+
+.. ipython:: python
+
+    ts.memory_usage(deep=True)  # memory usage in bytes
+
+
+The ``name`` column is taking up much more memory than any other. 
It has just a +few unique values, so it's a good candidate for converting to a +:class:`Categorical`. With a Categorical, we store each unique name once and use +space-efficient integers to know which specific name is used in each row. + + +.. ipython:: python + + ts2 = ts.copy() + ts2['name'] = ts2['name'].astype('category') + ts2.memory_usage(deep=True) + +We can go a bit further and downcast the numeric columns to their smallest types +using :func:`pandas.to_numeric`. + +.. ipython:: python + + ts2['id'] = pd.to_numeric(ts2['id'], downcast='unsigned') + ts2[['x', 'y']] = ts2[['x', 'y']].apply(pd.to_numeric, downcast='float') + ts2.dtypes + +.. ipython:: python + + ts2.memory_usage(deep=True) + +.. ipython:: python + + reduction = (ts2.memory_usage(deep=True).sum() + / ts.memory_usage(deep=True).sum()) + print(f"{reduction:0.2f}") + +In all, we've reduced the in-memory footprint of this dataset to 1/5 of its +original size. + +See :ref:`categorical` for more on ``Categorical`` and :ref:`basics.dtypes` +for an overview of all of pandas' dtypes. + +Use chunking +------------ + +Some workloads can be achieved with chunking: splitting a large problem like "convert this +directory of CSVs to parquet" into a bunch of small problems ("convert this individual CSV +file into a Parquet file. Now repeat that for each file in this directory."). As long as each chunk +fits in memory, you can work with datasets that are much larger than memory. + +.. note:: + + Chunking works well when the operation you're performing requires zero or minimal + coordination between chunks. For more complicated workflows, you're better off + :ref:`using another library `. + +Suppose we have an even larger "logical dataset" on disk that's a directory of parquet +files. Each file in the directory represents a different year of the entire dataset. + +.. ipython:: python + :suppress: + + import pathlib + + N = 12 + starts = [f'20{i:>02d}-01-01' for i in range(N)] + ends = [f'20{i:>02d}-12-13' for i in range(N)] + + pathlib.Path("data/timeseries").mkdir(exist_ok=True) + + for i, (start, end) in enumerate(zip(starts, ends)): + ts = _make_timeseries(start=start, end=end, freq='1T', seed=i) + ts.to_parquet(f"data/timeseries/ts-{i:0>2d}.parquet") + + +:: + + data + └── timeseries + ├── ts-00.parquet + ├── ts-01.parquet + ├── ts-02.parquet + ├── ts-03.parquet + ├── ts-04.parquet + ├── ts-05.parquet + ├── ts-06.parquet + ├── ts-07.parquet + ├── ts-08.parquet + ├── ts-09.parquet + ├── ts-10.parquet + └── ts-11.parquet + +Now we'll implement an out-of-core ``value_counts``. The peak memory usage of this +workflow is the single largest chunk, plus a small series storing the unique value +counts up to this point. As long as each individual file fits in memory, this will +work for arbitrary-sized datasets. + +.. ipython:: python + + %%time + files = pathlib.Path("data/timeseries/").glob("ts*.parquet") + counts = pd.Series(dtype=int) + for path in files: + # Only one dataframe is in memory at a time... + df = pd.read_parquet(path) + # ... plus a small Series `counts`, which is updated. + counts = counts.add(df['name'].value_counts(), fill_value=0) + counts.astype(int) + +Some readers, like :meth:`pandas.read_csv`, offer parameters to control the +``chunksize`` when reading a single file. + +Manually chunking is an OK option for workflows that don't +require too sophisticated of operations. Some operations, like ``groupby``, are +much harder to do chunkwise. 
In these cases, you may be better off switching to a
+different library that implements these out-of-core algorithms for you.
+
+.. _scale.other_libraries:
+
+Use other libraries
+-------------------
+
+Pandas is just one library offering a DataFrame API. Because of its popularity,
+pandas' API has become something of a standard that other libraries implement.
+The pandas documentation maintains a list of libraries implementing a DataFrame API
+in :ref:`our ecosystem page `.
+
+For example, `Dask`_, a parallel computing library, has `dask.dataframe`_, a
+pandas-like API for working with larger-than-memory datasets in parallel. Dask
+can use multiple threads or processes on a single machine, or a cluster of
+machines to process data in parallel.
+
+
+We'll import ``dask.dataframe`` and notice that the API feels similar to pandas.
+We can use Dask's ``read_parquet`` function, but provide a globstring of files to read in.
+
+.. ipython:: python
+
+    import dask.dataframe as dd
+
+    ddf = dd.read_parquet("data/timeseries/ts*.parquet", engine="pyarrow")
+    ddf
+
+Inspecting the ``ddf`` object, we see a few things:
+
+* There are familiar attributes like ``.columns`` and ``.dtypes``
+* There are familiar methods like ``.groupby``, ``.sum``, etc.
+* There are new attributes like ``.npartitions`` and ``.divisions``
+
+The partitions and divisions are how Dask parallelizes computation. A **Dask**
+DataFrame is made up of many **Pandas** DataFrames. A single method call on a
+Dask DataFrame ends up making many pandas method calls, and Dask knows how to
+coordinate everything to get the result.
+
+.. ipython:: python
+
+    ddf.columns
+    ddf.dtypes
+    ddf.npartitions
+
+One major difference: the ``dask.dataframe`` API is *lazy*. If you look at the
+repr above, you'll notice that the values aren't actually printed out; just the
+column names and dtypes. That's because Dask hasn't actually read the data yet.
+Rather than executing immediately, operations build up a **task graph**.
+
+.. ipython:: python
+
+    ddf
+    ddf['name']
+    ddf['name'].value_counts()
+
+Each of these calls is instant because the result isn't being computed yet.
+We're just building up a list of computations to do when someone needs the
+result. Dask knows that the return type of ``pandas.Series.value_counts``
+is a pandas Series with a certain dtype and a certain name. So the Dask version
+returns a Dask Series with the same dtype and the same name.
+
+To get the actual result you can call ``.compute()``.
+
+.. ipython:: python
+
+    %time ddf['name'].value_counts().compute()
+
+At that point, you get back the same thing you'd get with pandas, in this case
+a concrete pandas Series with the count of each ``name``.
+
+Calling ``.compute`` causes the full task graph to be executed. This includes
+reading the data, selecting the columns, and doing the ``value_counts``. The
+execution is done *in parallel* where possible, and Dask tries to keep the
+overall memory footprint small. You can work with datasets that are much larger
+than memory, as long as each partition (a regular pandas DataFrame) fits in memory.
+
+By default, ``dask.dataframe`` operations use a threadpool to do operations in
+parallel. We can also connect to a cluster to distribute the work on many
+machines. In this case we'll connect to a local "cluster" made up of several
+processes on this single machine.
+
+.. code-block:: python
+
+    >>> from dask.distributed import Client, LocalCluster
+
+    >>> cluster = LocalCluster()
+    >>> client = Client(cluster)
+    >>> client
+
+
+Once this ``client`` is created, all of Dask's computation will take place on
+the cluster (which is just processes in this case).
+
+Dask implements the most used parts of the pandas API. For example, we can do
+a familiar groupby aggregation.
+
+.. ipython:: python
+
+    %time ddf.groupby('name')[['x', 'y']].mean().compute().head()
+
+The grouping and aggregation are done out-of-core and in parallel.
+
+When Dask knows the ``divisions`` of a dataset, certain optimizations are
+possible. When reading parquet datasets written by Dask, the divisions will be
+known automatically. In this case, since we created the parquet files manually,
+we need to supply the divisions manually.
+
+.. ipython:: python
+
+    N = 12
+    starts = [f'20{i:>02d}-01-01' for i in range(N)]
+    ends = [f'20{i:>02d}-12-13' for i in range(N)]
+
+    divisions = tuple(pd.to_datetime(starts)) + (pd.Timestamp(ends[-1]),)
+    ddf.divisions = divisions
+    ddf
+
+Now we can do things like fast random access with ``.loc``.
+
+.. ipython:: python
+
+    ddf.loc['2002-01-01 12:01':'2002-01-01 12:05'].compute()
+
+Dask knows to just look in the 3rd partition for selecting values in `2002`. It
+doesn't need to look at any other data.
+
+Many workflows involve a large amount of data and processing it in a way that
+reduces the size to something that fits in memory. In this case, we'll resample
+to daily frequency and take the mean. Once we've taken the mean, we know the
+results will fit in memory, so we can safely call ``compute`` without running
+out of memory. At that point it's just a regular pandas object.
+
+.. ipython:: python
+
+    @savefig dask_resample.png
+    ddf[['x', 'y']].resample("1D").mean().cumsum().compute().plot()
+
+These Dask examples have all been done using multiple processes on a single
+machine. Dask can be `deployed on a cluster
+`_ to scale up to even larger
+datasets.
+
+You can see more Dask examples at https://examples.dask.org.
+
+.. _Dask: https://dask.org
+.. _dask.dataframe: https://docs.dask.org/en/latest/dataframe.html
diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst
index eb4b72d01d59a..b075a9d8b5e8b 100644
--- a/doc/source/whatsnew/v1.0.0.rst
+++ b/doc/source/whatsnew/v1.0.0.rst
@@ -111,6 +111,13 @@ Other API changes
 - :meth:`MultiIndex.from_arrays` will no longer infer names from arrays if ``names=None`` is explicitly provided (:issue:`27292`)
 -
+.. _whatsnew_1000.api.documentation:
+
+Documentation Improvements
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Added new section on :ref:`scale` (:issue:`28315`).
+
 .. 
_whatsnew_1000.deprecations: Deprecations diff --git a/environment.yml b/environment.yml index 7629fa52e7829..7c3ec9064cba3 100644 --- a/environment.yml +++ b/environment.yml @@ -35,6 +35,12 @@ dependencies: - nbconvert>=5.4.1 - nbsphinx - pandoc + # Dask and its dependencies + - dask-core + - toolz>=0.7.3 + - fsspec>=0.5.1 + - partd>=0.3.10 + - cloudpickle>=0.2.1 # web (jinja2 is also needed, but it's also an optional pandas dependency) - markdown @@ -76,7 +82,7 @@ dependencies: - html5lib # pandas.read_html - lxml # pandas.read_html - openpyxl # pandas.read_excel, DataFrame.to_excel, pandas.ExcelWriter, pandas.ExcelFile - - pyarrow>=0.9.0 # pandas.read_paquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather + - pyarrow>=0.13.1 # pandas.read_paquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf - python-snappy # required by pyarrow diff --git a/pandas/util/testing.py b/pandas/util/testing.py index aee58f808d9e6..1c0a8dbc19ccd 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1651,6 +1651,87 @@ def makeMultiIndex(k=10, names=None, **kwargs): return MultiIndex.from_product((("foo", "bar"), (1, 2)), names=names, **kwargs) +_names = [ + "Alice", + "Bob", + "Charlie", + "Dan", + "Edith", + "Frank", + "George", + "Hannah", + "Ingrid", + "Jerry", + "Kevin", + "Laura", + "Michael", + "Norbert", + "Oliver", + "Patricia", + "Quinn", + "Ray", + "Sarah", + "Tim", + "Ursula", + "Victor", + "Wendy", + "Xavier", + "Yvonne", + "Zelda", +] + + +def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None): + """ + Make a DataFrame with a DatetimeIndex + + Parameters + ---------- + start : str or Timestamp, default "2000-01-01" + The start of the index. Passed to date_range with `freq`. + end : str or Timestamp, default "2000-12-31" + The end of the index. Passed to date_range with `freq`. + freq : str or Freq + The frequency to use for the DatetimeIndex + seed : int, optional + The random state seed. + + * name : object dtype with string names + * id : int dtype with + * x, y : float dtype + + Examples + -------- + >>> _make_timeseries() + id name x y + timestamp + 2000-01-01 982 Frank 0.031261 0.986727 + 2000-01-02 1025 Edith -0.086358 -0.032920 + 2000-01-03 982 Edith 0.473177 0.298654 + 2000-01-04 1009 Sarah 0.534344 -0.750377 + 2000-01-05 963 Zelda -0.271573 0.054424 + ... ... ... ... ... + 2000-12-27 980 Ingrid -0.132333 -0.422195 + 2000-12-28 972 Frank -0.376007 -0.298687 + 2000-12-29 1009 Ursula -0.865047 -0.503133 + 2000-12-30 1000 Hannah -0.063757 -0.507336 + 2000-12-31 972 Tim -0.869120 0.531685 + """ + index = pd.date_range(start=start, end=end, freq=freq, name="timestamp") + n = len(index) + state = np.random.RandomState(seed) + columns = { + "name": state.choice(_names, size=n), + "id": state.poisson(1000, size=n), + "x": state.rand(n) * 2 - 1, + "y": state.rand(n) * 2 - 1, + } + df = pd.DataFrame(columns, index=index, columns=sorted(columns)) + if df.index[-1] == end: + df = df.iloc[:-1] + return df + + def all_index_generator(k=10): """Generator which can be iterated over to get instances of all the various index classes. 
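The ``_make_timeseries`` helper added above is the generator that the new ``scale.rst`` page uses to build its example datasets. A minimal sketch of calling it directly, assuming only the signature shown in this hunk (it is a private testing helper, so the import path is not stable API):

.. code-block:: python

    from pandas.util.testing import _make_timeseries

    # One month of minute-frequency data; a fixed seed makes the draw reproducible.
    ts = _make_timeseries(start="2000-01-01", end="2000-01-31", freq="1T", seed=1)

    ts.dtypes      # id: int64, name: object, x/y: float64
    ts.index.name  # "timestamp"
    ts.head()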
diff --git a/requirements-dev.txt b/requirements-dev.txt index fd8e6378240b4..698e4f3aea094 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,6 +17,11 @@ numpydoc>=0.9.0 nbconvert>=5.4.1 nbsphinx pandoc +dask-core +toolz>=0.7.3 +fsspec>=0.5.1 +partd>=0.3.10 +cloudpickle>=0.2.1 markdown feedparser pyyaml @@ -48,7 +53,7 @@ fastparquet>=0.2.1 html5lib lxml openpyxl -pyarrow>=0.9.0 +pyarrow>=0.13.1 pyqt5>=5.9.2 tables>=3.4.2 python-snappy diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 29fe8bf84c12b..44fe50b99560a 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -20,7 +20,7 @@ import yaml EXCLUDE = {"python=3"} -RENAME = {"pytables": "tables", "pyqt": "pyqt5"} +RENAME = {"pytables": "tables", "pyqt": "pyqt5", "dask-core": "dask"} def conda_package_to_pip(package): From c9b6f83ce6ebca0f0208d38467cc68d69b471b8a Mon Sep 17 00:00:00 2001 From: Kaiqi Dong Date: Tue, 1 Oct 2019 14:00:37 +0200 Subject: [PATCH 12/22] BUG: restore limit in RangeIndex.get_indexer (#28671) --- doc/source/whatsnew/v0.25.2.rst | 2 +- pandas/core/indexes/range.py | 6 ++++-- pandas/tests/frame/test_indexing.py | 16 ++++++++++++++++ pandas/tests/indexes/test_range.py | 8 ++++++++ 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.25.2.rst b/doc/source/whatsnew/v0.25.2.rst index 14682b706f924..f904d69d6421b 100644 --- a/doc/source/whatsnew/v0.25.2.rst +++ b/doc/source/whatsnew/v0.25.2.rst @@ -49,7 +49,7 @@ Interval Indexing ^^^^^^^^ -- +- Fix regression in :meth:`DataFrame.reindex` not following ``limit`` argument (:issue:`28631`). - - diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 8783351cc74d1..43445a0d5d5a2 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -380,8 +380,10 @@ def get_loc(self, key, method=None, tolerance=None): @Appender(_index_shared_docs["get_indexer"]) def get_indexer(self, target, method=None, limit=None, tolerance=None): - if not (method is None and tolerance is None and is_list_like(target)): - return super().get_indexer(target, method=method, tolerance=tolerance) + if com.any_not_none(method, tolerance, limit) or not is_list_like(target): + return super().get_indexer( + target, method=method, tolerance=tolerance, limit=limit + ) if self.step > 0: start, stop, step = self.start, self.stop, self.step diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 6b073c460ea08..6d239e96cd167 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -2217,6 +2217,22 @@ def test_reindex_frame_add_nat(self): assert mask[-5:].all() assert not mask[:-5].any() + def test_reindex_limit(self): + # GH 28631 + data = [["A", "A", "A"], ["B", "B", "B"], ["C", "C", "C"], ["D", "D", "D"]] + exp_data = [ + ["A", "A", "A"], + ["B", "B", "B"], + ["C", "C", "C"], + ["D", "D", "D"], + ["D", "D", "D"], + [np.nan, np.nan, np.nan], + ] + df = DataFrame(data) + result = df.reindex([0, 1, 2, 3, 4, 5], method="ffill", limit=1) + expected = DataFrame(exp_data) + tm.assert_frame_equal(result, expected) + def test_set_dataframe_column_ns_dtype(self): x = DataFrame([datetime.now(), datetime.now()]) assert x[0].dtype == np.dtype("M8[ns]") diff --git a/pandas/tests/indexes/test_range.py b/pandas/tests/indexes/test_range.py index 58b98297f00f3..7e08a5deaff7a 100644 --- a/pandas/tests/indexes/test_range.py +++ b/pandas/tests/indexes/test_range.py @@ -416,6 
+416,14 @@ def test_get_indexer_backfill(self): expected = np.array([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) + def test_get_indexer_limit(self): + # GH 28631 + idx = RangeIndex(4) + target = RangeIndex(6) + result = idx.get_indexer(target, method="pad", limit=1) + expected = np.array([0, 1, 2, 3, 3, -1], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + def test_join_outer(self): # join with Int64Index other = Int64Index(np.arange(25, 14, -1)) From d68e9fb4563b54cbdad135edb2b484b30e2d807d Mon Sep 17 00:00:00 2001 From: Victoria Zdanovskaya Date: Tue, 1 Oct 2019 13:12:04 +0100 Subject: [PATCH 13/22] DOC: Fixed PR09 docstring errors in pandas.tseries (#27977) (#28707) --- pandas/tseries/offsets.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 4ebb4f353a8fd..81d8869dd7ba0 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -204,7 +204,8 @@ def __add__(date): normalize : bool, default False Whether to round the result of a DateOffset addition down to the previous midnight. - **kwds : Temporal parameter that add to or replace the offset value. + **kwds + Temporal parameter that add to or replace the offset value. Parameters that **add** to the offset (like Timedelta): @@ -1005,12 +1006,12 @@ class CustomBusinessDay(_CustomMixin, BusinessDay): ---------- n : int, default 1 normalize : bool, default False - Normalize start/end dates to midnight before generating date range + Normalize start/end dates to midnight before generating date range. weekmask : str, Default 'Mon Tue Wed Thu Fri' - Weekmask of valid business days, passed to ``numpy.busdaycalendar`` + Weekmask of valid business days, passed to ``numpy.busdaycalendar``. holidays : list List/array of dates to exclude from the set of valid business days, - passed to ``numpy.busdaycalendar`` + passed to ``numpy.busdaycalendar``. calendar : pd.HolidayCalendar or np.busdaycalendar offset : timedelta, default timedelta(0) """ @@ -1519,7 +1520,7 @@ class Week(DateOffset): Parameters ---------- weekday : int, default None - Always generate specific day of week. 0 for Monday + Always generate specific day of week. 0 for Monday. """ _adjust_dst = True @@ -2085,7 +2086,9 @@ class FY5253(DateOffset): The month in which the fiscal year ends. variation : str, default "nearest" - Method of employing 4-4-5 calendar. There are two options: + Method of employing 4-4-5 calendar. + + There are two options: - "nearest" means year end is **weekday** closest to last day of month in year. - "last" means year end is final **weekday** of the final month in fiscal year. @@ -2304,7 +2307,9 @@ class FY5253Quarter(DateOffset): The quarter number that has the leap or 14 week when needed. variation : str, default "nearest" - Method of employing 4-4-5 calendar. There are two options: + Method of employing 4-4-5 calendar. + + There are two options: - "nearest" means year end is **weekday** closest to last day of month in year. - "last" means year end is final **weekday** of the final month in fiscal year. 
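To make the two ``variation`` options concrete, a short sketch of the behavior the docstrings above describe; the offset comes from ``pandas.tseries.offsets``, and the comments restate the documented semantics rather than asserted output:

.. code-block:: python

    import pandas as pd
    from pandas.tseries.offsets import FY5253

    # Fiscal years ending on a Saturday (weekday=5) in August (startingMonth=8).
    nearest = FY5253(weekday=5, startingMonth=8, variation="nearest")
    last = FY5253(weekday=5, startingMonth=8, variation="last")

    ts = pd.Timestamp("2019-01-01")
    ts + nearest  # rolls forward to the Saturday closest to the last day of August
    ts + last     # rolls forward to the final Saturday in August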
From 2ad37043a83c1ef4afe821a0a4e324b463413361 Mon Sep 17 00:00:00 2001 From: Victoria Zdanovskaya Date: Tue, 1 Oct 2019 13:49:34 +0100 Subject: [PATCH 14/22] CI Failing: TestReadHtml.test_spam_url #28708 (#28710) --- pandas/tests/io/test_html.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 183d217eb09d6..1045b72f0aa6e 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -135,8 +135,8 @@ def test_banklist_url(self): @network def test_spam_url(self): url = ( - "http://ndb.nal.usda.gov/ndb/foods/show/300772?fg=&man=&" - "lfacet=&format=&count=&max=25&offset=&sort=&qlookup=spam" + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" + "pandas/tests/io/data/spam.html" ) df1 = self.read_html(url, ".*Water.*") df2 = self.read_html(url, "Unit") From 5ec718a880825d8900394fc09b5f0463ab10e430 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Oct 2019 06:06:03 -0700 Subject: [PATCH 15/22] TST: un-xfail incorrectly xfailed tests for maybe_promote (#28564) --- pandas/core/dtypes/cast.py | 28 +++++++- pandas/tests/dtypes/cast/test_promote.py | 85 +++++++----------------- 2 files changed, 50 insertions(+), 63 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b59660056aadb..a3ad84ff89a66 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -358,6 +358,7 @@ def maybe_promote(dtype, fill_value=np.nan): fill_value = NaT elif is_extension_array_dtype(dtype) and isna(fill_value): fill_value = dtype.na_value + elif is_float(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.object_ @@ -366,6 +367,8 @@ def maybe_promote(dtype, fill_value=np.nan): elif is_bool(fill_value): if not issubclass(dtype.type, np.bool_): dtype = np.object_ + else: + fill_value = np.bool_(fill_value) elif is_integer(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.object_ @@ -374,6 +377,10 @@ def maybe_promote(dtype, fill_value=np.nan): arr = np.asarray(fill_value) if arr != arr.astype(dtype): dtype = arr.dtype + elif issubclass(dtype.type, np.floating): + # check if we can cast + if _check_lossless_cast(fill_value, dtype): + fill_value = dtype.type(fill_value) elif is_complex(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.object_ @@ -398,12 +405,31 @@ def maybe_promote(dtype, fill_value=np.nan): pass elif is_datetime64tz_dtype(dtype): pass - elif issubclass(np.dtype(dtype).type, str): + elif issubclass(np.dtype(dtype).type, (bytes, str)): dtype = np.object_ return dtype, fill_value +def _check_lossless_cast(value, dtype: np.dtype) -> bool: + """ + Check if we can cast the given value to the given dtype _losslesly_. + + Parameters + ---------- + value : object + dtype : np.dtype + + Returns + ------- + bool + """ + casted = dtype.type(value) + if casted == value: + return True + return False + + def infer_dtype_from(val, pandas_dtype=False): """ interpret the dtype from a scalar or array. 
This is a convenience diff --git a/pandas/tests/dtypes/cast/test_promote.py b/pandas/tests/dtypes/cast/test_promote.py index 44aebd4d277f2..211c550100018 100644 --- a/pandas/tests/dtypes/cast/test_promote.py +++ b/pandas/tests/dtypes/cast/test_promote.py @@ -23,6 +23,7 @@ is_timedelta64_dtype, ) from pandas.core.dtypes.dtypes import DatetimeTZDtype, PandasExtensionDtype +from pandas.core.dtypes.missing import isna import pandas as pd @@ -95,6 +96,7 @@ def _safe_dtype_assert(left_dtype, right_dtype): """ Compare two dtypes without raising TypeError. """ + __tracebackhide__ = True if isinstance(right_dtype, PandasExtensionDtype): # switch order of equality check because numpy dtypes (e.g. if # left_dtype is np.object_) do not know some expected dtypes (e.g. @@ -157,20 +159,17 @@ def _check_promote( _safe_dtype_assert(result_dtype, expected_dtype) - # for equal values, also check type (relevant e.g. for int vs float, resp. - # for different datetimes and timedeltas) - match_value = ( - result_fill_value - == expected_fill_value - # disabled type check due to too many xfails; GH 23982/25425 - # and type(result_fill_value) == type(expected_fill_value) - ) + # GH#23982/25425 require the same type in addition to equality/NA-ness + res_type = type(result_fill_value) + ex_type = type(expected_fill_value) + assert res_type == ex_type + + match_value = result_fill_value == expected_fill_value + # Note: type check above ensures that we have the _same_ NA value # for missing values, None == None and iNaT == iNaT (which is checked # through match_value above), but np.nan != np.nan and pd.NaT != pd.NaT - match_missing = (result_fill_value is np.nan and expected_fill_value is np.nan) or ( - result_fill_value is NaT and expected_fill_value is NaT - ) + match_missing = isna(result_fill_value) and isna(expected_fill_value) assert match_value or match_missing @@ -251,7 +250,9 @@ def test_maybe_promote_bool_with_any(any_numpy_dtype_reduced, box): if boxed and fill_dtype == bool: pytest.xfail("falsely upcasts to object") - if boxed and box_dtype is None and is_datetime_or_timedelta_dtype(fill_dtype): + if boxed and box_dtype is None and fill_dtype.kind == "M": + pytest.xfail("wrongly casts fill_value") + if boxed and box_dtype is None and fill_dtype.kind == "m": pytest.xfail("wrongly casts fill_value") # create array of given dtype; casts "1" to correct dtype @@ -282,7 +283,9 @@ def test_maybe_promote_any_with_bool(any_numpy_dtype_reduced, box): pytest.xfail("falsely upcasts to object") if boxed and dtype not in (str, object) and box_dtype is None: pytest.xfail("falsely upcasts to object") - if not boxed and is_datetime_or_timedelta_dtype(dtype): + if not boxed and dtype.kind == "M": + pytest.xfail("raises error") + if not boxed and dtype.kind == "m": pytest.xfail("raises error") # filling anything but bool with bool casts to object @@ -393,9 +396,6 @@ def test_maybe_promote_datetimetz_with_any_numpy_dtype( fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if box_dtype != object: - pytest.xfail("does not upcast correctly") - # create array of given dtype; casts "1" to correct dtype fill_value = np.array([1], dtype=fill_dtype)[0] @@ -430,8 +430,6 @@ def test_maybe_promote_datetimetz_with_datetimetz( pytest.xfail("Cannot process fill_value with this dtype, see GH 24310") if dtype.tz == fill_dtype.tz and boxed: pytest.xfail("falsely upcasts") - if dtype.tz != fill_dtype.tz and not boxed: - pytest.xfail("falsely upcasts") # create array of given dtype; casts 
"1" to correct dtype fill_value = pd.Series([10 ** 9], dtype=fill_dtype)[0] @@ -466,14 +464,10 @@ def test_maybe_promote_datetimetz_with_na(tz_aware_fixture, fill_value, box): dtype = DatetimeTZDtype(tz=tz_aware_fixture) boxed, box_dtype = box # read from parametrized fixture - if boxed and ( - box_dtype == object - or (box_dtype is None and (fill_value is None or fill_value is NaT)) - ): - pytest.xfail("false upcasts to object") # takes the opinion that DatetimeTZ should have single na-marker # using iNaT would lead to errors elsewhere -> NaT if not boxed and fill_value == iNaT: + # TODO: are we sure iNaT _should_ be cast to NaT? pytest.xfail("wrong missing value marker") expected_dtype = dtype @@ -509,8 +503,10 @@ def test_maybe_promote_any_numpy_dtype_with_datetimetz( fill_dtype = DatetimeTZDtype(tz=tz_aware_fixture) boxed, box_dtype = box # read from parametrized fixture - if is_datetime_or_timedelta_dtype(dtype) and not boxed: + if dtype.kind == "m" and not boxed: pytest.xfail("raises error") + elif dtype.kind == "M" and not boxed: + pytest.xfail("Comes back as M8 instead of object") fill_value = pd.Series([fill_value], dtype=fill_dtype)[0] @@ -566,19 +562,6 @@ def test_maybe_promote_any_with_timedelta64( else: if boxed and box_dtype is None and is_timedelta64_dtype(type(fill_value)): pytest.xfail("does not upcast correctly") - if ( - not boxed - and is_timedelta64_dtype(type(fill_value)) - and ( - is_integer_dtype(dtype) - or is_float_dtype(dtype) - or is_complex_dtype(dtype) - or issubclass(dtype.type, np.bytes_) - ) - ): - pytest.xfail("does not upcast correctly") - if box_dtype == "td_dtype": - pytest.xfail("falsely upcasts") if not boxed and is_datetime64_dtype(dtype): pytest.xfail("raises error") @@ -612,7 +595,9 @@ def test_maybe_promote_string_with_any(string_dtype, any_numpy_dtype_reduced, bo fill_dtype = np.dtype(any_numpy_dtype_reduced) boxed, box_dtype = box # read from parametrized fixture - if boxed and box_dtype is None and is_datetime_or_timedelta_dtype(fill_dtype): + if boxed and box_dtype is None and fill_dtype.kind == "m": + pytest.xfail("wrong missing value marker") + if boxed and box_dtype is None and fill_dtype.kind == "M": pytest.xfail("wrong missing value marker") # create array of given dtype; casts "1" to correct dtype @@ -652,17 +637,6 @@ def test_maybe_promote_any_with_string(any_numpy_dtype_reduced, string_dtype, bo if is_datetime_or_timedelta_dtype(dtype) and box_dtype != object: pytest.xfail("does not upcast or raises") - if ( - boxed - and box_dtype in (None, "str") - and ( - is_integer_dtype(dtype) - or is_float_dtype(dtype) - or is_complex_dtype(dtype) - or issubclass(dtype.type, np.bytes_) - ) - ): - pytest.xfail("does not upcast correctly") # create array of given dtype fill_value = "abc" @@ -760,19 +734,6 @@ def test_maybe_promote_any_numpy_dtype_with_na( pytest.xfail("does not upcast to object") elif dtype == "uint64" and not boxed and fill_value == iNaT: pytest.xfail("does not upcast correctly") - elif is_datetime_or_timedelta_dtype(dtype) and boxed: - pytest.xfail("falsely upcasts to object") - elif ( - boxed - and ( - is_integer_dtype(dtype) or is_float_dtype(dtype) or is_complex_dtype(dtype) - ) - and fill_value is not NaT - and dtype != "uint64" - ): - pytest.xfail("falsely upcasts to object") - elif boxed and dtype == "uint64" and (fill_value is np.nan or fill_value is None): - pytest.xfail("falsely upcasts to object") # below: opinionated that iNaT should be interpreted as missing value elif ( not boxed From 
dc6780475dc3dd613e14a09920f77d68f5f650e0 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Tue, 1 Oct 2019 06:07:52 -0700 Subject: [PATCH 16/22] CLN: Define and pin GroupBy properties without exec (#28651) --- ci/code_checks.sh | 6 ++- pandas/core/groupby/generic.py | 84 +++++++++++++++++++--------------- pandas/core/groupby/groupby.py | 2 - 3 files changed, 51 insertions(+), 41 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b03c4f2238445..e13738b98833a 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -125,6 +125,10 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then # invgrep -R --include="*.py*" -E "from numpy import nan " pandas # GH#24822 not yet implemented since the offending imports have not all been removed RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for use of exec' ; echo $MSG + invgrep -R --include="*.py*" -E "[^a-zA-Z0-9_]exec\(" pandas + RET=$(($RET + $?)) ; echo $MSG "DONE" + MSG='Check for pytest warns' ; echo $MSG invgrep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -184,7 +188,7 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then invgrep -R --include="*.rst" ".. ipython ::" doc/source RET=$(($RET + $?)) ; echo $MSG "DONE" - MSG='Check that no file in the repo contains tailing whitespaces' ; echo $MSG + MSG='Check that no file in the repo contains trailing whitespaces' ; echo $MSG set -o pipefail if [[ "$AZURE" == "true" ]]; then # we exclude all c/cpp files as the c/cpp files of pandas code base are tested when Linting .c and .h files diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f8f1455561c03..0ab19448043f6 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -11,7 +11,7 @@ from functools import partial from textwrap import dedent import typing -from typing import Any, Callable, FrozenSet, Iterator, Sequence, Type, Union +from typing import Any, Callable, FrozenSet, Sequence, Type, Union import warnings import numpy as np @@ -70,47 +70,63 @@ ScalarResult = typing.TypeVar("ScalarResult") -def whitelist_method_generator( - base_class: Type[GroupBy], klass: Type[FrameOrSeries], whitelist: FrozenSet[str] -) -> Iterator[str]: +def generate_property(name: str, klass: Type[FrameOrSeries]): """ - Yields all GroupBy member defs for DataFrame/Series names in whitelist. + Create a property for a GroupBy subclass to dispatch to DataFrame/Series. + + Parameters + ---------- + name : str + klass : {DataFrame, Series} + + Returns + ------- + property + """ + + def prop(self): + return self._make_wrapper(name) + + parent_method = getattr(klass, name) + prop.__doc__ = parent_method.__doc__ or "" + prop.__name__ = name + return property(prop) + + +def pin_whitelisted_properties(klass: Type[FrameOrSeries], whitelist: FrozenSet[str]): + """ + Create GroupBy member defs for DataFrame/Series names in a whitelist. Parameters ---------- - base_class : Groupby class - base class klass : DataFrame or Series class class where members are defined. - whitelist : frozenset + whitelist : frozenset[str] Set of names of klass methods to be constructed Returns ------- - The generator yields a sequence of strings, each suitable for exec'ing, - that define implementations of the named methods for DataFrameGroupBy - or SeriesGroupBy. + class decorator + Notes + ----- Since we don't want to override methods explicitly defined in the base class, any such name is skipped. 
""" - property_wrapper_template = """@property -def %(name)s(self) : - \"""%(doc)s\""" - return self.__getattr__('%(name)s')""" - - for name in whitelist: - # don't override anything that was explicitly defined - # in the base class - if hasattr(base_class, name): - continue - # ugly, but we need the name string itself in the method. - f = getattr(klass, name) - doc = f.__doc__ - doc = doc if type(doc) == str else "" - wrapper_template = property_wrapper_template - params = {"name": name, "doc": doc} - yield wrapper_template % params + + def pinner(cls): + for name in whitelist: + if hasattr(cls, name): + # don't override anything that was explicitly defined + # in the base class + continue + + prop = generate_property(name, klass) + setattr(cls, name, prop) + + return cls + + return pinner class NDFrameGroupBy(GroupBy): @@ -747,13 +763,9 @@ def filter(self, func, dropna=True, *args, **kwargs): return self._apply_filter(indices, dropna) +@pin_whitelisted_properties(Series, base.series_apply_whitelist) class SeriesGroupBy(GroupBy): - # - # Make class defs of attributes on SeriesGroupBy whitelist - _apply_whitelist = base.series_apply_whitelist - for _def_str in whitelist_method_generator(GroupBy, Series, _apply_whitelist): - exec(_def_str) @property def _selection_name(self): @@ -1368,15 +1380,11 @@ def pct_change(self, periods=1, fill_method="pad", limit=None, freq=None): return (filled / shifted) - 1 +@pin_whitelisted_properties(DataFrame, base.dataframe_apply_whitelist) class DataFrameGroupBy(NDFrameGroupBy): _apply_whitelist = base.dataframe_apply_whitelist - # - # Make class defs of attributes on DataFrameGroupBy whitelist. - for _def_str in whitelist_method_generator(GroupBy, DataFrame, _apply_whitelist): - exec(_def_str) - _block_agg_axis = 1 _agg_see_also_doc = dedent( diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6facbe7e01c57..984954fe14bb5 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -562,8 +562,6 @@ def __getattr__(self, attr): return object.__getattribute__(self, attr) if attr in self.obj: return self[attr] - if hasattr(self.obj, attr): - return self._make_wrapper(attr) raise AttributeError( "%r object has no attribute %r" % (type(self).__name__, attr) From c794bbfdd47e43adc06af5a6de7b3de211682b46 Mon Sep 17 00:00:00 2001 From: Oluokun Adedayo Date: Tue, 1 Oct 2019 15:55:26 +0100 Subject: [PATCH 17/22] DOC: Fixed PR08, PR09 doctring issues in pandas.core.groupby (#28709) --- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 36 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0ab19448043f6..b5aec189700ce 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -709,7 +709,7 @@ def filter(self, func, dropna=True, *args, **kwargs): f : function Function to apply to each subframe. Should return True or False. dropna : Drop groups that do not pass the filter. True by default; - if False, groups that evaluate False are filled with NaNs. + If False, groups that evaluate False are filled with NaNs. Returns ------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 984954fe14bb5..e93ce3ce93164 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -212,9 +212,9 @@ class providing the base-class of operations. string indicating the keyword of `callable` that expects the %(klass)s object. 
args : iterable, optional - positional arguments passed into `func`. + Positional arguments passed into `func`. kwargs : dict, optional - a dictionary of keyword arguments passed into `func`. + A dictionary of keyword arguments passed into `func`. Returns ------- @@ -664,11 +664,11 @@ def get_group(self, name, obj=None): Parameters ---------- name : object - the name of the group to get as a DataFrame + The name of the group to get as a DataFrame. obj : DataFrame, default None - the DataFrame to take the DataFrame out of. If + The DataFrame to take the DataFrame out of. If it is None, the object groupby was called on will - be used + be used. Returns ------- @@ -1114,7 +1114,7 @@ def any(self, skipna=True): Parameters ---------- skipna : bool, default True - Flag to ignore nan values during truth testing + Flag to ignore nan values during truth testing. Returns ------- @@ -1131,7 +1131,7 @@ def all(self, skipna=True): Parameters ---------- skipna : bool, default True - Flag to ignore nan values during truth testing + Flag to ignore nan values during truth testing. Returns ------- @@ -1252,7 +1252,7 @@ def std(self, ddof=1, *args, **kwargs): Parameters ---------- ddof : int, default 1 - degrees of freedom + Degrees of freedom. Returns ------- @@ -1275,7 +1275,7 @@ def var(self, ddof=1, *args, **kwargs): Parameters ---------- ddof : int, default 1 - degrees of freedom + Degrees of freedom. Returns ------- @@ -1310,7 +1310,7 @@ def sem(self, ddof=1): Parameters ---------- ddof : int, default 1 - degrees of freedom + Degrees of freedom. Returns ------- @@ -1622,7 +1622,7 @@ def pad(self, limit=None): Parameters ---------- limit : int, optional - limit of how many values to fill + Limit of how many values to fill. Returns ------- @@ -1648,7 +1648,7 @@ def backfill(self, limit=None): Parameters ---------- limit : int, optional - limit of how many values to fill + Limit of how many values to fill. Returns ------- @@ -1680,10 +1680,10 @@ def nth(self, n: Union[int, List[int]], dropna: Optional[str] = None) -> DataFra Parameters ---------- n : int or list of ints - a single nth value for the row or a list of nth values + A single nth value for the row or a list of nth values. dropna : None or str, optional - apply the specified dropna operation before counting which row is - the nth row. Needs to be None, 'any' or 'all' + Apply the specified dropna operation before counting which row is + the nth row. Needs to be None, 'any' or 'all'. Returns ------- @@ -2098,13 +2098,13 @@ def rank( * first: ranks assigned in order they appear in the array * dense: like 'min', but rank always increases by 1 between groups ascending : bool, default True - False for ranks by high (1) to low (N) + False for ranks by high (1) to low (N). na_option : {'keep', 'top', 'bottom'}, default 'keep' * keep: leave NA values where they are * top: smallest rank if ascending * bottom: smallest rank if descending pct : bool, default False - Compute percentage rank of data within each group + Compute percentage rank of data within each group. axis : int, default 0 The axis of the object over which to compute the rank. @@ -2312,7 +2312,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): Parameters ---------- periods : int, default 1 - number of periods to shift + Number of periods to shift. 
freq : frequency string axis : axis to shift, default 0 fill_value : optional From 4e371c16ae1ff217a0f35a3527fdc02503e81881 Mon Sep 17 00:00:00 2001 From: "Laura Collard, PhD" <35954013+LauraCollard@users.noreply.github.com> Date: Tue, 1 Oct 2019 16:43:52 +0100 Subject: [PATCH 18/22] DOC: Fixed PR06 docstring errors in pandas.DataFrame (#28718) --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 16f34fee5e1ff..9467978f13d30 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -7948,7 +7948,7 @@ def idxmin(self, axis=0, skipna=True): ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise - skipna : boolean, default True + skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. @@ -7985,7 +7985,7 @@ def idxmax(self, axis=0, skipna=True): ---------- axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise - skipna : boolean, default True + skipna : bool, default True Exclude NA/null values. If an entire row/column is NA, the result will be NA. From 21f6505c931c715bfceee790458f5d6b1fceb5f0 Mon Sep 17 00:00:00 2001 From: "Laura Collard, PhD" <35954013+LauraCollard@users.noreply.github.com> Date: Tue, 1 Oct 2019 16:50:40 +0100 Subject: [PATCH 19/22] DOC: Fixed PR06 docstrings errors in pandas.arrays.IntervalArray (#28721) --- pandas/core/arrays/interval.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 1f4b76a259f00..6dd0b116b3b0d 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -259,7 +259,7 @@ def _from_factorized(cls, values, original): closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or neither. - copy : boolean, default False + copy : bool, default False copy the data dtype : dtype or None, default None If None, dtype will be inferred @@ -315,7 +315,7 @@ def from_breaks(cls, breaks, closed="right", copy=False, dtype=None): closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or neither. - copy : boolean, default False + copy : bool, default False Copy the data. dtype : dtype, optional If None, dtype will be inferred. @@ -387,7 +387,7 @@ def from_arrays(cls, left, right, closed="right", copy=False, dtype=None): closed : {'left', 'right', 'both', 'neither'}, default 'right' Whether the intervals are closed on the left-side, right-side, both or neither. - copy : boolean, default False + copy : bool, default False by-default copy the data, this is compat only and ignored dtype : dtype or None, default None If None, dtype will be inferred @@ -811,7 +811,7 @@ def value_counts(self, dropna=True): Parameters ---------- - dropna : boolean, default True + dropna : bool, default True Don't include counts of NaN. 
Returns From 7e05957c79bcf782db0ee5ca09296e2a833499a3 Mon Sep 17 00:00:00 2001 From: Josiah Baker Date: Tue, 1 Oct 2019 11:50:51 -0400 Subject: [PATCH 20/22] DOC: fix PR09,PR08 docstring errors in pandas.plotting (#28689) --- pandas/plotting/_misc.py | 91 +++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 47 deletions(-) diff --git a/pandas/plotting/_misc.py b/pandas/plotting/_misc.py index a8e86d9dfa997..74ce60c6116a9 100644 --- a/pandas/plotting/_misc.py +++ b/pandas/plotting/_misc.py @@ -14,9 +14,9 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs): ---------- ax : Matplotlib axes object data : DataFrame or Series - data for table contents - kwargs : keywords, optional - keyword arguments which passed to matplotlib.table.table. + Data for table contents. + **kwargs + Keyword arguments to be passed to matplotlib.table.table. If `rowLabels` or `colLabels` is not specified, data index or column name will be used. @@ -82,7 +82,7 @@ def scatter_matrix( density_kwds=None, hist_kwds=None, range_padding=0.05, - **kwds + **kwargs ): """ Draw a matrix of scatter plots. @@ -91,28 +91,26 @@ def scatter_matrix( ---------- frame : DataFrame alpha : float, optional - amount of transparency applied + Amount of transparency applied. figsize : (float,float), optional - a tuple (width, height) in inches + A tuple (width, height) in inches. ax : Matplotlib axis object, optional grid : bool, optional - setting this to True will show the grid + Setting this to True will show the grid. diagonal : {'hist', 'kde'} - pick between 'kde' and 'hist' for - either Kernel Density Estimation or Histogram - plot in the diagonal + Pick between 'kde' and 'hist' for either Kernel Density Estimation or + Histogram plot in the diagonal. marker : str, optional - Matplotlib marker type, default '.' - hist_kwds : other plotting keyword arguments - To be passed to hist function - density_kwds : other plotting keyword arguments - To be passed to kernel density estimate plot - range_padding : float, optional - relative extension of axis range in x and y - with respect to (x_max - x_min) or (y_max - y_min), - default 0.05 - kwds : other plotting keyword arguments - To be passed to scatter function + Matplotlib marker type, default '.'. + density_kwds : keywords + Keyword arguments to be passed to kernel density estimate plot. + hist_kwds : keywords + Keyword arguments to be passed to hist function. + range_padding : float, default 0.05 + Relative extension of axis range in x and y with respect to + (x_max - x_min) or (y_max - y_min). + **kwargs + Keyword arguments to be passed to scatter function. Returns ------- @@ -136,7 +134,7 @@ def scatter_matrix( density_kwds=density_kwds, hist_kwds=hist_kwds, range_padding=range_padding, - **kwds + **kwargs ) @@ -215,7 +213,7 @@ def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds): @deprecate_kwarg(old_arg_name="data", new_arg_name="frame") def andrews_curves( - frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds + frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwargs ): """ Generate a matplotlib plot of Andrews curves, for visualising clusters of @@ -233,17 +231,17 @@ def andrews_curves( Parameters ---------- frame : DataFrame - Data to be plotted, preferably normalized to (0.0, 1.0) + Data to be plotted, preferably normalized to (0.0, 1.0). 
class_column : Name of the column containing class names ax : matplotlib axes object, default None samples : Number of points to plot in each curve color : list or tuple, optional - Colors to use for the different classes + Colors to use for the different classes. colormap : str or matplotlib colormap object, default None Colormap to select colors from. If string, load colormap with that name from matplotlib. - kwds : keywords - Options to pass to matplotlib plotting method + **kwargs + Options to pass to matplotlib plotting method. Returns ------- @@ -257,7 +255,7 @@ def andrews_curves( samples=samples, color=color, colormap=colormap, - **kwds + **kwargs ) @@ -327,7 +325,7 @@ def parallel_coordinates( axvlines=True, axvlines_kwds=None, sort_labels=False, - **kwds + **kwargs ): """ Parallel coordinates plotting. @@ -336,30 +334,29 @@ def parallel_coordinates( ---------- frame : DataFrame class_column : str - Column name containing class names + Column name containing class names. cols : list, optional - A list of column names to use + A list of column names to use. ax : matplotlib.axis, optional - matplotlib axis object + Matplotlib axis object. color : list or tuple, optional - Colors to use for the different classes + Colors to use for the different classes. use_columns : bool, optional - If true, columns will be used as xticks + If true, columns will be used as xticks. xticks : list or tuple, optional - A list of values to use for xticks + A list of values to use for xticks. colormap : str or matplotlib colormap, default None Colormap to use for line colors. axvlines : bool, optional - If true, vertical lines will be added at each xtick + If true, vertical lines will be added at each xtick. axvlines_kwds : keywords, optional - Options to be passed to axvline method for vertical lines - sort_labels : bool, False - Sort class_column labels, useful when assigning colors + Options to be passed to axvline method for vertical lines. + sort_labels : bool, default False + Sort class_column labels, useful when assigning colors. .. versionadded:: 0.20.0 - - kwds : keywords - Options to pass to matplotlib plotting method + **kwargs + Options to pass to matplotlib plotting method. Returns ------- @@ -388,7 +385,7 @@ def parallel_coordinates( axvlines=axvlines, axvlines_kwds=axvlines_kwds, sort_labels=sort_labels, - **kwds + **kwargs ) @@ -411,7 +408,7 @@ def lag_plot(series, lag=1, ax=None, **kwds): return plot_backend.lag_plot(series=series, lag=lag, ax=ax, **kwds) -def autocorrelation_plot(series, ax=None, **kwds): +def autocorrelation_plot(series, ax=None, **kwargs): """ Autocorrelation plot for time series. @@ -419,15 +416,15 @@ def autocorrelation_plot(series, ax=None, **kwds): ---------- series : Time series ax : Matplotlib axis object, optional - kwds : keywords - Options to pass to matplotlib plotting method + **kwargs + Options to pass to matplotlib plotting method. 
Returns ------- class:`matplotlib.axis.Axes` """ plot_backend = _get_plot_backend("matplotlib") - return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwds) + return plot_backend.autocorrelation_plot(series=series, ax=ax, **kwargs) def tsplot(series, plotf, ax=None, **kwargs): From dd028fd106f1480a79f942033f6bb1bc4d9cfd35 Mon Sep 17 00:00:00 2001 From: Tola A <33249563+tolaa001@users.noreply.github.com> Date: Tue, 1 Oct 2019 16:51:04 +0100 Subject: [PATCH 21/22] precursor to Split out test_pytables.py to sub-module of tests (#28715) --- pandas/tests/io/pytables/test_pytables.py | 859 +++++++++++----------- 1 file changed, 436 insertions(+), 423 deletions(-) diff --git a/pandas/tests/io/pytables/test_pytables.py b/pandas/tests/io/pytables/test_pytables.py index ae604b1141204..46d8ef04dd8e5 100644 --- a/pandas/tests/io/pytables/test_pytables.py +++ b/pandas/tests/io/pytables/test_pytables.py @@ -51,6 +51,19 @@ tables = pytest.importorskip("tables") +@pytest.fixture +def setup_path(): + """Fixture for setup path""" + return "tmp.__{}__.h5".format(tm.rands(10)) + + +@pytest.fixture(scope="class", autouse=True) +def setup_mode(): + tm.reset_testing_mode() + yield + tm.set_testing_mode() + + # TODO: # remove when gh-24839 is fixed; this affects numpy 1.16 # and pytables 3.4.4 @@ -148,36 +161,16 @@ def _maybe_remove(store, key): pass -class Base: - @classmethod - def setup_class(cls): - - # Pytables 3.0.0 deprecates lots of things - tm.reset_testing_mode() - - @classmethod - def teardown_class(cls): - - # Pytables 3.0.0 deprecates lots of things - tm.set_testing_mode() - - def setup_method(self, method): - self.path = "tmp.__{}__.h5".format(tm.rands(10)) - - def teardown_method(self, method): - pass - - @pytest.mark.single -class TestHDFStore(Base): - def test_format_kwarg_in_constructor(self): +class TestHDFStore: + def test_format_kwarg_in_constructor(self, setup_path): # GH 13291 - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: with pytest.raises(ValueError): HDFStore(path, format="table") - def test_context(self): - path = create_tempfile(self.path) + def test_context(self, setup_path): + path = create_tempfile(setup_path) try: with HDFStore(path) as tbl: raise ValueError("blah") @@ -196,8 +189,8 @@ def test_context(self): finally: safe_remove(path) - def test_conv_read_write(self): - path = create_tempfile(self.path) + def test_conv_read_write(self, setup_path): + path = create_tempfile(setup_path) try: def roundtrip(key, obj, **kwargs): @@ -222,24 +215,24 @@ def roundtrip(key, obj, **kwargs): finally: safe_remove(path) - def test_long_strings(self): + def test_long_strings(self, setup_path): # GH6166 df = DataFrame( {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10) ) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.append("df", df, data_columns=["a"]) result = store.select("df") assert_frame_equal(df, result) - def test_api(self): + def test_api(self, setup_path): # GH4584 # API issue when to_hdf doesn't accept append AND format args - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() df.iloc[:10].to_hdf(path, "df", append=True, format="table") @@ -251,7 +244,7 @@ def test_api(self): df.iloc[10:].to_hdf(path, "df", append=True, format="table") assert_frame_equal(read_hdf(path, "df"), df) - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() 
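            # [editor's note] The refactor in this patch replaces the
            # unittest-style ``Base`` class (a temp filename stashed on
            # ``self.path`` in ``setup_method``) with plain pytest fixtures
            # that each test requests by argument name.  A minimal sketch of
            # the same pattern, assuming a hypothetical ``tmp_h5`` fixture
            # name (not from the patch):
            #
            #   import pandas as pd
            #   import pandas.util.testing as tm
            #   import pytest
            #
            #   @pytest.fixture
            #   def tmp_h5():
            #       # every test that asks for tmp_h5 gets a fresh filename
            #       return "tmp.__{}__.h5".format(tm.rands(10))
            #
            #   def test_roundtrip(tmp_h5):
            #       # pytest injects the fixture's return value as the argument
            #       df = tm.makeDataFrame()
            #       df.to_hdf(tmp_h5, "df")
            #       tm.assert_frame_equal(pd.read_hdf(tmp_h5, "df"), df)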
df.iloc[:10].to_hdf(path, "df", append=True) @@ -263,7 +256,7 @@ def test_api(self): df.iloc[10:].to_hdf(path, "df", append=True) assert_frame_equal(read_hdf(path, "df"), df) - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() df.to_hdf(path, "df", append=False, format="fixed") @@ -278,7 +271,7 @@ def test_api(self): df.to_hdf(path, "df") assert_frame_equal(read_hdf(path, "df"), df) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: path = store._path df = tm.makeDataFrame() @@ -305,7 +298,7 @@ def test_api(self): store.append("df", df.iloc[10:], append=True, format=None) assert_frame_equal(store.select("df"), df) - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: # Invalid. df = tm.makeDataFrame() @@ -326,10 +319,10 @@ def test_api(self): with pytest.raises(FileNotFoundError): read_hdf(path, "df") - def test_api_default_format(self): + def test_api_default_format(self, setup_path): # default_format option - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() pd.set_option("io.hdf.default_format", "fixed") @@ -349,7 +342,7 @@ def test_api_default_format(self): pd.set_option("io.hdf.default_format", None) - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() @@ -370,9 +363,9 @@ def test_api_default_format(self): pd.set_option("io.hdf.default_format", None) - def test_keys(self): + def test_keys(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() store["b"] = tm.makeStringSeries() store["c"] = tm.makeDataFrame() @@ -382,12 +375,12 @@ def test_keys(self): assert set(store.keys()) == expected assert set(store) == expected - def test_keys_ignore_hdf_softlink(self): + def test_keys_ignore_hdf_softlink(self, setup_path): # GH 20523 # Puts a softlink into HDF file and rereads - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = DataFrame(dict(A=range(5), B=range(5))) store.put("df", df) @@ -399,15 +392,15 @@ def test_keys_ignore_hdf_softlink(self): # Should ignore the softlink assert store.keys() == ["/df"] - def test_iter_empty(self): + def test_iter_empty(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # GH 12221 assert list(store) == [] - def test_repr(self): + def test_repr(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: repr(store) store.info() store["a"] = tm.makeTimeSeries() @@ -441,7 +434,7 @@ def test_repr(self): store.info() # storers - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() store.append("df", df) @@ -451,9 +444,9 @@ def test_repr(self): str(s) @ignore_natural_naming_warning - def test_contains(self): + def test_contains(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() store["b"] = tm.makeDataFrame() store["foo/bar"] = tm.makeDataFrame() @@ -470,9 +463,9 @@ def test_contains(self): store["node())"] = tm.makeDataFrame() assert "node())" in store - def test_versioning(self): + def test_versioning(self, setup_path): - with ensure_clean_store(self.path) as store: + with 
ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() store["b"] = tm.makeDataFrame() df = tm.makeTimeDataFrame() @@ -493,13 +486,13 @@ def test_versioning(self): with pytest.raises(Exception): store.select("df2") - def test_mode(self): + def test_mode(self, setup_path): df = tm.makeTimeDataFrame() def check(mode): - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: # constructor if mode in ["r", "r+"]: @@ -511,7 +504,7 @@ def check(mode): assert store._handle.mode == mode store.close() - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: # context if mode in ["r", "r+"]: @@ -522,7 +515,7 @@ def check(mode): with HDFStore(path, mode=mode) as store: assert store._handle.mode == mode - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: # conv write if mode in ["r", "r+"]: @@ -543,7 +536,7 @@ def check(mode): def check_default_mode(): # read_hdf uses default mode - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", mode="w") result = read_hdf(path, "df") assert_frame_equal(result, df) @@ -554,9 +547,9 @@ def check_default_mode(): check("w") check_default_mode() - def test_reopen_handle(self): + def test_reopen_handle(self, setup_path): - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: store = HDFStore(path, mode="a") store["a"] = tm.makeTimeSeries() @@ -602,9 +595,9 @@ def test_reopen_handle(self): store.close() assert not store.is_open - def test_open_args(self): + def test_open_args(self, setup_path): - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df = tm.makeDataFrame() @@ -623,16 +616,16 @@ def test_open_args(self): # the file should not have actually been written assert not os.path.exists(path) - def test_flush(self): + def test_flush(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() store.flush() store.flush(fsync=True) - def test_get(self): + def test_get(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeSeries() left = store.get("a") right = store["a"] @@ -666,7 +659,7 @@ def test_get(self): ), ], ) - def test_walk(self, where, expected): + def test_walk(self, where, expected, setup_path): # GH10143 objs = { "df1": pd.DataFrame([1, 2, 3]), @@ -705,9 +698,9 @@ def test_walk(self, where, expected): else: tm.assert_series_equal(obj, objs[leaf]) - def test_getattr(self): + def test_getattr(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: s = tm.makeTimeSeries() store["a"] = s @@ -732,9 +725,9 @@ def test_getattr(self): for x in ["mode", "path", "handle", "complib"]: getattr(store, "_{x}".format(x=x)) - def test_put(self): + def test_put(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: ts = tm.makeTimeSeries() df = tm.makeTimeDataFrame() @@ -763,9 +756,9 @@ def test_put(self): store.put("c", df[:10], format="table", append=False) tm.assert_frame_equal(df[:10], store["c"]) - def test_put_string_index(self): + def test_put_string_index(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: index = Index( ["I am a very long string index: 
{i}".format(i=i) for i in range(20)] @@ -792,9 +785,9 @@ def test_put_string_index(self): store["b"] = df tm.assert_frame_equal(store["b"], df) - def test_put_compression(self): + def test_put_compression(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = tm.makeTimeDataFrame() store.put("c", df, format="table", complib="zlib") @@ -805,10 +798,10 @@ def test_put_compression(self): store.put("b", df, format="fixed", complib="zlib") @td.skip_if_windows_python_3 - def test_put_compression_blosc(self): + def test_put_compression_blosc(self, setup_path): df = tm.makeTimeDataFrame() - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # can't compress if format='fixed' with pytest.raises(ValueError): @@ -817,13 +810,13 @@ def test_put_compression_blosc(self): store.put("c", df, format="table", complib="blosc") tm.assert_frame_equal(store["c"], df) - def test_complibs_default_settings(self): + def test_complibs_default_settings(self, setup_path): # GH15943 df = tm.makeDataFrame() # Set complevel and check if complib is automatically set to # default value - with ensure_clean_path(self.path) as tmpfile: + with ensure_clean_path(setup_path) as tmpfile: df.to_hdf(tmpfile, "df", complevel=9) result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) @@ -834,7 +827,7 @@ def test_complibs_default_settings(self): assert node.filters.complib == "zlib" # Set complib and check to see if compression is disabled - with ensure_clean_path(self.path) as tmpfile: + with ensure_clean_path(setup_path) as tmpfile: df.to_hdf(tmpfile, "df", complib="zlib") result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) @@ -845,7 +838,7 @@ def test_complibs_default_settings(self): assert node.filters.complib is None # Check if not setting complib or complevel results in no compression - with ensure_clean_path(self.path) as tmpfile: + with ensure_clean_path(setup_path) as tmpfile: df.to_hdf(tmpfile, "df") result = pd.read_hdf(tmpfile, "df") tm.assert_frame_equal(result, df) @@ -856,7 +849,7 @@ def test_complibs_default_settings(self): assert node.filters.complib is None # Check if file-defaults can be overridden on a per table basis - with ensure_clean_path(self.path) as tmpfile: + with ensure_clean_path(setup_path) as tmpfile: store = pd.HDFStore(tmpfile) store.append("dfc", df, complevel=9, complib="blosc") store.append("df", df) @@ -870,7 +863,7 @@ def test_complibs_default_settings(self): assert node.filters.complevel == 9 assert node.filters.complib == "blosc" - def test_complibs(self): + def test_complibs(self, setup_path): # GH14478 df = tm.makeDataFrame() @@ -887,7 +880,7 @@ def test_complibs(self): all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] for (lib, lvl) in all_tests: - with ensure_clean_path(self.path) as tmpfile: + with ensure_clean_path(setup_path) as tmpfile: gname = "foo" # Write and read file to see if data is consistent @@ -906,13 +899,13 @@ def test_complibs(self): assert node.filters.complib == lib h5table.close() - def test_put_integer(self): + def test_put_integer(self, setup_path): # non-date, non-string index df = DataFrame(np.random.randn(50, 100)) - self._check_roundtrip(df, tm.assert_frame_equal) + self._check_roundtrip(df, tm.assert_frame_equal, setup_path) @xfail_non_writeable - def test_put_mixed_type(self): + def test_put_mixed_type(self, setup_path): df = tm.makeTimeDataFrame() df["obj1"] = "foo" df["obj2"] = "bar" @@ -928,7 +921,7 @@ 
def test_put_mixed_type(self): df.loc[3:6, ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") # PerformanceWarning @@ -942,9 +935,9 @@ def test_put_mixed_type(self): @pytest.mark.filterwarnings( "ignore:object name:tables.exceptions.NaturalNameWarning" ) - def test_append(self): + def test_append(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # this is allowed by almost always don't want to do it # tables.NaturalNameWarning): @@ -1010,9 +1003,9 @@ def test_append(self): store.append("uints", uint_data, data_columns=["u08", "u16", "u32"]) tm.assert_frame_equal(store["uints"], uint_data) - def test_append_series(self): + def test_append_series(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # basic ss = tm.makeStringSeries() @@ -1056,11 +1049,11 @@ def test_append_series(self): store.append("mi", s) tm.assert_series_equal(store["mi"], s) - def test_store_index_types(self): + def test_store_index_types(self, setup_path): # GH5386 # test storing various index types - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: def check(format, index): df = DataFrame(np.random.randn(10, 2), columns=list("AB")) @@ -1093,9 +1086,9 @@ def check(format, index): @pytest.mark.skipif( not is_platform_little_endian(), reason="reason platform is not little endian" ) - def test_encoding(self): + def test_encoding(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = DataFrame(dict(A="foo", B="bar"), index=range(5)) df.loc[2, "A"] = np.nan df.loc[3, "B"] = np.nan @@ -1122,7 +1115,7 @@ def test_encoding(self): ], ) @pytest.mark.parametrize("dtype", ["category", object]) - def test_latin_encoding(self, dtype, val): + def test_latin_encoding(self, setup_path, dtype, val): enc = "latin-1" nan_rep = "" key = "data" @@ -1130,7 +1123,7 @@ def test_latin_encoding(self, dtype, val): val = [x.decode(enc) if isinstance(x, bytes) else x for x in val] ser = pd.Series(val, dtype=dtype) - with ensure_clean_path(self.path) as store: + with ensure_clean_path(setup_path) as store: ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep) retr = read_hdf(store, key) @@ -1147,9 +1140,9 @@ def test_latin_encoding(self, dtype, val): # for x in examples: # roundtrip(s, nan_rep=b'\xf8\xfc') - def test_append_some_nans(self): + def test_append_some_nans(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = DataFrame( { "A": Series(np.random.randn(20)).astype("int32"), @@ -1193,9 +1186,9 @@ def test_append_some_nans(self): store.append("df3", df3[10:]) tm.assert_frame_equal(store["df3"], df3) - def test_append_all_nans(self): + def test_append_all_nans(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = DataFrame( {"A1": np.random.randn(20), "A2": np.random.randn(20)}, @@ -1283,14 +1276,14 @@ def test_append_all_nans(self): {"col1": [0, np.nan, 2], "col2": [1, np.nan, np.nan]} ) - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df_with_missing.to_hdf(path, "df_with_missing", format="table") reloaded = read_hdf(path, "df_with_missing") tm.assert_frame_equal(df_with_missing, reloaded) - def 
test_read_missing_key_close_store(self): + def test_read_missing_key_close_store(self, setup_path): # GH 25766 - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df = pd.DataFrame({"a": range(2), "b": range(2)}) df.to_hdf(path, "k1") @@ -1301,9 +1294,9 @@ def test_read_missing_key_close_store(self): # read with KeyError before another write df.to_hdf(path, "k2") - def test_append_frame_column_oriented(self): + def test_append_frame_column_oriented(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # column oriented df = tm.makeTimeDataFrame() @@ -1325,10 +1318,10 @@ def test_append_frame_column_oriented(self): with pytest.raises(TypeError): store.select("df1", "columns=A and index>df.index[4]") - def test_append_with_different_block_ordering(self): + def test_append_with_different_block_ordering(self, setup_path): # GH 4096; using same frames, but different block orderings - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: for i in range(10): @@ -1351,7 +1344,7 @@ def test_append_with_different_block_ordering(self): # test a different ordering but with more fields (like invalid # combinate) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64") df["int64"] = Series([1] * len(df), dtype="int64") @@ -1368,9 +1361,9 @@ def test_append_with_different_block_ordering(self): with pytest.raises(ValueError): store.append("df", df) - def test_append_with_strings(self): + def test_append_with_strings(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): def check_col(key, name, size): @@ -1444,7 +1437,7 @@ def check_col(key, name, size): result = store.select("df") tm.assert_frame_equal(result, df) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: def check_col(key, name, size): assert getattr( @@ -1484,9 +1477,9 @@ def check_col(key, name, size): with pytest.raises(ValueError): store.append("df", df, min_itemsize={"foo": 20, "foobar": 20}) - def test_append_with_empty_string(self): + def test_append_with_empty_string(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # with all empty strings (GH 12242) df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]}) @@ -1494,9 +1487,9 @@ def test_append_with_empty_string(self): store.append("df", df[-1:], min_itemsize={"x": 1}) tm.assert_frame_equal(store.select("df"), df) - def test_to_hdf_with_min_itemsize(self): + def test_to_hdf_with_min_itemsize(self, setup_path): - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: # min_itemsize in index with to_hdf (GH 10381) df = tm.makeMixedDataFrame().set_index("C") @@ -1516,20 +1509,20 @@ def test_to_hdf_with_min_itemsize(self): @pytest.mark.parametrize( "format", [pytest.param("fixed", marks=xfail_non_writeable), "table"] ) - def test_to_hdf_errors(self, format): + def test_to_hdf_errors(self, format, setup_path): data = ["\ud800foo"] ser = pd.Series(data, index=pd.Index(data)) - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: # GH 20835 ser.to_hdf(path, "table", format=format, errors="surrogatepass") result = pd.read_hdf(path, "table", errors="surrogatepass") 
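            # "\ud800" is a lone surrogate, so a strict codec refuses it on
            # both the write and the read path; errors="surrogatepass" is
            # what lets the value survive the round trip.  The plain-codec
            # analogue (an aside, not part of the test) behaves the same way:
            #
            #   "\ud800foo".encode("utf-8")                   # UnicodeEncodeError
            #   "\ud800foo".encode("utf-8", "surrogatepass")  # b'\xed\xa0\x80foo'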
tm.assert_series_equal(result, ser) - def test_append_with_data_columns(self): + def test_append_with_data_columns(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = tm.makeTimeDataFrame() df.iloc[0, df.columns.get_loc("B")] = 1.0 _maybe_remove(store, "df") @@ -1570,7 +1563,7 @@ def check_col(key, name, size): == size ) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: _maybe_remove(store, "df") store.append( "df", df_new, data_columns=["string"], min_itemsize={"string": 30} @@ -1585,7 +1578,7 @@ def check_col(key, name, size): ) check_col("df", "string", 30) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df_new["string2"] = "foobarbah" df_new["string_block1"] = "foobarbah1" df_new["string_block2"] = "foobarbah2" @@ -1600,7 +1593,7 @@ def check_col(key, name, size): check_col("df", "string2", 40) check_col("df", "values_block_1", 50) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # multiple data columns df_new = df.copy() df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0 @@ -1633,7 +1626,7 @@ def check_col(key, name, size): expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")] tm.assert_frame_equal(result, expected, check_index_type=False) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # doc example df_dc = df.copy() df_dc["string"] = "foo" @@ -1657,7 +1650,7 @@ def check_col(key, name, size): expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected, check_index_type=False) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # doc example part 2 np.random.seed(1234) index = date_range("1/1/2000", periods=8) @@ -1681,9 +1674,9 @@ def check_col(key, name, size): expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")] tm.assert_frame_equal(result, expected) - def test_create_table_index(self): + def test_create_table_index(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): @@ -1713,7 +1706,7 @@ def col(t, column): with pytest.raises(TypeError): store.create_table_index("f2") - def test_append_hierarchical(self): + def test_append_hierarchical(self, setup_path): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], @@ -1721,7 +1714,7 @@ def test_append_hierarchical(self): ) df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.append("mi", df) result = store.select("mi") tm.assert_frame_equal(result, df) @@ -1737,7 +1730,7 @@ def test_append_hierarchical(self): expected = df.reindex(columns=["A", "B"]) tm.assert_frame_equal(result, expected) - def test_column_multiindex(self): + def test_column_multiindex(self, setup_path): # GH 4710 # recreate multi-indexes properly @@ -1749,7 +1742,7 @@ def test_column_multiindex(self): if isinstance(expected.index, RangeIndex): expected.index = Int64Index(expected.index) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.put("df", df) tm.assert_frame_equal( @@ -1767,7 +1760,7 @@ def 
test_column_multiindex(self): store.put("df3", df, format="table", data_columns=True) # appending multi-column on existing table (see GH 6167) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.append("df2", df) store.append("df2", df) @@ -1781,18 +1774,18 @@ def test_column_multiindex(self): if isinstance(expected.index, RangeIndex): expected.index = Int64Index(expected.index) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.put("df1", df, format="table") tm.assert_frame_equal( store["df1"], expected, check_index_type=True, check_column_type=True ) - def test_store_multiindex(self): + def test_store_multiindex(self, setup_path): # validate multi-index names # GH 5527 - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: def make_index(names=None): return MultiIndex.from_tuples( @@ -1858,7 +1851,7 @@ def make_index(names=None): store.append("df", df) tm.assert_frame_equal(store.select("df"), df) - def test_select_columns_in_where(self): + def test_select_columns_in_where(self, setup_path): # GH 6169 # recreate multi-indexes when columns is passed @@ -1872,7 +1865,7 @@ def test_select_columns_in_where(self): # With a DataFrame df = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.put("df", df, format="table") expected = df[["A"]] @@ -1882,29 +1875,29 @@ def test_select_columns_in_where(self): # With a Series s = Series(np.random.randn(10), index=index, name="A") - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.put("s", s, format="table") tm.assert_series_equal(store.select("s", where="columns=['A']"), s) - def test_mi_data_columns(self): + def test_mi_data_columns(self, setup_path): # GH 14435 idx = pd.MultiIndex.from_arrays( [date_range("2000-01-01", periods=5), range(5)], names=["date", "id"] ) df = pd.DataFrame({"a": [1.1, 1.2, 1.3, 1.4, 1.5]}, index=idx) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.append("df", df, data_columns=True) actual = store.select("df", where="id == 1") expected = df.iloc[[1], :] tm.assert_frame_equal(actual, expected) - def test_pass_spec_to_storer(self): + def test_pass_spec_to_storer(self, setup_path): df = tm.makeDataFrame() - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.put("df", df) with pytest.raises(TypeError): store.select("df", columns=["A"]) @@ -1912,9 +1905,9 @@ def test_pass_spec_to_storer(self): store.select("df", where=[("columns=A")]) @xfail_non_writeable - def test_append_misc(self): + def test_append_misc(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() store.append("df", df, chunksize=1) result = store.select("df") @@ -1927,7 +1920,7 @@ def test_append_misc(self): # more chunksize in append tests def check(obj, comparator): for c in [10, 200, 1000]: - with ensure_clean_store(self.path, mode="w") as store: + with ensure_clean_store(setup_path, mode="w") as store: store.append("obj", obj, chunksize=c) result = store.select("obj") comparator(result, obj) @@ -1942,7 +1935,7 @@ def check(obj, comparator): check(df, tm.assert_frame_equal) # empty frame, GH4273 - with ensure_clean_store(self.path) as store: + with 
ensure_clean_store(setup_path) as store: # 0 len df_empty = DataFrame(columns=list("ABC")) @@ -1962,9 +1955,9 @@ def check(obj, comparator): store.put("df2", df) assert_frame_equal(store.select("df2"), df) - def test_append_raise(self): + def test_append_raise(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # test append with invalid input to get good error messages @@ -2007,18 +2000,18 @@ def test_append_raise(self): with pytest.raises(ValueError): store.append("df", df) - def test_table_index_incompatible_dtypes(self): + def test_table_index_incompatible_dtypes(self, setup_path): df1 = DataFrame({"a": [1, 2, 3]}) df2 = DataFrame({"a": [4, 5, 6]}, index=date_range("1/1/2000", periods=3)) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.put("frame", df1, format="table") with pytest.raises(TypeError): store.put("frame", df2, format="table", append=True) - def test_table_values_dtypes_roundtrip(self): + def test_table_values_dtypes_roundtrip(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8") store.append("df_f8", df1) assert_series_equal(df1.dtypes, store["df_f8"].dtypes) @@ -2072,7 +2065,7 @@ def test_table_values_dtypes_roundtrip(self): expected = expected.sort_index() tm.assert_series_equal(result, expected) - def test_table_mixed_dtypes(self): + def test_table_mixed_dtypes(self, setup_path): # frame df = tm.makeDataFrame() @@ -2090,13 +2083,13 @@ def test_table_mixed_dtypes(self): df.loc[3:6, ["obj1"]] = np.nan df = df._consolidate()._convert(datetime=True) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.append("df1_mixed", df) tm.assert_frame_equal(store.select("df1_mixed"), df) - def test_unimplemented_dtypes_table_columns(self): + def test_unimplemented_dtypes_table_columns(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: dtypes = [("date", datetime.date(2001, 1, 2))] @@ -2114,7 +2107,7 @@ def test_unimplemented_dtypes_table_columns(self): df["datetime1"] = datetime.date(2001, 1, 2) df = df._consolidate()._convert(datetime=True) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # this fails because we have a date in the object block...... 
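            # (roughly: the table format maps every column onto a fixed
            # PyTables atom, and a datetime.date inside an object-dtype
            # column has no such mapping -- unlike datetime.datetime, which
            # is converted to i8 nanoseconds -- hence the TypeError below)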
with pytest.raises(TypeError): store.append("df_unimplemented", df) @@ -2127,7 +2120,7 @@ def test_unimplemented_dtypes_table_columns(self): "exactly equal to 1.15.0: gh-22098" ), ) - def test_calendar_roundtrip_issue(self): + def test_calendar_roundtrip_issue(self, setup_path): # 8591 # doc example from tseries holiday section @@ -2145,7 +2138,7 @@ def test_calendar_roundtrip_issue(self): s = Series(dts.weekday, dts).map(Series("Mon Tue Wed Thu Fri Sat Sun".split())) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.put("fixed", s) result = store.select("fixed") @@ -2155,18 +2148,18 @@ def test_calendar_roundtrip_issue(self): result = store.select("table") assert_series_equal(result, s) - def test_roundtrip_tz_aware_index(self): + def test_roundtrip_tz_aware_index(self, setup_path): # GH 17618 time = pd.Timestamp("2000-01-01 01:00:00", tz="US/Eastern") df = pd.DataFrame(data=[0], index=[time]) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.put("frame", df, format="fixed") recons = store["frame"] tm.assert_frame_equal(recons, df) assert recons.index[0].value == 946706400000000000 - def test_append_with_timedelta(self): + def test_append_with_timedelta(self, setup_path): # GH 3577 # append timedelta @@ -2182,7 +2175,7 @@ def test_append_with_timedelta(self): df["C"] = df["A"] - df["B"] df.loc[3:5, "C"] = np.nan - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # table _maybe_remove(store, "df") @@ -2215,9 +2208,9 @@ def test_append_with_timedelta(self): result = store.select("df2") assert_frame_equal(result, df) - def test_remove(self): + def test_remove(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: ts = tm.makeTimeSeries() df = tm.makeDataFrame() @@ -2255,9 +2248,9 @@ def test_remove(self): del store["b"] assert len(store) == 0 - def test_invalid_terms(self): + def test_invalid_terms(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): @@ -2279,7 +2272,7 @@ def test_invalid_terms(self): store.select("df", "index>") # from the docs - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: dfq = DataFrame( np.random.randn(10, 4), columns=list("ABCD"), @@ -2294,7 +2287,7 @@ def test_invalid_terms(self): read_hdf(path, "dfq", where="A>0 or C>0") # catch the invalid reference - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: dfq = DataFrame( np.random.randn(10, 4), columns=list("ABCD"), @@ -2305,9 +2298,9 @@ def test_invalid_terms(self): with pytest.raises(ValueError): read_hdf(path, "dfq", where="A>0 or C>0") - def test_same_name_scoping(self): + def test_same_name_scoping(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: import pandas as pd @@ -2331,29 +2324,31 @@ def test_same_name_scoping(self): result = store.select("df", "index>datetime(2013,1,5)") assert_frame_equal(result, expected) - def test_series(self): + def test_series(self, setup_path): s = tm.makeStringSeries() - self._check_roundtrip(s, tm.assert_series_equal) + self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) ts = tm.makeTimeSeries() - self._check_roundtrip(ts, tm.assert_series_equal) + self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) ts2 = 
Series(ts.index, Index(ts.index, dtype=object)) - self._check_roundtrip(ts2, tm.assert_series_equal) + self._check_roundtrip(ts2, tm.assert_series_equal, path=setup_path) ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) - self._check_roundtrip(ts3, tm.assert_series_equal, check_index_type=False) + self._check_roundtrip( + ts3, tm.assert_series_equal, path=setup_path, check_index_type=False + ) - def test_float_index(self): + def test_float_index(self, setup_path): # GH #454 index = np.random.randn(10) s = Series(np.random.randn(10), index=index) - self._check_roundtrip(s, tm.assert_series_equal) + self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) @xfail_non_writeable - def test_tuple_index(self): + def test_tuple_index(self, setup_path): # GH #492 col = np.arange(10) @@ -2363,11 +2358,11 @@ def test_tuple_index(self): with catch_warnings(record=True): simplefilter("ignore", pd.errors.PerformanceWarning) - self._check_roundtrip(DF, tm.assert_frame_equal) + self._check_roundtrip(DF, tm.assert_frame_equal, path=setup_path) @xfail_non_writeable @pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning") - def test_index_types(self): + def test_index_types(self, setup_path): with catch_warnings(record=True): values = np.random.randn(2) @@ -2378,54 +2373,54 @@ def test_index_types(self): with catch_warnings(record=True): ser = Series(values, [0, "y"]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) with catch_warnings(record=True): ser = Series(values, [datetime.datetime.today(), 0]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) with catch_warnings(record=True): ser = Series(values, ["y", 0]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) with catch_warnings(record=True): ser = Series(values, [datetime.date.today(), "a"]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) with catch_warnings(record=True): ser = Series(values, [0, "y"]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [datetime.datetime.today(), 0]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, ["y", 0]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [datetime.date.today(), "a"]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [1.23, "b"]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [1, 1.53]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) ser = Series(values, [1, 5]) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) ser = Series( values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)] ) - self._check_roundtrip(ser, func) + self._check_roundtrip(ser, func, path=setup_path) - def test_timeseries_preepoch(self): + def test_timeseries_preepoch(self, setup_path): dr = bdate_range("1/1/1940", "1/1/1960") ts = Series(np.random.randn(len(dr)), index=dr) try: - self._check_roundtrip(ts, tm.assert_series_equal) + self._check_roundtrip(ts, tm.assert_series_equal, path=setup_path) except OverflowError: pytest.skip("known failer on some windows platforms") @@ -2433,7 +2428,7 @@ def test_timeseries_preepoch(self): @pytest.mark.parametrize( 
"compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] ) - def test_frame(self, compression): + def test_frame(self, compression, setup_path): df = tm.makeDataFrame() @@ -2441,13 +2436,19 @@ def test_frame(self, compression): df.values[0, 0] = np.nan df.values[5, 3] = np.nan - self._check_roundtrip_table(df, tm.assert_frame_equal, compression=compression) - self._check_roundtrip(df, tm.assert_frame_equal, compression=compression) + self._check_roundtrip_table( + df, tm.assert_frame_equal, path=setup_path, compression=compression + ) + self._check_roundtrip( + df, tm.assert_frame_equal, path=setup_path, compression=compression + ) tdf = tm.makeTimeDataFrame() - self._check_roundtrip(tdf, tm.assert_frame_equal, compression=compression) + self._check_roundtrip( + tdf, tm.assert_frame_equal, path=setup_path, compression=compression + ) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # not consolidated df["foo"] = np.random.randn(len(df)) store["df"] = df @@ -2455,38 +2456,38 @@ def test_frame(self, compression): assert recons._data.is_consolidated() # empty - self._check_roundtrip(df[:0], tm.assert_frame_equal) + self._check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path) @xfail_non_writeable - def test_empty_series_frame(self): + def test_empty_series_frame(self, setup_path): s0 = Series() s1 = Series(name="myseries") df0 = DataFrame() df1 = DataFrame(index=["a", "b", "c"]) df2 = DataFrame(columns=["d", "e", "f"]) - self._check_roundtrip(s0, tm.assert_series_equal) - self._check_roundtrip(s1, tm.assert_series_equal) - self._check_roundtrip(df0, tm.assert_frame_equal) - self._check_roundtrip(df1, tm.assert_frame_equal) - self._check_roundtrip(df2, tm.assert_frame_equal) + self._check_roundtrip(s0, tm.assert_series_equal, path=setup_path) + self._check_roundtrip(s1, tm.assert_series_equal, path=setup_path) + self._check_roundtrip(df0, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) @xfail_non_writeable @pytest.mark.parametrize( "dtype", [np.int64, np.float64, np.object, "m8[ns]", "M8[ns]"] ) - def test_empty_series(self, dtype): + def test_empty_series(self, dtype, setup_path): s = Series(dtype=dtype) - self._check_roundtrip(s, tm.assert_series_equal) + self._check_roundtrip(s, tm.assert_series_equal, path=setup_path) - def test_can_serialize_dates(self): + def test_can_serialize_dates(self, setup_path): rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")] frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - self._check_roundtrip(frame, tm.assert_frame_equal) + self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) - def test_store_hierarchical(self): + def test_store_hierarchical(self, setup_path): index = MultiIndex( levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]], codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], @@ -2494,39 +2495,39 @@ def test_store_hierarchical(self): ) frame = DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"]) - self._check_roundtrip(frame, tm.assert_frame_equal) - self._check_roundtrip(frame.T, tm.assert_frame_equal) - self._check_roundtrip(frame["A"], tm.assert_series_equal) + self._check_roundtrip(frame, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(frame["A"], tm.assert_series_equal, 
path=setup_path) # check that the names are stored - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store["frame"] = frame recons = store["frame"] tm.assert_frame_equal(recons, frame) - def test_store_index_name(self): + def test_store_index_name(self, setup_path): df = tm.makeDataFrame() df.index.name = "foo" - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store["frame"] = df recons = store["frame"] tm.assert_frame_equal(recons, df) - def test_store_index_name_with_tz(self): + def test_store_index_name_with_tz(self, setup_path): # GH 13884 df = pd.DataFrame({"A": [1, 2]}) df.index = pd.DatetimeIndex([1234567890123456787, 1234567890123456788]) df.index = df.index.tz_localize("UTC") df.index.name = "foo" - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.put("frame", df, format="table") recons = store["frame"] tm.assert_frame_equal(recons, df) @pytest.mark.parametrize("table_format", ["table", "fixed"]) - def test_store_index_name_numpy_str(self, table_format): + def test_store_index_name_numpy_str(self, table_format, setup_path): # GH #13492 idx = pd.Index( pd.to_datetime([datetime.date(2000, 1, 1), datetime.date(2000, 1, 2)]), @@ -2539,7 +2540,7 @@ def test_store_index_name_numpy_str(self, table_format): df = pd.DataFrame(np.arange(4).reshape(2, 2), columns=idx, index=idx1) # This used to fail, returning numpy strings instead of python strings. - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df.to_hdf(path, "df", format=table_format) df2 = read_hdf(path, "df") @@ -2548,11 +2549,11 @@ def test_store_index_name_numpy_str(self, table_format): assert type(df2.index.name) == str assert type(df2.columns.name) == str - def test_store_series_name(self): + def test_store_series_name(self, setup_path): df = tm.makeDataFrame() series = df["A"] - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store["series"] = series recons = store["series"] tm.assert_series_equal(recons, series) @@ -2561,7 +2562,7 @@ def test_store_series_name(self): @pytest.mark.parametrize( "compression", [False, pytest.param(True, marks=td.skip_if_windows_python_3)] ) - def test_store_mixed(self, compression): + def test_store_mixed(self, compression, setup_path): def _make_one(): df = tm.makeDataFrame() df["obj1"] = "foo" @@ -2575,10 +2576,10 @@ def _make_one(): df1 = _make_one() df2 = _make_one() - self._check_roundtrip(df1, tm.assert_frame_equal) - self._check_roundtrip(df2, tm.assert_frame_equal) + self._check_roundtrip(df1, tm.assert_frame_equal, path=setup_path) + self._check_roundtrip(df2, tm.assert_frame_equal, path=setup_path) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store["obj"] = df1 tm.assert_frame_equal(store["obj"], df1) store["obj"] = df2 @@ -2586,25 +2587,34 @@ def _make_one(): # check that can store Series of all of these types self._check_roundtrip( - df1["obj1"], tm.assert_series_equal, compression=compression + df1["obj1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, ) self._check_roundtrip( - df1["bool1"], tm.assert_series_equal, compression=compression + df1["bool1"], + tm.assert_series_equal, + path=setup_path, + compression=compression, ) self._check_roundtrip( - df1["int1"], tm.assert_series_equal, compression=compression + df1["int1"], + tm.assert_series_equal, + path=setup_path, + 
compression=compression, ) @pytest.mark.filterwarnings( "ignore:\\nduplicate:pandas.io.pytables.DuplicateWarning" ) - def test_select_with_dups(self): + def test_select_with_dups(self, setup_path): # single dtypes df = DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]) df.index = date_range("20130101 9:30", periods=10, freq="T") - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.append("df", df) result = store.select("df") @@ -2631,7 +2641,7 @@ def test_select_with_dups(self): ) df.index = date_range("20130101 9:30", periods=10, freq="T") - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.append("df", df) result = store.select("df") @@ -2651,7 +2661,7 @@ def test_select_with_dups(self): assert_frame_equal(result, expected, by_blocks=True) # duplicates on both index and columns - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store.append("df", df) store.append("df", df) @@ -2660,18 +2670,18 @@ def test_select_with_dups(self): result = store.select("df", columns=["B", "A"]) assert_frame_equal(result, expected, by_blocks=True) - def test_overwrite_node(self): + def test_overwrite_node(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: store["a"] = tm.makeTimeDataFrame() ts = tm.makeTimeSeries() store["a"] = ts tm.assert_series_equal(store["a"], ts) - def test_select(self): + def test_select(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: with catch_warnings(record=True): @@ -2709,9 +2719,9 @@ def test_select(self): expected = df[df.A > 0].reindex(columns=["C", "D"]) tm.assert_frame_equal(expected, result) - def test_select_dtypes(self): + def test_select_dtypes(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # with a Timestamp data column (GH #2637) df = DataFrame( dict(ts=bdate_range("2012-01-01", periods=300), A=np.random.randn(300)) @@ -2767,7 +2777,7 @@ def test_select_dtypes(self): expected = df.reindex(index=list(df.index)[0:10], columns=["A"]) tm.assert_frame_equal(expected, result) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: # floats w/o NaN df = DataFrame(dict(cols=range(11), values=range(11)), dtype="float64") @@ -2806,7 +2816,7 @@ def test_select_dtypes(self): # test selection with comparison against numpy scalar # GH 11283 - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = tm.makeDataFrame() expected = df[df["A"] > 0] @@ -2816,9 +2826,9 @@ def test_select_dtypes(self): result = store.select("df", where=["A>np_zero"]) tm.assert_frame_equal(expected, result) - def test_select_with_many_inputs(self): + def test_select_with_many_inputs(self, setup_path): - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = DataFrame( dict( @@ -2869,10 +2879,10 @@ def test_select_with_many_inputs(self): tm.assert_frame_equal(expected, result) assert len(result) == 100 - def test_select_iterator(self): + def test_select_iterator(self, setup_path): # single table - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df = tm.makeTimeDataFrame(500) _maybe_remove(store, "df") @@ -2893,7 +2903,7 @@ def test_select_iterator(self): result = concat(results) 
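            # select(..., iterator=True) and select(..., chunksize=n) both
            # return a TableIterator that yields DataFrames, so concat() can
            # rebuild the full selection -- or the chunks can be processed
            # out of core.  A hedged sketch of that pattern (``process`` is
            # a hypothetical callback, not part of this test):
            #
            #   for chunk in store.select("df", chunksize=50000, where="A > 0"):
            #       process(chunk)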
tm.assert_frame_equal(result, expected) - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df = tm.makeTimeDataFrame(500) df.to_hdf(path, "df_non_table") @@ -2904,7 +2914,7 @@ def test_select_iterator(self): with pytest.raises(TypeError): read_hdf(path, "df_non_table", iterator=True) - with ensure_clean_path(self.path) as path: + with ensure_clean_path(setup_path) as path: df = tm.makeTimeDataFrame(500) df.to_hdf(path, "df", format="table") @@ -2918,7 +2928,7 @@ def test_select_iterator(self): # multiple - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: df1 = tm.makeTimeDataFrame(500) store.append("df1", df1, data_columns=True) @@ -2939,14 +2949,14 @@ def test_select_iterator(self): result = concat(results) tm.assert_frame_equal(expected, result) - def test_select_iterator_complete_8014(self): + def test_select_iterator_complete_8014(self, setup_path): # GH 8014 # using iterator and where clause chunksize = 1e4 # no iterator - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") @@ -2980,7 +2990,7 @@ def test_select_iterator_complete_8014(self): tm.assert_frame_equal(expected, result) # with iterator, full range - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") @@ -3014,14 +3024,14 @@ def test_select_iterator_complete_8014(self): result = concat(results) tm.assert_frame_equal(expected, result) - def test_select_iterator_non_complete_8014(self): + def test_select_iterator_non_complete_8014(self, setup_path): # GH 8014 # using iterator and where clause chunksize = 1e4 # with iterator, non complete range - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") @@ -3056,7 +3066,7 @@ def test_select_iterator_non_complete_8014(self): tm.assert_frame_equal(rexpected, result) # with iterator, empty where - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100064, "S") _maybe_remove(store, "df") @@ -3069,7 +3079,7 @@ def test_select_iterator_non_complete_8014(self): results = [s for s in store.select("df", where=where, chunksize=chunksize)] assert 0 == len(results) - def test_select_iterator_many_empty_frames(self): + def test_select_iterator_many_empty_frames(self, setup_path): # GH 8014 # using iterator and where clause can return many empty @@ -3077,7 +3087,7 @@ def test_select_iterator_many_empty_frames(self): chunksize = int(1e4) # with iterator, range limited to the first chunk - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: expected = tm.makeTimeDataFrame(100000, "S") _maybe_remove(store, "df") @@ -3134,14 +3144,14 @@ def test_select_iterator_many_empty_frames(self): @pytest.mark.filterwarnings( "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning" ) - def test_retain_index_attributes(self): + def test_retain_index_attributes(self, setup_path): # GH 3499, losing frequency info on index recreation df = DataFrame( dict(A=Series(range(3), index=date_range("2000-1-1", periods=3, freq="H"))) ) - with ensure_clean_store(self.path) as store: + with ensure_clean_store(setup_path) as store: _maybe_remove(store, "data") store.put("data", df, format="table") 
@@ -3194,8 +3204,8 @@ def test_retain_index_attributes(self):
     @pytest.mark.filterwarnings(
         "ignore:\\nthe :pandas.io.pytables.AttributeConflictWarning"
     )
-    def test_retain_index_attributes2(self):
-        with ensure_clean_path(self.path) as path:
+    def test_retain_index_attributes2(self, setup_path):
+        with ensure_clean_path(setup_path) as path:

             with catch_warnings(record=True):
@@ -3232,11 +3242,11 @@ def test_retain_index_attributes2(self):

             assert read_hdf(path, "data").index.name is None

-    def test_frame_select(self):
+    def test_frame_select(self, setup_path):

         df = tm.makeTimeDataFrame()

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.put("frame", df, format="table")

             date = df.index[len(df) // 2]
@@ -3265,14 +3275,14 @@ def test_frame_select(self):
         # with pytest.raises(ValueError):
         #     store.select('frame', [crit1, crit2])

-    def test_frame_select_complex(self):
+    def test_frame_select_complex(self, setup_path):
         # select via complex criteria

         df = tm.makeTimeDataFrame()
         df["string"] = "foo"
         df.loc[df.index[0:4], "string"] = "bar"

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.put("df", df, format="table", data_columns=["string"])

             # empty
@@ -3317,7 +3327,7 @@ def test_frame_select_complex(self):
             expected = df.loc[df.index > df.index[3]].reindex(columns=["A", "B"])
             tm.assert_frame_equal(result, expected)

-    def test_frame_select_complex2(self):
+    def test_frame_select_complex2(self, setup_path):

         with ensure_clean_path(["parms.hdf", "hist.hdf"]) as paths:
@@ -3381,13 +3391,13 @@ def test_frame_select_complex2(self):

         store.close()

-    def test_invalid_filtering(self):
+    def test_invalid_filtering(self, setup_path):

         # can't use more than one filter (atm)

         df = tm.makeTimeDataFrame()

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.put("df", df, format="table")

             # not implemented
@@ -3398,9 +3408,9 @@ def test_invalid_filtering(self):
             with pytest.raises(NotImplementedError):
                 store.select("df", "columns=['A','B'] & columns=['C']")

-    def test_string_select(self):
+    def test_string_select(self, setup_path):
         # GH 2973
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             df = tm.makeTimeDataFrame()
@@ -3440,11 +3450,11 @@ def test_string_select(self):
             expected = df[df.int != 2]
             assert_frame_equal(result, expected)

-    def test_read_column(self):
+    def test_read_column(self, setup_path):

         df = tm.makeTimeDataFrame()

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             _maybe_remove(store, "df")

             # GH 17912
@@ -3513,10 +3523,10 @@ def test_read_column(self):
             result = store.select_column("df4", "B")
             tm.assert_series_equal(result, expected)

-    def test_coordinates(self):
+    def test_coordinates(self, setup_path):
         df = tm.makeTimeDataFrame()

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             _maybe_remove(store, "df")
             store.append("df", df)
@@ -3561,7 +3571,7 @@ def test_coordinates(self):
             tm.assert_frame_equal(result, expected)

         # pass array/mask as the coordinates
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             df = DataFrame(
                 np.random.randn(1000, 2), index=date_range("20000101", periods=1000)
@@ -3617,13 +3627,13 @@ def test_coordinates(self):
             expected = df[5:10]
             tm.assert_frame_equal(result, expected)

-    def test_append_to_multiple(self):
+    def test_append_to_multiple(self, setup_path):
         df1 = tm.makeTimeDataFrame()
         df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
         df2["foo"] = "bar"
         df = concat([df1, df2], axis=1)

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             # exceptions
             with pytest.raises(ValueError):
@@ -3647,13 +3657,13 @@ def test_append_to_multiple(self):
             expected = df[(df.A > 0) & (df.B > 0)]
             tm.assert_frame_equal(result, expected)

-    def test_append_to_multiple_dropna(self):
+    def test_append_to_multiple_dropna(self, setup_path):
         df1 = tm.makeTimeDataFrame()
         df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
         df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
         df = concat([df1, df2], axis=1)

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             # dropna=True should guarantee rows are synchronized
             store.append_to_multiple(
@@ -3667,13 +3677,13 @@ def test_append_to_multiple_dropna(self):
     @pytest.mark.xfail(
         run=False, reason="append_to_multiple_dropna_false is not raising as failed"
    )
-    def test_append_to_multiple_dropna_false(self):
+    def test_append_to_multiple_dropna_false(self, setup_path):
         df1 = tm.makeTimeDataFrame()
         df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
         df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
         df = concat([df1, df2], axis=1)

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             # dropna=False shouldn't synchronize row indexes
             store.append_to_multiple(
@@ -3685,13 +3695,13 @@ def test_append_to_multiple_dropna_false(self):

             assert not store.select("df1a").index.equals(store.select("df2a").index)

-    def test_select_as_multiple(self):
+    def test_select_as_multiple(self, setup_path):

         df1 = tm.makeTimeDataFrame()
         df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
         df2["foo"] = "bar"

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             # no tables stored
             with pytest.raises(Exception):
@@ -3759,9 +3769,9 @@ def test_select_as_multiple(self):
         LooseVersion(tables.__version__) < LooseVersion("3.1.0"),
         reason=("tables version does not support fix for nan selection bug: GH 4858"),
     )
-    def test_nan_selection_bug_4858(self):
+    def test_nan_selection_bug_4858(self, setup_path):

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             df = DataFrame(dict(cols=range(6), values=range(6)), dtype="float64")
             df["cols"] = (df["cols"] + 10).apply(str)
@@ -3777,9 +3787,9 @@ def test_nan_selection_bug_4858(self):
             result = store.select("df", where="values>2.0")
             assert_frame_equal(result, expected)

-    def test_start_stop_table(self):
+    def test_start_stop_table(self, setup_path):

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             # table
             df = DataFrame(dict(A=np.random.rand(20), B=np.random.rand(20)))
@@ -3795,10 +3805,10 @@ def test_start_stop_table(self):
             expected = df.loc[30:40, ["A"]]
             tm.assert_frame_equal(result, expected)

-    def test_start_stop_multiple(self):
+    def test_start_stop_multiple(self, setup_path):

         # GH 16209
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             df = DataFrame({"foo": [1, 2], "bar": [1, 2]})
@@ -3811,9 +3821,9 @@ def test_start_stop_multiple(self):
             expected = df.loc[[0], ["foo", "bar"]]
             tm.assert_frame_equal(result, expected)

-    def test_start_stop_fixed(self):
+    def test_start_stop_fixed(self, setup_path):

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             # fixed, GH 8287
             df = DataFrame(
@@ -3851,13 +3861,13 @@ def test_start_stop_fixed(self):
         df.iloc[3:5, 1:3] = np.nan
         df.iloc[8:10, -2] = np.nan

-    def test_select_filter_corner(self):
+    def test_select_filter_corner(self, setup_path):

         df = DataFrame(np.random.randn(50, 100))
         df.index = ["{c:3d}".format(c=c) for c in df.index]
         df.columns = ["{c:3d}".format(c=c) for c in df.columns]

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.put("frame", df, format="table")

             crit = "columns=df.columns[:75]"
@@ -3868,7 +3878,7 @@ def test_select_filter_corner(self):
             result = store.select("frame", [crit])
             tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]])

-    def test_path_pathlib(self):
+    def test_path_pathlib(self, setup_path):
         df = tm.makeDataFrame()

         result = tm.round_trip_pathlib(
@@ -3877,7 +3887,7 @@ def test_path_pathlib(self):
         tm.assert_frame_equal(df, result)

     @pytest.mark.parametrize("start, stop", [(0, 2), (1, 2), (None, None)])
-    def test_contiguous_mixed_data_table(self, start, stop):
+    def test_contiguous_mixed_data_table(self, start, stop, setup_path):
         # GH 17021
         # ValueError when reading a contiguous mixed-data table ft. VLArray
         df = DataFrame(
@@ -3887,13 +3897,13 @@ def test_contiguous_mixed_data_table(self, start, stop):
             }
         )

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.append("test_dataset", df)

             result = store.select("test_dataset", start=start, stop=stop)
             assert_frame_equal(df[start:stop], result)

-    def test_path_pathlib_hdfstore(self):
+    def test_path_pathlib_hdfstore(self, setup_path):
         df = tm.makeDataFrame()

         def writer(path):
@@ -3907,14 +3917,14 @@ def reader(path):
         result = tm.round_trip_pathlib(writer, reader)
         tm.assert_frame_equal(df, result)

-    def test_pickle_path_localpath(self):
+    def test_pickle_path_localpath(self, setup_path):
         df = tm.makeDataFrame()
         result = tm.round_trip_pathlib(
             lambda p: df.to_hdf(p, "df"), lambda p: pd.read_hdf(p, "df")
         )
         tm.assert_frame_equal(df, result)

-    def test_path_localpath_hdfstore(self):
+    def test_path_localpath_hdfstore(self, setup_path):
         df = tm.makeDataFrame()

         def writer(path):
@@ -3928,23 +3938,25 @@ def reader(path):
         result = tm.round_trip_localpath(writer, reader)
         tm.assert_frame_equal(df, result)

-    def _check_roundtrip(self, obj, comparator, compression=False, **kwargs):
+    def _check_roundtrip(self, obj, comparator, path, compression=False, **kwargs):

         options = {}
         if compression:
             options["complib"] = _default_compressor

-        with ensure_clean_store(self.path, "w", **options) as store:
+        with ensure_clean_store(path, "w", **options) as store:
             store["obj"] = obj
             retrieved = store["obj"]
             comparator(retrieved, obj, **kwargs)

-    def _check_double_roundtrip(self, obj, comparator, compression=False, **kwargs):
+    def _check_double_roundtrip(
+        self, obj, comparator, path, compression=False, **kwargs
+    ):
         options = {}
         if compression:
             options["complib"] = compression or _default_compressor

-        with ensure_clean_store(self.path, "w", **options) as store:
+        with ensure_clean_store(path, "w", **options) as store:
             store["obj"] = obj
             retrieved = store["obj"]
             comparator(retrieved, obj, **kwargs)
@@ -3952,21 +3964,21 @@ def _check_double_roundtrip(self, obj, comparator, compression=False, **kwargs):
             again = store["obj"]
             comparator(again, obj, **kwargs)

-    def _check_roundtrip_table(self, obj, comparator, compression=False):
+    def _check_roundtrip_table(self, obj, comparator, path, compression=False):

         options = {}
         if compression:
             options["complib"] = _default_compressor

-        with ensure_clean_store(self.path, "w", **options) as store:
+        with ensure_clean_store(path, "w", **options) as store:
             store.put("obj", obj, format="table")
             retrieved = store["obj"]

             comparator(retrieved, obj)

-    def test_multiple_open_close(self):
+    def test_multiple_open_close(self, setup_path):
         # gh-4409: open & close multiple times

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:

             df = tm.makeDataFrame()
             df.to_hdf(path, "df", mode="w", format="table")
@@ -3980,7 +3992,7 @@ def test_multiple_open_close(self):
             assert "CLOSED" in store.info()
             assert not store.is_open

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:

             if pytables._table_file_open_policy_is_strict:
@@ -4042,7 +4054,7 @@ def test_multiple_open_close(self):
             assert not store2.is_open

         # ops on a closed store
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:

             df = tm.makeDataFrame()
             df.to_hdf(path, "df", mode="w", format="table")
@@ -4086,7 +4098,7 @@ def test_multiple_open_close(self):
             with pytest.raises(ClosedFileError, match="file is not open"):
                 store.select("df")

-    def test_pytables_native_read(self, datapath):
+    def test_pytables_native_read(self, datapath, setup_path):
         with ensure_clean_store(
             datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r"
         ) as store:
@@ -4096,7 +4108,7 @@ def test_pytables_native_read(self, datapath):
     @pytest.mark.skipif(
         is_platform_windows(), reason="native2 read fails oddly on windows"
     )
-    def test_pytables_native2_read(self, datapath):
+    def test_pytables_native2_read(self, datapath, setup_path):
         with ensure_clean_store(
             datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r"
         ) as store:
@@ -4105,7 +4117,7 @@ def test_pytables_native2_read(self, datapath):
             assert isinstance(d1, DataFrame)

     @xfail_non_writeable
-    def test_legacy_table_fixed_format_read_py2(self, datapath):
+    def test_legacy_table_fixed_format_read_py2(self, datapath, setup_path):
         # GH 24510
         # legacy table with fixed format written in Python 2
         with ensure_clean_store(
@@ -4119,7 +4131,7 @@ def test_legacy_table_fixed_format_read_py2(self, datapath):
         )
         assert_frame_equal(expected, result)

-    def test_legacy_table_read_py2(self, datapath):
+    def test_legacy_table_read_py2(self, datapath, setup_path):
         # issue: 24925
         # legacy table written in Python 2
         with ensure_clean_store(
@@ -4130,7 +4142,7 @@ def test_legacy_table_read_py2(self, datapath):
         expected = pd.DataFrame({"a": ["a", "b"], "b": [2, 3]})
         assert_frame_equal(expected, result)

-    def test_copy(self):
+    def test_copy(self, setup_path):

         with catch_warnings(record=True):
@@ -4179,7 +4191,7 @@ def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs):
             df = tm.makeDataFrame()

             try:
-                path = create_tempfile(self.path)
+                path = create_tempfile(setup_path)
                 st = HDFStore(path)
                 st.append("df", df, data_columns=["A"])
                 st.close()
@@ -4188,17 +4200,17 @@ def do_copy(f, new_f=None, keys=None, propindexes=True, **kwargs):
             finally:
                 safe_remove(path)

-    def test_store_datetime_fractional_secs(self):
+    def test_store_datetime_fractional_secs(self, setup_path):

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             dt = datetime.datetime(2012, 1, 2, 3, 4, 5, 123456)
             series = Series([0], [dt])
             store["a"] = series
             assert store["a"].index[0] == dt

-    def test_tseries_indices_series(self):
+    def test_tseries_indices_series(self, setup_path):

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             idx = tm.makeDateIndex(10)
             ser = Series(np.random.randn(len(idx)), idx)
             store["a"] = ser
@@ -4217,9 +4229,9 @@ def test_tseries_indices_series(self):
             assert result.index.freq == ser.index.freq
             tm.assert_class_equal(result.index, ser.index, obj="series index")

-    def test_tseries_indices_frame(self):
+    def test_tseries_indices_frame(self, setup_path):

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             idx = tm.makeDateIndex(10)
             df = DataFrame(np.random.randn(len(idx), 3), index=idx)
             store["a"] = df
@@ -4238,7 +4250,7 @@ def test_tseries_indices_frame(self):
             assert result.index.freq == df.index.freq
             tm.assert_class_equal(result.index, df.index, obj="dataframe index")

-    def test_unicode_index(self):
+    def test_unicode_index(self, setup_path):

         unicode_values = ["\u03c3", "\u03c3\u03c3"]

@@ -4246,30 +4258,30 @@ def test_unicode_index(self):
         with catch_warnings(record=True):
             simplefilter("ignore", pd.errors.PerformanceWarning)
             s = Series(np.random.randn(len(unicode_values)), unicode_values)
-            self._check_roundtrip(s, tm.assert_series_equal)
+            self._check_roundtrip(s, tm.assert_series_equal, path=setup_path)

-    def test_unicode_longer_encoded(self):
+    def test_unicode_longer_encoded(self, setup_path):
         # GH 11234
         char = "\u0394"
         df = pd.DataFrame({"A": [char]})
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.put("df", df, format="table", encoding="utf-8")
             result = store.get("df")
             tm.assert_frame_equal(result, df)

         df = pd.DataFrame({"A": ["a", char], "B": ["b", "b"]})
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.put("df", df, format="table", encoding="utf-8")
             result = store.get("df")
             tm.assert_frame_equal(result, df)

     @xfail_non_writeable
-    def test_store_datetime_mixed(self):
+    def test_store_datetime_mixed(self, setup_path):

         df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})
         ts = tm.makeTimeSeries()
         df["d"] = ts.index[:3]
-        self._check_roundtrip(df, tm.assert_frame_equal)
+        self._check_roundtrip(df, tm.assert_frame_equal, path=setup_path)

     # FIXME: don't leave commented-out code
     # def test_cant_write_multiindex_table(self):
@@ -4281,14 +4293,14 @@ def test_store_datetime_mixed(self):
     #     with pytest.raises(Exception):
     #         store.put('foo', df, format='table')

-    def test_append_with_diff_col_name_types_raises_value_error(self):
+    def test_append_with_diff_col_name_types_raises_value_error(self, setup_path):
         df = DataFrame(np.random.randn(10, 1))
         df2 = DataFrame({"a": np.random.randn(10)})
         df3 = DataFrame({(1, 2): np.random.randn(10)})
         df4 = DataFrame({("1", 2): np.random.randn(10)})
         df5 = DataFrame({("1", 2, object): np.random.randn(10)})

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             name = "df_{}".format(tm.rands(10))
             store.append(name, df)
@@ -4296,7 +4308,7 @@ def test_append_with_diff_col_name_types_raises_value_error(self):
             with pytest.raises(ValueError):
                 store.append(name, d)

-    def test_query_with_nested_special_character(self):
+    def test_query_with_nested_special_character(self, setup_path):
         df = DataFrame(
             {
                 "a": ["a", "a", "c", "b", "test & test", "c", "b", "e"],
@@ -4304,14 +4316,14 @@ def test_query_with_nested_special_character(self):
             }
         )
         expected = df[df.a == "test & test"]
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.append("test", df, format="table", data_columns=True)
             result = store.select("test", 'a = "test & test"')
             tm.assert_frame_equal(expected, result)

-    def test_categorical(self):
+    def test_categorical(self, setup_path):

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             # Basic
             _maybe_remove(store, "s")
@@ -4429,7 +4441,7 @@ def test_categorical(self):
             ):
                 store.select("df3/meta/s/meta")

-    def test_categorical_conversion(self):
+    def test_categorical_conversion(self, setup_path):

         # GH13322
         # Check that read_hdf with categorical columns doesn't return rows if
@@ -4443,7 +4455,7 @@ def test_categorical_conversion(self):

         # We are expecting an empty DataFrame matching types of df
         expected = df.iloc[[], :]
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", format="table", data_columns=True)
             result = read_hdf(path, "df", where="obsids=B")
             tm.assert_frame_equal(result, expected)
@@ -4454,12 +4466,12 @@ def test_categorical_conversion(self):

         # We are expecting an empty DataFrame matching types of df
         expected = df.iloc[[], :]
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", format="table", data_columns=True)
             result = read_hdf(path, "df", where="obsids=B")
             tm.assert_frame_equal(result, expected)

-    def test_categorical_nan_only_columns(self):
+    def test_categorical_nan_only_columns(self, setup_path):
         # GH18413
         # Check that read_hdf with categorical columns with NaN-only values can
         # be read back.
@@ -4475,15 +4487,15 @@ def test_categorical_nan_only_columns(self):
         df["b"] = df.b.astype("category")
         df["d"] = df.b.astype("category")
         expected = df
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", format="table", data_columns=True)
             result = read_hdf(path, "df")
             tm.assert_frame_equal(result, expected)

-    def test_duplicate_column_name(self):
+    def test_duplicate_column_name(self, setup_path):
         df = DataFrame(columns=["a", "a"], data=[[0, 0]])

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             with pytest.raises(ValueError):
                 df.to_hdf(path, "df", format="fixed")
@@ -4494,30 +4506,30 @@ def test_duplicate_column_name(self):
             assert df.equals(other)
             assert other.equals(df)

-    def test_round_trip_equals(self):
+    def test_round_trip_equals(self, setup_path):
         # GH 9330
         df = DataFrame({"B": [1, 2], "A": ["x", "y"]})

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", format="table")
             other = read_hdf(path, "df")
             tm.assert_frame_equal(df, other)
             assert df.equals(other)
             assert other.equals(df)

-    def test_preserve_timedeltaindex_type(self):
+    def test_preserve_timedeltaindex_type(self, setup_path):
         # GH9635
         # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve
         # the type of the index.
         df = DataFrame(np.random.normal(size=(10, 5)))
         df.index = timedelta_range(start="0s", periods=10, freq="1s", name="example")

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             store["df"] = df
             assert_frame_equal(store["df"], df)

-    def test_columns_multiindex_modified(self):
+    def test_columns_multiindex_modified(self, setup_path):
         # BUG: 7212
         # read_hdf store.select modified the passed columns parameters
         # when multi-indexed.
@@ -4527,7 +4539,7 @@ def test_columns_multiindex_modified(self):
         df = df.set_index(keys="E", append=True)

         data_columns = df.index.names + df.columns.tolist()
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(
                 path,
                 "df",
@@ -4542,7 +4554,7 @@ def test_columns_multiindex_modified(self):
         assert cols2load_original == cols2load

     @ignore_natural_naming_warning
-    def test_to_hdf_with_object_column_names(self):
+    def test_to_hdf_with_object_column_names(self, setup_path):
         # GH9057
         # Writing HDF5 table format should only work for string-like
         # column types
@@ -4562,7 +4574,7 @@ def test_to_hdf_with_object_column_names(self):

         for index in types_should_fail:
             df = DataFrame(np.random.randn(10, 2), columns=index(2))
-            with ensure_clean_path(self.path) as path:
+            with ensure_clean_path(setup_path) as path:
                 with catch_warnings(record=True):
                     msg = "cannot have non-object label DataIndexableCol"
                     with pytest.raises(ValueError, match=msg):
@@ -4570,7 +4582,7 @@ def test_to_hdf_with_object_column_names(self):

         for index in types_should_run:
             df = DataFrame(np.random.randn(10, 2), columns=index(2))
-            with ensure_clean_path(self.path) as path:
+            with ensure_clean_path(setup_path) as path:
                 with catch_warnings(record=True):
                     df.to_hdf(path, "df", format="table", data_columns=True)
                     result = pd.read_hdf(
@@ -4578,14 +4590,14 @@ def test_to_hdf_with_object_column_names(self):
                     )
                     assert len(result)

-    def test_read_hdf_open_store(self):
+    def test_read_hdf_open_store(self, setup_path):
         # GH10330
         # No check for non-string path_or-buf, and no test of open store
         df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
         df.index.name = "letters"
         df = df.set_index(keys="E", append=True)

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", mode="w")
             direct = read_hdf(path, "df")
             store = HDFStore(path, mode="r")
@@ -4594,12 +4606,12 @@ def test_read_hdf_open_store(self):
             assert store.is_open
             store.close()

-    def test_read_hdf_iterator(self):
+    def test_read_hdf_iterator(self, setup_path):
         df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
         df.index.name = "letters"
         df = df.set_index(keys="E", append=True)

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", mode="w", format="t")
             direct = read_hdf(path, "df")
             iterator = read_hdf(path, "df", iterator=True)
@@ -4608,10 +4620,10 @@ def test_read_hdf_iterator(self):
             tm.assert_frame_equal(direct, indirect)
             iterator.store.close()

-    def test_read_hdf_errors(self):
+    def test_read_hdf_errors(self, setup_path):
         df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             with pytest.raises(IOError):
                 read_hdf(path, "key")
@@ -4626,20 +4638,20 @@ def test_read_hdf_generic_buffer_errors(self):
         with pytest.raises(NotImplementedError):
             read_hdf(BytesIO(b""), "df")

-    def test_invalid_complib(self):
+    def test_invalid_complib(self, setup_path):
         df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             with pytest.raises(ValueError):
                 df.to_hdf(path, "df", complib="foolib")

     # GH10443
-    def test_read_nokey(self):
+    def test_read_nokey(self, setup_path):
         df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))

         # Categorical dtype not supported for "fixed" format. So no need
         # to test with that dtype in the dataframe here.
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", mode="a")
             reread = read_hdf(path)
             assert_frame_equal(df, reread)
@@ -4648,11 +4660,11 @@ def test_read_nokey(self):
             with pytest.raises(ValueError):
                 read_hdf(path)

-    def test_read_nokey_table(self):
+    def test_read_nokey_table(self, setup_path):
         # GH13231
         df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")})

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", mode="a", format="table")
             reread = read_hdf(path)
             assert_frame_equal(df, reread)
@@ -4661,8 +4673,8 @@ def test_read_nokey_table(self):
             with pytest.raises(ValueError):
                 read_hdf(path)

-    def test_read_nokey_empty(self):
-        with ensure_clean_path(self.path) as path:
+    def test_read_nokey_empty(self, setup_path):
+        with ensure_clean_path(setup_path) as path:
             store = HDFStore(path)
             store.close()

@@ -4670,7 +4682,7 @@ def test_read_nokey_empty(self):
                 read_hdf(path)

     @td.skip_if_no("pathlib")
-    def test_read_from_pathlib_path(self):
+    def test_read_from_pathlib_path(self, setup_path):

         # GH11773
         from pathlib import Path

@@ -4678,7 +4690,7 @@ def test_read_from_pathlib_path(self):
         expected = DataFrame(
             np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")
         )
-        with ensure_clean_path(self.path) as filename:
+        with ensure_clean_path(setup_path) as filename:
             path_obj = Path(filename)

             expected.to_hdf(path_obj, "df", mode="a")
@@ -4687,7 +4699,7 @@ def test_read_from_pathlib_path(self):
         tm.assert_frame_equal(expected, actual)

     @td.skip_if_no("py.path")
-    def test_read_from_py_localpath(self):
+    def test_read_from_py_localpath(self, setup_path):

         # GH11773
         from py.path import local as LocalPath

@@ -4695,7 +4707,7 @@ def test_read_from_py_localpath(self):
         expected = DataFrame(
             np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")
         )
-        with ensure_clean_path(self.path) as filename:
+        with ensure_clean_path(setup_path) as filename:
             path_obj = LocalPath(filename)

             expected.to_hdf(path_obj, "df", mode="a")
@@ -4703,11 +4715,11 @@ def test_read_from_py_localpath(self):

         tm.assert_frame_equal(expected, actual)

-    def test_query_long_float_literal(self):
+    def test_query_long_float_literal(self, setup_path):
         # GH 14241
         df = pd.DataFrame({"A": [1000000000.0009, 1000000000.0011, 1000000000.0015]})

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.append("test", df, format="table", data_columns=True)

             cutoff = 1000000000.0006
@@ -4724,7 +4736,7 @@ def test_query_long_float_literal(self):
             expected = df.loc[[1], :]
             tm.assert_frame_equal(expected, result)

-    def test_query_compare_column_type(self):
+    def test_query_compare_column_type(self, setup_path):
         # GH 15492
         df = pd.DataFrame(
             {
@@ -4736,7 +4748,7 @@ def test_query_compare_column_type(self):
             columns=["date", "real_date", "float", "int"],
         )

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.append("test", df, format="table", data_columns=True)

             ts = pd.Timestamp("2014-01-01")  # noqa
@@ -4773,12 +4785,12 @@ def test_query_compare_column_type(self):
                 tm.assert_frame_equal(expected, result)

     @pytest.mark.parametrize("format", ["fixed", "table"])
-    def test_read_hdf_series_mode_r(self, format):
+    def test_read_hdf_series_mode_r(self, format, setup_path):
         # GH 16583
         # Tests that reading a Series saved to an HDF file
         # still works if a mode='r' argument is supplied
         series = tm.makeFloatSeries()

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             series.to_hdf(path, key="data", format=format)
             result = pd.read_hdf(path, key="data", mode="r")
         tm.assert_series_equal(result, series)
@@ -4836,26 +4848,26 @@ def test_select_empty_where(self, where):
             CategoricalIndex(list("abc")),
         ],
     )
-    def test_to_hdf_multiindex_extension_dtype(self, idx):
+    def test_to_hdf_multiindex_extension_dtype(self, idx, setup_path):
         # GH 7775
         mi = MultiIndex.from_arrays([idx, idx])
         df = pd.DataFrame(0, index=mi, columns=["a"])
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             with pytest.raises(NotImplementedError, match="Saving a MultiIndex"):
                 df.to_hdf(path, "df")


-class TestHDFComplexValues(Base):
+class TestHDFComplexValues:
     # GH10447

-    def test_complex_fixed(self):
+    def test_complex_fixed(self, setup_path):
         df = DataFrame(
             np.random.rand(4, 5).astype(np.complex64),
             index=list("abcd"),
             columns=list("ABCDE"),
         )

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df")
             reread = read_hdf(path, "df")
             assert_frame_equal(df, reread)
@@ -4865,19 +4877,19 @@ def test_complex_fixed(self):
             index=list("abcd"),
             columns=list("ABCDE"),
         )
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df")
             reread = read_hdf(path, "df")
             assert_frame_equal(df, reread)

-    def test_complex_table(self):
+    def test_complex_table(self, setup_path):
         df = DataFrame(
             np.random.rand(4, 5).astype(np.complex64),
             index=list("abcd"),
             columns=list("ABCDE"),
         )

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", format="table")
             reread = read_hdf(path, "df")
             assert_frame_equal(df, reread)
@@ -4888,13 +4900,13 @@ def test_complex_table(self):
             columns=list("ABCDE"),
         )

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", format="table", mode="w")
             reread = read_hdf(path, "df")
             assert_frame_equal(df, reread)

     @xfail_non_writeable
-    def test_complex_mixed_fixed(self):
+    def test_complex_mixed_fixed(self, setup_path):
         complex64 = np.array(
             [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64
         )
@@ -4911,12 +4923,12 @@ def test_complex_mixed_fixed(self):
             },
             index=list("abcd"),
         )
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df")
             reread = read_hdf(path, "df")
             assert_frame_equal(df, reread)

-    def test_complex_mixed_table(self):
+    def test_complex_mixed_table(self, setup_path):
         complex64 = np.array(
             [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64
         )
@@ -4934,17 +4946,17 @@ def test_complex_mixed_table(self):
             index=list("abcd"),
         )

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.append("df", df, data_columns=["A", "B"])
             result = store.select("df", where="A>2")
             assert_frame_equal(df.loc[df.A > 2], result)

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             df.to_hdf(path, "df", format="table")
             reread = read_hdf(path, "df")
             assert_frame_equal(df, reread)

-    def test_complex_across_dimensions_fixed(self):
+    def test_complex_across_dimensions_fixed(self, setup_path):
         with catch_warnings(record=True):
             complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
             s = Series(complex128, index=list("abcd"))
@@ -4953,12 +4965,12 @@ def test_complex_across_dimensions_fixed(self, setup_path):
             objs = [s, df]
             comps = [tm.assert_series_equal, tm.assert_frame_equal]
             for obj, comp in zip(objs, comps):
-                with ensure_clean_path(self.path) as path:
+                with ensure_clean_path(setup_path) as path:
                     obj.to_hdf(path, "obj", format="fixed")
                     reread = read_hdf(path, "obj")
                     comp(obj, reread)

-    def test_complex_across_dimensions(self):
+    def test_complex_across_dimensions(self, setup_path):
         complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
         s = Series(complex128, index=list("abcd"))
         df = DataFrame({"A": s, "B": s})
@@ -4968,12 +4980,12 @@ def test_complex_across_dimensions(self, setup_path):
         objs = [df]
         comps = [tm.assert_frame_equal]
         for obj, comp in zip(objs, comps):
-            with ensure_clean_path(self.path) as path:
+            with ensure_clean_path(setup_path) as path:
                 obj.to_hdf(path, "obj", format="table")
                 reread = read_hdf(path, "obj")
                 comp(obj, reread)

-    def test_complex_indexing_error(self):
+    def test_complex_indexing_error(self, setup_path):
         complex128 = np.array(
             [1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128
         )
@@ -4981,36 +4993,36 @@ def test_complex_indexing_error(self, setup_path):
             {"A": [1, 2, 3, 4], "B": ["a", "b", "c", "d"], "C": complex128},
             index=list("abcd"),
         )
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             with pytest.raises(TypeError):
                 store.append("df", df, data_columns=["C"])

-    def test_complex_series_error(self):
+    def test_complex_series_error(self, setup_path):
         complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j])
         s = Series(complex128, index=list("abcd"))

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             with pytest.raises(TypeError):
                 s.to_hdf(path, "obj", format="t")

-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             s.to_hdf(path, "obj", format="t", index=False)
             reread = read_hdf(path, "obj")
             tm.assert_series_equal(s, reread)

-    def test_complex_append(self):
+    def test_complex_append(self, setup_path):
         df = DataFrame(
             {"a": np.random.randn(100).astype(np.complex128), "b": np.random.randn(100)}
         )

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.append("df", df, data_columns=["b"])
             store.append("df", df)
             result = store.select("df")
             assert_frame_equal(pd.concat([df, df], 0), result)


-class TestTimezones(Base):
+class TestTimezones:
     def _compare_with_tz(self, a, b):
         tm.assert_frame_equal(a, b)

@@ -5024,7 +5037,7 @@ def _compare_with_tz(self, a, b):
                 "invalid tz comparison [{a_e}] [{b_e}]".format(a_e=a_e, b_e=b_e)
             )

-    def test_append_with_timezones_dateutil(self):
+    def test_append_with_timezones_dateutil(self, setup_path):

         from datetime import timedelta

@@ -5035,7 +5048,7 @@ def test_append_with_timezones_dateutil(self):
         gettz = lambda x: maybe_get_tz("dateutil/" + x)

         # as columns
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             _maybe_remove(store, "df_tz")

             df = DataFrame(
@@ -5101,7 +5114,7 @@ def test_append_with_timezones_dateutil(self):
                 store.append("df_tz", df)

         # as index
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             # GH 4098 example
             df = DataFrame(
@@ -5125,12 +5138,12 @@ def test_append_with_timezones_dateutil(self):
             result = store.select("df")
             assert_frame_equal(result, df)

-    def test_append_with_timezones_pytz(self):
+    def test_append_with_timezones_pytz(self, setup_path):

         from datetime import timedelta

         # as columns
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             _maybe_remove(store, "df_tz")
             df = DataFrame(
@@ -5195,7 +5208,7 @@ def test_append_with_timezones_pytz(self):
                 store.append("df_tz", df)

         # as index
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             # GH 4098 example
             df = DataFrame(
@@ -5219,7 +5232,7 @@ def test_append_with_timezones_pytz(self):
             result = store.select("df")
             assert_frame_equal(result, df)

-    def test_tseries_select_index_column(self):
+    def test_tseries_select_index_column(self, setup_path):
         # GH7777
         # selecting a UTC datetimeindex column did
         # not preserve UTC tzinfo set before storing

         # check that no tz still works
         rng = date_range("1/1/2000", "1/30/2000")
         frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.append("frame", frame)
             result = store.select_column("frame", "index")
             assert rng.tz == DatetimeIndex(result.values).tz
@@ -5237,7 +5250,7 @@ def test_tseries_select_index_column(self):
         rng = date_range("1/1/2000", "1/30/2000", tz="UTC")
         frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.append("frame", frame)
             result = store.select_column("frame", "index")
             assert rng.tz == result.dt.tz
@@ -5246,13 +5259,13 @@ def test_tseries_select_index_column(self):
         rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern")
         frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store.append("frame", frame)
             result = store.select_column("frame", "index")
             assert rng.tz == result.dt.tz

-    def test_timezones_fixed(self):
-        with ensure_clean_store(self.path) as store:
+    def test_timezones_fixed(self, setup_path):
+        with ensure_clean_store(setup_path) as store:

             # index
             rng = date_range("1/1/2000", "1/30/2000", tz="US/Eastern")
@@ -5277,24 +5290,24 @@ def test_timezones_fixed(self):
             result = store["df"]
             assert_frame_equal(result, df)

-    def test_fixed_offset_tz(self):
+    def test_fixed_offset_tz(self, setup_path):
         rng = date_range("1/1/2000 00:00:00-07:00", "1/30/2000 00:00:00-07:00")
         frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             store["frame"] = frame
             recons = store["frame"]
             tm.assert_index_equal(recons.index, rng)
             assert rng.tz == recons.index.tz

     @td.skip_if_windows
-    def test_store_timezone(self):
+    def test_store_timezone(self, setup_path):
         # GH2852
         # issue storing datetime.date with a timezone as it resets when read
         # back in a new timezone

         # original method
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             today = datetime.date(2013, 9, 10)
             df = DataFrame([1, 2, 3], index=[today, today, today])
@@ -5303,7 +5316,7 @@ def test_store_timezone(self):
             assert_frame_equal(result, df)

         # with tz setting
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:

             with set_timezone("EST5EDT"):
                 today = datetime.date(2013, 9, 10)
@@ -5315,7 +5328,7 @@ def test_store_timezone(self):

             assert_frame_equal(result, df)

-    def test_legacy_datetimetz_object(self, datapath):
+    def test_legacy_datetimetz_object(self, datapath, setup_path):
         # legacy from < 0.17.0
         # 8260
         expected = DataFrame(
@@ -5331,9 +5344,9 @@ def test_legacy_datetimetz_object(self, datapath):
             result = store["df"]
             assert_frame_equal(result, expected)

-    def test_dst_transitions(self):
+    def test_dst_transitions(self, setup_path):
         # make sure we are not failing on transitions
-        with ensure_clean_store(self.path) as store:
+        with ensure_clean_store(setup_path) as store:
             times = pd.date_range(
                 "2013-10-26 23:00",
                 "2013-10-27 01:00",
@@ -5349,7 +5362,7 @@ def test_dst_transitions(self):
             result = store.select("df")
             assert_frame_equal(result, df)

-    def test_read_with_where_tz_aware_index(self):
+    def test_read_with_where_tz_aware_index(self, setup_path):
         # GH 11926
         periods = 10
         dts = pd.date_range("20151201", periods=periods, freq="D", tz="UTC")
@@ -5357,13 +5370,13 @@ def test_read_with_where_tz_aware_index(self):
         expected = pd.DataFrame({"MYCOL": 0}, index=mi)

         key = "mykey"
-        with ensure_clean_path(self.path) as path:
+        with ensure_clean_path(setup_path) as path:
             with pd.HDFStore(path) as store:
                 store.append(key, expected, format="table", append=True)
             result = pd.read_hdf(path, key, where="DATE > 20151130")
             assert_frame_equal(result, expected)

-    def test_py2_created_with_datetimez(self, datapath):
+    def test_py2_created_with_datetimez(self, datapath, setup_path):
         # The test HDF5 file was created in Python 2, but could not be read in
         # Python 3.
         #

From c5bc9285293e7d5075ea9d14d728bced03924e30 Mon Sep 17 00:00:00 2001
From: Josiah Baker
Date: Tue, 1 Oct 2019 12:05:53 -0400
Subject: [PATCH 22/22] fix merge conflict
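
For reference: the hunks above drop the Base test class (with its self.path
attribute) from TestHDFComplexValues and TestTimezones and instead take the
temporary file name from a setup_path argument injected by pytest. The
fixture definition itself is not included in this excerpt, so the following
is only a minimal sketch of what the converted tests assume; the fixture
name matches the tests, but the uuid-based file-name scheme is an
illustrative assumption, not the PR's actual definition:

    import uuid

    import pytest


    @pytest.fixture
    def setup_path():
        """Unique HDF5 file name handed to each test that requests it."""
        # The ensure_clean_store / ensure_clean_path helpers used in the
        # tests open a store (or file) at the given path and delete it again
        # when the context manager exits, so all the fixture has to supply
        # is a collision-free name per test invocation.
        return "tmp.__{}__.h5".format(uuid.uuid4())

Any test that declares a setup_path parameter then receives a fresh path
automatically, which is what allows every self.path reference in the old
Base-derived classes to be removed.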