pandas-dev · mroeschke · Jan 31, 2023 · Jan 31, 2023
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -238,8 +238,6 @@ cdef class IndexEngine:
         return self.unique == 1
 
     cdef _do_unique_check(self):
-
-        # this de-facto the same
         self._ensure_mapping_populated()
 
     @property

diff --git a/pandas/_libs/indexing.pyi b/pandas/_libs/indexing.pyi
@@ -9,7 +9,7 @@ _IndexingMixinT = TypeVar("_IndexingMixinT", bound=IndexingMixin)
 
 class NDFrameIndexerBase(Generic[_IndexingMixinT]):
     name: str
-    # in practise obj is either a DataFrame or a Series
+    # in practice obj is either a DataFrame or a Series
     obj: _IndexingMixinT
 
     def __init__(self, name: str, obj: _IndexingMixinT) -> None: ...

diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx
@@ -33,7 +33,6 @@ from pandas._libs.util cimport (
 @cython.final
 @cython.freelist(32)
 cdef class BlockPlacement:
-    # __slots__ = '_as_slice', '_as_array', '_len'
     cdef:
         slice _as_slice
         ndarray _as_array  # Note: this still allows `None`; will be intp_t
@@ -621,7 +620,7 @@ cdef class NumpyBlock(SharedBlock):
         public ndarray values
 
     def __cinit__(self, ndarray values, BlockPlacement placement, int ndim):
-        # set values here the (implicit) call to SharedBlock.__cinit__ will
+        # set values here; the (implicit) call to SharedBlock.__cinit__ will
         #  set placement and ndim
         self.values = values
 
@@ -643,7 +642,7 @@ cdef class NDArrayBackedBlock(SharedBlock):
         NDArrayBacked values
 
     def __cinit__(self, NDArrayBacked values, BlockPlacement placement, int ndim):
-        # set values here the (implicit) call to SharedBlock.__cinit__ will
+        # set values here; the (implicit) call to SharedBlock.__cinit__ will
         #  set placement and ndim
         self.values = values
 
@@ -662,7 +661,7 @@ cdef class Block(SharedBlock):
         public object values
 
     def __cinit__(self, object values, BlockPlacement placement, int ndim):
-        # set values here the (implicit) call to SharedBlock.__cinit__ will
+        # set values here; the (implicit) call to SharedBlock.__cinit__ will
         #  set placement and ndim
         self.values = values
 

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -492,7 +492,7 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-# Can add const once https://github.com/cython/cython/issues/1772 resolved
+# TODO(cython3): Can add const once cython#1772 is resolved
 def has_infs(floating[:] arr) -> bool:
     cdef:
         Py_ssize_t i, n = len(arr)

diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx
@@ -46,6 +46,7 @@ from libc.string cimport (
 
 
 cdef extern from "Python.h":
+    # TODO(cython3): get this from cpython.unicode
     object PyUnicode_FromString(char *v)
 
 
@@ -453,14 +454,12 @@ cdef class TextReader:
 
         self.skipfooter = skipfooter
 
-        # suboptimal
         if usecols is not None:
             self.has_usecols = 1
             # GH-20558, validate usecols at higher level and only pass clean
             # usecols into TextReader.
             self.usecols = usecols
 
-        # TODO: XXX?
         if skipfooter > 0:
             self.parser.on_bad_lines = SKIP
 
@@ -501,7 +500,6 @@ cdef class TextReader:
         self.dtype = dtype
         self.use_nullable_dtypes = use_nullable_dtypes
 
-        # XXX
         self.noconvert = set()
 
         self.index_col = index_col
@@ -761,7 +759,7 @@ cdef class TextReader:
         # Corner case, not enough lines in the file
         if self.parser.lines < data_line + 1:
             field_count = len(header[0])
-        else:  # not self.has_usecols:
+        else:
 
             field_count = self.parser.line_fields[data_line]
 
@@ -1409,6 +1407,8 @@ def _maybe_upcast(arr, use_nullable_dtypes: bool = False):
     The casted array.
     """
     if is_extension_array_dtype(arr.dtype):
+        # TODO: the docstring says arr is an ndarray, in which case this branch
+        #  cannot be reached. Is the docstring incorrect?
         return arr
 
     na_value = na_values[arr.dtype]

diff --git a/pandas/_libs/reduction.pyi b/pandas/_libs/reduction.pyi
@@ -1,8 +1,6 @@
 from typing import Any
 
-import numpy as np
+from pandas._typing import DtypeObj
 
-from pandas._typing import ExtensionDtype
-
-def check_result_array(obj: object, dtype: np.dtype | ExtensionDtype) -> None: ...
+def check_result_array(obj: object, dtype: DtypeObj) -> None: ...
 def extract_result(res: object) -> Any: ...
diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx
@@ -301,9 +301,6 @@ cdef class BlockIndex(SparseIndex):
         self.nblocks = np.int32(len(self.blocs))
         self.npoints = self.blengths.sum()
 
-        # self.block_start = blocs
-        # self.block_end = blocs + blengths
-
         self.check_integrity()
 
     def __reduce__(self):

diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in
@@ -137,16 +137,16 @@ cdef tuple block_op_{{opname}}_{{dtype}}({{dtype}}_t[:] x_,
                                                 {{dtype}}_t[:] y_,
                                                 BlockIndex yindex,
                                                 {{dtype}}_t yfill):
-    '''
+    """
     Binary operator on BlockIndex objects with fill values
-    '''
+    """
 
     cdef:
         BlockIndex out_index
-        Py_ssize_t xi = 0, yi = 0, out_i = 0 # fp buf indices
-        int32_t xbp = 0, ybp = 0 # block positions
+        Py_ssize_t xi = 0, yi = 0, out_i = 0  # fp buf indices
+        int32_t xbp = 0, ybp = 0  # block positions
         int32_t xloc, yloc
-        Py_ssize_t xblock = 0, yblock = 0 # block numbers
+        Py_ssize_t xblock = 0, yblock = 0  # block numbers
 
         {{dtype}}_t[:] x, y
         ndarray[{{rdtype}}_t, ndim=1] out

diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx
@@ -115,7 +115,7 @@ def format_array_from_datetime(
 
     Parameters
     ----------
-    values : a 1-d i8 array
+    values : ndarray[int64_t], arbitrary ndim
     tz : tzinfo or None, default None
     format : str or None, default None
           a strftime capable string
@@ -260,9 +260,9 @@ def array_with_unit_to_datetime(
     cdef:
         Py_ssize_t i, n=len(values)
         int64_t mult
-        bint is_ignore = errors=="ignore"
-        bint is_coerce = errors=="coerce"
-        bint is_raise = errors=="raise"
+        bint is_ignore = errors == "ignore"
+        bint is_coerce = errors == "coerce"
+        bint is_raise = errors == "raise"
         ndarray[int64_t] iresult
         tzinfo tz = None
         float fval
@@ -446,9 +446,9 @@ cpdef array_to_datetime(
         npy_datetimestruct dts
         bint utc_convert = bool(utc)
         bint seen_datetime_offset = False
-        bint is_raise = errors=="raise"
-        bint is_ignore = errors=="ignore"
-        bint is_coerce = errors=="coerce"
+        bint is_raise = errors == "raise"
+        bint is_ignore = errors == "ignore"
+        bint is_coerce = errors == "coerce"
         bint is_same_offsets
         _TSObject _ts
         float tz_offset

diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx
@@ -53,7 +53,6 @@ from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
 from pandas._libs.tslibs.timezones cimport (
     get_utcoffset,
     is_utc,
-    maybe_get_tz,
 )
 from pandas._libs.tslibs.util cimport (
     is_datetime64_object,
@@ -124,7 +123,7 @@ cdef int64_t cast_from_unit(object ts, str unit) except? -1:
         dt64obj = np.datetime64(ts, unit)
         return get_datetime64_nanos(dt64obj, NPY_FR_ns)
 
-    # cast the unit, multiply base/frace separately
+    # cast the unit, multiply base/frac separately
     # to avoid precision issues from float -> int
     try:
         base = <int64_t>ts
@@ -380,7 +379,6 @@ cdef _TSObject convert_datetime_to_tsobject(
     obj.creso = reso
     obj.fold = ts.fold
     if tz is not None:
-        tz = maybe_get_tz(tz)
 
         if ts.tzinfo is not None:
             # Convert the current timezone to the passed timezone

diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx
@@ -2612,11 +2612,7 @@ class Period(_Period):
 
                 if freq is None and ordinal != NPY_NAT:
                     # Skip NaT, since it doesn't have a resolution
-                    try:
-                        freq = attrname_to_abbrevs[reso]
-                    except KeyError:
-                        raise ValueError(f"Invalid frequency or could not "
-                                         f"infer: {reso}")
+                    freq = attrname_to_abbrevs[reso]
                     freq = to_offset(freq)
 
         elif PyDateTime_Check(value):

diff --git a/pandas/_libs/tslibs/timedeltas.pxd b/pandas/_libs/tslibs/timedeltas.pxd
@@ -10,7 +10,6 @@ cpdef int64_t delta_to_nanoseconds(
 ) except? -1
 cdef convert_to_timedelta64(object ts, str unit)
 cdef bint is_any_td_scalar(object obj)
-cdef object ensure_td64ns(object ts)
 
 
 cdef class _Timedelta(timedelta):

diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
@@ -691,10 +691,6 @@ cdef timedelta_from_spec(object number, object frac, object unit):
             "values and are not supported."
         )
 
-    if unit == "M":
-        # To parse ISO 8601 string, 'M' should be treated as minute,
-        # not month
-        unit = "m"
     unit = parse_timedelta_unit(unit)
 
     n = "".join(number) + "." + "".join(frac)

diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx
@@ -545,7 +545,7 @@ cdef _get_utc_bounds_zoneinfo(ndarray vals, tz, NPY_DATETIMEUNIT creso):
 
         pandas_datetime_to_datetimestruct(val, creso, &dts)
         # casting to pydatetime drops nanoseconds etc, which we will
-        #  need to re-add later as 'extra''
+        #  need to re-add later as 'extra'
         extra = (dts.ps // 1000) * (pps // 1_000_000_000)
 
         dt = datetime_new(dts.year, dts.month, dts.day, dts.hour,

diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd
@@ -2,15 +2,6 @@
 from cpython.object cimport PyTypeObject
 
 
-cdef extern from *:
-    """
-    PyObject* char_to_string(const char* data) {
-        return PyUnicode_FromString(data);
-    }
-    """
-    object char_to_string(const char* data)
-
-
 cdef extern from "Python.h":
     # Note: importing extern-style allows us to declare these as nogil
     # functions, whereas `from cpython cimport` does not.

diff --git a/pandas/_libs/writers.pyi b/pandas/_libs/writers.pyi
@@ -17,5 +17,4 @@ def word_len(val: object) -> int: ...
 def string_array_replace_from_nan_rep(
     arr: np.ndarray,  # np.ndarray[object, ndim=1]
     nan_rep: object,
-    replace: object = ...,
 ) -> None: ...
diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx
@@ -161,15 +161,13 @@ cpdef inline Py_ssize_t word_len(object val):
 def string_array_replace_from_nan_rep(
     ndarray[object, ndim=1] arr,
     object nan_rep,
-    object replace=np.nan
 ) -> None:
     """
-    Replace the values in the array with 'replacement' if
-    they are 'nan_rep'. Return the same array.
+    Replace the values in the array with np.nan if they are nan_rep.
     """
     cdef:
         Py_ssize_t length = len(arr), i = 0
 
     for i in range(length):
         if arr[i] == nan_rep:
-            arr[i] = replace
+            arr[i] = np.nan
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
@@ -2030,17 +2030,11 @@ def _sequence_to_dt64ns(
             )
             if tz and inferred_tz:
                 #  two timezones: convert to intended from base UTC repr
-                if data.dtype == "i8":
-                    # GH#42505
-                    # by convention, these are _already_ UTC, e.g
-                    return data.view(DT64NS_DTYPE), tz, None
-
-                if timezones.is_utc(tz):
-                    # Fastpath, avoid copy made in tzconversion
-                    utc_vals = data.view("i8")
-                else:
-                    utc_vals = tz_convert_from_utc(data.view("i8"), tz)
-                data = utc_vals.view(DT64NS_DTYPE)
+                assert data.dtype == "i8"
+                # GH#42505
+                # by convention, these are _already_ UTC
+                return data.view(DT64NS_DTYPE), tz, None
+
             elif inferred_tz:
                 tz = inferred_tz
 

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -134,7 +134,7 @@ def __eq__(self, other: Any) -> bool:
 
     def __hash__(self) -> int:
         # for python>=3.10, different nan objects have different hashes
-        # we need  to avoid that und thus use hash function with old behavior
+        # we need to avoid that and thus use hash function with old behavior
         return object_hash(tuple(getattr(self, attr) for attr in self._metadata))
 
     def __ne__(self, other: Any) -> bool:

diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -28,8 +28,8 @@
     tz_compare,
 )
 from pandas._libs.tslibs.dtypes import (
-    NpyDatetimeUnit,
     PeriodDtypeBase,
+    abbrev_to_npy_unit,
 )
 from pandas._typing import (
     Dtype,
@@ -722,13 +722,7 @@ def _creso(self) -> int:
         """
         The NPY_DATETIMEUNIT corresponding to this dtype's resolution.
         """
-        reso = {
-            "s": NpyDatetimeUnit.NPY_FR_s,
-            "ms": NpyDatetimeUnit.NPY_FR_ms,
-            "us": NpyDatetimeUnit.NPY_FR_us,
-            "ns": NpyDatetimeUnit.NPY_FR_ns,
-        }[self.unit]
-        return reso.value
+        return abbrev_to_npy_unit(self.unit)
 
     @property
     def unit(self) -> str_type:

@@ -503,6 +503,7 @@ def _cython_transform(
                 "transform", obj._values, how, axis, **kwargs
             )
         except NotImplementedError as err:
+            # e.g. test_groupby_raises_string
             raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err
 
         return obj._constructor(result, index=self.obj.index, name=obj.name)