Skip to content

REF: de-duplicate get_conversion_factor #47770

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion pandas/_libs/tslibs/dtypes.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil
cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1
cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1
cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1

cdef dict attrname_to_abbrevs

Expand Down
81 changes: 6 additions & 75 deletions pandas/_libs/tslibs/dtypes.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@ cimport cython

from enum import Enum

from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
get_conversion_factor,
)


cdef class PeriodDtypeBase:
Expand Down Expand Up @@ -386,83 +389,11 @@ cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns)
"""
How many of the given time units fit into a single day?
"""
cdef:
int64_t day_units

if reso == NPY_DATETIMEUNIT.NPY_FR_ps:
# pico is the smallest unit for which we don't overflow, so
# we exclude femto and atto
day_units = 24 * 3600 * 1_000_000_000_000
elif reso == NPY_DATETIMEUNIT.NPY_FR_ns:
day_units = 24 * 3600 * 1_000_000_000
elif reso == NPY_DATETIMEUNIT.NPY_FR_us:
day_units = 24 * 3600 * 1_000_000
elif reso == NPY_DATETIMEUNIT.NPY_FR_ms:
day_units = 24 * 3600 * 1_000
elif reso == NPY_DATETIMEUNIT.NPY_FR_s:
day_units = 24 * 3600
elif reso == NPY_DATETIMEUNIT.NPY_FR_m:
day_units = 24 * 60
elif reso == NPY_DATETIMEUNIT.NPY_FR_h:
day_units = 24
elif reso == NPY_DATETIMEUNIT.NPY_FR_D:
day_units = 1
else:
raise NotImplementedError(reso)
return day_units
return get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, reso)


cpdef int64_t periods_per_second(NPY_DATETIMEUNIT reso) except? -1:
if reso == NPY_DATETIMEUNIT.NPY_FR_ns:
return 1_000_000_000
elif reso == NPY_DATETIMEUNIT.NPY_FR_us:
return 1_000_000
elif reso == NPY_DATETIMEUNIT.NPY_FR_ms:
return 1_000
elif reso == NPY_DATETIMEUNIT.NPY_FR_s:
return 1
else:
raise NotImplementedError(reso)


@cython.overflowcheck(True)
cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1:
"""
Find the factor by which we need to multiply to convert from from_unit to to_unit.
"""
if (
from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
):
raise ValueError("unit-less resolutions are not supported")
if from_unit > to_unit:
raise ValueError

if from_unit == to_unit:
return 1

if from_unit == NPY_DATETIMEUNIT.NPY_FR_W:
return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D:
return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h:
return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m:
return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
else:
raise ValueError(from_unit, to_unit)
return get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, reso)


cdef dict _reso_str_map = {
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ cpdef cnp.ndarray astype_overflowsafe(
cnp.dtype dtype, # ndarray[datetime64[anyunit]]
bint copy=*,
)
cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1

cdef bint cmp_dtstructs(npy_datetimestruct* left, npy_datetimestruct* right, int op)
cdef get_implementation_bounds(
Expand Down
41 changes: 41 additions & 0 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
cimport cython
from cpython.datetime cimport (
PyDateTime_DATE_GET_HOUR,
PyDateTime_DATE_GET_MICROSECOND,
Expand Down Expand Up @@ -450,3 +451,43 @@ cdef int op_to_op_code(op):
return Py_GE
if op is operator.gt:
return Py_GT


@cython.overflowcheck(True)
cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1:
"""
Find the factor by which we need to multiply to convert from from_unit to to_unit.
"""
if (
from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
):
raise ValueError("unit-less resolutions are not supported")
if from_unit > to_unit:
raise ValueError

if from_unit == to_unit:
return 1

if from_unit == NPY_DATETIMEUNIT.NPY_FR_W:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might be worth exploring a iterative solution in the future for perf maybe?

e.g.

factor = 1
while from_unit != to_unit:
    factor *= factor_map[from_unit] e.g. 7 (day to week)
    from_unit = factor_map_step[from_unit] e.g. `NPY_DATETIMEUNIT.NPY_FR_W`
return factor

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a branch where i've implemented what i expect would be the maximally-performant solution:

+cdef extern from *:
+    """
+    /*
+    # equiv:
+
+    def func(x, y):
+        if x == "B" or y == "B":
+            return -1
+        dt = np.datetime64(1, x)
+        return dt.astype(f"M8[{y}]").view("i8")
+
+    units = ["Y", "M", "W", "B", "D", "h", "m", "s", "ms", "us", "ns"]
+    mat = np.array([[func(x, y) for x in units] for y in units]).T
+    */
+
+    // must use npy typedef b/c int64_t is aliased in cython-generated c
+    // LL needed to avoid overflows
+    // see https://github.com/pandas-dev/pandas/pull/34416/
+    static npy_int64 npy_unit_conversion_matrix[11][11] = {
+        {1, 12, 52, -1, 365, 8760, 525600, 31536000, 31536000*1000LL, 31536000*1000000LL, 31536000*1000000000LL},
+        {0, 1, 4, -1, 31, 744, 44640, 2678400, 2678400*1000LL, 2678400*1000000LL, 2678400*1000000000LL},
+        {0, 0, 1, -1, 7, 168, 10080, 604800, 604800*1000LL, 604800*1000000LL, 604800*1000000000LL},
+        {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, // NPY_FR_B not used
+        {0, 0, 0, -1, 1, 24, 1440, 86400, 86400*1000, 86400*1000000LL, 86400*1000000000LL},
+        {0, 0, 0, -1, 0, 1, 60, 3600, 3600*1000, 3600*1000000LL, 3600*1000000000LL},
+        {0, 0, 0, -1, 0, 0, 1, 60, 60*1000, 60*1000000LL, 60*1000000000LL},
+        {0, 0, 0, -1, 0, 0, 0, 1, 1000, 1000000LL, 1000000000LL},
+        {0, 0, 0, -1, 0, 0, 0, 0, 1, 1000LL, 1000000LL},
+        {0, 0, 0, -1, 0, 0, 0, 0, 0, 1, 1000LL},
+        {0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 1LL},
+    };
+    """
+    int64_t npy_unit_conversion_matrix[11][11]
+
+
+cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1:
+    """
+    Find the factor by which we need to multiply to convert from from_unit to to_unit.
+    """
+    if (
+        from_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
+        or to_unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC
+    ):
+        raise ValueError("unit-less resolutions are not supported")
+    #if (
+    #    from_unit == NPY_DATETIMEUNIT.NPY_FR_B
+    #    or to_unit == NPY_DATETIMEUNIT.NPY_FR_B
+    #):
+    #    raise ValueError("NPY_FR_B resolution is not supported")
+
+    if from_unit > to_unit:
+        raise ValueError
+    if to_unit > NPY_DATETIMEUNIT.NPY_FR_ns or from_unit > NPY_DATETIMEUNIT.NPY_FR_ns:
+        # We don't bother with ps, fs, as
+        raise NotImplementedError(from_unit, to_unit)
+
+    return npy_unit_conversion_matrix[<Py_ssize_t>from_unit][<Py_ssize_t>to_unit]

but so far it doesn't appear to make much difference.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha. This current version is fairly clear already so IMO what's here in this PR is probably just worth keeping for the clarity

return 7 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_D, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_D:
return 24 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_h, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_h:
return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_m, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_m:
return 60 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_s, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_s:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ms, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ms:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_us, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_us:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ns, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ns:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_ps, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_ps:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_fs, to_unit)
elif from_unit == NPY_DATETIMEUNIT.NPY_FR_fs:
return 1000 * get_conversion_factor(NPY_DATETIMEUNIT.NPY_FR_as, to_unit)
else:
raise ValueError(from_unit, to_unit)
6 changes: 2 additions & 4 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,7 @@ from pandas._libs.tslibs.conversion cimport (
cast_from_unit,
precision_from_unit,
)
from pandas._libs.tslibs.dtypes cimport (
get_conversion_factor,
npy_unit_to_abbrev,
)
from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_NaT as NaT,
Expand All @@ -50,6 +47,7 @@ from pandas._libs.tslibs.np_datetime cimport (
NPY_FR_ns,
cmp_dtstructs,
cmp_scalar,
get_conversion_factor,
get_datetime64_unit,
get_timedelta64_value,
get_unit_from_dtype,
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ from pandas._libs.tslibs.conversion cimport (
maybe_localize_tso,
)
from pandas._libs.tslibs.dtypes cimport (
get_conversion_factor,
npy_unit_to_abbrev,
periods_per_day,
periods_per_second,
Expand Down Expand Up @@ -83,6 +82,7 @@ from pandas._libs.tslibs.np_datetime cimport (
NPY_FR_ns,
cmp_dtstructs,
cmp_scalar,
get_conversion_factor,
get_datetime64_unit,
get_datetime64_value,
get_unit_from_dtype,
Expand Down