From ea06dca0945aed9b453e5cc36bc3a61523bb4190 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 31 Jul 2018 11:12:52 -0700 Subject: [PATCH 1/6] make public is_nan instead of private _checknull --- pandas/_libs/hashing.pyx | 9 ++++----- pandas/_libs/interval.pyx | 2 +- pandas/_libs/lib.pyx | 4 ++-- pandas/_libs/missing.pyx | 8 ++++---- pandas/_libs/ops.pyx | 10 +++++----- pandas/_libs/src/inference.pyx | 8 ++++---- pandas/_libs/tslibs/nattype.pyx | 5 ++--- pandas/_libs/tslibs/timedeltas.pyx | 2 +- pandas/_libs/tslibs/util.pxd | 15 +++++++++++++-- 9 files changed, 36 insertions(+), 27 deletions(-) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index ff92ee306288a..54a7c97a5d7dd 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -3,14 +3,13 @@ # at https://github.com/veorq/SipHash import cython +from cpython cimport PyBytes_Check, PyUnicode_Check +from libc.stdlib cimport malloc, free import numpy as np from numpy cimport ndarray, uint8_t, uint32_t, uint64_t -from util cimport _checknull -from cpython cimport (PyBytes_Check, - PyUnicode_Check) -from libc.stdlib cimport malloc, free +from util cimport is_nan DEF cROUNDS = 2 DEF dROUNDS = 4 @@ -65,7 +64,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'): data = val elif PyUnicode_Check(val): data = val.encode(encoding) - elif _checknull(val): + elif val is None or is_nan(val): # null, stringify and encode data = str(val).encode(encoding) diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 40fdfedaa23d5..f5c3a0a6bc47c 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -391,7 +391,7 @@ cpdef intervals_to_interval_bounds(ndarray intervals, for i in range(len(intervals)): interval = intervals[i] - if util._checknull(interval): + if interval is None or util.is_nan(interval): left[i] = np.nan right[i] = np.nan continue diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4cc119a700ca0..d80b5fd2bd0b9 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -34,7 +34,7 @@ from missing cimport checknull cimport util cdef int64_t NPY_NAT = util.get_nat() -from util cimport is_array, _checknull +from util cimport is_array, is_nan def values_from_object(object o): @@ -429,7 +429,7 @@ cpdef bint array_equivalent_object(object[:] left, object[:] right): # we are either not equal or both nan # I think None == None will be true here if not (PyObject_RichCompareBool(x, y, Py_EQ) or - _checknull(x) and _checknull(y)): + (x is None or is_nan(x)) and (y is None or is_nan(y))): return False return True diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 6161a55b22342..36b8b3806d7e3 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -74,7 +74,7 @@ cpdef bint checknull(object val): elif util.is_array(val): return False else: - return util._checknull(val) + return val is None or util.is_nan(val) cpdef bint checknull_old(object val): @@ -297,7 +297,7 @@ cpdef bint isneginf_scalar(object val): cdef inline bint is_null_datetime64(v): # determine if we have a null for a datetime (or integer versions), # excluding np.timedelta64('nat') - if util._checknull(v): + if v is None or util.is_nan(v): return True elif v is NaT: return True @@ -309,7 +309,7 @@ cdef inline bint is_null_datetime64(v): cdef inline bint is_null_timedelta64(v): # determine if we have a null for a timedelta (or integer versions), # excluding np.datetime64('nat') - if util._checknull(v): + if v is None or util.is_nan(v): return True elif v is NaT: return True @@ -321,7 +321,7 @@ cdef inline bint is_null_timedelta64(v): cdef inline bint is_null_period(v): # determine if we have a null for a Period (or integer versions), # excluding np.datetime64('nat') and np.timedelta64('nat') - if util._checknull(v): + if v is None or util.is_nan(v): return True elif v is NaT: return True diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 148018ece20e2..20df169b0bc15 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -13,7 +13,7 @@ import numpy as np from numpy cimport ndarray, uint8_t -from util cimport UINT8_MAX, _checknull +from util cimport UINT8_MAX, is_nan from missing cimport checknull @@ -190,13 +190,13 @@ def scalar_binop(ndarray[object] values, object val, object op): object x result = np.empty(n, dtype=object) - if _checknull(val): + if val is None or is_nan(val): result.fill(val) return result for i in range(n): x = values[i] - if _checknull(x): + if x is None or is_nan(x): result[i] = x else: result[i] = op(x, val) @@ -237,9 +237,9 @@ def vec_binop(ndarray[object] left, ndarray[object] right, object op): try: result[i] = op(x, y) except TypeError: - if _checknull(x): + if x is None or is_nan(x): result[i] = x - elif _checknull(y): + elif y is None or is_nan(y): result[i] = y else: raise diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index 8cf69057a7e74..99846c2771ebe 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -393,7 +393,7 @@ def infer_dtype(object value, bint skipna=False): # do not use is_nul_datetimelike to keep # np.datetime64('nat') and np.timedelta64('nat') - if util._checknull(val): + if val is None or util.is_nan(val): pass elif val is NaT: seen_pdnat = True @@ -522,7 +522,7 @@ cpdef object infer_datetimelike_array(object arr): if len(objs) == 3: break - elif util._checknull(v): + elif v is None or util.is_nan(v): # nan or None pass elif v is NaT: @@ -660,7 +660,7 @@ cdef class Validator: ) cdef bint is_valid_null(self, object value) except -1: - return util._checknull(value) + return value is None or util.is_nan(value) cdef bint is_array_typed(self) except -1: return False @@ -828,7 +828,7 @@ cdef class TemporalValidator(Validator): cdef inline bint is_valid_skipna(self, object value) except -1: cdef: bint is_typed_null = self.is_valid_null(value) - bint is_generic_null = util._checknull(value) + bint is_generic_null = value is None or util.is_nan(value) self.generic_null_count += is_typed_null and is_generic_null return self.is_value_typed(value) or is_typed_null or is_generic_null diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 2fe8fab2e2e19..6b079abf81185 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -586,8 +586,7 @@ NaT = NaTType() cdef inline bint checknull_with_nat(object val): """ utility to check if a value is a nat or not """ - return val is None or ( - PyFloat_Check(val) and val != val) or val is NaT + return val is None or util.is_nan(val) or val is NaT cdef inline bint is_null_datetimelike(object val): @@ -602,7 +601,7 @@ cdef inline bint is_null_datetimelike(object val): ------- null_datetimelike : bool """ - if util._checknull(val): + if val is None or util.is_nan(val): return True elif val is NaT: return True diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index f7a6cf0c6dafc..c4a9555e1a148 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1368,7 +1368,7 @@ class Timedelta(_Timedelta): '{op}'.format(dtype=other.dtype, op='__floordiv__')) - elif is_float_object(other) and util._checknull(other): + elif is_float_object(other) and util.is_nan(other): # i.e. np.nan return NotImplemented diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 624ed7ced2654..7576f9bc6f9b0 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -228,5 +228,16 @@ cdef inline bint is_offset_object(object val): return getattr(val, '_typ', None) == "dateoffset" -cdef inline bint _checknull(object val): - return val is None or (PyFloat_Check(val) and val != val) +cdef inline bint is_nan(object val): + """ + Check if val is a Not-A-Number float, including float('NaN') and np.nan. + + Parameters + ---------- + val : object + + Returns + ------- + is_nan : bool + """ + return isinstance(val, float) and val != val From c177e206ef1dc0f2b636820982e6fed3b8d72997 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 31 Jul 2018 11:31:01 -0700 Subject: [PATCH 2/6] cleanup non-py syntax --- pandas/_libs/parsers.pyx | 9 +++++---- pandas/_libs/writers.pyx | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a24e2cdd99f6f..fba7f210b34a1 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -53,7 +53,7 @@ from pandas.core.dtypes.common import ( pandas_dtype) from pandas.core.arrays import Categorical from pandas.core.dtypes.concat import union_categoricals -import pandas.io.common as com +import pandas.io.common as icom from pandas.errors import (ParserError, DtypeWarning, EmptyDataError, ParserWarning) @@ -665,7 +665,8 @@ cdef class TextReader: if b'utf-16' in (self.encoding or b''): # we need to read utf-16 through UTF8Recoder. # if source is utf-16, convert source to utf-8 by UTF8Recoder. - source = com.UTF8Recoder(source, self.encoding.decode('utf-8')) + source = icom.UTF8Recoder(source, + self.encoding.decode('utf-8')) self.encoding = b'utf-8' self.c_encoding = self.encoding @@ -1356,7 +1357,7 @@ cdef asbytes(object o): # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', -_NA_VALUES = _ensure_encoded(list(com._NA_VALUES)) +_NA_VALUES = _ensure_encoded(list(icom._NA_VALUES)) def _maybe_upcast(arr): @@ -2247,7 +2248,7 @@ def sanitize_objects(ndarray[object] values, set na_values, n = len(values) onan = np.nan - for i from 0 <= i < n: + for i in range(n): val = values[i] if (convert_empty and val == '') or (val in na_values): values[i] = onan diff --git a/pandas/_libs/writers.pyx b/pandas/_libs/writers.pyx index 041eb59812ae3..796f4b754857e 100644 --- a/pandas/_libs/writers.pyx +++ b/pandas/_libs/writers.pyx @@ -163,7 +163,7 @@ def string_array_replace_from_nan_rep( if replace is None: replace = np.nan - for i from 0 <= i < length: + for i in range(length): if arr[i] == nan_rep: arr[i] = replace From e3f22011c3c70f52c1d12a7207cc9b26c7bcf0d7 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 31 Jul 2018 11:43:31 -0700 Subject: [PATCH 3/6] fixup missed usage --- pandas/_libs/missing.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx index 36b8b3806d7e3..e9c3cf12eb328 100644 --- a/pandas/_libs/missing.pyx +++ b/pandas/_libs/missing.pyx @@ -113,7 +113,7 @@ cpdef bint checknull_old(object val): elif util.is_array(val): return False else: - return util._checknull(val) + return val is None or util.is_nan(val) cdef inline bint _check_none_nan_inf_neginf(object val): From 6905ee87da8be841fa0f578e984644487cf2c1d8 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 31 Jul 2018 11:45:01 -0700 Subject: [PATCH 4/6] Cleanup import and whitespace --- pandas/_libs/skiplist.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/skiplist.pyx b/pandas/_libs/skiplist.pyx index 23836ef7f4de9..eec0457fc4caf 100644 --- a/pandas/_libs/skiplist.pyx +++ b/pandas/_libs/skiplist.pyx @@ -5,6 +5,7 @@ # Link: http://code.activestate.com/recipes/576930/ # Cython version: Wes McKinney +from random import random from libc.math cimport log @@ -17,8 +18,6 @@ cdef double Log2(double x): return log(x) / log(2.) -from random import random - # TODO: optimize this, make less messy cdef class Node: @@ -32,9 +31,11 @@ cdef class Node: self.next = next self.width = width + # Singleton terminator node NIL = Node(np.inf, [], []) + cdef class IndexableSkiplist: """ Sorted collection supporting O(lg n) insertion, removal, and From 01e79d2138bb6e2a4833948808aae394325727dc Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 1 Aug 2018 17:54:28 -0700 Subject: [PATCH 5/6] have is_nan check for np.float_ --- pandas/_libs/tslibs/util.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/util.pxd b/pandas/_libs/tslibs/util.pxd index 7576f9bc6f9b0..0470202ee7d98 100644 --- a/pandas/_libs/tslibs/util.pxd +++ b/pandas/_libs/tslibs/util.pxd @@ -240,4 +240,4 @@ cdef inline bint is_nan(object val): ------- is_nan : bool """ - return isinstance(val, float) and val != val + return is_float_object(val) and val != val From 70edf93d2023145f02bdcecb825821e769a63562 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Thu, 2 Aug 2018 11:13:24 -0700 Subject: [PATCH 6/6] fix segfaults --- pandas/_libs/hashing.pyx | 3 ++- pandas/_libs/interval.pyx | 1 + pandas/_libs/ops.pyx | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx index 2fee97cbc2ea7..a9775d3950187 100644 --- a/pandas/_libs/hashing.pyx +++ b/pandas/_libs/hashing.pyx @@ -7,7 +7,8 @@ from cpython cimport PyBytes_Check, PyUnicode_Check from libc.stdlib cimport malloc, free import numpy as np -from numpy cimport uint8_t, uint32_t, uint64_t +from numpy cimport uint8_t, uint32_t, uint64_t, import_array +import_array() from util cimport is_nan diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index f5c3a0a6bc47c..22153b58cc49b 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -12,6 +12,7 @@ from numpy cimport ndarray cimport util +util.import_array() from tslibs import Timestamp from tslibs.timezones cimport tz_compare diff --git a/pandas/_libs/ops.pyx b/pandas/_libs/ops.pyx index 20df169b0bc15..006bf0cdd3269 100644 --- a/pandas/_libs/ops.pyx +++ b/pandas/_libs/ops.pyx @@ -10,7 +10,8 @@ cimport cython from cython cimport Py_ssize_t import numpy as np -from numpy cimport ndarray, uint8_t +from numpy cimport ndarray, uint8_t, import_array +import_array() from util cimport UINT8_MAX, is_nan