diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index e77532b2fe432..39ed153376e66 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -276,6 +276,7 @@ Performance Improvements
 - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
 - Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
 - Improved performance of ``Series.isin`` for datetimelike/integer Series (:issue:`10287`)
+- 20x improvement in ``concat`` of Categoricals when categories are identical (:issue:`10587`)
 
 .. _whatsnew_0170.bug_fixes:
 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index edd4a532cf8f5..7aa9145d8aed8 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -1715,18 +1715,20 @@ def _convert_to_list_like(list_like):
     return [list_like]
 
 def _concat_compat(to_concat, axis=0):
-    """
-    provide concatenation of an object/categorical array of arrays each of
-    which is a single dtype
+    """Concatenate an object/categorical array of arrays, each of which is a
+    single dtype
 
     Parameters
     ----------
     to_concat : array of arrays
-    axis : axis to provide concatenation - in the current impl this is always 0, e.g. we only have 1-d categoricals
+    axis : int
+        Axis to provide concatenation in the current implementation this is
+        always 0, e.g. we only have 1D categoricals
 
     Returns
     -------
-    a single array, preserving the combined dtypes
+    Categorical
+        A single array, preserving the combined dtypes
     """
 
     def convert_categorical(x):
@@ -1735,31 +1737,34 @@ def convert_categorical(x):
             return x.get_values()
         return x.ravel()
 
-    typs = get_dtype_kinds(to_concat)
-    if not len(typs-set(['object','category'])):
-
-        # we only can deal with object & category types
-        pass
-
-    else:
-
+    if get_dtype_kinds(to_concat) - set(['object', 'category']):
         # convert to object type and perform a regular concat
         from pandas.core.common import _concat_compat
-        return _concat_compat([ np.array(x,copy=False).astype('object') for x in to_concat ],axis=0)
+        return _concat_compat([np.array(x, copy=False, dtype=object)
+                               for x in to_concat], axis=0)
 
-    # we could have object blocks and categorical's here
-    # if we only have a single cateogoricals then combine everything
+    # we could have object blocks and categoricals here
+    # if we only have a single categoricals then combine everything
     # else its a non-compat categorical
-    categoricals = [ x for x in to_concat if is_categorical_dtype(x.dtype) ]
-    objects = [ x for x in to_concat if is_object_dtype(x.dtype) ]
+    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]
 
     # validate the categories
-    categories = None
-    for x in categoricals:
-        if categories is None:
-            categories = x.categories
-        if not categories.equals(x.categories):
+    categories = categoricals[0]
+    rawcats = categories.categories
+    for x in categoricals[1:]:
+        if not categories.is_dtype_equal(x):
             raise ValueError("incompatible categories in categorical concat")
 
-    # concat them
-    return Categorical(np.concatenate([ convert_categorical(x) for x in to_concat ],axis=0), categories=categories)
+    # we've already checked that all categoricals are the same, so if their
+    # length is equal to the input then we have all the same categories
+    if len(categoricals) == len(to_concat):
+        # concating numeric types is much faster than concating object types
+        # and fastpath takes a shorter path through the constructor
+        return Categorical(np.concatenate([x.codes for x in to_concat], axis=0),
+                           rawcats,
+                           ordered=categoricals[0].ordered,
+                           fastpath=True)
+    else:
+        concatted = np.concatenate(list(map(convert_categorical, to_concat)),
+                                   axis=0)
+        return Categorical(concatted, rawcats)
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
index 6b7909086403e..37d6cb9c0d5b6 100644
--- a/pandas/core/internals.py
+++ b/pandas/core/internals.py
@@ -4388,7 +4388,11 @@ def is_null(self):
         # Usually it's enough to check but a small fraction of values to see if
         # a block is NOT null, chunks should help in such cases. 1000 value
         # was chosen rather arbitrarily.
-        values_flat = self.block.values.ravel()
+        values = self.block.values
+        if self.block.is_categorical:
+            values_flat = values.categories
+        else:
+            values_flat = values.ravel()
         total_len = values_flat.shape[0]
         chunk_len = max(total_len // 40, 1000)
         for i in range(0, total_len, chunk_len):
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
index 27ba6f953306d..44be74b78d6bb 100644
--- a/pandas/lib.pyx
+++ b/pandas/lib.pyx
@@ -21,6 +21,7 @@ from cpython cimport (PyDict_New, PyDict_GetItem, PyDict_SetItem,
                       PyTuple_SetItem,
                       PyTuple_New,
                       PyObject_SetAttrString,
+                      PyObject_RichCompareBool,
                       PyBytes_GET_SIZE,
                       PyUnicode_GET_SIZE)
 
@@ -372,19 +373,19 @@ def isnullobj2d_old(ndarray[object, ndim=2] arr):
                 result[i, j] = 1
     return result.view(np.bool_)
 
-def list_to_object_array(list obj):
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+cpdef ndarray[object] list_to_object_array(list obj):
     '''
     Convert list to object ndarray. Seriously can't believe I had to write
    this function
     '''
     cdef:
-        Py_ssize_t i, n
-        ndarray[object] arr
-
-    n = len(obj)
-    arr = np.empty(n, dtype=object)
+        Py_ssize_t i, n = len(obj)
+        ndarray[object] arr = np.empty(n, dtype=object)
 
-    for i from 0 <= i < n:
+    for i in range(n):
         arr[i] = obj[i]
 
     return arr
@@ -732,28 +733,25 @@ def scalar_compare(ndarray[object] values, object val, object op):
 
     return result.view(bool)
 
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def array_equivalent_object(ndarray[object] left, ndarray[object] right):
+cpdef bint array_equivalent_object(object[:] left, object[:] right):
     """ perform an element by element comparion on 1-d object arrays
         taking into account nan positions """
-    cdef Py_ssize_t i, n
-    cdef object x, y
+    cdef:
+        Py_ssize_t i, n = left.shape[0]
+        object x, y
 
-    n = len(left)
-    for i from 0 <= i < n:
+    for i in range(n):
         x = left[i]
         y = right[i]
 
         # we are either not equal or both nan
         # I think None == None will be true here
-        if cpython.PyObject_RichCompareBool(x, y, cpython.Py_EQ):
-            continue
-        elif _checknull(x) and _checknull(y):
-            continue
-        else:
+        if not (PyObject_RichCompareBool(x, y, cpython.Py_EQ) or
+                _checknull(x) and _checknull(y)):
             return False
-
     return True
 
 
diff --git a/vb_suite/categoricals.py b/vb_suite/categoricals.py
new file mode 100644
index 0000000000000..cb33f1bb6c0b1
--- /dev/null
+++ b/vb_suite/categoricals.py
@@ -0,0 +1,16 @@
+from vbench.benchmark import Benchmark
+from datetime import datetime
+
+common_setup = """from pandas_vb_common import *
+"""
+
+#----------------------------------------------------------------------
+# Series constructors
+
+setup = common_setup + """
+s = pd.Series(list('aabbcd') * 1000000).astype('category')
+"""
+
+concat_categorical = \
+    Benchmark("concat([s, s])", setup=setup, name='concat_categorical',
+              start_date=datetime(year=2015, month=7, day=15))