Skip to content

Commit ba0f0c6

Browse files
committed
move int8/16/32 type conversions to index_class_helper.pxi.in
1 parent 7960275 commit ba0f0c6

File tree

5 files changed

+39
-16
lines changed

5 files changed

+39
-16
lines changed

doc/source/whatsnew/v0.24.0.txt

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -497,15 +497,17 @@ Removal of prior version deprecations/changes
497497
Performance Improvements
498498
~~~~~~~~~~~~~~~~~~~~~~~~
499499

500-
- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`,
501-
both when indexing by label (using .loc) and position(.iloc).
502-
Likewise, slicing a ``CategoricalIndex`` itself (i.e. ``ci[100:200]``) shows similar speed improvements (:issue:`21659`)
503-
- Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`)
504-
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
505-
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`)
500+
- Slicing Series and DataFrame with a monotonically increasing :class:`CategoricalIndex`
501+
is now very fast and has speed comparable to slicing with Int64Index. Memory
502+
consumption from such slicing operations is likewise much lower.
503+
The speed increase applies both when indexing by label (using ``.loc``) and by position (using ``.iloc``) (:issue:`20395`)
504+
- Slicing a ``CategoricalIndex`` itself (i.e. ``ci[1000:2000]``) shows similar speed improvements as above (:issue:`21659`)
506505
- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex`
507506
(i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains`
508507
is likewise much faster (:issue:`21369`, :issue:`21508`)
508+
- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`)
509+
- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`)
510+
- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`,:issue:`21606`)
509511
- Improved performance of :meth:`HDFStore.groups` (and dependent functions like
510512
:meth:`~HDFStore.keys`). (i.e. ``x in store`` checks are much faster)
511513
(:issue:`21372`)

pandas/_libs/algos_common_helper.pxi.in

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,17 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
2222
{{py:
2323

2424
# name, c_type, dtype, can_hold_na, nogil
25-
dtypes = [('float64', 'float64_t', 'np.float64', True, True),
25+
dtypes = [('object', 'object', 'object', True, False),
26+
('float64', 'float64_t', 'np.float64', True, True),
2627
('float32', 'float32_t', 'np.float32', True, True),
27-
('object', 'object', 'object', True, False),
2828
('int64', 'int64_t', 'np.int64', False, True),
29-
('uint64', 'uint64_t', 'np.uint64', False, True),
3029
('int32', 'int32_t', 'np.int32', False, True),
3130
('int16', 'int16_t', 'np.int16', False, True),
3231
('int8', 'int8_t', 'np.int8', False, True),
32+
('uint64', 'uint64_t', 'np.uint64', False, True),
33+
('uint32', 'uint32_t', 'np.uint32', False, True),
34+
('uint16', 'uint16_t', 'np.uint16', False, True),
35+
('uint8', 'uint8_t', 'np.uint8', False, True),
3336
('bool', 'uint8_t', 'np.bool', False, True)]
3437

3538
def get_dispatch(dtypes):

pandas/_libs/index.pyx

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,10 @@ from cpython.slice cimport PySlice_Check
88

99
import numpy as np
1010
cimport numpy as cnp
11-
from numpy cimport (ndarray, float64_t, int32_t,
12-
int8_t, int16_t, int32_t, int64_t,
13-
uint8_t, uint64_t,
11+
from numpy cimport (ndarray,
12+
float64_t, float32_t,
13+
int64_t,int32_t, int16_t, int8_t,
14+
uint64_t, uint32_t, uint16_t, uint8_t,
1415
intp_t,
1516
# Note: NPY_DATETIME, NPY_TIMEDELTA are only available
1617
# for cimport in cython>=0.27.3
@@ -266,8 +267,6 @@ cdef class IndexEngine:
266267
if not self.is_mapping_populated:
267268

268269
values = self._get_index_values()
269-
if values.dtype in {'int8', 'int16', 'int32'}:
270-
values = algos.ensure_int64(values)
271270
self.mapping = self._make_hash_table(len(values))
272271
self._call_map_locations(values)
273272

pandas/_libs/index_class_helper.pxi.in

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,15 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
1212

1313
# name, dtype, ctype
1414
dtypes = [('Float64', 'float64', 'float64_t'),
15+
('Float32', 'float32', 'float32_t'),
1516
('Int64', 'int64', 'int64_t'),
1617
('Int32', 'int32', 'int32_t'),
1718
('Int16', 'int16', 'int16_t'),
1819
('Int8', 'int8', 'int8_t'),
1920
('UInt64', 'uint64', 'uint64_t'),
21+
('UInt32', 'uint32', 'uint32_t'),
22+
('UInt16', 'uint16', 'uint16_t'),
23+
('UInt8', 'uint8', 'uint8_t'),
2024
('Object', 'object', 'object'),
2125
]
2226
}}
@@ -41,11 +45,24 @@ cdef class {{name}}Engine(IndexEngine):
4145
{{if name == 'Object'}}
4246
return _hash.PyObjectHashTable(n)
4347
{{elif name in {'Int8', 'Int16', 'Int32'} }}
48+
# {{name}}HashTable is not available, so we use Int64HashTable
4449
return _hash.Int64HashTable(n)
50+
{{elif name in {'UInt8', 'UInt16', 'UInt32'} }}
51+
# {{name}}HashTable is not available, so we use UInt64HashTable
52+
return _hash.UInt64HashTable(n)
53+
{{elif name in {'Float32'} }}
54+
# {{name}}HashTable is not available, so we use Float64HashTable
55+
return _hash.Float64HashTable(n)
4556
{{else}}
4657
return _hash.{{name}}HashTable(n)
4758
{{endif}}
4859

60+
{{if name in {'Int8', 'Int16', 'Int32'} }}
61+
cpdef _call_map_locations(self, values):
62+
# self.mapping is of type Int64HashTable, so convert dtype of values
63+
self.mapping.map_locations(algos.ensure_int64(values))
64+
{{endif}}
65+
4966
{{if name != 'Float64' and name != 'Object'}}
5067
cdef _check_type(self, object val):
5168
hash(val)

pandas/core/indexes/category.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,10 @@ class CategoricalIndex(Index, accessor.PandasDelegate):
7575

7676
@property
7777
def _engine_type(self):
78-
type_name = self.codes.dtype.name.capitalize()
79-
return getattr(libindex, "{}Engine".format(type_name))
78+
# self.codes can have dtype int8, int16, int32 or int64, so we need
79+
# to return the corresponding engine type (libindex.Int8Engine, etc.).
80+
engine_name = "{}Engine".format(self.codes.dtype.name.capitalize())
81+
return getattr(libindex, engine_name)
8082
_attributes = ['name']
8183

8284
def __new__(cls, data=None, categories=None, ordered=None, dtype=None,

0 commit comments

Comments
 (0)