Commit e79f481
REF: refactor MultiIndex to not store tuples, be more efficient. testing and compatibility checks. close #1467
1 parent: 09118d2

12 files changed: +127, -52 lines

pandas/core/algorithms.py

Lines changed: 3 additions & 3 deletions
@@ -78,21 +78,21 @@ def _count_generic(values, table_type, type_caster):
     from pandas.core.series import Series

     values = type_caster(values)
-    table = table_type(len(values))
+    table = table_type(min(len(values), 1000000))
     uniques, labels, counts = table.factorize(values)

     return Series(counts, index=uniques)

 def _match_generic(values, index, table_type, type_caster):
     values = type_caster(values)
     index = type_caster(index)
-    table = table_type(len(index))
+    table = table_type(min(len(index), 1000000))
     table.map_locations(index)
     return table.lookup(values)

 def _unique_generic(values, table_type, type_caster):
     values = type_caster(values)
-    table = table_type(len(values))
+    table = table_type(min(len(values), 1000000))
     uniques = table.unique(values)
     return type_caster(uniques)
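
A note on the min(len(...), 1000000) pattern: the same cap appears in groupby.py below and as _SIZE_CUTOFF in pandas/src/engines.pyx. Instead of pre-sizing a hash table to the full input length, the initial allocation is bounded and the table grows on demand past the cap. A minimal illustrative sketch of the idea in plain Python (table_type here stands in for the khash-backed tables in pandas/src, not a real pandas name):

SIZE_CUTOFF = 1000000  # mirrors _SIZE_CUTOFF in pandas/src/engines.pyx

def make_table(table_type, n):
    # cap the initial number of buckets: small inputs still avoid rehashing,
    # huge inputs no longer pay for an enormous up-front allocation
    return table_type(min(n, SIZE_CUTOFF))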

pandas/core/common.py

Lines changed: 2 additions & 0 deletions
@@ -489,6 +489,8 @@ def _possibly_cast_item(obj, item, dtype):

 def _is_bool_indexer(key):
     if isinstance(key, np.ndarray) and key.dtype == np.object_:
+        key = np.asarray(key)
+
         if not lib.is_bool_array(key):
             if isnull(key).any():
                 raise ValueError('cannot index with vector containing '

pandas/core/frame.py

Lines changed: 3 additions & 1 deletion
@@ -3514,8 +3514,10 @@ def _apply_standard(self, func, axis, ignore_failures=False):
             values = self.values
             dummy = Series(np.nan, index=self._get_axis(axis),
                            dtype=values.dtype)
+
+            labels = self._get_agg_axis(axis)
             result = lib.reduce(values, func, axis=axis, dummy=dummy,
-                                labels=self._get_agg_axis(axis))
+                                labels=labels)
             return Series(result, index=self._get_agg_axis(axis))
         except Exception:
             pass
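
lib.reduce now inspects the labels it is given and refuses indexes with complex internals (see the reduce.pyx change below); _apply_standard already catches any exception and falls back to the slow Python apply loop, so a MultiIndex result simply takes the slow path. A hedged sketch of that contract, with reduce_fast and apply_slow as placeholder names rather than the pandas functions:

import numpy as np

def reduce_fast(values, func, labels):
    # stand-in for lib.reduce: refuse the shortcut when the result index
    # cannot be handled by the Cython reducer (e.g. a MultiIndex)
    if getattr(labels, '_has_complex_internals', False):
        raise Exception('Cannot use shortcut')
    return np.array([func(values[:, i]) for i in range(values.shape[1])])

def apply_standard(values, func, labels, apply_slow):
    try:
        return reduce_fast(values, func, labels)
    except Exception:
        pass                      # fall back to the generic path below
    return apply_slow(values, func)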

pandas/core/groupby.py

Lines changed: 1 addition & 1 deletion
@@ -2084,7 +2084,7 @@ def _compress_group_index(group_index, sort=True):
    """

    uniques = []
-    table = lib.Int64HashTable(len(group_index))
+    table = lib.Int64HashTable(min(1000000, len(group_index)))

    group_index = com._ensure_int64(group_index)


pandas/core/index.py

Lines changed: 63 additions & 31 deletions
@@ -230,7 +230,7 @@ def _cleanup(self):
     @cache_readonly
     def _engine(self):
         # property, for now, slow to look up
-        return self._engine_type(weakref.ref(self))
+        return self._engine_type(lambda: self.values, len(self))

     def _get_level_number(self, level):
         if not isinstance(level, int):
@@ -752,7 +752,10 @@ def isin(self, values):
         is_contained : ndarray (boolean dtype)
         """
         value_set = set(values)
-        return lib.ismember(self, value_set)
+        return lib.ismember(self._array_values(), value_set)
+
+    def _array_values(self):
+        return self

     def _get_method(self, method):
         if method:
@@ -1223,14 +1226,8 @@ def __new__(cls, levels=None, labels=None, sortorder=None, names=None):
         levels = [_ensure_index(lev) for lev in levels]
         labels = [np.asarray(labs, dtype=np.int_) for labs in labels]

-        values = [ndtake(lev.values, lab)
-                  for lev, lab in zip(levels, labels)]
-
-        # Need to box timestamps, etc.
-        values = _clean_arrays(values)
-
-        subarr = lib.fast_zip(values).view(cls)
-
+        # v3, 0.8.0
+        subarr = np.empty(0, dtype=object).view(cls)
         subarr.levels = levels
         subarr.labels = labels

@@ -1267,14 +1264,42 @@ def copy(self, order='C'):
         cp.sortorder = self.sortorder
         return cp

+    def _array_values(self):
+        # hack for various methods
+        return self.values
+
     @property
     def dtype(self):
         return np.dtype('O')

+    def __repr__(self):
+        output = 'MultiIndex\n%s'
+
+        options = np.get_printoptions()
+        np.set_printoptions(threshold=50)
+
+        if len(self) > 100:
+            values = np.concatenate([self[:50].values,
+                                     self[-50:].values])
+        else:
+            values = self.values
+        summary = np.array2string(values, max_line_width=70)
+
+        np.set_printoptions(threshold=options['threshold'])
+
+        return output % summary
+
+    def __len__(self):
+        return len(self.labels[0])
+
     @property
     def _constructor(self):
         return MultiIndex.from_tuples

+    @cache_readonly
+    def inferred_type(self):
+        return 'mixed'
+
     @staticmethod
     def _from_elements(values, labels=None, levels=None, names=None,
                        sortorder=None):
@@ -1302,21 +1327,35 @@ def _get_level_number(self, level):
                             % (self.nlevels, level))
         return level

+    _tuples = None
+
     @property
     def values(self):
-        if self._is_legacy_format:
-            # for legacy MultiIndex
-            values = [ndtake(np.asarray(lev), lab)
-                      for lev, lab in zip(self.levels, self.labels)]
-            return lib.fast_zip(values)
-        else:
+        if self._is_v2:
             return self.view(np.ndarray)
+        else:
+            if self._tuples is not None:
+                return self._tuples

+            values = [ndtake(lev.values, lab)
+                      for lev, lab in zip(self.levels, self.labels)]
+
+            # Need to box timestamps, etc.
+            values = _clean_arrays(values)
+            self._tuples = lib.fast_zip(values)
+            return self._tuples
+
+    # fml
     @property
-    def _is_legacy_format(self):
+    def _is_v1(self):
         contents = self.view(np.ndarray)
         return len(contents) > 0 and not isinstance(contents[0], tuple)

+    @property
+    def _is_v2(self):
+        contents = self.view(np.ndarray)
+        return len(contents) > 0 and isinstance(contents[0], tuple)
+
     @property
     def _has_complex_internals(self):
         # to disable groupby tricks
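
Taken together, the __new__ and values changes mean a MultiIndex no longer materializes an object array of tuples at construction time: the underlying ndarray stays empty, and tuples are built lazily (and cached in _tuples) the first time .values is requested. A minimal sketch of the pattern in plain Python; LazyTupleIndex is an illustration, not the pandas class:

import numpy as np

class LazyTupleIndex:
    # toy levels/labels representation: levels hold the unique values per
    # dimension, labels hold integer codes into those levels
    def __init__(self, levels, labels):
        self.levels = [np.asarray(lev) for lev in levels]
        self.labels = [np.asarray(lab, dtype=np.int_) for lab in labels]
        self._tuples = None                  # built on demand, then cached

    def __len__(self):
        return len(self.labels[0])

    @property
    def values(self):
        if self._tuples is None:
            cols = [lev.take(lab).tolist()
                    for lev, lab in zip(self.levels, self.labels)]
            self._tuples = np.empty(len(self), dtype=object)
            for i, tup in enumerate(zip(*cols)):   # lib.fast_zip does this in C
                self._tuples[i] = tup
        return self._tuples

mi = LazyTupleIndex([['a', 'b'], [1, 2, 3]], [[0, 0, 1], [0, 2, 1]])
mi.values   # object array of ('a', 1), ('a', 3), ('b', 2)
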
@@ -1458,7 +1497,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
         -------
         index : MultiIndex
         """
-        from pandas.core.categorical import Factor
+        from pandas.core.categorical import Categorical

         if len(arrays) == 1:
             name = None if names is None else names[0]
@@ -1467,7 +1506,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None):
         levels = []
         labels = []
         for arr in arrays:
-            factor = Factor.from_array(arr)
+            factor = Categorical.from_array(arr)
            levels.append(factor.levels)
            labels.append(factor.labels)

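The only change in from_arrays itself is the rename of Factor to Categorical; each input array is still factorized into a level array plus integer labels, and the tuples are never stored. A hedged usage sketch against the 0.8-era API (in modern pandas the labels attribute is called codes):

import pandas as pd

mi = pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                               names=['letter', 'number'])
mi.levels[0]   # unique values of the first array: ['a', 'b']
mi.labels[0]   # integer codes into that level:    [0, 0, 1, 1]
               # (mi.codes[0] in current pandas)
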
@@ -1539,7 +1578,6 @@ def __setstate__(self, state):
         self.sortorder = sortorder

     def __getitem__(self, key):
-        arr_idx = self.view(np.ndarray)
         if np.isscalar(key):
             return tuple(lev[lab[key]]
                          for lev, lab in zip(self.levels, self.labels))
@@ -1551,11 +1589,10 @@ def __getitem__(self, key):
                 # cannot be sure whether the result will be sorted
                 sortorder = None

-            new_tuples = arr_idx[key]
+            result = np.empty(0, dtype=object).view(type(self))
             new_labels = [lab[key] for lab in self.labels]

             # an optimization
-            result = new_tuples.view(MultiIndex)
             result.levels = list(self.levels)
             result.labels = new_labels
             result.sortorder = sortorder
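
With no tuple array stored, slicing reduces to slicing the integer label arrays and reusing the level arrays on a fresh, empty-backed view. An illustrative sketch of just that bookkeeping, not the pandas implementation:

import numpy as np

levels = [np.array(['a', 'b']), np.array([1, 2, 3])]
labels = [np.array([0, 0, 1, 1]), np.array([0, 1, 0, 2])]

key = slice(1, 3)
new_labels = [lab[key] for lab in labels]   # slice the codes only
# the level arrays are shared with the parent; no tuples are materialized
new_labels   # [array([0, 1]), array([1, 0])]
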
@@ -1759,11 +1796,8 @@ def sortlevel(self, level=0, ascending=True):
         indexer = com._ensure_platform_int(indexer)
         new_labels = [lab.take(indexer) for lab in self.labels]

-        new_index = MultiIndex._from_elements(self.values.take(indexer),
-                                              labels=new_labels,
-                                              levels=self.levels,
-                                              names=self.names,
-                                              sortorder=level)
+        new_index = MultiIndex(labels=new_labels, levels=self.levels,
+                               names=self.names, sortorder=level)

         return new_index, indexer

@@ -1800,15 +1834,13 @@ def get_indexer(self, target, method=None, limit=None):
         target = _ensure_index(target)

         target_index = target
-        if isinstance(target, MultiIndex) and target._is_legacy_format:
+        if isinstance(target, MultiIndex):
             target_index = target.get_tuple_index()

         if target_index.dtype != object:
             return np.ones(len(target_index)) * -1

-        self_index = self
-        if self._is_legacy_format:
-            self_index = self.get_tuple_index()
+        self_index = self.get_tuple_index()

         if method == 'pad':
             assert(self.is_unique and self.is_monotonic)

pandas/core/internals.py

Lines changed: 1 addition & 2 deletions
@@ -863,8 +863,7 @@ def delete(self, item):
         i, _ = self._find_block(item)
         loc = self.items.get_loc(item)

-        new_items = self.items._constructor(
-            np.delete(np.asarray(self.items), loc))
+        new_items = self.items.delete(loc)

         self._delete_from_block(i, item)
         self.set_items_norename(new_items)
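
Index.delete(loc) returns a new Index with the element at that position removed, so the BlockManager no longer rebuilds its items index by hand through np.delete. A quick usage sketch (the method still exists in current pandas):

import pandas as pd

items = pd.Index(['a', 'b', 'c'])
items.delete(1)   # Index(['a', 'c'], dtype='object')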

pandas/core/series.py

Lines changed: 2 additions & 2 deletions
@@ -312,7 +312,7 @@ def __new__(cls, data=None, index=None, dtype=None, name=None,
                     elif isinstance(index, PeriodIndex):
                         data = [data.get(i, nan) for i in index]
                     else:
-                        data = lib.fast_multiget(data, index, default=np.nan)
+                        data = lib.fast_multiget(data, index.values, default=np.nan)
                 except TypeError:
                     data = [data.get(i, nan) for i in index]

@@ -763,7 +763,7 @@ def __repr__(self):
         width, height = get_terminal_size()
         max_rows = (height if fmt.print_config.max_rows == 0
                     else fmt.print_config.max_rows)
-        if len(self.index) > max_rows:
+        if len(self.index) > (max_rows or 1000):
             result = self._tidy_repr(min(30, max_rows - 4))
         elif len(self.index) > 0:
             result = self._get_repr(print_header=True,
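
In the first hunk the dict fast path now receives index.values rather than the Index itself: for a MultiIndex the underlying ndarray no longer contains the tuple keys, so the materialized tuple array is what has to be looked up in the dict. A pure-Python stand-in for lib.fast_multiget, just to illustrate the contract (multiget is a hypothetical name, not the pandas function):

import numpy as np

def multiget(mapping, keys, default=np.nan):
    # look each key up in `mapping`, falling back to `default`
    out = np.empty(len(keys), dtype=object)
    for i, key in enumerate(keys):
        out[i] = mapping.get(key, default)
    return out

data = {('a', 1): 1.0, ('b', 2): 2.0}
keys = [('a', 1), ('x', 9)]          # what a MultiIndex's .values would yield
multiget(data, keys)                 # array([1.0, nan], dtype=object)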

pandas/src/engines.pyx

Lines changed: 6 additions & 6 deletions
@@ -52,18 +52,18 @@ cdef int _SIZE_CUTOFF = 1000000
 cdef class IndexEngine:

     cdef readonly:
-        object index_weakref
+        object vgetter
         HashTable mapping
         bint over_size_threshold

     cdef:
         bint unique, monotonic
         bint initialized, monotonic_check, unique_check

-    def __init__(self, index_weakref):
-        self.index_weakref = index_weakref
+    def __init__(self, vgetter, n):
+        self.vgetter = vgetter

-        self.over_size_threshold = len(index_weakref()) >= _SIZE_CUTOFF
+        self.over_size_threshold = n >= _SIZE_CUTOFF

         self.initialized = 0
         self.monotonic_check = 0

@@ -206,7 +206,7 @@ cdef class IndexEngine:
         self.monotonic_check = 1

     cdef _get_index_values(self):
-        return self.index_weakref().values
+        return self.vgetter()

     cdef inline _do_unique_check(self):
         self._ensure_mapping_populated()

@@ -370,7 +370,7 @@ cdef class DatetimeEngine(Int64Engine):
         return _to_i8(val) in self.mapping

     cdef _get_index_values(self):
-        return self.index_weakref().values.view('i8')
+        return self.vgetter().view('i8')

     def _call_monotonic(self, values):
         return _algos.is_monotonic_int64(values)
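
The engine no longer holds a weakref back to the Index; it is constructed with a callable that returns the index values plus the length up front. That keeps the engine decoupled from how the Index stores, or lazily builds, those values. A rough Python sketch of the new constructor contract (FakeEngine is an illustration, not the Cython class):

import numpy as np

SIZE_CUTOFF = 1000000

class FakeEngine:
    def __init__(self, vgetter, n):
        self.vgetter = vgetter                       # callable returning values
        self.over_size_threshold = n >= SIZE_CUTOFF  # decided without values

    def _get_index_values(self):
        # values are only materialized when a lookup actually needs them, so a
        # MultiIndex can defer building its tuple array
        return self.vgetter()

values = np.array([1, 2, 3])
engine = FakeEngine(lambda: values, len(values))
engine._get_index_values()   # array([1, 2, 3])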

pandas/src/reduce.pyx

Lines changed: 3 additions & 0 deletions
@@ -358,5 +358,8 @@ cdef class Slider:
         self.buf.data = self.orig_data

 def reduce(arr, f, axis=0, dummy=None, labels=None):
+    if labels._has_complex_internals:
+        raise Exception('Cannot use shortcut')
+
     reducer = Reducer(arr, f, axis=axis, dummy=dummy, labels=labels)
     return reducer.get_result()

pandas/tests/test_index.py

Lines changed: 23 additions & 2 deletions
@@ -5,6 +5,7 @@
 import pickle
 import unittest
 import nose
+import os

 import numpy as np
 from numpy.testing import assert_array_equal

@@ -13,6 +14,7 @@
 from pandas.core.index import Index, Int64Index, MultiIndex
 from pandas.util.testing import assert_almost_equal
 from pandas.util import py3compat
+import pandas.core.common as com

 import pandas.util.testing as tm

@@ -895,15 +897,34 @@ def test_legacy_pickle(self):
         if py3compat.PY3:
             raise nose.SkipTest

-        import os
         def curpath():
             pth, _ = os.path.split(os.path.abspath(__file__))
             return pth

         ppath = os.path.join(curpath(), 'data/multiindex_v1.pickle')
         obj = pickle.load(open(ppath, 'r'))

-        self.assert_(obj._is_legacy_format)
+        self.assert_(obj._is_v1)
+
+        obj2 = MultiIndex.from_tuples(obj.values)
+        self.assert_(obj.equals(obj2))
+
+        res = obj.get_indexer(obj)
+        exp = np.arange(len(obj))
+        assert_almost_equal(res, exp)
+
+        res = obj.get_indexer(obj2[::-1])
+        exp = obj.get_indexer(obj[::-1])
+        exp2 = obj2.get_indexer(obj2[::-1])
+        assert_almost_equal(res, exp)
+        assert_almost_equal(exp, exp2)
+
+    def test_legacy_v2_unpickle(self):
+        # 0.7.3 -> 0.8.0 format manage
+        pth, _ = os.path.split(os.path.abspath(__file__))
+        filepath = os.path.join(pth, 'data', 'mindex_073.pickle')
+
+        obj = com.load(filepath)

         obj2 = MultiIndex.from_tuples(obj.values)
         self.assert_(obj.equals(obj2))
