Skip to content

Commit ce3c4fa

Browse files
committed
ENH: re #720, added alternative private constructor
1 parent: b6ee864 · commit: ce3c4fa

File tree

4 files changed: +44 additions, -10 deletions (lines changed)

4 files changed: +44 additions, -10 deletions (lines changed)

pandas/core/frame.py

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4055,12 +4055,18 @@ def complete_dataframe(obj, prev_completions):
40554055
except Exception:
40564056
pass
40574057

4058-
def _indexer_from_factorized(labels, shape):
4058+
def _indexer_from_factorized(labels, shape, compress=True):
40594059
from pandas.core.groupby import get_group_index, _compress_group_index
40604060

40614061
group_index = get_group_index(labels, shape)
4062-
comp_ids, obs_ids = _compress_group_index(group_index)
4063-
max_group = len(obs_ids)
4062+
4063+
if compress:
4064+
comp_ids, obs_ids = _compress_group_index(group_index)
4065+
max_group = len(obs_ids)
4066+
else:
4067+
comp_ids = group_index
4068+
max_group = np.prod(shape)
4069+
40644070
indexer, _ = lib.groupsort_indexer(comp_ids.astype('i4'), max_group)
40654071

40664072
return indexer

pandas/core/groupby.py

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1324,8 +1324,9 @@ def _get_slice(slob):
13241324

13251325
def get_group_index(label_list, shape):
13261326
"""
1327-
Gets the offsets into what would be the cartesian product of all
1328-
possible labels given the label_list.
1327+
For the particular label_list, gets the offsets into the hypothetical list
1328+
representing the totally ordered cartesian product of all possible label
1329+
combinations.
13291330
"""
13301331
if len(label_list) == 1:
13311332
return label_list[0]
@@ -1409,24 +1410,38 @@ def cython_aggregate(values, group_index, ngroups, how='add'):
14091410
# sorting levels...cleverly?
14101411

14111412
def _compress_group_index(group_index, sort=True):
1413+
"""
1414+
Group_index is offsets into cartesian product of all possible labels. This
1415+
space can be huge, so this function compresses it, by computing offsets
1416+
(comp_ids) into the list of unique labels (obs_group_ids).
1417+
"""
1418+
14121419
uniques = []
14131420
table = lib.Int64HashTable(len(group_index))
14141421

14151422
group_index = _ensure_int64(group_index)
1423+
1424+
# note, group labels come out ascending (ie, 1,2,3 etc)
14161425
comp_ids = table.get_labels_groupby(group_index, uniques)
14171426

1418-
# these are the ones we observed
1427+
# these are the unique ones we observed, in the order we observed them
14191428
obs_group_ids = np.array(uniques, dtype='i8')
14201429

14211430
if sort and len(obs_group_ids) > 0:
1431+
# sorter is index where elements ought to go
14221432
sorter = obs_group_ids.argsort()
1433+
1434+
# reverse_indexer is where elements came from
14231435
reverse_indexer = np.empty(len(sorter), dtype='i4')
14241436
reverse_indexer.put(sorter, np.arange(len(sorter)))
14251437

14261438
mask = comp_ids < 0
1439+
1440+
# move comp_ids to right locations (ie, unsort ascending labels)
14271441
comp_ids = reverse_indexer.take(comp_ids)
14281442
np.putmask(comp_ids, mask, -1)
14291443

1444+
# sort observed ids
14301445
obs_group_ids = obs_group_ids.take(sorter)
14311446

14321447
return comp_ids, obs_group_ids

pandas/core/index.py

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1095,6 +1095,16 @@ def copy(self, order='C'):
10951095
def dtype(self):
10961096
return np.dtype('O')
10971097

1098+
@staticmethod
1099+
def _from_elements(self, values, labels=None, levels=None, names=None,
1100+
sortorder=None):
1101+
index = values.view(MultiIndex)
1102+
index.levels = levels
1103+
index.labels = labels
1104+
index.names = names
1105+
index.sortorder = sortorder
1106+
return index
1107+
10981108
def _get_level_number(self, level):
10991109
try:
11001110
count = self.names.count(level)
@@ -1527,14 +1537,17 @@ def sortlevel(self, level=0, ascending=True):
15271537
level = self._get_level_number(level)
15281538
primary = labels.pop(level)
15291539
indexer = _indexer_from_factorized((primary,) + tuple(labels),
1530-
self.levshape)
1540+
self.levshape, compress=False)
15311541
if not ascending:
15321542
indexer = indexer[::-1]
15331543

15341544
new_labels = [lab.take(indexer) for lab in self.labels]
15351545

1536-
new_index = MultiIndex(levels=self.levels, labels=new_labels,
1537-
names=self.names, sortorder=level)
1546+
new_index = self._from_elements(self.values.take(indexer),
1547+
labels = new_labels,
1548+
levels = self.levels,
1549+
names = self.names,
1550+
sortorder = level)
15381551

15391552
return new_index, indexer
15401553

vb_suite/suite.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@
4444
"""
4545
dependencies = ['pandas_vb_common.py']
4646

47-
START_DATE = datetime(2010, 6, 1)
47+
START_DATE = datetime(2012, 1, 20)
4848

4949
repo = GitRepo(REPO_PATH)
5050

0 commit comments

Comments
 (0)