Skip to content

Commit 0a21448

Browse files
committed
Merge branch 'categorical-cleanup' of https://github.com/jseabold/pandas into jseabold-categorical-cleanup
Conflicts: RELEASE.rst
2 parents 3bf8269 + 76644bf commit 0a21448

File tree

8 files changed

+125
-26
lines changed

8 files changed

+125
-26
lines changed

RELEASE.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ pandas 0.11.1
6161
- table writing performance improvements.
6262
- Add modulo operator to Series, DataFrame
6363
- Add ``date`` method to DatetimeIndex
64+
- Simplified the ``Categorical`` API and added a ``describe`` method to ``Categorical``
6465

6566
**API Changes**
6667

doc/source/groupby.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -606,8 +606,8 @@ versions of pandas, but users were generally discarding the NA group anyway
606606
Grouping with ordered factors
607607
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
608608

609-
Categorical variables represented as instance of pandas's ``Factor`` class can
610-
be used as group keys. If so, the order of the levels will be preserved:
609+
Categorical variables represented as instances of pandas's ``Categorical`` class
610+
can be used as group keys. If so, the order of the levels will be preserved:
611611

612612
.. ipython:: python
613613

pandas/core/categorical.py

Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pandas.core.algorithms import factorize
66
from pandas.core.index import Index
77
import pandas.core.common as com
8+
from pandas.core.frame import DataFrame
89

910

1011
def _cat_compare_op(op):
@@ -32,23 +33,68 @@ class Categorical(object):
3233
Parameters
3334
----------
3435
labels : ndarray of integers
35-
levels : Index-like (unique)
36-
37-
data : array-like
36+
If levels is given, the integer at label `i` is the index of the level
37+
for that label. I.e., the level at labels[i] is levels[labels[i]].
38+
Otherwise, if levels is None, these are just the labels and the levels
39+
are assumed to be the unique labels. See from_array.
40+
levels : Index-like (unique), optional
41+
The unique levels for each label. If not given, the levels are assumed
42+
to be the unique values of labels.
43+
name : str, optional
44+
Name for the Categorical variable. If levels is None, will attempt
45+
to infer from labels.
3846
3947
Returns
4048
-------
4149
**Attributes**
4250
* labels : ndarray
4351
* levels : ndarray
52+
53+
Examples
54+
--------
55+
>>> from pandas import Categorical
56+
>>> Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
57+
Categorical:
58+
array([1, 2, 3, 1, 2, 3])
59+
Levels (3): Int64Index([1, 2, 3])
60+
61+
>>> Categorical([0,1,2,0,1,2], ['a', 'b', 'c'])
62+
Categorical:
63+
array(['a', 'b', 'c', 'a', 'b', 'c'], dtype=object)
64+
Levels (3): Index(['a', 'b', 'c'], dtype=object)
65+
66+
>>> Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
67+
Categorical:
68+
array(['a', 'b', 'c', 'a', 'b', 'c'], dtype=object)
69+
Levels (3): Index(['a', 'b', 'c'], dtype=object)
4470
"""
45-
def __init__(self, labels, levels, name=None):
71+
def __init__(self, labels, levels=None, name=None):
    """
    Store the labels, levels and name of the categorical variable.

    Parameters
    ----------
    labels : ndarray of integers, or array-like of raw values
        Integer positions into `levels` when `levels` is given; otherwise
        the raw values, which are factorized here.
    levels : Index-like (unique), optional
        The unique levels. When omitted they are derived from `labels`.
    name : str, optional
        Name of the variable. Only when `levels` is omitted and no name
        is passed, the ``name`` attribute of `labels` (if any) is used.
    """
    infer_levels = levels is None

    # A name is only borrowed from the input on the "raw values" path.
    if infer_levels and name is None:
        name = getattr(labels, 'name', None)

    if infer_levels:
        index_like = isinstance(labels, Index) and hasattr(labels, 'factorize')
        if index_like:
            # Index objects that know how to factorize themselves do so.
            labels, levels = labels.factorize()
        else:
            try:
                labels, levels = factorize(labels, sort=True)
            except TypeError:
                # Sorting raised TypeError (presumably unorderable values);
                # retry keeping the levels unsorted.
                labels, levels = factorize(labels, sort=False)

    self.labels, self.levels, self.name = labels, levels, name
4986

5087
@classmethod
5188
def from_array(cls, data):
89+
"""
90+
Make a Categorical type from a single array-like object.
91+
92+
Parameters
93+
----------
94+
data : array-like
95+
Can be an Index or array-like. The levels are assumed to be
96+
the unique values of `data`.
97+
"""
5298
if isinstance(data, Index) and hasattr(data, 'factorize'):
5399
labels, levels = data.factorize()
54100
else:
@@ -131,4 +177,28 @@ def equals(self, other):
131177
return (self.levels.equals(other.levels) and
132178
np.array_equal(self.labels, other.labels))
133179

134-
Factor = Categorical
180+
def describe(self):
    """
    Return a DataFrame with the count and frequency of each level.

    Every entry of ``self.levels`` gets exactly one row, including levels
    that never occur in ``self.labels`` (their count and frequency are 0).
    Negative labels (the ``-1`` NA marker) are not counted and do not
    contribute to the frequency denominator.

    Returns
    -------
    described : DataFrame
        Indexed by ``levels`` with columns ``counts`` (int64) and
        ``freqs`` (float, ``counts / counts.sum()``; all zeros when there
        are no non-NA observations).
    """
    labels = np.asarray(self.labels, dtype=np.intp)
    n_levels = len(self.levels)

    # bincount with minlength keeps one slot per level, so the result is
    # always aligned with self.levels even for unobserved levels; a
    # groupby on the labels would silently drop those and add a group
    # for NA labels.
    observed = labels[labels >= 0]
    counts = np.bincount(observed, minlength=n_levels).astype(np.int64)

    total = counts.sum()
    if total:
        freqs = counts / float(total)
    else:
        # No observations: avoid a 0/0 division.
        freqs = np.zeros(n_levels, dtype=float)

    return DataFrame.from_dict(dict(
        counts=counts,
        freqs=freqs,
        levels=self.levels)).set_index('levels')
192+
193+
194+
class Factor(Categorical):
    """
    Deprecated name for Categorical.

    Behaves like Categorical but emits a FutureWarning whenever it is
    constructed directly or through ``from_array``.
    """

    def __init__(self, labels, levels=None, name=None):
        from warnings import warn
        # stacklevel=2 attributes the warning to the code calling Factor(...)
        # rather than to this module, so users can find the call to update.
        warn("Factor is deprecated. Use Categorical instead", FutureWarning,
             stacklevel=2)
        super(Factor, self).__init__(labels, levels, name)

    @classmethod
    def from_array(cls, data):
        from warnings import warn
        # Same reasoning as in __init__: point at the caller of from_array.
        warn("Factor is deprecated. Use Categorical instead", FutureWarning,
             stacklevel=2)
        return super(Factor, cls).from_array(data)

pandas/core/frame.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1704,8 +1704,8 @@ def convert_objects(self, convert_dates=True, convert_numeric=False, copy=True):
17041704
-------
17051705
converted : DataFrame
17061706
"""
1707-
return self._constructor(self._data.convert(convert_dates=convert_dates,
1708-
convert_numeric=convert_numeric,
1707+
return self._constructor(self._data.convert(convert_dates=convert_dates,
1708+
convert_numeric=convert_numeric,
17091709
copy=copy))
17101710

17111711
#----------------------------------------------------------------------
@@ -3375,7 +3375,7 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
33753375
a reference to the filled object, which is self if inplace=True
33763376
limit : int, default None
33773377
Maximum size gap to forward or backward fill
3378-
downcast : dict, default is None, a dict of item->dtype of what to
3378+
downcast : dict, default is None, a dict of item->dtype of what to
33793379
downcast if possible
33803380
33813381
See also
@@ -3425,7 +3425,7 @@ def fillna(self, value=None, method=None, axis=0, inplace=False,
34253425
result[k].fillna(v, inplace=True)
34263426
return result
34273427
else:
3428-
new_data = self._data.fillna(value, inplace=inplace,
3428+
new_data = self._data.fillna(value, inplace=inplace,
34293429
downcast=downcast)
34303430

34313431
if inplace:
@@ -3966,8 +3966,8 @@ def combine(self, other, func, fill_value=None, overwrite=True):
39663966
result[col] = arr
39673967

39683968
# convert_objects just in case
3969-
return self._constructor(result,
3970-
index=new_index,
3969+
return self._constructor(result,
3970+
index=new_index,
39713971
columns=new_columns).convert_objects(
39723972
convert_dates=True,
39733973
copy=False)
@@ -4000,7 +4000,7 @@ def combiner(x, y, needs_i8_conversion=False):
40004000
y_values = y_values.view('i8')
40014001
else:
40024002
mask = isnull(x_values)
4003-
4003+
40044004
return expressions.where(mask, y_values, x_values, raise_on_error=True)
40054005

40064006
return self.combine(other, combiner, overwrite=False)
@@ -5581,11 +5581,11 @@ def group_agg(values, bounds, f):
55815581

55825582
def factor_agg(factor, vec, func):
55835583
"""
5584-
Aggregate array based on Factor
5584+
Aggregate array based on Categorical
55855585
55865586
Parameters
55875587
----------
5588-
factor : Factor
5588+
factor : Categorical
55895589
length n
55905590
vec : sequence
55915591
length n
@@ -5594,7 +5594,11 @@ def factor_agg(factor, vec, func):
55945594
55955595
Returns
55965596
-------
5597-
ndarray corresponding to Factor levels
5597+
ndarray corresponding to factor levels
5598+
5599+
See Also
5600+
--------
5601+
pandas.Categorical
55985602
"""
55995603
indexer = np.argsort(factor.labels)
56005604
unique_labels = np.arange(len(factor.levels))

pandas/core/panel.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
_try_sort, _default_index,
1111
_infer_dtype_from_scalar,
1212
notnull)
13-
from pandas.core.categorical import Factor
13+
from pandas.core.categorical import Categorical
1414
from pandas.core.index import (Index, MultiIndex, _ensure_index,
1515
_get_combined_index)
1616
from pandas.core.indexing import _maybe_droplevels, _is_list_like
@@ -82,8 +82,8 @@ def panel_index(time, panels, names=['time', 'panel']):
8282
(1962, 'C')], dtype=object)
8383
"""
8484
time, panels = _ensure_like_indices(time, panels)
85-
time_factor = Factor.from_array(time)
86-
panel_factor = Factor.from_array(panels)
85+
time_factor = Categorical.from_array(time)
86+
panel_factor = Categorical.from_array(panels)
8787

8888
labels = [time_factor.labels, panel_factor.labels]
8989
levels = [time_factor.levels, panel_factor.levels]

pandas/core/reshape.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -791,9 +791,9 @@ def make_axis_dummies(frame, axis='minor', transform=None):
791791
axis : {'major', 'minor'}, default 'minor'
792792
transform : function, default None
793793
Function to apply to axis labels first. For example, to
794-
get "day of week" dummies in a time series regression
794+
get "day of week" dummies in a time series regression
795795
you might call::
796-
796+
797797
make_axis_dummies(panel, axis='major',
798798
transform=lambda d: d.weekday())
799799
Returns
@@ -852,6 +852,6 @@ def block2d_to_blocknd(values, items, shape, labels, ref_items=None):
852852

853853

854854
def factor_indexer(shape, labels):
    """
    Given a tuple of shape and a list of Categorical labels, return the
    expanded label indexer.
    """
    # Reversed cumulative product of `shape`, followed by a trailing 1:
    # one multiplier per entry of `labels`.
    multipliers = np.append(np.array(shape)[::-1].cumprod()[::-1], [1])

    # One row per observation, one column per label array.
    label_rows = np.array(labels).T
    combined = np.sum(label_rows * multipliers, axis=1).T
    return com._ensure_platform_int(combined)

pandas/tests/test_factor.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from pandas.core.api import value_counts
1010
from pandas.core.categorical import Categorical
1111
from pandas.core.index import Index, Int64Index, MultiIndex
12+
from pandas.core.frame import DataFrame
1213
from pandas.util.testing import assert_almost_equal
1314
import pandas.core.common as com
1415

@@ -111,6 +112,29 @@ def test_na_flags_int_levels(self):
111112

112113
self.assert_(np.array_equal(com.isnull(cat), labels == -1))
113114

115+
def test_levels_none(self):
    # With no levels passed, the constructor must derive them from the
    # values and match the fixture held in self.factor
    # (NOTE(review): fixture is built outside this view -- confirm it uses
    # the same values).
    values = ['a', 'b', 'b', 'a',
              'a', 'c', 'c', 'c']
    inferred = Categorical(values)
    self.assert_(inferred.equals(self.factor))
119+
120+
def test_describe(self):
    # string levels: describe() on the shared fixture
    result = self.factor.describe()
    string_summary = dict(counts=[3, 2, 3],
                          freqs=[3/8., 2/8., 3/8.],
                          levels=['a', 'b', 'c'])
    expected = DataFrame.from_dict(string_summary).set_index('levels')
    tm.assert_frame_equal(result, expected)

    # integer levels: levels are inferred from the raw values
    result = Categorical([1,2,3,1,2,3,3,2,1,1,1]).describe()
    integer_summary = dict(counts=[5, 3, 3],
                           freqs=[5/11., 3/11., 3/11.],
                           levels=[1,2,3])
    expected = DataFrame.from_dict(integer_summary).set_index('levels')
    tm.assert_frame_equal(result, expected)
137+
114138
if __name__ == '__main__':
115139
import nose
116140
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

pandas/tools/merge.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import itertools
66
import numpy as np
77

8-
from pandas.core.categorical import Factor
8+
from pandas.core.categorical import Categorical
99
from pandas.core.frame import DataFrame, _merge_doc
1010
from pandas.core.generic import NDFrame
1111
from pandas.core.groupby import get_group_index
@@ -1200,7 +1200,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None):
12001200
names = [None] * len(zipped)
12011201

12021202
if levels is None:
1203-
levels = [Factor.from_array(zp).levels for zp in zipped]
1203+
levels = [Categorical.from_array(zp).levels for zp in zipped]
12041204
else:
12051205
levels = [_ensure_index(x) for x in levels]
12061206
else:
@@ -1238,7 +1238,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None):
12381238
levels.extend(concat_index.levels)
12391239
label_list.extend(concat_index.labels)
12401240
else:
1241-
factor = Factor.from_array(concat_index)
1241+
factor = Categorical.from_array(concat_index)
12421242
levels.append(factor.levels)
12431243
label_list.append(factor.labels)
12441244

0 commit comments

Comments
 (0)