ENH: Add set_index to Series

h-vetinari · h-vetinari · commit 964186bd498a · 2018-09-10T13:31:28.000+02:00
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -181,6 +181,7 @@ Other Enhancements
   The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`).
 - :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`)
 - :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
+- :class:`Series` has gained the method :meth:`Series.set_index`, which works like its :class:`DataFrame` counterpart :meth:`DataFrame.set_index` (:issue:`21684`)
 - :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. (:issue:`21358`)
 - :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`).
 - :ref:`Series.resample` and :ref:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`).
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3843,6 +3843,10 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
             necessary. Setting to False will improve the performance of this
             method
 
+        Returns
+        -------
+        reindexed : DataFrame if inplace is False, else None
+
         Examples
         --------
         >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
@@ -3883,73 +3887,30 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         2  2014  4      40
         3  2013  7      84
         4  2014  10     31
-
-        Returns
-        -------
-        dataframe : DataFrame
         """
-        inplace = validate_bool_kwarg(inplace, 'inplace')
+        from pandas import Series
+
         if not isinstance(keys, list):
             keys = [keys]
 
-        if inplace:
-            frame = self
-        else:
-            frame = self.copy()
-
-        arrays = []
-        names = []
-        if append:
-            names = [x for x in self.index.names]
-            if isinstance(self.index, MultiIndex):
-                for i in range(self.index.nlevels):
-                    arrays.append(self.index._get_level_values(i))
-            else:
-                arrays.append(self.index)
-
-        to_remove = []
-        for col in keys:
-            if isinstance(col, MultiIndex):
-                # append all but the last column so we don't have to modify
-                # the end of this loop
-                for n in range(col.nlevels - 1):
-                    arrays.append(col._get_level_values(n))
-
-                level = col._get_level_values(col.nlevels - 1)
-                names.extend(col.names)
-            elif isinstance(col, Series):
-                level = col._values
-                names.append(col.name)
-            elif isinstance(col, Index):
-                level = col
-                names.append(col.name)
-            elif isinstance(col, (list, np.ndarray, Index)):
-                level = col
-                names.append(None)
-            else:
-                level = frame[col]._values
-                names.append(col)
-                if drop:
-                    to_remove.append(col)
-            arrays.append(level)
-
-        index = ensure_index_from_sequences(arrays, names)
-
-        if verify_integrity and not index.is_unique:
-            duplicates = index[index.duplicated()].unique()
-            raise ValueError('Index has duplicate keys: {dup}'.format(
-                dup=duplicates))
-
-        for c in to_remove:
-            del frame[c]
-
-        # clear up memory usage
-        index._cleanup()
-
-        frame.index = index
-
-        if not inplace:
-            return frame
+        # collect elements of "keys" that are not arrays (i.e. column labels)
+        col_labels = [x for x in keys
+                      if not isinstance(x, (Series, Index, MultiIndex,
+                                            list, np.ndarray))]
+        if any(x not in self for x in col_labels):
+            # if there are any labels that are invalid, we raise a KeyError
+            missing = [x for x in col_labels if x not in self]
+            raise KeyError('{}'.format(missing))
+        elif len(set(col_labels)) < len(col_labels):
+            # if all are valid labels, but there are duplicates
+            dup = Series(col_labels)
+            dup = list(dup.loc[dup.duplicated()])
+            raise ValueError('Passed duplicate column names '
+                             'to keys: {dup}'.format(dup=dup))
+        vi = verify_integrity
+        return super(DataFrame, self).set_index(keys=keys, drop=drop,
+                                                append=append, inplace=inplace,
+                                                verify_integrity=vi)
 
     def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
                     col_fill=''):
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -32,11 +32,13 @@
 from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask
 from pandas.core.dtypes.inference import is_hashable
 from pandas.core.dtypes.missing import isna, notna
-from pandas.core.dtypes.generic import ABCSeries, ABCPanel, ABCDataFrame
+from pandas.core.dtypes.generic import (ABCIndexClass, ABCMultiIndex, ABCPanel,
+                                        ABCSeries, ABCDataFrame)
 
 from pandas.core.base import PandasObject, SelectionMixin
-from pandas.core.index import (Index, MultiIndex, ensure_index,
-                               InvalidIndexError, RangeIndex)
+from pandas.core.index import (Index, MultiIndex,
+                               InvalidIndexError, RangeIndex,
+                               ensure_index, ensure_index_from_sequences)
 import pandas.core.indexing as indexing
 from pandas.core.indexes.datetimes import DatetimeIndex
 from pandas.core.indexes.period import PeriodIndex, Period
@@ -663,6 +665,132 @@ def _set_axis(self, axis, labels):
         y : same as input
         """
 
+    def set_index(self, keys, drop=True, append=False, inplace=False,
+                  verify_integrity=False):
+        """
+        Set the Series/DataFrame index (row labels) using one or more given
+        arrays (or column labels in case of DataFrame).
+        By default yields a new object.
+
+        Parameters
+        ----------
+        keys : column label or list of column labels / arrays. For Series case,
+            only array or list of arrays is allowed.
+        drop : boolean, default True
+            Delete columns to be used as the new index (only for DataFrame).
+        append : boolean, default False
+            Whether to append columns to existing index
+        inplace : boolean, default False
+            Modify the Series/DataFrame in place (do not create a new object)
+        verify_integrity : boolean, default False
+            Check the new index for duplicates. Otherwise defer the check until
+            necessary. Setting to False will improve the performance of this
+            method
+
+        Returns
+        -------
+        reindexed : Series/DataFrame if inplace is False, else None
+
+        Examples
+        --------
+        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
+        ...                    'year': [2012, 2014, 2013, 2014],
+        ...                    'sale':[55, 40, 84, 31]})
+           month  sale  year
+        0  1      55    2012
+        1  4      40    2014
+        2  7      84    2013
+        3  10     31    2014
+
+        Set the index to become the 'month' column:
+
+        >>> df.set_index('month')
+               sale  year
+        month
+        1      55    2012
+        4      40    2014
+        7      84    2013
+        10     31    2014
+
+        Create a multi-index using columns 'year' and 'month':
+
+        >>> df.set_index(['year', 'month'])
+                    sale
+        year  month
+        2012  1     55
+        2014  4     40
+        2013  7     84
+        2014  10    31
+
+        Create a multi-index using a set of values and a column:
+
+        >>> df.set_index([[1, 2, 3, 4], 'year'])
+                 month  sale
+           year
+        1  2012  1      55
+        2  2014  4      40
+        3  2013  7      84
+        4  2014  10     31
+        """
+        inplace = validate_bool_kwarg(inplace, 'inplace')
+        if inplace:
+            obj = self
+        else:
+            obj = self.copy()
+
+        arrays = []
+        names = []
+        if append:
+            names = [x for x in self.index.names]
+            if isinstance(self.index, ABCMultiIndex):
+                for i in range(self.index.nlevels):
+                    arrays.append(self.index._get_level_values(i))
+            else:
+                arrays.append(self.index)
+
+        to_remove = []
+        for col in keys:
+            if isinstance(col, ABCMultiIndex):
+                for n in range(col.nlevels):
+                    arrays.append(col._get_level_values(n))
+                names.extend(col.names)
+            elif isinstance(col, ABCIndexClass):
+                # Index but not MultiIndex (treated above)
+                arrays.append(col)
+                names.append(col.name)
+            elif isinstance(col, ABCSeries):
+                arrays.append(col._values)
+                names.append(col.name)
+            elif isinstance(col, (list, np.ndarray)):
+                arrays.append(col)
+                names.append(None)
+            # from here, col can only be a column label (and self a DataFrame);
+            # see checks in Series.set_index and DataFrame.set_index
+            else:
+                arrays.append(obj[col]._values)
+                names.append(col)
+                if drop:
+                    to_remove.append(col)
+
+        index = ensure_index_from_sequences(arrays, names)
+
+        if verify_integrity and not index.is_unique:
+            duplicates = list(index[index.duplicated()])
+            raise ValueError('Index has duplicate keys: {dup}'.format(
+                dup=duplicates))
+
+        # use set to handle duplicate column names gracefully in case of drop
+        for c in set(to_remove):
+            del obj[c]
+
+        # clear up memory usage
+        index._cleanup()
+
+        obj.index = index
+
+        if not inplace:
+            return obj
+
     @Appender(_shared_docs['transpose'] % _shared_doc_kwargs)
     def transpose(self, *args, **kwargs):
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -35,8 +35,8 @@
     _is_unorderable_exception,
     ensure_platform_int,
     pandas_dtype)
-from pandas.core.dtypes.generic import (
-    ABCSparseArray, ABCDataFrame, ABCIndexClass)
+from pandas.core.dtypes.generic import (ABCDataFrame, ABCIndexClass,
+                                        ABCSeries, ABCSparseArray)
 from pandas.core.dtypes.cast import (
     maybe_upcast, infer_dtype_from_scalar,
     maybe_convert_platform,
@@ -1094,6 +1094,86 @@ def _set_value(self, label, value, takeable=False):
         return self
     _set_value.__doc__ = set_value.__doc__
 
+    def set_index(self, arrays, append=False, inplace=False,
+                  verify_integrity=False):
+        """
+        Set the Series index (row labels) using one or more arrays.
+        By default yields a new object.
+
+        Parameters
+        ----------
+        arrays : array or list of arrays
+            Either a Series, Index, MultiIndex, list, np.ndarray or a list
+            containing only Series, Index, MultiIndex, list, np.ndarray
+        append : boolean, default False
+            Whether to append the arrays to the existing index
+        inplace : boolean, default False
+            Modify the Series in place (do not create a new object)
+        verify_integrity : boolean, default False
+            Check the new index for duplicates. Otherwise defer the check until
+            necessary. Setting to False will improve the performance of this
+            method
+
+        Returns
+        -------
+        reindexed : Series if inplace is False, else None
+
+        Examples
+        --------
+        >>> s = pd.Series(range(10, 13))
+        0    10
+        1    11
+        2    12
+        dtype: int64
+
+        Set the index to become `['a', 'b', 'c']`:
+
+        >>> s.set_index(['a', 'b', 'c'])
+        a    10
+        b    11
+        c    12
+        dtype: int64
+
+        Create a multi-index by appending to the existing index:
+
+        >>> s.set_index(['a', 'b', 'c'], append=True)
+        0  a    10
+        1  b    11
+        2  c    12
+        dtype: int64
+
+        Create a multi-index by passing a list of arrays:
+
+        >>> t = s.set_index([['a', 'b', 'c'], ['I', 'II', 'III']]) ** 2
+        >>> t
+        a  I      100
+        b  II     121
+        c  III    144
+        dtype: int64
+
+        Apply index from another object (of the same length!):
+
+        >>> s.set_index(t.index)
+        a  I      10
+        b  II     11
+        c  III    12
+        dtype: int64
+        """
+        if not isinstance(arrays, list):
+            arrays = [arrays]
+        elif all(is_scalar(x) for x in arrays):
+            arrays = [arrays]
+
+        if any(not isinstance(x, (ABCSeries, ABCIndexClass, list, np.ndarray))
+               for x in arrays):
+            raise TypeError('arrays must be Series, Index, MultiIndex, list, '
+                            'np.ndarray or list containing only Series, '
+                            'Index, MultiIndex, list, np.ndarray')
+
+        return super(Series, self).set_index(keys=arrays, drop=False,
+                                             append=append, inplace=inplace,
+                                             verify_integrity=verify_integrity)
+
     def reset_index(self, level=None, drop=False, name=None, inplace=False):
         """
         Generate a new DataFrame or Series with the index reset.
diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py
@@ -103,6 +103,15 @@ def simple(self):
         return pd.DataFrame(arr, columns=['one', 'two', 'three'],
                             index=['a', 'b', 'c'])
 
+    @cache_readonly
+    def dummy(self):
+        df = pd.DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
+                           'B': ['one', 'two', 'three', 'one', 'two'],
+                           'C': ['a', 'b', 'c', 'd', 'e'],
+                           'D': np.random.randn(5),
+                           'E': np.random.randn(5)})
+        return df
+
 # self.ts3 = tm.makeTimeSeries()[-5:]
 # self.ts4 = tm.makeTimeSeries()[1:-1]
 
diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py
diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py