pandas-dev
diff --git a/‎asv_bench/benchmarks/sparse.py
Lines changed: 2 additions & 2 deletions b/‎asv_bench/benchmarks/sparse.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎asv_bench/benchmarks/timeseries.py
Lines changed: 1 addition & 1 deletion b/‎asv_bench/benchmarks/timeseries.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎ci/requirements-3.6_NUMPY_DEV.build
Lines changed: 0 additions & 1 deletion b/‎ci/requirements-3.6_NUMPY_DEV.build
Lines changed: 0 additions & 1 deletion
diff --git a/‎ci/requirements-3.6_NUMPY_DEV.build.sh
Lines changed: 3 additions & 0 deletions b/‎ci/requirements-3.6_NUMPY_DEV.build.sh
Lines changed: 3 additions & 0 deletions
diff --git a/‎doc/source/advanced.rst
Lines changed: 3 additions & 1 deletion b/‎doc/source/advanced.rst
Lines changed: 3 additions & 1 deletion
diff --git a/‎doc/source/api.rst
Lines changed: 4 additions & 1 deletion b/‎doc/source/api.rst
Lines changed: 4 additions & 1 deletion
diff --git a/‎doc/source/categorical.rst
Lines changed: 95 additions & 8 deletions b/‎doc/source/categorical.rst
Lines changed: 95 additions & 8 deletions
diff --git a/‎doc/source/groupby.rst
Lines changed: 2 additions & 2 deletions b/‎doc/source/groupby.rst
Lines changed: 2 additions & 2 deletions
diff --git a/‎doc/source/io.rst
Lines changed: 2 additions & 2 deletions b/‎doc/source/io.rst
Lines changed: 2 additions & 2 deletions
diff --git a/‎doc/source/merging.rst
Lines changed: 8 additions & 3 deletions b/‎doc/source/merging.rst
Lines changed: 8 additions & 3 deletions
diff --git a/‎doc/source/whatsnew/v0.21.0.txt
Lines changed: 50 additions & 1 deletion b/‎doc/source/whatsnew/v0.21.0.txt
Lines changed: 50 additions & 1 deletion
diff --git a/‎pandas/_libs/groupby.pyx
Lines changed: 0 additions & 2 deletions b/‎pandas/_libs/groupby.pyx
Lines changed: 0 additions & 2 deletions
diff --git a/‎pandas/_libs/join.pyx
Lines changed: 0 additions & 2 deletions b/‎pandas/_libs/join.pyx
Lines changed: 0 additions & 2 deletions
diff --git a/‎pandas/_libs/parsers.pyx
Lines changed: 1 addition & 1 deletion b/‎pandas/_libs/parsers.pyx
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from itertools import repeat
+import itertools
 
 from .pandas_vb_common import *
 import scipy.sparse
@@ -33,7 +33,7 @@ def time_sparse_from_scipy(self):
         SparseDataFrame(scipy.sparse.rand(1000, 1000, 0.005))
 
     def time_sparse_from_dict(self):
-        SparseDataFrame(dict(zip(range(1000), repeat([0]))))
+        SparseDataFrame(dict(zip(range(1000), itertools.repeat([0]))))
 
 
 class sparse_series_from_coo(object):
 
@@ -56,7 +56,7 @@ def setup(self):
         self.no_freq = self.rng7[:50000].append(self.rng7[50002:])
         self.d_freq = self.rng7[:50000].append(self.rng7[50000:])
 
-        self.rng8 = date_range(start='1/1/1700', freq='B', periods=100000)
+        self.rng8 = date_range(start='1/1/1700', freq='B', periods=75000)
         self.b_freq = self.rng8[:50000].append(self.rng8[50000:])
 
     def time_add_timedelta(self):
 
@@ -1,3 +1,2 @@
 python=3.6*
 pytz
-cython
@@ -14,4 +14,7 @@ pip install --pre --upgrade --timeout=60 -f $PRE_WHEELS numpy scipy
 # install dateutil from master
 pip install -U git+git://github.com/dateutil/dateutil.git
 
+# cython via pip
+pip install cython
+
 true
@@ -638,9 +638,11 @@ and allows efficient indexing and storage of an index with a large number of dup
 
 .. ipython:: python
 
+   from pandas.api.types import CategoricalDtype
+
    df = pd.DataFrame({'A': np.arange(6),
                       'B': list('aabbca')})
-   df['B'] = df['B'].astype('category', categories=list('cab'))
+   df['B'] = df['B'].astype(CategoricalDtype(list('cab')))
    df
    df.dtypes
    df.B.cat.categories
 
@@ -646,7 +646,10 @@ strings and apply several methods to it. These can be accessed like
 Categorical
 ~~~~~~~~~~~
 
-If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical
+.. autoclass:: api.types.CategoricalDtype
+   :members: categories, ordered
+
+If the Series is of dtype ``CategoricalDtype``, ``Series.cat`` can be used to change the categorical
 data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the
 following usable methods and properties:
 
 
@@ -89,12 +89,22 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to
     df["B"] = raw_cat
     df
 
-You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``:
+Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of
+
+1. categories are inferred from the data
+2. categories are unordered.
+
+To control those behaviors, instead of passing ``'category'``, use an instance
+of :class:`~pandas.api.types.CategoricalDtype`.
 
 .. ipython:: python
 
-    s = pd.Series(["a","b","c","a"])
-    s_cat = s.astype("category", categories=["b","c","d"], ordered=False)
+    from pandas.api.types import CategoricalDtype
+
+    s = pd.Series(["a", "b", "c", "a"])
+    cat_type = CategoricalDtype(categories=["b", "c", "d"],
+                                ordered=True)
+    s_cat = s.astype(cat_type)
     s_cat
 
 Categorical data has a specific ``category`` :ref:`dtype <basics.dtypes>`:
@@ -133,6 +143,75 @@ constructor to save the factorize step during normal constructor mode:
     splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
     s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
 
+.. _categorical.categoricaldtype:
+
+CategoricalDtype
+----------------
+
+.. versionchanged:: 0.21.0
+
+A categorical's type is fully described by
+
+1. ``categories``: a sequence of unique values and no missing values
+2. ``ordered``: a boolean
+
+This information can be stored in a :class:`~pandas.api.types.CategoricalDtype`.
+The ``categories`` argument is optional, which implies that the actual categories
+should be inferred from whatever is present in the data when the
+:class:`pandas.Categorical` is created. The categories are assumed to be unordered
+by default.      
+
+.. ipython:: python
+
+   from pandas.api.types import CategoricalDtype
+
+   CategoricalDtype(['a', 'b', 'c'])
+   CategoricalDtype(['a', 'b', 'c'], ordered=True)
+   CategoricalDtype()
+
+A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas
+expects a `dtype`. For example :func:`pandas.read_csv`,
+:func:`pandas.DataFrame.astype`, or in the Series constructor.
+
+.. note::
+
+    As a convenience, you can use the string ``'category'`` in place of a
+    :class:`~pandas.api.types.CategoricalDtype` when you want the default behavior of
+    the categories being unordered, and equal to the set values present in the
+    array. In other words, ``dtype='category'`` is equivalent to
+    ``dtype=CategoricalDtype()``.
+
+Equality Semantics
+~~~~~~~~~~~~~~~~~~
+
+Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal
+whenever they have the same categories and orderedness. When comparing two
+unordered categoricals, the order of the ``categories`` is not considered
+
+.. ipython:: python
+
+   c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False)
+
+   # Equal, since order is not considered when ordered=False
+   c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False)
+
+   # Unequal, since the second CategoricalDtype is ordered
+   c1 == CategoricalDtype(['a',  'b', 'c'], ordered=True)
+
+All instances of ``CategoricalDtype`` compare equal to the string ``'category'``
+
+.. ipython:: python
+
+   c1 == 'category'
+
+.. warning::
+
+   Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``,
+   and since all instances ``CategoricalDtype`` compare equal to ``'category'``,
+   all instances of ``CategoricalDtype`` compare equal to a
+   ``CategoricalDtype(None, False)``, regardless of ``categories`` or
+   ``ordered``.
+
 Description
 -----------
 
@@ -184,7 +263,7 @@ It's also possible to pass in the categories in a specific order:
 
     .. ipython:: python
 
-         s = pd.Series(list('babc')).astype('category', categories=list('abcd'))
+         s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd')))
          s
 
          # categories
@@ -301,7 +380,9 @@ meaning and certain operations are possible. If the categorical is unordered, ``
 
     s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False))
     s.sort_values(inplace=True)
-    s = pd.Series(["a","b","c","a"]).astype('category', ordered=True)
+    s = pd.Series(["a","b","c","a"]).astype(
+        CategoricalDtype(ordered=True)
+    )
     s.sort_values(inplace=True)
     s
     s.min(), s.max()
@@ -401,9 +482,15 @@ categories or a categorical with any list-like object, will raise a TypeError.
 
 .. ipython:: python
 
-    cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True)
-    cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True)
-    cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True)
+    cat = pd.Series([1,2,3]).astype(
+        CategoricalDtype([3, 2, 1], ordered=True)
+    )
+    cat_base = pd.Series([2,2,2]).astype(
+        CategoricalDtype([3, 2, 1], ordered=True)
+    )
+    cat_base2 = pd.Series([2,2,2]).astype(
+        CategoricalDtype(ordered=True)
+    )
 
     cat
     cat_base
 
@@ -1060,7 +1060,7 @@ To select from a DataFrame or Series the nth item, use the nth method. This is a
    g.nth(-1)
    g.nth(1)
 
-If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna, for a Series this just needs to be truthy.
+If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'`` just like you would pass to dropna:
 
 .. ipython:: python
 
@@ -1072,7 +1072,7 @@ If you want to select the nth not-null item, use the ``dropna`` kwarg. For a Dat
    g.nth(-1, dropna='any')  # NaNs denote group exhausted when using dropna
    g.last()
 
-   g.B.nth(0, dropna=True)
+   g.B.nth(0, dropna='all')
 
 As with other methods, passing ``as_index=False``, will achieve a filtration, which returns the grouped row.
 
 
@@ -113,8 +113,8 @@ header : int or list of ints, default ``'infer'``
   rather than the first line of the file.
 names : array-like, default ``None``
   List of column names to use. If file contains no header row, then you should
-  explicitly pass ``header=None``. Duplicates in this list are not allowed unless
-  ``mangle_dupe_cols=True``, which is the default.
+  explicitly pass ``header=None``. Duplicates in this list will cause
+    a ``UserWarning`` to be issued.
 index_col :  int or sequence or ``False``, default ``None``
   Column to use as the row labels of the DataFrame. If a sequence is given, a
   MultiIndex is used. If you have a malformed file with delimiters at the end of
 
@@ -830,8 +830,10 @@ The left frame.
 
 .. ipython:: python
 
+   from pandas.api.types import CategoricalDtype
+
    X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,)))
-   X = X.astype('category', categories=['foo', 'bar'])
+   X = X.astype(CategoricalDtype(categories=['foo', 'bar']))
 
    left = pd.DataFrame({'X': X,
                         'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
@@ -842,8 +844,11 @@ The right frame.
 
 .. ipython:: python
 
-   right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']),
-                         'Z': [1, 2]})
+   right = pd.DataFrame({
+        'X': pd.Series(['foo', 'bar'],
+                       dtype=CategoricalDtype(['foo', 'bar'])),
+        'Z': [1, 2]
+   })
    right
    right.dtypes
 
 
@@ -10,6 +10,8 @@ users upgrade to this version.
 Highlights include:
 
 - Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
+- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying
+  categoricals independent of the data, see :ref:`here <whatsnew_0210.enhancements.categorical_dtype>`.
 
 Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations <whatsnew_0210.deprecations>` before updating.
 
@@ -89,6 +91,49 @@ This does not raise any obvious exceptions, but also does not create a new colum
 
 Setting a list-like data structure into a new attribute now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access <indexing.attribute_access>`.
 
+``drop`` now also accepts index/columns keywords
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :meth:`~DataFrame.drop` method has gained ``index``/``columns`` keywords as an
+alternative to specify the ``axis`` and to make it similar in usage to ``reindex``
+(:issue:`12392`).
+
+For example:
+
+.. ipython:: python
+
+    df = pd.DataFrame(np.arange(8).reshape(2,4),
+                      columns=['A', 'B', 'C', 'D'])
+    df
+    df.drop(['B', 'C'], axis=1)
+    # the following is now equivalent
+    df.drop(columns=['B', 'C'])
+
+.. _whatsnew_0210.enhancements.categorical_dtype:
+
+``CategoricalDtype`` for specifying categoricals
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:class:`pandas.api.types.CategoricalDtype` has been added to the public API and
+expanded to include the ``categories`` and ``ordered`` attributes. A
+``CategoricalDtype`` can be used to specify the set of categories and
+orderedness of an array, independent of the data themselves. This can be useful,
+e.g., when converting string data to a ``Categorical`` (:issue:`14711`,
+:issue:`15078`, :issue:`16015`):
+
+.. ipython:: python
+
+   from pandas.api.types import CategoricalDtype
+
+   s = pd.Series(['a', 'b', 'c', 'a'])  # strings
+   dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True)
+   s.astype(dtype)
+
+The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a
+``Series`` with categorical type will now return an instance of ``CategoricalDtype``.
+
+See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more.
+
 .. _whatsnew_0210.enhancements.other:
 
 Other Enhancements
@@ -110,13 +155,14 @@ Other Enhancements
 - :func:`date_range` now accepts 'Y' in addition to 'A' as an alias for end of year (:issue:`9313`)
 - Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`. (:issue:`15838`, :issue:`17438`)
 - :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`)
-- `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).
+- Read/write methods that infer compression (:func:`read_csv`, :func:`read_table`, :func:`read_pickle`, and :meth:`~DataFrame.to_pickle`) can now infer from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).
 - :func:`pd.read_sas()` now recognizes much more of the most frequently used date (datetime) formats in SAS7BDAT files (:issue:`15871`).
 - :func:`DataFrame.items` and :func:`Series.items` is now present in both Python 2 and 3 and is lazy in all cases (:issue:`13918`, :issue:`17213`)
 - :func:`Styler.where` has been implemented. It is as a convenience for :func:`Styler.applymap` and enables simple DataFrame styling on the Jupyter notebook (:issue:`17474`).
 - :func:`MultiIndex.is_monotonic_decreasing` has been implemented.  Previously returned ``False`` in all cases. (:issue:`16554`)
 - :func:`Categorical.rename_categories` now accepts a dict-like argument as `new_categories` and only updates the categories found in that dict. (:issue:`17336`)
 - :func:`read_excel` raises ``ImportError`` with a better message if ``xlrd`` is not installed. (:issue:`17613`)
+- :meth:`DataFrame.assign` will preserve the original order of ``**kwargs`` for Python 3.6+ users instead of sorting the column names
 
 
 .. _whatsnew_0210.api_breaking:
@@ -422,6 +468,7 @@ Other API Changes
 - The Categorical constructor no longer accepts a scalar for the ``categories`` keyword. (:issue:`16022`)
 - Accessing a non-existent attribute on a closed :class:`~pandas.HDFStore` will now
   raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
+- :func:`read_csv` now issues a ``UserWarning`` if the ``names`` parameter contains duplicates (:issue:`17095`)
 - :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
 - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
 - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`).
@@ -498,6 +545,7 @@ Conversion
 - Bug in :func:`Series.fillna` returns frame when ``inplace=True`` and ``value`` is dict (:issue:`16156`)
 - Bug in :attr:`Timestamp.weekday_name` returning a UTC-based weekday name when localized to a timezone (:issue:`17354`)
 - Bug in ``Timestamp.replace`` when replacing ``tzinfo`` around DST changes (:issue:`15683`)
+- Bug in ``Timedelta`` construction and arithmetic that would not propagate the ``Overflow`` exception (:issue:`17367`)
 
 Indexing
 ^^^^^^^^
@@ -517,6 +565,7 @@ Indexing
 - Bug in ``CategoricalIndex`` reindexing in which specified indices containing duplicates were not being respected (:issue:`17323`)
 - Bug in intersection of ``RangeIndex`` with negative step (:issue:`17296`)
 - Bug in ``IntervalIndex`` where performing a scalar lookup fails for included right endpoints of non-overlapping monotonic decreasing indexes (:issue:`16417`, :issue:`17271`)
+- Bug in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` when no valid entry (:issue:`17400`)
 - Bug in ``Series.rename`` when called with a `callable` alters name of series rather than index of series. (:issue:`17407`)
 
 I/O
 
@@ -7,8 +7,6 @@ cimport cython
 
 cnp.import_array()
 
-cimport util
-
 from numpy cimport (ndarray,
                     double_t,
                     int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
 
@@ -8,8 +8,6 @@ from cython cimport Py_ssize_t
 
 np.import_array()
 
-cimport util
-
 from numpy cimport (ndarray,
                     int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
                     uint32_t, uint64_t, float16_t, float32_t, float64_t)
 
@@ -255,7 +255,7 @@ cdef extern from "parser/tokenizer.h":
 
 #    inline int to_complex(char *item, double *p_real,
 #                          double *p_imag, char sci, char decimal)
-    inline int to_longlong(char *item, long long *p_value) nogil
+    int to_longlong(char *item, long long *p_value) nogil
 #    inline int to_longlong_thousands(char *item, long long *p_value,
 #                                     char tsep)
     int to_boolean(const char *item, uint8_t *val) nogil
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,2 @@`
`1`	`1`	`python=3.6*`
`2`	`2`	`pytz`
`3`		`-cython`