Skip to content

Commit 0acce38

Browse files
committed
Merge remote-tracking branch 'upstream/master' into clip
2 parents 89a55c8 + 5f312da commit 0acce38

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

73 files changed

+1739
-695
lines changed

asv_bench/benchmarks/indexing.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,12 @@ def setup(self):
204204
[np.arange(100), list('A'), list('A')],
205205
names=['one', 'two', 'three'])
206206

207+
rng = np.random.RandomState(4)
208+
size = 1 << 16
209+
self.mi_unused_levels = pd.MultiIndex.from_arrays([
210+
rng.randint(0, 1 << 13, size),
211+
rng.randint(0, 1 << 10, size)])[rng.rand(size) < 0.1]
212+
207213
def time_series_xs_mi_ix(self):
208214
self.s.ix[999]
209215

@@ -248,6 +254,9 @@ def time_multiindex_small_get_loc_warm(self):
248254
def time_is_monotonic(self):
249255
self.miint.is_monotonic
250256

257+
def time_remove_unused_levels(self):
258+
self.mi_unused_levels.remove_unused_levels()
259+
251260

252261
class IntervalIndexing(object):
253262
goal_time = 0.2

ci/build_docs.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,15 @@ if [ "$DOC" ]; then
5959
git remote -v
6060

6161
git push origin gh-pages -f
62+
63+
echo "Running doctests"
64+
cd "$TRAVIS_BUILD_DIR"
65+
pytest --doctest-modules \
66+
pandas/core/reshape/concat.py \
67+
pandas/core/reshape/pivot.py \
68+
pandas/core/reshape/reshape.py \
69+
pandas/core/reshape/tile.py
70+
6271
fi
6372

6473
exit 0

doc/source/api.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -724,6 +724,7 @@ Serialization / IO / Conversion
724724
Series.to_dense
725725
Series.to_string
726726
Series.to_clipboard
727+
Series.to_latex
727728

728729
Sparse
729730
~~~~~~
@@ -1285,6 +1286,8 @@ Attributes
12851286
Index.is_monotonic
12861287
Index.is_monotonic_increasing
12871288
Index.is_monotonic_decreasing
1289+
Index.is_strictly_monotonic_increasing
1290+
Index.is_strictly_monotonic_decreasing
12881291
Index.is_unique
12891292
Index.has_duplicates
12901293
Index.dtype
@@ -1704,6 +1707,7 @@ Computations / Descriptive Stats
17041707
GroupBy.mean
17051708
GroupBy.median
17061709
GroupBy.min
1710+
GroupBy.ngroup
17071711
GroupBy.nth
17081712
GroupBy.ohlc
17091713
GroupBy.prod

doc/source/groupby.rst

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1122,12 +1122,36 @@ To see the order in which each row appears within its group, use the
11221122

11231123
.. ipython:: python
11241124
1125-
df = pd.DataFrame(list('aaabba'), columns=['A'])
1126-
df
1125+
dfg = pd.DataFrame(list('aaabba'), columns=['A'])
1126+
dfg
1127+
1128+
dfg.groupby('A').cumcount()
1129+
1130+
dfg.groupby('A').cumcount(ascending=False)
1131+
1132+
.. _groupby.ngroup:
1133+
1134+
Enumerate groups
1135+
~~~~~~~~~~~~~~~~
1136+
1137+
.. versionadded:: 0.20.2
1138+
1139+
To see the ordering of the groups (as opposed to the order of rows
1140+
within a group given by ``cumcount``) you can use the ``ngroup``
1141+
method.
1142+
1143+
Note that the numbers given to the groups match the order in which the
1144+
groups would be seen when iterating over the groupby object, not the
1145+
order they are first observed.
1146+
1147+
.. ipython:: python
11271148
1128-
df.groupby('A').cumcount()
1149+
dfg = pd.DataFrame(list('aaabba'), columns=['A'])
1150+
dfg
11291151
1130-
df.groupby('A').cumcount(ascending=False) # kwarg only
1152+
dfg.groupby('A').ngroup()
1153+
1154+
dfg.groupby('A').ngroup(ascending=False)
11311155
11321156
Plotting
11331157
~~~~~~~~
@@ -1176,14 +1200,41 @@ Regroup columns of a DataFrame according to their sum, and sum the aggregated on
11761200
df
11771201
df.groupby(df.sum(), axis=1).sum()
11781202
1203+
.. _groupby.multicolumn_factorization:
1204+
1205+
Multi-column factorization
1206+
~~~~~~~~~~~~~~~~~~~~~~~~~~
1207+
1208+
By using ``.ngroup()``, we can extract information about the groups in
1209+
a way similar to :func:`factorize` (as described further in the
1210+
:ref:`reshaping API <reshaping.factorize>`) but which applies
1211+
naturally to multiple columns of mixed type and different
1212+
sources. This can be useful as an intermediate categorical-like step
1213+
in processing, when the relationships between the group rows are more
1214+
important than their content, or as input to an algorithm which only
1215+
accepts the integer encoding. (For more information about support in
1216+
pandas for full categorical data, see the :ref:`Categorical
1217+
introduction <categorical>` and the
1218+
:ref:`API documentation <api.categorical>`.)
1219+
1220+
.. ipython:: python
1221+
1222+
dfg = pd.DataFrame({"A": [1, 1, 2, 3, 2], "B": list("aaaba")})
1223+
1224+
dfg
1225+
1226+
dfg.groupby(["A", "B"]).ngroup()
1227+
1228+
dfg.groupby(["A", [0, 0, 0, 1, 1]]).ngroup()
1229+
11791230
Groupby by Indexer to 'resample' data
11801231
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
11811232

1182-
Resampling produces new hypothetical samples(resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.
1233+
Resampling produces new hypothetical samples (resamples) from already existing observed data or from a model that generates data. These new samples are similar to the pre-existing samples.
11831234

11841235
In order for resample to work on indices that are non-datetimelike, the following procedure can be utilized.
11851236

1186-
In the following examples, **df.index // 5** returns a binary array which is used to determine what get's selected for the groupby operation.
1237+
In the following examples, **df.index // 5** returns an integer array of bin labels which is used to determine what gets selected for the groupby operation.
11871238

11881239
.. note:: The below example shows how we can downsample by consolidation of samples into fewer samples. Here by using **df.index // 5**, we are aggregating the samples in bins. By applying **std()** function, we aggregate the information contained in many samples into a small subset of values which is their standard deviation thereby reducing the number of samples.
11891240

doc/source/io.rst

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -226,8 +226,8 @@ NA and Missing Data Handling
226226
na_values : scalar, str, list-like, or dict, default ``None``
227227
Additional strings to recognize as NA/NaN. If dict passed, specific per-column
228228
NA values. By default the following values are interpreted as NaN:
229-
``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA',
230-
'#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''``.
229+
``'-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', 'NA',
230+
'#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''``.
231231
keep_default_na : boolean, default ``True``
232232
If na_values are specified and keep_default_na is ``False`` the default NaN
233233
values are overridden, otherwise they're appended to.
@@ -2739,11 +2739,6 @@ should be passed to ``index_col`` and ``header``
27392739
import os
27402740
os.remove('path_to_file.xlsx')
27412741
2742-
.. warning::
2743-
2744-
Excel files saved in version 0.16.2 or prior that had index names will still able to be read in,
2745-
but the ``has_index_names`` argument must specified to ``True``.
2746-
27472742
27482743
Parsing Specific Columns
27492744
++++++++++++++++++++++++

doc/source/reshaping.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -636,7 +636,7 @@ When a column contains only one level, it will be omitted in the result.
636636
637637
pd.get_dummies(df, drop_first=True)
638638
639-
639+
.. _reshaping.factorize:
640640

641641
Factorizing values
642642
------------------

doc/source/whatsnew/v0.20.0.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -515,7 +515,6 @@ Other Enhancements
515515
- Options added to allow one to turn on/off using ``bottleneck`` and ``numexpr``, see :ref:`here <basics.accelerate>` (:issue:`16157`)
516516
- ``DataFrame.style.bar()`` now accepts two more options to further customize the bar chart. Bar alignment is set with ``align='left'|'mid'|'zero'``, the default is "left", which is backward compatible; You can now pass a list of ``color=[color_negative, color_positive]``. (:issue:`14757`)
517517

518-
519518
.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
520519

521520

doc/source/whatsnew/v0.20.2.txt

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@ Enhancements
2020
~~~~~~~~~~~~
2121

2222
- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)
23+
- ``Series`` provides a ``to_latex`` method (:issue:`16180`)
24+
- Added :attr:`Index.is_strictly_monotonic_increasing` and :attr:`Index.is_strictly_monotonic_decreasing` properties (:issue:`16515`)
25+
26+
- A new groupby method :meth:`~pandas.core.groupby.GroupBy.ngroup`,
27+
parallel to the existing :meth:`~pandas.core.groupby.GroupBy.cumcount`,
28+
has been added to return the group order (:issue:`11642`); see
29+
:ref:`here <groupby.ngroup>`.
2330

2431
.. _whatsnew_0202.performance:
2532

@@ -30,15 +37,23 @@ Performance Improvements
3037
- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
3138
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)
3239
- Improved performance of groupby with categorical groupers (:issue:`16413`)
40+
- Improved performance of ``MultiIndex.remove_unused_levels()`` (:issue:`16556`)
3341

3442
.. _whatsnew_0202.bug_fixes:
3543

3644
Bug Fixes
3745
~~~~~~~~~
3846

47+
- Silenced a warning on some Windows environments about "tput: terminal attributes: No such device or address" when
48+
detecting the terminal size. This fix only applies to python 3 (:issue:`16496`)
3949
- Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`)
50+
- Bug in ``Index.symmetric_difference()`` on two equal MultiIndexes, resulting in a ``TypeError`` (:issue:`13490`)
4051
- Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN`` values (:issue:`15593`)
41-
52+
- Passing an invalid engine to :func:`read_csv` now raises an informative
53+
``ValueError`` rather than ``UnboundLocalError``. (:issue:`16511`)
54+
- Bug in :func:`unique` on an array of tuples (:issue:`16519`)
55+
- Bug in :func:`cut` when ``labels`` are set, resulting in incorrect label ordering (:issue:`16459`)
56+
- Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on Categoricals (:issue:`16409`)
4257

4358
Conversion
4459
^^^^^^^^^^
@@ -51,15 +66,19 @@ Indexing
5166
^^^^^^^^
5267

5368
- Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`)
54-
69+
- Bug in partial string indexing with a monotonic, but not strictly-monotonic, index incorrectly reversing the slice bounds (:issue:`16515`)
70+
- Bug in ``MultiIndex.remove_unused_levels()`` (:issue:`16556`)
5571

5672
I/O
5773
^^^
5874

5975
- Bug in ``pd.read_csv()`` when ``comment`` is passed in space delimited text files (:issue:`16472`)
6076
- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
6177
- Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`)
78+
- Bug in ``pd.read_csv()`` in which tarfile object inputs were raising an error in Python 2.x for the C engine (:issue:`16530`)
79+
- Bug where ``DataFrame.to_html()`` ignored the ``index_names`` parameter (:issue:`16493`)
6280

81+
- Bug in ``HDFStore.select_as_multiple()`` where start/stop arguments were not respected (:issue:`16209`)
6382

6483
Plotting
6584
^^^^^^^^
@@ -75,6 +94,8 @@ Groupby/Resample/Rolling
7594
^^^^^^^^^^^^^^^^^^^^^^^^
7695

7796
- Bug creating datetime rolling window on an empty DataFrame (:issue:`15819`)
97+
- Bug in ``rolling.cov()`` with offset window (:issue:`16058`)
98+
- Bug in ``.resample()`` and ``.groupby()`` when aggregating on integers (:issue:`16361`)
7899

79100

80101
Sparse
@@ -89,6 +110,7 @@ Reshaping
89110
- Bug in ``pd.wide_to_long()`` where no error was raised when ``i`` was not a unique identifier (:issue:`16382`)
90111
- Bug in ``Series.isin(..)`` with a list of tuples (:issue:`16394`)
91112
- Bug in construction of a ``DataFrame`` with mixed dtypes including an all-NaT column. (:issue:`16395`)
113+
- Bug in ``DataFrame.agg()`` and ``Series.agg()`` with aggregating on non-callable attributes (:issue:`16405`)
92114

93115

94116
Numeric

doc/source/whatsnew/v0.21.0.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ Other Enhancements
3737
- :func:`api.types.infer_dtype` now infers decimals. (:issue:`15690`)
3838
- :func:`read_feather` has gained the ``nthreads`` parameter for multi-threaded operations (:issue:`16359`)
3939
- :func:`DataFrame.clip()` and :func:`Series.clip()` have gained an inplace argument. (:issue:`15388`)
40+
- :func:`crosstab` has gained a ``margins_name`` parameter to define the name of the row / column that will contain the totals when margins=True. (:issue:`15972`)
4041

4142
.. _whatsnew_0210.api_breaking:
4243

@@ -48,6 +49,8 @@ Backwards incompatible API changes
4849

4950
- Accessing a non-existent attribute on a closed :class:`HDFStore` will now
5051
raise an ``AttributeError`` rather than a ``ClosedFileError`` (:issue:`16301`)
52+
- :func:`read_csv` now treats ``'null'`` strings as missing values by default (:issue:`16471`)
53+
- :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`)
5154

5255
.. _whatsnew_0210.api:
5356

@@ -69,6 +72,7 @@ Deprecations
6972
Removal of prior version deprecations/changes
7073
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
7174

75+
- ``pd.read_excel()`` has dropped the ``has_index_names`` parameter (:issue:`10967`)
7276

7377

7478
.. _whatsnew_0210.performance:

pandas/_libs/index.pyx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ cdef class IndexEngine:
152152

153153
try:
154154
return self.mapping.get_item(val)
155-
except TypeError:
155+
except (TypeError, ValueError):
156156
raise KeyError(val)
157157

158158
cdef inline _get_loc_duplicates(self, object val):
@@ -470,7 +470,7 @@ cdef class DatetimeEngine(Int64Engine):
470470
try:
471471
val = _to_i8(val)
472472
return self.mapping.get_item(val)
473-
except TypeError:
473+
except (TypeError, ValueError):
474474
self._date_check_type(val)
475475
raise KeyError(val)
476476

pandas/_libs/parsers.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ DEFAULT_CHUNKSIZE = 256 * 1024
277277
# no longer excluding inf representations
278278
# '1.#INF','-1.#INF', '1.#INF000000',
279279
_NA_VALUES = [b'-1.#IND', b'1.#QNAN', b'1.#IND', b'-1.#QNAN',
280-
b'#N/A N/A', b'NA', b'#NA', b'NULL', b'NaN',
280+
b'#N/A N/A', b'n/a', b'NA', b'#NA', b'NULL', b'null', b'NaN',
281281
b'nan', b'']
282282

283283

pandas/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,13 @@ def spmatrix(request):
4545
tm._skip_if_no_scipy()
4646
from scipy import sparse
4747
return getattr(sparse, request.param + '_matrix')
48+
49+
50+
@pytest.fixture
51+
def ip():
52+
"""An instance of IPython.InteractiveShell.
53+
Will raise a skip if IPython is not installed.
54+
"""
55+
pytest.importorskip('IPython', minversion="6.0.0")
56+
from IPython.core.interactiveshell import InteractiveShell
57+
return InteractiveShell()

pandas/core/algorithms.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ def _ensure_arraylike(values):
163163
ABCIndexClass, ABCSeries)):
164164
inferred = lib.infer_dtype(values)
165165
if inferred in ['mixed', 'string', 'unicode']:
166-
values = np.asarray(values, dtype=object)
166+
values = lib.list_to_object_array(values)
167167
else:
168168
values = np.asarray(values)
169169
return values
@@ -328,6 +328,11 @@ def unique(values):
328328
[b, a, c]
329329
Categories (3, object): [a < b < c]
330330
331+
An array of tuples
332+
333+
>>> pd.unique([('a', 'b'), ('b', 'a'), ('a', 'c'), ('b', 'a')])
334+
array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
335+
331336
See Also
332337
--------
333338
pandas.Index.unique

pandas/core/base.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ def aggregate(self, func, *args, **kwargs):
378378
def _try_aggregate_string_function(self, arg, *args, **kwargs):
379379
"""
380380
if arg is a string, then try to operate on it:
381-
- try to find a function on ourselves
381+
- try to find a function (or attribute) on ourselves
382382
- try to find a numpy function
383383
- raise
384384
@@ -387,7 +387,15 @@ def _try_aggregate_string_function(self, arg, *args, **kwargs):
387387

388388
f = getattr(self, arg, None)
389389
if f is not None:
390-
return f(*args, **kwargs)
390+
if callable(f):
391+
return f(*args, **kwargs)
392+
393+
# people may try to aggregate on a non-callable attribute
394+
# but don't let them think they can pass args to it
395+
assert len(args) == 0
396+
assert len([kwarg for kwarg in kwargs
397+
if kwarg not in ['axis', '_level']]) == 0
398+
return f
391399

392400
f = getattr(np, arg, None)
393401
if f is not None:

pandas/core/categorical.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,13 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False):
342342
self._categories = categories
343343
self._codes = coerce_indexer_dtype(codes, categories)
344344

345+
def __dir__(self):
346+
# Avoid IPython warnings for deprecated properties
347+
# https://github.com/pandas-dev/pandas/issues/16409
348+
rv = set(dir(type(self)))
349+
rv.discard("labels")
350+
return sorted(rv)
351+
345352
@property
346353
def _constructor(self):
347354
return Categorical

0 commit comments

Comments
 (0)