From 7b41c7f4941c9aacf2c965c4dfac5ec057172e72 Mon Sep 17 00:00:00 2001 From: keitakurita Date: Fri, 12 May 2017 08:00:04 +0900 Subject: [PATCH 01/44] BUG: incorrect handling of scipy.sparse.dok formats (#16197) (#16191) (cherry picked from commit 1c0b63281db0486aa8182d550e9bceb641e5f9a4) --- doc/source/whatsnew/v0.20.2.txt | 3 +-- pandas/core/sparse/frame.py | 2 +- pandas/tests/sparse/test_frame.py | 28 +++++++++++++++++++++++++--- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 504f8004bc8a6..7403bf4c183f1 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -66,8 +66,7 @@ Groupby/Resample/Rolling Sparse ^^^^^^ - - +- Bug in construction of SparseDataFrame from ``scipy.sparse.dok_matrix`` (:issue:`16179`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 3c8f6e8c6257d..461dd50c5da6e 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -190,8 +190,8 @@ def _init_spmatrix(self, data, index, columns, dtype=None, values = Series(data.data, index=data.row, copy=False) for col, rowvals in values.groupby(data.col): # get_blocks expects int32 row indices in sorted order + rowvals = rowvals.sort_index() rows = rowvals.index.values.astype(np.int32) - rows.sort() blocs, blens = get_blocks(rows) sdict[columns[col]] = SparseSeries( diff --git a/pandas/tests/sparse/test_frame.py b/pandas/tests/sparse/test_frame.py index 0312b76ec30a5..654d12b782f37 100644 --- a/pandas/tests/sparse/test_frame.py +++ b/pandas/tests/sparse/test_frame.py @@ -1146,8 +1146,8 @@ def test_isnotnull(self): tm.assert_frame_equal(res.to_dense(), exp) -@pytest.mark.parametrize('index', [None, list('ab')]) # noqa: F811 -@pytest.mark.parametrize('columns', [None, list('cd')]) +@pytest.mark.parametrize('index', [None, list('abc')]) # noqa: F811 +@pytest.mark.parametrize('columns', [None, list('def')]) @pytest.mark.parametrize('fill_value', [None, 0, np.nan]) @pytest.mark.parametrize('dtype', [bool, int, float, np.uint16]) def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): @@ -1156,7 +1156,9 @@ def test_from_to_scipy(spmatrix, index, columns, fill_value, dtype): # Make one ndarray and from it one sparse matrix, both to be used for # constructing frames and comparing results - arr = np.eye(2, dtype=dtype) + arr = np.eye(3, dtype=dtype) + # GH 16179 + arr[0, 1] = dtype(2) try: spm = spmatrix(arr) assert spm.dtype == arr.dtype @@ -1245,6 +1247,26 @@ def test_from_to_scipy_object(spmatrix, fill_value): assert sdf.to_coo().dtype == res_dtype +def test_from_scipy_correct_ordering(spmatrix): + # GH 16179 + tm.skip_if_no_package('scipy') + + arr = np.arange(1, 5).reshape(2, 2) + try: + spm = spmatrix(arr) + assert spm.dtype == arr.dtype + except (TypeError, AssertionError): + # If conversion to sparse fails for this spmatrix type and arr.dtype, + # then the combination is not currently supported in NumPy, so we + # can just skip testing it thoroughly + return + + sdf = pd.SparseDataFrame(spm) + expected = pd.SparseDataFrame(arr) + tm.assert_sp_frame_equal(sdf, expected) + tm.assert_frame_equal(sdf.to_dense(), expected.to_dense()) + + class TestSparseDataFrameArithmetic(object): def test_numeric_op_scalar(self): From 7858082675bf6750bd5b7a3d9b06178768b5df17 Mon Sep 17 00:00:00 2001 From: linebp Date: Fri, 12 May 2017 00:55:09 +0200 Subject: [PATCH 02/44] Unblock supported compression libs in pytables (#16196) (cherry picked from commit 
541e8e83b47c68afcf0034f7094d97c3645ca48b) --- doc/source/whatsnew/v0.20.2.txt | 2 +- pandas/core/generic.py | 17 +++++++++------ pandas/io/pytables.py | 24 +++++++++++++-------- pandas/tests/io/test_pytables.py | 37 ++++++++++++++++++++++++++++++-- 4 files changed, 62 insertions(+), 18 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 7403bf4c183f1..65d36c9cb7f5d 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -19,7 +19,7 @@ Highlights include: Enhancements ~~~~~~~~~~~~ - +- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) .. _whatsnew_0201.performance: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 27a489293db8f..f89e4f87ce9fd 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1266,12 +1266,17 @@ def to_hdf(self, path_or_buf, key, **kwargs): `__. Applicable only to format='table'. - complevel : int, 1-9, default 0 - If a complib is specified compression will be applied - where possible - complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None - If complevel is > 0 apply compression to objects written - in the store wherever possible + complevel : int, 0-9, default 0 + Specifies a compression level for data. + A value of 0 disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None + Specifies the compression library to be used. + As of v0.20.2 these additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. fletcher32 : bool, default False If applying compression use the fletcher32 checksum dropna : boolean, default False. diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 17bedd016f617..f017421c1f83a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -402,12 +402,17 @@ class HDFStore(StringMixin): and if the file does not exist it is created. ``'r+'`` It is similar to ``'a'``, but the file must already exist. - complevel : int, 1-9, default 0 - If a complib is specified compression will be applied - where possible - complib : {'zlib', 'bzip2', 'lzo', 'blosc', None}, default None - If complevel is > 0 apply compression to objects written - in the store wherever possible + complevel : int, 0-9, default 0 + Specifies a compression level for data. + A value of 0 disables compression. + complib : {'zlib', 'lzo', 'bzip2', 'blosc', None}, default None + Specifies the compression library to be used. + As of v0.20.2 these additional compressors for Blosc are supported + (default if no compressor specified: 'blosc:blosclz'): + {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', + 'blosc:zlib', 'blosc:zstd'}. + Specifying a compression library which is not available issues + a ValueError. 
fletcher32 : bool, default False If applying compression use the fletcher32 checksum @@ -430,9 +435,10 @@ def __init__(self, path, mode=None, complevel=None, complib=None, raise ImportError('HDFStore requires PyTables, "{ex}" problem ' 'importing'.format(ex=str(ex))) - if complib not in (None, 'blosc', 'bzip2', 'lzo', 'zlib'): - raise ValueError("complib only supports 'blosc', 'bzip2', lzo' " - "or 'zlib' compression.") + if complib is not None and complib not in tables.filters.all_complibs: + raise ValueError( + "complib only supports {libs} compression.".format( + libs=tables.filters.all_complibs)) self._path = path if mode is None: diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index ee44fea55e51a..50027539e92ea 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -734,6 +734,39 @@ def test_put_compression_blosc(self): store.put('c', df, format='table', complib='blosc') tm.assert_frame_equal(store['c'], df) + def test_complibs(self): + # GH14478 + df = tm.makeDataFrame() + + # Building list of all complibs and complevels tuples + all_complibs = tables.filters.all_complibs + # Remove lzo if its not available on this platform + if not tables.which_lib_version('lzo'): + all_complibs.remove('lzo') + all_levels = range(0, 10) + all_tests = [(lib, lvl) for lib in all_complibs for lvl in all_levels] + + for (lib, lvl) in all_tests: + with ensure_clean_path(self.path) as tmpfile: + gname = 'foo' + + # Write and read file to see if data is consistent + df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl) + result = pd.read_hdf(tmpfile, gname) + tm.assert_frame_equal(result, df) + + # Open file and check metadata + # for correct amount of compression + h5table = tables.open_file(tmpfile, mode='r') + for node in h5table.walk_nodes(where='/' + gname, + classname='Leaf'): + assert node.filters.complevel == lvl + if lvl == 0: + assert node.filters.complib is None + else: + assert node.filters.complib == lib + h5table.close() + def test_put_integer(self): # non-date, non-string index df = DataFrame(np.random.randn(50, 100)) @@ -4939,8 +4972,8 @@ def test_invalid_complib(self): index=list('abcd'), columns=list('ABCDE')) with ensure_clean_path(self.path) as path: - pytest.raises(ValueError, df.to_hdf, path, - 'df', complib='blosc:zlib') + with pytest.raises(ValueError): + df.to_hdf(path, 'df', complib='foolib') # GH10443 def test_read_nokey(self): From e5cf9b618f1dd117270823fd11614cd26ee802cc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 11 May 2017 11:07:28 -0500 Subject: [PATCH 03/44] BUG: Accept list-like color with single col in plot (#16233) Closes #3486 (cherry picked from commit 1bee0357a97c2c3d79adcd5f120773d7627baca0) --- doc/source/whatsnew/v0.20.2.txt | 2 ++ pandas/plotting/_core.py | 3 ++- pandas/tests/plotting/test_frame.py | 5 +++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 65d36c9cb7f5d..f5d4568352277 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -54,6 +54,8 @@ I/O Plotting ^^^^^^^^ +- Bug in ``DataFrame.plot`` with a single column and a list-like ``color`` (:issue:`3486`) + diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index e88979b14c8af..c0f9f62106330 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -180,7 +180,8 @@ def _validate_color_args(self): colors = self.kwds.pop('colors') self.kwds['color'] = colors - if ('color' in self.kwds 
and self.nseries == 1): + if ('color' in self.kwds and self.nseries == 1 and + not is_list_like(self.kwds['color'])): # support series.plot(color='green') self.kwds['color'] = [self.kwds['color']] diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py index 4a4a71d7ea639..2de8c9acff98c 100644 --- a/pandas/tests/plotting/test_frame.py +++ b/pandas/tests/plotting/test_frame.py @@ -153,6 +153,11 @@ def test_mpl2_color_cycle_str(self): else: pytest.skip("not supported in matplotlib < 2.0.0") + def test_color_single_series_list(self): + # GH 3486 + df = DataFrame({"A": [1, 2, 3]}) + _check_plot_works(df.plot, color=['red']) + def test_color_empty_string(self): df = DataFrame(randn(10, 2)) with pytest.raises(ValueError): From d538851994309049d365b086e0d1fb971627e7a2 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 May 2017 18:05:06 -0400 Subject: [PATCH 04/44] TST: release testing of downstream packages (#16261) (cherry picked from commit b9798161b37109020048c101753fd904536970ad) --- .travis.yml | 2 +- ci/install_travis.sh | 27 +++++++----- ci/script_multi.sh | 20 ++++++--- ci/script_single.sh | 4 +- pandas/tests/test_downstream.py | 44 ++++++++++++++----- scripts/build_dist.sh | 6 +-- .../build_dist_for_release.sh | 2 +- 7 files changed, 67 insertions(+), 38 deletions(-) rename ci/install_release_build.sh => scripts/build_dist_for_release.sh (69%) diff --git a/.travis.yml b/.travis.yml index e5e05ed26da56..f0ece15de65db 100644 --- a/.travis.yml +++ b/.travis.yml @@ -123,7 +123,7 @@ after_success: after_script: - echo "after_script start" - - source activate pandas && python -c "import pandas; pandas.show_versions();" + - source activate pandas && cd /tmp && python -c "import pandas; pandas.show_versions();" - if [ -e /tmp/single.xml ]; then ci/print_skipped.py /tmp/single.xml; fi diff --git a/ci/install_travis.sh b/ci/install_travis.sh index 601edded29f5a..8cf6f2ce636da 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -119,15 +119,7 @@ if [ "$COVERAGE" ]; then fi echo -if [ "$BUILD_TEST" ]; then - - # build & install testing - echo ["Starting installation test."] - bash ci/install_release_build.sh - conda uninstall -y cython - time pip install dist/*tar.gz || exit 1 - -else +if [ -z "$BUILD_TEST" ]; then # build but don't install echo "[build em]" @@ -163,9 +155,22 @@ fi # w/o removing anything else echo echo "[removing installed pandas]" -conda remove pandas --force +conda remove pandas -y --force -if [ -z "$BUILD_TEST" ]; then +if [ "$BUILD_TEST" ]; then + + # remove any installation + pip uninstall -y pandas + conda list pandas + pip list --format columns |grep pandas + + # build & install testing + echo ["building release"] + bash scripts/build_dist_for_release.sh + conda uninstall -y cython + time pip install dist/*tar.gz || exit 1 + +else # install our pandas echo diff --git a/ci/script_multi.sh b/ci/script_multi.sh index 663d2feb5be23..daa929e177666 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -19,20 +19,26 @@ export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 429496 echo PYTHONHASHSEED=$PYTHONHASHSEED if [ "$BUILD_TEST" ]; then - echo "build-test" + echo "[build-test]" + + echo "[env]" + pip list --format columns |grep pandas + + echo "[running]" cd /tmp - pwd - conda list pandas - echo "running" - python -c "import pandas; pandas.test(['-n 2'])" + unset PYTHONPATH + python -c "import pandas; pandas.test(['-n 2', '--skip-slow', '--skip-network', '-r xX'])" + elif [ "$DOC" ]; then echo "We are not 
running pytest as this is a doc-build" + elif [ "$COVERAGE" ]; then echo pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas pytest -s -n 2 -m "not single" --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + else - echo pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas - pytest -n 2 -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas + pytest -n 2 -r xX -m "not single" --junitxml=/tmp/multiple.xml $TEST_ARGS pandas # TODO: doctest fi RET="$?" diff --git a/ci/script_single.sh b/ci/script_single.sh index db637679f0e0f..245b4e6152c4d 100755 --- a/ci/script_single.sh +++ b/ci/script_single.sh @@ -20,8 +20,8 @@ elif [ "$COVERAGE" ]; then echo pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas pytest -s -m "single" --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=/tmp/single.xml $TEST_ARGS pandas else - echo pytest -m "single" --junitxml=/tmp/single.xml $TEST_ARGS pandas - pytest -m "single" --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest + echo pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas + pytest -m "single" -r xX --junitxml=/tmp/single.xml $TEST_ARGS pandas # TODO: doctest fi RET="$?" diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 2baedb82aa2a7..12976272cb8b1 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -4,7 +4,28 @@ import pytest import numpy as np # noqa from pandas import DataFrame +from pandas.compat import PY36 from pandas.util import testing as tm +import importlib + + +def import_module(name): + # we *only* want to skip if the module is truly not available + # and NOT just an actual import error because of pandas changes + + if PY36: + try: + return importlib.import_module(name) + except ModuleNotFoundError: # noqa + pytest.skip("skipping as {} not available".format(name)) + + else: + try: + return importlib.import_module(name) + except ImportError as e: + if "No module named" in str(e) and name in str(e): + pytest.skip("skipping as {} not available".format(name)) + raise @pytest.fixture @@ -14,8 +35,8 @@ def df(): def test_dask(df): - toolz = pytest.importorskip('toolz') # noqa - dask = pytest.importorskip('dask') # noqa + toolz = import_module('toolz') # noqa + dask = import_module('dask') # noqa import dask.dataframe as dd @@ -26,14 +47,14 @@ def test_dask(df): def test_xarray(df): - xarray = pytest.importorskip('xarray') # noqa + xarray = import_module('xarray') # noqa assert df.to_xarray() is not None def test_statsmodels(): - statsmodels = pytest.importorskip('statsmodels') # noqa + statsmodels = import_module('statsmodels') # noqa import statsmodels.api as sm import statsmodels.formula.api as smf df = sm.datasets.get_rdataset("Guerry", "HistData").data @@ -42,7 +63,7 @@ def test_statsmodels(): def test_scikit_learn(df): - sklearn = pytest.importorskip('sklearn') # noqa + sklearn = import_module('sklearn') # noqa from sklearn import svm, datasets digits = datasets.load_digits() @@ -53,33 +74,34 @@ def test_scikit_learn(df): def test_seaborn(): - seaborn = pytest.importorskip('seaborn') + seaborn = import_module('seaborn') tips = seaborn.load_dataset("tips") seaborn.stripplot(x="day", y="total_bill", data=tips) def 
test_pandas_gbq(df): - pandas_gbq = pytest.importorskip('pandas-gbq') # noqa + pandas_gbq = import_module('pandas_gbq') # noqa -@tm.network +@pytest.mark.xfail(reason=("pandas_datareader<=0.3.0 " + "broken w.r.t. pandas >= 0.20.0")) def test_pandas_datareader(): - pandas_datareader = pytest.importorskip('pandas-datareader') # noqa + pandas_datareader = import_module('pandas_datareader') # noqa pandas_datareader.get_data_yahoo('AAPL') def test_geopandas(): - geopandas = pytest.importorskip('geopandas') # noqa + geopandas = import_module('geopandas') # noqa fp = geopandas.datasets.get_path('naturalearth_lowres') assert geopandas.read_file(fp) is not None def test_pyarrow(df): - pyarrow = pytest.importorskip('pyarrow') # noqa + pyarrow = import_module('pyarrow') # noqa table = pyarrow.Table.from_pandas(df) result = table.to_pandas() tm.assert_frame_equal(result, df) diff --git a/scripts/build_dist.sh b/scripts/build_dist.sh index d6a7d0ba67239..c3f849ce7a6eb 100755 --- a/scripts/build_dist.sh +++ b/scripts/build_dist.sh @@ -10,11 +10,7 @@ read -p "Ok to continue (y/n)? " answer case ${answer:0:1} in y|Y ) echo "Building distribution" - rm -rf dist - git clean -xfd - python setup.py clean - python setup.py cython - python setup.py sdist --formats=gztar + ./build_dist_for_release.sh ;; * ) echo "Not building distribution" diff --git a/ci/install_release_build.sh b/scripts/build_dist_for_release.sh similarity index 69% rename from ci/install_release_build.sh rename to scripts/build_dist_for_release.sh index f8373176643fa..e77974ae08b0c 100644 --- a/ci/install_release_build.sh +++ b/scripts/build_dist_for_release.sh @@ -2,7 +2,7 @@ # this requires cython to be installed -# this builds the release cleanly +# this builds the release cleanly & is building on the current checkout rm -rf dist git clean -xfd python setup.py clean From 1e146e4810876b79115d18afe24caa13f1629841 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sat, 6 May 2017 23:50:14 +0200 Subject: [PATCH 05/44] BUG: support for "level=" when reset_index() is called with a single level Index (#16266) (cherry picked from commit 8809b04ffe55013b7e6fd2eab4156b6c3665c9fe) --- doc/source/whatsnew/v0.20.2.txt | 2 +- pandas/core/frame.py | 18 ++++++------ pandas/tests/frame/test_alter_axes.py | 37 ++++++++++++++++++++++++ pandas/tests/series/test_alter_axes.py | 39 ++++++++++++++++++++++++++ 4 files changed, 86 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index f5d4568352277..2c3bdd76dc764 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -42,7 +42,7 @@ Conversion Indexing ^^^^^^^^ - +- Bug in ``DataFrame.reset_index(level=)`` with single level index (:issue:`16263`) I/O diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e6ea58e7e05be..8d437102e4d18 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3012,12 +3012,12 @@ def _maybe_casted_values(index, labels=None): return values new_index = _default_index(len(new_obj)) - if isinstance(self.index, MultiIndex): - if level is not None: - if not isinstance(level, (tuple, list)): - level = [level] - level = [self.index._get_level_number(lev) for lev in level] - if len(level) < len(self.index.levels): + if level is not None: + if not isinstance(level, (tuple, list)): + level = [level] + level = [self.index._get_level_number(lev) for lev in level] + if isinstance(self.index, MultiIndex): + if len(level) < self.index.nlevels: new_index = self.index.droplevel(level) if not 
drop: @@ -3033,6 +3033,8 @@ def _maybe_casted_values(index, labels=None): multi_col = isinstance(self.columns, MultiIndex) for i, (lev, lab) in reversed(list(enumerate(to_insert))): + if not (level is None or i in level): + continue name = names[i] if multi_col: col_name = (list(name) if isinstance(name, tuple) @@ -3049,11 +3051,9 @@ def _maybe_casted_values(index, labels=None): missing = self.columns.nlevels - len(name_lst) name_lst += [col_fill] * missing name = tuple(name_lst) - # to ndarray and maybe infer different dtype level_values = _maybe_casted_values(lev, lab) - if level is None or i in level: - new_obj.insert(0, name, level_values) + new_obj.insert(0, name, level_values) new_obj.index = new_index if not inplace: diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index e6313dfc602a8..fbfbcc14e9150 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -641,6 +641,43 @@ def test_reset_index(self): xp = xp.set_index(['B'], append=True) assert_frame_equal(rs, xp, check_names=False) + def test_reset_index_level(self): + df = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], + columns=['A', 'B', 'C', 'D']) + + for levels in ['A', 'B'], [0, 1]: + # With MultiIndex + result = df.set_index(['A', 'B']).reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index('B')) + + result = df.set_index(['A', 'B']).reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index('B')) + + result = df.set_index(['A', 'B']).reset_index(level=levels) + tm.assert_frame_equal(result, df) + + result = df.set_index(['A', 'B']).reset_index(level=levels, + drop=True) + tm.assert_frame_equal(result, df[['C', 'D']]) + + # With single-level Index (GH 16263) + result = df.set_index('A').reset_index(level=levels[0]) + tm.assert_frame_equal(result, df) + + result = df.set_index('A').reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df) + + result = df.set_index(['A']).reset_index(level=levels[0], + drop=True) + tm.assert_frame_equal(result, df[['B', 'C', 'D']]) + + # Missing levels - for both MultiIndex and single-level Index: + for idx_lev in ['A', 'B'], ['A']: + with tm.assert_raises_regex(KeyError, 'Level E '): + df.set_index(idx_lev).reset_index(level=['A', 'E']) + with tm.assert_raises_regex(IndexError, 'Too many levels'): + df.set_index(idx_lev).reset_index(level=[0, 1, 2]) + def test_reset_index_right_dtype(self): time = np.arange(0.0, 10, np.sqrt(2) / 2) s1 = Series((9.81 * time ** 2) / 2, diff --git a/pandas/tests/series/test_alter_axes.py b/pandas/tests/series/test_alter_axes.py index 150767ee9e2b2..98ae749aaa10e 100644 --- a/pandas/tests/series/test_alter_axes.py +++ b/pandas/tests/series/test_alter_axes.py @@ -141,6 +141,45 @@ def test_reset_index(self): tm.assert_index_equal(rs.index, Index(index.get_level_values(1))) assert isinstance(rs, Series) + def test_reset_index_level(self): + df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], + columns=['A', 'B', 'C']) + + for levels in ['A', 'B'], [0, 1]: + # With MultiIndex + s = df.set_index(['A', 'B'])['C'] + + result = s.reset_index(level=levels[0]) + tm.assert_frame_equal(result, df.set_index('B')) + + result = s.reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df.set_index('B')) + + result = s.reset_index(level=levels) + tm.assert_frame_equal(result, df) + + result = df.set_index(['A', 'B']).reset_index(level=levels, + drop=True) + tm.assert_frame_equal(result, df[['C']]) + + with tm.assert_raises_regex(KeyError, 'Level E '): + 
s.reset_index(level=['A', 'E']) + + # With single-level Index + s = df.set_index('A')['B'] + + result = s.reset_index(level=levels[0]) + tm.assert_frame_equal(result, df[['A', 'B']]) + + result = s.reset_index(level=levels[:1]) + tm.assert_frame_equal(result, df[['A', 'B']]) + + result = s.reset_index(level=levels[0], drop=True) + tm.assert_series_equal(result, df['B']) + + with tm.assert_raises_regex(IndexError, 'Too many levels'): + s.reset_index(level=[0, 1, 2]) + def test_reset_index_range(self): # GH 12071 s = pd.Series(range(2), name='A', dtype='int64') From 38684f33413c32ad6d990f1e767b6257708d4085 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 May 2017 19:22:03 -0400 Subject: [PATCH 06/44] TST: remove xfailing css tests (#16272) (cherry picked from commit 1fdcb3a7e05bba36c9b5781e8f1d996293f6127c) --- pandas/tests/io/formats/test_css.py | 76 +----------------------- pandas/tests/io/formats/test_to_excel.py | 17 +----- 2 files changed, 2 insertions(+), 91 deletions(-) diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py index 44f95266b6c78..c07856dc63602 100644 --- a/pandas/tests/io/formats/test_css.py +++ b/pandas/tests/io/formats/test_css.py @@ -29,32 +29,6 @@ def test_css_parse_normalisation(name, norm, abnorm): assert_same_resolution(norm, abnorm) -@pytest.mark.xfail(reason='CSS comments not yet stripped') -def test_css_parse_comments(): - assert_same_resolution('hello: world', - 'hello/* foo */:/* bar \n */ world /*;not:here*/') - - -@pytest.mark.xfail(reason='''we don't need to handle specificity - markers like !important, but we should - ignore them in the future''') -def test_css_parse_specificity(): - assert_same_resolution('font-weight: bold', 'font-weight: bold !important') - - -@pytest.mark.xfail(reason='Splitting CSS declarations not yet sensitive to ' - '; in CSS strings') -def test_css_parse_strings(): - # semicolons in strings - with tm.assert_produces_warning(CSSWarning): - assert_resolves( - 'background-image: url(\'http://blah.com/foo?a;b=c\')', - {'background-image': 'url(\'http://blah.com/foo?a;b=c\')'}) - assert_resolves( - 'background-image: url("http://blah.com/foo?a;b=c")', - {'background-image': 'url("http://blah.com/foo?a;b=c")'}) - - @pytest.mark.parametrize( 'invalid_css,remainder', [ # No colon @@ -62,15 +36,7 @@ def test_css_parse_strings(): ('border-style: solid; hello-world', 'border-style: solid'), ('border-style: solid; hello-world; font-weight: bold', 'border-style: solid; font-weight: bold'), - # Unclosed string - pytest.mark.xfail(('background-image: "abc', ''), - reason='Unclosed CSS strings not detected'), - pytest.mark.xfail(('font-family: "abc', ''), - reason='Unclosed CSS strings not detected'), - pytest.mark.xfail(('background-image: \'abc', ''), - reason='Unclosed CSS strings not detected'), - pytest.mark.xfail(('font-family: \'abc', ''), - reason='Unclosed CSS strings not detected'), + # Unclosed string fail # Invalid size ('font-size: blah', 'font-size: 1em'), ('font-size: 1a2b', 'font-size: 1em'), @@ -124,46 +90,6 @@ def test_css_side_shorthands(shorthand, expansions): {}) -@pytest.mark.xfail(reason='CSS font shorthand not yet handled') -@pytest.mark.parametrize('css,props', [ - ('font: italic bold 12pt helvetica,sans-serif', - {'font-family': 'helvetica,sans-serif', - 'font-style': 'italic', - 'font-weight': 'bold', - 'font-size': '12pt'}), - ('font: bold italic 12pt helvetica,sans-serif', - {'font-family': 'helvetica,sans-serif', - 'font-style': 'italic', - 'font-weight': 'bold', - 
'font-size': '12pt'}), -]) -def test_css_font_shorthand(css, props): - assert_resolves(css, props) - - -@pytest.mark.xfail(reason='CSS background shorthand not yet handled') -@pytest.mark.parametrize('css,props', [ - ('background: blue', {'background-color': 'blue'}), - ('background: fixed blue', - {'background-color': 'blue', 'background-attachment': 'fixed'}), -]) -def test_css_background_shorthand(css, props): - assert_resolves(css, props) - - -@pytest.mark.xfail(reason='CSS border shorthand not yet handled') -@pytest.mark.parametrize('style,equiv', [ - ('border: 1px solid red', - 'border-width: 1px; border-style: solid; border-color: red'), - ('border: solid red 1px', - 'border-width: 1px; border-style: solid; border-color: red'), - ('border: red solid', - 'border-style: solid; border-color: red'), -]) -def test_css_border_shorthand(style, equiv): - assert_same_resolution(style, equiv) - - @pytest.mark.parametrize('style,inherited,equiv', [ ('margin: 1px; margin: 2px', '', 'margin: 2px'), diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index fff5299921270..cdff3b8a5cca8 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -73,14 +73,7 @@ ('text-shadow: 0px -0em 2px #CCC', {'font': {'shadow': True}}), ('text-shadow: 0px -0em 2px', {'font': {'shadow': True}}), ('text-shadow: 0px -2em', {'font': {'shadow': True}}), - pytest.mark.xfail(('text-shadow: #CCC 3px 3px 3px', - {'font': {'shadow': True}}), - reason='text-shadow with color preceding width not yet ' - 'identified as shadow'), - pytest.mark.xfail(('text-shadow: #999 0px 0px 0px', - {'font': {'shadow': False}}), - reason='text-shadow with color preceding zero width not ' - 'yet identified as non-shadow'), + # FILL # - color, fillType ('background-color: red', {'fill': {'fgColor': 'FF0000', @@ -209,11 +202,3 @@ def test_css_to_excel_multiple(): def test_css_to_excel_inherited(css, inherited, expected): convert = CSSToExcelConverter(inherited) assert expected == convert(css) - - -@pytest.mark.xfail(reason='We are not currently warning for all unconverted ' - 'CSS, but possibly should') -def test_css_to_excel_warns_when_not_supported(): - convert = CSSToExcelConverter() - with pytest.warns(UserWarning): - convert('background: red') From 8f231c8f54f289fc4ad28008f138a0bf79055883 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 May 2017 18:20:12 -0400 Subject: [PATCH 07/44] DOC: add whatsnew 0.20.2 to display (#16273) (cherry picked from commit b8f6556cad57e60a4a522ff6574003b40c06f688) --- doc/source/whatsnew.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index d6fb1c6a8f9cc..ffaeeb78c2799 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ What's New These are new features and improvements of note in each release. +.. include:: whatsnew/v0.20.2.txt + .. include:: whatsnew/v0.20.0.txt .. 
include:: whatsnew/v0.19.2.txt From 817558107e92a3f20ec0d9e43fd6902d69785c33 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sat, 6 May 2017 19:39:28 -0400 Subject: [PATCH 08/44] DOC: change 0.20.1 whatsnew text -> 0.20.2 (#16274) (cherry picked from commit ea56550ded81ff20e6ac77548958231a895264f3) --- doc/source/whatsnew/v0.20.2.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 2c3bdd76dc764..ba9af2cabd1e1 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -1,6 +1,6 @@ -.. _whatsnew_0201: +.. _whatsnew_0202: -v0.20.1 (???) +v0.20.2 (???) ------------- This is a minor bug-fix release in the 0.20.x series and includes some small regression fixes, @@ -9,26 +9,26 @@ We recommend that all users upgrade to this version. Highlights include: -.. contents:: What's new in v0.20.1 +.. contents:: What's new in v0.20.2 :local: :backlinks: none -.. _whatsnew_0201.enhancements: +.. _whatsnew_0202.enhancements: Enhancements ~~~~~~~~~~~~ - Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) -.. _whatsnew_0201.performance: +.. _whatsnew_0202.performance: Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_0201.bug_fixes: +.. _whatsnew_0202.bug_fixes: Bug Fixes ~~~~~~~~~ From c17a3e935d84f37e24be052f7e5d70eb10648dd6 Mon Sep 17 00:00:00 2001 From: SimonBaron Date: Mon, 8 May 2017 11:56:57 +0100 Subject: [PATCH 09/44] DOC: fixed broken link GH16279 (#16281) (cherry picked from commit 9e57d915744623bef31170629bd1996eda3352cf) --- doc/source/style.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/style.ipynb b/doc/source/style.ipynb index 427b18b988aef..4eeda491426b1 100644 --- a/doc/source/style.ipynb +++ b/doc/source/style.ipynb @@ -12,7 +12,7 @@ "\n", "*Provisional: This is a new feature and still under development. We'll be adding features and possibly making breaking changes in future releases. 
We'd love to hear your feedback.*\n", "\n", - "This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/html-styling.ipynb).\n", + "This document is written as a Jupyter Notebook, and can be viewed or downloaded [here](http://nbviewer.ipython.org/github/pandas-dev/pandas/blob/master/doc/source/style.ipynb).\n", "\n", "You can apply **conditional formatting**, the visual styling of a DataFrame\n", "depending on the data within, by using the ``DataFrame.style`` property.\n", From 6f9e90787dfda395d4abd14c12f02dc13662ad3e Mon Sep 17 00:00:00 2001 From: chris-b1 Date: Fri, 12 May 2017 08:29:59 -0500 Subject: [PATCH 10/44] BUG: pathlib.Path in io (#16292) * BUG: pathlib.Path in io * CLN: factor out pathlib roundtrip * add localpath tests for other io * fixup * xfail SAS; type in parser * missing import * xfail for #14704 * fix to_csv * lint * lint cleanup * add feather (xfail) (cherry picked from commit 4cd84582d5ad0fdac5085b12a1affeb6300ba3a3) --- doc/source/whatsnew/v0.20.2.txt | 2 + pandas/io/common.py | 3 ++ pandas/io/formats/format.py | 5 ++- pandas/tests/io/parser/common.py | 13 +++++++ pandas/tests/io/sas/test_sas7bdat.py | 24 ++++++++++++ pandas/tests/io/test_excel.py | 10 +++++ pandas/tests/io/test_feather.py | 13 +++++++ pandas/tests/io/test_packers.py | 12 ++++++ pandas/tests/io/test_pickle.py | 12 ++++++ pandas/tests/io/test_pytables.py | 43 +++++++++++++++++++++ pandas/tests/io/test_stata.py | 12 ++++++ pandas/util/testing.py | 57 ++++++++++++++++++++++++++++ 12 files changed, 204 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index ba9af2cabd1e1..ae5b10ba4030d 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -33,6 +33,8 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`) + Conversion ^^^^^^^^^^ diff --git a/pandas/io/common.py b/pandas/io/common.py index 28f90972f95de..14ac4d366fcef 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -314,6 +314,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, handles = list() f = path_or_buf + + # Convert pathlib.Path/py.path.local or string + path_or_buf = _stringify_path(path_or_buf) is_path = isinstance(path_or_buf, compat.string_types) if compression: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 65098bb2aa404..183d8d9d87d0b 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -32,7 +32,8 @@ OrderedDict, unichr) from pandas.io.formats.terminal import get_terminal_size from pandas.core.config import get_option, set_option -from pandas.io.common import _get_handle, UnicodeWriter, _expand_user +from pandas.io.common import (_get_handle, UnicodeWriter, _expand_user, + _stringify_path) from pandas.io.formats.printing import adjoin, justify, pprint_thing from pandas.io.formats.common import get_level_lengths import pandas.core.common as com @@ -1475,7 +1476,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', if path_or_buf is None: path_or_buf = StringIO() - self.path_or_buf = _expand_user(path_or_buf) + self.path_or_buf = _expand_user(_stringify_path(path_or_buf)) self.sep = sep self.na_rep = na_rep self.float_format = float_format diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index bcce0c6d020ae..31d815a4bca97 100644 
--- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -679,6 +679,19 @@ def test_file(self): tm.assert_frame_equal(url_table, local_table) + def test_path_pathlib(self): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_csv, + lambda p: self.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + + def test_path_localpath(self): + df = tm.makeDataFrame() + result = tm.round_trip_localpath( + df.to_csv, + lambda p: self.read_csv(p, index_col=0)) + tm.assert_frame_equal(df, result) + def test_nonexistent_path(self): # gh-2428: pls no segfault # gh-14086: raise more helpful FileNotFoundError diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py index a5157744038f4..7070c3c7c9382 100644 --- a/pandas/tests/io/sas/test_sas7bdat.py +++ b/pandas/tests/io/sas/test_sas7bdat.py @@ -3,6 +3,7 @@ import pandas.util.testing as tm import os import io +import pytest import numpy as np @@ -65,6 +66,29 @@ def test_from_iterator(self): tm.assert_frame_equal(df, df0.iloc[2:5, :]) rdr.close() + @pytest.mark.xfail(reason="read_sas currently doesn't work with pathlib") + def test_path_pathlib(self): + tm._skip_if_no_pathlib() + from pathlib import Path + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = Path(os.path.join(self.dirpath, "test%d.sas7bdat" % k)) + df = pd.read_sas(fname, encoding='utf-8') + tm.assert_frame_equal(df, df0) + + @pytest.mark.xfail(reason="read_sas currently doesn't work with localpath") + def test_path_localpath(self): + tm._skip_if_no_localpath() + from py.path import local as LocalPath + for j in 0, 1: + df0 = self.data[j] + for k in self.test_ix[j]: + fname = LocalPath(os.path.join(self.dirpath, + "test%d.sas7bdat" % k)) + df = pd.read_sas(fname, encoding='utf-8') + tm.assert_frame_equal(df, df0) + def test_iterator_loop(self): # github #13654 for j in 0, 1: diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py index c70b5937fea3f..b4a5b24616728 100644 --- a/pandas/tests/io/test_excel.py +++ b/pandas/tests/io/test_excel.py @@ -1858,6 +1858,16 @@ def test_freeze_panes(self): result = read_excel(path) tm.assert_frame_equal(expected, result) + def test_path_pathlib(self): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_excel, pd.read_excel) + tm.assert_frame_equal(df, result) + + def test_path_localpath(self): + df = tm.makeDataFrame() + result = tm.round_trip_localpath(df.to_excel, pd.read_excel) + tm.assert_frame_equal(df, result) + def raise_wrapper(major_ver): def versioned_raise_wrapper(orig_method): diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 232bb126d9d67..e3190efecba30 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -9,6 +9,7 @@ from feather import FeatherError from pandas.util.testing import assert_frame_equal, ensure_clean +import pandas.util.testing as tm @pytest.mark.single @@ -114,3 +115,15 @@ def test_write_with_index(self): df.index = [0, 1, 2] df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]), self.check_error_on_write(df, ValueError) + + @pytest.mark.xfail(reason="feather currently doesn't work with pathlib") + def test_path_pathlib(self): + df = tm.makeDataFrame().reset_index() + result = tm.round_trip_pathlib(df.to_feather, pd.read_feather) + tm.assert_frame_equal(df, result) + + @pytest.mark.xfail(reason="feather currently doesn't work with localpath") + def test_path_localpath(self): + df = tm.makeDataFrame().reset_index() + 
result = tm.round_trip_localpath(df.to_feather, pd.read_feather) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py index 4b1145129c364..fd42becca3ac3 100644 --- a/pandas/tests/io/test_packers.py +++ b/pandas/tests/io/test_packers.py @@ -134,6 +134,18 @@ def test_string_io(self): result = read_msgpack(p) tm.assert_frame_equal(result, df) + @pytest.mark.xfail(reason="msgpack currently doesn't work with pathlib") + def test_path_pathlib(self): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_msgpack, read_msgpack) + tm.assert_frame_equal(df, result) + + @pytest.mark.xfail(reason="msgpack currently doesn't work with localpath") + def test_path_localpath(self): + df = tm.makeDataFrame() + result = tm.round_trip_localpath(df.to_msgpack, read_msgpack) + tm.assert_frame_equal(df, result) + def test_iterator_with_string_io(self): dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)] diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 875b5bd3055b9..429ec5ba1c474 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -299,6 +299,18 @@ def test_pickle_v0_15_2(): tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) +def test_pickle_path_pathlib(): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib(df.to_pickle, pd.read_pickle) + tm.assert_frame_equal(df, result) + + +def test_pickle_path_localpath(): + df = tm.makeDataFrame() + result = tm.round_trip_localpath(df.to_pickle, pd.read_pickle) + tm.assert_frame_equal(df, result) + + # --------------------- # test pickle compression # --------------------- diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py index 50027539e92ea..bb29425ff4942 100644 --- a/pandas/tests/io/test_pytables.py +++ b/pandas/tests/io/test_pytables.py @@ -4282,6 +4282,49 @@ def test_select_filter_corner(self): result = store.select('frame', [crit]) tm.assert_frame_equal(result, df.loc[:, df.columns[:75:2]]) + def test_path_pathlib(self): + df = tm.makeDataFrame() + + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, 'df'), + lambda p: pd.read_hdf(p, 'df')) + tm.assert_frame_equal(df, result) + + @pytest.mark.xfail(reason='pathlib currently doesnt work with HDFStore') + def test_path_pathlib_hdfstore(self): + df = tm.makeDataFrame() + + def writer(path): + with pd.HDFStore(path) as store: + df.to_hdf(store, 'df') + + def reader(path): + with pd.HDFStore(path) as store: + pd.read_hdf(store, 'df') + result = tm.round_trip_pathlib(writer, reader) + tm.assert_frame_equal(df, result) + + def test_pickle_path_localpath(self): + df = tm.makeDataFrame() + result = tm.round_trip_pathlib( + lambda p: df.to_hdf(p, 'df'), + lambda p: pd.read_hdf(p, 'df')) + tm.assert_frame_equal(df, result) + + @pytest.mark.xfail(reason='localpath currently doesnt work with HDFStore') + def test_path_localpath_hdfstore(self): + df = tm.makeDataFrame() + + def writer(path): + with pd.HDFStore(path) as store: + df.to_hdf(store, 'df') + + def reader(path): + with pd.HDFStore(path) as store: + pd.read_hdf(store, 'df') + result = tm.round_trip_localpath(writer, reader) + tm.assert_frame_equal(df, result) + def _check_roundtrip(self, obj, comparator, compression=False, **kwargs): options = {} diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 4c92c19c51e7a..4ec990116bb62 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -1283,3 +1283,15 @@ def 
test_invalid_encoding(self):
         with pytest.raises(ValueError):
             with tm.ensure_clean() as path:
                 original.to_stata(path, encoding='utf-8')
+
+    @pytest.mark.xfail(reason="stata currently doesn't work with pathlib")
+    def test_path_pathlib(self):
+        df = tm.makeDataFrame()
+        result = tm.round_trip_pathlib(df.to_stata, read_stata)
+        tm.assert_frame_equal(df, result)
+
+    @pytest.mark.xfail(reason="stata currently doesn't work with localpath")
+    def test_pickle_path_localpath(self):
+        df = tm.makeDataFrame()
+        result = tm.round_trip_localpath(df.to_stata, read_stata)
+        tm.assert_frame_equal(df, result)
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index f6b572cdf7179..04461f84683f8 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -117,6 +117,63 @@ def round_trip_pickle(obj, path=None):
     return pd.read_pickle(path)
 
 
+def round_trip_pathlib(writer, reader, path=None):
+    """
+    Write an object to file specified by a pathlib.Path and read it back
+
+    Parameters
+    ----------
+    writer : callable bound to pandas object
+        IO writing function (e.g. DataFrame.to_csv )
+    reader : callable
+        IO reading function (e.g. pd.read_csv )
+    path : str, default None
+        The path where the object is written and then read.
+
+    Returns
+    -------
+    round_trip_object : pandas object
+        The original object that was serialized and then re-read.
+    """
+
+    import pytest
+    Path = pytest.importorskip('pathlib').Path
+    if path is None:
+        path = '___pathlib___'
+    with ensure_clean(path) as path:
+        writer(Path(path))
+        obj = reader(Path(path))
+    return obj
+
+
+def round_trip_localpath(writer, reader, path=None):
+    """
+    Write an object to file specified by a py.path LocalPath and read it back
+
+    Parameters
+    ----------
+    writer : callable bound to pandas object
+        IO writing function (e.g. DataFrame.to_csv )
+    reader : callable
+        IO reading function (e.g. pd.read_csv )
+    path : str, default None
+        The path where the object is written and then read.
+
+    Returns
+    -------
+    round_trip_object : pandas object
+        The original object that was serialized and then re-read.
+ """ + import pytest + LocalPath = pytest.importorskip('py.path').local + if path is None: + path = '___localpath___' + with ensure_clean(path) as path: + writer(LocalPath(path)) + obj = reader(LocalPath(path)) + return obj + + def assert_almost_equal(left, right, check_exact=False, check_dtype='equiv', check_less_precise=False, **kwargs): From 0b77d30b7c6833c324181cdcd5b78fc86cb1e056 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 8 May 2017 21:10:41 -0400 Subject: [PATCH 11/44] BLD: depending on non-existant file in sparse (#16293) (cherry picked from commit 4bed864a24901d9c2baab5e17c57c956a188602f) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d101358fb63dd..9a04bb6994869 100755 --- a/setup.py +++ b/setup.py @@ -524,7 +524,7 @@ def pxd(name): 'sources': ['pandas/_libs/src/parser/tokenizer.c', 'pandas/_libs/src/parser/io.c']}, '_libs.sparse': {'pyxfile': '_libs/sparse', - 'depends': (['pandas/core/sparse/sparse.pyx'] + + 'depends': (['pandas/_libs/sparse.pyx'] + _pxi_dep['sparse'])}, '_libs.testing': {'pyxfile': '_libs/testing', 'depends': ['pandas/_libs/testing.pyx']}, From c7e3d6124e464b384649e9a41304383d9b42ca12 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 8 May 2017 22:09:31 -0400 Subject: [PATCH 12/44] COMPAT: don't force clipboard routines to be imported in main pandas started (#16294) closes #16288 (cherry picked from commit 1e59b4cca76e32b0bbe9cfdc4b574795467523ac) --- doc/source/whatsnew/v0.20.2.txt | 2 +- pandas/core/generic.py | 4 ++-- pandas/io/api.py | 2 +- pandas/io/{clipboard/clipboard.py => clipboards.py} | 0 4 files changed, 4 insertions(+), 4 deletions(-) rename pandas/io/{clipboard/clipboard.py => clipboards.py} (100%) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index ae5b10ba4030d..e6ebafd619f4b 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -50,7 +50,7 @@ Indexing I/O ^^^ - +- Bug that would force importing of the clipboard routines unecessarily, potentially causing an import error on startup (:issue:`16288`) Plotting diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f89e4f87ce9fd..777cfcae7a326 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1387,8 +1387,8 @@ def to_clipboard(self, excel=None, sep=None, **kwargs): - Windows: none - OS X: none """ - from pandas.io.clipboard import clipboard - clipboard.to_clipboard(self, excel=excel, sep=sep, **kwargs) + from pandas.io import clipboards + clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs) def to_xarray(self): """ diff --git a/pandas/io/api.py b/pandas/io/api.py index 7f0d3c3631f63..a4a25b78942db 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -5,7 +5,7 @@ # flake8: noqa from pandas.io.parsers import read_csv, read_table, read_fwf -from pandas.io.clipboard.clipboard import read_clipboard +from pandas.io.clipboards import read_clipboard from pandas.io.excel import ExcelFile, ExcelWriter, read_excel from pandas.io.pytables import HDFStore, get_store, read_hdf from pandas.io.json import read_json diff --git a/pandas/io/clipboard/clipboard.py b/pandas/io/clipboards.py similarity index 100% rename from pandas/io/clipboard/clipboard.py rename to pandas/io/clipboards.py From 1c26a7883ce7d08145fc56ea4198584623ad83f8 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 9 May 2017 06:10:24 -0400 Subject: [PATCH 13/44] PERF: fix clean_index_list perf (#16295) closes #16285 (cherry picked from commit 
ce4eef3750052cec62ca0fe6536521dec523cd64)
---
 asv_bench/benchmarks/indexing.py | 3 +++
 doc/source/whatsnew/v0.20.2.txt  | 1 +
 pandas/_libs/lib.pyx             | 22 +++++++++++++---------
 pandas/core/indexes/base.py      | 2 +-
 4 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 8947a0fdd796c..31af56b3715a5 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -19,6 +19,9 @@ def time_getitem_list_like(self):
     def time_getitem_array(self):
         self.s[np.arange(10000)]

+    def time_getitem_lists(self):
+        self.s[np.arange(10000).tolist()]
+
     def time_iloc_array(self):
         self.s.iloc[np.arange(10000)]

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index e6ebafd619f4b..d7f6b0612534f 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -26,6 +26,7 @@ Enhancements
 Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~

+- Performance regression fix when indexing with a list-like (:issue:`16285`)

 .. _whatsnew_0202.bug_fixes:

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 31402c38c770d..f6e574b66a828 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -950,7 +950,6 @@ def clean_index_list(list obj):
     Utility used in pandas.core.index._ensure_index
     """
     cdef:
-        ndarray[object] converted
         Py_ssize_t i, n = len(obj)
         object v
         bint all_arrays = 1
@@ -964,15 +963,20 @@ def clean_index_list(list obj):
     if all_arrays:
         return obj, all_arrays

-    converted = np.empty(n, dtype=object)
-    for i in range(n):
-        v = obj[i]
-        if PyList_Check(v) or np.PyArray_Check(v) or hasattr(v, '_data'):
-            converted[i] = tuple(v)
-        else:
-            converted[i] = v
+    # don't force numpy coerce with nan's
+    inferred = infer_dtype(obj)
+    if inferred in ['string', 'bytes', 'unicode',
+                    'mixed', 'mixed-integer']:
+        return np.asarray(obj, dtype=object), 0
+    elif inferred in ['integer']:
+
+        # TODO: we infer an integer but it *could* be a uint64
+        try:
+            return np.asarray(obj, dtype='int64'), 0
+        except OverflowError:
+            return np.asarray(obj, dtype='object'), 0

-    return maybe_convert_objects(converted), 0
+    return np.asarray(obj), 0


 ctypedef fused pandas_string:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 82f3bf3b15462..9b29f1b04ff73 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3960,7 +3960,7 @@ def _ensure_index(index_like, copy=False):
     if isinstance(index_like, list):
         if type(index_like) != list:
             index_like = list(index_like)
-        # 2200 ?
+ converted, all_arrays = lib.clean_index_list(index_like) if len(converted) > 0 and all_arrays: From a1ac8d1dc806f0941a8b3e110ac5680003fe37d9 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 9 May 2017 06:09:02 -0400 Subject: [PATCH 14/44] BLD: run only multi on 2.7-build_test build (#16296) (cherry picked from commit 0091810baf28c7872f8204755fb55363642cfcda) --- ci/script_multi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/script_multi.sh b/ci/script_multi.sh index daa929e177666..d79fc43fbe175 100755 --- a/ci/script_multi.sh +++ b/ci/script_multi.sh @@ -27,7 +27,7 @@ if [ "$BUILD_TEST" ]; then echo "[running]" cd /tmp unset PYTHONPATH - python -c "import pandas; pandas.test(['-n 2', '--skip-slow', '--skip-network', '-r xX'])" + python -c 'import pandas; pandas.test(["-n 2", "--skip-slow", "--skip-network", "-r xX", "-m not single"])' elif [ "$DOC" ]; then echo "We are not running pytest as this is a doc-build" From 9e776901578deb26fc314f4e901137caf599c74a Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 9 May 2017 15:55:19 -0400 Subject: [PATCH 15/44] BUG: Don't segfault to_numeric when input is empty (#16305) Closes gh-16302. (cherry picked from commit 81aa70c2a598f9c12d8cb56a4d758b7f213770fe) --- doc/source/whatsnew/v0.20.2.txt | 4 ++-- pandas/_libs/src/inference.pyx | 5 +++++ pandas/tests/tools/test_numeric.py | 15 +++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index d7f6b0612534f..6c49d06cb9c57 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -39,7 +39,7 @@ Bug Fixes Conversion ^^^^^^^^^^ - +- Bug in ``pd.to_numeric()`` in which empty data inputs were causing Python to crash (:issue:`16302`) Indexing @@ -51,7 +51,7 @@ Indexing I/O ^^^ -- Bug that would force importing of the clipboard routines unecessarily, potentially causing an import error on startup (:issue:`16288`) +- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`) Plotting diff --git a/pandas/_libs/src/inference.pyx b/pandas/_libs/src/inference.pyx index d87a0641291b1..ddd38979e326c 100644 --- a/pandas/_libs/src/inference.pyx +++ b/pandas/_libs/src/inference.pyx @@ -947,8 +947,13 @@ def maybe_convert_numeric(ndarray[object] values, set na_values, ------- numeric_array : array of converted object values to numerical ones """ + + if len(values) == 0: + return np.array([], dtype='i8') + # fastpath for ints - try to convert all based on first value cdef object val = values[0] + if util.is_integer_object(val): try: maybe_ints = values.astype('i8') diff --git a/pandas/tests/tools/test_numeric.py b/pandas/tests/tools/test_numeric.py index f82ad97d7b70f..664a97640387e 100644 --- a/pandas/tests/tools/test_numeric.py +++ b/pandas/tests/tools/test_numeric.py @@ -11,6 +11,21 @@ class TestToNumeric(object): + def test_empty(self): + # see gh-16302 + s = pd.Series([], dtype=object) + + res = to_numeric(s) + expected = pd.Series([], dtype=np.int64) + + tm.assert_series_equal(res, expected) + + # Original issue example + res = to_numeric(s, errors='coerce', downcast='integer') + expected = pd.Series([], dtype=np.int8) + + tm.assert_series_equal(res, expected) + def test_series(self): s = pd.Series(['1', '-3.14', '7']) res = to_numeric(s) From 6f3a0530dba252f1a7ad8b6ae5c155d8d4354f8e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 10 May 2017 08:36:21 -0400 Subject: [PATCH 16/44] TST: not 
printing skips (#16318)

(cherry picked from commit 0607e03f71b52e131947d73880380796f51d72fb)
---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index f0ece15de65db..b7c18d2850a15 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -123,7 +123,7 @@ after_success:

 after_script:
   - echo "after_script start"
-  - source activate pandas && cd /tmp && python -c "import pandas; pandas.show_versions();"
+  - source activate pandas && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd
   - if [ -e /tmp/single.xml ]; then
       ci/print_skipped.py /tmp/single.xml;
     fi

From 8d02272cb60cb79204804114332634fce47a27ee Mon Sep 17 00:00:00 2001
From: Jeff Reback
Date: Thu, 11 May 2017 19:14:20 -0400
Subject: [PATCH 17/44] PERF: improved performance of small multiindexes
 (#16324)

closes #16319

(cherry picked from commit 94ef7b6a2e1e9fd266bab6f22f8573d421d1745f)
---
 asv_bench/benchmarks/indexing.py | 20 +++++++++++++++----
 doc/source/whatsnew/v0.20.2.txt  | 2 +-
 pandas/_libs/index.pyx           | 33 +++++++++++++++++++++++++++++++-
 pandas/core/dtypes/dtypes.py     | 6 +++---
 pandas/core/indexes/multi.py     | 12 ++++++++++--
 pandas/core/util/hashing.py      | 18 +++++++----------
 6 files changed, 69 insertions(+), 22 deletions(-)

diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index 31af56b3715a5..e1676715853a4 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -193,9 +193,15 @@ def setup(self):
                                             np.arange(1000)], names=['one', 'two'])

         import string
-        self.mistring = MultiIndex.from_product(
-            [np.arange(1000),
-             np.arange(20), list(string.ascii_letters)],
+
+        self.mi_large = MultiIndex.from_product(
+            [np.arange(1000), np.arange(20), list(string.ascii_letters)],
+            names=['one', 'two', 'three'])
+        self.mi_med = MultiIndex.from_product(
+            [np.arange(1000), np.arange(10), list('A')],
+            names=['one', 'two', 'three'])
+        self.mi_small = MultiIndex.from_product(
+            [np.arange(100), list('A'), list('A')],
             names=['one', 'two', 'three'])

     def time_series_xs_mi_ix(self):
@@ -218,8 +224,14 @@ def time_multiindex_get_indexer(self):
                                         (0, 16), (0, 17), (0, 18),
                                         (0, 19)], dtype=object))

+    def time_multiindex_large_get_loc(self):
+        self.mi_large.get_loc((999, 19, 'Z'))
+
+    def time_multiindex_med_get_loc(self):
+        self.mi_med.get_loc((999, 9, 'A'))
+
     def time_multiindex_string_get_loc(self):
-        self.mistring.get_loc((999, 19, 'Z'))
+        self.mi_small.get_loc((99, 'A', 'A'))

     def time_is_monotonic(self):
         self.miint.is_monotonic
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 6c49d06cb9c57..ad6a11d2d750d 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -27,7 +27,7 @@ Performance Improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~

 - Performance regression fix when indexing with a list-like (:issue:`16285`)
-
+- Performance regression fix for small MultiIndexes (:issue:`16319`)

 .. 
_whatsnew_0202.bug_fixes:

diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index c7a537acf5d6f..21680fb0b3921 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -553,7 +553,34 @@ cdef inline bint _is_utc(object tz):
     return tz is UTC or isinstance(tz, _du_utc)


-cdef class MultiIndexEngine(IndexEngine):
+cdef class MultiIndexObjectEngine(ObjectEngine):
+    """
+    provide the same interface as the MultiIndexEngine
+    but use the IndexEngine for computation
+
+    This provides good performance with smaller MIs
+    """
+    def get_indexer(self, values):
+        # convert a MI to an ndarray
+        if hasattr(values, 'values'):
+            values = values.values
+        return super(MultiIndexObjectEngine, self).get_indexer(values)
+
+    cpdef get_loc(self, object val):
+
+        # convert a MI to an ndarray
+        if hasattr(val, 'values'):
+            val = val.values
+        return super(MultiIndexObjectEngine, self).get_loc(val)
+
+
+cdef class MultiIndexHashEngine(ObjectEngine):
+    """
+    Use a hashing based MultiIndex impl
+    but use the IndexEngine for computation
+
+    This provides good performance with larger MIs
+    """

     def _call_monotonic(self, object mi):
         # defer these back to the mi itself
@@ -584,6 +611,10 @@ cdef class MultiIndexEngine(IndexEngine):
         except TypeError:
             raise KeyError(val)

+    def get_indexer(self, values):
+        self._ensure_mapping_populated()
+        return self.mapping.lookup(values)
+
     cdef _make_hash_table(self, n):
         return _hash.MultiIndexHashTable(n)

diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
index 561f1951a4151..dc2c56ea476f9 100644
--- a/pandas/core/dtypes/dtypes.py
+++ b/pandas/core/dtypes/dtypes.py
@@ -88,12 +88,12 @@ def is_dtype(cls, dtype):
         """
         if hasattr(dtype, 'dtype'):
             dtype = dtype.dtype
-        if isinstance(dtype, cls):
-            return True
-        elif isinstance(dtype, np.dtype):
+        if isinstance(dtype, np.dtype):
             return False
         elif dtype is None:
             return False
+        elif isinstance(dtype, cls):
+            return True
         try:
             return cls.construct_from_string(dtype) is not None
         except:

diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 7ef037d8f3536..3db5633ec30bd 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -75,7 +75,6 @@ class MultiIndex(Index):
     _levels = FrozenList()
     _labels = FrozenList()
     _comparables = ['names']
-    _engine_type = libindex.MultiIndexEngine
     rename = Index.set_names

     def __new__(cls, levels=None, labels=None, sortorder=None, names=None,
@@ -629,7 +628,16 @@ def _get_level_number(self, level):

     @cache_readonly
     def _engine(self):
-        return self._engine_type(lambda: self, len(self))
+
+        # choose our engine based on our size
+        # the hashing based MultiIndex for larger
+        # sizes, and the MultiIndexObject for smaller
+        # xref: https://github.com/pandas-dev/pandas/pull/16324
+        l = len(self)
+        if l > 10000:
+            return libindex.MultiIndexHashEngine(lambda: self, l)
+
+        return libindex.MultiIndexObjectEngine(lambda: self.values, l)

     @property
     def values(self):

diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py
index 6a5343e8a8e25..f0829adc94500 100644
--- a/pandas/core/util/hashing.py
+++ b/pandas/core/util/hashing.py
@@ -5,16 +5,13 @@
 import numpy as np
 from pandas._libs import hashing
-from pandas._libs.lib import is_bool_array
 from pandas.core.dtypes.generic import (
     ABCMultiIndex,
     ABCIndexClass,
     ABCSeries,
     ABCDataFrame)
 from pandas.core.dtypes.common import (
-    is_categorical_dtype, is_numeric_dtype,
-    is_datetime64_dtype, is_timedelta64_dtype,
-    is_list_like)
+    is_categorical_dtype, is_list_like)

 # 16 byte long hashing
key _default_hash_key = '0123456789123456' @@ -136,7 +133,6 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): ------- ndarray of hashed values array """ - is_tuple = False if isinstance(vals, tuple): vals = [vals] @@ -231,6 +227,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): if not hasattr(vals, 'dtype'): raise TypeError("must pass a ndarray-like") + dtype = vals.dtype if hash_key is None: hash_key = _default_hash_key @@ -238,22 +235,21 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask # numpy if categorical is a subdtype of complex, as it will choke. - if is_categorical_dtype(vals.dtype): + if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) # we'll be working with everything as 64-bit values, so handle this # 128-bit value early - if np.issubdtype(vals.dtype, np.complex128): + elif np.issubdtype(dtype, np.complex128): return hash_array(vals.real) + 23 * hash_array(vals.imag) # First, turn whatever array this is into unsigned 64-bit ints, if we can # manage it. - if is_bool_array(vals): + elif isinstance(dtype, np.bool): vals = vals.astype('u8') - elif (is_datetime64_dtype(vals) or - is_timedelta64_dtype(vals)): + elif issubclass(dtype.type, (np.datetime64, np.timedelta64)): vals = vals.view('i8').astype('u8', copy=False) - elif (is_numeric_dtype(vals) and vals.dtype.itemsize <= 8): + elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8: vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8') else: # With repeated values, its MUCH faster to categorize object dtypes, From 3a25bb90dd282895344b12b20111ce1976903c61 Mon Sep 17 00:00:00 2001 From: DSM Date: Thu, 11 May 2017 07:25:14 -0400 Subject: [PATCH 18/44] BUG: Preserve data order when stacking unsorted levels (#16323) (#16325) (cherry picked from commit b1ff2914120867df9f459756f1209603c6bedf4f) --- doc/source/whatsnew/v0.20.2.txt | 2 +- pandas/core/reshape/reshape.py | 2 +- pandas/tests/test_multilevel.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index ad6a11d2d750d..03579dab0d6a3 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -76,7 +76,7 @@ Sparse Reshaping ^^^^^^^^^ - +- Bug in ``DataFrame.stack`` with unsorted levels in MultiIndex columns (:issue:`16323`) Numeric diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 779002b300cc7..b0ed6d4c4b84d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -689,7 +689,7 @@ def _convert_level_number(level_num, columns): new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? 
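# [Editor's sketch -- illustration only, not part of the patch. The fix just
# below replaces ``frame.columns.levels[level_num]`` with ``level_vals`` so
# that the level attached to the stacked index matches the possibly unsorted
# order the values were laid out in; see GH 16323 and the new test further
# down. A minimal reproduction, assuming the 0.20-era API in which
# ``MultiIndex`` takes ``labels=`` (later renamed ``codes=``):]
import pandas as pd

columns = pd.MultiIndex(levels=[['B', 'A'], ['c', 'd']],
                        labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
df = pd.DataFrame([[0, 1, 2, 3]], columns=columns)
stacked = df.stack(0)
# With the fix, values stay attached to their original labels:
assert stacked.loc[(0, 'B'), 'c'] == 0
assert stacked.loc[(0, 'A'), 'c'] == 2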
- new_levels.append(frame.columns.levels[level_num]) + new_levels.append(level_vals) new_labels.append(np.tile(level_labels, N)) new_names.append(frame.columns.names[level_num]) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index ab28b8b43f359..5a0132453cec5 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1195,6 +1195,37 @@ def test_unstack_unobserved_keys(self): recons = result.stack() tm.assert_frame_equal(recons, df) + def test_stack_order_with_unsorted_levels(self): + # GH 16323 + + def manual_compare_stacked(df, df_stacked, lev0, lev1): + assert all(df.loc[row, col] == + df_stacked.loc[(row, col[lev0]), col[lev1]] + for row in df.index for col in df.columns) + + # deep check for 1-row case + for width in [2, 3]: + levels_poss = itertools.product( + itertools.permutations([0, 1, 2], width), + repeat=2) + + for levels in levels_poss: + columns = MultiIndex(levels=levels, + labels=[[0, 0, 1, 1], + [0, 1, 0, 1]]) + df = DataFrame(columns=columns, data=[range(4)]) + for stack_lev in range(2): + df_stacked = df.stack(stack_lev) + manual_compare_stacked(df, df_stacked, + stack_lev, 1 - stack_lev) + + # check multi-row case + mi = MultiIndex(levels=[["A", "C", "B"], ["B", "A", "C"]], + labels=[np.repeat(range(3), 3), np.tile(range(3), 3)]) + df = DataFrame(columns=mi, index=range(5), + data=np.arange(5 * len(mi)).reshape(5, -1)) + manual_compare_stacked(df, df.stack(0), 0, 1) + def test_groupby_corner(self): midx = MultiIndex(levels=[['foo'], ['bar'], ['baz']], labels=[[0], [0], [0]], From 2eea690d277f6c7b0c0c83cccf6a6b7b8b7985f0 Mon Sep 17 00:00:00 2001 From: Keith Webber Date: Thu, 11 May 2017 15:24:46 -0400 Subject: [PATCH 19/44] DOC: Correctly redirect to SetupTools documentations (#16333) (cherry picked from commit 379fa8743f43470df5ae3d6246eb916b863e3487) --- doc/source/install.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/install.rst b/doc/source/install.rst index 578caae605471..48d51e1200447 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -202,7 +202,7 @@ installed), make sure you have `pytest Dependencies ------------ -* `setuptools `__ +* `setuptools `__ * `NumPy `__: 1.7.1 or higher * `python-dateutil `__: 1.5 or higher * `pytz `__: Needed for time zone support From e1e68b71469ecb00e645a652d6412503c1e72087 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 18 May 2017 09:05:54 -0500 Subject: [PATCH 20/44] BUG: Categorical comparison with unordered (#16339) Fixes categorical comparison operations improperly considering ordering when two unordered categoricals are compared. Closes #16014 (cherry picked from commit 91e9e52e625512ac6f84e51bad9f928b72a0b6ba) --- doc/source/categorical.rst | 8 +++++++ doc/source/whatsnew/v0.20.2.txt | 3 +++ pandas/core/categorical.py | 28 ++++++++++++++++++------ pandas/tests/test_categorical.py | 37 ++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 7 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index a508e84465107..ef558381c5e6f 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -453,6 +453,14 @@ the original values: np.asarray(cat) > base +When you compare two unordered categoricals with the same categories, the order is not considered: + +.. 
ipython:: python + + c1 = pd.Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = pd.Categorical(['a', 'b'], categories=['b', 'a'], ordered=False) + c1 == c2 + Operations ---------- diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 03579dab0d6a3..a7d272db94ca4 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -83,7 +83,10 @@ Numeric ^^^^^^^ +Categorical +^^^^^^^^^^^ +- Fixed comparison operations considering the order of the categories when both categoricals are unordered (:issue:`16014`) Other ^^^^^ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 7eb86232cbb07..2cd3b6ae0fab1 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -55,17 +55,31 @@ def f(self, other): "equality or not") if isinstance(other, Categorical): # Two Categoricals can only be be compared if the categories are - # the same - if ((len(self.categories) != len(other.categories)) or - not ((self.categories == other.categories).all())): - raise TypeError("Categoricals can only be compared if " - "'categories' are the same") + # the same (maybe up to ordering, depending on ordered) + + msg = ("Categoricals can only be compared if " + "'categories' are the same.") + if len(self.categories) != len(other.categories): + raise TypeError(msg + " Categories are different lengths") + elif (self.ordered and not (self.categories == + other.categories).all()): + raise TypeError(msg) + elif not set(self.categories) == set(other.categories): + raise TypeError(msg) + if not (self.ordered == other.ordered): raise TypeError("Categoricals can only be compared if " "'ordered' is the same") - na_mask = (self._codes == -1) | (other._codes == -1) + if not self.ordered and not self.categories.equals( + other.categories): + # both unordered and different order + other_codes = _get_codes_for_values(other, self.categories) + else: + other_codes = other._codes + + na_mask = (self._codes == -1) | (other_codes == -1) f = getattr(self._codes, op) - ret = f(other._codes) + ret = f(other_codes) if na_mask.any(): # In other series, the leads to False, so do that here too ret[na_mask] = False diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 03adf17f50300..6c611f827698c 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -3822,6 +3822,43 @@ def test_cat_equality(self): pytest.raises(TypeError, lambda: a > b) pytest.raises(TypeError, lambda: b > a) + @pytest.mark.parametrize('ctor', [ + lambda *args, **kwargs: Categorical(*args, **kwargs), + lambda *args, **kwargs: Series(Categorical(*args, **kwargs)), + ]) + def test_unordered_different_order_equal(self, ctor): + # https://github.com/pandas-dev/pandas/issues/16014 + c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + assert (c1 == c2).all() + + c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) + c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False) + assert (c1 != c2).all() + + c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False) + c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False) + result = c1 == c2 + tm.assert_numpy_array_equal(np.array(result), np.array([True, False])) + + def test_unordered_different_categories_raises(self): + c1 = 
Categorical(['a', 'b'], categories=['a', 'b'], ordered=False) + c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False) + with tm.assert_raises_regex(TypeError, + "Categoricals can only be compared"): + c1 == c2 + + def test_compare_different_lengths(self): + c1 = Categorical([], categories=['a', 'b']) + c2 = Categorical([], categories=['a']) + msg = "Categories are different lengths" + with tm.assert_raises_regex(TypeError, msg): + c1 == c2 + def test_concat_append(self): cat = pd.Categorical(["a", "b"], categories=["a", "b"]) vals = [1, 2] From c6ce9eac9aae4f95701b2ee93bfe1e099f92a255 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 17 May 2017 09:36:51 +0200 Subject: [PATCH 21/44] PERF: improve MultiIndex get_loc performance (#16346) * PERF: improve hash collision check for single MI labels * PERF: specialized hash function for single tuples (cherry picked from commit 34ebad832d6709ecd479c4db4705a9f81da015b3) --- asv_bench/benchmarks/indexing.py | 12 +++++ doc/source/whatsnew/v0.20.2.txt | 4 +- pandas/_libs/hashtable.pxd | 2 + pandas/_libs/hashtable_class_helper.pxi.in | 19 +++++++- pandas/core/indexes/multi.py | 4 +- pandas/core/util/hashing.py | 56 +++++++++++++++++++++- pandas/tests/util/test_hashing.py | 24 +++++++++- 7 files changed, 114 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index e1676715853a4..6a2c9d48c4a28 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -227,12 +227,24 @@ def time_multiindex_get_indexer(self): def time_multiindex_large_get_loc(self): self.mi_large.get_loc((999, 19, 'Z')) + def time_multiindex_large_get_loc_warm(self): + for _ in range(1000): + self.mi_large.get_loc((999, 19, 'Z')) + def time_multiindex_med_get_loc(self): self.mi_med.get_loc((999, 9, 'A')) + def time_multiindex_med_get_loc_warm(self): + for _ in range(1000): + self.mi_med.get_loc((999, 9, 'A')) + def time_multiindex_string_get_loc(self): self.mi_small.get_loc((99, 'A', 'A')) + def time_multiindex_small_get_loc_warm(self): + for _ in range(1000): + self.mi_small.get_loc((99, 'A', 'A')) + def time_is_monotonic(self): self.miint.is_monotonic diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index a7d272db94ca4..6aae772026903 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -27,7 +27,9 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance regression fix when indexing with a list-like (:issue:`16285`) -- Performance regression fix for small MultiIndexes (:issuse:`16319`) +- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`) +- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`) + .. 
_whatsnew_0202.bug_fixes: diff --git a/pandas/_libs/hashtable.pxd b/pandas/_libs/hashtable.pxd index 9b352ae1c003b..a0b9e9e47463c 100644 --- a/pandas/_libs/hashtable.pxd +++ b/pandas/_libs/hashtable.pxd @@ -38,6 +38,8 @@ cdef class MultiIndexHashTable(HashTable): cpdef get_item(self, object val) cpdef set_item(self, object key, Py_ssize_t val) + cdef inline void _check_for_collision(self, Py_ssize_t loc, object label) + cdef class StringHashTable(HashTable): cdef kh_str_t *table diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 3ce82dace40a9..003fd12fab6cd 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -4,6 +4,9 @@ Template for each `dtype` helper function for hashtable WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in """ +from lib cimport is_null_datetimelike + + #---------------------------------------------------------------------- # VectorData #---------------------------------------------------------------------- @@ -889,6 +892,19 @@ cdef class MultiIndexHashTable(HashTable): "hash collision\nlocs:\n{}\n" "result:\n{}\nmi:\n{}".format(alocs, result, mi)) + cdef inline void _check_for_collision(self, Py_ssize_t loc, object label): + # validate that the loc maps to the actual value + # version of _check_for_collisions above for single label (tuple) + + result = self.mi[loc] + + if not all(l == r or (is_null_datetimelike(l) + and is_null_datetimelike(r)) + for l, r in zip(result, label)): + raise AssertionError( + "hash collision\nloc:\n{}\n" + "result:\n{}\nmi:\n{}".format(loc, result, label)) + def __contains__(self, object key): try: self.get_item(key) @@ -907,8 +923,7 @@ cdef class MultiIndexHashTable(HashTable): k = kh_get_uint64(self.table, value) if k != self.table.n_buckets: loc = self.table.vals[k] - locs = np.array([loc], dtype=np.int64) - self._check_for_collisions(locs, key) + self._check_for_collision(loc, key) return loc else: raise KeyError(key) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 3db5633ec30bd..569e16f2141ae 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -748,7 +748,7 @@ def _hashed_indexing_key(self, key): we need to stringify if we have mixed levels """ - from pandas.core.util.hashing import hash_tuples + from pandas.core.util.hashing import hash_tuples, hash_tuple if not isinstance(key, tuple): return hash_tuples(key) @@ -762,7 +762,7 @@ def f(k, stringify): return k key = tuple([f(k, stringify) for k, stringify in zip(key, self._have_mixed_levels)]) - return hash_tuples(key) + return hash_tuple(key) @Appender(base._shared_docs['duplicated'] % _index_doc_kwargs) def duplicated(self, keep='first'): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index f0829adc94500..e41ffae9d03c2 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -4,7 +4,7 @@ import itertools import numpy as np -from pandas._libs import hashing +from pandas._libs import hashing, tslib from pandas.core.dtypes.generic import ( ABCMultiIndex, ABCIndexClass, @@ -12,6 +12,9 @@ ABCDataFrame) from pandas.core.dtypes.common import ( is_categorical_dtype, is_list_like) +from pandas.core.dtypes.missing import isnull +from pandas.core.dtypes.cast import infer_dtype_from_scalar + # 16 byte long hashing key _default_hash_key = '0123456789123456' @@ -164,6 +167,29 @@ def hash_tuples(vals, encoding='utf8', hash_key=None): return h +def hash_tuple(val, 
encoding='utf8', hash_key=None): + """ + Hash a single tuple efficiently + + Parameters + ---------- + val : single tuple + encoding : string, default 'utf8' + hash_key : string key to encode, default to _default_hash_key + + Returns + ------- + hash + + """ + hashes = (_hash_scalar(v, encoding=encoding, hash_key=hash_key) + for v in val) + + h = _combine_hash_arrays(hashes, len(val))[0] + + return h + + def _hash_categorical(c, encoding, hash_key): """ Hash a Categorical by hashing its categories, and then mapping the codes @@ -276,3 +302,31 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): vals *= np.uint64(0x94d049bb133111eb) vals ^= vals >> 31 return vals + + +def _hash_scalar(val, encoding='utf8', hash_key=None): + """ + Hash scalar value + + Returns + ------- + 1d uint64 numpy array of hash value, of length 1 + """ + + if isnull(val): + # this is to be consistent with the _hash_categorical implementation + return np.array([np.iinfo(np.uint64).max], dtype='u8') + + if getattr(val, 'tzinfo', None) is not None: + # for tz-aware datetimes, we need the underlying naive UTC value and + # not the tz aware object or pd extension type (as + # infer_dtype_from_scalar would do) + if not isinstance(val, tslib.Timestamp): + val = tslib.Timestamp(val) + val = val.tz_convert(None) + + dtype, val = infer_dtype_from_scalar(val) + vals = np.array([val], dtype=dtype) + + return hash_array(vals, hash_key=hash_key, encoding=encoding, + categorize=False) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index e1e6e43529a7d..289592939e3da 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -1,4 +1,5 @@ import pytest +import datetime from warnings import catch_warnings import numpy as np @@ -6,7 +7,7 @@ from pandas import DataFrame, Series, Index, MultiIndex from pandas.util import hash_array, hash_pandas_object -from pandas.core.util.hashing import hash_tuples +from pandas.core.util.hashing import hash_tuples, hash_tuple, _hash_scalar import pandas.util.testing as tm @@ -79,6 +80,27 @@ def test_hash_tuples(self): result = hash_tuples(tups[0]) assert result == expected[0] + def test_hash_tuple(self): + # test equivalence between hash_tuples and hash_tuple + for tup in [(1, 'one'), (1, np.nan), (1.0, pd.NaT, 'A'), + ('A', pd.Timestamp("2012-01-01"))]: + result = hash_tuple(tup) + expected = hash_tuples([tup])[0] + assert result == expected + + def test_hash_scalar(self): + for val in [1, 1.4, 'A', b'A', u'A', pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01", tz='Europe/Brussels'), + datetime.datetime(2012, 1, 1), + pd.Timestamp("2012-01-01", tz='EST').to_pydatetime(), + pd.Timedelta('1 days'), datetime.timedelta(1), + pd.Period('2012-01-01', freq='D'), pd.Interval(0, 1), + np.nan, pd.NaT, None]: + result = _hash_scalar(val) + expected = hash_array(np.array([val], dtype=object), + categorize=True) + assert result[0] == expected[0] + def test_hash_tuples_err(self): for val in [5, 'foo', pd.Timestamp('20130101')]: From 41d90dc95df5e47ed44f75c5a5e82604ac28348e Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 16 May 2017 07:02:40 -0400 Subject: [PATCH 22/44] PERF: improved clip performance (#16364) closes #15400 (cherry picked from commit 42e2a87f2a8848795238de1259a3daa5612e393d) --- asv_bench/benchmarks/series_methods.py | 11 +++++++++ doc/source/whatsnew/v0.20.2.txt | 3 +-- pandas/core/generic.py | 33 +++++++++++++++++++++++++- pandas/tests/series/test_analytics.py | 1 + 4 files changed, 45 insertions(+), 
3 deletions(-)

diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py
index c66654ee1e006..3c0e2869357ae 100644
--- a/asv_bench/benchmarks/series_methods.py
+++ b/asv_bench/benchmarks/series_methods.py
@@ -111,6 +111,7 @@ def setup(self):
     def time_series_dropna_int64(self):
         self.s.dropna()

+
 class series_dropna_datetime(object):
     goal_time = 0.2

@@ -120,3 +121,13 @@ def setup(self):

     def time_series_dropna_datetime(self):
         self.s.dropna()
+
+
+class series_clip(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.s = pd.Series(np.random.randn(50))
+
+    def time_series_clip(self):
+        self.s.clip(0, 1)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 6aae772026903..ea5f0913cb316 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -19,7 +19,7 @@ Highlights include:

 Enhancements
 ~~~~~~~~~~~~

-- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)
+- Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`)

 .. _whatsnew_0202.performance:

@@ -30,7 +30,6 @@ Performance Improvements
 - Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
 - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)
-
 .. _whatsnew_0202.bug_fixes:

 Bug Fixes

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 777cfcae7a326..3e1c5c3f354fd 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -14,6 +14,7 @@
     _ensure_int64,
     needs_i8_conversion,
     is_scalar,
+    is_number,
     is_integer,
     is_bool,
     is_bool_dtype,
     is_numeric_dtype,
@@ -4104,6 +4105,22 @@ def isnull(self):
     def notnull(self):
         return notnull(self).__finalize__(self)

+    def _clip_with_scalar(self, lower, upper):
+
+        if ((lower is not None and np.any(isnull(lower))) or
+                (upper is not None and np.any(isnull(upper)))):
+            raise ValueError("Cannot use an NA value as a clip threshold")
+
+        result = self.values
+        mask = isnull(result)
+        if upper is not None:
+            result = np.where(result >= upper, upper, result)
+        if lower is not None:
+            result = np.where(result <= lower, lower, result)
+        result[mask] = np.nan
+        return self._constructor(
+            result, **self._construct_axes_dict()).__finalize__(self)
+
     def clip(self, lower=None, upper=None, axis=None, *args, **kwargs):
         """
         Trim values at input threshold(s).
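# [Editor's sketch -- illustration only, not part of the patch. The scalar
# fast path added above is essentially two masked np.where passes, which is
# what makes it cheaper than the general ``le``/``ge`` + ``where`` route. A
# standalone NumPy-only equivalent (the errstate guard anticipates the
# follow-up in #16373 below):]
import numpy as np

def clip_with_scalar_sketch(values, lower=None, upper=None):
    values = np.asarray(values, dtype='float64')
    mask = np.isnan(values)               # remember missing entries
    result = values.copy()
    with np.errstate(invalid='ignore'):   # NaN comparisons may warn
        if upper is not None:
            result = np.where(result >= upper, upper, result)
        if lower is not None:
            result = np.where(result <= lower, lower, result)
    result[mask] = np.nan                 # clipping must not fill NaNs
    return result

# clip_with_scalar_sketch([np.nan, -2.0, 0.5, 3.0], lower=0, upper=1)
# -> array([nan, 0. , 0.5, 1. ])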
@@ -4122,12 +4139,13 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): Examples -------- >>> df - 0 1 + 0 1 0 0.335232 -1.256177 1 -1.367855 0.746646 2 0.027753 -1.176076 3 0.230930 -0.679613 4 1.261967 0.570967 + >>> df.clip(-1.0, 0.5) 0 1 0 0.335232 -1.000000 @@ -4135,6 +4153,7 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): 2 0.027753 -1.000000 3 0.230930 -0.679613 4 0.500000 0.500000 + >>> t 0 -0.3 1 -0.2 @@ -4142,6 +4161,7 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): 3 0.0 4 0.1 dtype: float64 + >>> df.clip(t, t + 1, axis=0) 0 1 0 0.335232 -0.300000 @@ -4160,6 +4180,11 @@ def clip(self, lower=None, upper=None, axis=None, *args, **kwargs): if is_scalar(lower) and is_scalar(upper): lower, upper = min(lower, upper), max(lower, upper) + # fast-path for scalars + if ((lower is None or (is_scalar(lower) and is_number(lower))) and + (upper is None or (is_scalar(upper) and is_number(upper)))): + return self._clip_with_scalar(lower, upper) + result = self if lower is not None: result = result.clip_lower(lower, axis) @@ -4189,6 +4214,9 @@ def clip_upper(self, threshold, axis=None): if np.any(isnull(threshold)): raise ValueError("Cannot use an NA value as a clip threshold") + if is_scalar(threshold) and is_number(threshold): + return self._clip_with_scalar(None, threshold) + subset = self.le(threshold, axis=axis) | isnull(self) return self.where(subset, threshold, axis=axis) @@ -4213,6 +4241,9 @@ def clip_lower(self, threshold, axis=None): if np.any(isnull(threshold)): raise ValueError("Cannot use an NA value as a clip threshold") + if is_scalar(threshold) and is_number(threshold): + return self._clip_with_scalar(threshold, None) + subset = self.ge(threshold, axis=axis) | isnull(self) return self.where(subset, threshold, axis=axis) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index ec6a118ec3639..18c6c9a6dd021 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1011,6 +1011,7 @@ def test_clip_against_series(self): lower = Series([1.0, 2.0, 3.0]) upper = Series([1.5, 2.5, 3.5]) + assert_series_equal(s.clip(lower, upper), Series([1.0, 2.0, 3.5])) assert_series_equal(s.clip(1.5, upper), Series([1.5, 1.5, 3.5])) From a4956693ee576e6677c5aa301ed8379d4d0c956b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 16 May 2017 20:53:28 -0400 Subject: [PATCH 23/44] TST: followup to #16364, catch errstate warnings (#16373) (cherry picked from commit e97865e5a60099b785daf58f6be085ef6d906427) --- pandas/core/generic.py | 14 +++++++++----- pandas/tests/frame/test_analytics.py | 11 +++++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3e1c5c3f354fd..c33b30c78d812 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4113,11 +4113,15 @@ def _clip_with_scalar(self, lower, upper): result = self.values mask = isnull(result) - if upper is not None: - result = np.where(result >= upper, upper, result) - if lower is not None: - result = np.where(result <= lower, lower, result) - result[mask] = np.nan + + with np.errstate(all='ignore'): + if upper is not None: + result = np.where(result >= upper, upper, result) + if lower is not None: + result = np.where(result <= lower, lower, result) + if np.any(mask): + result[mask] = np.nan + return self._constructor( result, **self._construct_axes_dict()).__finalize__(self) diff --git a/pandas/tests/frame/test_analytics.py 
b/pandas/tests/frame/test_analytics.py index be89b27912d1c..fa5769d9c0e65 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1824,6 +1824,17 @@ def test_dataframe_clip(self): assert (clipped_df.values[ub_mask] == ub).all() assert (clipped_df.values[mask] == df.values[mask]).all() + @pytest.mark.xfail(reason=("clip on mixed integer or floats " + "with integer clippers coerces to float")) + def test_clip_mixed_numeric(self): + + df = DataFrame({'A': [1, 2, 3], + 'B': [1., np.nan, 3.]}) + result = df.clip(1, 2) + expected = DataFrame({'A': [1, 2, 2], + 'B': [1., np.nan, 2.]}) + tm.assert_frame_equal(result, expected, check_like=True) + def test_clip_against_series(self): # GH #6966 From 5a686023863ca6dc9a13b0a46766ffbc43d69008 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 16 May 2017 22:14:45 -0400 Subject: [PATCH 24/44] TST: remove pandas-datareader xfail as 0.4.0 works (#16374) (cherry picked from commit a3021eaed89e0198547c0a1583f03d0963267536) --- pandas/tests/test_downstream.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index 12976272cb8b1..27e3c29a70a9f 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -84,12 +84,10 @@ def test_pandas_gbq(df): pandas_gbq = import_module('pandas_gbq') # noqa -@pytest.mark.xfail(reason=("pandas_datareader<=0.3.0 " - "broken w.r.t. pandas >= 0.20.0")) def test_pandas_datareader(): pandas_datareader = import_module('pandas_datareader') # noqa - pandas_datareader.get_data_yahoo('AAPL') + pandas_datareader.get_data_google('AAPL') def test_geopandas(): From ace96a36893ffc011ceeccdab7f10ef43f98719a Mon Sep 17 00:00:00 2001 From: RobinFiveWords Date: Wed, 24 May 2017 19:19:06 -0400 Subject: [PATCH 25/44] BUG: reshape fix for maybe_infer_to_datetimelike() closes #16362 Author: RobinFiveWords Closes #16395 from RobinFiveWords/cast-infer-datetime-reshape-fix and squashes the following commits: 7ad1e7d [RobinFiveWords] redid lost changes to cast.py and test_cast.py afa2eeb [RobinFiveWords] added whatsnew0.20.2 entry 7a35624 [RobinFiveWords] removed whatsnew entry again 2ec60a6 [RobinFiveWords] added back whatsnew change (cherry picked from commit 05d0667169e4b770cfad94f4a19c1d6ae9a98536) --- doc/source/whatsnew/v0.20.2.txt | 3 +++ pandas/core/dtypes/cast.py | 2 +- pandas/tests/dtypes/test_cast.py | 13 ++++++++++++- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index ea5f0913cb316..7183fedd88f0a 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -78,6 +78,9 @@ Reshaping ^^^^^^^^^ - Bug in ``DataFrame.stack`` with unsorted levels in MultiIndex columns (:issue:`16323`) +- Bug in ``pd.wide_to_long()`` where no error was raised when ``i`` was not a unique identifier (:issue:`16382`) +- Bug in ``Series.isin(..)`` with a list of tuples (:issue:`16394`) +- Bug in construction of a ``DataFrame`` with mixed dtypes including an all-NaT column. 
(:issue:`16395`) Numeric diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 19d3792f73de7..fd61813a57c98 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -837,7 +837,7 @@ def try_timedelta(v): try: return to_timedelta(v)._values.reshape(shape) except: - return v + return v.reshape(shape) inferred_type = lib.infer_datetimelike_array(_ensure_object(v)) diff --git a/pandas/tests/dtypes/test_cast.py b/pandas/tests/dtypes/test_cast.py index e92724a5d9cd4..767e99d98cf29 100644 --- a/pandas/tests/dtypes/test_cast.py +++ b/pandas/tests/dtypes/test_cast.py @@ -9,7 +9,7 @@ from datetime import datetime, timedelta, date import numpy as np -from pandas import Timedelta, Timestamp, DatetimeIndex +from pandas import Timedelta, Timestamp, DatetimeIndex, DataFrame, NaT from pandas.core.dtypes.cast import ( maybe_downcast_to_dtype, @@ -213,6 +213,17 @@ def test_maybe_convert_scalar(self): result = maybe_convert_scalar(Timedelta('1 day 1 min')) assert result == Timedelta('1 day 1 min').value + def test_maybe_infer_to_datetimelike(self): + # GH16362 + # pandas=0.20.1 raises IndexError: tuple index out of range + result = DataFrame(np.array([[NaT, 'a', 'b', 0], + [NaT, 'b', 'c', 1]])) + assert result.size == 8 + # this construction was fine + result = DataFrame(np.array([[NaT, 'a', 0], + [NaT, 'b', 1]])) + assert result.size == 6 + class TestConvert(object): From 772e63f1c5303369234950dc05e7f37f818c5b04 Mon Sep 17 00:00:00 2001 From: Erik Fredriksen Date: Tue, 23 May 2017 21:52:22 +0200 Subject: [PATCH 26/44] BUG: wide_to_long should check for unique id vars (#16382) (#16403) * BUG: wide_to_long should check for unique id vars (#16382) * Fix uncaught lint error * Add whatsnew note (bug fix) (cherry picked from commit 04356a83c0dc8a749c84e168535e6673f2548ec6) --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/core/reshape/reshape.py | 3 +++ pandas/tests/reshape/test_reshape.py | 11 +++++++++++ 3 files changed, 15 insertions(+) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 7183fedd88f0a..465ca3714ea89 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -83,6 +83,7 @@ Reshaping - Bug in construction of a ``DataFrame`` with mixed dtypes including an all-NaT column. 
(:issue:`16395`) + Numeric ^^^^^^^ diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index b0ed6d4c4b84d..f944dfe22361a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1046,6 +1046,9 @@ def melt_stub(df, stub, i, j, value_vars, sep): else: i = list(i) + if df[i].duplicated().any(): + raise ValueError("the id variables need to uniquely identify each row") + value_vars = list(map(lambda stub: get_var_names(df, stub, sep, suffix), stubnames)) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 79626d89026a7..d47a95924bd10 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -976,3 +976,14 @@ def test_multiple_id_columns(self): exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']] long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age') tm.assert_frame_equal(long_frame, exp_frame) + + def test_non_unique_idvars(self): + # GH16382 + # Raise an error message if non unique id vars (i) are passed + df = pd.DataFrame({ + 'A_A1': [1, 2, 3, 4, 5], + 'B_B1': [1, 2, 3, 4, 5], + 'x': [1, 1, 1, 1, 1] + }) + with pytest.raises(ValueError): + wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname') From 1e3141e2adde1b5bbd967f9f9a0b4d0c1066ffcd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 May 2017 15:02:08 +0200 Subject: [PATCH 27/44] DOC: add google analytics to the documentation (#16412) (cherry picked from commit 5fe042f507c2c5a9bd4f104111e37a21cfa6365b) --- doc/source/themes/nature_with_gtoc/layout.html | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/doc/source/themes/nature_with_gtoc/layout.html b/doc/source/themes/nature_with_gtoc/layout.html index ddf1e861f5f81..a2106605c5562 100644 --- a/doc/source/themes/nature_with_gtoc/layout.html +++ b/doc/source/themes/nature_with_gtoc/layout.html @@ -94,4 +94,15 @@

{{ _('Search') }}

}); }); + {% endblock %} \ No newline at end of file From 61503ead6e1c3678ee04777979b45754643e2c1b Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 22 May 2017 09:10:27 -0400 Subject: [PATCH 28/44] PERF: don't materialize arrays on checking in groupby (#16413) (cherry picked from commit d5a681bfa2de24b4a1449956c84393a413909738) --- asv_bench/benchmarks/groupby.py | 9 +++++++++ doc/source/whatsnew/v0.20.2.txt | 1 + pandas/core/indexes/base.py | 1 - 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index c0c3a42cc4464..13b5cd2b06032 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -368,6 +368,11 @@ def setup(self): self.dates = (np.datetime64('now') + self.offsets) self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, }) + N = 1000000 + self.draws = pd.Series(np.random.randn(N)) + labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4)) + self.cats = labels.astype('category') + def time_groupby_multi_size(self): self.df.groupby(['key1', 'key2']).size() @@ -377,6 +382,10 @@ def time_groupby_dt_size(self): def time_groupby_dt_timegrouper_size(self): self.df.groupby(TimeGrouper(key='dates', freq='M')).size() + def time_groupby_size(self): + self.draws.groupby(self.cats).size() + + #---------------------------------------------------------------------- # groupby with a variable value for ngroups diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 465ca3714ea89..bd17b6499a24a 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -29,6 +29,7 @@ Performance Improvements - Performance regression fix when indexing with a list-like (:issue:`16285`) - Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`) - Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`) +- Improved performance of groupby with categorical groupers (:issue:`16413`) .. 
_whatsnew_0202.bug_fixes: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 9b29f1b04ff73..2af4f112ca941 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2388,7 +2388,6 @@ def get_loc(self, key, method=None, tolerance=None): if tolerance is not None: raise ValueError('tolerance argument only valid if using pad, ' 'backfill or nearest lookups') - key = _values_from_object(key) try: return self._engine.get_loc(key) except KeyError: From 75efd8a9d24ee4aa6f2c684a17fe321854c43dae Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 26 May 2017 07:32:52 -0500 Subject: [PATCH 29/44] COMPAT: Catch warnings on tab-complete in IPy 6 (#16414) Properties may run code with Jedi completion in IPython 6 Closes https://github.com/pandas-dev/pandas/issues/16409 (cherry picked from commit 3c9a74bc869decd7be5e180ed65bef694553e2fb) --- doc/source/whatsnew/v0.20.2.txt | 4 ++++ pandas/conftest.py | 10 ++++++++++ pandas/core/categorical.py | 7 +++++++ pandas/core/resample.py | 6 ++++++ pandas/tests/test_categorical.py | 11 +++++++++++ pandas/tests/test_resample.py | 17 +++++++++++++++-- 6 files changed, 53 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index bd17b6499a24a..3f0c578995097 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -38,6 +38,10 @@ Bug Fixes - Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`) + +- Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on Categoricals (:issue:`16409`) + + Conversion ^^^^^^^^^^ diff --git a/pandas/conftest.py b/pandas/conftest.py index 1149fae3fc0b0..8a3ffe22242ac 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -45,3 +45,13 @@ def spmatrix(request): tm._skip_if_no_scipy() from scipy import sparse return getattr(sparse, request.param + '_matrix') + + +@pytest.fixture +def ip(): + """An instance of IPython.InteractiveShell. + Will raise a skip if IPython is not installed. + """ + pytest.importorskip('IPython', minversion="6.0.0") + from IPython.core.interactiveshell import InteractiveShell + return InteractiveShell() diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 2cd3b6ae0fab1..a5e61797bd478 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -342,6 +342,13 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): self._categories = categories self._codes = coerce_indexer_dtype(codes, categories) + def __dir__(self): + # Avoid IPython warnings for deprecated properties + # https://github.com/pandas-dev/pandas/issues/16409 + rv = set(dir(type(self))) + rv.discard("labels") + return sorted(rv) + @property def _constructor(self): return Categorical diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 631b91c3aad11..2bb825541e23b 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -184,6 +184,12 @@ def __getattr__(self, attr): matches_pattern = any(attr.startswith(x) for x in self._deprecated_valid_patterns) if not matches_pattern and attr not in self._deprecated_valids: + # avoid the warning, if it's just going to be an exception + # anyway. 
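# [Editor's note, not part of the patch: with the guard added here, an
# attribute that the underlying object also lacks -- e.g. ``r.F`` on a
# resampled Series -- raises AttributeError directly instead of first
# emitting the deprecation warning; the matching test change in
# test_resample.py below drops the ``assert_produces_warning`` wrapper.]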
+ if not hasattr(self.obj, attr): + raise AttributeError("'{}' has no attribute '{}'".format( + type(self.obj).__name__, attr + )) self = self._deprecated(attr) return object.__getattribute__(self, attr) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 6c611f827698c..3471f0b13b84b 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -737,6 +737,17 @@ def test_unicode_print(self): assert _rep(c) == expected + def test_tab_complete_warning(self, ip): + # https://github.com/pandas-dev/pandas/issues/16409 + pytest.importorskip('IPython', minversion="6.0.0") + from IPython.core.completer import provisionalcompleter + + code = "import pandas as pd; c = pd.Categorical([])" + ip.run_code(code) + with tm.assert_produces_warning(None): + with provisionalcompleter('ignore'): + list(ip.Completer.completions('c.', 1)) + def test_periodindex(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') diff --git a/pandas/tests/test_resample.py b/pandas/tests/test_resample.py index 9734431c8b012..dadae026979d2 100644 --- a/pandas/tests/test_resample.py +++ b/pandas/tests/test_resample.py @@ -3,6 +3,7 @@ from warnings import catch_warnings from datetime import datetime, timedelta from functools import partial +from textwrap import dedent import pytest import numpy as np @@ -282,8 +283,7 @@ def test_attribute_access(self): tm.assert_series_equal(r.A.sum(), r['A'].sum()) # getting - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - pytest.raises(AttributeError, lambda: r.F) + pytest.raises(AttributeError, lambda: r.F) # setting def f(): @@ -2820,6 +2820,19 @@ def test_back_compat_v180(self): expected = df.groupby('A').resample('4s').mean().ffill() assert_frame_equal(result, expected) + def test_tab_complete_ipython6_warning(self, ip): + from IPython.core.completer import provisionalcompleter + code = dedent("""\ + import pandas.util.testing as tm + s = tm.makeTimeSeries() + rs = s.resample("D") + """) + ip.run_code(code) + + with tm.assert_produces_warning(None): + with provisionalcompleter('ignore'): + list(ip.Completer.completions('rs.', 1)) + def test_deferred_with_groupby(self): # GH 12486 From 5f82c419c15b4a82a041e6531e1220fa10fb5d4a Mon Sep 17 00:00:00 2001 From: lloydkirk Date: Wed, 24 May 2017 23:24:14 +0000 Subject: [PATCH 30/44] Error with .drop([]) on non-unique index (#16428) (cherry picked from commit b0a51df89e40691608bb8d9aa80f2d7e4861b9e1) --- doc/source/whatsnew/v0.20.2.txt | 2 ++ pandas/core/generic.py | 3 ++- pandas/tests/frame/test_axis_select_reindex.py | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 3f0c578995097..1e440c17e0069 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -100,3 +100,5 @@ Categorical Other ^^^^^ + +- Bug in ``pd.drop([])`` for DataFrame with non-unique indices (:issue:`16270`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c33b30c78d812..cdfdc24a5b919 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -12,6 +12,7 @@ from pandas._libs import tslib, lib from pandas.core.dtypes.common import ( _ensure_int64, + _ensure_object, needs_i8_conversion, is_scalar, is_number, @@ -2062,7 +2063,7 @@ def drop(self, labels, axis=0, level=None, inplace=False, errors='raise'): result = dropped else: - labels = com._index_labels_to_array(labels) + labels = 
_ensure_object(com._index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): raise AssertionError('axis must be a MultiIndex') diff --git a/pandas/tests/frame/test_axis_select_reindex.py b/pandas/tests/frame/test_axis_select_reindex.py index a6326083c1bee..87d942101f5f1 100644 --- a/pandas/tests/frame/test_axis_select_reindex.py +++ b/pandas/tests/frame/test_axis_select_reindex.py @@ -61,6 +61,11 @@ def test_drop_names(self): expected = Index(['e', 'f'], name='second') tm.assert_index_equal(dropped.columns, expected) + # GH 16398 + dropped = df.drop([], errors='ignore') + expected = Index(['a', 'b', 'c'], name='first') + tm.assert_index_equal(dropped.index, expected) + def test_drop_col_still_multiindex(self): arrays = [['a', 'b', 'c', 'top'], ['', '', '', 'OD'], @@ -100,6 +105,7 @@ def test_drop(self): columns=['a', 'a', 'b']) assert_frame_equal(nu_df.drop('a', axis=1), nu_df[['b']]) assert_frame_equal(nu_df.drop('b', axis='columns'), nu_df['a']) + assert_frame_equal(nu_df.drop([]), nu_df) # GH 16398 nu_df = nu_df.set_index(pd.Index(['X', 'Y', 'X'])) nu_df.columns = list('abc') From aded53eb7314ece3dc653e0380d52bcccc307cfa Mon Sep 17 00:00:00 2001 From: WBare Date: Tue, 23 May 2017 11:59:16 -0400 Subject: [PATCH 31/44] BUG: Interpolate limit=n GH16282 (#16429) * BUG: Interpolate limit=n GH16282 * Fix: comment line over the 80 char limit * Test: Added small test for code coverage * DOC: Moved whats new comment from 0.21.0 to 0.20.2 * Update v0.21.0.txt Removed extraneous newline (cherry picked from commit a8a497f0d8c1acd472d57dfb48832292fb3f8c2e) --- doc/source/whatsnew/v0.20.2.txt | 2 +- pandas/core/missing.py | 60 ++++++++++++++++------------- pandas/tests/series/test_missing.py | 18 +++++++++ pandas/tests/test_common.py | 1 + 4 files changed, 53 insertions(+), 28 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 1e440c17e0069..c2d6741b27a43 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -91,7 +91,7 @@ Reshaping Numeric ^^^^^^^ - +- Bug in .interpolate(), where limit_direction was not respected when limit=None (default) was passed (:issue:16282) Categorical ^^^^^^^^^^^ diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 3010348423340..51778684d68f5 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -160,35 +160,41 @@ def _interp_limit(invalid, fw_limit, bw_limit): start_nans = set(range(ys.first_valid_index())) end_nans = set(range(1 + ys.last_valid_index(), len(valid))) - # This is a list of the indexes in the series whose yvalue is currently - # NaN, but whose interpolated yvalue will be overwritten with NaN after - # computing the interpolation. For each index in this list, one of these - # conditions is true of the corresponding NaN in the yvalues: + # violate_limit is a list of the indexes in the series whose yvalue is + # currently NaN, and should still be NaN after the interpolation. + # Specifically: # - # a) It is one of a chain of NaNs at the beginning of the series, and - # either limit is not specified or limit_direction is 'forward'. - # b) It is one of a chain of NaNs at the end of the series, and limit is - # specified and limit_direction is 'backward' or 'both'. - # c) Limit is nonzero and it is further than limit from the nearest non-NaN - # value (with respect to the limit_direction setting). 
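# [Editor's sketch -- illustration only, not part of the patch. The rewritten
# logic below makes the default limit=None fill an unlimited number of NaNs
# in the requested direction (GH 16282); expected outputs taken from
# test_interp_unlimited further down:
#
#   s = pd.Series([np.nan, 1., 3., np.nan, np.nan, np.nan, 11., np.nan])
#   s.interpolate(method='linear', limit_direction='both')
#   -> [1., 1., 3., 5., 7., 9., 11., 11.]
#   s.interpolate(method='linear', limit_direction='backward')
#   -> [1., 1., 3., 5., 7., 9., 11., nan]
# ]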
+ # If limit_direction='forward' or None then the list will contain NaNs at + # the beginning of the series, and NaNs that are more than 'limit' away + # from the prior non-NaN. # - # The default behavior is to fill forward with no limit, ignoring NaNs at - # the beginning (see issues #9218 and #10420) - violate_limit = sorted(start_nans) - - if limit is not None: - if not is_integer(limit): - raise ValueError('Limit must be an integer') - if limit < 1: - raise ValueError('Limit must be greater than 0') - if limit_direction == 'forward': - violate_limit = sorted(start_nans | set(_interp_limit(invalid, - limit, 0))) - if limit_direction == 'backward': - violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, - limit))) - if limit_direction == 'both': - violate_limit = sorted(_interp_limit(invalid, limit, limit)) + # If limit_direction='backward' then the list will contain NaNs at + # the end of the series, and NaNs that are more than 'limit' away + # from the subsequent non-NaN. + # + # If limit_direction='both' then the list will contain NaNs that + # are more than 'limit' away from any non-NaN. + # + # If limit=None, then use default behavior of filling an unlimited number + # of NaNs in the direction specified by limit_direction + + # default limit is unlimited GH #16282 + if limit is None: + limit = len(xvalues) + elif not is_integer(limit): + raise ValueError('Limit must be an integer') + elif limit < 1: + raise ValueError('Limit must be greater than 0') + + # each possible limit_direction + if limit_direction == 'forward': + violate_limit = sorted(start_nans | + set(_interp_limit(invalid, limit, 0))) + elif limit_direction == 'backward': + violate_limit = sorted(end_nans | + set(_interp_limit(invalid, 0, limit))) + elif limit_direction == 'both': + violate_limit = sorted(_interp_limit(invalid, limit, limit)) xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index c52c41877d5c0..8e73c17684a16 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -931,6 +931,24 @@ def test_interp_limit_forward(self): limit_direction='FORWARD') assert_series_equal(result, expected) + def test_interp_unlimited(self): + # these test are for issue #16282 default Limit=None is unlimited + s = Series([np.nan, 1., 3., np.nan, np.nan, np.nan, 11., np.nan]) + expected = Series([1., 1., 3., 5., 7., 9., 11., 11.]) + result = s.interpolate(method='linear', + limit_direction='both') + assert_series_equal(result, expected) + + expected = Series([np.nan, 1., 3., 5., 7., 9., 11., 11.]) + result = s.interpolate(method='linear', + limit_direction='forward') + assert_series_equal(result, expected) + + expected = Series([1., 1., 3., 5., 7., 9., 11., np.nan]) + result = s.interpolate(method='linear', + limit_direction='backward') + assert_series_equal(result, expected) + def test_interp_limit_bad_direction(self): s = Series([1, 3, np.nan, np.nan, np.nan, 11]) diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index d7dbaccb87ee8..77ef535e08964 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -16,6 +16,7 @@ def test_mut_exclusive(): com._mut_exclusive(a=1, b=2) assert com._mut_exclusive(a=1, b=None) == 1 assert com._mut_exclusive(major=None, major_axis=None) is None + assert com._mut_exclusive(a=None, b=2) == 2 def test_get_callable_name(): From e7356294cc70589e6e7a716b5a7eefa8fd2349e3 Mon Sep 17 
00:00:00 2001 From: Patrick Luo Date: Wed, 24 May 2017 19:15:32 -0400 Subject: [PATCH 32/44] BUG: handle nan values in DataFrame.update when overwrite=False (#15593) (#16430) (cherry picked from commit 85080aaf332711dbaebf4b4b266df053ccc6b52c) --- doc/source/whatsnew/v0.20.2.txt | 1 + pandas/core/frame.py | 8 ++++---- pandas/tests/frame/test_combine_concat.py | 22 ++++++++++++++++++++++ 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index c2d6741b27a43..de097e2ee5524 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -37,6 +37,7 @@ Bug Fixes ~~~~~~~~~ - Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`) +- Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN values`` (:issue:`15593`) - Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on Categoricals (:issue:`16409`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 8d437102e4d18..f42d1c2651fd3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3852,13 +3852,13 @@ def update(self, other, join='left', overwrite=True, filter_func=None, if overwrite: mask = isnull(that) - - # don't overwrite columns unecessarily - if mask.all(): - continue else: mask = notnull(this) + # don't overwrite columns unecessarily + if mask.all(): + continue + self[col] = expressions.where(mask, this, that, raise_on_error=True) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 688cacdee263e..f32efccf85fc6 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -763,3 +763,25 @@ def test_concat_datetime_datetime64_frame(self): # it works! 
pd.concat([df1, df2_obj]) + + +class TestDataFrameUpdate(TestData): + + def test_update_nan(self): + # #15593 #15617 + # test 1 + df1 = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)}) + df2 = DataFrame({'A': [None, 2, 3]}) + expected = df1.copy() + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + + # test 2 + df1 = DataFrame({'A': [1.0, None, 3], 'B': date_range('2000', periods=3)}) + df2 = DataFrame({'A': [None, 2, 3]}) + expected = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)}) + df1.update(df2, overwrite=False) + + tm.assert_frame_equal(df1, expected) + From c4b53c7a5b10133e1cb53d8f7b63c219f93621f1 Mon Sep 17 00:00:00 2001 From: chernrick Date: Thu, 25 May 2017 03:23:19 -0700 Subject: [PATCH 33/44] 15819 rolling window on empty df (#16431) (cherry picked from commit e41fe7f52a7ae6be962e683f40500624b2ba2cf6) --- doc/source/whatsnew/v0.20.2.txt | 5 +---- pandas/core/window.py | 2 +- pandas/tests/test_window.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index de097e2ee5524..1d0eb2aa174b0 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -38,11 +38,8 @@ Bug Fixes - Bug in using ``pathlib.Path`` or ``py.path.local`` objects with io functions (:issue:`16291`) - Bug in ``DataFrame.update()`` with ``overwrite=False`` and ``NaN values`` (:issue:`15593`) - - - Fixed a compatibility issue with IPython 6.0's tab completion showing deprecation warnings on Categoricals (:issue:`16409`) - Conversion ^^^^^^^^^^ @@ -72,7 +69,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - +- Bug creating datetime rolling window on an empty DataFrame (:issue:`15819`) Sparse diff --git a/pandas/core/window.py b/pandas/core/window.py index df8e0c05009f4..cf1bad706ae1d 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1074,7 +1074,7 @@ def validate(self): super(Rolling, self).validate() # we allow rolling on a datetimelike index - if (self.is_datetimelike and + if ((self.obj.empty or self.is_datetimelike) and isinstance(self.window, (compat.string_types, DateOffset, timedelta))): diff --git a/pandas/tests/test_window.py b/pandas/tests/test_window.py index 634cd5fe2586b..6a640d62108b3 100644 --- a/pandas/tests/test_window.py +++ b/pandas/tests/test_window.py @@ -441,6 +441,20 @@ def test_closed(self): with pytest.raises(ValueError): df.rolling(window=3, closed='neither') + @pytest.mark.parametrize('roller', ['1s', 1]) + def tests_empty_df_rolling(self, roller): + # GH 15819 Verifies that datetime and integer rolling windows can be + # applied to empty DataFrames + expected = DataFrame() + result = DataFrame().rolling(roller).sum() + tm.assert_frame_equal(result, expected) + + # Verifies that datetime and integer rolling windows can be applied to + # empty DataFrames with datetime index + expected = DataFrame(index=pd.DatetimeIndex([])) + result = DataFrame(index=pd.DatetimeIndex([])).rolling(roller).sum() + tm.assert_frame_equal(result, expected) + class TestExpanding(Base): @@ -483,6 +497,24 @@ def test_numpy_compat(self): tm.assert_raises_regex(UnsupportedFunctionCall, msg, getattr(e, func), dtype=np.float64) + @pytest.mark.parametrize( + 'expander', + [1, pytest.mark.xfail( + reason='GH 16425 expanding with offset not supported')('1s')]) + def tests_empty_df_expanding(self, expander): + # GH 15819 Verifies that datetime and integer expanding windows can be + # applied 
to empty DataFrames
+        expected = DataFrame()
+        result = DataFrame().expanding(expander).sum()
+        tm.assert_frame_equal(result, expected)
+
+        # Verifies that datetime and integer expanding windows can be applied
+        # to empty DataFrames with datetime index
+        expected = DataFrame(index=pd.DatetimeIndex([]))
+        result = DataFrame(
+            index=pd.DatetimeIndex([])).expanding(expander).sum()
+        tm.assert_frame_equal(result, expected)
+

 class TestEWM(Base):

From 07ae2e0a05a1138ea066072028859fb5529b46f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrew=20=E4=BA=AE?=
Date: Wed, 24 May 2017 19:09:25 -0700
Subject: [PATCH 34/44] BUG: Silence numpy warnings when broadcasting
 comparison ops (GH16378, GH16306) (#16433)

TST: test for fix of GH16378, GH16306

(cherry picked from commit 96f3e7ceb1363a475f0843045ca282df45e631dd)
---
 doc/source/whatsnew/v0.20.2.txt      |  1 +
 pandas/core/ops.py                   |  3 ++-
 pandas/tests/frame/test_analytics.py | 13 +++++++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 1d0eb2aa174b0..393e08438c902 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -44,6 +44,7 @@ Conversion
 ^^^^^^^^^^

 - Bug in ``pd.to_numeric()`` in which empty data inputs were causing Python to crash (:issue:`16302`)
+- Silence numpy warnings when broadcasting DataFrame to Series with comparison ops (:issue:`16378`, :issue:`16306`)


 Indexing
diff --git a/pandas/core/ops.py b/pandas/core/ops.py
index e7cfbdb0fc9c6..55473ec8d7cad 100644
--- a/pandas/core/ops.py
+++ b/pandas/core/ops.py
@@ -1250,7 +1250,8 @@ def _flex_comp_method_FRAME(op, name, str_rep=None, default_axis='columns',
                             masker=False):
     def na_op(x, y):
         try:
-            result = op(x, y)
+            with np.errstate(invalid='ignore'):
+                result = op(x, y)
         except TypeError:
             xrav = x.ravel()
             result = np.empty(x.size, dtype=bool)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index fa5769d9c0e65..b54f822f67e22 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -2056,3 +2056,16 @@ def test_n_duplicate_index(self, df_duplicates, n, order):
         result = df.nlargest(n, order)
         expected = df.sort_values(order, ascending=False).head(n)
         tm.assert_frame_equal(result, expected)
+
+    def test_series_broadcasting(self):
+        # smoke test for numpy warnings
+        # GH 16378, GH 16306
+        df = DataFrame([1.0, 1.0, 1.0])
+        df_nan = DataFrame({'A': [np.nan, 2.0, np.nan]})
+        s = Series([1, 1, 1])
+        s_nan = Series([np.nan, np.nan, 1])
+
+        with tm.assert_produces_warning(None):
+            df_nan.clip_lower(s, axis=0)
+            for op in ['lt', 'le', 'gt', 'ge', 'eq', 'ne']:
+                getattr(df, op)(s_nan, axis=0)
From 3f20852519429c79dd78d74727b7c632f960138f Mon Sep 17 00:00:00 2001
From: jaredsnyder
Date: Tue, 23 May 2017 02:51:06 -0500
Subject: [PATCH 35/44] BUG: fix isin with Series of tuples values (#16394)
 (#16434)

* Switched out "values = np.array(list(values), dtype='object')" for
"values = lib.list_to_object_array(list(values))" in the isin() method
found in core/algorithms.py

Added test for comparing to a list of tuples

(cherry picked from commit e053ee301d82a44ddc86dc7e164fea2d5c5178f8)
---
 doc/source/whatsnew/v0.20.2.txt      | 2 --
 pandas/core/algorithms.py            | 2 +-
 pandas/tests/frame/test_analytics.py | 8 ++++++++
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 393e08438c902..4781e162a8a5c 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ 
b/doc/source/whatsnew/v0.20.2.txt @@ -86,8 +86,6 @@ Reshaping - Bug in ``Series.isin(..)`` with a list of tuples (:issue:`16394`) - Bug in construction of a ``DataFrame`` with mixed dtypes including an all-NaT column. (:issue:`16395`) - - Numeric ^^^^^^^ - Bug in .interpolate(), where limit_direction was not respected when limit=None (default) was passed (:issue:16282) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a745ec616eda8..77d79c9585e57 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -388,7 +388,7 @@ def isin(comps, values): "[{0}]".format(type(values).__name__)) if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): - values = np.array(list(values), dtype='object') + values = lib.list_to_object_array(list(values)) comps, dtype, _ = _ensure_data(comps) values, _, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index b54f822f67e22..818c1fc574551 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1201,6 +1201,14 @@ def test_isin_df(self): expected['B'] = False tm.assert_frame_equal(result, expected) + def test_isin_tuples(self): + # GH16394 + df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']}) + df['C'] = list(zip(df['A'], df['B'])) + result = df['C'].isin([(1, 'a')]) + tm.assert_series_equal(result, + Series([True, False, False], name="C")) + def test_isin_df_dupe_values(self): df1 = DataFrame({'A': [1, 2, 3, 4], 'B': [2, np.nan, 4, 4]}) # just cols duped From 1d1ef5d09620d2ee6e0fabeabc34f1ad5ccd343b Mon Sep 17 00:00:00 2001 From: JimStearns206 Date: Tue, 23 May 2017 09:52:44 -0700 Subject: [PATCH 36/44] BUG: Render empty DataFrame as empty HTML table w/o raising IndexError. (#16441) * BUG: Render empty DataFrame as empty HTML table w/o raising IndexError. * TST: Test rendering of 2 empty-ish DataFrames (#15953) DataFrame with an index but no column, and one with a column but no index. Add entry to whatsnew. 
(cherry picked from commit d9a63d07e12a8cab2821814d449ddb66cedf90bb)
---
 doc/source/whatsnew/v0.20.2.txt       |  1 +
 pandas/io/formats/style.py            | 37 ++++++++++++++-------------
 pandas/tests/io/formats/test_style.py | 10 ++++++++
 3 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 4781e162a8a5c..a7aa038af7751 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -57,6 +57,7 @@ I/O
 ^^^

 - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
+- Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`)


 Plotting
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index eac82ddde2318..3d7e0fcdc69b3 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -238,24 +238,25 @@ def format_attr(pair):
                            "class": " ".join(cs),
                            "is_visible": True})

-            for c, value in enumerate(clabels[r]):
-                cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c]
-                cs.extend(cell_context.get(
-                    "col_headings", {}).get(r, {}).get(c, []))
-                es = {
-                    "type": "th",
-                    "value": value,
-                    "display_value": value,
-                    "class": " ".join(cs),
-                    "is_visible": _is_visible(c, r, col_lengths),
-                }
-                colspan = col_lengths.get((r, c), 0)
-                if colspan > 1:
-                    es["attributes"] = [
-                        format_attr({"key": "colspan", "value": colspan})
-                    ]
-                row_es.append(es)
-            head.append(row_es)
+            if clabels:
+                for c, value in enumerate(clabels[r]):
+                    cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c]
+                    cs.extend(cell_context.get(
+                        "col_headings", {}).get(r, {}).get(c, []))
+                    es = {
+                        "type": "th",
+                        "value": value,
+                        "display_value": value,
+                        "class": " ".join(cs),
+                        "is_visible": _is_visible(c, r, col_lengths),
+                    }
+                    colspan = col_lengths.get((r, c), 0)
+                    if colspan > 1:
+                        es["attributes"] = [
+                            format_attr({"key": "colspan", "value": colspan})
+                        ]
+                    row_es.append(es)
+                head.append(row_es)

         if self.data.index.names and not all(x is None
                                              for x in self.data.index.names):
diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py
index ee7356f12f498..9911888f758fb 100644
--- a/pandas/tests/io/formats/test_style.py
+++ b/pandas/tests/io/formats/test_style.py
@@ -103,6 +103,16 @@ def test_render(self):
         s.render()
         # it worked?

+    def test_render_empty_dfs(self):
+        empty_df = DataFrame()
+        es = Styler(empty_df)
+        es.render()
+        # An index but no columns
+        DataFrame(columns=['a']).style.render()
+        # A column but no index
+        DataFrame(index=['a']).style.render()
+        # No IndexError raised?
+
     def test_render_double(self):
         df = pd.DataFrame({"A": [0, 1]})
         style = lambda x: pd.Series(["color: red; border: 1px",
From 033a892a1f7a9fcfc85592629d89f4c9aa500aec Mon Sep 17 00:00:00 2001
From: Hugues Valois
Date: Tue, 23 May 2017 18:24:41 -0700
Subject: [PATCH 37/44] BUG: Don't ignore figsize in df.boxplot (#16445)

* Propagate the figsize via the rcParams, since matplotlib doesn't allow
passing it as a parameter to gca().

* Update what's new for v0.21.0 and use rc_context() to temporarily change
rcParams.

* Move bug fix from 0.21.0 whatsnew to 0.20.2.

* Allow passing in an rc to _gca() instead of just figsize, and added a test
for boxplot figsize.

* Fix style violations.
(cherry picked from commit 044feb537ec7e127822a62a7cb90e97d61ff5a56)
---
 doc/source/whatsnew/v0.20.2.txt              |  2 ++
 pandas/plotting/_core.py                     | 14 +++++---------
 pandas/tests/plotting/test_boxplot_method.py |  8 ++++++++
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index a7aa038af7751..0ee25759ce6b2 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -64,6 +64,8 @@ Plotting
 ^^^^^^^^

 - Bug in ``DataFrame.plot`` with a single column and a list-like ``color`` (:issue:`3486`)
+- Bug in ``plot`` where ``NaT`` in ``DatetimeIndex`` results in ``Timestamp.min`` (:issue:`12405`)
+- Bug in ``DataFrame.boxplot`` where ``figsize`` keyword was not respected for non-grouped boxplots (:issue:`11959`)



diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index c0f9f62106330..ec7c1f02f2ee8 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -48,9 +48,10 @@ def _get_standard_kind(kind):
     return {'density': 'kde'}.get(kind, kind)


-def _gca():
+def _gca(rc=None):
     import matplotlib.pyplot as plt
-    return plt.gca()
+    with plt.rc_context(rc):
+        return plt.gca()


 def _gcf():
@@ -1869,12 +1870,6 @@ def plot_series(data, kind='line', ax=None,  # Series unique
                 **kwds):
     import matplotlib.pyplot as plt

-    """
-    If no axes is specified, check whether there are existing figures
-    If there is no existing figures, _gca() will
-    create a figure with the default figsize, causing the figsize=parameter to
-    be ignored.
-    """
     if ax is None and len(plt.get_fignums()) > 0:
         ax = _gca()
         ax = MPLPlot._get_ax_layer(ax)
@@ -2004,7 +1999,8 @@ def plot_group(keys, values, ax):
                              "'by' is None")

         if ax is None:
-            ax = _gca()
+            rc = {'figure.figsize': figsize} if figsize is not None else {}
+            ax = _gca(rc)
         data = data._get_numeric_data()
         if columns is None:
             columns = data.columns
diff --git a/pandas/tests/plotting/test_boxplot_method.py b/pandas/tests/plotting/test_boxplot_method.py
index 1e06c13980657..547dd0154de4e 100644
--- a/pandas/tests/plotting/test_boxplot_method.py
+++ b/pandas/tests/plotting/test_boxplot_method.py
@@ -160,6 +160,14 @@ def test_boxplot_empty_column(self):
         df.loc[:, 0] = np.nan
         _check_plot_works(df.boxplot, return_type='axes')

+    @slow
+    def test_figsize(self):
+        df = DataFrame(np.random.rand(10, 5),
+                       columns=['A', 'B', 'C', 'D', 'E'])
+        result = df.boxplot(return_type='axes', figsize=(12, 8))
+        assert result.figure.bbox_inches.width == 12
+        assert result.figure.bbox_inches.height == 8
+
     def test_fontsize(self):
         df = DataFrame({"a": [1, 2, 3, 4, 5, 6]})
         self._check_ticks_props(df.boxplot("a", fontsize=16),
From 01bc87235b0772186ca639ccafd73cd10e5ca1a0 Mon Sep 17 00:00:00 2001
From: Pankaj Pandey
Date: Wed, 24 May 2017 17:36:05 +0000
Subject: [PATCH 38/44] BUG: Fix warning with c engine when skipping lines
 with comment (#16455)

* Fix correct warning with c engine when skipping lines

Fixed bug where c engine would not print warnings for lines it skipped
in case the skipped line had an inline comment. Also, its accounting of
number of fields in such lines would be off by one.

* Use `tm.capture_stderr` to capture stderr

* Add bug fix note in `whatsnew/v0.20.3.txt`

* Move test to CParserTests

The behavior is only applicable on the `c` engine.
* Update whatsnew bug entry as per review

(cherry picked from commit 97ad3fb9c87226ad983267e2891dbbf68432b8ea)
---
 doc/source/whatsnew/v0.20.2.txt         |  1 +
 pandas/_libs/src/parser/tokenizer.c     |  3 +++
 pandas/tests/io/parser/c_parser_only.py | 29 +++++++++++++++++++++++++
 3 files changed, 33 insertions(+)

diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
index 0ee25759ce6b2..247d2a188d6fa 100644
--- a/doc/source/whatsnew/v0.20.2.txt
+++ b/doc/source/whatsnew/v0.20.2.txt
@@ -56,6 +56,7 @@ Indexing

 I/O
 ^^^

+- Bug in ``pd.read_csv()`` when ``comment`` is passed in space-delimited text files (:issue:`16472`)
 - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
 - Bug that raised ``IndexError`` when HTML-rendering an empty ``DataFrame`` (:issue:`15953`)

diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 6b0775e54da0c..be23ebb023383 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -832,6 +832,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
             } else if (IS_CARRIAGE(c)) {
                 self->state = EAT_CRNL;
                 break;
+            } else if (IS_COMMENT_CHAR(c)) {
+                self->state = EAT_COMMENT;
+                break;
             } else if (!IS_WHITESPACE(c)) {
                 self->state = START_FIELD;
                 // fall through to subsequent state
diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
index 3e7a648474bc3..56ac10404b7b2 100644
--- a/pandas/tests/io/parser/c_parser_only.py
+++ b/pandas/tests/io/parser/c_parser_only.py
@@ -7,6 +7,8 @@ further arguments when parsing.

 """

+import sys
+
 import pytest

 import numpy as np
@@ -417,3 +419,30 @@ def test_data_after_quote(self):

         expected = DataFrame({'a': ['1', 'ba']})
         tm.assert_frame_equal(result, expected)
+
+    @tm.capture_stderr
+    def test_comment_whitespace_delimited(self):
+        test_input = """\
+1 2
+2 2 3
+3 2 3 # 3 fields
+4 2 3# 3 fields
+5 2 # 2 fields
+6 2# 2 fields
+7 # 1 field, NaN
+8# 1 field, NaN
+9 2 3 # skipped line
+# comment"""
+        df = self.read_csv(StringIO(test_input), comment='#', header=None,
+                           delimiter='\\s+', skiprows=0,
+                           error_bad_lines=False)
+        error = sys.stderr.getvalue()
+        # skipped lines 2, 3, 4, 9
+        for line_num in (2, 3, 4, 9):
+            assert 'Skipping line {}'.format(line_num) in error, error
+        expected = DataFrame([[1, 2],
+                              [5, 2],
+                              [6, 2],
+                              [7, np.nan],
+                              [8, np.nan]])
+        tm.assert_frame_equal(df, expected)
From c52f647bfa76390c0da3154fa0458bd7bb520a02 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche
Date: Tue, 23 May 2017 19:33:18 +0200
Subject: [PATCH 39/44] DOC: update make.py script (#16456)

(cherry picked from commit 92372c7632a92d7bd980534b5a78590b2d9453ad)
---
 doc/make.py | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/doc/make.py b/doc/make.py
index e70655c3e2f92..781347a3c3e1b 100755
--- a/doc/make.py
+++ b/doc/make.py
@@ -34,39 +34,52 @@
 SPHINX_BUILD = 'sphinxbuild'


-def upload_dev(user='pandas'):
+def _process_user(user):
+    if user is None or user is False:
+        user = ''
+    else:
+        user = user + '@'
+    return user
+
+
+def upload_dev(user=None):
     'push a copy to the pydata dev directory'
-    if os.system('cd build/html; rsync -avz . {0}@pandas.pydata.org'
+    user = _process_user(user)
+    if os.system('cd build/html; rsync -avz . 
{0}pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/dev/ -essh'.format(user)): raise SystemExit('Upload to Pydata Dev failed') -def upload_dev_pdf(user='pandas'): +def upload_dev_pdf(user=None): 'push a copy to the pydata dev directory' - if os.system('cd build/latex; scp pandas.pdf {0}@pandas.pydata.org' + user = _process_user(user) + if os.system('cd build/latex; scp pandas.pdf {0}pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/dev/'.format(user)): raise SystemExit('PDF upload to Pydata Dev failed') -def upload_stable(user='pandas'): +def upload_stable(user=None): 'push a copy to the pydata stable directory' - if os.system('cd build/html; rsync -avz . {0}@pandas.pydata.org' + user = _process_user(user) + if os.system('cd build/html; rsync -avz . {0}pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/stable/ -essh'.format(user)): raise SystemExit('Upload to stable failed') -def upload_stable_pdf(user='pandas'): +def upload_stable_pdf(user=None): 'push a copy to the pydata dev directory' - if os.system('cd build/latex; scp pandas.pdf {0}@pandas.pydata.org' + user = _process_user(user) + if os.system('cd build/latex; scp pandas.pdf {0}pandas.pydata.org' ':/usr/share/nginx/pandas/pandas-docs/stable/'.format(user)): raise SystemExit('PDF upload to stable failed') -def upload_prev(ver, doc_root='./', user='pandas'): +def upload_prev(ver, doc_root='./', user=None): 'push a copy of older release to appropriate version directory' + user = _process_user(user) local_dir = doc_root + 'build/html' remote_dir = '/usr/share/nginx/pandas/pandas-docs/version/%s/' % ver - cmd = 'cd %s; rsync -avz . %s@pandas.pydata.org:%s -essh' + cmd = 'cd %s; rsync -avz . %spandas.pydata.org:%s -essh' cmd = cmd % (local_dir, user, remote_dir) print(cmd) if os.system(cmd): @@ -74,7 +87,7 @@ def upload_prev(ver, doc_root='./', user='pandas'): 'Upload to %s from %s failed' % (remote_dir, local_dir)) local_dir = doc_root + 'build/latex' - pdf_cmd = 'cd %s; scp pandas.pdf %s@pandas.pydata.org:%s' + pdf_cmd = 'cd %s; scp pandas.pdf %spandas.pydata.org:%s' pdf_cmd = pdf_cmd % (local_dir, user, remote_dir) if os.system(pdf_cmd): raise SystemExit('Upload PDF to %s from %s failed' % (ver, doc_root)) From 7b0ed01a62a9519ed30c4c6ba2c8d225d2728a85 Mon Sep 17 00:00:00 2001 From: Becky Sweger Date: Fri, 26 May 2017 07:47:44 -0400 Subject: [PATCH 40/44] ENH: Add to_latex() method to Series (#16180) (#16465) * ENH: Add to_latex() method to Series (#16180) This changeset adds _repr_latex_ to the Series class and moves the to_latex() method from the DataFrame class to the NDFrame class. 
* Add Series to_latex test * Move _repr_latex_ to NDFrame Streamline things a bit by moving _repr_latex_ methods out of the Series and DataFrame classes * DOC: Added versionchanged (cherry picked from commit 6a6227d19b6aa522d6ae5362da0589a45913d7bd) --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.20.2.txt | 1 + pandas/core/frame.py | 98 --------------------- pandas/core/generic.py | 106 ++++++++++++++++++++++- pandas/tests/io/formats/test_to_latex.py | 17 +++- pandas/tests/series/test_repr.py | 20 ++++- 6 files changed, 142 insertions(+), 101 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index cb5136df1ff8b..e7d12df56d260 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -724,6 +724,7 @@ Serialization / IO / Conversion Series.to_dense Series.to_string Series.to_clipboard + Series.to_latex Sparse ~~~~~~ diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt index 247d2a188d6fa..07ab637dd29f5 100644 --- a/doc/source/whatsnew/v0.20.2.txt +++ b/doc/source/whatsnew/v0.20.2.txt @@ -20,6 +20,7 @@ Enhancements ~~~~~~~~~~~~ - Unblocked access to additional compression types supported in pytables: 'blosc:blosclz, 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy', 'blosc:zlib', 'blosc:zstd' (:issue:`14478`) +- ``Series`` provides a ``to_latex`` method (:issue:`16180`) .. _whatsnew_0202.performance: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f42d1c2651fd3..22f73490335f5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -620,16 +620,6 @@ def _repr_html_(self): else: return None - def _repr_latex_(self): - """ - Returns a LaTeX representation for a particular Dataframe. - Mainly for use with nbconvert (jupyter notebook conversion to pdf). - """ - if get_option('display.latex.repr'): - return self.to_latex() - else: - return None - @property def style(self): """ @@ -1596,94 +1586,6 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, if buf is None: return formatter.buf.getvalue() - @Substitution(header='Write out column names. If a list of string is given, \ -it is assumed to be aliases for the column names.') - @Appender(fmt.common_docstring + fmt.return_docstring, indents=1) - def to_latex(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, bold_rows=True, - column_format=None, longtable=None, escape=None, - encoding=None, decimal='.', multicolumn=None, - multicolumn_format=None, multirow=None): - r""" - Render a DataFrame to a tabular environment table. You can splice - this into a LaTeX document. Requires \usepackage{booktabs}. - - `to_latex`-specific options: - - bold_rows : boolean, default True - Make the row labels bold in the output - column_format : str, default None - The columns format as specified in `LaTeX table format - `__ e.g 'rcl' for 3 - columns - longtable : boolean, default will be read from the pandas config module - Default: False. - Use a longtable environment instead of tabular. Requires adding - a \usepackage{longtable} to your LaTeX preamble. - escape : boolean, default will be read from the pandas config module - Default: True. - When set to False prevents from escaping latex special - characters in column names. - encoding : str, default None - A string representing the encoding to use in the output file, - defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. - decimal : string, default '.' - Character recognized as decimal separator, e.g. ',' in Europe. - - .. 
versionadded:: 0.18.0 - - multicolumn : boolean, default True - Use \multicolumn to enhance MultiIndex columns. - The default will be read from the config module. - - .. versionadded:: 0.20.0 - - multicolumn_format : str, default 'l' - The alignment for multicolumns, similar to `column_format` - The default will be read from the config module. - - .. versionadded:: 0.20.0 - - multirow : boolean, default False - Use \multirow to enhance MultiIndex rows. - Requires adding a \usepackage{multirow} to your LaTeX preamble. - Will print centered labels (instead of top-aligned) - across the contained rows, separating groups via clines. - The default will be read from the pandas config module. - - .. versionadded:: 0.20.0 - - """ - # Get defaults from the pandas config - if longtable is None: - longtable = get_option("display.latex.longtable") - if escape is None: - escape = get_option("display.latex.escape") - if multicolumn is None: - multicolumn = get_option("display.latex.multicolumn") - if multicolumn_format is None: - multicolumn_format = get_option("display.latex.multicolumn_format") - if multirow is None: - multirow = get_option("display.latex.multirow") - - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - header=header, index=index, - formatters=formatters, - float_format=float_format, - bold_rows=bold_rows, - sparsify=sparsify, - index_names=index_names, - escape=escape, decimal=decimal) - formatter.to_latex(column_format=column_format, longtable=longtable, - encoding=encoding, multicolumn=multicolumn, - multicolumn_format=multicolumn_format, - multirow=multirow) - - if buf is None: - return formatter.buf.getvalue() - def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index cdfdc24a5b919..1a1bbc37cd816 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -46,7 +46,7 @@ import pandas.core.common as com import pandas.core.missing as missing from pandas.io.formats.printing import pprint_thing -from pandas.io.formats.format import format_percentiles +from pandas.io.formats.format import format_percentiles, DataFrameFormatter from pandas.tseries.frequencies import to_offset from pandas import compat from pandas.compat.numpy import function as nv @@ -1051,6 +1051,16 @@ def __setstate__(self, state): # ---------------------------------------------------------------------- # IO + def _repr_latex_(self): + """ + Returns a LaTeX representation for a particular object. + Mainly for use with nbconvert (jupyter notebook conversion to pdf). + """ + if config.get_option('display.latex.repr'): + return self.to_latex() + else: + return None + # ---------------------------------------------------------------------- # I/O Methods @@ -1489,6 +1499,100 @@ def to_xarray(self): coords=coords, ) + _shared_docs['to_latex'] = """ + Render an object to a tabular environment table. You can splice + this into a LaTeX document. Requires \\usepackage{booktabs}. + + .. versionchanged:: 0.20.2 + Added to Series + + `to_latex`-specific options: + + bold_rows : boolean, default True + Make the row labels bold in the output + column_format : str, default None + The columns format as specified in `LaTeX table format + `__ e.g 'rcl' for 3 + columns + longtable : boolean, default will be read from the pandas config module + Default: False. + Use a longtable environment instead of tabular. 
Requires adding + a \\usepackage{longtable} to your LaTeX preamble. + escape : boolean, default will be read from the pandas config module + Default: True. + When set to False prevents from escaping latex special + characters in column names. + encoding : str, default None + A string representing the encoding to use in the output file, + defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. + decimal : string, default '.' + Character recognized as decimal separator, e.g. ',' in Europe. + + .. versionadded:: 0.18.0 + + multicolumn : boolean, default True + Use \multicolumn to enhance MultiIndex columns. + The default will be read from the config module. + + .. versionadded:: 0.20.0 + + multicolumn_format : str, default 'l' + The alignment for multicolumns, similar to `column_format` + The default will be read from the config module. + + .. versionadded:: 0.20.0 + + multirow : boolean, default False + Use \multirow to enhance MultiIndex rows. + Requires adding a \\usepackage{multirow} to your LaTeX preamble. + Will print centered labels (instead of top-aligned) + across the contained rows, separating groups via clines. + The default will be read from the pandas config module. + + .. versionadded:: 0.20.0 + """ + + @Substitution(header='Write out column names. If a list of string is given, \ +it is assumed to be aliases for the column names.') + @Appender(_shared_docs['to_latex'] % _shared_doc_kwargs) + def to_latex(self, buf=None, columns=None, col_space=None, header=True, + index=True, na_rep='NaN', formatters=None, float_format=None, + sparsify=None, index_names=True, bold_rows=True, + column_format=None, longtable=None, escape=None, + encoding=None, decimal='.', multicolumn=None, + multicolumn_format=None, multirow=None): + # Get defaults from the pandas config + if self.ndim == 1: + self = self.to_frame() + if longtable is None: + longtable = config.get_option("display.latex.longtable") + if escape is None: + escape = config.get_option("display.latex.escape") + if multicolumn is None: + multicolumn = config.get_option("display.latex.multicolumn") + if multicolumn_format is None: + multicolumn_format = config.get_option( + "display.latex.multicolumn_format") + if multirow is None: + multirow = config.get_option("display.latex.multirow") + + formatter = DataFrameFormatter(self, buf=buf, columns=columns, + col_space=col_space, na_rep=na_rep, + header=header, index=index, + formatters=formatters, + float_format=float_format, + bold_rows=bold_rows, + sparsify=sparsify, + index_names=index_names, + escape=escape, decimal=decimal) + formatter.to_latex(column_format=column_format, longtable=longtable, + encoding=encoding, multicolumn=multicolumn, + multicolumn_format=multicolumn_format, + multirow=multirow) + + if buf is None: + return formatter.buf.getvalue() + # ---------------------------------------------------------------------- # Fancy Indexing diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py index 2542deb0cedf1..4ee77abb32c26 100644 --- a/pandas/tests/io/formats/test_to_latex.py +++ b/pandas/tests/io/formats/test_to_latex.py @@ -3,7 +3,7 @@ import pytest import pandas as pd -from pandas import DataFrame, compat +from pandas import DataFrame, compat, Series from pandas.util import testing as tm from pandas.compat import u import codecs @@ -491,3 +491,18 @@ def test_to_latex_decimal(self, frame): """ assert withindex_result == withindex_expected + + def test_to_latex_series(self): + s = Series(['a', 'b', 'c']) + withindex_result = 
s.to_latex() + withindex_expected = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & a \\ +1 & b \\ +2 & c \\ +\bottomrule +\end{tabular} +""" + assert withindex_result == withindex_expected diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index 3af61b0a902d3..c22e2ca8e0dc8 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -8,7 +8,7 @@ import numpy as np import pandas as pd -from pandas import (Index, Series, DataFrame, date_range) +from pandas import (Index, Series, DataFrame, date_range, option_context) from pandas.core.index import MultiIndex from pandas.compat import lrange, range, u @@ -180,3 +180,21 @@ def test_timeseries_repr_object_dtype(self): ts2 = ts.iloc[np.random.randint(0, len(ts) - 1, 400)] repr(ts2).splitlines()[-1] + + def test_latex_repr(self): + result = r"""\begin{tabular}{ll} +\toprule +{} & 0 \\ +\midrule +0 & $\alpha$ \\ +1 & b \\ +2 & c \\ +\bottomrule +\end{tabular} +""" + with option_context('display.latex.escape', False, + 'display.latex.repr', True): + s = Series([r'$\alpha$', 'b', 'c']) + assert result == s._repr_latex_() + + assert s._repr_latex_() is None From ffe4429051571332dc32c77ee992c468595ccdb3 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 24 May 2017 06:35:20 -0400 Subject: [PATCH 41/44] COMPAT: feather-format 0.4.0 compat (#16475) (cherry picked from commit 692a5b94a038cc54fc7e18b77225efcecaccacab) --- ci/requirements-3.5_OSX.sh | 2 +- pandas/tests/io/test_feather.py | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/ci/requirements-3.5_OSX.sh b/ci/requirements-3.5_OSX.sh index cfbd2882a8a2d..39ea1a0cf67bf 100644 --- a/ci/requirements-3.5_OSX.sh +++ b/ci/requirements-3.5_OSX.sh @@ -4,4 +4,4 @@ source activate pandas echo "install 35_OSX" -conda install -n pandas -c conda-forge feather-format +conda install -n pandas -c conda-forge feather-format==0.3.1 diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index e3190efecba30..96df05aa096e4 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -10,6 +10,10 @@ from feather import FeatherError from pandas.util.testing import assert_frame_equal, ensure_clean import pandas.util.testing as tm +from distutils.version import LooseVersion + + +fv = LooseVersion(feather.__version__) @pytest.mark.single @@ -57,6 +61,7 @@ def test_basic(self): assert df.dttz.dtype.tz.zone == 'US/Eastern' self.check_round_trip(df) + @pytest.mark.skipif(fv >= '0.4.0', reason='fixed in 0.4.0') def test_strided_data_issues(self): # strided data issuehttps://github.com/wesm/feather/issues/97 @@ -76,12 +81,10 @@ def test_stringify_columns(self): df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy() self.check_error_on_write(df, ValueError) + @pytest.mark.skipif(fv >= '0.4.0', reason='fixed in 0.4.0') def test_unsupported(self): - # period - df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) - self.check_error_on_write(df, ValueError) - + # timedelta df = pd.DataFrame({'a': pd.timedelta_range('1 day', periods=3)}) self.check_error_on_write(df, FeatherError) @@ -89,6 +92,12 @@ def test_unsupported(self): df = pd.DataFrame({'a': ['a', 1, 2.0]}) self.check_error_on_write(df, ValueError) + def test_unsupported_other(self): + + # period + df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) + self.check_error_on_write(df, ValueError) + def test_write_with_index(self): df = pd.DataFrame({'A': [1, 2, 3]}) From 
788716f46494a183646949818e1e142331e8dded Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 24 May 2017 22:24:05 -0500 Subject: [PATCH 42/44] CLN: Small linting failures (#16491) (cherry picked from commit d7962c50df5edcee19d6fce80a030eef9a3c7ae4) --- pandas/tests/frame/test_combine_concat.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index f32efccf85fc6..e82faaeef2986 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -778,10 +778,11 @@ def test_update_nan(self): tm.assert_frame_equal(df1, expected) # test 2 - df1 = DataFrame({'A': [1.0, None, 3], 'B': date_range('2000', periods=3)}) + df1 = DataFrame({'A': [1.0, None, 3], + 'B': date_range('2000', periods=3)}) df2 = DataFrame({'A': [None, 2, 3]}) - expected = DataFrame({'A': [1.0, 2, 3], 'B': date_range('2000', periods=3)}) + expected = DataFrame({'A': [1.0, 2, 3], + 'B': date_range('2000', periods=3)}) df1.update(df2, overwrite=False) tm.assert_frame_equal(df1, expected) - From baf8b151bfbfa705bfdfcefa21db3d68433f2cc0 Mon Sep 17 00:00:00 2001 From: Aaron Barber Date: Fri, 26 May 2017 12:11:55 -0700 Subject: [PATCH 43/44] TST: ujson tests are not being run (#16499) (#16500) closes #16499 (cherry picked from commit 66491574df3d223a15f2fd229793ccdbfd8a0fa3) --- pandas/tests/io/json/test_ujson.py | 44 +++++++++++++++--------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 86b0e5a0c6a2d..59d908638a244 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -25,7 +25,7 @@ else partial(json.dumps, encoding="utf-8")) -class UltraJSONTests(object): +class TestUltraJSONTests(object): @pytest.mark.skipif(compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865") @@ -946,19 +946,19 @@ def my_obj_handler(obj): ujson.decode(ujson.encode(l, default_handler=str))) -class NumpyJSONTests(object): +class TestNumpyJSONTests(object): - def testBool(self): + def test_Bool(self): b = np.bool(True) assert ujson.decode(ujson.encode(b)) == b - def testBoolArray(self): + def test_BoolArray(self): inpt = np.array([True, False, True, True, False, True, False, False], dtype=np.bool) outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=np.bool) tm.assert_numpy_array_equal(inpt, outp) - def testInt(self): + def test_Int(self): num = np.int(2562010) assert np.int(ujson.decode(ujson.encode(num))) == num @@ -986,7 +986,7 @@ def testInt(self): num = np.uint64(2562010) assert np.uint64(ujson.decode(ujson.encode(num))) == num - def testIntArray(self): + def test_IntArray(self): arr = np.arange(100, dtype=np.int) dtypes = (np.int, np.int8, np.int16, np.int32, np.int64, np.uint, np.uint8, np.uint16, np.uint32, np.uint64) @@ -995,7 +995,7 @@ def testIntArray(self): outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=dtype) tm.assert_numpy_array_equal(inpt, outp) - def testIntMax(self): + def test_IntMax(self): num = np.int(np.iinfo(np.int).max) assert np.int(ujson.decode(ujson.encode(num))) == num @@ -1025,7 +1025,7 @@ def testIntMax(self): num = np.uint64(np.iinfo(np.int64).max) assert np.uint64(ujson.decode(ujson.encode(num))) == num - def testFloat(self): + def test_Float(self): num = np.float(256.2013) assert np.float(ujson.decode(ujson.encode(num))) == num @@ -1035,7 +1035,7 @@ def testFloat(self): num = np.float64(256.2013) assert 
np.float64(ujson.decode(ujson.encode(num))) == num - def testFloatArray(self): + def test_FloatArray(self): arr = np.arange(12.5, 185.72, 1.7322, dtype=np.float) dtypes = (np.float, np.float32, np.float64) @@ -1045,7 +1045,7 @@ def testFloatArray(self): inpt, double_precision=15)), dtype=dtype) tm.assert_almost_equal(inpt, outp) - def testFloatMax(self): + def test_FloatMax(self): num = np.float(np.finfo(np.float).max / 10) tm.assert_almost_equal(np.float(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) @@ -1058,7 +1058,7 @@ def testFloatMax(self): tm.assert_almost_equal(np.float64(ujson.decode( ujson.encode(num, double_precision=15))), num, 15) - def testArrays(self): + def test_Arrays(self): arr = np.arange(100) arr = arr.reshape((10, 10)) @@ -1099,13 +1099,13 @@ def testArrays(self): outp = ujson.decode(ujson.encode(arr), numpy=True, dtype=np.float32) tm.assert_almost_equal(arr, outp) - def testOdArray(self): + def test_OdArray(self): def will_raise(): ujson.encode(np.array(1)) pytest.raises(TypeError, will_raise) - def testArrayNumpyExcept(self): + def test_ArrayNumpyExcept(self): input = ujson.dumps([42, {}, 'a']) try: @@ -1188,7 +1188,7 @@ def testArrayNumpyExcept(self): except: assert False, "Wrong exception" - def testArrayNumpyLabelled(self): + def test_ArrayNumpyLabelled(self): input = {'a': []} output = ujson.loads(ujson.dumps(input), numpy=True, labelled=True) assert (np.empty((1, 0)) == output[0]).all() @@ -1222,9 +1222,9 @@ def testArrayNumpyLabelled(self): assert (np.array(['a', 'b']) == output[2]).all() -class PandasJSONTests(object): +class TestPandasJSONTests(object): - def testDataFrame(self): + def test_DataFrame(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1254,7 +1254,7 @@ def testDataFrame(self): tm.assert_index_equal(df.transpose().columns, outp.columns) tm.assert_index_equal(df.transpose().index, outp.index) - def testDataFrameNumpy(self): + def test_DataFrameNumpy(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1277,7 +1277,7 @@ def testDataFrameNumpy(self): tm.assert_index_equal(df.transpose().columns, outp.columns) tm.assert_index_equal(df.transpose().index, outp.index) - def testDataFrameNested(self): + def test_DataFrameNested(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1303,7 +1303,7 @@ def testDataFrameNested(self): 'df2': ujson.decode(ujson.encode(df, orient="split"))} assert ujson.decode(ujson.encode(nested, orient="split")) == exp - def testDataFrameNumpyLabelled(self): + def test_DataFrameNumpyLabelled(self): df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ 'a', 'b'], columns=['x', 'y', 'z']) @@ -1326,7 +1326,7 @@ def testDataFrameNumpyLabelled(self): tm.assert_index_equal(df.columns, outp.columns) tm.assert_index_equal(df.index, outp.index) - def testSeries(self): + def test_Series(self): s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]).sort_values() @@ -1374,7 +1374,7 @@ def testSeries(self): s, orient="index"), numpy=True)).sort_values() tm.assert_series_equal(outp, exp) - def testSeriesNested(self): + def test_SeriesNested(self): s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6, 7, 8, 9, 10, 15]).sort_values() @@ -1400,7 +1400,7 @@ def testSeriesNested(self): 's2': ujson.decode(ujson.encode(s, orient="index"))} assert ujson.decode(ujson.encode(nested, orient="index")) == exp - def testIndex(self): + def test_Index(self): i = Index([23, 45, 18, 98, 43, 
11], name="index") # column indexed From 9c4a033bf7e170118fd19e7f148e53cb79d24a01 Mon Sep 17 00:00:00 2001 From: "John W. O'Brien" Date: Mon, 29 May 2017 12:00:42 -0400 Subject: [PATCH 44/44] TST: Specify HTML file encoding on PY3 (#16526) (cherry picked from commit e60dc4cff2c6e8a2283fbb906faeb8cb01df37ff) --- pandas/tests/io/test_html.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 6da77bf423609..1e1d653cf94d1 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -20,7 +20,7 @@ from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) from pandas.compat import (map, zip, StringIO, string_types, BytesIO, - is_platform_windows) + is_platform_windows, PY3) from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html from pandas._libs.parsers import ParserError @@ -96,6 +96,9 @@ def read_html(self, *args, **kwargs): class TestReadHtml(ReadHtmlMixin): flavor = 'bs4' spam_data = os.path.join(DATA_PATH, 'spam.html') + spam_data_kwargs = {} + if PY3: + spam_data_kwargs['encoding'] = 'UTF-8' banklist_data = os.path.join(DATA_PATH, 'banklist.html') @classmethod @@ -247,10 +250,10 @@ def test_infer_types(self): assert_framelist_equal(df1, df2) def test_string_io(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data1 = StringIO(f.read()) - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data2 = StringIO(f.read()) df1 = self.read_html(data1, '.*Water.*') @@ -258,7 +261,7 @@ def test_string_io(self): assert_framelist_equal(df1, df2) def test_string(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: data = f.read() df1 = self.read_html(data, '.*Water.*') @@ -267,10 +270,10 @@ def test_string(self): assert_framelist_equal(df1, df2) def test_file_like(self): - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: df1 = self.read_html(f, '.*Water.*') - with open(self.spam_data) as f: + with open(self.spam_data, **self.spam_data_kwargs) as f: df2 = self.read_html(f, 'Unit') assert_framelist_equal(df1, df2)
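A minimal usage sketch of the pattern this last patch establishes (an illustration, not part of the patch series): on Python 3 the built-in open() defaults to the locale's preferred encoding, so a UTF-8 HTML fixture must be opened with an explicit encoding before being handed to read_html. The path 'data/spam.html' below is hypothetical and stands in for the spam.html test fixture; the file is assumed to contain a <table> whose text matches '.*Water.*'.

    import sys

    import pandas as pd

    # Python 2's built-in open() has no encoding argument, hence the
    # version-dependent keyword dict, mirroring spam_data_kwargs above.
    kwargs = {'encoding': 'UTF-8'} if sys.version_info[0] >= 3 else {}

    with open('data/spam.html', **kwargs) as f:
        # read_html accepts file-like objects; match only the water table
        tables = pd.read_html(f, '.*Water.*')

    print(tables[0].head())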