diff --git a/.travis.yml b/.travis.yml index be167451f3460..3e24f3798ca04 100644 --- a/.travis.yml +++ b/.travis.yml @@ -66,19 +66,6 @@ matrix: apt: packages: - python-gtk2 - - python: 3.4 - env: - - PYTHON_VERSION=3.4 - - JOB_NAME: "34_nslow" - - NOSE_ARGS="not slow and not disabled" - - FULL_DEPS=true - - CLIPBOARD=xsel - - CACHE_NAME="34_nslow" - - USE_CACHE=true - addons: - apt: - packages: - - xsel - python: 3.5 env: - PYTHON_VERSION=3.5 @@ -93,6 +80,33 @@ matrix: apt: packages: - xsel + - python: 3.6-dev + env: + - PYTHON_VERSION=3.6 + - JOB_NAME: "36_dev" + - JOB_TAG=_DEV + - NOSE_ARGS="not slow and not network and not disabled" + - PANDAS_TESTING_MODE="deprecate" + addons: + apt: + packages: + - libatlas-base-dev + - gfortran +# In allow_failures + - python: 2.7 + env: + - PYTHON_VERSION=2.7 + - JOB_NAME: "27_nslow_nnet_COMPAT" + - NOSE_ARGS="not slow and not network and not disabled" + - LOCALE_OVERRIDE="it_IT.UTF-8" + - INSTALL_TEST=true + - JOB_TAG=_COMPAT + - CACHE_NAME="27_nslow_nnet_COMPAT" + - USE_CACHE=true + addons: + apt: + packages: + - language-pack-it # In allow_failures - python: 2.7 env: @@ -103,45 +117,46 @@ matrix: - FULL_DEPS=true - CACHE_NAME="27_slow" - USE_CACHE=true +# In allow_failures + - python: 2.7 + env: + - PYTHON_VERSION=2.7 + - JOB_NAME: "27_build_test_conda" + - JOB_TAG=_BUILD_TEST + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - BUILD_TEST=true + - CACHE_NAME="27_build_test_conda" + - USE_CACHE=true # In allow_failures - python: 3.4 env: - PYTHON_VERSION=3.4 - - JOB_NAME: "34_slow" - - JOB_TAG=_SLOW - - NOSE_ARGS="slow and not network and not disabled" + - JOB_NAME: "34_nslow" + - NOSE_ARGS="not slow and not disabled" - FULL_DEPS=true - CLIPBOARD=xsel - - CACHE_NAME="34_slow" + - CACHE_NAME="34_nslow" - USE_CACHE=true addons: apt: packages: - xsel # In allow_failures - - python: 2.7 + - python: 3.4 env: - - PYTHON_VERSION=2.7 - - JOB_NAME: "27_build_test_conda" - - JOB_TAG=_BUILD_TEST - - NOSE_ARGS="not slow and not disabled" + - PYTHON_VERSION=3.4 + - JOB_NAME: "34_slow" + - JOB_TAG=_SLOW + - NOSE_ARGS="slow and not network and not disabled" - FULL_DEPS=true - - BUILD_TEST=true - - CACHE_NAME="27_build_test_conda" + - CLIPBOARD=xsel + - CACHE_NAME="34_slow" - USE_CACHE=true -# In allow_failures - - python: 3.6-dev - env: - - PYTHON_VERSION=3.6 - - JOB_NAME: "36_dev" - - JOB_TAG=_DEV - - NOSE_ARGS="not slow and not network and not disabled" - - PANDAS_TESTING_MODE="deprecate" addons: apt: packages: - - libatlas-base-dev - - gfortran + - xsel # In allow_failures - python: 3.5 env: @@ -157,21 +172,6 @@ matrix: packages: - libatlas-base-dev - gfortran -# In allow_failures - - python: 2.7 - env: - - PYTHON_VERSION=2.7 - - JOB_NAME: "27_nslow_nnet_COMPAT" - - NOSE_ARGS="not slow and not network and not disabled" - - LOCALE_OVERRIDE="it_IT.UTF-8" - - INSTALL_TEST=true - - JOB_TAG=_COMPAT - - CACHE_NAME="27_nslow_nnet_COMPAT" - - USE_CACHE=true - addons: - apt: - packages: - - language-pack-it # In allow_failures - python: 3.5 env: @@ -226,18 +226,19 @@ matrix: - BUILD_TEST=true - CACHE_NAME="27_build_test_conda" - USE_CACHE=true - - python: 3.6-dev + - python: 3.4 env: - - PYTHON_VERSION=3.6 - - JOB_NAME: "36_dev" - - JOB_TAG=_DEV - - NOSE_ARGS="not slow and not network and not disabled" - - PANDAS_TESTING_MODE="deprecate" + - PYTHON_VERSION=3.4 + - JOB_NAME: "34_nslow" + - NOSE_ARGS="not slow and not disabled" + - FULL_DEPS=true + - CLIPBOARD=xsel + - CACHE_NAME="34_nslow" + - USE_CACHE=true addons: apt: packages: - - libatlas-base-dev - - 
gfortran + - xsel - python: 3.5 env: - PYTHON_VERSION=3.5 diff --git a/asv_bench/benchmarks/algorithms.py b/asv_bench/benchmarks/algorithms.py index 20d149493951f..fe657936c403e 100644 --- a/asv_bench/benchmarks/algorithms.py +++ b/asv_bench/benchmarks/algorithms.py @@ -18,7 +18,7 @@ def setup(self): self.float = pd.Float64Index(np.random.randn(N).repeat(5)) # Convenience naming. - self.checked_add = pd.core.nanops._checked_add_with_arr + self.checked_add = pd.core.algorithms.checked_add_with_arr self.arr = np.arange(1000000) self.arrpos = np.arange(1000000) @@ -26,6 +26,9 @@ def setup(self): self.arrmixed = np.array([1, -1]).repeat(500000) self.strings = tm.makeStringIndex(100000) + self.arr_nan = np.random.choice([True, False], size=1000000) + self.arrmixed_nan = np.random.choice([True, False], size=1000000) + # match self.uniques = tm.makeStringIndex(1000).values self.all = self.uniques.repeat(10) @@ -69,6 +72,16 @@ def time_add_overflow_neg_arr(self): def time_add_overflow_mixed_arr(self): self.checked_add(self.arr, self.arrmixed) + def time_add_overflow_first_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan) + + def time_add_overflow_second_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, b_mask=self.arrmixed_nan) + + def time_add_overflow_both_arg_nan(self): + self.checked_add(self.arr, self.arrmixed, arr_mask=self.arr_nan, + b_mask=self.arrmixed_nan) + class Hashing(object): goal_time = 0.2 diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 8cbf5b8d97b70..adbe73aa5c5ef 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -68,6 +68,8 @@ class Iteration(object): def setup(self): self.df = DataFrame(randn(10000, 1000)) self.df2 = DataFrame(np.random.randn(50000, 10)) + self.df3 = pd.DataFrame(np.random.randn(1000,5000), + columns=['C'+str(c) for c in range(5000)]) def f(self): if hasattr(self.df, '_item_cache'): @@ -85,6 +87,11 @@ def time_iteritems(self): def time_iteritems_cached(self): self.g() + def time_iteritems_indexing(self): + df = self.df3 + for col in df: + df[col] + def time_itertuples(self): for row in self.df2.itertuples(): pass diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index 0f15ab6e5e142..2ce3c4726b783 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -153,7 +153,7 @@ def setup(self, compression, engine): # The Python 2 C parser can't read bz2 from open files. raise NotImplementedError try: - import boto + import s3fs except ImportError: # Skip these benchmarks if `boto` is not installed. 
raise NotImplementedError diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index 9eefe80c8e5e4..d9c631fa92efd 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -302,12 +302,19 @@ def setup(self): self.df1 = self.df1.sort_values('time') self.df2 = self.df2.sort_values('time') + self.df1['time32'] = np.int32(self.df1.time) + self.df2['time32'] = np.int32(self.df2.time) + self.df1a = self.df1[['time', 'value1']] self.df2a = self.df2[['time', 'value2']] self.df1b = self.df1[['time', 'key', 'value1']] self.df2b = self.df2[['time', 'key', 'value2']] self.df1c = self.df1[['time', 'key2', 'value1']] self.df2c = self.df2[['time', 'key2', 'value2']] + self.df1d = self.df1[['time32', 'value1']] + self.df2d = self.df2[['time32', 'value2']] + self.df1e = self.df1[['time', 'key', 'key2', 'value1']] + self.df2e = self.df2[['time', 'key', 'key2', 'value2']] def time_noby(self): merge_asof(self.df1a, self.df2a, on='time') @@ -318,6 +325,12 @@ def time_by_object(self): def time_by_int(self): merge_asof(self.df1c, self.df2c, on='time', by='key2') + def time_on_int32(self): + merge_asof(self.df1d, self.df2d, on='time32') + + def time_multiby(self): + merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2']) + #---------------------------------------------------------------------- # data alignment diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index ff5a201057bcd..f9837191a7bae 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -49,3 +49,28 @@ def time_value_counts_pindex(self): self.i.value_counts() +class period_standard_indexing(object): + goal_time = 0.2 + + def setup(self): + self.index = PeriodIndex(start='1985', periods=1000, freq='D') + self.series = Series(range(1000), index=self.index) + self.period = self.index[500] + + def time_get_loc(self): + self.index.get_loc(self.period) + + def time_shape(self): + self.index.shape + + def time_shallow_copy(self): + self.index._shallow_copy() + + def time_series_loc(self): + self.series.loc[self.period] + + def time_align(self): + pd.DataFrame({'a': self.series, 'b': self.series[:500]}) + + def time_intersection(self): + self.index[:750].intersection(self.index[250:]) diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index 4e368c6d7cde2..413c4e044fd3a 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -8,13 +8,28 @@ def setup(self): self.dr = pd.date_range( start=datetime(2015,10,26), end=datetime(2016,1,1), - freq='10s' - ) # ~500k long + freq='50s' + ) # ~100k long def time_series_constructor_no_data_datetime_index(self): Series(data=None, index=self.dr) +class series_constructor_dict_data_datetime_index(object): + goal_time = 0.2 + + def setup(self): + self.dr = pd.date_range( + start=datetime(2015, 10, 26), + end=datetime(2016, 1, 1), + freq='50s' + ) # ~100k long + self.data = {d: v for d, v in zip(self.dr, range(len(self.dr)))} + + def time_series_constructor_no_data_datetime_index(self): + Series(data=self.data, index=self.dr) + + class series_isin_int64(object): goal_time = 0.2 diff --git a/ci/lint.sh b/ci/lint.sh index d7df6215450b4..32ac606a4d30a 100755 --- a/ci/lint.sh +++ b/ci/lint.sh @@ -7,6 +7,8 @@ source activate pandas RET=0 if [ "$LINT" ]; then + pip install cpplint + # pandas/rpy is deprecated and will be removed. # pandas/src is C code, so no need to search there. 
echo "Linting *.py" @@ -43,13 +45,11 @@ if [ "$LINT" ]; then # from Cython files nor do we want to lint C files that we didn't modify for # this particular codebase (e.g. src/headers, src/klib, src/msgpack). However, # we can lint all header files since they aren't "generated" like C files are. - pip install cpplint - echo "Linting *.c and *.h" for path in '*.h' 'period_helper.c' 'datetime' 'parser' 'ujson' do echo "linting -> pandas/src/$path" - cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/src/$path + cpplint --quiet --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive pandas/src/$path if [ $? -ne "0" ]; then RET=1 fi diff --git a/ci/requirements-2.7-64.run b/ci/requirements-2.7-64.run index 42b5a789ae31a..94472dafd565d 100644 --- a/ci/requirements-2.7-64.run +++ b/ci/requirements-2.7-64.run @@ -11,7 +11,7 @@ sqlalchemy lxml=3.2.1 scipy xlsxwriter -boto +s3fs bottleneck html5lib beautiful-soup diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run index 560d6571b8771..2bfb8a3777fdf 100644 --- a/ci/requirements-2.7.run +++ b/ci/requirements-2.7.run @@ -11,7 +11,7 @@ sqlalchemy=0.9.6 lxml=3.2.1 scipy xlsxwriter=0.4.6 -boto=2.36.0 +s3fs bottleneck psycopg2=2.5.2 patsy diff --git a/ci/requirements-2.7_SLOW.run b/ci/requirements-2.7_SLOW.run index f02a7cb8a309a..630d22636f284 100644 --- a/ci/requirements-2.7_SLOW.run +++ b/ci/requirements-2.7_SLOW.run @@ -13,7 +13,7 @@ numexpr pytables sqlalchemy lxml -boto +s3fs bottleneck psycopg2 pymysql diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index 333641caf26c4..1d1cb38fd57a6 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -17,7 +17,7 @@ sqlalchemy pymysql psycopg2 xarray -boto +s3fs # incompat with conda ATM # beautiful-soup diff --git a/ci/requirements-3.5_OSX.run b/ci/requirements-3.5_OSX.run index ffa291ab7ff77..eceb2f9cdcebc 100644 --- a/ci/requirements-3.5_OSX.run +++ b/ci/requirements-3.5_OSX.run @@ -12,7 +12,7 @@ matplotlib jinja2 bottleneck xarray -boto +s3fs # incompat with conda ATM # beautiful-soup diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pdf b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf new file mode 100644 index 0000000000000..a0bff02d45f91 Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet.pdf differ diff --git a/doc/cheatsheet/Pandas_Cheat_Sheet.pptx b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx new file mode 100644 index 0000000000000..399edf84e7d1c Binary files /dev/null and b/doc/cheatsheet/Pandas_Cheat_Sheet.pptx differ diff --git a/doc/cheatsheet/README.txt b/doc/cheatsheet/README.txt new file mode 100644 index 0000000000000..e2f6ec042e9cc --- /dev/null +++ b/doc/cheatsheet/README.txt @@ -0,0 +1,4 @@ +The Pandas Cheat Sheet was created using Microsoft Powerpoint 2013. +To create the PDF version, within Powerpoint, simply do a "Save As" +and pick "PDF' as the format. + diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 38718bc5ca19a..ecc2a5e723c45 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -113,11 +113,12 @@ want to clone your fork to your machine:: This creates the directory `pandas-yourname` and connects your repository to the upstream (main project) *pandas* repository. -The testing suite will run automatically on Travis-CI once your pull request is -submitted. 
However, if you wish to run the test suite on a branch prior to -submitting the pull request, then Travis-CI needs to be hooked up to your -GitHub repository. Instructions for doing so are `here -`__. +The testing suite will run automatically on Travis-CI and Appveyor once your +pull request is submitted. However, if you wish to run the test suite on a +branch prior to submitting the pull request, then Travis-CI and/or AppVeyor +need to be hooked up to your GitHub repository. Instructions for doing so +are `here `__ for +Travis-CI and `here `__ for AppVeyor. Creating a branch ----------------- @@ -142,7 +143,7 @@ To update this branch, you need to retrieve the changes from the master branch:: git fetch upstream git rebase upstream/master -This will replay your commits on top of the lastest pandas git master. If this +This will replay your commits on top of the latest pandas git master. If this leads to merge conflicts, you must resolve these before submitting your pull request. If you have uncommitted changes, you will need to ``stash`` them prior to updating. This will effectively store your changes and they can be reapplied @@ -396,7 +397,7 @@ evocations, sphinx will try to only build the pages that have been modified. If you want to do a full clean build, do:: python make.py clean - python make.py build + python make.py html Starting with *pandas* 0.13.1 you can tell ``make.py`` to compile only a single section of the docs, greatly reducing the turn-around time for checking your changes. @@ -442,18 +443,80 @@ Contributing to the code base Code standards -------------- +Writing good code is not just about what you write. It is also about *how* you +write it. During testing on Travis-CI, several tools will be run to check your +code for stylistic errors. Generating any warnings will cause the test to fail. +Thus, good style is a requirement for submitting code to *pandas*. + +In addition, because a lot of people use our library, it is important that we +do not make sudden changes to the code that could have the potential to break +a lot of user code as a result, that is, we need it to be as *backwards compatible* +as possible to avoid mass breakages. + +Additional standards are outlined on the `code style wiki +page `_. + +C (cpplint) +~~~~~~~~~~~ + +*pandas* uses the `Google `_ +standard. Google provides an open source style checker called ``cpplint``, but we +use a fork of it that can be found `here `_. +Here are *some* of the more common ``cpplint`` issues: + + - we restrict line-length to 80 characters to promote readability + - every header file must include a header guard to avoid name collisions if re-included + +Travis-CI will run the `cpplint `_ tool +and report any stylistic errors in your code. Therefore, it is helpful before +submitting code to run the check yourself:: + + cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir modified-c-file + +You can also run this command on an entire directory if necessary:: + + cpplint --extensions=c,h --headers=h --filter=-readability/casting,-runtime/int,-build/include_subdir --recursive modified-c-directory + +To make your commits compliant with this standard, you can install the +`ClangFormat `_ tool, which can be +downloaded `here `_. To configure, in your home directory, +run the following command:: + + clang-format style=google -dump-config > .clang-format + +Then modify the file to ensure that any indentation width parameters are at least four. 
+Once configured, you can run the tool as follows:: + + clang-format modified-c-file + +This will output what your file will look like if the changes are made, and to apply +them, just run the following command:: + + clang-format -i modified-c-file + +To run the tool on an entire directory, you can run the following analogous commands:: + + clang-format modified-c-directory/*.c modified-c-directory/*.h + clang-format -i modified-c-directory/*.c modified-c-directory/*.h + +Do note that this tool is best-effort, meaning that it will try to correct as +many errors as possible, but it may not correct *all* of them. Thus, it is +recommended that you run ``cpplint`` to double check and make any other style +fixes manually. + +Python (PEP8) +~~~~~~~~~~~~~ + *pandas* uses the `PEP8 `_ standard. There are several tools to ensure you abide by this standard. Here are *some* of the more common ``PEP8`` issues: - - we restrict line-length to 80 characters to promote readability + - we restrict line-length to 79 characters to promote readability - passing arguments should have spaces after commas, e.g. ``foo(arg1, arg2, kw1='bar')`` -The Travis-CI will run `flake8 `_ tool and report -any stylistic errors in your code. Generating any warnings will cause the build to fail; -thus these are part of the requirements for submitting code to *pandas*. - -It is helpful before submitting code to run this yourself on the diff:: +Travis-CI will run the `flake8 `_ tool +and report any stylistic errors in your code. Therefore, it is helpful before +submitting code to run the check yourself on the diff:: git diff master | flake8 --diff @@ -466,8 +529,8 @@ and make these changes with:: pep8radius master --diff --in-place -Additional standards are outlined on the `code style wiki -page `_. +Backwards Compatibility +~~~~~~~~~~~~~~~~~~~~~~~ Please try to maintain backward compatibility. *pandas* has lots of users with lots of existing code, so don't break it if at all possible. If you think breakage is required, diff --git a/doc/source/install.rst b/doc/source/install.rst index d45b8765cfd8a..f62342fa52e5c 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -262,7 +262,7 @@ Optional Dependencies * `XlsxWriter `__: Alternative Excel writer * `Jinja2 `__: Template engine for conditional HTML formatting. -* `boto `__: necessary for Amazon S3 access. +* `s3fs `__: necessary for Amazon S3 access (s3fs >= 0.0.7). * `blosc `__: for msgpack compression using ``blosc`` * One of `PyQt4 `__, `PySide diff --git a/doc/source/io.rst b/doc/source/io.rst index 17c7653072526..9d51d2599d668 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1487,6 +1487,23 @@ options include: Specifying any of the above options will produce a ``ParserWarning`` unless the python engine is selected explicitly using ``engine='python'``. +Reading remote files +'''''''''''''''''''' + +You can pass in a URL to a CSV file: + +.. code-block:: python + + df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item', + sep='\t') + +S3 URLs are handled as well: + +.. 
code-block:: python + + df = pd.read_csv('s3://pandas-test/tips.csv') + + Writing out Data '''''''''''''''' @@ -4004,7 +4021,7 @@ and data values from the values and assembles them into a ``data.frame``: name_paths = paste(listing$group[name_nodes], listing$name[name_nodes], sep = "/") columns = list() for (idx in seq(data_paths)) { - # NOTE: matrices returned by h5read have to be transposed to to obtain + # NOTE: matrices returned by h5read have to be transposed to obtain # required Fortran order! data <- data.frame(t(h5read(h5File, data_paths[idx]))) names <- t(h5read(h5File, name_paths[idx])) diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index 9253124f7e8b2..6de6abed9a681 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -358,8 +358,8 @@ See :ref:`here ` for ways to represent data outside these bound. .. _timeseries.datetimeindex: -DatetimeIndex -------------- +Indexing +-------- One of the main uses for ``DatetimeIndex`` is as an index for pandas objects. The ``DatetimeIndex`` class contains many timeseries related optimizations: @@ -399,8 +399,8 @@ intelligent functionality like selection, slicing, etc. .. _timeseries.partialindexing: -DatetimeIndex Partial String Indexing -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Partial String Indexing +~~~~~~~~~~~~~~~~~~~~~~~ You can pass in dates and strings that parse to dates as indexing parameters: @@ -457,22 +457,6 @@ We are stopping on the included end-point as it is part of the index dft['2013-1-15':'2013-1-15 12:30:00'] -.. warning:: - - The following selection will raise a ``KeyError``; otherwise this selection methodology - would be inconsistent with other selection methods in pandas (as this is not a *slice*, nor does it - resolve to one) - - .. code-block:: python - - dft['2013-1-15 12:30:00'] - - To select a single row, use ``.loc`` - - .. ipython:: python - - dft.loc['2013-1-15 12:30:00'] - .. versionadded:: 0.18.0 DatetimeIndex Partial String Indexing also works on DataFrames with a ``MultiIndex``. For example: @@ -491,12 +475,86 @@ DatetimeIndex Partial String Indexing also works on DataFrames with a ``MultiInd dft2 = dft2.swaplevel(0, 1).sort_index() dft2.loc[idx[:, '2013-01-05'], :] -Datetime Indexing -~~~~~~~~~~~~~~~~~ +.. _timeseries.slice_vs_exact_match: -Indexing a ``DateTimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the frequency of the index. In contrast, indexing with datetime objects is exact, because the objects have exact meaning. These also follow the semantics of *including both endpoints*. +Slice vs. exact match +~~~~~~~~~~~~~~~~~~~~~ -These ``datetime`` objects are specific ``hours, minutes,`` and ``seconds`` even though they were not explicitly specified (they are ``0``). +.. versionchanged:: 0.20.0 + +The same string used as an indexing parameter can be treated either as a slice or as an exact match depending on the resolution of an index. If the string is less accurate than the index, it will be treated as a slice, otherwise as an exact match. + +For example, let us consider ``Series`` object which index has minute resolution. + +.. ipython:: python + + series_minute = pd.Series([1, 2, 3], + pd.DatetimeIndex(['2011-12-31 23:59:00', + '2012-01-01 00:00:00', + '2012-01-01 00:02:00'])) + series_minute.index.resolution + +A Timestamp string less accurate than a minute gives a ``Series`` object. + +.. 
ipython:: python + + series_minute['2011-12-31 23'] + +A Timestamp string with minute resolution (or more accurate), gives a scalar instead, i.e. it is not casted to a slice. + +.. ipython:: python + + series_minute['2011-12-31 23:59'] + series_minute['2011-12-31 23:59:00'] + +If index resolution is second, then, the minute-accurate timestamp gives a ``Series``. + +.. ipython:: python + + series_second = pd.Series([1, 2, 3], + pd.DatetimeIndex(['2011-12-31 23:59:59', + '2012-01-01 00:00:00', + '2012-01-01 00:00:01'])) + series_second.index.resolution + series_second['2011-12-31 23:59'] + +If the timestamp string is treated as a slice, it can be used to index ``DataFrame`` with ``[]`` as well. + +.. ipython:: python + + dft_minute = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, + index=series_minute.index) + dft_minute['2011-12-31 23'] + + +.. warning:: + + However if the string is treated as an exact match, the selection in ``DataFrame``'s ``[]`` will be column-wise and not row-wise, see :ref:`Indexing Basics `. For example ``dft_minute['2011-12-31 23:59']`` will raise ``KeyError`` as ``'2012-12-31 23:59'`` has the same resolution as index and there is no column with such name: + + To *always* have unambiguous selection, whether the row is treated as a slice or a single selection, use ``.loc``. + + .. ipython:: python + + dft_minute.loc['2011-12-31 23:59'] + +Note also that ``DatetimeIndex`` resolution cannot be less precise than day. + +.. ipython:: python + + series_monthly = pd.Series([1, 2, 3], + pd.DatetimeIndex(['2011-12', + '2012-01', + '2012-02'])) + series_monthly.index.resolution + series_monthly['2011-12'] # returns Series + + +Exact Indexing +~~~~~~~~~~~~~~ + +As discussed in previous section, indexing a ``DateTimeIndex`` with a partial string depends on the "accuracy" of the period, in other words how specific the interval is in relation to the resolution of the index. In contrast, indexing with ``Timestamp`` or ``datetime`` objects is exact, because the objects have exact meaning. These also follow the semantics of *including both endpoints*. + +These ``Timestamp`` and ``datetime`` objects have exact ``hours, minutes,`` and ``seconds``, even though they were not explicitly specified (they are ``0``). .. ipython:: python @@ -525,10 +583,10 @@ regularity will result in a ``DatetimeIndex`` (but frequency is lost): ts[[0, 2, 6]].index -.. _timeseries.offsets: +.. _timeseries.components: Time/Date Components -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +-------------------- There are several time/date properties that one can access from ``Timestamp`` or a collection of timestamps like a ``DateTimeIndex``. @@ -564,6 +622,8 @@ There are several time/date properties that one can access from ``Timestamp`` or Furthermore, if you have a ``Series`` with datetimelike values, then you can access these properties via the ``.dt`` accessor, see the :ref:`docs ` +.. 
_timeseries.offsets: + DateOffset objects ------------------ @@ -628,12 +688,12 @@ We could have done the same thing with ``DateOffset``: The key features of a ``DateOffset`` object are: - - it can be added / subtracted to/from a datetime object to obtain a - shifted date - - it can be multiplied by an integer (positive or negative) so that the - increment will be applied multiple times - - it has ``rollforward`` and ``rollback`` methods for moving a date forward - or backward to the next or previous "offset date" +- it can be added / subtracted to/from a datetime object to obtain a + shifted date +- it can be multiplied by an integer (positive or negative) so that the + increment will be applied multiple times +- it has ``rollforward`` and ``rollback`` methods for moving a date forward + or backward to the next or previous "offset date" Subclasses of ``DateOffset`` define the ``apply`` function which dictates custom date increment logic, such as adding business days: @@ -745,7 +805,7 @@ used exactly like a ``Timedelta`` - see the Note that some offsets (such as ``BQuarterEnd``) do not have a vectorized implementation. They can still be used but may -calculate significantly slower and will raise a ``PerformanceWarning`` +calculate significantly slower and will show a ``PerformanceWarning`` .. ipython:: python :okwarning: @@ -755,8 +815,8 @@ calculate significantly slower and will raise a ``PerformanceWarning`` .. _timeseries.custombusinessdays: -Custom Business Days (Experimental) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Custom Business Days +~~~~~~~~~~~~~~~~~~~~ The ``CDay`` or ``CustomBusinessDay`` class provides a parametric ``BusinessDay`` class which can be used to create customized business day @@ -785,7 +845,7 @@ Let's map to the weekday names pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) -As of v0.14 holiday calendars can be used to provide the list of holidays. See the +Holiday calendars can be used to provide the list of holidays. See the :ref:`holiday calendar` section for more information. .. ipython:: python @@ -1289,12 +1349,15 @@ limited to, financial applications. See some :ref:`cookbook examples ` for some advanced strategies Starting in version 0.18.1, the ``resample()`` function can be used directly from -DataFrameGroupBy objects, see the :ref:`groupby docs `. +``DataFrameGroupBy`` objects, see the :ref:`groupby docs `. .. note:: ``.resample()`` is similar to using a ``.rolling()`` operation with a time-based offset, see a discussion :ref:`here ` +Basics +~~~~~~ + .. ipython:: python rng = pd.date_range('1/1/2012', periods=100, freq='S') diff --git a/doc/source/whatsnew/v0.19.2.txt b/doc/source/whatsnew/v0.19.2.txt index 82d43db667550..e9af6ee2a921a 100644 --- a/doc/source/whatsnew/v0.19.2.txt +++ b/doc/source/whatsnew/v0.19.2.txt @@ -10,6 +10,8 @@ We recommend that all users upgrade to this version. Highlights include: - Compatibility with Python 3.6 +- Added a `Pandas Cheat Sheet `__. (:issue:`13202`). + .. contents:: What's new in v0.19.2 :local: @@ -22,6 +24,9 @@ Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Improved performance of ``.replace()`` (:issue:`12745`) +- Improved performance of ``PeriodIndex`` (:issue:`14822`) +- Performance regression in indexing with getitem (:issue:`14930`) +- Improved performance ``Series`` creation with a datetime index and dictionary data (:issue:`14894`) .. 
_whatsnew_0192.enhancements.other: @@ -29,6 +34,7 @@ Other Enhancements ~~~~~~~~~~~~~~~~~~ - ``pd.merge_asof()`` gained ``left_index``/``right_index`` and ``left_by``/``right_by`` arguments (:issue:`14253`) +- ``pd.merge_asof()`` can take multiple columns in ``by`` parameter and has specialized dtypes for better performace (:issue:`13936`) @@ -39,10 +45,13 @@ Bug Fixes - Compat with ``dateutil==2.6.0``; segfault reported in the testing suite (:issue:`14621`) - Allow ``nanoseconds`` in ``Timestamp.replace`` as a kwarg (:issue:`14621`) +- Bug in ``pd.read_csv`` in which aliasing was being done for ``na_values`` when passed in as a dictionary (:issue:`14203`) +- Bug in ``pd.read_csv`` in which column indices for a dict-like ``na_values`` were not being respected (:issue:`14203`) - Bug in ``pd.read_csv`` where reading files fails, if the number of headers is equal to the number of lines in the file (:issue:`14515`) - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when multi-char delimiters were not being respected with quotes (:issue:`14582`) - Fix bugs (:issue:`14734`, :issue:`13654`) in ``pd.read_sas`` and ``pandas.io.sas.sas7bdat.SAS7BDATReader`` that caused problems when reading a SAS file incrementally. - Bug in ``pd.read_csv`` for the Python engine in which an unhelpful error message was being raised when ``skipfooter`` was not being respected by Python's CSV library (:issue:`13879`) +- Bug in ``.fillna()`` in which timezone aware datetime64 values were incorrectly rounded (:issue:`14872`) - Bug in ``.groupby(..., sort=True)`` of a non-lexsorted MultiIndex when grouping with multiple levels (:issue:`14776`) @@ -65,6 +74,7 @@ Bug Fixes - Compat with python 3.6 for Timestamp pickles (:issue:`14689`) - Bug in resampling a ``DatetimeIndex`` in local TZ, covering a DST change, which would raise ``AmbiguousTimeError`` (:issue:`14682`) +- Bug in indexing that transformed ``RecursionError`` into ``KeyError`` or ``IndexingError`` (:issue:`14554`) - Bug in ``HDFStore`` when writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`) @@ -78,7 +88,7 @@ Bug Fixes - Bug in clipboard functions on linux with python2 with unicode and separators (:issue:`13747`) - Bug in clipboard functions on Windows 10 and python 3 (:issue:`14362`, :issue:`12807`) - Bug in ``.to_clipboard()`` and Excel compat (:issue:`12529`) - +- Bug in ``DataFrame.combine_first()`` for integer columns (:issue:`14687`). - Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`) - Bug in ``pd.read_csv()`` in which the ``nrows`` parameter was not being respected for large input when using the C engine for parsing (:issue:`7626`) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 2855cde95ac2a..2a3c83708b6f3 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -23,6 +23,8 @@ New features ~~~~~~~~~~~~ +.. _whatsnew_0200.enhancements.dataio_dtype: + ``dtype`` keyword for data IO ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -64,6 +66,27 @@ Strings passed to ``DataFrame.groupby()`` as the ``by`` parameter may now refere df.groupby(['second', 'A']).sum() +.. _whatsnew_0200.enhancements.compressed_urls: + +Better support for compressed URLs in ``read_csv`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The compression code was refactored (:issue:`12688`). 
As a result, reading +dataframes from URLs in :func:`read_csv` or :func:`read_table` now supports +additional compression methods: ``xz``, ``bz2``, and ``zip`` (:issue:`14570`). +Previously, only ``gzip`` compression was supported. By default, compression of +URLs and paths are now both inferred using their file extensions. Additionally, +support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). + +.. ipython:: python + url = 'https://github.com/{repo}/raw/{branch}/{path}'.format( + repo = 'pandas-dev/pandas', + branch = 'master', + path = 'pandas/io/tests/parser/data/salaries.csv.bz2', + ) + df = pd.read_table(url, compression='infer') # default, infer compression + df = pd.read_table(url, compression='bz2') # explicitly specify compression + df.head(2) .. _whatsnew_0200.enhancements.other: @@ -84,6 +107,9 @@ Other enhancements - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) - ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`) +- ``pandas.io.json.json_normalize()`` gained ``sep`` option that accepts ``str``, default is ".", which is backward compatible. (:issue:`14883`) + +- ``.select_dtypes()`` now allows the string 'datetimetz' to generically select datetimes with tz (:issue:`14910`) .. _whatsnew_0200.api_breaking: @@ -91,18 +117,127 @@ Other enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _whatsnew_0200.api: +.. _whatsnew.api_breaking.index_map +Map on Index types now return other Index types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) -- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) +- ``map`` on an ``Index`` now returns an ``Index``, not a numpy array (:issue:`12766`) + + .. ipython:: python + + idx = Index([1, 2]) + idx + mi = MultiIndex.from_tuples([(1, 2), (2, 4)]) + mi + + Previous Behavior: + + .. code-block:: ipython + + In [5]: idx.map(lambda x: x * 2) + Out[5]: array([2, 4]) + + In [6]: idx.map(lambda x: (x, x * 2)) + Out[6]: array([(1, 2), (2, 4)], dtype=object) + + In [7]: mi.map(lambda x: x) + Out[7]: array([(1, 2), (2, 4)], dtype=object) + + In [8]: mi.map(lambda x: x[0]) + Out[8]: array([1, 2]) + + New Behavior: + + .. ipython:: python + + idx.map(lambda x: x * 2) + + idx.map(lambda x: (x, x * 2)) + + mi.map(lambda x: x) + + mi.map(lambda x: x[0]) + + +- ``map`` on a ``Series`` with ``datetime64`` values may return ``int64`` dtypes rather than ``int32`` + + .. ipython:: python + + s = Series(date_range('2011-01-02T00:00', '2011-01-02T02:00', freq='H').tz_localize('Asia/Tokyo')) + s + Previous Behavior: + .. code-block:: ipython + In [9]: s.map(lambda x: x.hour) + Out[9]: + 0 0 + 1 1 + 2 2 + dtype: int32 + + + New Behavior: + + .. ipython:: python + + s.map(lambda x: x.hour) + +.. _whatsnew_0200.api_breaking.s3: + +S3 File Handling +^^^^^^^^^^^^^^^^ + +pandas now uses `s3fs `_ for handling S3 connections. This shouldn't break +any code. However, since s3fs is not a required dependency, you will need to install it separately, like ``boto`` +in prior versions of pandas. (:issue:`11915`). 
+ +.. _whatsnew_0200.api_breaking.partial_string_indexing: + +Partial String Indexing Changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:ref:`DatetimeIndex Partial String Indexing ` now works as exact match, provided that string resolution coincides with index resolution, including a case when both are seconds (:issue:`14826`). See :ref:`Slice vs. Exact Match ` for details. + +.. ipython:: python + + df = DataFrame({'a': [1, 2, 3]}, DatetimeIndex(['2011-12-31 23:59:59', + '2012-01-01 00:00:00', + '2012-01-01 00:00:01'])) +Previous Behavior: + +.. code-block:: ipython + + In [4]: df['2011-12-31 23:59:59'] + Out[4]: + a + 2011-12-31 23:59:59 1 + + In [5]: df['a']['2011-12-31 23:59:59'] + Out[5]: + 2011-12-31 23:59:59 1 + Name: a, dtype: int64 + + +New Behavior: + +.. code-block:: ipython + + In [4]: df['2011-12-31 23:59:59'] + KeyError: '2011-12-31 23:59:59' + + In [5]: df['a']['2011-12-31 23:59:59'] + Out[5]: 1 + +.. _whatsnew_0200.api: Other API Changes ^^^^^^^^^^^^^^^^^ +- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`) +- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`) .. _whatsnew_0200.deprecations: @@ -144,19 +279,25 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) +- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) +- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. 
(:issue:`14827`) +- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) +- Bug in ``pd.read_msgpack()`` in which ``Series`` categoricals were being improperly processed (:issue:`14901`) +- Bug in ``Series`` construction with a datetimetz (:issue:`14928`) - - +- Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`) - Require at least 0.23 version of cython to avoid problems with character encodings (:issue:`14699`) +- Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0d4d4143e6b9b..e51774ce4d9b4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -25,8 +25,10 @@ _ensure_platform_int, _ensure_object, _ensure_float64, + _ensure_uint64, _ensure_int64, is_list_like) +from pandas.compat.numpy import _np_version_under1p10 from pandas.types.missing import isnull import pandas.core.common as com @@ -112,6 +114,41 @@ def _unique_generic(values, table_type, type_caster): return type_caster(uniques) +def unique1d(values): + """ + Hash table-based unique + """ + if np.issubdtype(values.dtype, np.floating): + table = htable.Float64HashTable(len(values)) + uniques = np.array(table.unique(_ensure_float64(values)), + dtype=np.float64) + elif np.issubdtype(values.dtype, np.datetime64): + table = htable.Int64HashTable(len(values)) + uniques = table.unique(_ensure_int64(values)) + uniques = uniques.view('M8[ns]') + elif np.issubdtype(values.dtype, np.timedelta64): + table = htable.Int64HashTable(len(values)) + uniques = table.unique(_ensure_int64(values)) + uniques = uniques.view('m8[ns]') + elif np.issubdtype(values.dtype, np.signedinteger): + table = htable.Int64HashTable(len(values)) + uniques = table.unique(_ensure_int64(values)) + elif np.issubdtype(values.dtype, np.unsignedinteger): + table = htable.UInt64HashTable(len(values)) + uniques = table.unique(_ensure_uint64(values)) + else: + + # its cheaper to use a String Hash Table than Object + if lib.infer_dtype(values) in ['string']: + table = htable.StringHashTable(len(values)) + else: + table = htable.PyObjectHashTable(len(values)) + + uniques = table.unique(_ensure_object(values)) + + return uniques + + def isin(comps, values): """ Compute the isin boolean array @@ -550,6 +587,95 @@ def rank(values, axis=0, method='average', na_option='keep', return ranks + +def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): + """ + Perform array addition that checks for underflow and overflow. + + Performs the addition of an int64 array and an int64 integer (or array) + but checks that they do not result in overflow first. For elements that + are indicated to be NaN, whether or not there is overflow for that element + is automatically ignored. + + Parameters + ---------- + arr : array addend. + b : array or scalar addend. + arr_mask : boolean array or None + array indicating which elements to exclude from checking + b_mask : boolean array or boolean or None + array or scalar indicating which element(s) to exclude from checking + + Returns + ------- + sum : An array for elements x + b for each element x in arr if b is + a scalar or an array for elements x + y for each element pair + (x, y) in (arr, b). + + Raises + ------ + OverflowError if any x + y exceeds the maximum or minimum int64 value. 
+ """ + def _broadcast(arr_or_scalar, shape): + """ + Helper function to broadcast arrays / scalars to the desired shape. + """ + if _np_version_under1p10: + if lib.isscalar(arr_or_scalar): + out = np.empty(shape) + out.fill(arr_or_scalar) + else: + out = arr_or_scalar + else: + out = np.broadcast_to(arr_or_scalar, shape) + return out + + # For performance reasons, we broadcast 'b' to the new array 'b2' + # so that it has the same size as 'arr'. + b2 = _broadcast(b, arr.shape) + if b_mask is not None: + # We do the same broadcasting for b_mask as well. + b2_mask = _broadcast(b_mask, arr.shape) + else: + b2_mask = None + + # For elements that are NaN, regardless of their value, we should + # ignore whether they overflow or not when doing the checked add. + if arr_mask is not None and b2_mask is not None: + not_nan = np.logical_not(arr_mask | b2_mask) + elif arr_mask is not None: + not_nan = np.logical_not(arr_mask) + elif b_mask is not None: + not_nan = np.logical_not(b2_mask) + else: + not_nan = np.empty(arr.shape, dtype=bool) + not_nan.fill(True) + + # gh-14324: For each element in 'arr' and its corresponding element + # in 'b2', we check the sign of the element in 'b2'. If it is positive, + # we then check whether its sum with the element in 'arr' exceeds + # np.iinfo(np.int64).max. If so, we have an overflow error. If it + # it is negative, we then check whether its sum with the element in + # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow + # error as well. + mask1 = b2 > 0 + mask2 = b2 < 0 + + if not mask1.any(): + to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any() + elif not mask2.any(): + to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() + else: + to_raise = (((np.iinfo(np.int64).max - + b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or + ((np.iinfo(np.int64).min - + b2[mask2] > arr[mask2]) & not_nan[mask2]).any()) + + if to_raise: + raise OverflowError("Overflow in int64 addition") + return arr + b + + _rank1d_functions = { 'float64': algos.rank_1d_float64, 'int64': algos.rank_1d_int64, diff --git a/pandas/core/base.py b/pandas/core/base.py index d412349447794..49e43a60403ca 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -814,7 +814,7 @@ def transpose(self, *args, **kwargs): @property def shape(self): """ return a tuple of the shape of the underlying data """ - return self.values.shape + return self._values.shape @property def ndim(self): @@ -842,22 +842,22 @@ def data(self): @property def itemsize(self): """ return the size of the dtype of the item of the underlying data """ - return self.values.itemsize + return self._values.itemsize @property def nbytes(self): """ return the number of bytes in the underlying data """ - return self.values.nbytes + return self._values.nbytes @property def strides(self): """ return the strides of the underlying data """ - return self.values.strides + return self._values.strides @property def size(self): """ return the number of elements in the underlying data """ - return self.values.size + return self._values.size @property def flags(self): @@ -969,7 +969,7 @@ def unique(self): if hasattr(values, 'unique'): result = values.unique() else: - from pandas.core.nanops import unique1d + from pandas.core.algorithms import unique1d result = unique1d(values) return result diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 922fb84684729..7f2e6093d0f4c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -25,7 +25,7 @@ is_scalar) from 
pandas.core.common import is_null_slice -from pandas.core.algorithms import factorize, take_1d +from pandas.core.algorithms import factorize, take_1d, unique1d from pandas.core.base import (PandasObject, PandasDelegate, NoNewAttributesMixin, _shared_docs) import pandas.core.common as com @@ -930,8 +930,7 @@ def remove_unused_categories(self, inplace=False): return cat def map(self, mapper): - """ - Apply mapper function to its categories (not codes). + """Apply mapper function to its categories (not codes). Parameters ---------- @@ -943,7 +942,8 @@ def map(self, mapper): Returns ------- - applied : Categorical or np.ndarray. + applied : Categorical or Index. + """ new_categories = self.categories.map(mapper) try: @@ -1834,7 +1834,6 @@ def unique(self): unique values : ``Categorical`` """ - from pandas.core.nanops import unique1d # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) cat = self.copy() diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0d4bcd781cf74..7305df0f57736 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2257,7 +2257,12 @@ def select_dtypes(self, include=None, exclude=None): this will return *all* object dtype columns * See the `numpy dtype hierarchy `__ + * To select datetimes, use np.datetime64, 'datetime' or 'datetime64' + * To select timedeltas, use np.timedelta64, 'timedelta' or + 'timedelta64' * To select Pandas categorical dtypes, use 'category' + * To select Pandas datetimetz dtypes, use 'datetimetz' (new in 0.20.0), + or a 'datetime64[ns, tz]' string Examples -------- @@ -3665,10 +3670,8 @@ def combine(self, other, func, fill_value=None, overwrite=True): otherSeries[other_mask] = fill_value # if we have different dtypes, possibily promote - if notnull(series).all(): - new_dtype = this_dtype - otherSeries = otherSeries.astype(new_dtype) - else: + new_dtype = this_dtype + if not is_dtype_equal(this_dtype, other_dtype): new_dtype = _find_common_type([this_dtype, other_dtype]) if not is_dtype_equal(this_dtype, new_dtype): series = series.astype(new_dtype) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 48d799811aa94..3678168890444 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5262,6 +5262,9 @@ def describe(self, percentiles=None, include=None, exclude=None): raise ValueError("Cannot describe a DataFrame without columns") if percentiles is not None: + # explicit conversion of `percentiles` to list + percentiles = list(percentiles) + # get them all to be in [0, 1] self._check_percentile(percentiles) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index b249cded39133..950ad53abe5e0 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -14,10 +14,10 @@ from pandas.compat.numpy import function as nv from pandas.compat.numpy import _np_version_under1p8 -from pandas.types.common import (_DATELIKE_DTYPES, - is_numeric_dtype, +from pandas.types.common import (is_numeric_dtype, is_timedelta64_dtype, is_datetime64_dtype, is_categorical_dtype, + is_datetimelike, is_datetime_or_timedelta_dtype, is_bool, is_integer_dtype, is_complex_dtype, @@ -3453,10 +3453,10 @@ def first_non_None_value(values): # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here so = self._selected_obj - if (so.ndim == 2 and so.dtypes.isin(_DATELIKE_DTYPES).any()): + if (so.ndim == 2 and so.dtypes.apply(is_datetimelike).any()): result = result._convert(numeric=True) date_cols = self._selected_obj.select_dtypes( - 
include=list(_DATELIKE_DTYPES)).columns + include=['datetime', 'timedelta']).columns date_cols = date_cols.intersection(result.columns) result[date_cols] = (result[date_cols] ._convert(datetime=True, diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index c4ae3dcca8367..107d68c192ead 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -848,7 +848,7 @@ def _multi_take(self, tup): [(a, self._convert_for_reindex(t, axis=o._get_axis_number(a))) for t, a in zip(tup, o._AXIS_ORDERS)]) return o.reindex(**d) - except: + except(KeyError, IndexingError): raise self._exception def _convert_for_reindex(self, key, axis=0): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 120a9cbcd1a75..05ac3356c1770 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4314,11 +4314,6 @@ def form_blocks(arrays, names, axes): elif is_datetimetz(v): datetime_tz_items.append((i, k, v)) elif issubclass(v.dtype.type, np.integer): - if v.dtype == np.uint64: - # HACK #2355 definite overflow - if (v > 2**63 - 1).any(): - object_items.append((i, k, v)) - continue int_items.append((i, k, v)) elif v.dtype == np.bool_: bool_items.append((i, k, v)) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index f1191ff1c7009..e83a0518d97f6 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -10,9 +10,8 @@ from pandas.compat import range, string_types from pandas.types.common import (is_numeric_v_string_like, is_float_dtype, is_datetime64_dtype, - is_integer_dtype, _ensure_float64, - is_scalar, - _DATELIKE_DTYPES, + is_datetime64tz_dtype, is_integer_dtype, + _ensure_float64, is_scalar, needs_i8_conversion) from pandas.types.missing import isnull @@ -450,7 +449,7 @@ def pad_1d(values, limit=None, mask=None, dtype=None): _method = None if is_float_dtype(values): _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None) - elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _pad_1d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -475,7 +474,7 @@ def backfill_1d(values, limit=None, mask=None, dtype=None): _method = None if is_float_dtype(values): _method = getattr(algos, 'backfill_inplace_%s' % dtype.name, None) - elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_1d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -501,7 +500,7 @@ def pad_2d(values, limit=None, mask=None, dtype=None): _method = None if is_float_dtype(values): _method = getattr(algos, 'pad_2d_inplace_%s' % dtype.name, None) - elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _pad_2d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) @@ -531,7 +530,7 @@ def backfill_2d(values, limit=None, mask=None, dtype=None): _method = None if is_float_dtype(values): _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None) - elif dtype in _DATELIKE_DTYPES or is_datetime64_dtype(values): + elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype): _method = _backfill_2d_datetime elif is_integer_dtype(values): values = _ensure_float64(values) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index d7d68ad536be5..1f76bc850cee9 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -9,11 +9,8 @@ except 
ImportError: # pragma: no cover _USE_BOTTLENECK = False -import pandas.hashtable as _hash from pandas import compat, lib, algos, tslib -from pandas.compat.numpy import _np_version_under1p10 -from pandas.types.common import (_ensure_int64, _ensure_object, - _ensure_float64, _get_dtype, +from pandas.types.common import (_get_dtype, is_float, is_scalar, is_integer, is_complex, is_float_dtype, is_complex_dtype, is_integer_dtype, @@ -785,82 +782,3 @@ def f(x, y): nanle = make_nancomp(operator.le) naneq = make_nancomp(operator.eq) nanne = make_nancomp(operator.ne) - - -def unique1d(values): - """ - Hash table-based unique - """ - if np.issubdtype(values.dtype, np.floating): - table = _hash.Float64HashTable(len(values)) - uniques = np.array(table.unique(_ensure_float64(values)), - dtype=np.float64) - elif np.issubdtype(values.dtype, np.datetime64): - table = _hash.Int64HashTable(len(values)) - uniques = table.unique(_ensure_int64(values)) - uniques = uniques.view('M8[ns]') - elif np.issubdtype(values.dtype, np.timedelta64): - table = _hash.Int64HashTable(len(values)) - uniques = table.unique(_ensure_int64(values)) - uniques = uniques.view('m8[ns]') - elif np.issubdtype(values.dtype, np.integer): - table = _hash.Int64HashTable(len(values)) - uniques = table.unique(_ensure_int64(values)) - else: - table = _hash.PyObjectHashTable(len(values)) - uniques = table.unique(_ensure_object(values)) - return uniques - - -def _checked_add_with_arr(arr, b): - """ - Performs the addition of an int64 array and an int64 integer (or array) - but checks that they do not result in overflow first. - - Parameters - ---------- - arr : array addend. - b : array or scalar addend. - - Returns - ------- - sum : An array for elements x + b for each element x in arr if b is - a scalar or an array for elements x + y for each element pair - (x, y) in (arr, b). - - Raises - ------ - OverflowError if any x + y exceeds the maximum or minimum int64 value. - """ - # For performance reasons, we broadcast 'b' to the new array 'b2' - # so that it has the same size as 'arr'. - if _np_version_under1p10: - if lib.isscalar(b): - b2 = np.empty(arr.shape) - b2.fill(b) - else: - b2 = b - else: - b2 = np.broadcast_to(b, arr.shape) - - # gh-14324: For each element in 'arr' and its corresponding element - # in 'b2', we check the sign of the element in 'b2'. If it is positive, - # we then check whether its sum with the element in 'arr' exceeds - # np.iinfo(np.int64).max. If so, we have an overflow error. If it - # it is negative, we then check whether its sum with the element in - # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow - # error as well. 
- mask1 = b2 > 0 - mask2 = b2 < 0 - - if not mask1.any(): - to_raise = (np.iinfo(np.int64).min - b2 > arr).any() - elif not mask2.any(): - to_raise = (np.iinfo(np.int64).max - b2 < arr).any() - else: - to_raise = ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]).any() or - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]).any()) - - if to_raise: - raise OverflowError("Overflow in int64 addition") - return arr + b diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 80de3cd85d4db..396b0e048bc49 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -545,9 +545,9 @@ def _offset(lvalues, rvalues): # with tz, convert to UTC if self.is_datetime64tz_lhs: - lvalues = lvalues.tz_localize(None) + lvalues = lvalues.tz_convert('UTC').tz_localize(None) if self.is_datetime64tz_rhs: - rvalues = rvalues.tz_localize(None) + rvalues = rvalues.tz_convert('UTC').tz_localize(None) lvalues = lvalues.view(np.int64) rvalues = rvalues.view(np.int64) diff --git a/pandas/core/series.py b/pandas/core/series.py index 7018865e5b3ec..f656d72296e3a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -188,7 +188,8 @@ def __init__(self, data=None, index=None, dtype=None, name=None, if len(data): # coerce back to datetime objects for lookup data = _dict_compat(data) - data = lib.fast_multiget(data, index.astype('O'), + data = lib.fast_multiget(data, + index.asobject.values, default=np.nan) else: data = np.nan diff --git a/pandas/hashtable.pxd b/pandas/hashtable.pxd index f3ea7ad792160..cd06b938310a8 100644 --- a/pandas/hashtable.pxd +++ b/pandas/hashtable.pxd @@ -1,10 +1,17 @@ -from khash cimport kh_int64_t, kh_float64_t, kh_pymap_t, kh_str_t, int64_t, float64_t +from khash cimport (kh_int64_t, kh_uint64_t, kh_float64_t, kh_pymap_t, + kh_str_t, uint64_t, int64_t, float64_t) # prototypes for sharing cdef class HashTable: pass +cdef class UInt64HashTable(HashTable): + cdef kh_uint64_t *table + + cpdef get_item(self, uint64_t val) + cpdef set_item(self, uint64_t key, Py_ssize_t val) + cdef class Int64HashTable(HashTable): cdef kh_int64_t *table diff --git a/pandas/index.pyx b/pandas/index.pyx index a6eb74727a999..a245e85d80f96 100644 --- a/pandas/index.pyx +++ b/pandas/index.pyx @@ -82,20 +82,13 @@ cdef class IndexEngine: cdef: bint unique, monotonic_inc, monotonic_dec - bint initialized, monotonic_check, unique_check + bint need_monotonic_check, need_unique_check def __init__(self, vgetter, n): self.vgetter = vgetter self.over_size_threshold = n >= _SIZE_CUTOFF - - self.initialized = 0 - self.monotonic_check = 0 - self.unique_check = 0 - - self.unique = 0 - self.monotonic_inc = 0 - self.monotonic_dec = 0 + self.clear_mapping() def __contains__(self, object val): self._ensure_mapping_populated() @@ -213,16 +206,20 @@ cdef class IndexEngine: property is_unique: def __get__(self): - if not self.initialized: - self.initialize() + if self.need_unique_check: + self._do_unique_check() - self.unique_check = 1 return self.unique == 1 + cdef inline _do_unique_check(self): + + # this de-facto the same + self._ensure_mapping_populated() + property is_monotonic_increasing: def __get__(self): - if not self.monotonic_check: + if self.need_monotonic_check: self._do_monotonic_check() return self.monotonic_inc == 1 @@ -230,7 +227,7 @@ cdef class IndexEngine: property is_monotonic_decreasing: def __get__(self): - if not self.monotonic_check: + if self.need_monotonic_check: self._do_monotonic_check() return self.monotonic_dec == 1 @@ -246,13 +243,12 @@ cdef class IndexEngine: self.monotonic_dec = 0 is_unique = 0 - 
self.monotonic_check = 1 + self.need_monotonic_check = 0 # we can only be sure of uniqueness if is_unique=1 if is_unique: - self.initialized = 1 self.unique = 1 - self.unique_check = 1 + self.need_unique_check = 0 cdef _get_index_values(self): return self.vgetter() @@ -266,30 +262,32 @@ cdef class IndexEngine: cdef _check_type(self, object val): hash(val) + property is_mapping_populated: + + def __get__(self): + return self.mapping is not None + cdef inline _ensure_mapping_populated(self): - # need to reset if we have previously - # set the initialized from monotonic checks - if self.unique_check: - self.initialized = 0 - if not self.initialized: - self.initialize() - - cdef initialize(self): - values = self._get_index_values() + # this populates the mapping + # if its not already populated + # also satisfies the need_unique_check - self.mapping = self._make_hash_table(len(values)) - self.mapping.map_locations(values) + if not self.is_mapping_populated: - if len(self.mapping) == len(values): - self.unique = 1 + values = self._get_index_values() + + self.mapping = self._make_hash_table(len(values)) + self.mapping.map_locations(values) + + if len(self.mapping) == len(values): + self.unique = 1 - self.initialized = 1 + self.need_unique_check = 0 def clear_mapping(self): self.mapping = None - self.initialized = 0 - self.monotonic_check = 0 - self.unique_check = 0 + self.need_monotonic_check = 1 + self.need_unique_check = 1 self.unique = 0 self.monotonic_inc = 0 diff --git a/pandas/indexes/base.py b/pandas/indexes/base.py index 512abfd88c78c..1cc546629589d 100644 --- a/pandas/indexes/base.py +++ b/pandas/indexes/base.py @@ -2427,8 +2427,7 @@ def groupby(self, values): return result def map(self, mapper): - """ - Apply mapper function to its values. + """Apply mapper function to an index. Parameters ---------- @@ -2437,9 +2436,21 @@ def map(self, mapper): Returns ------- - applied : array + applied : Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ - return self._arrmap(self.values, mapper) + from .multi import MultiIndex + mapped_values = self._arrmap(self.values, mapper) + attributes = self._get_attributes_dict() + if mapped_values.size and isinstance(mapped_values[0], tuple): + return MultiIndex.from_tuples(mapped_values, + names=attributes.get('name')) + + attributes['copy'] = False + return Index(mapped_values, **attributes) def isin(self, values, level=None): """ diff --git a/pandas/indexes/category.py b/pandas/indexes/category.py index c1f5d47e1e04f..2c89f72975ade 100644 --- a/pandas/indexes/category.py +++ b/pandas/indexes/category.py @@ -517,22 +517,22 @@ def take(self, indices, axis=0, allow_fill=True, return self._create_from_codes(taken) def map(self, mapper): - """ - Apply mapper function to its categories (not codes). + """Apply mapper function to its categories (not codes). Parameters ---------- mapper : callable Function to be applied. When all categories are mapped - to different categories, the result will be Categorical which has - the same order property as the original. Otherwise, the result will - be np.ndarray. + to different categories, the result will be a CategoricalIndex + which has the same order property as the original. Otherwise, + the result will be a Index. Returns ------- - applied : Categorical or np.ndarray. 
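In user terms, Index.map now infers the return type: scalar results still give a plain Index, while tuple results come back as a MultiIndex. For example:

import pandas as pd

idx = pd.Index(['a', 'b', 'c'])

idx.map(str.upper)                   # Index(['A', 'B', 'C'])
idx.map(lambda x: (x, x.upper()))    # MultiIndex of ('a', 'A'), ('b', 'B'), ('c', 'C')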
+ applied : CategoricalIndex or Index + """ - return self.values.map(mapper) + return self._shallow_copy_with_infer(self.values.map(mapper)) def delete(self, loc): """ diff --git a/pandas/io/common.py b/pandas/io/common.py index c115fab217fba..6817c824ad786 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -12,6 +12,12 @@ from pandas.core.common import AbstractMethodError from pandas.types.common import is_number +try: + from s3fs import S3File + need_text_wrapping = (BytesIO, S3File) +except ImportError: + need_text_wrapping = (BytesIO,) + # common NA values # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', @@ -187,8 +193,8 @@ def _stringify_path(filepath_or_buffer): def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None): """ - If the filepath_or_buffer is a url, translate and return the buffer - passthru otherwise. + If the filepath_or_buffer is a url, translate and return the buffer. + Otherwise passthrough. Parameters ---------- @@ -212,10 +218,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, return reader, encoding, compression if _is_s3_url(filepath_or_buffer): - from pandas.io.s3 import get_filepath_or_buffer - return get_filepath_or_buffer(filepath_or_buffer, - encoding=encoding, - compression=compression) + from pandas.io import s3 + return s3.get_filepath_or_buffer(filepath_or_buffer, + encoding=encoding, + compression=compression) # It is a pathlib.Path/py.path.local or string filepath_or_buffer = _stringify_path(filepath_or_buffer) @@ -247,23 +253,26 @@ def file_path_to_url(path): def _infer_compression(filepath_or_buffer, compression): """ - Get file handle for given path/buffer and mode. + Get the compression method for filepath_or_buffer. If compression='infer', + the inferred compression method is returned. Otherwise, the input + compression method is returned unchanged, unless it's invalid, in which + case an error is raised. Parameters ---------- filepath_or_buf : a path (str) or buffer - compression : str, or None + compression : str or None + the compression method including None for no compression and 'infer' Returns ------- - string compression method, None + string or None : + compression method Raises ------ ValueError on invalid compression specified - - If compression='infer', infer compression. 
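Correspondingly, CategoricalIndex.map wraps its result back into an index: a CategoricalIndex (order preserved) when every category maps to a distinct category, otherwise a plain Index, as the docstring above states. A small illustration:

import pandas as pd

ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'], ordered=True)

ci.map(lambda x: x.upper())                        # CategoricalIndex, still ordered
ci.map(lambda x: 'x' if x in ('a', 'b') else 'y')  # categories collide -> plain Index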
If compression """ # No compression has been explicitly specified @@ -388,7 +397,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, handles.append(f) # in Python 3, convert BytesIO or fileobjects passed with an encoding - if compat.PY3 and (compression or isinstance(f, compat.BytesIO)): + if compat.PY3 and (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper f = TextIOWrapper(f, encoding=encoding) handles.append(f) diff --git a/pandas/io/json.py b/pandas/io/json.py index 0a6b8af179e12..009792cf00716 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -24,8 +24,8 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch', default_handler=None, lines=False): if lines and orient != 'records': - raise ValueError( - "'lines' keyword only valid when 'orient' is records") + raise ValueError( + "'lines' keyword only valid when 'orient' is records") if isinstance(obj, Series): s = SeriesWriter( @@ -726,8 +726,8 @@ def nested_to_record(ds, prefix="", level=0): def json_normalize(data, record_path=None, meta=None, meta_prefix=None, record_prefix=None, - errors='raise'): - + errors='raise', + sep='.'): """ "Normalize" semi-structured JSON data into a flat table @@ -752,6 +752,12 @@ def json_normalize(data, record_path=None, meta=None, .. versionadded:: 0.20.0 + sep : string, default '.' + Nested records will generate names separated by sep (separator), + e.g., for sep='.', { 'foo' : { 'bar' : 0 } } -> foo.bar + + .. versionadded:: 0.20.0 + Returns ------- frame : DataFrame @@ -828,7 +834,9 @@ def _pull_field(js, spec): lengths = [] meta_vals = defaultdict(list) - meta_keys = ['.'.join(val) for val in meta] + if not isinstance(sep, compat.string_types): + sep = str(sep) + meta_keys = [sep.join(val) for val in meta] def _recursive_extract(data, path, seen_meta, level=0): if len(path) > 1: diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 1838d9175e597..ab44e46c96b77 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -593,17 +593,13 @@ def decode(obj): elif typ == u'series': dtype = dtype_for(obj[u'dtype']) pd_dtype = pandas_dtype(dtype) - np_dtype = pandas_dtype(dtype).base index = obj[u'index'] result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype, obj[u'compress']), index=index, - dtype=np_dtype, + dtype=pd_dtype, name=obj[u'name']) - tz = getattr(pd_dtype, 'tz', None) - if tz: - result = result.dt.tz_localize('UTC').dt.tz_convert(tz) return result elif typ == u'block_manager': diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 200943324ce66..8e4246787ed5b 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2055,9 +2055,27 @@ def _clean_mapping(mapping): else: clean_dtypes = _clean_mapping(self.dtype) - return self._convert_to_ndarrays(data, self.na_values, self.na_fvalues, - self.verbose, clean_conv, - clean_dtypes) + # Apply NA values. 
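The new sep keyword in practice (the values match the json_normalize tests added later in this diff):

from pandas.io.json import json_normalize

data = {'A': {'A': 1, 'B': 2}}

json_normalize(data)             # columns 'A.A', 'A.B' (default sep='.')
json_normalize(data, sep='_')    # columns 'A_A', 'A_B'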
+ clean_na_values = {} + clean_na_fvalues = {} + + if isinstance(self.na_values, dict): + for col in self.na_values: + na_value = self.na_values[col] + na_fvalue = self.na_fvalues[col] + + if isinstance(col, int) and col not in self.orig_names: + col = self.orig_names[col] + + clean_na_values[col] = na_value + clean_na_fvalues[col] = na_fvalue + else: + clean_na_values = self.na_values + clean_na_fvalues = self.na_fvalues + + return self._convert_to_ndarrays(data, clean_na_values, + clean_na_fvalues, self.verbose, + clean_conv, clean_dtypes) def _to_recarray(self, data, columns): dtypes = [] @@ -2767,6 +2785,7 @@ def _clean_na_values(na_values, keep_default_na=True): na_values = [] na_fvalues = set() elif isinstance(na_values, dict): + na_values = na_values.copy() # Prevent aliasing. if keep_default_na: for k, v in compat.iteritems(na_values): if not is_list_like(v): diff --git a/pandas/io/s3.py b/pandas/io/s3.py index 8aa3694834a0a..5e48de757d00e 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -1,14 +1,10 @@ """ s3 support for remote file interactivity """ - -import os from pandas import compat -from pandas.compat import BytesIO - try: - import boto - from boto.s3 import key + import s3fs + from botocore.exceptions import NoCredentialsError except: - raise ImportError("boto is required to handle s3 files") + raise ImportError("The s3fs library is required to handle s3 files") if compat.PY3: from urllib.parse import urlparse as parse_url @@ -16,95 +12,24 @@ from urlparse import urlparse as parse_url -class BotoFileLikeReader(key.Key): - """boto Key modified to be more file-like - - This modification of the boto Key will read through a supplied - S3 key once, then stop. The unmodified boto Key object will repeatedly - cycle through a file in S3: after reaching the end of the file, - boto will close the file. Then the next call to `read` or `next` will - re-open the file and start reading from the beginning. - - Also adds a `readline` function which will split the returned - values by the `\n` character. - """ - - def __init__(self, *args, **kwargs): - encoding = kwargs.pop("encoding", None) # Python 2 compat - super(BotoFileLikeReader, self).__init__(*args, **kwargs) - # Add a flag to mark the end of the read. 
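The effect of the column-index handling above (see the na_values tests later in this diff): a dict keyed by positional index is translated to the column name before conversion, and the caller's dict is no longer mutated. For example:

import pandas as pd
from pandas.compat import StringIO

data = 'a\nfoo\n1'

# Key 0 refers to the first column, which is named 'a'.
out = pd.read_csv(StringIO(data), na_values={0: 'foo'})
# out['a'] -> [NaN, 1.0]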
- self.finished_read = False - self.buffer = "" - self.lines = [] - if encoding is None and compat.PY3: - encoding = "utf-8" - self.encoding = encoding - self.lines = [] - - def next(self): - return self.readline() - - __next__ = next - - def read(self, *args, **kwargs): - if self.finished_read: - return b'' if compat.PY3 else '' - return super(BotoFileLikeReader, self).read(*args, **kwargs) - - def close(self, *args, **kwargs): - self.finished_read = True - return super(BotoFileLikeReader, self).close(*args, **kwargs) - - def seekable(self): - """Needed for reading by bz2""" - return False - - def readline(self): - """Split the contents of the Key by '\n' characters.""" - if self.lines: - retval = self.lines[0] - self.lines = self.lines[1:] - return retval - if self.finished_read: - if self.buffer: - retval, self.buffer = self.buffer, "" - return retval - else: - raise StopIteration - - if self.encoding: - self.buffer = "{}{}".format( - self.buffer, self.read(8192).decode(self.encoding)) - else: - self.buffer = "{}{}".format(self.buffer, self.read(8192)) - - split_buffer = self.buffer.split("\n") - self.lines.extend(split_buffer[:-1]) - self.buffer = split_buffer[-1] - - return self.readline() +def _strip_schema(url): + """Returns the url without the s3:// part""" + result = parse_url(url) + return result.netloc + result.path def get_filepath_or_buffer(filepath_or_buffer, encoding=None, compression=None): - - # Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST - # are environment variables - parsed_url = parse_url(filepath_or_buffer) - s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com') - + fs = s3fs.S3FileSystem(anon=False) try: - conn = boto.connect_s3(host=s3_host) - except boto.exception.NoAuthHandlerFound: - conn = boto.connect_s3(host=s3_host, anon=True) - - b = conn.get_bucket(parsed_url.netloc, validate=False) - if compat.PY2 and compression: - k = boto.s3.key.Key(b, parsed_url.path) - filepath_or_buffer = BytesIO(k.get_contents_as_string( - encoding=encoding)) - else: - k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding) - k.open('r') # Expose read errors immediately - filepath_or_buffer = k + filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer)) + except (OSError, NoCredentialsError): + # boto3 has troubles when trying to access a public file + # when credentialed... + # An OSError is raised if you have credentials, but they + # aren't valid for that bucket. + # A NoCredentialsError is raised if you don't have creds + # for that bucket. 
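For reference, _strip_schema just drops the scheme via urlparse; a standalone equivalent:

try:                                   # Python 3
    from urllib.parse import urlparse
except ImportError:                    # Python 2
    from urlparse import urlparse

def strip_schema(url):
    parsed = urlparse(url)
    return parsed.netloc + parsed.path

strip_schema('s3://pandas-test/tips.csv')   # -> 'pandas-test/tips.csv'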
+ fs = s3fs.S3FileSystem(anon=True) + filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer)) return filepath_or_buffer, None, compression diff --git a/pandas/io/tests/json/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py index 36110898448ea..7164dc176e5b3 100644 --- a/pandas/io/tests/json/test_json_norm.py +++ b/pandas/io/tests/json/test_json_norm.py @@ -63,6 +63,21 @@ def test_simple_normalize(self): tm.assert_frame_equal(result, expected) + def test_simple_normalize_with_default_separator(self): + result = json_normalize({'A': {'A': 1, 'B': 2}}) + expected = DataFrame([[1, 2]], columns={'A.A', 'A.B'}) + tm.assert_frame_equal(result, expected) + + def test_simple_normalize_with_user_specified_separator(self): + result = json_normalize({'A': {'A': 1, 'B': 2}}, sep='_') + expected = DataFrame([[1, 2]], columns={'A_A', 'A_B'}) + tm.assert_frame_equal(result, expected) + + def test_simple_normalize_with_user_specified_unicode_separator(self): + result = json_normalize({'A': {'A': 1, 'B': 2}}, sep=u'\u03c3') + expected = DataFrame([[1, 2]], columns={u'A\u03c3A', u'A\u03c3B'}) + tm.assert_frame_equal(result, expected) + def test_more_deeply_nested(self): data = [{'country': 'USA', 'states': [{'name': 'California', @@ -133,6 +148,18 @@ def test_shallow_nested(self): expected = DataFrame(ex_data, columns=result.columns) tm.assert_frame_equal(result, expected) + def test_simple_normalize_with_default_separator(self): + result = json_normalize({"A": {"A": 1, "B": 2}}) + expected = pd.DataFrame([[1, 2]], columns={"A.A", "A.B"}) + assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + + def test_simple_normalize_with_user_specified_separator(self): + result = json_normalize({"A": {"A": 1, "B": 2}}, sep='_') + expected = pd.DataFrame([[1, 2]], columns={"A_A", "A_B"}) + assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) + def test_meta_name_conflict(self): data = [{'foo': 'hello', 'bar': 'there', diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py index e6e6f33669e17..d7f903153fdae 100644 --- a/pandas/io/tests/json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -388,6 +388,10 @@ def test_frame_empty(self): self.assertFalse(df._is_mixed_type) assert_frame_equal(read_json(df.to_json(), dtype=dict(df.dtypes)), df, check_index_type=False) + # GH 7445 + result = pd.DataFrame({'test': []}, index=[]).to_json(orient='columns') + expected = '{"test":{}}' + tm.assert_equal(result, expected) def test_frame_empty_mixedtype(self): # mixed type diff --git a/pandas/io/tests/parser/compression.py b/pandas/io/tests/parser/compression.py index 3b0c571032fe6..e95617faf2071 100644 --- a/pandas/io/tests/parser/compression.py +++ b/pandas/io/tests/parser/compression.py @@ -8,7 +8,6 @@ import nose import pandas.util.testing as tm -from pandas import compat class CompressionTests(object): @@ -114,12 +113,8 @@ def test_bz2(self): path, compression='bz3') with open(path, 'rb') as fin: - if compat.PY3: - result = self.read_csv(fin, compression='bz2') - tm.assert_frame_equal(result, expected) - elif self.engine is not 'python': - self.assertRaises(ValueError, self.read_csv, - fin, compression='bz2') + result = self.read_csv(fin, compression='bz2') + tm.assert_frame_equal(result, expected) with tm.ensure_clean('test.bz2') as path: tmp = bz2.BZ2File(path, mode='wb') diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index 92107cf2e82a7..e245bc5589145 100644 --- 
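The GH 7445 test above in isolation: an empty column serialises to an empty JSON object.

import pandas as pd

result = pd.DataFrame({'test': []}, index=[]).to_json(orient='columns')
assert result == '{"test":{}}'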
a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -266,3 +266,26 @@ def test_na_values_scalar(self): out = self.read_csv(StringIO(data), names=names, na_values={'a': 2, 'b': 1}) tm.assert_frame_equal(out, expected) + + def test_na_values_dict_aliasing(self): + na_values = {'a': 2, 'b': 1} + na_values_copy = na_values.copy() + + names = ['a', 'b'] + data = '1,2\n2,1' + + expected = DataFrame([[1.0, 2.0], [np.nan, np.nan]], columns=names) + out = self.read_csv(StringIO(data), names=names, na_values=na_values) + + tm.assert_frame_equal(out, expected) + tm.assert_dict_equal(na_values, na_values_copy) + + def test_na_values_dict_col_index(self): + # see gh-14203 + + data = 'a\nfoo\n1' + na_values = {0: 'foo'} + + out = self.read_csv(StringIO(data), na_values=na_values) + expected = DataFrame({'a': [np.nan, 1]}) + tm.assert_frame_equal(out, expected) diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py index fd7a1babe4e01..8e71cf1cc7e4c 100644 --- a/pandas/io/tests/parser/test_network.py +++ b/pandas/io/tests/parser/test_network.py @@ -12,7 +12,6 @@ import pandas.util.testing as tm from pandas import DataFrame -from pandas import compat from pandas.io.parsers import read_csv, read_table @@ -39,7 +38,7 @@ def test_compressed_urls(self): for compression, extension in self.compression_to_extension.items(): url = self.base_url + extension # args is a (compression, engine) tuple - for args in product([compression, 'infer'], ['python']): + for args in product([compression, 'infer'], ['python', 'c']): # test_fxn is a workaround for more descriptive nose reporting. # See http://stackoverflow.com/a/37393684/4651668. test_fxn = functools.partial(self.check_table) @@ -57,25 +56,19 @@ class TestS3(tm.TestCase): def setUp(self): try: - import boto # noqa + import s3fs # noqa except ImportError: - raise nose.SkipTest("boto not installed") + raise nose.SkipTest("s3fs not installed") @tm.network def test_parse_public_s3_bucket(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. - self.assertRaises(ValueError, read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df = read_csv('s3://pandas-test/tips.csv' + - ext, compression=comp) - self.assertTrue(isinstance(df, DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')), df) + df = read_csv('s3://pandas-test/tips.csv' + + ext, compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')), df) # Read public file from bucket with not-public contents df = read_csv('s3://cant_get_it/tips.csv') @@ -104,18 +97,12 @@ def test_parse_public_s3a_bucket(self): @tm.network def test_parse_public_s3_bucket_nrows(self): for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. 
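The rewritten compression test reflects that read_csv now accepts an already-open binary handle with compression='bz2' on either parser engine and either Python version; roughly (file name illustrative):

import pandas as pd

with open('data.csv.bz2', 'rb') as fin:
    df = pd.read_csv(fin, compression='bz2')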
- self.assertRaises(ValueError, read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df = read_csv('s3://pandas-test/tips.csv' + - ext, nrows=10, compression=comp) - self.assertTrue(isinstance(df, DataFrame)) - self.assertFalse(df.empty) - tm.assert_frame_equal(read_csv( - tm.get_data_path('tips.csv')).iloc[:10], df) + df = read_csv('s3://pandas-test/tips.csv' + + ext, nrows=10, compression=comp) + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + tm.assert_frame_equal(read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) @tm.network def test_parse_public_s3_bucket_chunked(self): @@ -123,24 +110,18 @@ def test_parse_public_s3_bucket_chunked(self): chunksize = 5 local_tips = read_csv(tm.get_data_path('tips.csv')) for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]: - if comp == 'bz2' and compat.PY2: - # The Python 2 C parser can't read bz2 from S3. - self.assertRaises(ValueError, read_csv, - 's3://pandas-test/tips.csv' + ext, - compression=comp) - else: - df_reader = read_csv('s3://pandas-test/tips.csv' + ext, - chunksize=chunksize, compression=comp) - self.assertEqual(df_reader.chunksize, chunksize) - for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them - # properly. - df = df_reader.get_chunk() - self.assertTrue(isinstance(df, DataFrame)) - self.assertFalse(df.empty) - true_df = local_tips.iloc[ - chunksize * i_chunk: chunksize * (i_chunk + 1)] - tm.assert_frame_equal(true_df, df) + df_reader = read_csv('s3://pandas-test/tips.csv' + ext, + chunksize=chunksize, compression=comp) + self.assertEqual(df_reader.chunksize, chunksize) + for i_chunk in [0, 1, 2]: + # Read a couple of chunks and make sure we see them + # properly. + df = df_reader.get_chunk() + self.assertTrue(isinstance(df, DataFrame)) + self.assertFalse(df.empty) + true_df = local_tips.iloc[ + chunksize * i_chunk: chunksize * (i_chunk + 1)] + tm.assert_frame_equal(true_df, df) @tm.network def test_parse_public_s3_bucket_chunked_python(self): @@ -193,15 +174,12 @@ def test_parse_public_s3_bucket_nrows_python(self): @tm.network def test_s3_fails(self): - import boto - with tm.assertRaisesRegexp(boto.exception.S3ResponseError, - 'S3ResponseError: 404 Not Found'): + with tm.assertRaises(IOError): read_csv('s3://nyqpug/asdf.csv') # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. 
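With the s3fs backend, a failed S3 read surfaces as a plain IOError rather than a boto S3ResponseError, so callers can catch it generically (the URL is the nonexistent key used by the test):

import pandas as pd

try:
    pd.read_csv('s3://nyqpug/asdf.csv')
except IOError as err:
    print('S3 read failed:', err)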
- with tm.assertRaisesRegexp(boto.exception.S3ResponseError, - 'S3ResponseError: 403 Forbidden'): + with tm.assertRaises(IOError): read_csv('s3://cant_get_it/') if __name__ == '__main__': diff --git a/pandas/io/tests/parser/test_textreader.py b/pandas/io/tests/parser/test_textreader.py index 49b70fc5e8703..98cb09cd85480 100644 --- a/pandas/io/tests/parser/test_textreader.py +++ b/pandas/io/tests/parser/test_textreader.py @@ -392,6 +392,12 @@ def test_empty_field_eof(self): names=list('abcd'), engine='c') assert_frame_equal(df, c) + def test_empty_csv_input(self): + # GH14867 + df = read_csv(StringIO(), chunksize=20, header=None, + names=['a', 'b', 'c']) + self.assertTrue(isinstance(df, TextFileReader)) + def assert_array_dicts_equal(left, right): for k, v in compat.iteritems(left): diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 7a1b5655cfbf7..8db0e6202f7fc 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -64,11 +64,11 @@ def _skip_if_no_excelsuite(): _skip_if_no_openpyxl() -def _skip_if_no_boto(): +def _skip_if_no_s3fs(): try: - import boto # NOQA + import s3fs # noqa except ImportError: - raise nose.SkipTest('boto not installed, skipping') + raise nose.SkipTest('s3fs not installed, skipping') _seriesd = tm.getSeriesData() @@ -582,7 +582,7 @@ def test_read_from_http_url(self): @tm.network(check_before_test=True) def test_read_from_s3_url(self): - _skip_if_no_boto() + _skip_if_no_s3fs() url = ('s3://pandas-test/test1' + self.ext) url_table = read_excel(url) diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index 91042775ba19d..63c2ffc629ca6 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -363,6 +363,8 @@ def setUp(self): 'F': [Timestamp('20130102', tz='US/Eastern')] * 2 + [Timestamp('20130603', tz='CET')] * 3, 'G': [Timestamp('20130102', tz='US/Eastern')] * 5, + 'H': Categorical([1, 2, 3, 4, 5]), + 'I': Categorical([1, 2, 3, 4, 5], ordered=True), } self.d['float'] = Series(data['A']) @@ -370,6 +372,8 @@ def setUp(self): self.d['mixed'] = Series(data['E']) self.d['dt_tz_mixed'] = Series(data['F']) self.d['dt_tz'] = Series(data['G']) + self.d['cat_ordered'] = Series(data['H']) + self.d['cat_unordered'] = Series(data['I']) def test_basic(self): diff --git a/pandas/lib.pyx b/pandas/lib.pyx index b09a1c2755a06..761969491cfc7 100644 --- a/pandas/lib.pyx +++ b/pandas/lib.pyx @@ -257,7 +257,7 @@ cdef double INF = np.inf cdef double NEGINF = -INF -cpdef checknull(object val): +cpdef bint checknull(object val): if util.is_float_object(val) or util.is_complex_object(val): return val != val # and val != INF and val != NEGINF elif util.is_datetime64_object(val): @@ -272,7 +272,7 @@ cpdef checknull(object val): return _checknull(val) -cpdef checknull_old(object val): +cpdef bint checknull_old(object val): if util.is_float_object(val) or util.is_complex_object(val): return val != val or val == INF or val == NEGINF elif util.is_datetime64_object(val): @@ -287,21 +287,21 @@ cpdef checknull_old(object val): return util._checknull(val) -cpdef isposinf_scalar(object val): +cpdef bint isposinf_scalar(object val): if util.is_float_object(val) and val == INF: return True else: return False -cpdef isneginf_scalar(object val): +cpdef bint isneginf_scalar(object val): if util.is_float_object(val) and val == NEGINF: return True else: return False -def isscalar(object val): +cpdef bint isscalar(object val): """ Return True if given value is scalar. 
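The GH 14867 case in user terms: an empty stream combined with chunksize returns a lazy TextFileReader instead of raising at call time.

import pandas as pd
from pandas.compat import StringIO
from pandas.io.parsers import TextFileReader

reader = pd.read_csv(StringIO(''), chunksize=20, header=None,
                     names=['a', 'b', 'c'])
assert isinstance(reader, TextFileReader)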
@@ -313,6 +313,7 @@ def isscalar(object val): - instances of datetime.datetime - instances of datetime.timedelta - Period + - instances of decimal.Decimal """ @@ -325,7 +326,8 @@ def isscalar(object val): or PyDate_Check(val) or PyDelta_Check(val) or PyTime_Check(val) - or util.is_period_object(val)) + or util.is_period_object(val) + or is_decimal(val)) def item_from_zerodim(object val): diff --git a/pandas/parser.pyx b/pandas/parser.pyx index d94a4ef278dee..2464ee15b36b7 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -621,8 +621,9 @@ cdef class TextReader: if isinstance(source, basestring) or PY3: source = bz2.BZ2File(source, 'rb') else: - raise ValueError('Python 2 cannot read bz2 from open file ' - 'handle') + content = source.read() + source.close() + source = compat.StringIO(bz2.decompress(content)) elif self.compression == 'zip': import zipfile zip_file = zipfile.ZipFile(source) @@ -1262,19 +1263,23 @@ cdef class TextReader: return None, set() if isinstance(self.na_values, dict): + key = None values = None + if name is not None and name in self.na_values: - values = self.na_values[name] - if values is not None and not isinstance(values, list): - values = list(values) - fvalues = self.na_fvalues[name] - if fvalues is not None and not isinstance(fvalues, set): - fvalues = set(fvalues) - else: - if i in self.na_values: - return self.na_values[i], self.na_fvalues[i] - else: - return _NA_VALUES, set() + key = name + elif i in self.na_values: + key = i + else: # No na_values provided for this column. + return _NA_VALUES, set() + + values = self.na_values[key] + if values is not None and not isinstance(values, list): + values = list(values) + + fvalues = self.na_fvalues[key] + if fvalues is not None and not isinstance(fvalues, set): + fvalues = set(fvalues) return _ensure_encoded(values), fvalues else: diff --git a/pandas/sparse/tests/test_frame.py b/pandas/sparse/tests/test_frame.py index 5cc765a2c1cf3..ab12099b5624d 100644 --- a/pandas/sparse/tests/test_frame.py +++ b/pandas/sparse/tests/test_frame.py @@ -215,6 +215,21 @@ def test_constructor_preserve_attr(self): self.assertEqual(df['x'].dtype, np.int64) self.assertEqual(df['x'].fill_value, 0) + def test_constructor_nan_dataframe(self): + # GH 10079 + trains = np.arange(100) + tresholds = [10, 20, 30, 40, 50, 60] + tuples = [(i, j) for i in trains for j in tresholds] + index = pd.MultiIndex.from_tuples(tuples, + names=['trains', 'tresholds']) + matrix = np.empty((len(index), len(trains))) + matrix.fill(np.nan) + df = pd.DataFrame(matrix, index=index, columns=trains, dtype=float) + result = df.to_sparse() + expected = pd.SparseDataFrame(matrix, index=index, columns=trains, + dtype=float) + tm.assert_sp_frame_equal(result, expected) + def test_dtypes(self): df = DataFrame(np.random.randn(10000, 4)) df.ix[:9998] = np.nan diff --git a/pandas/src/algos_common_helper.pxi.in b/pandas/src/algos_common_helper.pxi.in index c52c734f727e9..c1c190704b4c7 100644 --- a/pandas/src/algos_common_helper.pxi.in +++ b/pandas/src/algos_common_helper.pxi.in @@ -553,6 +553,7 @@ dtypes = [('float64', 'FLOAT64', 'float64'), ('int16', 'INT16', 'int16'), ('int32', 'INT32', 'int32'), ('int64', 'INT64', 'int64'), + ('uint64', 'UINT64', 'uint64'), # ('platform_int', 'INT', 'int_'), # ('object', 'OBJECT', 'object_'), ] diff --git a/pandas/src/hashtable_class_helper.pxi.in b/pandas/src/hashtable_class_helper.pxi.in index 22714e6305677..55c840b20c78b 100644 --- a/pandas/src/hashtable_class_helper.pxi.in +++ b/pandas/src/hashtable_class_helper.pxi.in @@ -17,7 
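With is_decimal wired into isscalar above, decimal values count as scalars; using the internal lib module this diff patches:

from decimal import Decimal
import pandas.lib as lib

assert lib.isscalar(Decimal('3.14'))
assert not lib.isscalar([Decimal('3.14')])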
+17,8 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in dtypes = [('Float64', 'float64', 'float64_t'), ('Int64', 'int64', 'int64_t'), - ('String', 'string', 'char *')] + ('String', 'string', 'char *'), + ('UInt64', 'uint64', 'uint64_t')] }} {{for name, dtype, arg in dtypes}} @@ -40,6 +41,7 @@ cdef inline void append_data_{{dtype}}({{name}}VectorData *data, ctypedef fused vector_data: Int64VectorData + UInt64VectorData Float64VectorData StringVectorData @@ -54,6 +56,7 @@ cdef inline bint needs_resize(vector_data *data) nogil: # name, dtype, arg, idtype dtypes = [('Float64', 'float64', 'float64_t', 'np.float64'), + ('UInt64', 'uint64', 'uint64_t', 'np.uint64'), ('Int64', 'int64', 'int64_t', 'np.int64')] }} @@ -201,6 +204,7 @@ cdef class HashTable: # name, dtype, null_condition, float_group dtypes = [('Float64', 'float64', 'val != val', True), + ('UInt64', 'uint64', 'val == 0', False), ('Int64', 'int64', 'val == iNaT', False)] }} diff --git a/pandas/src/hashtable_func_helper.pxi.in b/pandas/src/hashtable_func_helper.pxi.in index 1840b914f3328..f3e16cfd32963 100644 --- a/pandas/src/hashtable_func_helper.pxi.in +++ b/pandas/src/hashtable_func_helper.pxi.in @@ -11,7 +11,7 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in {{py: # name -dtypes = ['float64', 'int64'] +dtypes = ['float64', 'int64', 'uint64'] }} diff --git a/pandas/src/inference.pyx b/pandas/src/inference.pyx index 5ac2c70bb1808..2f829417f9bb2 100644 --- a/pandas/src/inference.pyx +++ b/pandas/src/inference.pyx @@ -1,4 +1,5 @@ import sys +from decimal import Decimal cimport util from tslib import NaT, get_timezone from datetime import datetime, timedelta @@ -12,26 +13,34 @@ from util cimport (UINT8_MAX, UINT16_MAX, UINT32_MAX, UINT64_MAX, # core.common import for fast inference checks +npy_int64_max = np.iinfo(np.int64).max -def is_float(object obj): + +cpdef bint is_float(object obj): return util.is_float_object(obj) -def is_integer(object obj): +cpdef bint is_integer(object obj): return util.is_integer_object(obj) -def is_bool(object obj): +cpdef bint is_bool(object obj): return util.is_bool_object(obj) -def is_complex(object obj): +cpdef bint is_complex(object obj): return util.is_complex_object(obj) + +cpdef bint is_decimal(object obj): + return isinstance(obj, Decimal) + + cpdef bint is_period(object val): """ Return a boolean if this is a Period object """ return util.is_period_object(val) + _TYPE_MAP = { 'categorical': 'categorical', 'category': 'categorical', @@ -229,7 +238,7 @@ def infer_dtype(object _values): return 'mixed' -def is_possible_datetimelike_array(object arr): +cpdef bint is_possible_datetimelike_array(object arr): # determine if we have a possible datetimelike (or null-like) array cdef: Py_ssize_t i, n = len(arr) @@ -314,7 +323,7 @@ cdef inline bint is_timedelta(object o): return PyDelta_Check(o) or util.is_timedelta64_object(o) -def is_bool_array(ndarray values): +cpdef bint is_bool_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf @@ -335,11 +344,7 @@ def is_bool_array(ndarray values): return False -def is_integer(object o): - return util.is_integer_object(o) - - -def is_integer_array(ndarray values): +cpdef bint is_integer_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf @@ -360,7 +365,7 @@ def is_integer_array(ndarray values): return False -def is_integer_float_array(ndarray values): +cpdef bint is_integer_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf 
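The templated class additions generate a UInt64HashTable alongside the existing tables; a hedged sketch of direct use (internal API, shown only to illustrate what the template produces):

import numpy as np
import pandas.hashtable as htable

values = np.array([2 ** 63, 2 ** 63, 2 ** 64 - 1], dtype='uint64')
table = htable.UInt64HashTable(len(values))
uniques = table.unique(values)    # stays uint64, no object fallback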
@@ -383,7 +388,7 @@ def is_integer_float_array(ndarray values): return False -def is_float_array(ndarray values): +cpdef bint is_float_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf @@ -404,7 +409,7 @@ def is_float_array(ndarray values): return False -def is_string_array(ndarray values): +cpdef bint is_string_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf @@ -426,7 +431,7 @@ def is_string_array(ndarray values): return False -def is_unicode_array(ndarray values): +cpdef bint is_unicode_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf @@ -447,7 +452,7 @@ def is_unicode_array(ndarray values): return False -def is_bytes_array(ndarray values): +cpdef bint is_bytes_array(ndarray values): cdef: Py_ssize_t i, n = len(values) ndarray[object] objbuf @@ -468,7 +473,7 @@ def is_bytes_array(ndarray values): return False -def is_datetime_array(ndarray[object] values): +cpdef bint is_datetime_array(ndarray[object] values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: @@ -486,7 +491,7 @@ def is_datetime_array(ndarray[object] values): return null_count != n -def is_datetime64_array(ndarray values): +cpdef bint is_datetime64_array(ndarray values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: @@ -504,7 +509,7 @@ def is_datetime64_array(ndarray values): return null_count != n -cpdef is_datetime_with_singletz_array(ndarray[object] values): +cpdef bint is_datetime_with_singletz_array(ndarray[object] values): """ Check values have the same tzinfo attribute. Doesn't check values are datetime-like types. @@ -532,7 +537,7 @@ cpdef is_datetime_with_singletz_array(ndarray[object] values): return True -def is_timedelta_array(ndarray values): +cpdef bint is_timedelta_array(ndarray values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: @@ -548,7 +553,7 @@ def is_timedelta_array(ndarray values): return null_count != n -def is_timedelta64_array(ndarray values): +cpdef bint is_timedelta64_array(ndarray values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: @@ -564,7 +569,7 @@ def is_timedelta64_array(ndarray values): return null_count != n -def is_timedelta_or_timedelta64_array(ndarray values): +cpdef bint is_timedelta_or_timedelta64_array(ndarray values): """ infer with timedeltas and/or nat/none """ cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v @@ -581,7 +586,7 @@ def is_timedelta_or_timedelta64_array(ndarray values): return null_count != n -def is_date_array(ndarray[object] values): +cpdef bint is_date_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: return False @@ -591,7 +596,7 @@ def is_date_array(ndarray[object] values): return True -def is_time_array(ndarray[object] values): +cpdef bint is_time_array(ndarray[object] values): cdef Py_ssize_t i, n = len(values) if n == 0: return False @@ -601,7 +606,7 @@ def is_time_array(ndarray[object] values): return True -def is_period_array(ndarray[object] values): +cpdef bint is_period_array(ndarray[object] values): cdef Py_ssize_t i, null_count = 0, n = len(values) cdef object v if n == 0: @@ -673,6 +678,9 @@ def maybe_convert_numeric(object[:] values, set na_values, elif util.is_complex_object(val): complexes[i] = val seen_complex = True + elif is_decimal(val): + floats[i] = complexes[i] = val + seen_float = True else: try: status = floatify(val, &fval, &maybe_int) @@ -722,6 +730,7 @@ def 
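With Decimal handled in maybe_convert_numeric, object columns holding decimals can be coerced to floats; an illustration, assuming to_numeric routes through this path:

from decimal import Decimal
import pandas as pd

s = pd.Series([Decimal('1.5'), Decimal('2.25')], dtype=object)
pd.to_numeric(s)    # float64 values [1.5, 2.25]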
maybe_convert_objects(ndarray[object] objects, bint try_float=0, ndarray[float64_t] floats ndarray[complex128_t] complexes ndarray[int64_t] ints + ndarray[uint64_t] uints ndarray[uint8_t] bools ndarray[int64_t] idatetimes ndarray[int64_t] itimedeltas @@ -731,6 +740,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, bint seen_datetimetz = 0 bint seen_timedelta = 0 bint seen_int = 0 + bint seen_uint = 0 + bint seen_sint = 0 bint seen_bool = 0 bint seen_object = 0 bint seen_null = 0 @@ -743,6 +754,7 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, floats = np.empty(n, dtype='f8') complexes = np.empty(n, dtype='c16') ints = np.empty(n, dtype='i8') + uints = np.empty(n, dtype='u8') bools = np.empty(n, dtype=np.uint8) if convert_datetime: @@ -798,11 +810,21 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, floats[i] = val complexes[i] = val if not seen_null: - try: - ints[i] = val - except OverflowError: + seen_uint = seen_uint or (val > npy_int64_max) + seen_sint = seen_sint or (val < 0) + + if seen_uint and seen_sint: seen_object = 1 break + + if seen_uint: + uints[i] = val + elif seen_sint: + ints[i] = val + else: + uints[i] = val + ints[i] = val + elif util.is_complex_object(val): complexes[i] = val seen_complex = 1 @@ -865,7 +887,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, elif seen_float: return floats elif seen_int: - return ints + if seen_uint: + return uints + else: + return ints elif (not seen_datetime and not seen_numeric and not seen_timedelta): return bools.view(np.bool_) @@ -896,7 +921,10 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0, if not seen_int: return floats elif seen_int: - return ints + if seen_uint: + return uints + else: + return ints elif (not seen_datetime and not seen_numeric and not seen_timedelta): return bools.view(np.bool_) diff --git a/pandas/src/joins_func_helper.pxi.in b/pandas/src/joins_func_helper.pxi.in index 06c35cfb69e53..33926a23f7f41 100644 --- a/pandas/src/joins_func_helper.pxi.in +++ b/pandas/src/joins_func_helper.pxi.in @@ -1,3 +1,4 @@ +# cython: boundscheck=False, wraparound=False """ Template for each `dtype` helper function for hashtable @@ -14,7 +15,9 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in by_dtypes = [('PyObjectHashTable', 'object'), ('Int64HashTable', 'int64_t')] # on_dtype -on_dtypes = ['int64_t', 'double'] +on_dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', + 'int8_t', 'int16_t', 'int32_t', 'int64_t', + 'float', 'double'] }} @@ -98,7 +101,9 @@ def asof_join_{{on_dtype}}_by_{{by_dtype}}(ndarray[{{on_dtype}}] left_values, {{py: # on_dtype -dtypes = ['int64_t', 'double'] +dtypes = ['uint8_t', 'uint16_t', 'uint32_t', 'uint64_t', + 'int8_t', 'int16_t', 'int32_t', 'int64_t', + 'float', 'double'] }} diff --git a/pandas/src/khash.pxd b/pandas/src/khash.pxd index b28f43eecfac7..adb0fe285dbb8 100644 --- a/pandas/src/khash.pxd +++ b/pandas/src/khash.pxd @@ -1,5 +1,5 @@ from cpython cimport PyObject -from numpy cimport int64_t, int32_t, uint32_t, float64_t +from numpy cimport int64_t, uint64_t, int32_t, uint32_t, float64_t cdef extern from "khash_python.h": ctypedef uint32_t khint_t @@ -55,7 +55,6 @@ cdef extern from "khash_python.h": bint kh_exist_str(kh_str_t*, khiter_t) nogil - ctypedef struct kh_int64_t: khint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags @@ -72,6 +71,24 @@ cdef extern from "khash_python.h": bint kh_exist_int64(kh_int64_t*, khiter_t) nogil + ctypedef 
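The expanded on_dtypes list generates asof-join helpers for the smaller integer widths and float32 as well, so the 'on' key no longer has to be int64/float64; a sketch, assuming the Python-level dispatch picks up the new int32 variant (frame contents illustrative):

import numpy as np
import pandas as pd

left = pd.DataFrame({'t': np.array([1, 5, 10], dtype='int32'), 'x': [1, 2, 3]})
right = pd.DataFrame({'t': np.array([2, 6], dtype='int32'), 'y': [10, 20]})

pd.merge_asof(left, right, on='t')    # joins directly on the int32 key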
uint64_t khuint64_t + + ctypedef struct kh_uint64_t: + khint_t n_buckets, size, n_occupied, upper_bound + uint32_t *flags + khuint64_t *keys + size_t *vals + + inline kh_uint64_t* kh_init_uint64() nogil + inline void kh_destroy_uint64(kh_uint64_t*) nogil + inline void kh_clear_uint64(kh_uint64_t*) nogil + inline khint_t kh_get_uint64(kh_uint64_t*, int64_t) nogil + inline void kh_resize_uint64(kh_uint64_t*, khint_t) nogil + inline khint_t kh_put_uint64(kh_uint64_t*, int64_t, int*) nogil + inline void kh_del_uint64(kh_uint64_t*, khint_t) nogil + + bint kh_exist_uint64(kh_uint64_t*, khiter_t) nogil + ctypedef struct kh_float64_t: khint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags diff --git a/pandas/src/klib/khash.h b/pandas/src/klib/khash.h index dc004a0e1770b..869607a44c001 100644 --- a/pandas/src/klib/khash.h +++ b/pandas/src/klib/khash.h @@ -567,12 +567,14 @@ typedef const char *kh_cstr_t; #define kh_exist_str(h, k) (kh_exist(h, k)) #define kh_exist_float64(h, k) (kh_exist(h, k)) +#define kh_exist_uint64(h, k) (kh_exist(h, k)) #define kh_exist_int64(h, k) (kh_exist(h, k)) #define kh_exist_int32(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(str, size_t) KHASH_MAP_INIT_INT(int32, size_t) KHASH_MAP_INIT_INT64(int64, size_t) +KHASH_MAP_INIT_UINT64(uint64, size_t) #endif /* __AC_KHASH_H */ diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index e51cc0f5a6ec7..706820b06b12e 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -142,7 +142,7 @@ def test_constructor_with_convert(self): df = DataFrame({'A': [2 ** 63]}) result = df['A'] - expected = Series(np.asarray([2 ** 63], np.object_), name='A') + expected = Series(np.asarray([2 ** 63], np.uint64), name='A') assert_series_equal(result, expected) df = DataFrame({'A': [datetime(2005, 1, 1), True]}) diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py index 5b5236843643d..c6b69dad3e6b5 100644 --- a/pandas/tests/frame/test_combine_concat.py +++ b/pandas/tests/frame/test_combine_concat.py @@ -725,3 +725,13 @@ def test_combine_first_period(self): exp = pd.DataFrame({'P': exp_dts}, index=[1, 2, 3, 4, 5, 7]) tm.assert_frame_equal(res, exp) self.assertEqual(res['P'].dtype, 'object') + + def test_combine_first_int(self): + # GH14687 - integer series that do no align exactly + + df1 = pd.DataFrame({'a': [0, 1, 3, 5]}, dtype='int64') + df2 = pd.DataFrame({'a': [1, 4]}, dtype='int64') + + res = df1.combine_first(df2) + tm.assert_frame_equal(res, df1) + self.assertEqual(res['a'].dtype, 'int64') diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 489c85a7234b8..bf0fabaf3e402 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -183,13 +183,14 @@ def test_constructor_bool(self): self.assertEqual(df.values.dtype, np.bool_) def test_constructor_overflow_int64(self): + # see gh-14881 values = np.array([2 ** 64 - i for i in range(1, 10)], dtype=np.uint64) result = DataFrame({'a': values}) - self.assertEqual(result['a'].dtype, object) + self.assertEqual(result['a'].dtype, np.uint64) - # #2355 + # see gh-2355 data_scores = [(6311132704823138710, 273), (2685045978526272070, 23), (8921811264899370420, 45), (long(17019687244989530680), 270), @@ -198,7 +199,7 @@ def test_constructor_overflow_int64(self): data = np.zeros((len(data_scores),), dtype=dtype) data[:] = data_scores df_crawls = DataFrame(data) - 
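User-visible effect of the uint64 support, matching the updated constructor tests: integers beyond the int64 range now infer as uint64 instead of falling back to object.

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [2 ** 63, 2 ** 63 + 5]})
assert df['a'].dtype == np.uint64    # previously object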
self.assertEqual(df_crawls['uid'].dtype, object) + self.assertEqual(df_crawls['uid'].dtype, np.uint64) def test_constructor_ordereddict(self): import random diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py index 61030c262a44b..43a108e9acc80 100644 --- a/pandas/tests/frame/test_dtypes.py +++ b/pandas/tests/frame/test_dtypes.py @@ -109,15 +109,48 @@ def test_select_dtypes_include(self): 'c': np.arange(3, 6).astype('u1'), 'd': np.arange(4.0, 7.0, dtype='float64'), 'e': [True, False, True], - 'f': pd.Categorical(list('abc'))}) + 'f': pd.Categorical(list('abc')), + 'g': pd.date_range('20130101', periods=3), + 'h': pd.date_range('20130101', periods=3, + tz='US/Eastern'), + 'i': pd.date_range('20130101', periods=3, + tz='CET'), + 'j': pd.period_range('2013-01', periods=3, + freq='M'), + 'k': pd.timedelta_range('1 day', periods=3)}) + ri = df.select_dtypes(include=[np.number]) + ei = df[['b', 'c', 'd', 'k']] + assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=[np.number], exclude=['timedelta']) ei = df[['b', 'c', 'd']] assert_frame_equal(ri, ei) - ri = df.select_dtypes(include=[np.number, 'category']) + ri = df.select_dtypes(include=[np.number, 'category'], + exclude=['timedelta']) ei = df[['b', 'c', 'd', 'f']] assert_frame_equal(ri, ei) + ri = df.select_dtypes(include=['datetime']) + ei = df[['g']] + assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=['datetime64']) + ei = df[['g']] + assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=['datetimetz']) + ei = df[['h', 'i']] + assert_frame_equal(ri, ei) + + ri = df.select_dtypes(include=['timedelta']) + ei = df[['k']] + assert_frame_equal(ri, ei) + + self.assertRaises(NotImplementedError, + lambda: df.select_dtypes(include=['period'])) + def test_select_dtypes_exclude(self): df = DataFrame({'a': list('abc'), 'b': list(range(1, 4)), diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py index 85aadee8b0900..8462d5cd9bcf6 100644 --- a/pandas/tests/frame/test_operators.py +++ b/pandas/tests/frame/test_operators.py @@ -378,10 +378,10 @@ def test_arith_flex_frame(self): result = getattr(self.mixed_int, op)(2 + self.mixed_int) exp = f(self.mixed_int, 2 + self.mixed_int) - # overflow in the uint + # no overflow in the uint dtype = None if op in ['sub']: - dtype = dict(B='object', C=None) + dtype = dict(B='uint64', C=None) elif op in ['add', 'mul']: dtype = dict(C=None) assert_frame_equal(result, exp) @@ -410,10 +410,10 @@ def test_arith_flex_frame(self): 2 + self.mixed_int) exp = f(self.mixed_int, 2 + self.mixed_int) - # overflow in the uint + # no overflow in the uint dtype = None if op in ['sub']: - dtype = dict(B='object', C=None) + dtype = dict(B='uint64', C=None) elif op in ['add', 'mul']: dtype = dict(C=None) assert_frame_equal(result, exp) diff --git a/pandas/tests/frame/test_timeseries.py b/pandas/tests/frame/test_timeseries.py index 9758c2b9c805e..c6c3b4f43b55a 100644 --- a/pandas/tests/frame/test_timeseries.py +++ b/pandas/tests/frame/test_timeseries.py @@ -366,6 +366,21 @@ def test_operation_on_NaT(self): exp = pd.Series([pd.NaT], index=["foo"]) tm.assert_series_equal(res, exp) + def test_datetime_assignment_with_NaT_and_diff_time_units(self): + # GH 7492 + data_ns = np.array([1, 'nat'], dtype='datetime64[ns]') + result = pd.Series(data_ns).to_frame() + result['new'] = data_ns + expected = pd.DataFrame({0: [1, None], + 'new': [1, None]}, dtype='datetime64[ns]') + tm.assert_frame_equal(result, expected) + # OutOfBoundsDatetime error shouldn't occur 
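The extended select_dtypes tests translate to the following usage (frame contents illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3],
                   'g': pd.date_range('20130101', periods=3),
                   'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
                   'k': pd.timedelta_range('1 day', periods=3)})

df.select_dtypes(include=['datetime'])     # tz-naive datetimes: 'g'
df.select_dtypes(include=['datetimetz'])   # tz-aware datetimes: 'h'
df.select_dtypes(include=['timedelta'])    # 'k'
df.select_dtypes(include=[np.number])      # 'a' and, per the tests above, also 'k'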
+ data_s = np.array([1, 'nat'], dtype='datetime64[s]') + result['new'] = data_s + expected = pd.DataFrame({0: [1, None], + 'new': [1e9, None]}, dtype='datetime64[ns]') + tm.assert_frame_equal(result, expected) + if __name__ == '__main__': import nose diff --git a/pandas/tests/groupby/test_aggregate.py b/pandas/tests/groupby/test_aggregate.py new file mode 100644 index 0000000000000..6b162b71f79de --- /dev/null +++ b/pandas/tests/groupby/test_aggregate.py @@ -0,0 +1,494 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function +import nose +from datetime import datetime + + +from pandas import date_range +from pandas.core.index import MultiIndex +from pandas.core.api import DataFrame + +from pandas.core.series import Series + +from pandas.util.testing import (assert_frame_equal, assert_series_equal + ) + +from pandas.core.groupby import (SpecificationError) +from pandas.compat import (lmap, OrderedDict) +from pandas.formats.printing import pprint_thing + +from pandas import compat + +import pandas.core.common as com +import numpy as np + +import pandas.util.testing as tm +import pandas as pd + + +class TestGroupByAggregate(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + self.df_mixed_floats = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), dtype='float32')}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def test_agg_api(self): + + # GH 6337 + # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error + # different api for agg when passed custom function with mixed frame + + df = DataFrame({'data1': np.random.randn(5), + 'data2': np.random.randn(5), + 'key1': ['a', 'a', 'b', 'b', 'a'], + 'key2': ['one', 'two', 'one', 'two', 'one']}) + grouped = df.groupby('key1') + + def peak_to_peak(arr): + return arr.max() - arr.min() + + expected = grouped.agg([peak_to_peak]) + expected.columns = ['data1', 'data2'] + result = grouped.agg(peak_to_peak) + assert_frame_equal(result, expected) + + def test_agg_regression1(self): + grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) + result = grouped.agg(np.mean) + expected = grouped.mean() + assert_frame_equal(result, expected) + + def test_agg_datetimes_mixed(self): + data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] + + df1 = DataFrame({'key': [x[0] for x in data], + 
'date': [x[1] for x in data], + 'value': [x[2] for x in data]}) + + data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] + else None, row[2]] for row in data] + + df2 = DataFrame({'key': [x[0] for x in data], + 'date': [x[1] for x in data], + 'value': [x[2] for x in data]}) + + df1['weights'] = df1['value'] / df1['value'].sum() + gb1 = df1.groupby('date').aggregate(np.sum) + + df2['weights'] = df1['value'] / df1['value'].sum() + gb2 = df2.groupby('date').aggregate(np.sum) + + assert (len(gb1) == len(gb2)) + + def test_agg_period_index(self): + from pandas import period_range, PeriodIndex + prng = period_range('2012-1-1', freq='M', periods=3) + df = DataFrame(np.random.randn(3, 2), index=prng) + rs = df.groupby(level=0).sum() + tm.assertIsInstance(rs.index, PeriodIndex) + + # GH 3579 + index = period_range(start='1999-01', periods=5, freq='M') + s1 = Series(np.random.rand(len(index)), index=index) + s2 = Series(np.random.rand(len(index)), index=index) + series = [('s1', s1), ('s2', s2)] + df = DataFrame.from_items(series) + grouped = df.groupby(df.index.month) + list(grouped) + + def test_agg_dict_parameter_cast_result_dtypes(self): + # GH 12821 + + df = DataFrame( + {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], + 'time': date_range('1/1/2011', periods=8, freq='H')}) + df.loc[[0, 1, 2, 5], 'time'] = None + + # test for `first` function + exp = df.loc[[0, 3, 4, 6]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.first(), exp) + assert_frame_equal(grouped.agg('first'), exp) + assert_frame_equal(grouped.agg({'time': 'first'}), exp) + assert_series_equal(grouped.time.first(), exp['time']) + assert_series_equal(grouped.time.agg('first'), exp['time']) + + # test for `last` function + exp = df.loc[[0, 3, 4, 7]].set_index('class') + grouped = df.groupby('class') + assert_frame_equal(grouped.last(), exp) + assert_frame_equal(grouped.agg('last'), exp) + assert_frame_equal(grouped.agg({'time': 'last'}), exp) + assert_series_equal(grouped.time.last(), exp['time']) + assert_series_equal(grouped.time.agg('last'), exp['time']) + + def test_agg_must_agg(self): + grouped = self.df.groupby('A')['C'] + self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) + self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2]) + + def test_agg_ser_multi_key(self): + # TODO(wesm): unused + ser = self.df.C # noqa + + f = lambda x: x.sum() + results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) + expected = self.df.groupby(['A', 'B']).sum()['C'] + assert_series_equal(results, expected) + + def test_agg_apply_corner(self): + # nothing to group, all NA + grouped = self.ts.groupby(self.ts * np.nan) + self.assertEqual(self.ts.dtype, np.float64) + + # groupby float64 values results in Float64Index + exp = Series([], dtype=np.float64, index=pd.Index( + [], dtype=np.float64)) + assert_series_equal(grouped.sum(), exp) + assert_series_equal(grouped.agg(np.sum), exp) + assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) + + # DataFrame + grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) + exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, + index=pd.Index([], dtype=np.float64)) + assert_frame_equal(grouped.sum(), exp_df, check_names=False) + assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) + assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], + check_names=False) + + def test_agg_grouping_is_list_tuple(self): + from pandas.core.groupby import Grouping + + df = tm.makeTimeDataFrame() 
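The GH 12821 case in short: string aggregations passed through a dict keep the dtype of the aggregated column.

import pandas as pd

df = pd.DataFrame({'class': ['A', 'A', 'B', 'B'],
                   'time': pd.date_range('1/1/2011', periods=4, freq='H')})

res = df.groupby('class').agg({'time': 'first'})
assert res['time'].dtype == 'datetime64[ns]'   # not cast to object or int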
+ + grouped = df.groupby(lambda x: x.year) + grouper = grouped.grouper.groupings[0].grouper + grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_aggregate_api_consistency(self): + # GH 9052 + # make sure that the aggregates via dict + # are consistent + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + grouped = df.groupby(['A', 'B']) + c_mean = grouped['C'].mean() + c_sum = grouped['C'].sum() + d_mean = grouped['D'].mean() + d_sum = grouped['D'].sum() + + result = grouped['D'].agg(['sum', 'mean']) + expected = pd.concat([d_sum, d_mean], + axis=1) + expected.columns = ['sum', 'mean'] + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg([np.sum, np.mean]) + expected = pd.concat([c_sum, + c_mean, + d_sum, + d_mean], + axis=1) + expected.columns = MultiIndex.from_product([['C', 'D'], + ['sum', 'mean']]) + assert_frame_equal(result, expected, check_like=True) + + result = grouped[['D', 'C']].agg([np.sum, np.mean]) + expected = pd.concat([d_sum, + d_mean, + c_sum, + c_mean], + axis=1) + expected.columns = MultiIndex.from_product([['D', 'C'], + ['sum', 'mean']]) + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({'C': 'mean', 'D': 'sum'}) + expected = pd.concat([d_sum, + c_mean], + axis=1) + assert_frame_equal(result, expected, check_like=True) + + result = grouped.agg({'C': ['mean', 'sum'], + 'D': ['mean', 'sum']}) + expected = pd.concat([c_mean, + c_sum, + d_mean, + d_sum], + axis=1) + expected.columns = MultiIndex.from_product([['C', 'D'], + ['mean', 'sum']]) + + result = grouped[['D', 'C']].agg({'r': np.sum, + 'r2': np.mean}) + expected = pd.concat([d_sum, + c_sum, + d_mean, + c_mean], + axis=1) + expected.columns = MultiIndex.from_product([['r', 'r2'], + ['D', 'C']]) + assert_frame_equal(result, expected, check_like=True) + + def test_agg_compat(self): + + # GH 12334 + + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + g = df.groupby(['A', 'B']) + + expected = pd.concat([g['D'].sum(), + g['D'].std()], + axis=1) + expected.columns = MultiIndex.from_tuples([('C', 'sum'), + ('C', 'std')]) + result = g['D'].agg({'C': ['sum', 'std']}) + assert_frame_equal(result, expected, check_like=True) + + expected = pd.concat([g['D'].sum(), + g['D'].std()], + axis=1) + expected.columns = ['C', 'D'] + result = g['D'].agg({'C': 'sum', 'D': 'std'}) + assert_frame_equal(result, expected, check_like=True) + + def test_agg_nested_dicts(self): + + # API change for disallowing these types of nested dicts + df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', + 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'two', + 'two', 'two', 'one', 'two'], + 'C': np.random.randn(8) + 1.0, + 'D': np.arange(8)}) + + g = df.groupby(['A', 'B']) + + def f(): + g.aggregate({'r1': {'C': ['mean', 'sum']}, + 'r2': {'D': ['mean', 'sum']}}) + + self.assertRaises(SpecificationError, f) + + result = g.agg({'C': {'ra': ['mean', 'std']}, + 'D': {'rb': 
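For reference, the dict form of aggregation these tests pin down, with one or several functions per column:

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar'],
                   'B': ['one', 'one', 'two', 'two'],
                   'C': np.random.randn(4),
                   'D': np.arange(4)})

g = df.groupby(['A', 'B'])

g.agg({'C': 'mean', 'D': 'sum'})                       # one function per column
g.agg({'C': ['mean', 'sum'], 'D': ['mean', 'sum']})    # MultiIndex result columns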
['mean', 'std']}}) + expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(), + g['D'].std()], axis=1) + expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( + 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) + assert_frame_equal(result, expected, check_like=True) + + # same name as the original column + # GH9052 + expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) + expected = expected.rename(columns={'result1': 'D'}) + result = g['D'].agg({'D': np.sum, 'result2': np.mean}) + assert_frame_equal(result, expected, check_like=True) + + def test_agg_python_multiindex(self): + grouped = self.mframe.groupby(['A', 'B']) + + result = grouped.agg(np.mean) + expected = grouped.mean() + tm.assert_frame_equal(result, expected) + + def test_aggregate_str_func(self): + def _check_results(grouped): + # single series + result = grouped['A'].agg('std') + expected = grouped['A'].std() + assert_series_equal(result, expected) + + # group frame by function name + result = grouped.aggregate('var') + expected = grouped.var() + assert_frame_equal(result, expected) + + # group frame by function dict + result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'], + ['C', 'mean'], ['D', 'sem']])) + expected = DataFrame(OrderedDict([['A', grouped['A'].var( + )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], + ['D', grouped['D'].sem()]])) + assert_frame_equal(result, expected) + + by_weekday = self.tsframe.groupby(lambda x: x.weekday()) + _check_results(by_weekday) + + by_mwkday = self.tsframe.groupby([lambda x: x.month, + lambda x: x.weekday()]) + _check_results(by_mwkday) + + def test_aggregate_item_by_item(self): + + df = self.df.copy() + df['E'] = ['a'] * len(self.df) + grouped = self.df.groupby('A') + + # API change in 0.11 + # def aggfun(ser): + # return len(ser + 'a') + # result = grouped.agg(aggfun) + # self.assertEqual(len(result.columns), 1) + + aggfun = lambda ser: ser.size + result = grouped.agg(aggfun) + foo = (self.df.A == 'foo').sum() + bar = (self.df.A == 'bar').sum() + K = len(result.columns) + + # GH5782 + # odd comparisons can result here, so cast to make easy + exp = pd.Series(np.array([foo] * K), index=list('BCD'), + dtype=np.float64, name='foo') + tm.assert_series_equal(result.xs('foo'), exp) + + exp = pd.Series(np.array([bar] * K), index=list('BCD'), + dtype=np.float64, name='bar') + tm.assert_almost_equal(result.xs('bar'), exp) + + def aggfun(ser): + return ser.size + + result = DataFrame().groupby(self.df.A).agg(aggfun) + tm.assertIsInstance(result, DataFrame) + self.assertEqual(len(result), 0) + + def test_agg_item_by_item_raise_typeerror(self): + from numpy.random import randint + + df = DataFrame(randint(10, size=(20, 10))) + + def raiseException(df): + pprint_thing('----------------------------------------') + pprint_thing(df.to_string()) + raise TypeError + + self.assertRaises(TypeError, df.groupby(0).agg, raiseException) + + def test_series_agg_multikey(self): + ts = tm.makeTimeSeries() + grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) + + result = grouped.agg(np.sum) + expected = grouped.sum() + assert_series_equal(result, expected) + + def test_series_agg_multi_pure_python(self): + data = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 
'F': np.random.randn(11)}) + + def bad(x): + assert (len(x.base) > 0) + return 'foo' + + result = data.groupby(['A', 'B']).agg(bad) + expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') + assert_frame_equal(result, expected) + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = lmap(tuple, df[keys].values) + tups = com._asarray_tuplesafe(tups) + expected = f(df.groupby(tups)[field]) + for k, v in compat.iteritems(expected): + assert (result[k] == v) + + +def test_decons(): + from pandas.core.groupby import decons_group_index, get_group_index + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert (np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( + [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( + [5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + testit(label_list, shape) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s' + ], exit=False) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py new file mode 100644 index 0000000000000..99bea3a10115b --- /dev/null +++ b/pandas/tests/groupby/test_categorical.py @@ -0,0 +1,477 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function +import nose +from numpy import nan + + +from pandas.core.index import Index, MultiIndex, CategoricalIndex +from pandas.core.api import DataFrame, Categorical + +from pandas.core.series import Series + +from pandas.util.testing import (assert_frame_equal, assert_series_equal + ) + +from pandas.compat import (lmap) + +from pandas import compat + +import pandas.core.common as com +import numpy as np + +import pandas.util.testing as tm +import pandas as pd + + +class TestGroupByCategorical(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + self.df_mixed_floats = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), dtype='float32')}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def 
test_apply_use_categorical_name(self): + from pandas import qcut + cats = qcut(self.df.C, 4) + + def get_stats(group): + return {'min': group.min(), + 'max': group.max(), + 'count': group.count(), + 'mean': group.mean()} + + result = self.df.groupby(cats).D.apply(get_stats) + self.assertEqual(result.index.names[0], 'C') + + def test_apply_categorical_data(self): + # GH 10138 + for ordered in [True, False]: + dense = Categorical(list('abc'), ordered=ordered) + # 'b' is in the categories but not in the list + missing = Categorical( + list('aaa'), categories=['a', 'b'], ordered=ordered) + values = np.arange(len(dense)) + df = DataFrame({'missing': missing, + 'dense': dense, + 'values': values}) + grouped = df.groupby(['missing', 'dense']) + + # missing category 'b' should still exist in the output index + idx = MultiIndex.from_product( + [Categorical(['a', 'b'], ordered=ordered), + Categorical(['a', 'b', 'c'], ordered=ordered)], + names=['missing', 'dense']) + expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], + index=idx, + columns=['values']) + + assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) + assert_frame_equal(grouped.mean(), expected) + assert_frame_equal(grouped.agg(np.mean), expected) + + # but for transform we should still get back the original index + idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], + names=['missing', 'dense']) + expected = Series(1, index=idx) + assert_series_equal(grouped.apply(lambda x: 1), expected) + + def test_groupby_categorical(self): + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, ordered=True) + + data = DataFrame(np.random.randn(100, 4)) + + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + exp_idx = CategoricalIndex(levels, categories=cats.categories, + ordered=True) + expected = expected.reindex(exp_idx) + + assert_frame_equal(result, expected) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = np.asarray(cats).take(idx) + ord_data = data.take(idx) + + exp_cats = Categorical(ord_labels, ordered=True, + categories=['foo', 'bar', 'baz', 'qux']) + expected = ord_data.groupby(exp_cats, sort=False).describe() + expected.index.names = [None, None] + assert_frame_equal(desc_result, expected) + + # GH 10460 + expc = Categorical.from_codes(np.arange(4).repeat(8), + levels, ordered=True) + exp = CategoricalIndex(expc) + self.assert_index_equal(desc_result.index.get_level_values(0), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + self.assert_index_equal(desc_result.index.get_level_values(1), exp) + + def test_groupby_datetime_categorical(self): + # GH9049: ensure backward compatibility + levels = pd.date_range('2014-01-01', periods=4) + codes = np.random.randint(0, 4, size=100) + + cats = Categorical.from_codes(codes, levels, ordered=True) + + data = DataFrame(np.random.randn(100, 4)) + result = data.groupby(cats).mean() + + expected = data.groupby(np.asarray(cats)).mean() + expected = expected.reindex(levels) + expected.index = CategoricalIndex(expected.index, + categories=expected.index, + ordered=True) + + assert_frame_equal(result, expected) + + grouped = data.groupby(cats) + desc_result = grouped.describe() + + idx = cats.codes.argsort() + ord_labels = cats.take_nd(idx) + ord_data = data.take(idx) + expected = ord_data.groupby(ord_labels).describe() + expected.index.names = [None, None] + 
assert_frame_equal(desc_result, expected) + tm.assert_index_equal(desc_result.index, expected.index) + tm.assert_index_equal( + desc_result.index.get_level_values(0), + expected.index.get_level_values(0)) + + # GH 10460 + expc = Categorical.from_codes( + np.arange(4).repeat(8), levels, ordered=True) + exp = CategoricalIndex(expc) + self.assert_index_equal(desc_result.index.get_level_values(0), exp) + exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', + '75%', 'max'] * 4) + self.assert_index_equal(desc_result.index.get_level_values(1), exp) + + def test_groupby_categorical_index(self): + + levels = ['foo', 'bar', 'baz', 'qux'] + codes = np.random.randint(0, 4, size=20) + cats = Categorical.from_codes(codes, levels, ordered=True) + df = DataFrame( + np.repeat( + np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) + df['cats'] = cats + + # with a cat index + result = df.set_index('cats').groupby(level=0).sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex( + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') + assert_frame_equal(result, expected) + + # with a cat column, should produce a cat index + result = df.groupby('cats').sum() + expected = df[list('abcd')].groupby(cats.codes).sum() + expected.index = CategoricalIndex( + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') + assert_frame_equal(result, expected) + + def test_groupby_describe_categorical_columns(self): + # GH 11558 + cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], + categories=['foo', 'bar', 'baz', 'qux'], + ordered=True) + df = DataFrame(np.random.randn(20, 4), columns=cats) + result = df.groupby([1, 2, 3, 4] * 5).describe() + + tm.assert_index_equal(result.columns, cats) + tm.assert_categorical_equal(result.columns.values, cats.values) + + def test_groupby_unstack_categorical(self): + # GH11558 (example is taken from the original issue) + df = pd.DataFrame({'a': range(10), + 'medium': ['A', 'B'] * 5, + 'artist': list('XYXXY') * 2}) + df['medium'] = df['medium'].astype('category') + + gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() + result = gcat.describe() + + exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, + name='medium') + tm.assert_index_equal(result.columns, exp_columns) + tm.assert_categorical_equal(result.columns.values, exp_columns.values) + + result = gcat['A'] + gcat['B'] + expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) + tm.assert_series_equal(result, expected) + + def test_groupby_categorical_unequal_len(self): + # GH3011 + series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) + # The raises only happens with categorical, not with series of types + # category + bins = pd.cut(series.dropna().values, 4) + + # len(bins) != len(series) here + self.assertRaises(ValueError, lambda: series.groupby(bins).mean()) + + def test_groupby_categorical_two_columns(self): + + # https://github.com/pandas-dev/pandas/issues/8138 + d = {'cat': + pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], + ordered=True), + 'ints': [1, 1, 2, 2], + 'val': [10, 20, 30, 40]} + test = pd.DataFrame(d) + + # Grouping on a single column + groups_single_key = test.groupby("cat") + res = groups_single_key.agg('mean') + + exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", + ordered=True) + exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, + index=exp_index) + tm.assert_frame_equal(res, exp) + + # Grouping on two columns + 
groups_double_key = test.groupby(["cat", "ints"]) + res = groups_double_key.agg('mean') + exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], + "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], + ordered=True), + "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" + ]) + tm.assert_frame_equal(res, exp) + + # GH 10132 + for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: + c, i = key + result = groups_double_key.get_group(key) + expected = test[(test.cat == c) & (test.ints == i)] + assert_frame_equal(result, expected) + + d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} + test = pd.DataFrame(d) + values = pd.cut(test['C1'], [1, 2, 3, 6]) + values.name = "cat" + groups_double_key = test.groupby([values, 'C2']) + + res = groups_double_key.agg('mean') + nan = np.nan + idx = MultiIndex.from_product( + [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), + [1, 2, 3, 4]], + names=["cat", "C2"]) + exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, + nan, nan, nan, nan, 4, 5], + "C3": [nan, nan, nan, nan, 10, 100, + nan, nan, nan, nan, 200, 34]}, index=idx) + tm.assert_frame_equal(res, exp) + + def test_groupby_multi_categorical_as_index(self): + # GH13204 + df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), + 'A': [10, 11, 11], + 'B': [101, 102, 103]}) + result = df.groupby(['cat', 'A'], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # function grouper + f = lambda r: df.loc[r, 'A'] + result = df.groupby(['cat', f], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # another not in-axis grouper (conflicting names in index) + s = Series(['a', 'b', 'b'], name='cat') + result = df.groupby(['cat', s], as_index=False).sum() + expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10.0, nan, nan, 22.0, nan, nan], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + tm.assert_frame_equal(result, expected) + + # is original index dropped? 
+ expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), + 'A': [10, 11, 10, 11, 10, 11], + 'B': [101.0, nan, nan, 205.0, nan, nan]}, + columns=['cat', 'A', 'B']) + + group_columns = ['cat', 'A'] + + for name in [None, 'X', 'B', 'cat']: + df.index = Index(list("abc"), name=name) + + if name in group_columns and name in df.index.names: + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + result = df.groupby(group_columns, as_index=False).sum() + + else: + result = df.groupby(group_columns, as_index=False).sum() + + tm.assert_frame_equal(result, expected, check_index_type=True) + + def test_groupby_preserve_categorical_dtype(self): + # GH13743, GH13854 + df = DataFrame({'A': [1, 2, 1, 1, 2], + 'B': [10, 16, 22, 28, 34], + 'C1': Categorical(list("abaab"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("abaab"), + categories=list("bac"), + ordered=True)}) + # single grouper + exp_full = DataFrame({'A': [2.0, 1.0, np.nan], + 'B': [25.0, 20.0, np.nan], + 'C1': Categorical(list("bac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bac"), + categories=list("bac"), + ordered=True)}) + for col in ['C1', 'C2']: + result1 = df.groupby(by=col, as_index=False).mean() + result2 = df.groupby(by=col, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + # multiple grouper + exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], + 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, + np.nan], + 'C1': Categorical(list("bacbac"), + categories=list("bac"), + ordered=False), + 'C2': Categorical(list("bacbac"), + categories=list("bac"), + ordered=True)}) + for cols in [['A', 'C1'], ['A', 'C2']]: + result1 = df.groupby(by=cols, as_index=False).mean() + result2 = df.groupby(by=cols, as_index=True).mean().reset_index() + expected = exp_full.reindex(columns=result1.columns) + tm.assert_frame_equal(result1, expected) + tm.assert_frame_equal(result2, expected) + + def test_groupby_categorical_no_compress(self): + data = Series(np.random.randn(9)) + + codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) + + result = data.groupby(cats).mean() + exp = data.groupby(codes).mean() + + exp.index = CategoricalIndex(exp.index, categories=cats.categories, + ordered=cats.ordered) + assert_series_equal(result, exp) + + codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) + cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) + + result = data.groupby(cats).mean() + exp = data.groupby(codes).mean().reindex(cats.categories) + exp.index = CategoricalIndex(exp.index, categories=cats.categories, + ordered=cats.ordered) + assert_series_equal(result, exp) + + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], + categories=["a", "b", "c", "d"], ordered=True) + data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) + + result = data.groupby("b").mean() + result = result["a"].values + exp = np.array([1, 2, 4, np.nan]) + self.assert_numpy_array_equal(result, exp) + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = lmap(tuple, df[keys].values) + tups = com._asarray_tuplesafe(tups) + expected = f(df.groupby(tups)[field]) + for k, v in compat.iteritems(expected): + assert (result[k] == v) + + +def test_decons(): + from pandas.core.groupby import decons_group_index, 
get_group_index + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert (np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( + [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( + [5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + testit(label_list, shape) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s' + ], exit=False) diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py new file mode 100644 index 0000000000000..fb0f52886ec31 --- /dev/null +++ b/pandas/tests/groupby/test_filters.py @@ -0,0 +1,648 @@ +# -*- coding: utf-8 -*- +from __future__ import print_function +import nose + +from numpy import nan + + +from pandas import Timestamp +from pandas.core.index import MultiIndex +from pandas.core.api import DataFrame + +from pandas.core.series import Series + +from pandas.util.testing import (assert_frame_equal, assert_series_equal + ) +from pandas.compat import (lmap) + +from pandas import compat + +import pandas.core.common as com +import numpy as np + +import pandas.util.testing as tm +import pandas as pd + + +class TestGroupByFilter(tm.TestCase): + + _multiprocess_can_split_ = True + + def setUp(self): + self.ts = tm.makeTimeSeries() + + self.seriesd = tm.getSeriesData() + self.tsd = tm.getTimeSeriesData() + self.frame = DataFrame(self.seriesd) + self.tsframe = DataFrame(self.tsd) + + self.df = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.random.randn(8)}) + + self.df_mixed_floats = DataFrame( + {'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], + 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], + 'C': np.random.randn(8), + 'D': np.array( + np.random.randn(8), dtype='float32')}) + + index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', + 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['first', 'second']) + self.mframe = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + + self.three_group = DataFrame( + {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', + 'foo', 'foo', 'foo'], + 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', + 'two', 'two', 'one'], + 'C': ['dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', + 'dull', 'shiny', 'shiny', 'shiny'], + 'D': np.random.randn(11), + 'E': np.random.randn(11), + 'F': np.random.randn(11)}) + + def test_filter_series(self): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + assert_series_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. 
+ assert_series_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(s.index)) + assert_series_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(s.index)) + + def test_filter_single_column_df(self): + df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) + expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) + expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) + grouper = df[0].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10), expected_odd) + assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10), expected_even) + # Test dropna=False. + assert_frame_equal( + grouped.filter(lambda x: x.mean() < 10, dropna=False), + expected_odd.reindex(df.index)) + assert_frame_equal( + grouped.filter(lambda x: x.mean() > 10, dropna=False), + expected_even.reindex(df.index)) + + def test_filter_multi_column_df(self): + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), + expected) + + def test_filter_mixed_df(self): + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 10), expected) + + def test_filter_out_all_groups(self): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + assert_frame_equal( + grouped.filter(lambda x: x['A'].sum() > 1000), df.ix[[]]) + + def test_filter_out_no_groups(self): + s = pd.Series([1, 3, 20, 5, 22, 24, 7]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + filtered = grouped.filter(lambda x: x.mean() > 0) + assert_series_equal(filtered, s) + df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) + grouper = df['A'].apply(lambda x: x % 2) + grouped = df.groupby(grouper) + filtered = grouped.filter(lambda x: x['A'].mean() > 0) + assert_frame_equal(filtered, df) + + def test_filter_out_all_groups_in_df(self): + # GH12768 + df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) + res = df.groupby('a') + res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) + expected = pd.DataFrame({'a': [nan] * 3, 'b': [nan] * 3}) + assert_frame_equal(expected, res) + + df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) + res = df.groupby('a') + res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) + expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") + assert_frame_equal(expected, res) + + def test_filter_condition_raises(self): + def raise_if_sum_is_zero(x): + if x.sum() == 0: + raise ValueError + else: + return x.sum() > 0 + + s = pd.Series([-1, 0, 1, 2]) + grouper = s.apply(lambda x: x % 2) + grouped = s.groupby(grouper) + self.assertRaises(TypeError, + lambda: grouped.filter(raise_if_sum_is_zero)) + + def test_filter_with_axis_in_groupby(self): + # issue 11041 + index = pd.MultiIndex.from_product([range(10), [0, 1]]) + data = pd.DataFrame( + np.arange(100).reshape(-1, 20), 
columns=index, dtype='int64') + result = data.groupby(level=0, + axis=1).filter(lambda x: x.iloc[0, 0] > 10) + expected = data.iloc[:, 12:20] + assert_frame_equal(result, expected) + + def test_filter_bad_shapes(self): + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby('B') + g_s = s.groupby(s) + + f = lambda x: x + self.assertRaises(TypeError, lambda: g_df.filter(f)) + self.assertRaises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: x == 1 + self.assertRaises(TypeError, lambda: g_df.filter(f)) + self.assertRaises(TypeError, lambda: g_s.filter(f)) + + f = lambda x: np.outer(x, x) + self.assertRaises(TypeError, lambda: g_df.filter(f)) + self.assertRaises(TypeError, lambda: g_s.filter(f)) + + def test_filter_nan_is_false(self): + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + s = df['B'] + g_df = df.groupby(df['B']) + g_s = s.groupby(s) + + f = lambda x: np.nan + assert_frame_equal(g_df.filter(f), df.loc[[]]) + assert_series_equal(g_s.filter(f), s[[]]) + + def test_filter_against_workaround(self): + np.random.seed(0) + # Series of ints + s = Series(np.random.randint(0, 100, 1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Series of floats + s = 100 * Series(np.random.random(1000)) + grouper = s.apply(lambda x: np.round(x, -1)) + grouped = s.groupby(grouper) + f = lambda x: x.mean() > 10 + old_way = s[grouped.transform(f).astype('bool')] + new_way = grouped.filter(f) + assert_series_equal(new_way.sort_values(), old_way.sort_values()) + + # Set up DataFrame of ints, floats, strings. + from string import ascii_lowercase + letters = np.array(list(ascii_lowercase)) + N = 1000 + random_letters = letters.take(np.random.randint(0, 26, N)) + df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), + 'floats': N / 10 * Series(np.random.random(N)), + 'letters': Series(random_letters)}) + + # Group by ints; filter on floats. + grouped = df.groupby('ints') + old_way = df[grouped.floats. + transform(lambda x: x.mean() > N / 20).astype('bool')] + new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) + assert_frame_equal(new_way, old_way) + + # Group by floats (rounded); filter on strings. + grouper = df.floats.apply(lambda x: np.round(x, -1)) + grouped = df.groupby(grouper) + old_way = df[grouped.letters. + transform(lambda x: len(x) < N / 10).astype('bool')] + new_way = grouped.filter(lambda x: len(x.letters) < N / 10) + assert_frame_equal(new_way, old_way) + + # Group by strings; filter on ints. + grouped = df.groupby('letters') + old_way = df[grouped.ints. + transform(lambda x: x.mean() > N / 20).astype('bool')] + new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) + assert_frame_equal(new_way, old_way) + + def test_filter_using_len(self): + # BUG GH4447 + df = DataFrame({'A': np.arange(8), + 'B': list('aabbbbcc'), + 'C': np.arange(8)}) + grouped = df.groupby('B') + actual = grouped.filter(lambda x: len(x) > 2) + expected = DataFrame( + {'A': np.arange(2, 6), + 'B': list('bbbb'), + 'C': np.arange(2, 6)}, index=np.arange(2, 6)) + assert_frame_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = df.ix[[]] + assert_frame_equal(actual, expected) + + # Series have always worked properly, but we'll test anyway. 
+ s = df['B'] + grouped = s.groupby(s) + actual = grouped.filter(lambda x: len(x) > 2) + expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') + assert_series_equal(actual, expected) + + actual = grouped.filter(lambda x: len(x) > 4) + expected = s[[]] + assert_series_equal(actual, expected) + + def test_filter_maintains_ordering(self): + # Simple case: index is sequential. #4621 + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + # Now index is sequentially decreasing. + df.index = np.arange(len(df) - 1, -1, -1) + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + # Index is shuffled. + SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] + df.index = df.index[SHUFFLED] + s = df['pid'] + grouped = df.groupby('tag') + actual = grouped.filter(lambda x: len(x) > 1) + expected = df.iloc[[1, 2, 4, 7]] + assert_frame_equal(actual, expected) + + grouped = s.groupby(df['tag']) + actual = grouped.filter(lambda x: len(x) > 1) + expected = s.iloc[[1, 2, 4, 7]] + assert_series_equal(actual, expected) + + def test_filter_multiple_timestamp(self): + # GH 10114 + df = DataFrame({'A': np.arange(5, dtype='int64'), + 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], + 'C': Timestamp('20130101')}) + + grouped = df.groupby(['B', 'C']) + + result = grouped['A'].filter(lambda x: True) + assert_series_equal(df['A'], result) + + result = grouped['A'].transform(len) + expected = Series([2, 3, 2, 3, 3], name='A') + assert_series_equal(result, expected) + + result = grouped.filter(lambda x: True) + assert_frame_equal(df, result) + + result = grouped.transform('sum') + expected = DataFrame({'A': [2, 8, 2, 8, 8]}) + assert_frame_equal(result, expected) + + result = grouped.transform(len) + expected = DataFrame({'A': [2, 3, 2, 3, 3]}) + assert_frame_equal(result, expected) + + def test_filter_and_transform_with_non_unique_int_index(self): + # GH4620 + index = [1, 1, 1, 2, 1, 1, 0, 1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! 
+ assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_multiple_non_unique_int_index(self): + # GH4620 + index = [1, 1, 1, 2, 0, 0, 0, 1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_float_index(self): + # GH4620 + index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! 
+ assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_timestamp_index(self): + # GH4620 + t0 = Timestamp('2013-09-30 00:05:00') + t1 = Timestamp('2013-10-30 00:05:00') + t2 = Timestamp('2013-11-30 00:05:00') + index = [t1, t1, t1, t2, t1, t1, t0, t1] + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_and_transform_with_non_unique_string_index(self): + # GH4620 + index = list('bbbcbbab') + df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], + 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) + grouped_df = df.groupby('tag') + ser = df['pid'] + grouped_ser = ser.groupby(df['tag']) + expected_indexes = [1, 2, 4, 7] + + # Filter DataFrame + actual = grouped_df.filter(lambda x: len(x) > 1) + expected = df.iloc[expected_indexes] + assert_frame_equal(actual, expected) + + actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) + expected = df.copy() + expected.iloc[[0, 3, 5, 6]] = np.nan + assert_frame_equal(actual, expected) + + # Filter Series + actual = grouped_ser.filter(lambda x: len(x) > 1) + expected = ser.take(expected_indexes) + assert_series_equal(actual, expected) + + actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) + NA = np.nan + expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') + # ^ made manually because this can get confusing! + assert_series_equal(actual, expected) + + # Transform Series + actual = grouped_ser.transform(len) + expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') + assert_series_equal(actual, expected) + + # Transform (a column from) DataFrameGroupBy + actual = grouped_df.pid.transform(len) + assert_series_equal(actual, expected) + + def test_filter_has_access_to_grouped_cols(self): + df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + # previously didn't have access to col A #???? 
+ filt = g.filter(lambda x: x['A'].sum() == 2) + assert_frame_equal(filt, df.iloc[[0, 1]]) + + def test_filter_enforces_scalarness(self): + df = pd.DataFrame([ + ['best', 'a', 'x'], + ['worst', 'b', 'y'], + ['best', 'c', 'x'], + ['best', 'd', 'y'], + ['worst', 'd', 'y'], + ['worst', 'd', 'y'], + ['best', 'd', 'z'], + ], columns=['a', 'b', 'c']) + with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'): + df.groupby('c').filter(lambda g: g['a'] == 'best') + + def test_filter_non_bool_raises(self): + df = pd.DataFrame([ + ['best', 'a', 1], + ['worst', 'b', 1], + ['best', 'c', 1], + ['best', 'd', 1], + ['worst', 'd', 1], + ['worst', 'd', 1], + ['best', 'd', 1], + ], columns=['a', 'b', 'c']) + with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'): + df.groupby('a').filter(lambda g: g.c.mean()) + + def test_filter_dropna_with_empty_groups(self): + # GH 10780 + data = pd.Series(np.random.rand(9), index=np.repeat([1, 2, 3], 3)) + groupped = data.groupby(level=0) + result_false = groupped.filter(lambda x: x.mean() > 1, dropna=False) + expected_false = pd.Series([np.nan] * 9, + index=np.repeat([1, 2, 3], 3)) + tm.assert_series_equal(result_false, expected_false) + + result_true = groupped.filter(lambda x: x.mean() > 1, dropna=True) + expected_true = pd.Series(index=pd.Index([], dtype=int)) + tm.assert_series_equal(result_true, expected_true) + + +def assert_fp_equal(a, b): + assert (np.abs(a - b) < 1e-12).all() + + +def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): + tups = lmap(tuple, df[keys].values) + tups = com._asarray_tuplesafe(tups) + expected = f(df.groupby(tups)[field]) + for k, v in compat.iteritems(expected): + assert (result[k] == v) + + +def test_decons(): + from pandas.core.groupby import decons_group_index, get_group_index + + def testit(label_list, shape): + group_index = get_group_index(label_list, shape, sort=True, xnull=True) + label_list2 = decons_group_index(group_index, shape) + + for a, b in zip(label_list, label_list2): + assert (np.array_equal(a, b)) + + shape = (4, 5, 6) + label_list = [np.tile([0, 1, 2, 3, 0, 1, 2, 3], 100), np.tile( + [0, 2, 4, 3, 0, 1, 2, 3], 100), np.tile( + [5, 1, 0, 2, 3, 0, 5, 4], 100)] + testit(label_list, shape) + + shape = (10000, 10000) + label_list = [np.tile(np.arange(10000), 5), np.tile(np.arange(10000), 5)] + testit(label_list, shape) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s' + ], exit=False) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/groupby/test_groupby.py similarity index 81% rename from pandas/tests/test_groupby.py rename to pandas/tests/groupby/test_groupby.py index 7b98a45395752..f8d9d73590a60 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -14,7 +14,6 @@ _lexsort_indexer) from pandas.core.series import Series from pandas.core.config import option_context -from pandas.formats.printing import pprint_thing from pandas.util.testing import (assert_panel_equal, assert_frame_equal, assert_series_equal, assert_almost_equal, assert_index_equal, assertRaisesRegexp) @@ -864,110 +863,6 @@ def f(grp): e.name = None assert_series_equal(result, e) - def test_agg_api(self): - - # GH 6337 - # http://stackoverflow.com/questions/21706030/pandas-groupby-agg-function-column-dtype-error - # different api for agg when passed custom function with mixed frame - - df = DataFrame({'data1': np.random.randn(5), - 'data2': np.random.randn(5), - 'key1': ['a', 'a', 'b', 'b', 'a'], - 'key2': ['one', 
'two', 'one', 'two', 'one']}) - grouped = df.groupby('key1') - - def peak_to_peak(arr): - return arr.max() - arr.min() - - expected = grouped.agg([peak_to_peak]) - expected.columns = ['data1', 'data2'] - result = grouped.agg(peak_to_peak) - assert_frame_equal(result, expected) - - def test_agg_regression1(self): - grouped = self.tsframe.groupby([lambda x: x.year, lambda x: x.month]) - result = grouped.agg(np.mean) - expected = grouped.mean() - assert_frame_equal(result, expected) - - def test_agg_datetimes_mixed(self): - data = [[1, '2012-01-01', 1.0], [2, '2012-01-02', 2.0], [3, None, 3.0]] - - df1 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - data = [[row[0], datetime.strptime(row[1], '%Y-%m-%d').date() if row[1] - else None, row[2]] for row in data] - - df2 = DataFrame({'key': [x[0] for x in data], - 'date': [x[1] for x in data], - 'value': [x[2] for x in data]}) - - df1['weights'] = df1['value'] / df1['value'].sum() - gb1 = df1.groupby('date').aggregate(np.sum) - - df2['weights'] = df1['value'] / df1['value'].sum() - gb2 = df2.groupby('date').aggregate(np.sum) - - assert (len(gb1) == len(gb2)) - - def test_agg_period_index(self): - from pandas import period_range, PeriodIndex - prng = period_range('2012-1-1', freq='M', periods=3) - df = DataFrame(np.random.randn(3, 2), index=prng) - rs = df.groupby(level=0).sum() - tm.assertIsInstance(rs.index, PeriodIndex) - - # GH 3579 - index = period_range(start='1999-01', periods=5, freq='M') - s1 = Series(np.random.rand(len(index)), index=index) - s2 = Series(np.random.rand(len(index)), index=index) - series = [('s1', s1), ('s2', s2)] - df = DataFrame.from_items(series) - grouped = df.groupby(df.index.month) - list(grouped) - - def test_agg_dict_parameter_cast_result_dtypes(self): - # GH 12821 - - df = DataFrame( - {'class': ['A', 'A', 'B', 'B', 'C', 'C', 'D', 'D'], - 'time': date_range('1/1/2011', periods=8, freq='H')}) - df.loc[[0, 1, 2, 5], 'time'] = None - - # test for `first` function - exp = df.loc[[0, 3, 4, 6]].set_index('class') - grouped = df.groupby('class') - assert_frame_equal(grouped.first(), exp) - assert_frame_equal(grouped.agg('first'), exp) - assert_frame_equal(grouped.agg({'time': 'first'}), exp) - assert_series_equal(grouped.time.first(), exp['time']) - assert_series_equal(grouped.time.agg('first'), exp['time']) - - # test for `last` function - exp = df.loc[[0, 3, 4, 7]].set_index('class') - grouped = df.groupby('class') - assert_frame_equal(grouped.last(), exp) - assert_frame_equal(grouped.agg('last'), exp) - assert_frame_equal(grouped.agg({'time': 'last'}), exp) - assert_series_equal(grouped.time.last(), exp['time']) - assert_series_equal(grouped.time.agg('last'), exp['time']) - - def test_agg_must_agg(self): - grouped = self.df.groupby('A')['C'] - self.assertRaises(Exception, grouped.agg, lambda x: x.describe()) - self.assertRaises(Exception, grouped.agg, lambda x: x.index[:2]) - - def test_agg_ser_multi_key(self): - # TODO(wesm): unused - ser = self.df.C # noqa - - f = lambda x: x.sum() - results = self.df.C.groupby([self.df.A, self.df.B]).aggregate(f) - expected = self.df.groupby(['A', 'B']).sum()['C'] - assert_series_equal(results, expected) - def test_get_group(self): wp = tm.makePanel() grouped = wp.groupby(lambda x: x.month, axis='major') @@ -1034,58 +929,11 @@ def test_get_group_grouped_by_tuple(self): expected = DataFrame({'ids': [(dt[0], ), (dt[0], )]}, index=[0, 2]) assert_frame_equal(result, expected) - def test_agg_apply_corner(self): - # nothing 
to group, all NA - grouped = self.ts.groupby(self.ts * np.nan) - self.assertEqual(self.ts.dtype, np.float64) - - # groupby float64 values results in Float64Index - exp = Series([], dtype=np.float64, index=pd.Index( - [], dtype=np.float64)) - assert_series_equal(grouped.sum(), exp) - assert_series_equal(grouped.agg(np.sum), exp) - assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False) - - # DataFrame - grouped = self.tsframe.groupby(self.tsframe['A'] * np.nan) - exp_df = DataFrame(columns=self.tsframe.columns, dtype=float, - index=pd.Index([], dtype=np.float64)) - assert_frame_equal(grouped.sum(), exp_df, check_names=False) - assert_frame_equal(grouped.agg(np.sum), exp_df, check_names=False) - assert_frame_equal(grouped.apply(np.sum), exp_df.iloc[:, :0], - check_names=False) - - def test_agg_grouping_is_list_tuple(self): - from pandas.core.groupby import Grouping - - df = tm.makeTimeDataFrame() - - grouped = df.groupby(lambda x: x.year) - grouper = grouped.grouper.groupings[0].grouper - grouped.grouper.groupings[0] = Grouping(self.ts.index, list(grouper)) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - - grouped.grouper.groupings[0] = Grouping(self.ts.index, tuple(grouper)) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - def test_grouping_error_on_multidim_input(self): from pandas.core.groupby import Grouping self.assertRaises(ValueError, Grouping, self.df.index, self.df[['A', 'A']]) - def test_agg_python_multiindex(self): - grouped = self.mframe.groupby(['A', 'B']) - - result = grouped.agg(np.mean) - expected = grouped.mean() - tm.assert_frame_equal(result, expected) - def test_apply_describe_bug(self): grouped = self.mframe.groupby(level='first') grouped.describe() # it works! 
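The hunk below drops the string- and dict-spec aggregation tests from test_groupby.py; they are re-added unchanged in the new pandas/tests/groupby modules earlier in this patch. As a reminder of the API those tests exercise, here is a minimal sketch on a toy frame (the frame and column names are illustrative, not taken from the tests):

    import numpy as np
    import pandas as pd

    # illustrative data, not from the test suite
    df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar'],
                       'B': np.arange(4.0),
                       'C': np.arange(4.0) * 2})
    grouped = df.groupby('A')

    # a function name aggregates every non-grouped column the same way
    result = grouped.agg('mean')                 # same values as grouped.mean()

    # a dict maps column name -> aggregation (a name, a callable, or a list of them)
    result = grouped.agg({'B': 'sum', 'C': np.mean})

Passing a list of aggregations per column produces MultiIndex result columns, which is what test_aggregate_api_consistency verifies against MultiIndex.from_product.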
@@ -1185,80 +1033,6 @@ def test_groups(self): self.assertTrue((self.df.ix[v]['A'] == k[0]).all()) self.assertTrue((self.df.ix[v]['B'] == k[1]).all()) - def test_aggregate_str_func(self): - def _check_results(grouped): - # single series - result = grouped['A'].agg('std') - expected = grouped['A'].std() - assert_series_equal(result, expected) - - # group frame by function name - result = grouped.aggregate('var') - expected = grouped.var() - assert_frame_equal(result, expected) - - # group frame by function dict - result = grouped.agg(OrderedDict([['A', 'var'], ['B', 'std'], - ['C', 'mean'], ['D', 'sem']])) - expected = DataFrame(OrderedDict([['A', grouped['A'].var( - )], ['B', grouped['B'].std()], ['C', grouped['C'].mean()], - ['D', grouped['D'].sem()]])) - assert_frame_equal(result, expected) - - by_weekday = self.tsframe.groupby(lambda x: x.weekday()) - _check_results(by_weekday) - - by_mwkday = self.tsframe.groupby([lambda x: x.month, - lambda x: x.weekday()]) - _check_results(by_mwkday) - - def test_aggregate_item_by_item(self): - - df = self.df.copy() - df['E'] = ['a'] * len(self.df) - grouped = self.df.groupby('A') - - # API change in 0.11 - # def aggfun(ser): - # return len(ser + 'a') - # result = grouped.agg(aggfun) - # self.assertEqual(len(result.columns), 1) - - aggfun = lambda ser: ser.size - result = grouped.agg(aggfun) - foo = (self.df.A == 'foo').sum() - bar = (self.df.A == 'bar').sum() - K = len(result.columns) - - # GH5782 - # odd comparisons can result here, so cast to make easy - exp = pd.Series(np.array([foo] * K), index=list('BCD'), - dtype=np.float64, name='foo') - tm.assert_series_equal(result.xs('foo'), exp) - - exp = pd.Series(np.array([bar] * K), index=list('BCD'), - dtype=np.float64, name='bar') - tm.assert_almost_equal(result.xs('bar'), exp) - - def aggfun(ser): - return ser.size - - result = DataFrame().groupby(self.df.A).agg(aggfun) - tm.assertIsInstance(result, DataFrame) - self.assertEqual(len(result), 0) - - def test_agg_item_by_item_raise_typeerror(self): - from numpy.random import randint - - df = DataFrame(randint(10, size=(20, 10))) - - def raiseException(df): - pprint_thing('----------------------------------------') - pprint_thing(df.to_string()) - raise TypeError - - self.assertRaises(TypeError, df.groupby(0).agg, raiseException) - def test_basic_regression(self): # regression T = [1.0 * x for x in lrange(1, 10) * 10][:1095] @@ -1600,6 +1374,15 @@ def test_groupby_transform_with_int(self): expected = DataFrame(dict(B=1, C=[2, 3, 4, 10, 5, -1])) assert_frame_equal(result, expected) + def test_groupby_transform_with_nan_group(self): + # GH 9941 + df = pd.DataFrame({'a': range(10), + 'b': [1, 1, 2, 3, np.nan, 4, 4, 5, 5, 5]}) + result = df.groupby(df.b)['a'].transform(max) + expected = pd.Series([1., 1., 2., 3., np.nan, 6., 6., 9., 9., 9.], + name='a') + assert_series_equal(result, expected) + def test_indices_concatenation_order(self): # GH 2808 @@ -1687,34 +1470,6 @@ def test_series_describe_single(self): expected = grouped.describe() assert_series_equal(result, expected) - def test_series_agg_multikey(self): - ts = tm.makeTimeSeries() - grouped = ts.groupby([lambda x: x.year, lambda x: x.month]) - - result = grouped.agg(np.sum) - expected = grouped.sum() - assert_series_equal(result, expected) - - def test_series_agg_multi_pure_python(self): - data = DataFrame( - {'A': ['foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', - 'foo', 'foo', 'foo'], - 'B': ['one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', - 'two', 'two', 'one'], - 'C': ['dull', 
'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', - 'dull', 'shiny', 'shiny', 'shiny'], - 'D': np.random.randn(11), - 'E': np.random.randn(11), - 'F': np.random.randn(11)}) - - def bad(x): - assert (len(x.base) > 0) - return 'foo' - - result = data.groupby(['A', 'B']).agg(bad) - expected = data.groupby(['A', 'B']).agg(lambda x: 'foo') - assert_frame_equal(result, expected) - def test_series_index_name(self): grouped = self.df.ix[:, ['C']].groupby(self.df['A']) result = grouped.agg(lambda x: x.mean()) @@ -1828,138 +1583,6 @@ def test_frame_set_name_single(self): result = grouped['C'].agg({'foo': np.mean, 'bar': np.std}) self.assertEqual(result.index.name, 'A') - def test_aggregate_api_consistency(self): - # GH 9052 - # make sure that the aggregates via dict - # are consistent - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - grouped = df.groupby(['A', 'B']) - c_mean = grouped['C'].mean() - c_sum = grouped['C'].sum() - d_mean = grouped['D'].mean() - d_sum = grouped['D'].sum() - - result = grouped['D'].agg(['sum', 'mean']) - expected = pd.concat([d_sum, d_mean], - axis=1) - expected.columns = ['sum', 'mean'] - assert_frame_equal(result, expected, check_like=True) - - result = grouped.agg([np.sum, np.mean]) - expected = pd.concat([c_sum, - c_mean, - d_sum, - d_mean], - axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['sum', 'mean']]) - assert_frame_equal(result, expected, check_like=True) - - result = grouped[['D', 'C']].agg([np.sum, np.mean]) - expected = pd.concat([d_sum, - d_mean, - c_sum, - c_mean], - axis=1) - expected.columns = MultiIndex.from_product([['D', 'C'], - ['sum', 'mean']]) - assert_frame_equal(result, expected, check_like=True) - - result = grouped.agg({'C': 'mean', 'D': 'sum'}) - expected = pd.concat([d_sum, - c_mean], - axis=1) - assert_frame_equal(result, expected, check_like=True) - - result = grouped.agg({'C': ['mean', 'sum'], - 'D': ['mean', 'sum']}) - expected = pd.concat([c_mean, - c_sum, - d_mean, - d_sum], - axis=1) - expected.columns = MultiIndex.from_product([['C', 'D'], - ['mean', 'sum']]) - - result = grouped[['D', 'C']].agg({'r': np.sum, - 'r2': np.mean}) - expected = pd.concat([d_sum, - c_sum, - d_mean, - c_mean], - axis=1) - expected.columns = MultiIndex.from_product([['r', 'r2'], - ['D', 'C']]) - assert_frame_equal(result, expected, check_like=True) - - def test_agg_compat(self): - - # GH 12334 - - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = df.groupby(['A', 'B']) - - expected = pd.concat([g['D'].sum(), - g['D'].std()], - axis=1) - expected.columns = MultiIndex.from_tuples([('C', 'sum'), - ('C', 'std')]) - result = g['D'].agg({'C': ['sum', 'std']}) - assert_frame_equal(result, expected, check_like=True) - - expected = pd.concat([g['D'].sum(), - g['D'].std()], - axis=1) - expected.columns = ['C', 'D'] - result = g['D'].agg({'C': 'sum', 'D': 'std'}) - assert_frame_equal(result, expected, check_like=True) - - def test_agg_nested_dicts(self): - - # API change for disallowing these types of nested dicts - df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar', - 'foo', 'bar', 'foo', 'foo'], - 'B': ['one', 'one', 'two', 'two', - 'two', 'two', 'one', 'two'], - 'C': np.random.randn(8) + 1.0, - 'D': np.arange(8)}) - - g = 
df.groupby(['A', 'B']) - - def f(): - g.aggregate({'r1': {'C': ['mean', 'sum']}, - 'r2': {'D': ['mean', 'sum']}}) - - self.assertRaises(SpecificationError, f) - - result = g.agg({'C': {'ra': ['mean', 'std']}, - 'D': {'rb': ['mean', 'std']}}) - expected = pd.concat([g['C'].mean(), g['C'].std(), g['D'].mean(), - g['D'].std()], axis=1) - expected.columns = pd.MultiIndex.from_tuples([('ra', 'mean'), ( - 'ra', 'std'), ('rb', 'mean'), ('rb', 'std')]) - assert_frame_equal(result, expected, check_like=True) - - # same name as the original column - # GH9052 - expected = g['D'].agg({'result1': np.sum, 'result2': np.mean}) - expected = expected.rename(columns={'result1': 'D'}) - result = g['D'].agg({'D': np.sum, 'result2': np.mean}) - assert_frame_equal(result, expected, check_like=True) - def test_multi_iter(self): s = Series(np.arange(6)) k1 = np.array(['a', 'a', 'a', 'b', 'b', 'b']) @@ -3351,51 +2974,6 @@ def filt2(x): result = data.groupby('id_field').apply(filt2) assert_frame_equal(result, expected) - def test_apply_use_categorical_name(self): - from pandas import qcut - cats = qcut(self.df.C, 4) - - def get_stats(group): - return {'min': group.min(), - 'max': group.max(), - 'count': group.count(), - 'mean': group.mean()} - - result = self.df.groupby(cats).D.apply(get_stats) - self.assertEqual(result.index.names[0], 'C') - - def test_apply_categorical_data(self): - # GH 10138 - for ordered in [True, False]: - dense = Categorical(list('abc'), ordered=ordered) - # 'b' is in the categories but not in the list - missing = Categorical( - list('aaa'), categories=['a', 'b'], ordered=ordered) - values = np.arange(len(dense)) - df = DataFrame({'missing': missing, - 'dense': dense, - 'values': values}) - grouped = df.groupby(['missing', 'dense']) - - # missing category 'b' should still exist in the output index - idx = MultiIndex.from_product( - [Categorical(['a', 'b'], ordered=ordered), - Categorical(['a', 'b', 'c'], ordered=ordered)], - names=['missing', 'dense']) - expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan], - index=idx, - columns=['values']) - - assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected) - assert_frame_equal(grouped.mean(), expected) - assert_frame_equal(grouped.agg(np.mean), expected) - - # but for transform we should still get back the original index - idx = MultiIndex.from_product([['a'], ['a', 'b', 'c']], - names=['missing', 'dense']) - expected = Series(1, index=idx) - assert_series_equal(grouped.apply(lambda x: 1), expected) - def test_apply_corner_cases(self): # #535, can't use sliding iterator @@ -4342,142 +3920,6 @@ def test_groupby_sort_multiindex_series(self): result = mseries.groupby(level=['a', 'b'], sort=True).first() assert_series_equal(result, mseries_result.sort_index()) - def test_groupby_categorical(self): - levels = ['foo', 'bar', 'baz', 'qux'] - codes = np.random.randint(0, 4, size=100) - - cats = Categorical.from_codes(codes, levels, ordered=True) - - data = DataFrame(np.random.randn(100, 4)) - - result = data.groupby(cats).mean() - - expected = data.groupby(np.asarray(cats)).mean() - exp_idx = CategoricalIndex(levels, categories=cats.categories, - ordered=True) - expected = expected.reindex(exp_idx) - - assert_frame_equal(result, expected) - - grouped = data.groupby(cats) - desc_result = grouped.describe() - - idx = cats.codes.argsort() - ord_labels = np.asarray(cats).take(idx) - ord_data = data.take(idx) - - exp_cats = Categorical(ord_labels, ordered=True, - categories=['foo', 'bar', 'baz', 'qux']) - expected = ord_data.groupby(exp_cats, 
sort=False).describe() - expected.index.names = [None, None] - assert_frame_equal(desc_result, expected) - - # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), - levels, ordered=True) - exp = CategoricalIndex(expc) - self.assert_index_equal(desc_result.index.get_level_values(0), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - self.assert_index_equal(desc_result.index.get_level_values(1), exp) - - def test_groupby_datetime_categorical(self): - # GH9049: ensure backward compatibility - levels = pd.date_range('2014-01-01', periods=4) - codes = np.random.randint(0, 4, size=100) - - cats = Categorical.from_codes(codes, levels, ordered=True) - - data = DataFrame(np.random.randn(100, 4)) - result = data.groupby(cats).mean() - - expected = data.groupby(np.asarray(cats)).mean() - expected = expected.reindex(levels) - expected.index = CategoricalIndex(expected.index, - categories=expected.index, - ordered=True) - - assert_frame_equal(result, expected) - - grouped = data.groupby(cats) - desc_result = grouped.describe() - - idx = cats.codes.argsort() - ord_labels = cats.take_nd(idx) - ord_data = data.take(idx) - expected = ord_data.groupby(ord_labels).describe() - expected.index.names = [None, None] - assert_frame_equal(desc_result, expected) - tm.assert_index_equal(desc_result.index, expected.index) - tm.assert_index_equal( - desc_result.index.get_level_values(0), - expected.index.get_level_values(0)) - - # GH 10460 - expc = Categorical.from_codes( - np.arange(4).repeat(8), levels, ordered=True) - exp = CategoricalIndex(expc) - self.assert_index_equal(desc_result.index.get_level_values(0), exp) - exp = Index(['count', 'mean', 'std', 'min', '25%', '50%', - '75%', 'max'] * 4) - self.assert_index_equal(desc_result.index.get_level_values(1), exp) - - def test_groupby_categorical_index(self): - - levels = ['foo', 'bar', 'baz', 'qux'] - codes = np.random.randint(0, 4, size=20) - cats = Categorical.from_codes(codes, levels, ordered=True) - df = DataFrame( - np.repeat( - np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) - df['cats'] = cats - - # with a cat index - result = df.set_index('cats').groupby(level=0).sum() - expected = df[list('abcd')].groupby(cats.codes).sum() - expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') - assert_frame_equal(result, expected) - - # with a cat column, should produce a cat index - result = df.groupby('cats').sum() - expected = df[list('abcd')].groupby(cats.codes).sum() - expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') - assert_frame_equal(result, expected) - - def test_groupby_describe_categorical_columns(self): - # GH 11558 - cats = pd.CategoricalIndex(['qux', 'foo', 'baz', 'bar'], - categories=['foo', 'bar', 'baz', 'qux'], - ordered=True) - df = DataFrame(np.random.randn(20, 4), columns=cats) - result = df.groupby([1, 2, 3, 4] * 5).describe() - - tm.assert_index_equal(result.columns, cats) - tm.assert_categorical_equal(result.columns.values, cats.values) - - def test_groupby_unstack_categorical(self): - # GH11558 (example is taken from the original issue) - df = pd.DataFrame({'a': range(10), - 'medium': ['A', 'B'] * 5, - 'artist': list('XYXXY') * 2}) - df['medium'] = df['medium'].astype('category') - - gcat = df.groupby(['artist', 'medium'])['a'].count().unstack() - result = gcat.describe() - - exp_columns = pd.CategoricalIndex(['A', 'B'], ordered=False, - name='medium') - 
tm.assert_index_equal(result.columns, exp_columns) - tm.assert_categorical_equal(result.columns.values, exp_columns.values) - - result = gcat['A'] + gcat['B'] - expected = pd.Series([6, 4], index=pd.Index(['X', 'Y'], name='artist')) - tm.assert_series_equal(result, expected) - def test_groupby_groups_datetimeindex(self): # #1430 from pandas.tseries.api import DatetimeIndex @@ -4695,37 +4137,6 @@ def test_median_empty_bins(self): expected = df.groupby(bins).agg(lambda x: x.median()) assert_frame_equal(result, expected) - def test_groupby_categorical_no_compress(self): - data = Series(np.random.randn(9)) - - codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) - cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) - - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean() - - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) - assert_series_equal(result, exp) - - codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) - cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) - - result = data.groupby(cats).mean() - exp = data.groupby(codes).mean().reindex(cats.categories) - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) - assert_series_equal(result, exp) - - cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], - categories=["a", "b", "c", "d"], ordered=True) - data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) - - result = data.groupby("b").mean() - result = result["a"].values - exp = np.array([1, 2, 4, np.nan]) - self.assert_numpy_array_equal(result, exp) - def test_groupby_non_arithmetic_agg_types(self): # GH9311, GH6620 df = pd.DataFrame([{'a': 1, @@ -4837,16 +4248,6 @@ def test_groupby_datetime64_32_bit(self): expected = Series([pd.Timestamp('2000-01-1')] * 2, name='B') assert_series_equal(result, expected) - def test_groupby_categorical_unequal_len(self): - # GH3011 - series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4]) - # The raises only happens with categorical, not with series of types - # category - bins = pd.cut(series.dropna().values, 4) - - # len(bins) != len(series) here - self.assertRaises(ValueError, lambda: series.groupby(bins).mean()) - def test_groupby_multiindex_missing_pair(self): # GH9049 df = DataFrame({'group1': ['a', 'a', 'a', 'b'], @@ -5444,534 +4845,6 @@ def test_cumcount_groupby_not_col(self): assert_series_equal(expected, g.cumcount()) assert_series_equal(expected, sg.cumcount()) - def test_filter_series(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.Series([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.Series([20, 22, 24], index=[2, 4, 5]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - assert_series_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - assert_series_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) - # Test dropna=False. 
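A minimal illustrative sketch of the dropna=False behaviour checked next, using the same toy data as this test (names and values mirror the test above, the snippet itself is not part of the patch): filtered-out positions are kept and filled with NaN instead of being dropped, so the result retains the original index.

    import pandas as pd
    s = pd.Series([1, 3, 20, 5, 22, 24, 7])
    grouped = s.groupby(s % 2)                              # odd values vs. even values
    grouped.filter(lambda x: x.mean() > 10)                 # -> 20, 22, 24 at positions 2, 4, 5
    grouped.filter(lambda x: x.mean() > 10, dropna=False)   # -> same length as s, NaN at the odd positions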
- assert_series_equal( - grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(s.index)) - assert_series_equal( - grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(s.index)) - - def test_filter_single_column_df(self): - df = pd.DataFrame([1, 3, 20, 5, 22, 24, 7]) - expected_odd = pd.DataFrame([1, 3, 5, 7], index=[0, 1, 3, 6]) - expected_even = pd.DataFrame([20, 22, 24], index=[2, 4, 5]) - grouper = df[0].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10), expected_odd) - assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10), expected_even) - # Test dropna=False. - assert_frame_equal( - grouped.filter(lambda x: x.mean() < 10, dropna=False), - expected_odd.reindex(df.index)) - assert_frame_equal( - grouped.filter(lambda x: x.mean() > 10, dropna=False), - expected_even.reindex(df.index)) - - def test_filter_multi_column_df(self): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': [1, 1, 1, 1]}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': [1, 1]}, index=[1, 2]) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() - x['B'].sum() > 10), - expected) - - def test_filter_mixed_df(self): - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - expected = pd.DataFrame({'A': [12, 12], 'B': ['b', 'c']}, index=[1, 2]) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 10), expected) - - def test_filter_out_all_groups(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - assert_series_equal(grouped.filter(lambda x: x.mean() > 1000), s[[]]) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - assert_frame_equal( - grouped.filter(lambda x: x['A'].sum() > 1000), df.ix[[]]) - - def test_filter_out_no_groups(self): - s = pd.Series([1, 3, 20, 5, 22, 24, 7]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - filtered = grouped.filter(lambda x: x.mean() > 0) - assert_series_equal(filtered, s) - df = pd.DataFrame({'A': [1, 12, 12, 1], 'B': 'a b c d'.split()}) - grouper = df['A'].apply(lambda x: x % 2) - grouped = df.groupby(grouper) - filtered = grouped.filter(lambda x: x['A'].mean() > 0) - assert_frame_equal(filtered, df) - - def test_filter_out_all_groups_in_df(self): - # GH12768 - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=False) - expected = pd.DataFrame({'a': [nan] * 3, 'b': [nan] * 3}) - assert_frame_equal(expected, res) - - df = pd.DataFrame({'a': [1, 1, 2], 'b': [1, 2, 0]}) - res = df.groupby('a') - res = res.filter(lambda x: x['b'].sum() > 5, dropna=True) - expected = pd.DataFrame({'a': [], 'b': []}, dtype="int64") - assert_frame_equal(expected, res) - - def test_filter_condition_raises(self): - def raise_if_sum_is_zero(x): - if x.sum() == 0: - raise ValueError - else: - return x.sum() > 0 - - s = pd.Series([-1, 0, 1, 2]) - grouper = s.apply(lambda x: x % 2) - grouped = s.groupby(grouper) - self.assertRaises(TypeError, - lambda: grouped.filter(raise_if_sum_is_zero)) - - def test_filter_with_axis_in_groupby(self): - # issue 11041 - index = pd.MultiIndex.from_product([range(10), [0, 1]]) - data = pd.DataFrame( - np.arange(100).reshape(-1, 20), 
columns=index, dtype='int64') - result = data.groupby(level=0, - axis=1).filter(lambda x: x.iloc[0, 0] > 10) - expected = data.iloc[:, 12:20] - assert_frame_equal(result, expected) - - def test_filter_bad_shapes(self): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby('B') - g_s = s.groupby(s) - - f = lambda x: x - self.assertRaises(TypeError, lambda: g_df.filter(f)) - self.assertRaises(TypeError, lambda: g_s.filter(f)) - - f = lambda x: x == 1 - self.assertRaises(TypeError, lambda: g_df.filter(f)) - self.assertRaises(TypeError, lambda: g_s.filter(f)) - - f = lambda x: np.outer(x, x) - self.assertRaises(TypeError, lambda: g_df.filter(f)) - self.assertRaises(TypeError, lambda: g_s.filter(f)) - - def test_filter_nan_is_false(self): - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - s = df['B'] - g_df = df.groupby(df['B']) - g_s = s.groupby(s) - - f = lambda x: np.nan - assert_frame_equal(g_df.filter(f), df.loc[[]]) - assert_series_equal(g_s.filter(f), s[[]]) - - def test_filter_against_workaround(self): - np.random.seed(0) - # Series of ints - s = Series(np.random.randint(0, 100, 1000)) - grouper = s.apply(lambda x: np.round(x, -1)) - grouped = s.groupby(grouper) - f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] - new_way = grouped.filter(f) - assert_series_equal(new_way.sort_values(), old_way.sort_values()) - - # Series of floats - s = 100 * Series(np.random.random(1000)) - grouper = s.apply(lambda x: np.round(x, -1)) - grouped = s.groupby(grouper) - f = lambda x: x.mean() > 10 - old_way = s[grouped.transform(f).astype('bool')] - new_way = grouped.filter(f) - assert_series_equal(new_way.sort_values(), old_way.sort_values()) - - # Set up DataFrame of ints, floats, strings. - from string import ascii_lowercase - letters = np.array(list(ascii_lowercase)) - N = 1000 - random_letters = letters.take(np.random.randint(0, 26, N)) - df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), - 'floats': N / 10 * Series(np.random.random(N)), - 'letters': Series(random_letters)}) - - # Group by ints; filter on floats. - grouped = df.groupby('ints') - old_way = df[grouped.floats. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) - assert_frame_equal(new_way, old_way) - - # Group by floats (rounded); filter on strings. - grouper = df.floats.apply(lambda x: np.round(x, -1)) - grouped = df.groupby(grouper) - old_way = df[grouped.letters. - transform(lambda x: len(x) < N / 10).astype('bool')] - new_way = grouped.filter(lambda x: len(x.letters) < N / 10) - assert_frame_equal(new_way, old_way) - - # Group by strings; filter on ints. - grouped = df.groupby('letters') - old_way = df[grouped.ints. - transform(lambda x: x.mean() > N / 20).astype('bool')] - new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) - assert_frame_equal(new_way, old_way) - - def test_filter_using_len(self): - # BUG GH4447 - df = DataFrame({'A': np.arange(8), - 'B': list('aabbbbcc'), - 'C': np.arange(8)}) - grouped = df.groupby('B') - actual = grouped.filter(lambda x: len(x) > 2) - expected = DataFrame( - {'A': np.arange(2, 6), - 'B': list('bbbb'), - 'C': np.arange(2, 6)}, index=np.arange(2, 6)) - assert_frame_equal(actual, expected) - - actual = grouped.filter(lambda x: len(x) > 4) - expected = df.ix[[]] - assert_frame_equal(actual, expected) - - # Series have always worked properly, but we'll test anyway. 
- s = df['B'] - grouped = s.groupby(s) - actual = grouped.filter(lambda x: len(x) > 2) - expected = Series(4 * ['b'], index=np.arange(2, 6), name='B') - assert_series_equal(actual, expected) - - actual = grouped.filter(lambda x: len(x) > 4) - expected = s[[]] - assert_series_equal(actual, expected) - - def test_filter_maintains_ordering(self): - # Simple case: index is sequential. #4621 - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}) - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - # Now index is sequentially decreasing. - df.index = np.arange(len(df) - 1, -1, -1) - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - # Index is shuffled. - SHUFFLED = [4, 6, 7, 2, 1, 0, 5, 3] - df.index = df.index[SHUFFLED] - s = df['pid'] - grouped = df.groupby('tag') - actual = grouped.filter(lambda x: len(x) > 1) - expected = df.iloc[[1, 2, 4, 7]] - assert_frame_equal(actual, expected) - - grouped = s.groupby(df['tag']) - actual = grouped.filter(lambda x: len(x) > 1) - expected = s.iloc[[1, 2, 4, 7]] - assert_series_equal(actual, expected) - - def test_filter_multiple_timestamp(self): - # GH 10114 - df = DataFrame({'A': np.arange(5, dtype='int64'), - 'B': ['foo', 'bar', 'foo', 'bar', 'bar'], - 'C': Timestamp('20130101')}) - - grouped = df.groupby(['B', 'C']) - - result = grouped['A'].filter(lambda x: True) - assert_series_equal(df['A'], result) - - result = grouped['A'].transform(len) - expected = Series([2, 3, 2, 3, 3], name='A') - assert_series_equal(result, expected) - - result = grouped.filter(lambda x: True) - assert_frame_equal(df, result) - - result = grouped.transform('sum') - expected = DataFrame({'A': [2, 8, 2, 8, 8]}) - assert_frame_equal(result, expected) - - result = grouped.transform(len) - expected = DataFrame({'A': [2, 3, 2, 3, 3]}) - assert_frame_equal(result, expected) - - def test_filter_and_transform_with_non_unique_int_index(self): - # GH4620 - index = [1, 1, 1, 2, 1, 1, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! 
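# (tags 45 and 62 are the only duplicated tags here, so only positions 1, 2, 4 and 7 pass len(x) > 1; with dropna=False the remaining positions become NaN)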
- assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_multiple_non_unique_int_index(self): - # GH4620 - index = [1, 1, 1, 2, 0, 0, 0, 1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_float_index(self): - # GH4620 - index = np.array([1, 1, 1, 2, 1, 1, 0, 1], dtype=float) - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! 
- assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_timestamp_index(self): - # GH4620 - t0 = Timestamp('2013-09-30 00:05:00') - t1 = Timestamp('2013-10-30 00:05:00') - t2 = Timestamp('2013-11-30 00:05:00') - index = [t1, t1, t1, t2, t1, t1, t0, t1] - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_and_transform_with_non_unique_string_index(self): - # GH4620 - index = list('bbbcbbab') - df = DataFrame({'pid': [1, 1, 1, 2, 2, 3, 3, 3], - 'tag': [23, 45, 62, 24, 45, 34, 25, 62]}, index=index) - grouped_df = df.groupby('tag') - ser = df['pid'] - grouped_ser = ser.groupby(df['tag']) - expected_indexes = [1, 2, 4, 7] - - # Filter DataFrame - actual = grouped_df.filter(lambda x: len(x) > 1) - expected = df.iloc[expected_indexes] - assert_frame_equal(actual, expected) - - actual = grouped_df.filter(lambda x: len(x) > 1, dropna=False) - expected = df.copy() - expected.iloc[[0, 3, 5, 6]] = np.nan - assert_frame_equal(actual, expected) - - # Filter Series - actual = grouped_ser.filter(lambda x: len(x) > 1) - expected = ser.take(expected_indexes) - assert_series_equal(actual, expected) - - actual = grouped_ser.filter(lambda x: len(x) > 1, dropna=False) - NA = np.nan - expected = Series([NA, 1, 1, NA, 2, NA, NA, 3], index, name='pid') - # ^ made manually because this can get confusing! - assert_series_equal(actual, expected) - - # Transform Series - actual = grouped_ser.transform(len) - expected = Series([1, 2, 2, 1, 2, 1, 1, 2], index, name='pid') - assert_series_equal(actual, expected) - - # Transform (a column from) DataFrameGroupBy - actual = grouped_df.pid.transform(len) - assert_series_equal(actual, expected) - - def test_filter_has_access_to_grouped_cols(self): - df = DataFrame([[1, 2], [1, 3], [5, 6]], columns=['A', 'B']) - g = df.groupby('A') - # previously didn't have access to col A #???? 
- filt = g.filter(lambda x: x['A'].sum() == 2) - assert_frame_equal(filt, df.iloc[[0, 1]]) - - def test_filter_enforces_scalarness(self): - df = pd.DataFrame([ - ['best', 'a', 'x'], - ['worst', 'b', 'y'], - ['best', 'c', 'x'], - ['best', 'd', 'y'], - ['worst', 'd', 'y'], - ['worst', 'd', 'y'], - ['best', 'd', 'z'], - ], columns=['a', 'b', 'c']) - with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'): - df.groupby('c').filter(lambda g: g['a'] == 'best') - - def test_filter_non_bool_raises(self): - df = pd.DataFrame([ - ['best', 'a', 1], - ['worst', 'b', 1], - ['best', 'c', 1], - ['best', 'd', 1], - ['worst', 'd', 1], - ['worst', 'd', 1], - ['best', 'd', 1], - ], columns=['a', 'b', 'c']) - with tm.assertRaisesRegexp(TypeError, 'filter function returned a.*'): - df.groupby('a').filter(lambda g: g.c.mean()) - def test_fill_constistency(self): # GH9221 @@ -6687,145 +5560,6 @@ def test_transform_doesnt_clobber_ints(self): expected = gb2.transform('mean') tm.assert_frame_equal(result, expected) - def test_groupby_categorical_two_columns(self): - - # https://github.com/pandas-dev/pandas/issues/8138 - d = {'cat': - pd.Categorical(["a", "b", "a", "b"], categories=["a", "b", "c"], - ordered=True), - 'ints': [1, 1, 2, 2], - 'val': [10, 20, 30, 40]} - test = pd.DataFrame(d) - - # Grouping on a single column - groups_single_key = test.groupby("cat") - res = groups_single_key.agg('mean') - - exp_index = pd.CategoricalIndex(["a", "b", "c"], name="cat", - ordered=True) - exp = DataFrame({"ints": [1.5, 1.5, np.nan], "val": [20, 30, np.nan]}, - index=exp_index) - tm.assert_frame_equal(res, exp) - - # Grouping on two columns - groups_double_key = test.groupby(["cat", "ints"]) - res = groups_double_key.agg('mean') - exp = DataFrame({"val": [10, 30, 20, 40, np.nan, np.nan], - "cat": pd.Categorical(["a", "a", "b", "b", "c", "c"], - ordered=True), - "ints": [1, 2, 1, 2, 1, 2]}).set_index(["cat", "ints" - ]) - tm.assert_frame_equal(res, exp) - - # GH 10132 - for key in [('a', 1), ('b', 2), ('b', 1), ('a', 2)]: - c, i = key - result = groups_double_key.get_group(key) - expected = test[(test.cat == c) & (test.ints == i)] - assert_frame_equal(result, expected) - - d = {'C1': [3, 3, 4, 5], 'C2': [1, 2, 3, 4], 'C3': [10, 100, 200, 34]} - test = pd.DataFrame(d) - values = pd.cut(test['C1'], [1, 2, 3, 6]) - values.name = "cat" - groups_double_key = test.groupby([values, 'C2']) - - res = groups_double_key.agg('mean') - nan = np.nan - idx = MultiIndex.from_product( - [Categorical(["(1, 2]", "(2, 3]", "(3, 6]"], ordered=True), - [1, 2, 3, 4]], - names=["cat", "C2"]) - exp = DataFrame({"C1": [nan, nan, nan, nan, 3, 3, - nan, nan, nan, nan, 4, 5], - "C3": [nan, nan, nan, nan, 10, 100, - nan, nan, nan, nan, 200, 34]}, index=idx) - tm.assert_frame_equal(res, exp) - - def test_groupby_multi_categorical_as_index(self): - # GH13204 - df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]), - 'A': [10, 11, 11], - 'B': [101, 102, 103]}) - result = df.groupby(['cat', 'A'], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # function grouper - f = lambda r: df.loc[r, 'A'] - result = df.groupby(['cat', f], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, 
expected) - - # another not in-axis grouper (conflicting names in index) - s = Series(['a', 'b', 'b'], name='cat') - result = df.groupby(['cat', s], as_index=False).sum() - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10.0, nan, nan, 22.0, nan, nan], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - tm.assert_frame_equal(result, expected) - - # is original index dropped? - expected = DataFrame({'cat': Categorical([1, 1, 2, 2, 3, 3]), - 'A': [10, 11, 10, 11, 10, 11], - 'B': [101.0, nan, nan, 205.0, nan, nan]}, - columns=['cat', 'A', 'B']) - - for name in [None, 'X', 'B', 'cat']: - df.index = Index(list("abc"), name=name) - result = df.groupby(['cat', 'A'], as_index=False).sum() - tm.assert_frame_equal(result, expected, check_index_type=True) - - def test_groupby_preserve_categorical_dtype(self): - # GH13743, GH13854 - df = DataFrame({'A': [1, 2, 1, 1, 2], - 'B': [10, 16, 22, 28, 34], - 'C1': Categorical(list("abaab"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("abaab"), - categories=list("bac"), - ordered=True)}) - # single grouper - exp_full = DataFrame({'A': [2.0, 1.0, np.nan], - 'B': [25.0, 20.0, np.nan], - 'C1': Categorical(list("bac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bac"), - categories=list("bac"), - ordered=True)}) - for col in ['C1', 'C2']: - result1 = df.groupby(by=col, as_index=False).mean() - result2 = df.groupby(by=col, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - - # multiple grouper - exp_full = DataFrame({'A': [1, 1, 1, 2, 2, 2], - 'B': [np.nan, 20.0, np.nan, 25.0, np.nan, - np.nan], - 'C1': Categorical(list("bacbac"), - categories=list("bac"), - ordered=False), - 'C2': Categorical(list("bacbac"), - categories=list("bac"), - ordered=True)}) - for cols in [['A', 'C1'], ['A', 'C2']]: - result1 = df.groupby(by=cols, as_index=False).mean() - result2 = df.groupby(by=cols, as_index=True).mean().reset_index() - expected = exp_full.reindex(columns=result1.columns) - tm.assert_frame_equal(result1, expected) - tm.assert_frame_equal(result2, expected) - def test_groupby_apply_all_none(self): # Tests to make sure no errors if apply function returns all None # values. Issue 9684. diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 9be4935716989..3536a52432b8c 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -767,6 +767,48 @@ def test_sub(self): self.assertRaises(TypeError, lambda: idx - idx.tolist()) self.assertRaises(TypeError, lambda: idx.tolist() - idx) + def test_map_identity_mapping(self): + # GH 12766 + for name, cur_index in self.indices.items(): + tm.assert_index_equal(cur_index, cur_index.map(lambda x: x)) + + def test_map_with_tuples(self): + # GH 12766 + + # Test that returning a single tuple from an Index + # returns an Index. + boolean_index = tm.makeIntIndex(3).map(lambda x: (x,)) + expected = Index([(0,), (1,), (2,)]) + tm.assert_index_equal(boolean_index, expected) + + # Test that returning a tuple from a map of a single index + # returns a MultiIndex object. + boolean_index = tm.makeIntIndex(3).map(lambda x: (x, x == 1)) + expected = MultiIndex.from_tuples([(0, False), (1, True), (2, False)]) + tm.assert_index_equal(boolean_index, expected) + + # Test that returning a single object from a MultiIndex + # returns an Index. 
+ first_level = ['foo', 'bar', 'baz'] + multi_index = MultiIndex.from_tuples(lzip(first_level, [1, 2, 3])) + reduced_index = multi_index.map(lambda x: x[0]) + tm.assert_index_equal(reduced_index, Index(first_level)) + + def test_map_tseries_indices_return_index(self): + date_index = tm.makeDateIndex(10) + exp = Index([1] * 10) + tm.assert_index_equal(exp, date_index.map(lambda x: 1)) + + period_index = tm.makePeriodIndex(10) + tm.assert_index_equal(exp, period_index.map(lambda x: 1)) + + tdelta_index = tm.makeTimedeltaIndex(10) + tm.assert_index_equal(exp, tdelta_index.map(lambda x: 1)) + + date_index = tm.makeDateIndex(24, freq='h', name='hourly') + exp = Index(range(24), name='hourly') + tm.assert_index_equal(exp, date_index.map(lambda x: x.hour)) + def test_append_multiple(self): index = Index(['a', 'b', 'c', 'd', 'e', 'f']) @@ -1194,16 +1236,16 @@ def check_slice(in_slice, expected): self.assert_index_equal(result, expected) for in_slice, expected in [ - (SLC[::-1], 'yxdcb'), (SLC['b':'y':-1], ''), - (SLC['b'::-1], 'b'), (SLC[:'b':-1], 'yxdcb'), - (SLC[:'y':-1], 'y'), (SLC['y'::-1], 'yxdcb'), - (SLC['y'::-4], 'yb'), - # absent labels - (SLC[:'a':-1], 'yxdcb'), (SLC[:'a':-2], 'ydb'), - (SLC['z'::-1], 'yxdcb'), (SLC['z'::-3], 'yc'), - (SLC['m'::-1], 'dcb'), (SLC[:'m':-1], 'yx'), - (SLC['a':'a':-1], ''), (SLC['z':'z':-1], ''), - (SLC['m':'m':-1], '') + (SLC[::-1], 'yxdcb'), (SLC['b':'y':-1], ''), + (SLC['b'::-1], 'b'), (SLC[:'b':-1], 'yxdcb'), + (SLC[:'y':-1], 'y'), (SLC['y'::-1], 'yxdcb'), + (SLC['y'::-4], 'yb'), + # absent labels + (SLC[:'a':-1], 'yxdcb'), (SLC[:'a':-2], 'ydb'), + (SLC['z'::-1], 'yxdcb'), (SLC['z'::-3], 'yc'), + (SLC['m'::-1], 'dcb'), (SLC[:'m':-1], 'yx'), + (SLC['a':'a':-1], ''), (SLC['z':'z':-1], ''), + (SLC['m':'m':-1], '') ]: check_slice(in_slice, expected) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 819b88bf4c5d3..708f424d9bad1 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -207,19 +207,20 @@ def test_map(self): ci = pd.CategoricalIndex(list('ABABC'), categories=list('CBA'), ordered=True) result = ci.map(lambda x: x.lower()) - exp = pd.Categorical(list('ababc'), categories=list('cba'), - ordered=True) - tm.assert_categorical_equal(result, exp) + exp = pd.CategoricalIndex(list('ababc'), categories=list('cba'), + ordered=True) + tm.assert_index_equal(result, exp) ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'), ordered=False, name='XXX') result = ci.map(lambda x: x.lower()) - exp = pd.Categorical(list('ababc'), categories=list('bac'), - ordered=False) - tm.assert_categorical_equal(result, exp) + exp = pd.CategoricalIndex(list('ababc'), categories=list('bac'), + ordered=False, name='XXX') + tm.assert_index_equal(result, exp) - tm.assert_numpy_array_equal(ci.map(lambda x: 1), - np.array([1] * 5, dtype=np.int64)) + # GH 12766: Return an index not an array + tm.assert_index_equal(ci.map(lambda x: 1), + Index(np.array([1] * 5, dtype=np.int64), name='XXX')) # change categories dtype ci = pd.CategoricalIndex(list('ABABC'), categories=list('BAC'), @@ -228,9 +229,9 @@ def f(x): return {'A': 10, 'B': 20, 'C': 30}.get(x) result = ci.map(f) - exp = pd.Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], - ordered=False) - tm.assert_categorical_equal(result, exp) + exp = pd.CategoricalIndex([10, 20, 10, 20, 30], categories=[20, 10, 30], + ordered=False) + tm.assert_index_equal(result, exp) def test_where(self): i = self.create_index() diff --git 
a/pandas/tests/series/test_apply.py b/pandas/tests/series/test_apply.py index 8d7676bef4d72..ec7ffde344d31 100644 --- a/pandas/tests/series/test_apply.py +++ b/pandas/tests/series/test_apply.py @@ -123,8 +123,9 @@ def test_apply_datetimetz(self): tm.assert_series_equal(result, exp) # change dtype + # GH 14506 : Returned dtype changed from int32 to int64 result = s.apply(lambda x: x.hour) - exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int32) + exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64) tm.assert_series_equal(result, exp) # not vectorized @@ -317,8 +318,9 @@ def test_map_datetimetz(self): tm.assert_series_equal(result, exp) # change dtype + # GH 14506 : Returned dtype changed from int32 to int64 result = s.map(lambda x: x.hour) - exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int32) + exp = pd.Series(list(range(24)) + [0], name='XX', dtype=np.int64) tm.assert_series_equal(result, exp) with tm.assertRaises(NotImplementedError): diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ed7b0fda19cb7..a7e3ebdfc43d0 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -529,6 +529,21 @@ def test_constructor_with_datetime_tz(self): expected = Series(pd.DatetimeIndex(['NaT', 'NaT'], tz='US/Eastern')) assert_series_equal(s, expected) + def test_construction_consistency(self): + + # make sure that we are not re-localizing upon construction + # GH 14928 + s = Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) + + result = Series(s, dtype=s.dtype) + tm.assert_series_equal(result, s) + + result = Series(s.dt.tz_convert('UTC'), dtype=s.dtype) + tm.assert_series_equal(result, s) + + result = Series(s.values, dtype=s.dtype) + tm.assert_series_equal(result, s) + def test_constructor_periodindex(self): # GH7932 # converting a PeriodIndex when put in a Series diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 5666a07cad4b8..ed558275674c7 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1,7 +1,8 @@ # coding=utf-8 # pylint: disable-msg=E1101,W0612 -from datetime import timedelta +import pytz +from datetime import timedelta, datetime from numpy import nan import numpy as np @@ -10,7 +11,6 @@ from pandas import (Series, isnull, date_range, MultiIndex, Index) from pandas.tseries.index import Timestamp - from pandas.compat import range from pandas.util.testing import assert_series_equal import pandas.util.testing as tm @@ -250,6 +250,24 @@ def test_datetime64_tz_fillna(self): self.assert_series_equal(expected, result) self.assert_series_equal(pd.isnull(s), null_loc) + def test_datetime64tz_fillna_round_issue(self): + # GH 14872 + + data = pd.Series([pd.NaT, pd.NaT, + datetime(2016, 12, 12, 22, 24, 6, 100001, + tzinfo=pytz.utc)]) + + filled = data.fillna(method='bfill') + + expected = pd.Series([datetime(2016, 12, 12, 22, 24, 6, + 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, + 100001, tzinfo=pytz.utc), + datetime(2016, 12, 12, 22, 24, 6, + 100001, tzinfo=pytz.utc)]) + + assert_series_equal(filled, expected) + def test_fillna_int(self): s = Series(np.random.randint(-100, 100, 50)) s.fillna(method='ffill', inplace=True) @@ -908,7 +926,6 @@ def test_interp_timedelta64(self): index=pd.to_timedelta([1, 2, 4])) assert_series_equal(result, expected) - if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff 
--git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f89f41abd0d35..7f1745edbb816 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -277,28 +277,6 @@ def test_factorize_nan(self): self.assertTrue( np.array_equal(pd.isnull(key), expected == na_sentinel)) - def test_vector_resize(self): - # Test for memory errors after internal vector - # reallocations (pull request #7157) - - def _test_vector_resize(htable, uniques, dtype, nvals): - vals = np.array(np.random.randn(1000), dtype=dtype) - # get_labels appends to the vector - htable.get_labels(vals[:nvals], uniques, 0, -1) - # to_array resizes the vector - uniques.to_array() - htable.get_labels(vals, uniques, 0, -1) - - test_cases = [ - (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'), - (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'), - (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64')] - - for (tbl, vect, dtype) in test_cases: - # resizing to empty is a special case - _test_vector_resize(tbl(), vect(), dtype, 0) - _test_vector_resize(tbl(), vect(), dtype, 10) - def test_complex_sorting(self): # gh 12666 - check no segfault # Test not valid numpy versions older than 1.11 @@ -912,6 +890,52 @@ class TestGroupVarFloat32(tm.TestCase, GroupVarTestMixin): rtol = 1e-2 +class TestHashTable(tm.TestCase): + + def test_lookup_nan(self): + xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) + m = hashtable.Float64HashTable() + m.map_locations(xs) + self.assert_numpy_array_equal(m.lookup(xs), + np.arange(len(xs), dtype=np.int64)) + + def test_lookup_overflow(self): + xs = np.array([1, 2, 2**63], dtype=np.uint64) + m = hashtable.UInt64HashTable() + m.map_locations(xs) + self.assert_numpy_array_equal(m.lookup(xs), + np.arange(len(xs), dtype=np.int64)) + + def test_get_unique(self): + s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64) + exp = np.array([1, 2, 2**63], dtype=np.uint64) + self.assert_numpy_array_equal(s.unique(), exp) + + def test_vector_resize(self): + # Test for memory errors after internal vector + # reallocations (pull request #7157) + + def _test_vector_resize(htable, uniques, dtype, nvals): + vals = np.array(np.random.randn(1000), dtype=dtype) + # get_labels appends to the vector + htable.get_labels(vals[:nvals], uniques, 0, -1) + # to_array resizes the vector + uniques.to_array() + htable.get_labels(vals, uniques, 0, -1) + + test_cases = [ + (hashtable.PyObjectHashTable, hashtable.ObjectVector, 'object'), + (hashtable.StringHashTable, hashtable.ObjectVector, 'object'), + (hashtable.Float64HashTable, hashtable.Float64Vector, 'float64'), + (hashtable.Int64HashTable, hashtable.Int64Vector, 'int64'), + (hashtable.UInt64HashTable, hashtable.UInt64Vector, 'uint64')] + + for (tbl, vect, dtype) in test_cases: + # resizing to empty is a special case + _test_vector_resize(tbl(), vect(), dtype, 0) + _test_vector_resize(tbl(), vect(), dtype, 10) + + def test_quantile(): s = Series(np.random.randn(100)) @@ -1129,6 +1153,55 @@ def test_ensure_platform_int(): assert (result is arr) +def test_int64_add_overflow(): + # see gh-14068 + msg = "Overflow in int64 addition" + m = np.iinfo(np.int64).max + n = np.iinfo(np.int64).min + + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), m) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([n, n]), n) + with 
tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([n, n]), np.array([n, n])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, n]), np.array([n, n])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + b_mask=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([False, True]), + b_mask=np.array([False, True])) + with tm.assertRaisesRegexp(OverflowError, msg): + with tm.assert_produces_warning(RuntimeWarning): + algos.checked_add_with_arr(np.array([m, m]), + np.array([np.nan, m])) + + # Check that the nan boolean arrays override whether or not + # the addition overflows. We don't check the result but just + # the fact that an OverflowError is not raised. + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([True, True])) + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + b_mask=np.array([True, True])) + with tm.assertRaises(AssertionError): + with tm.assertRaisesRegexp(OverflowError, msg): + algos.checked_add_with_arr(np.array([m, m]), np.array([m, m]), + arr_mask=np.array([True, False]), + b_mask=np.array([False, True])) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index a5cd0bbc28369..717eae3e59715 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1051,17 +1051,6 @@ def test_searchsorted(self): self.assertTrue(0 <= index <= len(o)) -class TestFloat64HashTable(tm.TestCase): - - def test_lookup_nan(self): - from pandas.hashtable import Float64HashTable - xs = np.array([2.718, 3.14, np.nan, -7, 5, 2, 3]) - m = Float64HashTable() - m.map_locations(xs) - self.assert_numpy_array_equal(m.lookup(xs), - np.arange(len(xs), dtype=np.int64)) - - class TestTranspose(Ops): errmsg = "the 'axes' parameter is not supported" diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 5320b2216ee40..5d2c317cc0f81 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -1669,7 +1669,8 @@ def test_map(self): tm.assert_categorical_equal(result, exp) result = c.map(lambda x: 1) - tm.assert_numpy_array_equal(result, np.array([1] * 5, dtype=np.int64)) + # GH 12766: Return an index not an array + tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64))) class TestCategoricalAsBlock(tm.TestCase): diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 84df82db69f77..3500ce913462a 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -996,6 +996,13 @@ def test_describe_percentiles_insert_median(self): self.assertTrue('0%' in d1.index) self.assertTrue('100%' in d2.index) + def test_describe_percentiles_insert_median_ndarray(self): + # GH14908 + df = tm.makeDataFrame() + result = df.describe(percentiles=np.array([.25, .75])) + expected = df.describe(percentiles=[.25, .75]) + assert_frame_equal(result, expected) + def 
test_describe_percentiles_unique(self): # GH13104 df = tm.makeDataFrame() diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py index be634228b1b6e..dd3a49de55d73 100644 --- a/pandas/tests/test_nanops.py +++ b/pandas/tests/test_nanops.py @@ -1002,28 +1002,6 @@ def prng(self): return np.random.RandomState(1234) -def test_int64_add_overflow(): - # see gh-14068 - msg = "Overflow in int64 addition" - m = np.iinfo(np.int64).max - n = np.iinfo(np.int64).min - - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([m, m]), m) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([m, m]), np.array([m, m])) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([n, n]), n) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([n, n]), np.array([n, n])) - with tm.assertRaisesRegexp(OverflowError, msg): - nanops._checked_add_with_arr(np.array([m, n]), np.array([n, n])) - with tm.assertRaisesRegexp(OverflowError, msg): - with tm.assert_produces_warning(RuntimeWarning): - nanops._checked_add_with_arr(np.array([m, m]), - np.array([np.nan, m])) - - if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure', '-s' diff --git a/pandas/tests/types/test_inference.py b/pandas/tests/types/test_inference.py index a63ae5f7cf74e..f83ad51c2f648 100644 --- a/pandas/tests/types/test_inference.py +++ b/pandas/tests/types/test_inference.py @@ -254,6 +254,20 @@ def test_convert_non_hashable(self): result = lib.maybe_convert_numeric(arr, set(), False, True) tm.assert_numpy_array_equal(result, np.array([np.nan, 1.0, np.nan])) + def test_maybe_convert_objects_uint64(self): + # see gh-4471 + arr = np.array([2**63], dtype=object) + exp = np.array([2**63], dtype=np.uint64) + tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) + + arr = np.array([2, -1], dtype=object) + exp = np.array([2, -1], dtype=np.int64) + tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) + + arr = np.array([2**63, -1], dtype=object) + exp = np.array([2**63, -1], dtype=object) + tm.assert_numpy_array_equal(lib.maybe_convert_objects(arr), exp) + class TestTypeInference(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index 198991531e0a7..efae7c63a9d0e 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -5,6 +5,8 @@ import copy import warnings +import string + import numpy as np from pandas.compat import range, lrange, lzip, zip, map, filter import pandas.compat as compat @@ -28,7 +30,8 @@ is_list_like, _ensure_int64, _ensure_float64, - _ensure_object) + _ensure_object, + _get_dtype) from pandas.types.missing import na_value_for_dtype from pandas.core.generic import NDFrame @@ -271,8 +274,8 @@ def merge_asof(left, right, on=None, DataFrame whose 'on' key is less than or equal to the left's key. Both DataFrames must be sorted by the key. - Optionally perform group-wise merge. This searches for the nearest match - on the 'on' key within the same group according to 'by'. + Optionally match on equivalent keys with 'by' before searching for nearest + match with 'on'. .. versionadded:: 0.19.0 @@ -299,16 +302,15 @@ def merge_asof(left, right, on=None, .. versionadded:: 0.19.2 - by : column name - Group both the left and right DataFrames by the group column; perform - the merge operation on these pieces and recombine. 
+ by : column name or list of column names + Match on these columns before performing merge operation. left_by : column name - Field name to group by in the left DataFrame. + Field names to match on in the left DataFrame. .. versionadded:: 0.19.2 right_by : column name - Field name to group by in the right DataFrame. + Field names to match on in the right DataFrame. .. versionadded:: 0.19.2 @@ -997,17 +999,13 @@ def get_result(self): return result -_asof_functions = { - 'int64_t': _join.asof_join_int64_t, - 'double': _join.asof_join_double, -} +def _asof_function(on_type): + return getattr(_join, 'asof_join_%s' % on_type, None) + + +def _asof_by_function(on_type, by_type): + return getattr(_join, 'asof_join_%s_by_%s' % (on_type, by_type), None) -_asof_by_functions = { - ('int64_t', 'int64_t'): _join.asof_join_int64_t_by_int64_t, - ('double', 'int64_t'): _join.asof_join_double_by_int64_t, - ('int64_t', 'object'): _join.asof_join_int64_t_by_object, - ('double', 'object'): _join.asof_join_double_by_object, -} _type_casters = { 'int64_t': _ensure_int64, @@ -1015,9 +1013,32 @@ def get_result(self): 'object': _ensure_object, } +_cython_types = { + 'uint8': 'uint8_t', + 'uint32': 'uint32_t', + 'uint16': 'uint16_t', + 'uint64': 'uint64_t', + 'int8': 'int8_t', + 'int32': 'int32_t', + 'int16': 'int16_t', + 'int64': 'int64_t', + 'float16': 'error', + 'float32': 'float', + 'float64': 'double', +} + def _get_cython_type(dtype): - """ Given a dtype, return 'int64_t', 'double', or 'object' """ + """ Given a dtype, return a C name like 'int64_t' or 'double' """ + type_name = _get_dtype(dtype).name + ctype = _cython_types.get(type_name, 'object') + if ctype == 'error': + raise MergeError('unsupported type: ' + type_name) + return ctype + + +def _get_cython_type_upcast(dtype): + """ Upcast a dtype to 'int64_t', 'double', or 'object' """ if is_integer_dtype(dtype): return 'int64_t' elif is_float_dtype(dtype): @@ -1084,11 +1105,6 @@ def _validate_specification(self): if not is_list_like(self.right_by): self.right_by = [self.right_by] - if len(self.left_by) != 1: - raise MergeError("can only asof by a single key") - if len(self.right_by) != 1: - raise MergeError("can only asof by a single key") - self.left_on = self.left_by + list(self.left_on) self.right_on = self.right_by + list(self.right_on) @@ -1142,6 +1158,13 @@ def _get_merge_keys(self): def _get_join_indexers(self): """ return the join indexers """ + def flip(xs): + """ unlike np.transpose, this returns an array of tuples """ + labels = list(string.ascii_lowercase[:len(xs)]) + dtypes = [x.dtype for x in xs] + labeled_dtypes = list(zip(labels, dtypes)) + return np.array(lzip(*xs), labeled_dtypes) + # values to compare left_values = (self.left.index.values if self.left_index else self.left_join_keys[-1]) @@ -1165,22 +1188,23 @@ def _get_join_indexers(self): # a "by" parameter requires special handling if self.left_by is not None: - left_by_values = self.left_join_keys[0] - right_by_values = self.right_join_keys[0] - - # choose appropriate function by type - on_type = _get_cython_type(left_values.dtype) - by_type = _get_cython_type(left_by_values.dtype) + if len(self.left_join_keys) > 2: + # get tuple representation of values if more than one + left_by_values = flip(self.left_join_keys[0:-1]) + right_by_values = flip(self.right_join_keys[0:-1]) + else: + left_by_values = self.left_join_keys[0] + right_by_values = self.right_join_keys[0] - on_type_caster = _type_casters[on_type] + # upcast 'by' parameter because HashTable is limited + by_type = 
_get_cython_type_upcast(left_by_values.dtype)
             by_type_caster = _type_casters[by_type]
-            func = _asof_by_functions[(on_type, by_type)]
-
-            left_values = on_type_caster(left_values)
-            right_values = on_type_caster(right_values)
             left_by_values = by_type_caster(left_by_values)
             right_by_values = by_type_caster(right_by_values)
 
+            # choose appropriate function by type
+            on_type = _get_cython_type(left_values.dtype)
+            func = _asof_by_function(on_type, by_type)
             return func(left_values,
                         right_values,
                         left_by_values,
@@ -1190,12 +1214,7 @@ def _get_join_indexers(self):
         else:
             # choose appropriate function by type
             on_type = _get_cython_type(left_values.dtype)
-            type_caster = _type_casters[on_type]
-            func = _asof_functions[on_type]
-
-            left_values = type_caster(left_values)
-            right_values = type_caster(right_values)
-
+            func = _asof_function(on_type)
             return func(left_values,
                         right_values,
                         self.allow_exact_matches,
diff --git a/pandas/tools/tests/test_merge_asof.py b/pandas/tools/tests/test_merge_asof.py
index d33ba30d7f032..bbbf1a3bdfff9 100644
--- a/pandas/tools/tests/test_merge_asof.py
+++ b/pandas/tools/tests/test_merge_asof.py
@@ -221,6 +221,117 @@ def test_missing_right_by(self):
         expected.loc[expected.ticker == 'MSFT', ['bid', 'ask']] = np.nan
         assert_frame_equal(result, expected)
 
+    def test_multiby(self):
+        # GH13936
+        trades = pd.DataFrame({
+            'time': pd.to_datetime(['20160525 13:30:00.023',
+                                    '20160525 13:30:00.023',
+                                    '20160525 13:30:00.046',
+                                    '20160525 13:30:00.048',
+                                    '20160525 13:30:00.050']),
+            'ticker': ['MSFT', 'MSFT',
+                       'GOOG', 'GOOG', 'AAPL'],
+            'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
+            'price': [51.95, 51.95,
+                      720.77, 720.92, 98.00],
+            'quantity': [75, 155,
+                         100, 100, 100]},
+            columns=['time', 'ticker', 'exch',
+                     'price', 'quantity'])
+
+        quotes = pd.DataFrame({
+            'time': pd.to_datetime(['20160525 13:30:00.023',
+                                    '20160525 13:30:00.023',
+                                    '20160525 13:30:00.030',
+                                    '20160525 13:30:00.041',
+                                    '20160525 13:30:00.045',
+                                    '20160525 13:30:00.049']),
+            'ticker': ['GOOG', 'MSFT', 'MSFT',
+                       'MSFT', 'GOOG', 'AAPL'],
+            'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA',
+                     'NSDQ', 'ARCA'],
+            'bid': [720.51, 51.95, 51.97, 51.99,
+                    720.50, 97.99],
+            'ask': [720.92, 51.96, 51.98, 52.00,
+                    720.93, 98.01]},
+            columns=['time', 'ticker', 'exch', 'bid', 'ask'])
+
+        expected = pd.DataFrame({
+            'time': pd.to_datetime(['20160525 13:30:00.023',
+                                    '20160525 13:30:00.023',
+                                    '20160525 13:30:00.046',
+                                    '20160525 13:30:00.048',
+                                    '20160525 13:30:00.050']),
+            'ticker': ['MSFT', 'MSFT',
+                       'GOOG', 'GOOG', 'AAPL'],
+            'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
+            'price': [51.95, 51.95,
+                      720.77, 720.92, 98.00],
+            'quantity': [75, 155,
+                         100, 100, 100],
+            'bid': [np.nan, 51.95, 720.50, 720.51, np.nan],
+            'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]},
+            columns=['time', 'ticker', 'exch',
+                     'price', 'quantity', 'bid', 'ask'])
+
+        result = pd.merge_asof(trades, quotes, on='time',
+                               by=['ticker', 'exch'])
+        assert_frame_equal(result, expected)
+
+    def test_multiby_heterogeneous_types(self):
+        # GH13936
+        trades = pd.DataFrame({
+            'time': pd.to_datetime(['20160525 13:30:00.023',
+                                    '20160525 13:30:00.023',
+                                    '20160525 13:30:00.046',
+                                    '20160525 13:30:00.048',
+                                    '20160525 13:30:00.050']),
+            'ticker': [0, 0, 1, 1, 2],
+            'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
+            'price': [51.95, 51.95,
+                      720.77, 720.92, 98.00],
+            'quantity': [75, 155,
+                         100, 100, 100]},
+            columns=['time', 'ticker', 'exch',
+                     'price', 'quantity'])
+
+        quotes = pd.DataFrame({
+            'time': pd.to_datetime(['20160525 13:30:00.023',
+                                    '20160525 13:30:00.023',
+                                    '20160525 13:30:00.030',
+                                    '20160525 13:30:00.041',
+                                    '20160525 13:30:00.045',
+                                    '20160525 13:30:00.049']),
+            'ticker': [1, 0, 0, 0, 1, 2],
+            'exch': ['BATS', 'NSDQ', 'ARCA', 'ARCA',
+                     'NSDQ', 'ARCA'],
+            'bid': [720.51, 51.95, 51.97, 51.99,
+                    720.50, 97.99],
+            'ask': [720.92, 51.96, 51.98, 52.00,
+                    720.93, 98.01]},
+            columns=['time', 'ticker', 'exch', 'bid', 'ask'])
+
+        expected = pd.DataFrame({
+            'time': pd.to_datetime(['20160525 13:30:00.023',
+                                    '20160525 13:30:00.023',
+                                    '20160525 13:30:00.046',
+                                    '20160525 13:30:00.048',
+                                    '20160525 13:30:00.050']),
+            'ticker': [0, 0, 1, 1, 2],
+            'exch': ['ARCA', 'NSDQ', 'NSDQ', 'BATS', 'NSDQ'],
+            'price': [51.95, 51.95,
+                      720.77, 720.92, 98.00],
+            'quantity': [75, 155,
+                         100, 100, 100],
+            'bid': [np.nan, 51.95, 720.50, 720.51, np.nan],
+            'ask': [np.nan, 51.96, 720.93, 720.92, np.nan]},
+            columns=['time', 'ticker', 'exch',
+                     'price', 'quantity', 'bid', 'ask'])
+
+        result = pd.merge_asof(trades, quotes, on='time',
+                               by=['ticker', 'exch'])
+        assert_frame_equal(result, expected)
+
     def test_basic2(self):
 
         expected = self.read_data('asof2.csv')
@@ -542,6 +653,78 @@ def test_on_float(self):
 
         assert_frame_equal(result, expected)
 
+    def test_on_specialized_type(self):
+        # GH13936
+        for dtype in [np.uint8, np.uint16, np.uint32, np.uint64,
+                      np.int8, np.int16, np.int32, np.int64,
+                      np.float16, np.float32, np.float64]:
+            df1 = pd.DataFrame({
+                'value': [5, 2, 25, 100, 78, 120, 79],
+                'symbol': list("ABCDEFG")},
+                columns=['symbol', 'value'])
+            df1.value = dtype(df1.value)
+
+            df2 = pd.DataFrame({
+                'value': [0, 80, 120, 125],
+                'result': list('xyzw')},
+                columns=['value', 'result'])
+            df2.value = dtype(df2.value)
+
+            df1 = df1.sort_values('value').reset_index(drop=True)
+
+            if dtype == np.float16:
+                with self.assertRaises(MergeError):
+                    pd.merge_asof(df1, df2, on='value')
+                continue
+
+            result = pd.merge_asof(df1, df2, on='value')
+
+            expected = pd.DataFrame(
+                {'symbol': list("BACEGDF"),
+                 'value': [2, 5, 25, 78, 79, 100, 120],
+                 'result': list('xxxxxyz')
+                 }, columns=['symbol', 'value', 'result'])
+            expected.value = dtype(expected.value)
+
+            assert_frame_equal(result, expected)
+
+    def test_on_specialized_type_by_int(self):
+        # GH13936
+        for dtype in [np.uint8, np.uint16, np.uint32, np.uint64,
+                      np.int8, np.int16, np.int32, np.int64,
+                      np.float16, np.float32, np.float64]:
+            df1 = pd.DataFrame({
+                'value': [5, 2, 25, 100, 78, 120, 79],
+                'key': [1, 2, 3, 2, 3, 1, 2],
+                'symbol': list("ABCDEFG")},
+                columns=['symbol', 'key', 'value'])
+            df1.value = dtype(df1.value)
+
+            df2 = pd.DataFrame({
+                'value': [0, 80, 120, 125],
+                'key': [1, 2, 2, 3],
+                'result': list('xyzw')},
+                columns=['value', 'key', 'result'])
+            df2.value = dtype(df2.value)
+
+            df1 = df1.sort_values('value').reset_index(drop=True)
+
+            if dtype == np.float16:
+                with self.assertRaises(MergeError):
+                    pd.merge_asof(df1, df2, on='value', by='key')
+            else:
+                result = pd.merge_asof(df1, df2, on='value', by='key')
+
+                expected = pd.DataFrame({
+                    'symbol': list("BACEGDF"),
+                    'key': [2, 1, 3, 3, 2, 2, 1],
+                    'value': [2, 5, 25, 78, 79, 100, 120],
+                    'result': [np.nan, 'x', np.nan, np.nan, np.nan, 'y', 'x']},
+                    columns=['symbol', 'key', 'value', 'result'])
+                expected.value = dtype(expected.value)
+
+                assert_frame_equal(result, expected)
+
     def test_on_float_by_int(self):
         # type specialize both "by" and "on" parameters
         df1 = pd.DataFrame({
diff --git a/pandas/tools/tests/test_util.py b/pandas/tools/tests/test_util.py
index f9647721e3c5b..f808abcda9418 100644
--- a/pandas/tools/tests/test_util.py
+++ b/pandas/tools/tests/test_util.py
@@ -2,6 +2,7 @@
 import locale
 import codecs
 import nose
+import decimal
 
 import numpy as np
 from numpy import iinfo
@@ -208,6 +209,46 @@ def test_numeric(self):
         res = to_numeric(s)
         tm.assert_series_equal(res, expected)
 
+        # GH 14827
+        df = pd.DataFrame(dict(
+            a=[1.2, decimal.Decimal(3.14), decimal.Decimal("infinity"), '0.1'],
+            b=[1.0, 2.0, 3.0, 4.0],
+        ))
+        expected = pd.DataFrame(dict(
+            a=[1.2, 3.14, np.inf, 0.1],
+            b=[1.0, 2.0, 3.0, 4.0],
+        ))
+
+        # Test to_numeric over one column
+        df_copy = df.copy()
+        df_copy['a'] = df_copy['a'].apply(to_numeric)
+        tm.assert_frame_equal(df_copy, expected)
+
+        # Test to_numeric over multiple columns
+        df_copy = df.copy()
+        df_copy[['a', 'b']] = df_copy[['a', 'b']].apply(to_numeric)
+        tm.assert_frame_equal(df_copy, expected)
+
+    def test_numeric_lists_and_arrays(self):
+        # Test to_numeric with embedded lists and arrays
+        df = pd.DataFrame(dict(
+            a=[[decimal.Decimal(3.14), 1.0], decimal.Decimal(1.6), 0.1]
+        ))
+        df['a'] = df['a'].apply(to_numeric)
+        expected = pd.DataFrame(dict(
+            a=[[3.14, 1.0], 1.6, 0.1],
+        ))
+        tm.assert_frame_equal(df, expected)
+
+        df = pd.DataFrame(dict(
+            a=[np.array([decimal.Decimal(3.14), 1.0]), 0.1]
+        ))
+        df['a'] = df['a'].apply(to_numeric)
+        expected = pd.DataFrame(dict(
+            a=[[3.14, 1.0], 0.1],
+        ))
+        tm.assert_frame_equal(df, expected)
+
     def test_all_nan(self):
         s = pd.Series(['a', 'b', 'c'])
         res = to_numeric(s, errors='coerce')
diff --git a/pandas/tools/util.py b/pandas/tools/util.py
index b50bf9dc448bc..daecf3d093680 100644
--- a/pandas/tools/util.py
+++ b/pandas/tools/util.py
@@ -5,7 +5,10 @@
     is_numeric_dtype,
     is_datetime_or_timedelta_dtype,
     is_list_like,
-    _ensure_object)
+    _ensure_object,
+    is_decimal,
+    is_scalar as isscalar)
+
 from pandas.types.cast import _possibly_downcast_to_dtype
 
 import pandas as pd
@@ -173,7 +176,9 @@ def to_numeric(arg, errors='raise', downcast=None):
         values = arg.values
     elif isinstance(arg, (list, tuple)):
         values = np.array(arg, dtype='O')
-    elif np.isscalar(arg):
+    elif isscalar(arg):
+        if is_decimal(arg):
+            return float(arg)
         if is_number(arg):
             return arg
         is_scalar = True
diff --git a/pandas/tseries/base.py b/pandas/tseries/base.py
index 4645ae24684ff..63e56e09e91fe 100644
--- a/pandas/tseries/base.py
+++ b/pandas/tseries/base.py
@@ -16,6 +16,7 @@
     ABCPeriodIndex, ABCIndexClass)
 from pandas.types.missing import isnull
 from pandas.core import common as com, algorithms
+from pandas.core.algorithms import checked_add_with_arr
 from pandas.core.common import AbstractMethodError
 
 import pandas.formats.printing as printing
@@ -27,7 +28,6 @@
 from pandas.util.decorators import Appender, cache_readonly
 import pandas.types.concat as _concat
 import pandas.tseries.frequencies as frequencies
-import pandas.algos as _algos
 
 
 class DatelikeOps(object):
@@ -330,11 +330,16 @@ def _nat_new(self, box=True):
 
     def map(self, f):
         try:
            result = f(self)
-            if not isinstance(result, (np.ndarray, Index)):
-                raise TypeError
+
+            # Try to use this result if we can
+            if isinstance(result, np.ndarray):
+                self._shallow_copy(result)
+
+            if not isinstance(result, Index):
+                raise TypeError('The map function must return an Index object')
             return result
         except Exception:
-            return _algos.arrmap_object(self.asobject.values, f)
+            return self.asobject.map(f)
 
     def sort_values(self, return_indexer=False, ascending=True):
         """
@@ -684,7 +689,8 @@ def _add_delta_td(self, other):
         # return the i8
         # result view
         inc = tslib._delta_to_nanoseconds(other)
-        new_values = (self.asi8 + inc).view('i8')
+        new_values = checked_add_with_arr(self.asi8, inc,
+                                          arr_mask=self._isnan).view('i8')
         if self.hasnans:
             new_values[self._isnan] = tslib.iNaT
         return new_values.view('i8')
@@ -699,7 +705,9 @@ def _add_delta_tdi(self, other):
 
         self_i8 = self.asi8
         other_i8 = other.asi8
-        new_values = self_i8 + other_i8
+        new_values = checked_add_with_arr(self_i8, other_i8,
+                                          arr_mask=self._isnan,
+                                          b_mask=other._isnan)
         if self.hasnans or other.hasnans:
             mask = (self._isnan) | (other._isnan)
             new_values[mask] = tslib.iNaT
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
index 3edf75fbb82ae..aca962c8178d3 100644
--- a/pandas/tseries/index.py
+++ b/pandas/tseries/index.py
@@ -1293,14 +1293,12 @@ def _parsed_string_to_bounds(self, reso, parsed):
 
     def _partial_date_slice(self, reso, parsed, use_lhs=True, use_rhs=True):
         is_monotonic = self.is_monotonic
-        if ((reso in ['day', 'hour', 'minute'] and
-             not (self._resolution < Resolution.get_reso(reso) or
-                  not is_monotonic)) or
-            (reso == 'second' and
-             not (self._resolution <= Resolution.RESO_SEC or
-                  not is_monotonic))):
+        if (is_monotonic and reso in ['day', 'hour', 'minute', 'second'] and
+                self._resolution >= Resolution.get_reso(reso)):
             # These resolution/monotonicity validations came from GH3931,
             # GH3452 and GH2369.
+
+            # See also GH14826
             raise KeyError
 
         if reso == 'microsecond':
diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py
index 4bab3bc14461e..8c75195b25ef5 100644
--- a/pandas/tseries/period.py
+++ b/pandas/tseries/period.py
@@ -65,6 +65,7 @@ def dt64arr_to_periodarr(data, freq, tz):
 
 # --- Period index sketch
 
+
 _DIFFERENT_FREQ_INDEX = period._DIFFERENT_FREQ_INDEX
 
 
@@ -305,7 +306,7 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs):
             if (len(values) > 0 and is_float_dtype(values)):
                 raise TypeError("PeriodIndex can't take floats")
             else:
-                return PeriodIndex(values, name=name, freq=freq, **kwargs)
+                return cls(values, name=name, freq=freq, **kwargs)
 
         values = np.array(values, dtype='int64', copy=False)
 
@@ -326,6 +327,8 @@ def _shallow_copy(self, values=None, **kwargs):
         if kwargs.get('freq') is None:
             # freq must be provided
             kwargs['freq'] = self.freq
+        if values is None:
+            values = self._values
         return super(PeriodIndex, self)._shallow_copy(values=values, **kwargs)
 
     def _coerce_scalar_to_index(self, item):
@@ -356,9 +359,8 @@ def __contains__(self, key):
     def asi8(self):
         return self._values.view('i8')
 
-    @property
+    @cache_readonly
     def _int64index(self):
-        # do not cache, same as .asi8
         return Int64Index(self.asi8, name=self.name, fastpath=True)
 
     @property
diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py
index 7e77d8baf3b2c..1585aac0c8ead 100644
--- a/pandas/tseries/tdi.py
+++ b/pandas/tseries/tdi.py
@@ -20,8 +20,8 @@
 import pandas.compat as compat
 from pandas.compat import u
 from pandas.tseries.frequencies import to_offset
+from pandas.core.algorithms import checked_add_with_arr
 from pandas.core.base import _shared_docs
-from pandas.core.nanops import _checked_add_with_arr
 from pandas.indexes.base import _index_shared_docs
 import pandas.core.common as com
 import pandas.types.concat as _concat
@@ -347,7 +347,7 @@ def _add_datelike(self, other):
         else:
             other = Timestamp(other)
             i8 = self.asi8
-            result = _checked_add_with_arr(i8, other.value)
+            result = checked_add_with_arr(i8, other.value)
             result = self._maybe_mask_results(result, fill_value=tslib.iNaT)
         return DatetimeIndex(result, name=self.name, copy=False)
diff --git a/pandas/tseries/tests/test_converter.py b/pandas/tseries/tests/test_converter.py
index 37d9c35639c32..7e4ed288e31c1 100644
--- a/pandas/tseries/tests/test_converter.py
+++ b/pandas/tseries/tests/test_converter.py
@@ -3,7 +3,7 @@
 import nose
 import numpy as np
 
-from pandas import Timestamp, Period
+from pandas import Timestamp, Period, Index
 from pandas.compat import u
 import pandas.util.testing as tm
 from pandas.tseries.offsets import Second, Milli, Micro
@@ -104,8 +104,8 @@ def test_dateindex_conversion(self):
         for freq in ('B', 'L', 'S'):
             dateindex = tm.makeDateIndex(k=10, freq=freq)
             rs = self.dtc.convert(dateindex, None, None)
-            xp = converter.dates.date2num(dateindex._mpl_repr())
-            tm.assert_almost_equal(rs, xp, decimals)
+            xp = Index(converter.dates.date2num(dateindex._mpl_repr()))
+            tm.assert_index_equal(rs, xp, decimals)
 
     def test_resolution(self):
         def _assert_less(ts1, ts2):
diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py
index fe0d28dd9c508..ad4f669fceb42 100644
--- a/pandas/tseries/tests/test_period.py
+++ b/pandas/tseries/tests/test_period.py
@@ -2101,14 +2101,6 @@ def test_comp_period(self):
         exp = idx.values < idx.values[10]
         self.assert_numpy_array_equal(result, exp)
 
-    def test_getitem_ndim2(self):
-        idx = period_range('2007-01', periods=3, freq='M')
-
-        result = idx[:, None]
-        # MPL kludge, internally has incorrect shape
-        tm.assertIsInstance(result, PeriodIndex)
-        self.assertEqual(result.shape, (len(idx), ))
-
     def test_getitem_index(self):
         idx = period_range('2007-01', periods=10, freq='M', name='x')
 
@@ -3521,8 +3513,8 @@ def test_map(self):
             tm.assert_index_equal(result, expected)
 
             result = index.map(lambda x: x.ordinal)
-            exp = np.array([x.ordinal for x in index], dtype=np.int64)
-            tm.assert_numpy_array_equal(result, exp)
+            exp = Index([x.ordinal for x in index])
+            tm.assert_index_equal(result, exp)
 
     def test_map_with_string_constructor(self):
         raw = [2005, 2007, 2009]
@@ -3534,20 +3526,17 @@ def test_map_with_string_constructor(self):
             types += text_type,
 
         for t in types:
-            expected = np.array(lmap(t, raw), dtype=object)
+            expected = Index(lmap(t, raw))
             res = index.map(t)
 
-            # should return an array
-            tm.assertIsInstance(res, np.ndarray)
+            # should return an Index
+            tm.assertIsInstance(res, Index)
 
             # preserve element types
             self.assertTrue(all(isinstance(resi, t) for resi in res))
 
-            # dtype should be object
-            self.assertEqual(res.dtype, np.dtype('object').type)
-
             # lastly, values should compare equal
-            tm.assert_numpy_array_equal(res, expected)
+            tm.assert_index_equal(res, expected)
 
     def test_convert_array_of_periods(self):
         rng = period_range('1/1/2000', periods=20, freq='D')
diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
index f0d14014d6559..1d07b4ab39a99 100644
--- a/pandas/tseries/tests/test_timedeltas.py
+++ b/pandas/tseries/tests/test_timedeltas.py
@@ -1203,6 +1203,28 @@ def test_implementation_limits(self):
         with tm.assertRaises(OverflowError):
             Timedelta(max_td.value + 1, 'ns')
 
+    def test_timedelta_arithmetic(self):
+        data = pd.Series(['nat', '32 days'], dtype='timedelta64[ns]')
+        deltas = [timedelta(days=1), Timedelta(1, unit='D')]
+        for delta in deltas:
+            result_method = data.add(delta)
+            result_operator = data + delta
+            expected = pd.Series(['nat', '33 days'], dtype='timedelta64[ns]')
+            tm.assert_series_equal(result_operator, expected)
+            tm.assert_series_equal(result_method, expected)
+
+            result_method = data.sub(delta)
+            result_operator = data - delta
+            expected = pd.Series(['nat', '31 days'], dtype='timedelta64[ns]')
+            tm.assert_series_equal(result_operator, expected)
+            tm.assert_series_equal(result_method, expected)
+            # GH 9396
+            result_method = data.div(delta)
+            result_operator = data / delta
+            expected = pd.Series([np.nan, 32.], dtype='float64')
+            tm.assert_series_equal(result_operator, expected)
+            tm.assert_series_equal(result_method, expected)
+
 
 class TestTimedeltaIndex(tm.TestCase):
     _multiprocess_can_split_ = True
@@ -1513,8 +1535,8 @@ def test_map(self):
 
         f = lambda x: x.days
         result = rng.map(f)
-        exp = np.array([f(x) for x in rng], dtype=np.int64)
-        self.assert_numpy_array_equal(result, exp)
+        exp = Int64Index([f(x) for x in rng])
+        tm.assert_index_equal(result, exp)
 
     def test_misc_coverage(self):
 
@@ -1958,11 +1980,34 @@ def test_add_overflow(self):
         with tm.assertRaisesRegexp(OverflowError, msg):
             Timestamp('2000') + to_timedelta(106580, 'D')
 
+        _NaT = int(pd.NaT) + 1
         msg = "Overflow in int64 addition"
         with tm.assertRaisesRegexp(OverflowError, msg):
             to_timedelta([106580], 'D') + Timestamp('2000')
         with tm.assertRaisesRegexp(OverflowError, msg):
             Timestamp('2000') + to_timedelta([106580], 'D')
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            to_timedelta([_NaT]) - Timedelta('1 days')
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            to_timedelta(['5 days', _NaT]) - Timedelta('1 days')
+        with tm.assertRaisesRegexp(OverflowError, msg):
+            (to_timedelta([_NaT, '5 days', '1 hours']) -
+             to_timedelta(['7 seconds', _NaT, '4 hours']))
+
+        # These should not overflow!
+        exp = TimedeltaIndex([pd.NaT])
+        result = to_timedelta([pd.NaT]) - Timedelta('1 days')
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex(['4 days', pd.NaT])
+        result = to_timedelta(['5 days', pd.NaT]) - Timedelta('1 days')
+        tm.assert_index_equal(result, exp)
+
+        exp = TimedeltaIndex([pd.NaT, pd.NaT, '5 hours'])
+        result = (to_timedelta([pd.NaT, '5 days', '1 hours']) +
+                  to_timedelta(['7 seconds', pd.NaT, '4 hours']))
+        tm.assert_index_equal(result, exp)
+
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py
index beacc21912edc..3f4a10619f7f5 100644
--- a/pandas/tseries/tests/test_timeseries.py
+++ b/pandas/tseries/tests/test_timeseries.py
@@ -266,16 +266,15 @@ def test_indexing(self):
         expected = ts['2013']
         assert_series_equal(expected, ts)
 
-        # GH 3925, indexing with a seconds resolution string / datetime object
+        # GH14826, indexing with a seconds resolution string / datetime object
         df = DataFrame(randn(5, 5),
                        columns=['open', 'high', 'low', 'close', 'volume'],
                        index=date_range('2012-01-02 18:01:00',
                                         periods=5, tz='US/Central', freq='s'))
         expected = df.loc[[df.index[2]]]
-        result = df['2012-01-02 18:01:02']
-        assert_frame_equal(result, expected)
 
         # this is a single date, so will raise
+        self.assertRaises(KeyError, df.__getitem__, '2012-01-02 18:01:02', )
         self.assertRaises(KeyError, df.__getitem__, df.index[2], )
 
     def test_recreate_from_data(self):
@@ -3003,8 +3002,8 @@ def test_map(self):
 
         f = lambda x: x.strftime('%Y%m%d')
         result = rng.map(f)
-        exp = np.array([f(x) for x in rng], dtype='=U8')
-        tm.assert_almost_equal(result, exp)
+        exp = Index([f(x) for x in rng], dtype='<U8')
+        tm.assert_index_equal(result, exp)
diff --git a/pandas/types/cast.py b/pandas/types/cast.py
--- a/pandas/types/cast.py
+++ b/pandas/types/cast.py
         #   NaN -> NaT
         #   integer or integer array -> date-like array
-        if result.dtype in _DATELIKE_DTYPES:
+        if is_datetimelike(result.dtype):
             if is_scalar(other):
                 if isnull(other):
                     other = result.dtype.type('nat')
@@ -666,7 +667,7 @@ def _possibly_castable(arr):
     # otherwise try to coerce
     kind = arr.dtype.kind
     if kind == 'M' or kind == 'm':
-        return arr.dtype in _DATELIKE_DTYPES
+        return is_datetime64_dtype(arr.dtype)
 
     return arr.dtype.name not in _POSSIBLY_CAST_DTYPES
@@ -822,9 +823,10 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'):
             elif is_datetime64tz:
                 # input has to be UTC at this point, so just
                 # localize
-                value = to_datetime(
-                    value,
-                    errors=errors).tz_localize(dtype.tz)
+                value = (to_datetime(value, errors=errors)
+                         .tz_localize('UTC')
+                         .tz_convert(dtype.tz)
+                         )
             elif is_timedelta64:
                 value = to_timedelta(value, errors=errors)._values
         except (AttributeError, ValueError, TypeError):
diff --git a/pandas/types/common.py b/pandas/types/common.py
index 754ff80924c07..06c8ef6e35cd7 100644
--- a/pandas/types/common.py
+++ b/pandas/types/common.py
@@ -22,9 +22,6 @@
 _NS_DTYPE = np.dtype('M8[ns]')
 _TD_DTYPE = np.dtype('m8[ns]')
 _INT64_DTYPE = np.dtype(np.int64)
-_DATELIKE_DTYPES = set([np.dtype(t)
-                        for t in ['M8[ns]', '<M8[ns]', '>M8[ns]',
-                                  'm8[ns]', '<m8[ns]', '>m8[ns]']])
 
 _ensure_float64 = algos.ensure_float64
 _ensure_float32 = algos.ensure_float32
@@ -35,6 +32,8 @@ def _ensure_float(arr):
         arr = arr.astype(float)
     return arr
 
+
+_ensure_uint64 = algos.ensure_uint64
 _ensure_int64 = algos.ensure_int64
 _ensure_int32 = algos.ensure_int32
 _ensure_int16 = algos.ensure_int16
@@ -126,7 +125,8 @@ def is_datetime_arraylike(arr):
 
 
 def is_datetimelike(arr):
-    return (arr.dtype in _DATELIKE_DTYPES or
+    return (is_datetime64_dtype(arr) or is_datetime64tz_dtype(arr) or
+            is_timedelta64_dtype(arr) or
             isinstance(arr, ABCPeriodIndex) or
             is_datetimetz(arr))
 
@@ -402,6 +402,11 @@ def _get_dtype_from_object(dtype):
             pass
         return dtype.type
     elif isinstance(dtype, string_types):
+        if dtype in ['datetimetz', 'datetime64tz']:
+            return DatetimeTZDtype.type
+        elif dtype in ['period']:
+            raise NotImplementedError
+
         if dtype == 'datetime' or dtype == 'timedelta':
             dtype += '64'
diff --git a/pandas/types/inference.py b/pandas/types/inference.py
index 35a2dc2fb831b..d2a2924b27659 100644
--- a/pandas/types/inference.py
+++ b/pandas/types/inference.py
@@ -18,6 +18,8 @@
 
 is_scalar = lib.isscalar
 
+is_decimal = lib.is_decimal
+
 
 def is_number(obj):
     return isinstance(obj, (Number, np.number))
diff --git a/pandas/types/missing.py b/pandas/types/missing.py
index a4af127e0c381..e6791b79bf3bd 100644
--- a/pandas/types/missing.py
+++ b/pandas/types/missing.py
@@ -19,8 +19,7 @@
     is_object_dtype, is_integer,
     _TD_DTYPE,
-    _NS_DTYPE,
-    _DATELIKE_DTYPES)
+    _NS_DTYPE)
 from .inference import is_list_like
 
@@ -169,7 +168,7 @@ def _isnull_ndarraylike_old(obj):
             vec = lib.isnullobj_old(values.ravel())
             result[:] = vec.reshape(shape)
 
-    elif dtype in _DATELIKE_DTYPES:
+    elif is_datetime64_dtype(dtype):
         # this is the NaT pattern
         result = values.view('i8') == iNaT
     else:
diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py
index 3747e2ff6ca8f..657681d4c33ce 100644
--- a/pandas/util/print_versions.py
+++ b/pandas/util/print_versions.py
@@ -94,7 +94,7 @@ def show_versions(as_json=False):
         ("pymysql", lambda mod: mod.__version__),
         ("psycopg2", lambda mod: mod.__version__),
         ("jinja2", lambda mod: mod.__version__),
-        ("boto", lambda mod: mod.__version__),
+        ("s3fs", lambda mod: mod.__version__),
         ("pandas_datareader", lambda mod: mod.__version__)
     ]
diff --git a/setup.py b/setup.py
index 7a55daa74b1c5..e3774d8e36ce9 100755
--- a/setup.py
+++ b/setup.py
@@ -637,6 +637,7 @@ def pxd(name):
           'pandas.tests',
           'pandas.tests.frame',
           'pandas.tests.indexes',
+          'pandas.tests.groupby',
          'pandas.tests.series',
          'pandas.tests.formats',
          'pandas.tests.types',