From 78ed061d1b62d9f352e6829547ed0bbaf4d71263 Mon Sep 17 00:00:00 2001 From: Tola Date: Mon, 15 Oct 2018 21:19:11 +0100 Subject: [PATCH 01/14] BUG: merge_asof on columns containing nulls --- doc/source/whatsnew/v0.24.1.txt | 1006 +++++++++++++++++++++++++++++++ pandas/core/reshape/merge.py | 6 +- 2 files changed, 1011 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v0.24.1.txt diff --git a/doc/source/whatsnew/v0.24.1.txt b/doc/source/whatsnew/v0.24.1.txt new file mode 100644 index 0000000000000..9463458bfa64f --- /dev/null +++ b/doc/source/whatsnew/v0.24.1.txt @@ -0,0 +1,1006 @@ +.. _whatsnew_0240: + +v0.24.0 (Month XX, 2018) +------------------------ + +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. _whatsnew_0240.enhancements: + +New features +~~~~~~~~~~~~ +- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) + + +- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) + +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing +the user to override the engine's default behavior to include or omit the +dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) + + +.. _whatsnew_0240.enhancements.extension_array_operators: + +``ExtensionArray`` operator support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison +operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: + +1. 
Define each of the operators on your ``ExtensionArray`` subclass. +2. Use an operator implementation from pandas that depends on operators that are already defined + on the underlying elements (scalars) of the ``ExtensionArray``. + +See the :ref:`ExtensionArray Operator Support +<extending.extension.operator>` documentation section for details on both +ways of adding operator support. + +.. _whatsnew_0240.enhancements.intna: + +Optional Integer NA Support +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has gained the ability to hold integer dtypes with missing values. This long-requested feature is enabled through the use of :ref:`extension types <extending.extension-types>`. +Here is an example of the usage. + +We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value +marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) + +.. ipython:: python + + s = pd.Series([1, 2, np.nan], dtype='Int64') + s + + +Operations on these dtypes will propagate ``NaN`` as other pandas operations. + +.. ipython:: python + + # arithmetic + s + 1 + + # comparison + s == 1 + + # indexing + s.iloc[1:3] + + # operate with other dtypes + s + s.iloc[1:3].astype('Int8') + + # coerce when needed + s + 0.01 + +These dtypes can operate as part of a ``DataFrame``. + +.. ipython:: python + + df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) + df + df.dtypes + + +These dtypes can be merged & reshaped & cast. + +.. ipython:: python + + pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes + df['A'].astype(float) + +Reduction and groupby operations such as 'sum' work. + +.. ipython:: python + + df.sum() + df.groupby('B').A.sum() + +.. warning:: + + The Integer NA support currently uses the capitalized dtype version, e.g.
``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. + +.. _whatsnew_0240.enhancements.read_html: + +``read_html`` Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. +Now it understands them, treating them as sequences of cells with the same +value. (:issue:`17054`) + +.. ipython:: python + + result = pd.read_html(""" + + + + + + + + + + + +
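<table><thead><tr><th>A</th><th>B</th><th>C</th></tr></thead>
<tbody><tr><td colspan="2">1</td><td>2</td></tr></tbody></table>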
""") + +Previous Behavior: + +.. code-block:: ipython + + In [13]: result + Out [13]: + [ A B C + 0 1 2 NaN] + +Current Behavior: + +.. ipython:: python + + result + + +.. _whatsnew_0240.enhancements.interval: + +Storing Interval Data in Series and DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an +:class:`IntervalIndex` like previously (:issue:`19453`). + +.. ipython:: python + + ser = pd.Series(pd.interval_range(0, 5)) + ser + ser.dtype + +Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, +this should result in better performance when storing an array of intervals in +a :class:`Series`. + +Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy +array, but rather an ``ExtensionArray``: + +.. ipython:: python + + ser.values + +This is the same behavior as ``Series.values`` for categorical data. See +:ref:`whatsnew_0240.api_breaking.interval_values` for more. + + +.. _whatsnew_0240.enhancements.other: + +Other Enhancements +^^^^^^^^^^^^^^^^^^ +- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) +- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) +- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. 
(:issue:`21227`) +- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) +- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) +- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) +- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to + reflect changes from the `Pandas-GBQ library version 0.6.0 + `__. + (:issue:`21627`, :issue:`22557`) +- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) +- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) +- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) +- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). + The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). +- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) +- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) +- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. 
(:issue:`21358`) +- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) +- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). +- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). +- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). +- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). +- Compatibility with Matplotlib 3.0 (:issue:`22790`). + +.. _whatsnew_0240.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) + +.. _whatsnew_0240.api_breaking.deps: + +Dependencies have increased minimum versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We have updated our minimum supported versions of dependencies (:issue:`21242`). +If installed, we now require: + ++-----------------+-----------------+----------+ +| Package | Minimum Version | Required | ++=================+=================+==========+ +| numpy | 1.12.0 | X | ++-----------------+-----------------+----------+ +| bottleneck | 1.2.0 | | ++-----------------+-----------------+----------+ +| matplotlib | 2.0.0 | | ++-----------------+-----------------+----------+ +| numexpr | 2.6.1 | | ++-----------------+-----------------+----------+ +| pytables | 3.4.2 | | ++-----------------+-----------------+----------+ +| scipy | 0.18.1 | | ++-----------------+-----------------+----------+ + +..
_whatsnew_0240.api_breaking.interval_values: + +``IntervalIndex.values`` is now an ``IntervalArray`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an +``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). + +Previous Behavior: + +.. code-block:: ipython + + In [1]: idx = pd.interval_range(0, 4) + + In [2]: idx.values + Out[2]: + array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), + Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], + dtype=object) + +New Behavior: + +.. ipython:: python + + idx = pd.interval_range(0, 4) + idx.values + +This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. + +For situations where you need an ``ndarray`` of ``Interval`` objects, use +:meth:`numpy.asarray` or ``idx.astype(object)``. + +.. ipython:: python + + np.asarray(idx) + idx.values.astype(object) + +.. _whatsnew_0240.api.timezone_offset_parsing: + +Parsing Datetime Strings with Timezone Offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` +or :class:`DatetimeIndex` would automatically convert the datetime to UTC +without timezone localization. This is inconsistent from parsing the same +datetime string with :class:`Timestamp` which would preserve the UTC +offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC +offset in the ``tz`` attribute when all the datetime strings have the same +UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) + +*Previous Behavior*: + +.. 
code-block:: ipython + + In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") + Out[2]: Timestamp('2015-11-18 10:00:00') + + In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") + Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') + + # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) + In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) + +*Current Behavior*: + +.. ipython:: python + + pd.to_datetime("2015-11-18 15:30:00+05:30") + pd.Timestamp("2015-11-18 15:30:00+05:30") + +Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) + +Parsing datetime strings with different UTC offsets will now create an Index of +``datetime.datetime`` objects with different UTC offsets + +.. ipython:: python + + idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + idx + idx[0] + idx[1] + +Passing ``utc=True`` will mimic the previous behavior but will correctly indicate +that the dates have been converted to UTC + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) + +.. _whatsnew_0240.api_breaking.calendarday: + +CalendarDay Offset +^^^^^^^^^^^^^^^^^^ + +:class:`Day` and associated frequency alias ``'D'`` were documented to represent +a calendar day; however, arithmetic and operations with :class:`Day` sometimes +respected absolute time instead (i.e. ``Day(n)`` and acted identically to ``Timedelta(days=n)``). + +*Previous Behavior*: + +.. 
code-block:: ipython + + + In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + + # Respects calendar arithmetic + In [3]: pd.date_range(start=ts, freq='D', periods=3) + Out[3]: + DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', + '2016-11-01 00:00:00+02:00'], + dtype='datetime64[ns, Europe/Helsinki]', freq='D') + + # Respects absolute arithmetic + In [4]: ts + pd.tseries.frequencies.to_offset('D') + Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') + +:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available +and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` +will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) +See the :ref:`documentation here ` for more information. + +Addition with :class:`CalendarDay` across a daylight savings time transition: + +.. ipython:: python + + ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + ts + pd.offsets.Day(1) + ts + pd.offsets.CalendarDay(1) + +.. _whatsnew_0240.api_breaking.period_end_time: + +Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The time values in :class:`Period` and :class:`PeriodIndex` objects are now set +to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, +:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, +or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) + +Previous Behavior: + +.. 
code-block:: ipython + + In [2]: p = pd.Period('2017-01-01', 'D') + In [3]: pi = pd.PeriodIndex([p]) + + In [4]: pd.Series(pi).dt.end_time[0] + Out[4]: Timestamp(2017-01-01 00:00:00) + + In [5]: p.end_time + Out[5]: Timestamp(2017-01-01 23:59:59.999999999) + +Current Behavior: + +Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as +is the case with :attr:`Period.end_time`, for example + +.. ipython:: python + + p = pd.Period('2017-01-01', 'D') + pi = pd.PeriodIndex([p]) + + pd.Series(pi).dt.end_time[0] + + p.end_time + +.. _whatsnew_0240.api_breaking.sparse_values: + +Sparse Data Structure Refactor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, +is now an extension array (:issue:`21978`, :issue:`19056`, :issue:`22835`). +To conform to this interface and for consistency with the rest of pandas, some API breaking +changes were made: + +- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. +- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. +- :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) +- ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`): + + * The default value of ``allow_fill`` has changed from ``False`` to ``True``. + * The ``out`` and ``mode`` parameters are no longer accepted (previously, this raised if they were specified). + * Passing a scalar for ``indices`` is no longer allowed. + +- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``.
+- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. +- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. + + +Some new warnings are issued for operations that require or are likely to materialize a large dense array: + +- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. +- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used. + +In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. + +.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient: + +Raise ValueError in ``DataFrame.to_dict(orient='index')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with +``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`) + +.. ipython:: python + :okexcept: + + df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) + df + + df.to_dict(orient='index') + +.. _whatsnew_0240.api.datetimelike.normalize: + +Tick DateOffset Normalize Restrictions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, +:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with +``normalize=True`` is no longer supported. This prevents unexpected behavior +where addition could fail to be monotone or associative. (:issue:`21427`) + +*Previous Behavior*: + +.. 
code-block:: ipython + + + In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') + + In [3]: ts + Out[3]: Timestamp('2018-06-11 18:01:14') + + In [4]: tic = pd.offsets.Hour(n=2, normalize=True) + ...: + + In [5]: tic + Out[5]: <2 * Hours> + + In [6]: ts + tic + Out[6]: Timestamp('2018-06-11 00:00:00') + + In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) + Out[7]: False + +*Current Behavior*: + +.. ipython:: python + + ts = pd.Timestamp('2018-06-11 18:01:14') + tic = pd.offsets.Hour(n=2) + ts + tic + tic + tic == ts + (tic + tic + tic) + + +.. _whatsnew_0240.api.datetimelike: + + +.. _whatsnew_0240.api.period_subtraction: + +Period Subtraction +^^^^^^^^^^^^^^^^^^ + +Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset`` +instead of an integer (:issue:`21314`) + +.. ipython:: python + + june = pd.Period('June 2018') + april = pd.Period('April 2018') + june - april + +Previous Behavior: + +.. code-block:: ipython + + In [2]: june = pd.Period('June 2018') + + In [3]: april = pd.Period('April 2018') + + In [4]: june - april + Out [4]: 2 + +Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return +an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` + +.. ipython:: python + + pi = pd.period_range('June 2018', freq='M', periods=3) + pi - pi[0] + +Previous Behavior: + +.. code-block:: ipython + + In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) + + In [3]: pi - pi[0] + Out[3]: Int64Index([0, 1, 2], dtype='int64') + + +.. _whatsnew_0240.api.timedelta64_subtract_nan: + +Addition/Subtraction of ``NaN`` from :class:`DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Adding or subtracting ``NaN`` from a :class:`DataFrame` column with +``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning +all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and +``Series`` behavior (:issue:`22163`) + +..
ipython:: python + :okexcept: + + df = pd.DataFrame([pd.Timedelta(days=1)]) + df - np.nan + +Previous Behavior: + +.. code-block:: ipython + + In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) + + In [5]: df - np.nan + Out[5]: + 0 + 0 NaT + + +.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: + +DataFrame Arithmetic Operations Broadcasting Changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:class:`DataFrame` arithmetic operations when operating with 2-dimensional +``np.ndarray`` objects now broadcast in the same way as ``np.ndarray``s +broadcast. (:issue:`23000`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: arr = np.arange(6).reshape(3, 2) + In [4]: df = pd.DataFrame(arr) + In [5]: df + arr[[0], :] # 1 row, 2 columns + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) + In [6]: df + arr[:, [1]] # 1 column, 3 rows + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) + +*Current Behavior*: + +.. ipython:: python + arr = np.arange(6).reshape(3, 2) + df = pd.DataFrame(arr) + df + +.. ipython:: python + df + arr[[0], :] # 1 row, 2 columns + df + arr[:, [1]] # 1 column, 3 rows + + +.. _whatsnew_0240.api.extension: + +ExtensionType Changes +^^^^^^^^^^^^^^^^^^^^^ + +**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** + +Pandas now requires that extension dtypes be hashable. The base class implements +a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should +update the ``ExtensionDtype._metadata`` tuple to match the signature of your +``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). + +**Other changes** + +- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) +- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. 
``decimal`` would instantiate a registered ``DecimalDtype``; furthermore + the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) +- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) +- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). +- The ``ExtensionArray`` constructor, ``_from_sequence`` now takes the keyword arg ``copy=False`` (:issue:`21185`) +- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) +- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) +- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) +- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) +- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`). +- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) +- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) +- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) +- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) +- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). +- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) + +..
_whatsnew_0240.api.incompatibilities: + +Series and Index Data-Dtype Incompatibilities +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Series`` and ``Index`` constructors now raise when the +data is incompatible with a passed ``dtype=`` (:issue:`15832`) + +Previous Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + 0 18446744073709551615 + dtype: uint64 + +Current Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + ... + OverflowError: Trying to coerce negative values to unsigned integers + +.. _whatsnew_0240.api.crosstab_dtypes: + +Crosstab Preserves Dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`crosstab` will now preserve dtypes in some cases that previously would +cast from integer dtype to floating dtype (:issue:`22019`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + Out[4]: + b 3 4 + a + 1 0.5 0.0 + 2 0.5 1.0 + +Current Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + +Datetimelike API Changes +^^^^^^^^^^^^^^^^^^^^^^^^ + +- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) +- :class:`DateOffset` objects are now immutable.
Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) +- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) +- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) +- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) + +.. _whatsnew_0240.api.other: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) +- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in + :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of + a ``KeyError`` (:issue:`21678`). +- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) +- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) +- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) +- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) +- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) +- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) +- 
Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) +- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) + +.. _whatsnew_0240.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) +- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) +- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) +- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) +- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) +- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) +- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) +- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain + many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) +- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) +- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`) + +.. 
_whatsnew_0240.prior_deprecations: + +Removal of prior version deprecations/changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) +- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) +- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) +- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) +- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) +- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) +- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) +- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) +- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) +- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) +- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) + +.. _whatsnew_0240.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, + both when indexing by label (using .loc) and position(.iloc). + Likewise, slicing a ``CategoricalIndex`` itself (i.e. 
``ci[100:200]``) shows similar speed improvements (:issue:`21659`) +- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) +- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) +- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` + (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` + is likewise much faster (:issue:`21369`, :issue:`21508`) +- Improved performance of :meth:`HDFStore.groups` (and dependent functions like + :meth:`~HDFStore.keys`, i.e. ``x in store`` checks are much faster) + (:issue:`21372`) +- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) +- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) +- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) + + +.. _whatsnew_0240.docs: + +Documentation Changes +~~~~~~~~~~~~~~~~~~~~~ + +- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`) +- +- + +.. _whatsnew_0240.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. +- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) +- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
+ +Datetimelike +^^^^^^^^^^^^ + +- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) +- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) +- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) +- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) +- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) +- Bug in 
:class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) +- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) +- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) +- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) +- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) +- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) +- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) +- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) + +Timedelta +^^^^^^^^^ +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) +- Bug in adding an :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) +- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) +- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) +- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) +- Bug in :class:`TimedeltaIndex`
incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) +- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) +- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) +- + +Timezones +^^^^^^^^^ + +- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) +- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) +- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) +- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) +- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) +- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) +- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) +- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) +- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) +- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) +- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) +- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware 
``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) +- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) +- Bug in :func:`DataFrame.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) +- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp` objects constructed with the ``replace`` method across DST (:issue:`18785`) +- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) +- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) +- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) + +Offsets +^^^^^^^ + +- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) +- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`) +- + +Numeric +^^^^^^^ + +- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) +- Bug in :func:`factorize` fails with read-only array (:issue:`12813`) +- Fixed bug in :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) +- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where, + when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), + a ``TypeError`` was wrongly raised.
For all three methods such calculations are now done correctly (:issue:`16679`). +- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) +- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`) +- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`) +- Bug in :meth:`DataFrame.astype` to extension dtype may raise ``AttributeError`` (:issue:`22578`) +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ndarray as ``timedelta64[ns]`` dtype (:issue:`23114`) + +Strings +^^^^^^^ + +- +- +- + +Interval +^^^^^^^^ + +- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) +- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) +- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) +- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) + +Indexing +^^^^^^^^ + +- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) +- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) +- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) +- Bug in :class:`DataFrame` when setting values with
``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) +- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) +- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) +- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) +- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`) +- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) +- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`) +- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) +- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) +- Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) + +Missing +^^^^^^^ + +- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) +- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) +- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) +- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. 
(:issue:`22295`) + + +MultiIndex +^^^^^^^^^^ + +- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) +- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on an object indexed by a :class:`MultiIndex`) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) +- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) + +I/O +^^^ + +- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) +- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) +- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) +- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) +- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) +- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) +- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) +- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout.
(:issue:`21552`) +- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) + +Plotting +^^^^^^^^ + +- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` caused x-axis label and ticklabels to disappear when colorbar was on in IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) +- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) + +Groupby/Resample/Rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) +- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) +- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a + ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). +- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a + datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) +- Bug in :meth:`Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). +- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). +- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). +- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead.
(:issue:`22487`) +- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) + +Reshaping +^^^^^^^^^ + +- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) +- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) +- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) +- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) +- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) +- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) +- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) +- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) +- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) +- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) +- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) +- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) +-
Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`) +- Bug in :func:`merge_asof` when merging on columns containing null values (:issue:`22981`) + +.. _whatsnew_0240.bug_fixes.sparse: + +Sparse +^^^^^^ + +- Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) +- Bug in :meth:`Series.to_sparse` with Series already holding sparse data not constructing properly (:issue:`22389`) +- Providing a ``sparse_index`` to the SparseArray constructor no longer defaults the na-value to ``np.nan`` for all dtypes. The correct na_value for ``data.dtype`` is now used. +- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. +- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. +- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) +- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) + +Build Changes +^^^^^^^^^^^^^ + +- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) +- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here <https://hypothesis.readthedocs.io/en/latest/index.html>`_, and a pandas-specific introduction :ref:`in the contributing guide <using-hypothesis>`. (:issue:`22280`) +- + +Other +^^^^^ + +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range.
(:issue:`21258`) +- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) +- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) +- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. +- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) +- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d0c7b66978661..236707cf32209 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -36,7 +36,7 @@ ensure_float64, ensure_object, _get_dtype) -from pandas.core.dtypes.missing import na_value_for_dtype +from pandas.core.dtypes.missing import na_value_for_dtype, isnull from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) from pandas.util._decorators import Appender, Substitution @@ -1390,6 +1390,10 @@ def flip(xs): self.right_join_keys[-1]) tolerance = self.tolerance + # Check null values before merge + if isnull(left_values).sum() > 0 or isnull(right_values).sum() > 0: + raise MergeError('Merge keys cannot contain null values') + # we required sortedness in the join keys msg = "{side} keys must be sorted" if not Index(left_values).is_monotonic: From b31f1297e4bdbebe846003899a9596c90b7694f3 Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 16 Oct 2018 07:08:56 
+0100 Subject: [PATCH 02/14] update release file --- doc/source/whatsnew/v0.24.0.txt | 1 + doc/source/whatsnew/v0.24.1.txt | 1006 ------------------------------- 2 files changed, 1 insertion(+), 1006 deletions(-) delete mode 100644 doc/source/whatsnew/v0.24.1.txt diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 3053625721560..ba59e050cea70 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -972,6 +972,7 @@ Reshaping - Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) - Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) - Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue`22796`) +- Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values .. _whatsnew_0240.bug_fixes.sparse: diff --git a/doc/source/whatsnew/v0.24.1.txt b/doc/source/whatsnew/v0.24.1.txt deleted file mode 100644 index 9463458bfa64f..0000000000000 --- a/doc/source/whatsnew/v0.24.1.txt +++ /dev/null @@ -1,1006 +0,0 @@ -.. _whatsnew_0240: - -v0.24.0 (Month XX, 2018) ------------------------- - -.. warning:: - - Starting January 1, 2019, pandas feature releases will support Python 3 only. - See :ref:`install.dropping-27` for more. - -.. 
_whatsnew_0240.enhancements: - -New features -~~~~~~~~~~~~ -- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) - - -- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) - -- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing -the user to override the engine's default behavior to include or omit the -dataframe's indexes from the resulting Parquet file. (:issue:`20768`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) - - -.. _whatsnew_0240.enhancements.extension_array_operators: - -``ExtensionArray`` operator support -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison -operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: - -1. Define each of the operators on your ``ExtensionArray`` subclass. -2. Use an operator implementation from pandas that depends on operators that are already defined - on the underlying elements (scalars) of the ``ExtensionArray``. - -See the :ref:`ExtensionArray Operator Support -` documentation section for details on both -ways of adding operator support. - -.. _whatsnew_0240.enhancements.intna: - -Optional Integer NA Support -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. -Here is an example of the usage. - -We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. 
Specifying a list or array using the traditional missing value -marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) - -.. ipython:: python - - s = pd.Series([1, 2, np.nan], dtype='Int64') - s - - -Operations on these dtypes will propagate ``NaN`` as other pandas operations. - -.. ipython:: python - - # arithmetic - s + 1 - - # comparison - s == 1 - - # indexing - s.iloc[1:3] - - # operate with other dtypes - s + s.iloc[1:3].astype('Int8') - - # coerce when needed - s + 0.01 - -These dtypes can operate as part of of ``DataFrame``. - -.. ipython:: python - - df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) - df - df.dtypes - - -These dtypes can be merged & reshaped & casted. - -.. ipython:: python - - pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes - df['A'].astype(float) - -Reduction and groupby operations such as 'sum' work. - -.. ipython:: python - - df.sum() - df.groupby('B').A.sum() - -.. warning:: - - The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. - -.. _whatsnew_0240.enhancements.read_html: - -``read_html`` Enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. -Now it understands them, treating them as sequences of cells with the same -value. (:issue:`17054`) - -.. ipython:: python - - result = pd.read_html(""" - - - - - - - - - - - -
ABC
12
""") - -Previous Behavior: - -.. code-block:: ipython - - In [13]: result - Out [13]: - [ A B C - 0 1 2 NaN] - -Current Behavior: - -.. ipython:: python - - result - - -.. _whatsnew_0240.enhancements.interval: - -Storing Interval Data in Series and DataFrame -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an -:class:`IntervalIndex` like previously (:issue:`19453`). - -.. ipython:: python - - ser = pd.Series(pd.interval_range(0, 5)) - ser - ser.dtype - -Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, -this should result in better performance when storing an array of intervals in -a :class:`Series`. - -Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy -array, but rather an ``ExtensionArray``: - -.. ipython:: python - - ser.values - -This is the same behavior as ``Series.values`` for categorical data. See -:ref:`whatsnew_0240.api_breaking.interval_values` for more. - - -.. _whatsnew_0240.enhancements.other: - -Other Enhancements -^^^^^^^^^^^^^^^^^^ -- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) -- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) -- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. 
(:issue:`21227`) -- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) -- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) -- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) -- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to - reflect changes from the `Pandas-GBQ library version 0.6.0 - `__. - (:issue:`21627`, :issue:`22557`) -- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) -- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) -- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) -- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) -- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). - The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). -- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) -- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) -- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. 
(:issue:`21358`) -- :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) -- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). -- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). -- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). -- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). -- Compatibility with Matplotlib 3.0 (:issue:`22790`). - -.. _whatsnew_0240.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - -.. _whatsnew_0240.api_breaking.deps: - -Dependencies have increased minimum versions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We have updated our minimum supported versions of dependencies (:issue:`21242`). -If installed, we now require: - -+-----------------+-----------------+----------+ -| Package | Minimum Version | Required | -+=================+=================+==========+ -| numpy | 1.12.0 | X | -+-----------------+-----------------+----------+ -| bottleneck | 1.2.0 | | -+-----------------+-----------------+----------+ -| matplotlib | 2.0.0 | | -+-----------------+-----------------+----------+ -| numexpr | 2.6.1 | | -+-----------------+-----------------+----------+ -| pytables | 3.4.2 | | -+-----------------+-----------------+----------+ -| scipy | 0.18.1 | | -+-----------------+-----------------+----------+ - -.. 
_whatsnew_0240.api_breaking.interval_values: - -``IntervalIndex.values`` is now an ``IntervalArray`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an -``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). - -Previous Behavior: - -.. code-block:: ipython - - In [1]: idx = pd.interval_range(0, 4) - - In [2]: idx.values - Out[2]: - array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), - Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], - dtype=object) - -New Behavior: - -.. ipython:: python - - idx = pd.interval_range(0, 4) - idx.values - -This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. - -For situations where you need an ``ndarray`` of ``Interval`` objects, use -:meth:`numpy.asarray` or ``idx.astype(object)``. - -.. ipython:: python - - np.asarray(idx) - idx.values.astype(object) - -.. _whatsnew_0240.api.timezone_offset_parsing: - -Parsing Datetime Strings with Timezone Offsets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` -or :class:`DatetimeIndex` would automatically convert the datetime to UTC -without timezone localization. This is inconsistent from parsing the same -datetime string with :class:`Timestamp` which would preserve the UTC -offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC -offset in the ``tz`` attribute when all the datetime strings have the same -UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) - -*Previous Behavior*: - -.. 
code-block:: ipython - - In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") - Out[2]: Timestamp('2015-11-18 10:00:00') - - In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") - Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') - - # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) - In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) - -*Current Behavior*: - -.. ipython:: python - - pd.to_datetime("2015-11-18 15:30:00+05:30") - pd.Timestamp("2015-11-18 15:30:00+05:30") - -Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` - -.. ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) - -Parsing datetime strings with different UTC offsets will now create an Index of -``datetime.datetime`` objects with different UTC offsets - -.. ipython:: python - - idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - idx - idx[0] - idx[1] - -Passing ``utc=True`` will mimic the previous behavior but will correctly indicate -that the dates have been converted to UTC - -.. ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) - -.. _whatsnew_0240.api_breaking.calendarday: - -CalendarDay Offset -^^^^^^^^^^^^^^^^^^ - -:class:`Day` and associated frequency alias ``'D'`` were documented to represent -a calendar day; however, arithmetic and operations with :class:`Day` sometimes -respected absolute time instead (i.e. ``Day(n)`` and acted identically to ``Timedelta(days=n)``). - -*Previous Behavior*: - -.. 
code-block:: ipython - - - In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - - # Respects calendar arithmetic - In [3]: pd.date_range(start=ts, freq='D', periods=3) - Out[3]: - DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', - '2016-11-01 00:00:00+02:00'], - dtype='datetime64[ns, Europe/Helsinki]', freq='D') - - # Respects absolute arithmetic - In [4]: ts + pd.tseries.frequencies.to_offset('D') - Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') - -:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available -and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` -will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) -See the :ref:`documentation here ` for more information. - -Addition with :class:`CalendarDay` across a daylight savings time transition: - -.. ipython:: python - - ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - ts + pd.offsets.Day(1) - ts + pd.offsets.CalendarDay(1) - -.. _whatsnew_0240.api_breaking.period_end_time: - -Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The time values in :class:`Period` and :class:`PeriodIndex` objects are now set -to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, -:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, -or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) - -Previous Behavior: - -.. 
code-block:: ipython - - In [2]: p = pd.Period('2017-01-01', 'D') - In [3]: pi = pd.PeriodIndex([p]) - - In [4]: pd.Series(pi).dt.end_time[0] - Out[4]: Timestamp(2017-01-01 00:00:00) - - In [5]: p.end_time - Out[5]: Timestamp(2017-01-01 23:59:59.999999999) - -Current Behavior: - -Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as -is the case with :attr:`Period.end_time`, for example - -.. ipython:: python - - p = pd.Period('2017-01-01', 'D') - pi = pd.PeriodIndex([p]) - - pd.Series(pi).dt.end_time[0] - - p.end_time - -.. _whatsnew_0240.api_breaking.sparse_values: - -Sparse Data Structure Refactor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, -is now an extension array (:issue:`21978`, :issue:`19056`, :issue:`22835`). -To conform to this interface and for consistency with the rest of pandas, some API breaking -changes were made: - -- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. -- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. -- :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) -- ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`): - - * The default value of ``allow_fill`` has changed from ``False`` to ``True``. - * The ``out`` and ``mode`` parameters are now longer accepted (previously, this raised if they were specified). - * Passing a scalar for ``indices`` is no longer allowed. - -- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. 
``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer support combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray.
code-block:: ipython - - - In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') - - In [3]: ts - Out[3]: Timestamp('2018-06-11 18:01:14') - - In [4]: tic = pd.offsets.Hour(n=2, normalize=True) - ...: - - In [5]: tic - Out[5]: <2 * Hours> - - In [6]: ts + tic - Out[6]: Timestamp('2018-06-11 00:00:00') - - In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) - Out[7]: False - -*Current Behavior*: - -.. ipython:: python - - ts = pd.Timestamp('2018-06-11 18:01:14') - tic = pd.offsets.Hour(n=2) - ts + tic + tic + tic == ts + (tic + tic + tic) - - -.. _whatsnew_0240.api.datetimelike: - - -.. _whatsnew_0240.api.period_subtraction: - -Period Subtraction -^^^^^^^^^^^^^^^^^^ - -Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset``. -instead of an integer (:issue:`21314`) - -.. ipython:: python - - june = pd.Period('June 2018') - april = pd.Period('April 2018') - june - april - -Previous Behavior: - -.. code-block:: ipython - - In [2]: june = pd.Period('June 2018') - - In [3]: april = pd.Period('April 2018') - - In [4]: june - april - Out [4]: 2 - -Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return -an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` - -.. ipython:: python - - pi = pd.period_range('June 2018', freq='M', periods=3) - pi - pi[0] - -Previous Behavior: - -.. code-block:: ipython - - In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) - - In [3]: pi - pi[0] - Out[3]: Int64Index([0, 1, 2], dtype='int64') - - -.. _whatsnew_0240.api.timedelta64_subtract_nan: - -Addition/Subtraction of ``NaN`` from :class:`DataFrame` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Adding or subtracting ``NaN`` from a :class:`DataFrame` column with -``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning -all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and -``Series`` behavior (:issue:`22163`) - -.. 
ipython:: python - :okexcept: - - df = pd.DataFrame([pd.Timedelta(days=1)]) - df - np.nan - -Previous Behavior: - -.. code-block:: ipython - - In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) - - In [5]: df - np.nan - Out[5]: - 0 - 0 NaT - - -.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: - -DataFrame Arithmetic Operations Broadcasting Changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`DataFrame` arithmetic operations when operating with 2-dimensional -``np.ndarray`` objects now broadcast in the same way as ``np.ndarray``s -broadcast. (:issue:`23000`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: arr = np.arange(6).reshape(3, 2) - In [4]: df = pd.DataFrame(arr) - In [5]: df + arr[[0], :] # 1 row, 2 columns - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) - In [6]: df + arr[:, [1]] # 1 column, 3 rows - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) - -*Current Behavior*: - -.. ipython:: python - arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr) - df - -.. ipython:: python - df + arr[[0], :] # 1 row, 2 columns - df + arr[:, [1]] # 1 column, 3 rows - - -.. _whatsnew_0240.api.extension: - -ExtensionType Changes -^^^^^^^^^^^^^^^^^^^^^ - -**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** - -Pandas now requires that extension dtypes be hashable. The base class implements -a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should -update the ``ExtensionDtype._metadata`` tuple to match the signature of your -``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). - -**Other changes** - -- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) -- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. 
``decimal`` would instantiate a registered ``DecimalDtype``; furthermore - the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) -- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) -- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). -- The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) -- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) -- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) -- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) -- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) -- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). -- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) -- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) -- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) -- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). -- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) - -.. 
_whatsnew_0240.api.incompatibilities: - -Series and Index Data-Dtype Incompatibilities -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``Series`` and ``Index`` constructors now raise when the -data is incompatible with a passed ``dtype=`` (:issue:`15832`) - -Previous Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - 0 18446744073709551615 - dtype: uint64 - -Current Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - ... - OverflowError: Trying to coerce negative values to unsigned integers - -.. _whatsnew_0240.api.crosstab_dtypes - -Crosstab Preserves Dtypes -^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`crosstab` will preserve now dtypes in some cases that previously would -cast from integer dtype to floating dtype (:issue:`22019`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - Out[4]: - b 3 4 - a - 1 0.5 0.0 - 2 0.5 1.0 - -Current Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - -Datetimelike API Changes -^^^^^^^^^^^^^^^^^^^^^^^^ - -- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) -- :class:`DateOffset` objects are now immutable. 
Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) -- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) -- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) -- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) - -.. _whatsnew_0240.api.other: - -Other API Changes -^^^^^^^^^^^^^^^^^ - -- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) -- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in - :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of - a ``KeyError`` (:issue:`21678`). -- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) -- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) -- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) -- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) -- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) -- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- 
Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - -.. _whatsnew_0240.deprecations: - -Deprecations -~~~~~~~~~~~~ - -- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) -- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) -- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) -- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) -- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) -- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) -- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) -- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain - many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) -- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) -- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`) - -.. 
_whatsnew_0240.prior_deprecations: - -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) -- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) -- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) -- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) -- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) -- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) -- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) -- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) -- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) -- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) -- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) - -.. _whatsnew_0240.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, - both when indexing by label (using .loc) and position(.iloc). - Likewise, slicing a ``CategoricalIndex`` itself (i.e. 
``ci[100:200]``) shows similar speed improvements (:issue:`21659`) -- Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) -- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) -- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` - (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` - is likewise much faster (:issue:`21369`, :issue:`21508`) -- Improved performance of :meth:`HDFStore.groups` (and dependent functions like - :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster) - (:issue:`21372`) -- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) -- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) -- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) - - -.. _whatsnew_0240.docs: - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`) -- -- - -.. _whatsnew_0240.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -Categorical -^^^^^^^^^^^ - -- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. -- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) -- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). 
- -Datetimelike -^^^^^^^^^^^^ - -- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) -- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) -- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) -- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) -- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) -- Bug in 
:class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) -- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) -- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) -- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) -- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) -- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) -- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) -- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) - -Timedelta -^^^^^^^^^ -- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) -- Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) -- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) -- Bug in :class:`Series` with numeric dtype when adding or subtracting an an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) -- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) -- Bug in :class:`TimedeltaIndex` 
incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) -- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) -- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) -- - -Timezones -^^^^^^^^^ - -- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) -- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) -- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) -- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) -- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) -- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) -- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) -- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) -- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) -- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) -- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) -- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware 
``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) -- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) -- Bug in :func:`Dataframe.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) -- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp`s constructed with the ``replace`` method across DST (:issue:`18785`) -- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) -- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) -- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) - -Offsets -^^^^^^^ - -- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) -- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`) -- - -Numeric -^^^^^^^ - -- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) -- Bug in :func:`factorize` fails with read-only array (:issue:`12813`) -- Fixed bug in :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) -- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where, - when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), - a ``TypeError`` was wrongly raised. 
For all three methods such calculation are now done correctly. (:issue:`16679`). -- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) -- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`) -- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`) -- Bug in :meth:`DataFrame.astype` to extension dtype may raise ``AttributeError`` (:issue:`22578`) -- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the narray as ``timedelta64[ns]`` dtype (:issue:`23114`) - -Strings -^^^^^^^ - -- -- -- - -Interval -^^^^^^^^ - -- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) -- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) -- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) -- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) - -Indexing -^^^^^^^^ - -- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) -- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) -- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) -- Bug in :class:`DataFrame` when setting values with 
``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) -- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) -- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) -- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) -- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`) -- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) -- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`) -- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) -- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) -- Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) - -Missing -^^^^^^^ - -- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) -- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) -- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) -- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. 
(:issue:`22295`) - - -MultiIndex -^^^^^^^^^^ - -- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) -- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`ed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) -- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) - -I/O -^^^ - -- :func:`read_html()` no longer ignores all-whitespace ```` within ```` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) -- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) -- :func:`read_csv()` and func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) -- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) -- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) -- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) -- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) -- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. 
(:issue:`21552`) -- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) - -Plotting -^^^^^^^^ - -- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` caused x-axis label and ticklabels to disappear when colorbar was on in IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) -- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) - -Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - -- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) -- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) -- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a - ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). -- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a - datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) -- Bug in :meth:`Resampler.apply` when passing postiional arguments to applied func (:issue:`14615`). -- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). -- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). -- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. 
(:issue:`22487`) -- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) - -Reshaping -^^^^^^^^^ - -- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) -- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) -- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) -- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) -- Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) -- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) -- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) -- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) -- Bug in :meth:`Series.replace` and meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) -- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) -- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) -- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) -- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) -- 
Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue`22796`) -- Bug in :func:`merge_asof` when merging on columns containing nulls values (:issue:`22981`) - -.. _whatsnew_0240.bug_fixes.sparse: - -Sparse -^^^^^^ - -- Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) -- Bug in :meth:`Series.to_sparse` with Series already holding sparse data not constructing properly (:issue:`22389`) -- Providing a ``sparse_index`` to the SparseArray constructor no longer defaults the na-value to ``np.nan`` for all dtypes. The correct na_value for ``data.dtype`` is now used. -- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. -- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. -- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) -- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) - -Build Changes -^^^^^^^^^^^^^ - -- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) -- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here `_, and a pandas-specific introduction :ref:`in the contributing guide `. (:issue:`22280`) -- - -Other -^^^^^ - -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. 
(:issue:`21258`) -- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) -- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) -- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. -- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) -- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) From 47462c386a7a22a83fba244df342d937cfcf3a6c Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 16 Oct 2018 07:22:24 +0100 Subject: [PATCH 03/14] move checking of error to later position and update error type --- pandas/core/reshape/merge.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 236707cf32209..20c4afb5641ab 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1390,16 +1390,21 @@ def flip(xs): self.right_join_keys[-1]) tolerance = self.tolerance - # Check null values before merge - if isnull(left_values).sum() > 0 or isnull(right_values).sum() > 0: - raise MergeError('Merge keys cannot contain null values') + # we required sortedness and non-missingness in the join keys + msg_sorted = "{side} keys must be sorted" + msg_missings = "Merge keys contain null values in {side} side" - # we required sortedness in the join keys - msg = "{side} keys must be sorted" if 
not Index(left_values).is_monotonic: - raise ValueError(msg.format(side='left')) + if isnull(left_values).sum() > 0: + raise ValueError(msg_missings.format(side='left')) + else: + raise ValueError(msg_sorted.format(side='left')) + if not Index(right_values).is_monotonic: - raise ValueError(msg.format(side='right')) + if isnull(right_values).sum() > 0: + raise ValueError(msg_missings.format(side='right')) + else: + raise ValueError(msg_sorted.format(side='right')) # initial type conversion as needed if needs_i8_conversion(left_values): From 8fe797dc8fdb29b83b14d49b602871e8d2806bf2 Mon Sep 17 00:00:00 2001 From: Tola Date: Tue, 16 Oct 2018 20:21:16 +0100 Subject: [PATCH 04/14] BUG: merge_asof on columns containing nulls --- pandas/tests/reshape/merge/test_merge_asof.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index c75a6a707cafc..cc1cac6866ff7 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1007,3 +1007,26 @@ def test_merge_datatype_error(self): with tm.assert_raises_regex(MergeError, msg): merge_asof(left, right, on='a') + + def test_merge_on_nans_int(self): + """ Test merging on integer columns with nans throws a merge error """ + left = pd.DataFrame({'a': [1, 5, 10, 12, np.nan], + 'left_val': ['a', 'b', 'c', 'd', 'e']}) + right = pd.DataFrame({'a': [1, 5, 10, 12, np.nan], + 'right_val': [1, 6, 11, 15, 16]}) + + with pytest.raises(ValueError): + merge_asof(left, right, on='a') + + def test_merge_on_nans_datetime(self): + """ Test merging on datetime columns with nans throws a merge error """ + top_left = pd.DataFrame(pd.date_range('20130101', periods=5), columns=['A']) + bottom_left = pd.DataFrame([np.nan], columns=['A']) + left = pd.concat([top_left, bottom_left]) + + top_right = pd.DataFrame(pd.date_range('20150601', periods=5), columns=['A']) + bottom_right = 
pd.DataFrame([np.nan], columns=['A']) + right = pd.concat([bottom_right, top_right]) + + with pytest.raises(ValueError): + merge_asof(left, right, on='A') From cfc0152056deebd9d963bbcfc05a629384320611 Mon Sep 17 00:00:00 2001 From: Tola Date: Mon, 15 Oct 2018 21:19:11 +0100 Subject: [PATCH 05/14] BUG: merge_asof on columns containing nulls --- doc/source/whatsnew/v0.24.1.txt | 1006 +++++++++++++++++++++++++++++++ pandas/core/reshape/merge.py | 6 +- 2 files changed, 1011 insertions(+), 1 deletion(-) create mode 100644 doc/source/whatsnew/v0.24.1.txt diff --git a/doc/source/whatsnew/v0.24.1.txt b/doc/source/whatsnew/v0.24.1.txt new file mode 100644 index 0000000000000..9463458bfa64f --- /dev/null +++ b/doc/source/whatsnew/v0.24.1.txt @@ -0,0 +1,1006 @@ +.. _whatsnew_0240: + +v0.24.0 (Month XX, 2018) +------------------------ + +.. warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. _whatsnew_0240.enhancements: + +New features +~~~~~~~~~~~~ +- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) + + +- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) + +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing +the user to override the engine's default behavior to include or omit the +dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) + + +.. 
_whatsnew_0240.enhancements.extension_array_operators: + +``ExtensionArray`` operator support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison +operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: + +1. Define each of the operators on your ``ExtensionArray`` subclass. +2. Use an operator implementation from pandas that depends on operators that are already defined + on the underlying elements (scalars) of the ``ExtensionArray``. + +See the :ref:`ExtensionArray Operator Support +<extending.extension.operator>` documentation section for details on both +ways of adding operator support. + +.. _whatsnew_0240.enhancements.intna: + +Optional Integer NA Support +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has gained the ability to hold integer dtypes with missing values. This long-requested feature is enabled through the use of :ref:`extension types <extending.extension-types>`. +Here is an example of the usage. + +We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value +marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) + +.. ipython:: python + + s = pd.Series([1, 2, np.nan], dtype='Int64') + s + + +Operations on these dtypes will propagate ``NaN`` as other pandas operations. + +.. ipython:: python + + # arithmetic + s + 1 + + # comparison + s == 1 + + # indexing + s.iloc[1:3] + + # operate with other dtypes + s + s.iloc[1:3].astype('Int8') + + # coerce when needed + s + 0.01 + +These dtypes can operate as part of a ``DataFrame``. + +.. ipython:: python + + df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) + df + df.dtypes + + +These dtypes can be merged, reshaped, and cast. + +.. 
ipython:: python + + pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes + df['A'].astype(float) + +Reduction and groupby operations such as 'sum' work. + +.. ipython:: python + + df.sum() + df.groupby('B').A.sum() + +.. warning:: + + The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. + +.. _whatsnew_0240.enhancements.read_html: + +``read_html`` Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. +Now it understands them, treating them as sequences of cells with the same +value. (:issue:`17054`) + +.. ipython:: python + + result = pd.read_html(""" + + + + + + + + + + + +
ABC
12
""") + +Previous Behavior: + +.. code-block:: ipython + + In [13]: result + Out [13]: + [ A B C + 0 1 2 NaN] + +Current Behavior: + +.. ipython:: python + + result + + +.. _whatsnew_0240.enhancements.interval: + +Storing Interval Data in Series and DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an +:class:`IntervalIndex` like previously (:issue:`19453`). + +.. ipython:: python + + ser = pd.Series(pd.interval_range(0, 5)) + ser + ser.dtype + +Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, +this should result in better performance when storing an array of intervals in +a :class:`Series`. + +Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy +array, but rather an ``ExtensionArray``: + +.. ipython:: python + + ser.values + +This is the same behavior as ``Series.values`` for categorical data. See +:ref:`whatsnew_0240.api_breaking.interval_values` for more. + + +.. _whatsnew_0240.enhancements.other: + +Other Enhancements +^^^^^^^^^^^^^^^^^^ +- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) +- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) +- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. 
(:issue:`21227`) +- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) +- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) +- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) +- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to + reflect changes from the `Pandas-GBQ library version 0.6.0 + `__. + (:issue:`21627`, :issue:`22557`) +- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) +- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) +- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) +- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). + The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). +- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) +- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) +- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. 
(:issue:`21358`) +- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) +- :class:`Resampler` is now iterable like :class:`GroupBy` (:issue:`15314`). +- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` method (:issue:`15023`). +- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). +- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). +- Compatibility with Matplotlib 3.0 (:issue:`22790`). + +.. _whatsnew_0240.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) + +.. _whatsnew_0240.api_breaking.deps: + +Dependencies have increased minimum versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We have updated our minimum supported versions of dependencies (:issue:`21242`). +If installed, we now require: + ++-----------------+-----------------+----------+ +| Package | Minimum Version | Required | ++=================+=================+==========+ +| numpy | 1.12.0 | X | ++-----------------+-----------------+----------+ +| bottleneck | 1.2.0 | | ++-----------------+-----------------+----------+ +| matplotlib | 2.0.0 | | ++-----------------+-----------------+----------+ +| numexpr | 2.6.1 | | ++-----------------+-----------------+----------+ +| pytables | 3.4.2 | | ++-----------------+-----------------+----------+ +| scipy | 0.18.1 | | ++-----------------+-----------------+----------+ + +.. 
_whatsnew_0240.api_breaking.interval_values: + +``IntervalIndex.values`` is now an ``IntervalArray`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an +``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). + +Previous Behavior: + +.. code-block:: ipython + + In [1]: idx = pd.interval_range(0, 4) + + In [2]: idx.values + Out[2]: + array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), + Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], + dtype=object) + +New Behavior: + +.. ipython:: python + + idx = pd.interval_range(0, 4) + idx.values + +This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. + +For situations where you need an ``ndarray`` of ``Interval`` objects, use +:meth:`numpy.asarray` or ``idx.astype(object)``. + +.. ipython:: python + + np.asarray(idx) + idx.values.astype(object) + +.. _whatsnew_0240.api.timezone_offset_parsing: + +Parsing Datetime Strings with Timezone Offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` +or :class:`DatetimeIndex` would automatically convert the datetime to UTC +without timezone localization. This is inconsistent from parsing the same +datetime string with :class:`Timestamp` which would preserve the UTC +offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC +offset in the ``tz`` attribute when all the datetime strings have the same +UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) + +*Previous Behavior*: + +.. 
code-block:: ipython + + In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") + Out[2]: Timestamp('2015-11-18 10:00:00') + + In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") + Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') + + # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) + In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) + +*Current Behavior*: + +.. ipython:: python + + pd.to_datetime("2015-11-18 15:30:00+05:30") + pd.Timestamp("2015-11-18 15:30:00+05:30") + +Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) + +Parsing datetime strings with different UTC offsets will now create an Index of +``datetime.datetime`` objects with different UTC offsets + +.. ipython:: python + + idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + idx + idx[0] + idx[1] + +Passing ``utc=True`` will mimic the previous behavior but will correctly indicate +that the dates have been converted to UTC + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) + +.. _whatsnew_0240.api_breaking.calendarday: + +CalendarDay Offset +^^^^^^^^^^^^^^^^^^ + +:class:`Day` and associated frequency alias ``'D'`` were documented to represent +a calendar day; however, arithmetic and operations with :class:`Day` sometimes +respected absolute time instead (i.e. ``Day(n)`` and acted identically to ``Timedelta(days=n)``). + +*Previous Behavior*: + +.. 
code-block:: ipython + + + In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + + # Respects calendar arithmetic + In [3]: pd.date_range(start=ts, freq='D', periods=3) + Out[3]: + DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', + '2016-11-01 00:00:00+02:00'], + dtype='datetime64[ns, Europe/Helsinki]', freq='D') + + # Respects absolute arithmetic + In [4]: ts + pd.tseries.frequencies.to_offset('D') + Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') + +:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available +and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` +will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) +See the :ref:`documentation here ` for more information. + +Addition with :class:`CalendarDay` across a daylight savings time transition: + +.. ipython:: python + + ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + ts + pd.offsets.Day(1) + ts + pd.offsets.CalendarDay(1) + +.. _whatsnew_0240.api_breaking.period_end_time: + +Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The time values in :class:`Period` and :class:`PeriodIndex` objects are now set +to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, +:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, +or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) + +Previous Behavior: + +.. 
code-block:: ipython + + In [2]: p = pd.Period('2017-01-01', 'D') + In [3]: pi = pd.PeriodIndex([p]) + + In [4]: pd.Series(pi).dt.end_time[0] + Out[4]: Timestamp(2017-01-01 00:00:00) + + In [5]: p.end_time + Out[5]: Timestamp(2017-01-01 23:59:59.999999999) + +Current Behavior: + +Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as +is the case with :attr:`Period.end_time`, for example + +.. ipython:: python + + p = pd.Period('2017-01-01', 'D') + pi = pd.PeriodIndex([p]) + + pd.Series(pi).dt.end_time[0] + + p.end_time + +.. _whatsnew_0240.api_breaking.sparse_values: + +Sparse Data Structure Refactor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, +is now an extension array (:issue:`21978`, :issue:`19056`, :issue:`22835`). +To conform to this interface and for consistency with the rest of pandas, some API breaking +changes were made: + +- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. +- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. +- :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) +- ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`): + + * The default value of ``allow_fill`` has changed from ``False`` to ``True``. + * The ``out`` and ``mode`` parameters are no longer accepted (previously, this raised if they were specified). + * Passing a scalar for ``indices`` is no longer allowed. + +- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. 
+- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. +- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. + + +Some new warnings are issued for operations that require or are likely to materialize a large dense array: + +- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. +- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used. + +In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. + +.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient: + +Raise ValueError in ``DataFrame.to_dict(orient='index')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with +``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`) + +.. ipython:: python + :okexcept: + + df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) + df + + df.to_dict(orient='index') + +.. _whatsnew_0240.api.datetimelike.normalize: + +Tick DateOffset Normalize Restrictions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, +:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with +``normalize=True`` is no longer supported. This prevents unexpected behavior +where addition could fail to be monotone or associative. (:issue:`21427`) + +*Previous Behavior*: + +.. 
code-block:: ipython + + + In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') + + In [3]: ts + Out[3]: Timestamp('2018-06-11 18:01:14') + + In [4]: tic = pd.offsets.Hour(n=2, normalize=True) + ...: + + In [5]: tic + Out[5]: <2 * Hours> + + In [6]: ts + tic + Out[6]: Timestamp('2018-06-11 00:00:00') + + In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) + Out[7]: False + +*Current Behavior*: + +.. ipython:: python + + ts = pd.Timestamp('2018-06-11 18:01:14') + tic = pd.offsets.Hour(n=2) + ts + tic + tic + tic == ts + (tic + tic + tic) + + +.. _whatsnew_0240.api.datetimelike: + + +.. _whatsnew_0240.api.period_subtraction: + +Period Subtraction +^^^^^^^^^^^^^^^^^^ + +Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset`` +instead of an integer (:issue:`21314`) + +.. ipython:: python + + june = pd.Period('June 2018') + april = pd.Period('April 2018') + june - april + +Previous Behavior: + +.. code-block:: ipython + + In [2]: june = pd.Period('June 2018') + + In [3]: april = pd.Period('April 2018') + + In [4]: june - april + Out [4]: 2 + +Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return +an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` + +.. ipython:: python + + pi = pd.period_range('June 2018', freq='M', periods=3) + pi - pi[0] + +Previous Behavior: + +.. code-block:: ipython + + In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) + + In [3]: pi - pi[0] + Out[3]: Int64Index([0, 1, 2], dtype='int64') + + +.. _whatsnew_0240.api.timedelta64_subtract_nan: + +Addition/Subtraction of ``NaN`` from :class:`DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Adding or subtracting ``NaN`` from a :class:`DataFrame` column with +``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning +all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and +``Series`` behavior (:issue:`22163`) + +.. 
ipython:: python + :okexcept: + + df = pd.DataFrame([pd.Timedelta(days=1)]) + df - np.nan + +Previous Behavior: + +.. code-block:: ipython + + In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) + + In [5]: df - np.nan + Out[5]: + 0 + 0 NaT + + +.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: + +DataFrame Arithmetic Operations Broadcasting Changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:class:`DataFrame` arithmetic operations when operating with 2-dimensional +``np.ndarray`` objects now broadcast in the same way as ``np.ndarray``s +broadcast. (:issue:`23000`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: arr = np.arange(6).reshape(3, 2) + In [4]: df = pd.DataFrame(arr) + In [5]: df + arr[[0], :] # 1 row, 2 columns + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) + In [6]: df + arr[:, [1]] # 1 column, 3 rows + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) + +*Current Behavior*: + +.. ipython:: python + arr = np.arange(6).reshape(3, 2) + df = pd.DataFrame(arr) + df + +.. ipython:: python + df + arr[[0], :] # 1 row, 2 columns + df + arr[:, [1]] # 1 column, 3 rows + + +.. _whatsnew_0240.api.extension: + +ExtensionType Changes +^^^^^^^^^^^^^^^^^^^^^ + +**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** + +Pandas now requires that extension dtypes be hashable. The base class implements +a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should +update the ``ExtensionDtype._metadata`` tuple to match the signature of your +``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). + +**Other changes** + +- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) +- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. 
``decimal`` would instantiate a registered ``DecimalDtype``; furthermore + the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) +- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) +- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). +- The ``ExtensionArray`` constructor, ``_from_sequence``, now takes the keyword arg ``copy=False`` (:issue:`21185`) +- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) +- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) +- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) +- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) +- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`). +- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) +- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) +- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) +- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) +- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). +- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) + +.. 
_whatsnew_0240.api.incompatibilities: + +Series and Index Data-Dtype Incompatibilities +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Series`` and ``Index`` constructors now raise when the +data is incompatible with a passed ``dtype=`` (:issue:`15832`) + +Previous Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + 0 18446744073709551615 + dtype: uint64 + +Current Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + ... + OverflowError: Trying to coerce negative values to unsigned integers + +.. _whatsnew_0240.api.crosstab_dtypes: + +Crosstab Preserves Dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`crosstab` will now preserve dtypes in some cases that previously would +cast from integer dtype to floating dtype (:issue:`22019`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + Out[4]: + b 3 4 + a + 1 0.5 0.0 + 2 0.5 1.0 + +Current Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + +Datetimelike API Changes +^^^^^^^^^^^^^^^^^^^^^^^^ + +- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) +- :class:`DateOffset` objects are now immutable. 
Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) +- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) +- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) +- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) + +.. _whatsnew_0240.api.other: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) +- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in + :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of + a ``KeyError`` (:issue:`21678`). +- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) +- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) +- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) +- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) +- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) +- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) +- 
Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) +- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) + +.. _whatsnew_0240.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) +- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) +- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) +- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) +- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) +- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) +- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) +- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain + many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) +- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) +- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`) + +.. 
_whatsnew_0240.prior_deprecations: + +Removal of prior version deprecations/changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) +- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) +- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) +- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) +- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) +- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) +- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) +- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) +- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) +- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) +- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) + +.. _whatsnew_0240.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, + both when indexing by label (using .loc) and position(.iloc). + Likewise, slicing a ``CategoricalIndex`` itself (i.e. 
``ci[100:200]``) shows similar speed improvements (:issue:`21659`) +- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) +- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) +- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` + (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` + is likewise much faster (:issue:`21369`, :issue:`21508`) +- Improved performance of :meth:`HDFStore.groups` (and dependent functions like + :meth:`~HDFStore.keys`, i.e. ``x in store`` checks are much faster) + (:issue:`21372`) +- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) +- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) +- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) + + +.. _whatsnew_0240.docs: + +Documentation Changes +~~~~~~~~~~~~~~~~~~~~~ + +- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`) +- +- + +.. _whatsnew_0240.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. +- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) +- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). 
+ +Datetimelike +^^^^^^^^^^^^ + +- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) +- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) +- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) +- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) +- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) +- Bug in 
:class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) +- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) +- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) +- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) +- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) +- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) that could give rise to loss of precision (:issue:`22591`) +- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) +- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) + +Timedelta +^^^^^^^^^ +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) +- Bug in adding an :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) +- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) +- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) +- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) +- Bug in :class:`TimedeltaIndex` 
incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) +- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) +- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) +- + +Timezones +^^^^^^^^^ + +- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) +- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) +- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) +- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) +- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) +- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) +- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) +- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) +- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) +- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) +- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) +- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware 
``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) +- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) +- Bug in :func:`DataFrame.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) +- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp` objects constructed with the ``replace`` method across DST (:issue:`18785`) +- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) +- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) +- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) + +Offsets +^^^^^^^ + +- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) +- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`) +- + +Numeric +^^^^^^^ + +- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) +- Bug in :func:`factorize` fails with read-only array (:issue:`12813`) +- Fixed bug in :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) +- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where, + when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), + a ``TypeError`` was wrongly raised. 
For all three methods such calculations are now done correctly (:issue:`16679`). +- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) +- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`) +- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`) +- Bug in :meth:`DataFrame.astype` to extension dtype may raise ``AttributeError`` (:issue:`22578`) +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ``ndarray`` as ``timedelta64[ns]`` dtype (:issue:`23114`) + +Strings +^^^^^^^ + +- +- +- + +Interval +^^^^^^^^ + +- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) +- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) +- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) +- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) + +Indexing +^^^^^^^^ + +- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) +- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) +- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) +- Bug in :class:`DataFrame` when setting values with 
``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) +- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) +- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) +- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) +- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`) +- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) +- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`) +- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) +- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) +- Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) + +Missing +^^^^^^^ + +- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) +- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) +- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) +- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. 
(:issue:`22295`) + + +MultiIndex +^^^^^^^^^^ + +- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) +- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`-indexed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked for a label which is present in the ``levels`` but is unused (:issue:`22221`) +- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) + +I/O +^^^ + +- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) +- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) +- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) +- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) +- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) +- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) +- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) +- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. 
(:issue:`21552`) +- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) + +Plotting +^^^^^^^^ + +- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` caused x-axis label and ticklabels to disappear when colorbar was on in IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) +- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) + +Groupby/Resample/Rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) +- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) +- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a + ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). +- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a + datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) +- Bug in :meth:`Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). +- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). +- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). +- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. 
(:issue:`22487`) +- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) + +Reshaping +^^^^^^^^^ + +- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) +- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) +- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) +- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) +- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) +- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) +- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) +- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) +- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) +- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) +- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) +- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) +- 
Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`) +- Bug in :func:`merge_asof` when merging on columns containing null values (:issue:`22981`) + +.. _whatsnew_0240.bug_fixes.sparse: + +Sparse +^^^^^^ + +- Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) +- Bug in :meth:`Series.to_sparse` with Series already holding sparse data not constructing properly (:issue:`22389`) +- Providing a ``sparse_index`` to the SparseArray constructor no longer defaults the na-value to ``np.nan`` for all dtypes. The correct na_value for ``data.dtype`` is now used. +- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. +- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. +- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) +- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) + +Build Changes +^^^^^^^^^^^^^ + +- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) +- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here <https://hypothesis.readthedocs.io/en/latest/index.html>`_, and a pandas-specific introduction :ref:`in the contributing guide <using-hypothesis>`. (:issue:`22280`) +- + +Other +^^^^^ + +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. 
(:issue:`21258`) +- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) +- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) +- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. +- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) +- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index d0c7b66978661..236707cf32209 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -36,7 +36,7 @@ ensure_float64, ensure_object, _get_dtype) -from pandas.core.dtypes.missing import na_value_for_dtype +from pandas.core.dtypes.missing import na_value_for_dtype, isnull from pandas.core.internals import (items_overlap_with_suffix, concatenate_block_managers) from pandas.util._decorators import Appender, Substitution @@ -1390,6 +1390,10 @@ def flip(xs): self.right_join_keys[-1]) tolerance = self.tolerance + # Check null values before merge + if isnull(left_values).sum() > 0 or isnull(right_values).sum() > 0: + raise MergeError('Merge keys cannot contain null values') + # we required sortedness in the join keys msg = "{side} keys must be sorted" if not Index(left_values).is_monotonic: From 1c877bb6373c84468981f46d194c913eee91a095 Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 16 Oct 2018 07:08:56 
+0100 Subject: [PATCH 06/14] update release file --- doc/source/whatsnew/v0.24.0.txt | 1 + doc/source/whatsnew/v0.24.1.txt | 1006 ------------------------------- 2 files changed, 1 insertion(+), 1006 deletions(-) delete mode 100644 doc/source/whatsnew/v0.24.1.txt diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 3053625721560..ba59e050cea70 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -972,6 +972,7 @@ Reshaping - Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) - Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) - Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue`22796`) +- Bug in :func:`merge_asof` where a confusing error message was raised when attempting to merge with missing values .. _whatsnew_0240.bug_fixes.sparse: diff --git a/doc/source/whatsnew/v0.24.1.txt b/doc/source/whatsnew/v0.24.1.txt deleted file mode 100644 index 9463458bfa64f..0000000000000 --- a/doc/source/whatsnew/v0.24.1.txt +++ /dev/null @@ -1,1006 +0,0 @@ -.. _whatsnew_0240: - -v0.24.0 (Month XX, 2018) ------------------------- - -.. warning:: - - Starting January 1, 2019, pandas feature releases will support Python 3 only. - See :ref:`install.dropping-27` for more. - -..
_whatsnew_0240.enhancements: - -New features -~~~~~~~~~~~~ -- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) - - -- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) - -- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing -the user to override the engine's default behavior to include or omit the -dataframe's indexes from the resulting Parquet file. (:issue:`20768`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) - - -.. _whatsnew_0240.enhancements.extension_array_operators: - -``ExtensionArray`` operator support -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison -operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: - -1. Define each of the operators on your ``ExtensionArray`` subclass. -2. Use an operator implementation from pandas that depends on operators that are already defined - on the underlying elements (scalars) of the ``ExtensionArray``. - -See the :ref:`ExtensionArray Operator Support -` documentation section for details on both -ways of adding operator support. - -.. _whatsnew_0240.enhancements.intna: - -Optional Integer NA Support -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. -Here is an example of the usage. - -We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. 
Specifying a list or array using the traditional missing value -marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) - -.. ipython:: python - - s = pd.Series([1, 2, np.nan], dtype='Int64') - s - - -Operations on these dtypes will propagate ``NaN`` as other pandas operations. - -.. ipython:: python - - # arithmetic - s + 1 - - # comparison - s == 1 - - # indexing - s.iloc[1:3] - - # operate with other dtypes - s + s.iloc[1:3].astype('Int8') - - # coerce when needed - s + 0.01 - -These dtypes can operate as part of of ``DataFrame``. - -.. ipython:: python - - df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) - df - df.dtypes - - -These dtypes can be merged & reshaped & casted. - -.. ipython:: python - - pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes - df['A'].astype(float) - -Reduction and groupby operations such as 'sum' work. - -.. ipython:: python - - df.sum() - df.groupby('B').A.sum() - -.. warning:: - - The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. - -.. _whatsnew_0240.enhancements.read_html: - -``read_html`` Enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. -Now it understands them, treating them as sequences of cells with the same -value. (:issue:`17054`) - -.. ipython:: python - - result = pd.read_html(""" - - - - - - - - - - - -
ABC
12
""") - -Previous Behavior: - -.. code-block:: ipython - - In [13]: result - Out [13]: - [ A B C - 0 1 2 NaN] - -Current Behavior: - -.. ipython:: python - - result - - -.. _whatsnew_0240.enhancements.interval: - -Storing Interval Data in Series and DataFrame -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an -:class:`IntervalIndex` like previously (:issue:`19453`). - -.. ipython:: python - - ser = pd.Series(pd.interval_range(0, 5)) - ser - ser.dtype - -Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, -this should result in better performance when storing an array of intervals in -a :class:`Series`. - -Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy -array, but rather an ``ExtensionArray``: - -.. ipython:: python - - ser.values - -This is the same behavior as ``Series.values`` for categorical data. See -:ref:`whatsnew_0240.api_breaking.interval_values` for more. - - -.. _whatsnew_0240.enhancements.other: - -Other Enhancements -^^^^^^^^^^^^^^^^^^ -- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) -- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) -- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. 
(:issue:`21227`) -- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) -- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) -- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) -- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to - reflect changes from the `Pandas-GBQ library version 0.6.0 - `__. - (:issue:`21627`, :issue:`22557`) -- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) -- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) -- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) -- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) -- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). - The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). -- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) -- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) -- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. 
(:issue:`21358`) -- :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) -- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). -- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). -- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). -- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). -- Compatibility with Matplotlib 3.0 (:issue:`22790`). - -.. _whatsnew_0240.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - -.. _whatsnew_0240.api_breaking.deps: - -Dependencies have increased minimum versions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We have updated our minimum supported versions of dependencies (:issue:`21242`). -If installed, we now require: - -+-----------------+-----------------+----------+ -| Package | Minimum Version | Required | -+=================+=================+==========+ -| numpy | 1.12.0 | X | -+-----------------+-----------------+----------+ -| bottleneck | 1.2.0 | | -+-----------------+-----------------+----------+ -| matplotlib | 2.0.0 | | -+-----------------+-----------------+----------+ -| numexpr | 2.6.1 | | -+-----------------+-----------------+----------+ -| pytables | 3.4.2 | | -+-----------------+-----------------+----------+ -| scipy | 0.18.1 | | -+-----------------+-----------------+----------+ - -.. 
_whatsnew_0240.api_breaking.interval_values: - -``IntervalIndex.values`` is now an ``IntervalArray`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an -``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). - -Previous Behavior: - -.. code-block:: ipython - - In [1]: idx = pd.interval_range(0, 4) - - In [2]: idx.values - Out[2]: - array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), - Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], - dtype=object) - -New Behavior: - -.. ipython:: python - - idx = pd.interval_range(0, 4) - idx.values - -This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. - -For situations where you need an ``ndarray`` of ``Interval`` objects, use -:meth:`numpy.asarray` or ``idx.astype(object)``. - -.. ipython:: python - - np.asarray(idx) - idx.values.astype(object) - -.. _whatsnew_0240.api.timezone_offset_parsing: - -Parsing Datetime Strings with Timezone Offsets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` -or :class:`DatetimeIndex` would automatically convert the datetime to UTC -without timezone localization. This is inconsistent from parsing the same -datetime string with :class:`Timestamp` which would preserve the UTC -offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC -offset in the ``tz`` attribute when all the datetime strings have the same -UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) - -*Previous Behavior*: - -.. 
code-block:: ipython - - In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") - Out[2]: Timestamp('2015-11-18 10:00:00') - - In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") - Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') - - # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) - In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) - -*Current Behavior*: - -.. ipython:: python - - pd.to_datetime("2015-11-18 15:30:00+05:30") - pd.Timestamp("2015-11-18 15:30:00+05:30") - -Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` - -.. ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) - -Parsing datetime strings with different UTC offsets will now create an Index of -``datetime.datetime`` objects with different UTC offsets - -.. ipython:: python - - idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - idx - idx[0] - idx[1] - -Passing ``utc=True`` will mimic the previous behavior but will correctly indicate -that the dates have been converted to UTC - -.. ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) - -.. _whatsnew_0240.api_breaking.calendarday: - -CalendarDay Offset -^^^^^^^^^^^^^^^^^^ - -:class:`Day` and associated frequency alias ``'D'`` were documented to represent -a calendar day; however, arithmetic and operations with :class:`Day` sometimes -respected absolute time instead (i.e. ``Day(n)`` and acted identically to ``Timedelta(days=n)``). - -*Previous Behavior*: - -.. 
code-block:: ipython - - - In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - - # Respects calendar arithmetic - In [3]: pd.date_range(start=ts, freq='D', periods=3) - Out[3]: - DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', - '2016-11-01 00:00:00+02:00'], - dtype='datetime64[ns, Europe/Helsinki]', freq='D') - - # Respects absolute arithmetic - In [4]: ts + pd.tseries.frequencies.to_offset('D') - Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') - -:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available -and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` -will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) -See the :ref:`documentation here ` for more information. - -Addition with :class:`CalendarDay` across a daylight savings time transition: - -.. ipython:: python - - ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - ts + pd.offsets.Day(1) - ts + pd.offsets.CalendarDay(1) - -.. _whatsnew_0240.api_breaking.period_end_time: - -Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The time values in :class:`Period` and :class:`PeriodIndex` objects are now set -to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, -:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, -or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) - -Previous Behavior: - -.. 
code-block:: ipython - - In [2]: p = pd.Period('2017-01-01', 'D') - In [3]: pi = pd.PeriodIndex([p]) - - In [4]: pd.Series(pi).dt.end_time[0] - Out[4]: Timestamp(2017-01-01 00:00:00) - - In [5]: p.end_time - Out[5]: Timestamp(2017-01-01 23:59:59.999999999) - -Current Behavior: - -Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as -is the case with :attr:`Period.end_time`, for example - -.. ipython:: python - - p = pd.Period('2017-01-01', 'D') - pi = pd.PeriodIndex([p]) - - pd.Series(pi).dt.end_time[0] - - p.end_time - -.. _whatsnew_0240.api_breaking.sparse_values: - -Sparse Data Structure Refactor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, -is now an extension array (:issue:`21978`, :issue:`19056`, :issue:`22835`). -To conform to this interface and for consistency with the rest of pandas, some API breaking -changes were made: - -- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. -- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. -- :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) -- ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`): - - * The default value of ``allow_fill`` has changed from ``False`` to ``True``. - * The ``out`` and ``mode`` parameters are now longer accepted (previously, this raised if they were specified). - * Passing a scalar for ``indices`` is no longer allowed. - -- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. 
-- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. -- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. - - -Some new warnings are issued for operations that require or are likely to materialize a large dense array: - -- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. -- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used. - -In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. - -.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient: - -Raise ValueError in ``DataFrame.to_dict(orient='index')`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with -``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`) - -.. ipython:: python - :okexcept: - - df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) - df - - df.to_dict(orient='index') - -.. _whatsnew_0240.api.datetimelike.normalize: - -Tick DateOffset Normalize Restrictions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, -:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with -``normalize=True`` is no longer supported. This prevents unexpected behavior -where addition could fail to be monotone or associative. (:issue:`21427`) - -*Previous Behavior*: - -.. 
code-block:: ipython - - - In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') - - In [3]: ts - Out[3]: Timestamp('2018-06-11 18:01:14') - - In [4]: tic = pd.offsets.Hour(n=2, normalize=True) - ...: - - In [5]: tic - Out[5]: <2 * Hours> - - In [6]: ts + tic - Out[6]: Timestamp('2018-06-11 00:00:00') - - In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) - Out[7]: False - -*Current Behavior*: - -.. ipython:: python - - ts = pd.Timestamp('2018-06-11 18:01:14') - tic = pd.offsets.Hour(n=2) - ts + tic + tic + tic == ts + (tic + tic + tic) - - -.. _whatsnew_0240.api.datetimelike: - - -.. _whatsnew_0240.api.period_subtraction: - -Period Subtraction -^^^^^^^^^^^^^^^^^^ - -Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset``. -instead of an integer (:issue:`21314`) - -.. ipython:: python - - june = pd.Period('June 2018') - april = pd.Period('April 2018') - june - april - -Previous Behavior: - -.. code-block:: ipython - - In [2]: june = pd.Period('June 2018') - - In [3]: april = pd.Period('April 2018') - - In [4]: june - april - Out [4]: 2 - -Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return -an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` - -.. ipython:: python - - pi = pd.period_range('June 2018', freq='M', periods=3) - pi - pi[0] - -Previous Behavior: - -.. code-block:: ipython - - In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) - - In [3]: pi - pi[0] - Out[3]: Int64Index([0, 1, 2], dtype='int64') - - -.. _whatsnew_0240.api.timedelta64_subtract_nan: - -Addition/Subtraction of ``NaN`` from :class:`DataFrame` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Adding or subtracting ``NaN`` from a :class:`DataFrame` column with -``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning -all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and -``Series`` behavior (:issue:`22163`) - -.. 
ipython:: python - :okexcept: - - df = pd.DataFrame([pd.Timedelta(days=1)]) - df - np.nan - -Previous Behavior: - -.. code-block:: ipython - - In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) - - In [5]: df - np.nan - Out[5]: - 0 - 0 NaT - - -.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: - -DataFrame Arithmetic Operations Broadcasting Changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`DataFrame` arithmetic operations when operating with 2-dimensional -``np.ndarray`` objects now broadcast in the same way as ``np.ndarray``s -broadcast. (:issue:`23000`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: arr = np.arange(6).reshape(3, 2) - In [4]: df = pd.DataFrame(arr) - In [5]: df + arr[[0], :] # 1 row, 2 columns - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) - In [6]: df + arr[:, [1]] # 1 column, 3 rows - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) - -*Current Behavior*: - -.. ipython:: python - arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr) - df - -.. ipython:: python - df + arr[[0], :] # 1 row, 2 columns - df + arr[:, [1]] # 1 column, 3 rows - - -.. _whatsnew_0240.api.extension: - -ExtensionType Changes -^^^^^^^^^^^^^^^^^^^^^ - -**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** - -Pandas now requires that extension dtypes be hashable. The base class implements -a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should -update the ``ExtensionDtype._metadata`` tuple to match the signature of your -``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). - -**Other changes** - -- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) -- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. 
``decimal`` would instantiate a registered ``DecimalDtype``; furthermore - the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) -- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) -- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). -- The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) -- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) -- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) -- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) -- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) -- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185:`). -- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) -- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) -- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) -- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). -- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) - -.. 
_whatsnew_0240.api.incompatibilities: - -Series and Index Data-Dtype Incompatibilities -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``Series`` and ``Index`` constructors now raise when the -data is incompatible with a passed ``dtype=`` (:issue:`15832`) - -Previous Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - 0 18446744073709551615 - dtype: uint64 - -Current Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - ... - OverflowError: Trying to coerce negative values to unsigned integers - -.. _whatsnew_0240.api.crosstab_dtypes - -Crosstab Preserves Dtypes -^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`crosstab` will preserve now dtypes in some cases that previously would -cast from integer dtype to floating dtype (:issue:`22019`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - Out[4]: - b 3 4 - a - 1 0.5 0.0 - 2 0.5 1.0 - -Current Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - -Datetimelike API Changes -^^^^^^^^^^^^^^^^^^^^^^^^ - -- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) -- :class:`DateOffset` objects are now immutable. 
Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) -- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) -- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) -- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) - -.. _whatsnew_0240.api.other: - -Other API Changes -^^^^^^^^^^^^^^^^^ - -- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) -- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in - :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of - a ``KeyError`` (:issue:`21678`). -- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) -- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) -- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) -- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) -- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) -- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- 
Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - -.. _whatsnew_0240.deprecations: - -Deprecations -~~~~~~~~~~~~ - -- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) -- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) -- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) -- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) -- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) -- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) -- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) -- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain - many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) -- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) -- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`) - -.. 
_whatsnew_0240.prior_deprecations: - -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) -- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) -- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) -- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) -- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) -- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) -- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) -- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) -- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) -- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) -- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) - -.. _whatsnew_0240.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, - both when indexing by label (using .loc) and position(.iloc). - Likewise, slicing a ``CategoricalIndex`` itself (i.e. 
``ci[100:200]``) shows similar speed improvements (:issue:`21659`) -- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) -- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) -- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` - (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` - is likewise much faster (:issue:`21369`, :issue:`21508`) -- Improved performance of :meth:`HDFStore.groups` (and dependent functions like - :meth:`~HDFStore.keys`, i.e. ``x in store`` checks are much faster) - (:issue:`21372`) -- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) -- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) -- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) - - -.. _whatsnew_0240.docs: - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`) -- -- - -.. _whatsnew_0240.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -Categorical -^^^^^^^^^^^ - -- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. -- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) -- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
- -Datetimelike -^^^^^^^^^^^^ - -- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) -- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) -- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) -- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) -- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) -- Bug in 
:class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) -- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) -- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) -- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) -- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) -- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) -- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) -- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) - -Timedelta -^^^^^^^^^ -- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) -- Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) -- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) -- Bug in :class:`Series` with numeric dtype when adding or subtracting an an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) -- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) -- Bug in :class:`TimedeltaIndex` 
incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) -- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) -- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) -- - -Timezones -^^^^^^^^^ - -- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) -- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) -- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) -- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) -- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) -- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) -- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) -- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) -- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) -- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) -- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) -- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware 
``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) -- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) -- Bug in :func:`DataFrame.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) -- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp` objects constructed with the ``replace`` method across DST (:issue:`18785`) -- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) -- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) -- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) - -Offsets -^^^^^^^ - -- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) -- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`) -- - -Numeric -^^^^^^^ - -- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) -- Bug in :func:`factorize` fails with read-only array (:issue:`12813`) -- Fixed bug in :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) -- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where, - when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), - a ``TypeError`` was wrongly raised.
For all three methods such calculations are now done correctly (:issue:`16679`). -- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) -- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`) -- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`) -- Bug in :meth:`DataFrame.astype` to extension dtype may raise ``AttributeError`` (:issue:`22578`) -- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ndarray as ``timedelta64[ns]`` dtype (:issue:`23114`) - -Strings -^^^^^^^ - -- -- -- - -Interval -^^^^^^^^ - -- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) -- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) -- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) -- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) - -Indexing -^^^^^^^^ - -- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) -- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) -- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) -- Bug in :class:`DataFrame` when setting values with
``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) -- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) -- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) -- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) -- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`) -- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) -- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`) -- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) -- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) -- Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) - -Missing -^^^^^^^ - -- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) -- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) -- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) -- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. 
(:issue:`22295`) - - -MultiIndex -^^^^^^^^^^ - -- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) -- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`ed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) -- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) - -I/O -^^^ - -- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) -- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) -- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) -- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) -- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) -- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) -- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) -- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout.
(:issue:`21552`) -- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) - -Plotting -^^^^^^^^ - -- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` caused x-axis label and ticklabels to disappear when colorbar was on in IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) -- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) - -Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - -- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) -- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) -- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a - ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). -- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a - datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) -- Bug in :meth:`Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). -- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). -- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). -- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead.
(:issue:`22487`) -- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) - -Reshaping -^^^^^^^^^ - -- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) -- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) -- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) -- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) -- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) -- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) -- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) -- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) -- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) -- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) -- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) -- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) -- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) --
Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`) -- Bug in :func:`merge_asof` when merging on columns containing null values (:issue:`22981`) - -.. _whatsnew_0240.bug_fixes.sparse: - -Sparse -^^^^^^ - -- Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) -- Bug in :meth:`Series.to_sparse` with Series already holding sparse data not constructing properly (:issue:`22389`) -- Providing a ``sparse_index`` to the SparseArray constructor no longer defaults the na-value to ``np.nan`` for all dtypes. The correct na_value for ``data.dtype`` is now used. -- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. -- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. -- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) -- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) - -Build Changes -^^^^^^^^^^^^^ - -- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) -- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here <https://hypothesis.readthedocs.io/en/latest/index.html>`_, and a pandas-specific introduction :ref:`in the contributing guide <using-hypothesis>`. (:issue:`22280`) -- - -Other -^^^^^ - -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range.
(:issue:`21258`) -- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) -- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) -- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. -- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) -- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) From 64451863e025396fa1e32cd5adf064c8d2a87c81 Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 16 Oct 2018 07:22:24 +0100 Subject: [PATCH 07/14] move checking of error to later position and update error type --- pandas/core/reshape/merge.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 236707cf32209..20c4afb5641ab 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1390,16 +1390,21 @@ def flip(xs): self.right_join_keys[-1]) tolerance = self.tolerance - # Check null values before merge - if isnull(left_values).sum() > 0 or isnull(right_values).sum() > 0: - raise MergeError('Merge keys cannot contain null values') + # we required sortedness and non-missingness in the join keys + msg_sorted = "{side} keys must be sorted" + msg_missings = "Merge keys contain null values in {side} side" - # we required sortedness in the join keys - msg = "{side} keys must be sorted" if 
not Index(left_values).is_monotonic: - raise ValueError(msg.format(side='left')) + if isnull(left_values).sum() > 0: + raise ValueError(msg_missings.format(side='left')) + else: + raise ValueError(msg_sorted.format(side='left')) + if not Index(right_values).is_monotonic: - raise ValueError(msg.format(side='right')) + if isnull(right_values).sum() > 0: + raise ValueError(msg_missings.format(side='right')) + else: + raise ValueError(msg_sorted.format(side='right')) # initial type conversion as needed if needs_i8_conversion(left_values): From 8e26620468f70ddd040cef9cc70e9b9688383c88 Mon Sep 17 00:00:00 2001 From: Tola Date: Tue, 16 Oct 2018 20:21:16 +0100 Subject: [PATCH 08/14] BUG: merge_asof on columns containing nulls --- pandas/tests/reshape/merge/test_merge_asof.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index c75a6a707cafc..cc1cac6866ff7 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1007,3 +1007,26 @@ def test_merge_datatype_error(self): with tm.assert_raises_regex(MergeError, msg): merge_asof(left, right, on='a') + + def test_merge_on_nans_int(self): + """ Test merging on integer columns with nans throws a merge error """ + left = pd.DataFrame({'a': [1, 5, 10, 12, np.nan], + 'left_val': ['a', 'b', 'c', 'd', 'e']}) + right = pd.DataFrame({'a': [1, 5, 10, 12, np.nan], + 'right_val': [1, 6, 11, 15, 16]}) + + with pytest.raises(ValueError): + merge_asof(left, right, on='a') + + def test_merge_on_nans_datetime(self): + """ Test merging on datetime columns with nans throws a merge error """ + top_left = pd.DataFrame(pd.date_range('20130101', periods=5), columns=['A']) + bottom_left = pd.DataFrame([np.nan], columns=['A']) + left = pd.concat([top_left, bottom_left]) + + top_right = pd.DataFrame(pd.date_range('20150601', periods=5), columns=['A']) + bottom_right = 
pd.DataFrame([np.nan], columns=['A']) + right = pd.concat([bottom_right, top_right]) + + with pytest.raises(ValueError): + merge_asof(left, right, on='A') From 7e6bb94ae59eed4469135b633879b30e75db3deb Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 16 Oct 2018 20:54:12 +0100 Subject: [PATCH 09/14] BUG: Fix error message on missing values for merge_asof (#23189) --- pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/merge/test_merge_asof.py | 22 +++++++++---------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 20c4afb5641ab..88b1ec7e47bbb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1392,7 +1392,7 @@ def flip(xs): # we required sortedness and non-missingness in the join keys msg_sorted = "{side} keys must be sorted" - msg_missings = "Merge keys contain null values in {side} side" + msg_missings = "Merge keys contain null values on {side} side" if not Index(left_values).is_monotonic: if isnull(left_values).sum() > 0: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index cc1cac6866ff7..835f1a2574540 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1010,23 +1010,21 @@ def test_merge_datatype_error(self): def test_merge_on_nans_int(self): """ Test merging on integer columns with nans throws a merge error """ - left = pd.DataFrame({'a': [1, 5, 10, 12, np.nan], + msg = "Merge keys contain null values on left side" + left = pd.DataFrame({'a': [1.0, 5.0, 10.0, 12.0, np.nan], 'left_val': ['a', 'b', 'c', 'd', 'e']}) - right = pd.DataFrame({'a': [1, 5, 10, 12, np.nan], - 'right_val': [1, 6, 11, 15, 16]}) + right = pd.DataFrame({'a': [1.0, 5.0, 10.0, 12.0], + 'right_val': [1, 6, 11, 15]}) - with pytest.raises(ValueError): + with tm.assert_raises_regex(ValueError, msg): merge_asof(left, right, on='a') def 
test_merge_on_nans_datetime(self): """ Test merging on datetime columns with nans throws a merge error """ - top_left = pd.DataFrame(pd.date_range('20130101', periods=5), columns=['A']) - bottom_left = pd.DataFrame([np.nan], columns=['A']) - left = pd.concat([top_left, bottom_left]) - top_right = pd.DataFrame(pd.date_range('20150601', periods=5), columns=['A']) - bottom_right = pd.DataFrame([np.nan], columns=['A']) - right = pd.concat([bottom_right, top_right]) + msg = "Merge keys contain null values on right side" + left = pd.DataFrame(pd.date_range('20130101', periods=5), columns=['a']) + right = pd.DataFrame(pd.date_range('20130102', periods=5).append(pd.Index([None])), columns=['a']) - with pytest.raises(ValueError): - merge_asof(left, right, on='A') + with tm.assert_raises_regex(ValueError, msg): + merge_asof(left, right, on='a') From 8970a51679c6d54b0fc1a226de34db5dc33a228b Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 16 Oct 2018 20:56:14 +0100 Subject: [PATCH 10/14] BUG: Fix error message on missing values for merge_asof (#23189) --- doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/tests/reshape/merge/test_merge_asof.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index ba59e050cea70..16f0b9ee99909 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -972,7 +972,7 @@ Reshaping - Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) - Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) - Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue`22796`) -- Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values +- Bug in :func:`merge_asof` where confusing error message raised when 
attempting to merge with missing values (:issue:`23189`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 835f1a2574540..ee67d589afd64 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1009,7 +1009,8 @@ def test_merge_datatype_error(self): merge_asof(left, right, on='a') def test_merge_on_nans_int(self): - """ Test merging on integer columns with nans throws a merge error """ + """ Test merging on integer columns with nans throws a correct ValueError """ + # 23189 msg = "Merge keys contain null values on left side" left = pd.DataFrame({'a': [1.0, 5.0, 10.0, 12.0, np.nan], 'left_val': ['a', 'b', 'c', 'd', 'e']}) @@ -1020,8 +1021,8 @@ def test_merge_on_nans_int(self): merge_asof(left, right, on='a') def test_merge_on_nans_datetime(self): - """ Test merging on datetime columns with nans throws a merge error """ - + """ Test merging on datetime columns with nans throws correct ValueError """ + # 23189 msg = "Merge keys contain null values on right side" left = pd.DataFrame(pd.date_range('20130101', periods=5), columns=['a']) right = pd.DataFrame(pd.date_range('20130102', periods=5).append(pd.Index([None])), columns=['a']) From a2eacc24812f7a6d72532d331e4c7c972f589c8c Mon Sep 17 00:00:00 2001 From: Tola Date: Mon, 15 Oct 2018 21:19:11 +0100 Subject: [PATCH 11/14] BUG: merge_asof on columns containing nulls --- doc/source/whatsnew/v0.24.1.txt | 1006 +++++++++++++++++++++++++++++++ pandas/core/reshape/merge.py | 19 +- 2 files changed, 1013 insertions(+), 12 deletions(-) create mode 100644 doc/source/whatsnew/v0.24.1.txt diff --git a/doc/source/whatsnew/v0.24.1.txt b/doc/source/whatsnew/v0.24.1.txt new file mode 100644 index 0000000000000..9463458bfa64f --- /dev/null +++ b/doc/source/whatsnew/v0.24.1.txt @@ -0,0 +1,1006 @@ +.. _whatsnew_0240: + +v0.24.0 (Month XX, 2018) +------------------------ + +.. 
warning:: + + Starting January 1, 2019, pandas feature releases will support Python 3 only. + See :ref:`install.dropping-27` for more. + +.. _whatsnew_0240.enhancements: + +New features +~~~~~~~~~~~~ +- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) + + +- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) + +- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing +the user to override the engine's default behavior to include or omit the +dataframe's indexes from the resulting Parquet file. (:issue:`20768`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) + + +.. _whatsnew_0240.enhancements.extension_array_operators: + +``ExtensionArray`` operator support +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison +operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: + +1. Define each of the operators on your ``ExtensionArray`` subclass. +2. Use an operator implementation from pandas that depends on operators that are already defined + on the underlying elements (scalars) of the ``ExtensionArray``. + +See the :ref:`ExtensionArray Operator Support +` documentation section for details on both +ways of adding operator support. + +.. _whatsnew_0240.enhancements.intna: + +Optional Integer NA Support +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. +Here is an example of the usage. 
+ +We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value +marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) + +.. ipython:: python + + s = pd.Series([1, 2, np.nan], dtype='Int64') + s + + +Operations on these dtypes will propagate ``NaN`` as other pandas operations. + +.. ipython:: python + + # arithmetic + s + 1 + + # comparison + s == 1 + + # indexing + s.iloc[1:3] + + # operate with other dtypes + s + s.iloc[1:3].astype('Int8') + + # coerce when needed + s + 0.01 + +These dtypes can operate as part of a ``DataFrame``. + +.. ipython:: python + + df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) + df + df.dtypes + + +These dtypes can be merged & reshaped & casted. + +.. ipython:: python + + pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes + df['A'].astype(float) + +Reduction and groupby operations such as 'sum' work. + +.. ipython:: python + + df.sum() + df.groupby('B').A.sum() + +.. warning:: + + The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. + +.. _whatsnew_0240.enhancements.read_html: + +``read_html`` Enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. +Now it understands them, treating them as sequences of cells with the same +value. (:issue:`17054`) + +.. ipython:: python + + result = pd.read_html(""" + + + + + + + + + + + +
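+ <table><thead><tr><th>A</th><th>B</th><th>C</th></tr></thead>
+ <tbody><tr><td colspan="2">1</td><td>2</td></tr></tbody></table>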
""") + +Previous Behavior: + +.. code-block:: ipython + + In [13]: result + Out [13]: + [ A B C + 0 1 2 NaN] + +Current Behavior: + +.. ipython:: python + + result + + +.. _whatsnew_0240.enhancements.interval: + +Storing Interval Data in Series and DataFrame +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an +:class:`IntervalIndex` like previously (:issue:`19453`). + +.. ipython:: python + + ser = pd.Series(pd.interval_range(0, 5)) + ser + ser.dtype + +Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, +this should result in better performance when storing an array of intervals in +a :class:`Series`. + +Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy +array, but rather an ``ExtensionArray``: + +.. ipython:: python + + ser.values + +This is the same behavior as ``Series.values`` for categorical data. See +:ref:`whatsnew_0240.api_breaking.interval_values` for more. + + +.. _whatsnew_0240.enhancements.other: + +Other Enhancements +^^^^^^^^^^^^^^^^^^ +- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) +- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) +- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. 
(:issue:`21227`) +- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) +- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) +- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) +- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to + reflect changes from the `Pandas-GBQ library version 0.6.0 + `__. + (:issue:`21627`, :issue:`22557`) +- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) +- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) +- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) +- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) +- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). + The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). +- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) +- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) +- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. 
(:issue:`21358`) +- :meth:`round`, :meth:`ceil`, and :meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) +- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). +- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). +- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). +- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). +- Compatibility with Matplotlib 3.0 (:issue:`22790`). + +.. _whatsnew_0240.api_breaking: + +Backwards incompatible API changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) + +.. _whatsnew_0240.api_breaking.deps: + +Dependencies have increased minimum versions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +We have updated our minimum supported versions of dependencies (:issue:`21242`). +If installed, we now require: + ++-----------------+-----------------+----------+ +| Package | Minimum Version | Required | ++=================+=================+==========+ +| numpy | 1.12.0 | X | ++-----------------+-----------------+----------+ +| bottleneck | 1.2.0 | | ++-----------------+-----------------+----------+ +| matplotlib | 2.0.0 | | ++-----------------+-----------------+----------+ +| numexpr | 2.6.1 | | ++-----------------+-----------------+----------+ +| pytables | 3.4.2 | | ++-----------------+-----------------+----------+ +| scipy | 0.18.1 | | ++-----------------+-----------------+----------+ + +.. 
_whatsnew_0240.api_breaking.interval_values: + +``IntervalIndex.values`` is now an ``IntervalArray`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an +``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). + +Previous Behavior: + +.. code-block:: ipython + + In [1]: idx = pd.interval_range(0, 4) + + In [2]: idx.values + Out[2]: + array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), + Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], + dtype=object) + +New Behavior: + +.. ipython:: python + + idx = pd.interval_range(0, 4) + idx.values + +This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. + +For situations where you need an ``ndarray`` of ``Interval`` objects, use +:meth:`numpy.asarray` or ``idx.astype(object)``. + +.. ipython:: python + + np.asarray(idx) + idx.values.astype(object) + +.. _whatsnew_0240.api.timezone_offset_parsing: + +Parsing Datetime Strings with Timezone Offsets +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` +or :class:`DatetimeIndex` would automatically convert the datetime to UTC +without timezone localization. This is inconsistent from parsing the same +datetime string with :class:`Timestamp` which would preserve the UTC +offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC +offset in the ``tz`` attribute when all the datetime strings have the same +UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) + +*Previous Behavior*: + +.. 
code-block:: ipython + + In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") + Out[2]: Timestamp('2015-11-18 10:00:00') + + In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") + Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') + + # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) + In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) + +*Current Behavior*: + +.. ipython:: python + + pd.to_datetime("2015-11-18 15:30:00+05:30") + pd.Timestamp("2015-11-18 15:30:00+05:30") + +Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) + +Parsing datetime strings with different UTC offsets will now create an Index of +``datetime.datetime`` objects with different UTC offsets + +.. ipython:: python + + idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) + idx + idx[0] + idx[1] + +Passing ``utc=True`` will mimic the previous behavior but will correctly indicate +that the dates have been converted to UTC + +.. ipython:: python + + pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) + +.. _whatsnew_0240.api_breaking.calendarday: + +CalendarDay Offset +^^^^^^^^^^^^^^^^^^ + +:class:`Day` and associated frequency alias ``'D'`` were documented to represent +a calendar day; however, arithmetic and operations with :class:`Day` sometimes +respected absolute time instead (i.e. ``Day(n)`` and acted identically to ``Timedelta(days=n)``). + +*Previous Behavior*: + +.. 
code-block:: ipython + + + In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + + # Respects calendar arithmetic + In [3]: pd.date_range(start=ts, freq='D', periods=3) + Out[3]: + DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', + '2016-11-01 00:00:00+02:00'], + dtype='datetime64[ns, Europe/Helsinki]', freq='D') + + # Respects absolute arithmetic + In [4]: ts + pd.tseries.frequencies.to_offset('D') + Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') + +:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available +and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` +will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) +See the :ref:`documentation here ` for more information. + +Addition with :class:`CalendarDay` across a daylight savings time transition: + +.. ipython:: python + + ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + ts + pd.offsets.Day(1) + ts + pd.offsets.CalendarDay(1) + +.. _whatsnew_0240.api_breaking.period_end_time: + +Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The time values in :class:`Period` and :class:`PeriodIndex` objects are now set +to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, +:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, +or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) + +Previous Behavior: + +.. 
code-block:: ipython + + In [2]: p = pd.Period('2017-01-01', 'D') + In [3]: pi = pd.PeriodIndex([p]) + + In [4]: pd.Series(pi).dt.end_time[0] + Out[4]: Timestamp(2017-01-01 00:00:00) + + In [5]: p.end_time + Out[5]: Timestamp(2017-01-01 23:59:59.999999999) + +Current Behavior: + +Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as +is the case with :attr:`Period.end_time`, for example + +.. ipython:: python + + p = pd.Period('2017-01-01', 'D') + pi = pd.PeriodIndex([p]) + + pd.Series(pi).dt.end_time[0] + + p.end_time + +.. _whatsnew_0240.api_breaking.sparse_values: + +Sparse Data Structure Refactor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, +is now an extension array (:issue:`21978`, :issue:`19056`, :issue:`22835`). +To conform to this interface and for consistency with the rest of pandas, some API breaking +changes were made: + +- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. +- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. +- :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) +- ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`): + + * The default value of ``allow_fill`` has changed from ``False`` to ``True``. + * The ``out`` and ``mode`` parameters are no longer accepted (previously, this raised if they were specified). + * Passing a scalar for ``indices`` is no longer allowed. + +- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. 
+- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. +- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. + + +Some new warnings are issued for operations that require or are likely to materialize a large dense array: + +- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. +- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used. + +In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. + +.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient: + +Raise ValueError in ``DataFrame.to_dict(orient='index')`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with +``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`) + +.. ipython:: python + :okexcept: + + df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) + df + + df.to_dict(orient='index') + +.. _whatsnew_0240.api.datetimelike.normalize: + +Tick DateOffset Normalize Restrictions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, +:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with +``normalize=True`` is no longer supported. This prevents unexpected behavior +where addition could fail to be monotone or associative. (:issue:`21427`) + +*Previous Behavior*: + +.. 
code-block:: ipython + + + In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') + + In [3]: ts + Out[3]: Timestamp('2018-06-11 18:01:14') + + In [4]: tic = pd.offsets.Hour(n=2, normalize=True) + ...: + + In [5]: tic + Out[5]: <2 * Hours> + + In [6]: ts + tic + Out[6]: Timestamp('2018-06-11 00:00:00') + + In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) + Out[7]: False + +*Current Behavior*: + +.. ipython:: python + + ts = pd.Timestamp('2018-06-11 18:01:14') + tic = pd.offsets.Hour(n=2) + ts + tic + tic + tic == ts + (tic + tic + tic) + + +.. _whatsnew_0240.api.datetimelike: + + +.. _whatsnew_0240.api.period_subtraction: + +Period Subtraction +^^^^^^^^^^^^^^^^^^ + +Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset`` +instead of an integer (:issue:`21314`) + +.. ipython:: python + + june = pd.Period('June 2018') + april = pd.Period('April 2018') + june - april + +Previous Behavior: + +.. code-block:: ipython + + In [2]: june = pd.Period('June 2018') + + In [3]: april = pd.Period('April 2018') + + In [4]: june - april + Out [4]: 2 + +Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return +an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` + +.. ipython:: python + + pi = pd.period_range('June 2018', freq='M', periods=3) + pi - pi[0] + +Previous Behavior: + +.. code-block:: ipython + + In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) + + In [3]: pi - pi[0] + Out[3]: Int64Index([0, 1, 2], dtype='int64') + + +.. _whatsnew_0240.api.timedelta64_subtract_nan: + +Addition/Subtraction of ``NaN`` from :class:`DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Adding or subtracting ``NaN`` from a :class:`DataFrame` column with +``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning +all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and +``Series`` behavior (:issue:`22163`) + +.. 
ipython:: python + :okexcept: + + df = pd.DataFrame([pd.Timedelta(days=1)]) + df - np.nan + +Previous Behavior: + +.. code-block:: ipython + + In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) + + In [5]: df - np.nan + Out[5]: + 0 + 0 NaT + + +.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: + +DataFrame Arithmetic Operations Broadcasting Changes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:class:`DataFrame` arithmetic operations when operating with 2-dimensional +``np.ndarray`` objects now broadcast in the same way as ``np.ndarray``s +broadcast. (:issue:`23000`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: arr = np.arange(6).reshape(3, 2) + In [4]: df = pd.DataFrame(arr) + In [5]: df + arr[[0], :] # 1 row, 2 columns + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) + In [6]: df + arr[:, [1]] # 1 column, 3 rows + ... + ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) + +*Current Behavior*: + +.. ipython:: python + arr = np.arange(6).reshape(3, 2) + df = pd.DataFrame(arr) + df + +.. ipython:: python + df + arr[[0], :] # 1 row, 2 columns + df + arr[:, [1]] # 1 column, 3 rows + + +.. _whatsnew_0240.api.extension: + +ExtensionType Changes +^^^^^^^^^^^^^^^^^^^^^ + +**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** + +Pandas now requires that extension dtypes be hashable. The base class implements +a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should +update the ``ExtensionDtype._metadata`` tuple to match the signature of your +``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). + +**Other changes** + +- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) +- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. 
``decimal`` would instantiate a registered ``DecimalDtype``; furthermore + the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) +- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) +- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). +- The ``ExtensionArray`` constructor, ``_from_sequence``, now takes the keyword arg ``copy=False`` (:issue:`21185`) +- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) +- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) +- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) +- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) +- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`). +- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) +- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) +- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) +- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) +- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). +- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) + +.. 
_whatsnew_0240.api.incompatibilities: + +Series and Index Data-Dtype Incompatibilities +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``Series`` and ``Index`` constructors now raise when the +data is incompatible with a passed ``dtype=`` (:issue:`15832`) + +Previous Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + 0 18446744073709551615 + dtype: uint64 + +Current Behavior: + +.. code-block:: ipython + + In [4]: pd.Series([-1], dtype="uint64") + Out [4]: + ... + OverflowError: Trying to coerce negative values to unsigned integers + +.. _whatsnew_0240.api.crosstab_dtypes: + +Crosstab Preserves Dtypes +^^^^^^^^^^^^^^^^^^^^^^^^^ + +:func:`crosstab` will now preserve dtypes in some cases that previously would +cast from integer dtype to floating dtype (:issue:`22019`) + +Previous Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + Out[4]: + b 3 4 + a + 1 0.5 0.0 + 2 0.5 1.0 + +Current Behavior: + +.. code-block:: ipython + + In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + ...: 'c': [1, 1, np.nan, 1, 1]}) + In [4]: pd.crosstab(df.a, df.b, normalize='columns') + +Datetimelike API Changes +^^^^^^^^^^^^^^^^^^^^^^^^ + +- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) +- :class:`DateOffset` objects are now immutable. 
Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) +- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) +- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) +- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) + +.. _whatsnew_0240.api.other: + +Other API Changes +^^^^^^^^^^^^^^^^^ + +- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) +- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in + :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of + a ``KeyError`` (:issue:`21678`). +- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) +- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) +- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) +- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) +- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) +- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) +- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) +- 
Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) +- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) + +.. _whatsnew_0240.deprecations: + +Deprecations +~~~~~~~~~~~~ + +- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) +- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) +- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) +- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) +- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) +- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) +- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) +- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain + many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) +- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) +- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`) + +.. 
_whatsnew_0240.prior_deprecations: + +Removal of prior version deprecations/changes +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) +- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) +- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) +- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) +- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) +- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) +- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) +- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) +- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) +- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) +- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) + +.. _whatsnew_0240.performance: + +Performance Improvements +~~~~~~~~~~~~~~~~~~~~~~~~ + +- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, + both when indexing by label (using .loc) and position(.iloc). + Likewise, slicing a ``CategoricalIndex`` itself (i.e. 
``ci[100:200]``) shows similar speed improvements (:issue:`21659`) +- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`) +- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) +- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) +- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` + (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` + is likewise much faster (:issue:`21369`, :issue:`21508`) +- Improved performance of :meth:`HDFStore.groups` (and dependent functions like + :meth:`~HDFStore.keys`), i.e. ``x in store`` checks are much faster + (:issue:`21372`) +- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) +- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) +- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) + + +.. _whatsnew_0240.docs: + +Documentation Changes +~~~~~~~~~~~~~~~~~~~~~ + +- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`) +- +- + +.. _whatsnew_0240.bug_fixes: + +Bug Fixes +~~~~~~~~~ + +Categorical +^^^^^^^^^^^ + +- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. +- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) +- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). 
+ +Datetimelike +^^^^^^^^^^^^ + +- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) +- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) +- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) +- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) +- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) +- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) +- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) +- Bug in 
:class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) +- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) +- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) +- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) +- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) +- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) that could give rise to loss of precision (:issue:`22591`) +- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) +- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) + +Timedelta +^^^^^^^^^ +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) +- Bug in adding an :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) +- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) +- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) +- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) +- Bug in :class:`TimedeltaIndex`
incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) +- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) +- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) +- + +Timezones +^^^^^^^^^ + +- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) +- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) +- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) +- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) +- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) +- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) +- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) +- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) +- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) +- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) +- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) +- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware 
``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) +- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) +- Bug in :func:`DataFrame.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) +- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp` objects constructed with the ``replace`` method across DST (:issue:`18785`) +- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) +- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) +- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) + +Offsets +^^^^^^^ + +- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) +- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`) +- + +Numeric +^^^^^^^ + +- Bug where :class:`Series` ``__rmatmul__`` did not support matrix vector multiplication (:issue:`21530`) +- Bug where :func:`factorize` failed with a read-only array (:issue:`12813`) +- Fixed bug where :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) +- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where, + when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), + a ``TypeError`` was wrongly raised.
For all three methods such calculations are now done correctly (:issue:`16679`). +- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) +- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`) +- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`) +- Bug in :meth:`DataFrame.astype` where converting to an extension dtype could raise ``AttributeError`` (:issue:`22578`) +- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ndarray as ``timedelta64[ns]`` dtype (:issue:`23114`) + +Strings +^^^^^^^ + +- +- +- + +Interval +^^^^^^^^ + +- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) +- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) +- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) +- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) + +Indexing +^^^^^^^^ + +- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) +- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) +- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) +- Bug in :class:`DataFrame` when setting values with
``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) +- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) +- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) +- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) +- Bug where indexing with a Numpy array containing negative values would mutate the indexer (:issue:`21867`) +- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) +- ``Float64Index.get_loc`` now raises ``KeyError`` when boolean key passed. (:issue:`19087`) +- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) +- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) +- Bug in `scalar in Index` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) + +Missing +^^^^^^^ + +- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) +- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) +- :func:`Series.isin` now treats all NaN-floats as equal also for `np.object`-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) +- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for `np.object`-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. 
(:issue:`22295`) + + +MultiIndex +^^^^^^^^^^ + +- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) +- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on an object indexed by a :class:`MultiIndex`) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) +- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) + +I/O +^^^ + +- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) +- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) +- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) +- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) +- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) +- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) +- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) +- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout.
(:issue:`21552`) +- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) + +Plotting +^^^^^^^^ + +- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` that caused the x-axis label and ticklabels to disappear when the colorbar was on in the IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) +- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) + +Groupby/Resample/Rolling +^^^^^^^^^^^^^^^^^^^^^^^^ + +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) +- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) +- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a + ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). +- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a + datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) +- Bug in :meth:`Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). +- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). +- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). +- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead.
(:issue:`22487`) +- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) +- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) + +Reshaping +^^^^^^^^^ + +- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) +- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) +- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) +- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) +- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) +- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) +- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) +- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) +- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) +- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) +- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) +- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) +- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) +-
Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`) +- Bug in :func:`merge_asof` when merging on columns containing null values (:issue:`22981`) + +.. _whatsnew_0240.bug_fixes.sparse: + +Sparse +^^^^^^ + +- Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) +- Bug in :meth:`Series.to_sparse` with Series already holding sparse data not constructing properly (:issue:`22389`) +- Providing a ``sparse_index`` to the SparseArray constructor no longer defaults the na-value to ``np.nan`` for all dtypes. The correct na_value for ``data.dtype`` is now used. +- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. +- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. +- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) +- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) + +Build Changes +^^^^^^^^^^^^^ + +- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) +- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here <https://hypothesis.readthedocs.io/en/latest/index.html>`_, and a pandas-specific introduction :ref:`in the contributing guide <using-hypothesis>`. (:issue:`22280`) +- + +Other +^^^^^ + +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range.
(:issue:`21258`) +- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) +- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) +- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) +- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. +- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) +- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 88b1ec7e47bbb..236707cf32209 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1390,21 +1390,16 @@ def flip(xs): self.right_join_keys[-1]) tolerance = self.tolerance - # we required sortedness and non-missingness in the join keys - msg_sorted = "{side} keys must be sorted" - msg_missings = "Merge keys contain null values on {side} side" + # Check null values before merge + if isnull(left_values).sum() > 0 or isnull(right_values).sum() > 0: + raise MergeError('Merge keys cannot contain null values') + # we required sortedness in the join keys + msg = "{side} keys must be sorted" if not Index(left_values).is_monotonic: - if isnull(left_values).sum() > 0: - raise ValueError(msg_missings.format(side='left')) - else: - raise ValueError(msg_sorted.format(side='left')) - + raise ValueError(msg.format(side='left')) if not Index(right_values).is_monotonic: - if isnull(right_values).sum() > 0: - 
raise ValueError(msg_missings.format(side='right')) - else: - raise ValueError(msg_sorted.format(side='right')) + raise ValueError(msg.format(side='right')) # initial type conversion as needed if needs_i8_conversion(left_values): From 9ff2aa90004dbaa2cf2a83347b3147a525f15b57 Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 16 Oct 2018 07:08:56 +0100 Subject: [PATCH 12/14] update release file --- doc/source/whatsnew/v0.24.1.txt | 1006 ------------------------------- 1 file changed, 1006 deletions(-) delete mode 100644 doc/source/whatsnew/v0.24.1.txt diff --git a/doc/source/whatsnew/v0.24.1.txt b/doc/source/whatsnew/v0.24.1.txt deleted file mode 100644 index 9463458bfa64f..0000000000000 --- a/doc/source/whatsnew/v0.24.1.txt +++ /dev/null @@ -1,1006 +0,0 @@ -.. _whatsnew_0240: - -v0.24.0 (Month XX, 2018) ------------------------- - -.. warning:: - - Starting January 1, 2019, pandas feature releases will support Python 3 only. - See :ref:`install.dropping-27` for more. - -.. _whatsnew_0240.enhancements: - -New features -~~~~~~~~~~~~ -- :func:`merge` now directly allows merge between objects of type ``DataFrame`` and named ``Series``, without the need to convert the ``Series`` object into a ``DataFrame`` beforehand (:issue:`21220`) - - -- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`) - -- :func:`DataFrame.to_parquet` now accepts ``index`` as an argument, allowing -the user to override the engine's default behavior to include or omit the -dataframe's indexes from the resulting Parquet file. (:issue:`20768`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) - - -.. 
_whatsnew_0240.enhancements.extension_array_operators: - -``ExtensionArray`` operator support -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -A ``Series`` based on an ``ExtensionArray`` now supports arithmetic and comparison -operators (:issue:`19577`). There are two approaches for providing operator support for an ``ExtensionArray``: - -1. Define each of the operators on your ``ExtensionArray`` subclass. -2. Use an operator implementation from pandas that depends on operators that are already defined - on the underlying elements (scalars) of the ``ExtensionArray``. - -See the :ref:`ExtensionArray Operator Support -` documentation section for details on both -ways of adding operator support. - -.. _whatsnew_0240.enhancements.intna: - -Optional Integer NA Support -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Pandas has gained the ability to hold integer dtypes with missing values. This long requested feature is enabled through the use of :ref:`extension types `. -Here is an example of the usage. - -We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value -marker of ``np.nan`` will infer to integer dtype. The display of the ``Series`` will also use the ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`, :issue:`22441`, :issue:`21789`, :issue:`22346`) - -.. ipython:: python - - s = pd.Series([1, 2, np.nan], dtype='Int64') - s - - -Operations on these dtypes will propagate ``NaN`` as other pandas operations. - -.. ipython:: python - - # arithmetic - s + 1 - - # comparison - s == 1 - - # indexing - s.iloc[1:3] - - # operate with other dtypes - s + s.iloc[1:3].astype('Int8') - - # coerce when needed - s + 0.01 - -These dtypes can operate as part of of ``DataFrame``. - -.. ipython:: python - - df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')}) - df - df.dtypes - - -These dtypes can be merged & reshaped & casted. - -.. 
ipython:: python - - pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes - df['A'].astype(float) - -Reduction and groupby operations such as 'sum' work. - -.. ipython:: python - - df.sum() - df.groupby('B').A.sum() - -.. warning:: - - The Integer NA support currently uses the captilized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date. - -.. _whatsnew_0240.enhancements.read_html: - -``read_html`` Enhancements -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`read_html` previously ignored ``colspan`` and ``rowspan`` attributes. -Now it understands them, treating them as sequences of cells with the same -value. (:issue:`17054`) - -.. ipython:: python - - result = pd.read_html(""" - - - - - - - - - - - -
ABC
12
""") - -Previous Behavior: - -.. code-block:: ipython - - In [13]: result - Out [13]: - [ A B C - 0 1 2 NaN] - -Current Behavior: - -.. ipython:: python - - result - - -.. _whatsnew_0240.enhancements.interval: - -Storing Interval Data in Series and DataFrame -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Interval data may now be stored in a ``Series`` or ``DataFrame``, in addition to an -:class:`IntervalIndex` like previously (:issue:`19453`). - -.. ipython:: python - - ser = pd.Series(pd.interval_range(0, 5)) - ser - ser.dtype - -Previously, these would be cast to a NumPy array of ``Interval`` objects. In general, -this should result in better performance when storing an array of intervals in -a :class:`Series`. - -Note that the ``.values`` of a ``Series`` containing intervals is no longer a NumPy -array, but rather an ``ExtensionArray``: - -.. ipython:: python - - ser.values - -This is the same behavior as ``Series.values`` for categorical data. See -:ref:`whatsnew_0240.api_breaking.interval_values` for more. - - -.. _whatsnew_0240.enhancements.other: - -Other Enhancements -^^^^^^^^^^^^^^^^^^ -- :func:`to_datetime` now supports the ``%Z`` and ``%z`` directive when passed into ``format`` (:issue:`13486`) -- :func:`Series.mode` and :func:`DataFrame.mode` now support the ``dropna`` parameter which can be used to specify whether ``NaN``/``NaT`` values should be considered (:issue:`17534`) -- :func:`to_csv` now supports ``compression`` keyword when a file handle is passed. 
(:issue:`21227`) -- :meth:`Index.droplevel` is now implemented also for flat indexes, for compatibility with :class:`MultiIndex` (:issue:`21115`) -- :meth:`Series.droplevel` and :meth:`DataFrame.droplevel` are now implemented (:issue:`20342`) -- Added support for reading from/writing to Google Cloud Storage via the ``gcsfs`` library (:issue:`19454`, :issue:`23094`) -- :func:`to_gbq` and :func:`read_gbq` signature and documentation updated to - reflect changes from the `Pandas-GBQ library version 0.6.0 - `__. - (:issue:`21627`, :issue:`22557`) -- New method :meth:`HDFStore.walk` will recursively walk the group hierarchy of an HDF5 file (:issue:`10932`) -- :func:`read_html` copies cell data across ``colspan`` and ``rowspan``, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`) -- :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`) -- :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`) -- :func:`~DataFrame.to_csv`, :func:`~Series.to_csv`, :func:`~DataFrame.to_json`, and :func:`~Series.to_json` now support ``compression='infer'`` to infer compression based on filename extension (:issue:`15008`). - The default compression for ``to_csv``, ``to_json``, and ``to_pickle`` methods has been updated to ``'infer'`` (:issue:`22004`). -- :func:`to_timedelta` now supports iso-formated timedelta strings (:issue:`21877`) -- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`) -- :class:`DatetimeIndex` gained :attr:`DatetimeIndex.timetz` attribute. Returns local time with timezone information. 
(:issue:`21358`) -- :meth:`round`, :meth:`ceil`, and meth:`floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`) -- :class:`Resampler` now is iterable like :class:`GroupBy` (:issue:`15314`). -- :meth:`Series.resample` and :meth:`DataFrame.resample` have gained the :meth:`Resampler.quantile` (:issue:`15023`). -- :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). -- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). -- Compatibility with Matplotlib 3.0 (:issue:`22790`). - -.. _whatsnew_0240.api_breaking: - -Backwards incompatible API changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - -.. _whatsnew_0240.api_breaking.deps: - -Dependencies have increased minimum versions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -We have updated our minimum supported versions of dependencies (:issue:`21242`). -If installed, we now require: - -+-----------------+-----------------+----------+ -| Package | Minimum Version | Required | -+=================+=================+==========+ -| numpy | 1.12.0 | X | -+-----------------+-----------------+----------+ -| bottleneck | 1.2.0 | | -+-----------------+-----------------+----------+ -| matplotlib | 2.0.0 | | -+-----------------+-----------------+----------+ -| numexpr | 2.6.1 | | -+-----------------+-----------------+----------+ -| pytables | 3.4.2 | | -+-----------------+-----------------+----------+ -| scipy | 0.18.1 | | -+-----------------+-----------------+----------+ - -.. 
_whatsnew_0240.api_breaking.interval_values: - -``IntervalIndex.values`` is now an ``IntervalArray`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The :attr:`~Interval.values` attribute of an :class:`IntervalIndex` now returns an -``IntervalArray``, rather than a NumPy array of :class:`Interval` objects (:issue:`19453`). - -Previous Behavior: - -.. code-block:: ipython - - In [1]: idx = pd.interval_range(0, 4) - - In [2]: idx.values - Out[2]: - array([Interval(0, 1, closed='right'), Interval(1, 2, closed='right'), - Interval(2, 3, closed='right'), Interval(3, 4, closed='right')], - dtype=object) - -New Behavior: - -.. ipython:: python - - idx = pd.interval_range(0, 4) - idx.values - -This mirrors ``CategoricalIndex.values``, which returns a ``Categorical``. - -For situations where you need an ``ndarray`` of ``Interval`` objects, use -:meth:`numpy.asarray` or ``idx.astype(object)``. - -.. ipython:: python - - np.asarray(idx) - idx.values.astype(object) - -.. _whatsnew_0240.api.timezone_offset_parsing: - -Parsing Datetime Strings with Timezone Offsets -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Previously, parsing datetime strings with UTC offsets with :func:`to_datetime` -or :class:`DatetimeIndex` would automatically convert the datetime to UTC -without timezone localization. This is inconsistent from parsing the same -datetime string with :class:`Timestamp` which would preserve the UTC -offset in the ``tz`` attribute. Now, :func:`to_datetime` preserves the UTC -offset in the ``tz`` attribute when all the datetime strings have the same -UTC offset (:issue:`17697`, :issue:`11736`, :issue:`22457`) - -*Previous Behavior*: - -.. 
code-block:: ipython - - In [2]: pd.to_datetime("2015-11-18 15:30:00+05:30") - Out[2]: Timestamp('2015-11-18 10:00:00') - - In [3]: pd.Timestamp("2015-11-18 15:30:00+05:30") - Out[3]: Timestamp('2015-11-18 15:30:00+0530', tz='pytz.FixedOffset(330)') - - # Different UTC offsets would automatically convert the datetimes to UTC (without a UTC timezone) - In [4]: pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - Out[4]: DatetimeIndex(['2015-11-18 10:00:00', '2015-11-18 10:00:00'], dtype='datetime64[ns]', freq=None) - -*Current Behavior*: - -.. ipython:: python - - pd.to_datetime("2015-11-18 15:30:00+05:30") - pd.Timestamp("2015-11-18 15:30:00+05:30") - -Parsing datetime strings with the same UTC offset will preserve the UTC offset in the ``tz`` attribute - -.. ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30"] * 2) - -Parsing datetime strings with different UTC offsets will now create an Index of -``datetime.datetime`` objects with different UTC offsets - -.. ipython:: python - - idx = pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"]) - idx - idx[0] - idx[1] - -Passing ``utc=True`` will mimic the previous behavior but will correctly indicate -that the dates have been converted to UTC - -.. ipython:: python - - pd.to_datetime(["2015-11-18 15:30:00+05:30", "2015-11-18 16:30:00+06:30"], utc=True) - -.. _whatsnew_0240.api_breaking.calendarday: - -CalendarDay Offset -^^^^^^^^^^^^^^^^^^ - -:class:`Day` and associated frequency alias ``'D'`` were documented to represent -a calendar day; however, arithmetic and operations with :class:`Day` sometimes -respected absolute time instead (i.e. ``Day(n)`` acted identically to ``Timedelta(days=n)``). - -*Previous Behavior*: - -.. 
code-block:: ipython - - - In [2]: ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - - # Respects calendar arithmetic - In [3]: pd.date_range(start=ts, freq='D', periods=3) - Out[3]: - DatetimeIndex(['2016-10-30 00:00:00+03:00', '2016-10-31 00:00:00+02:00', - '2016-11-01 00:00:00+02:00'], - dtype='datetime64[ns, Europe/Helsinki]', freq='D') - - # Respects absolute arithmetic - In [4]: ts + pd.tseries.frequencies.to_offset('D') - Out[4]: Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki') - -:class:`CalendarDay` and associated frequency alias ``'CD'`` are now available -and respect calendar day arithmetic while :class:`Day` and frequency alias ``'D'`` -will now respect absolute time (:issue:`22274`, :issue:`20596`, :issue:`16980`, :issue:`8774`) -See the :ref:`documentation here ` for more information. - -Addition with :class:`CalendarDay` across a daylight savings time transition: - -.. ipython:: python - - ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') - ts + pd.offsets.Day(1) - ts + pd.offsets.CalendarDay(1) - -.. _whatsnew_0240.api_breaking.period_end_time: - -Time values in ``dt.end_time`` and ``to_timestamp(how='end')`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -The time values in :class:`Period` and :class:`PeriodIndex` objects are now set -to '23:59:59.999999999' when calling :attr:`Series.dt.end_time`, :attr:`Period.end_time`, -:attr:`PeriodIndex.end_time`, :func:`Period.to_timestamp()` with ``how='end'``, -or :func:`PeriodIndex.to_timestamp()` with ``how='end'`` (:issue:`17157`) - -Previous Behavior: - -.. 
code-block:: ipython - - In [2]: p = pd.Period('2017-01-01', 'D') - In [3]: pi = pd.PeriodIndex([p]) - - In [4]: pd.Series(pi).dt.end_time[0] - Out[4]: Timestamp(2017-01-01 00:00:00) - - In [5]: p.end_time - Out[5]: Timestamp(2017-01-01 23:59:59.999999999) - -Current Behavior: - -Calling :attr:`Series.dt.end_time` will now result in a time of '23:59:59.999999999' as -is the case with :attr:`Period.end_time`, for example - -.. ipython:: python - - p = pd.Period('2017-01-01', 'D') - pi = pd.PeriodIndex([p]) - - pd.Series(pi).dt.end_time[0] - - p.end_time - -.. _whatsnew_0240.api_breaking.sparse_values: - -Sparse Data Structure Refactor -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``SparseArray``, the array backing ``SparseSeries`` and the columns in a ``SparseDataFrame``, -is now an extension array (:issue:`21978`, :issue:`19056`, :issue:`22835`). -To conform to this interface and for consistency with the rest of pandas, some API breaking -changes were made: - -- ``SparseArray`` is no longer a subclass of :class:`numpy.ndarray`. To convert a SparseArray to a NumPy array, use :meth:`numpy.asarray`. -- ``SparseArray.dtype`` and ``SparseSeries.dtype`` are now instances of :class:`SparseDtype`, rather than ``np.dtype``. Access the underlying dtype with ``SparseDtype.subtype``. -- :meth:`numpy.asarray(sparse_array)` now returns a dense array with all the values, not just the non-fill-value values (:issue:`14167`) -- ``SparseArray.take`` now matches the API of :meth:`pandas.api.extensions.ExtensionArray.take` (:issue:`19506`): - - * The default value of ``allow_fill`` has changed from ``False`` to ``True``. - * The ``out`` and ``mode`` parameters are no longer accepted (previously, this raised if they were specified). - * Passing a scalar for ``indices`` is no longer allowed. - -- The result of concatenating a mix of sparse and dense Series is a Series with sparse values, rather than a ``SparseSeries``. 
-- ``SparseDataFrame.combine`` and ``DataFrame.combine_first`` no longer supports combining a sparse column with a dense column while preserving the sparse subtype. The result will be an object-dtype SparseArray. -- Setting :attr:`SparseArray.fill_value` to a fill value with a different dtype is now allowed. - - -Some new warnings are issued for operations that require or are likely to materialize a large dense array: - -- A :class:`errors.PerformanceWarning` is issued when using fillna with a ``method``, as a dense array is constructed to create the filled array. Filling with a ``value`` is the efficient way to fill a sparse array. -- A :class:`errors.PerformanceWarning` is now issued when concatenating sparse Series with differing fill values. The fill value from the first sparse array continues to be used. - -In addition to these API breaking changes, many :ref:`performance improvements and bug fixes have been made `. - -.. _whatsnew_0240.api_breaking.frame_to_dict_index_orient: - -Raise ValueError in ``DataFrame.to_dict(orient='index')`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Bug in :func:`DataFrame.to_dict` raises ``ValueError`` when used with -``orient='index'`` and a non-unique index instead of losing data (:issue:`22801`) - -.. ipython:: python - :okexcept: - - df = pd.DataFrame({'a': [1, 2], 'b': [0.5, 0.75]}, index=['A', 'A']) - df - - df.to_dict(orient='index') - -.. _whatsnew_0240.api.datetimelike.normalize: - -Tick DateOffset Normalize Restrictions -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Creating a ``Tick`` object (:class:`Day`, :class:`Hour`, :class:`Minute`, -:class:`Second`, :class:`Milli`, :class:`Micro`, :class:`Nano`) with -``normalize=True`` is no longer supported. This prevents unexpected behavior -where addition could fail to be monotone or associative. (:issue:`21427`) - -*Previous Behavior*: - -.. 
code-block:: ipython - - - In [2]: ts = pd.Timestamp('2018-06-11 18:01:14') - - In [3]: ts - Out[3]: Timestamp('2018-06-11 18:01:14') - - In [4]: tic = pd.offsets.Hour(n=2, normalize=True) - ...: - - In [5]: tic - Out[5]: <2 * Hours> - - In [6]: ts + tic - Out[6]: Timestamp('2018-06-11 00:00:00') - - In [7]: ts + tic + tic + tic == ts + (tic + tic + tic) - Out[7]: False - -*Current Behavior*: - -.. ipython:: python - - ts = pd.Timestamp('2018-06-11 18:01:14') - tic = pd.offsets.Hour(n=2) - ts + tic + tic + tic == ts + (tic + tic + tic) - - -.. _whatsnew_0240.api.datetimelike: - - -.. _whatsnew_0240.api.period_subtraction: - -Period Subtraction -^^^^^^^^^^^^^^^^^^ - -Subtraction of a ``Period`` from another ``Period`` will give a ``DateOffset`` -instead of an integer (:issue:`21314`) - -.. ipython:: python - - june = pd.Period('June 2018') - april = pd.Period('April 2018') - june - april - -Previous Behavior: - -.. code-block:: ipython - - In [2]: june = pd.Period('June 2018') - - In [3]: april = pd.Period('April 2018') - - In [4]: june - april - Out [4]: 2 - -Similarly, subtraction of a ``Period`` from a ``PeriodIndex`` will now return -an ``Index`` of ``DateOffset`` objects instead of an ``Int64Index`` - -.. ipython:: python - - pi = pd.period_range('June 2018', freq='M', periods=3) - pi - pi[0] - -Previous Behavior: - -.. code-block:: ipython - - In [2]: pi = pd.period_range('June 2018', freq='M', periods=3) - - In [3]: pi - pi[0] - Out[3]: Int64Index([0, 1, 2], dtype='int64') - - -.. _whatsnew_0240.api.timedelta64_subtract_nan: - -Addition/Subtraction of ``NaN`` from :class:`DataFrame` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Adding or subtracting ``NaN`` from a :class:`DataFrame` column with -``timedelta64[ns]`` dtype will now raise a ``TypeError`` instead of returning -all-``NaT``. This is for compatibility with ``TimedeltaIndex`` and -``Series`` behavior (:issue:`22163`) - -.. 
ipython:: python - :okexcept: - - df = pd.DataFrame([pd.Timedelta(days=1)]) - df - np.nan - -Previous Behavior: - -.. code-block:: ipython - - In [4]: df = pd.DataFrame([pd.Timedelta(days=1)]) - - In [5]: df - np.nan - Out[5]: - 0 - 0 NaT - - -.. _whatsnew_0240.api.dataframe_arithmetic_broadcasting: - -DataFrame Arithmetic Operations Broadcasting Changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -:class:`DataFrame` arithmetic operations when operating with 2-dimensional -``np.ndarray`` objects now broadcast in the same way as ``np.ndarray``s -broadcast. (:issue:`23000`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: arr = np.arange(6).reshape(3, 2) - In [4]: df = pd.DataFrame(arr) - In [5]: df + arr[[0], :] # 1 row, 2 columns - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (1, 2) - In [6]: df + arr[:, [1]] # 1 column, 3 rows - ... - ValueError: Unable to coerce to DataFrame, shape must be (3, 2): given (3, 1) - -*Current Behavior*: - -.. ipython:: python - arr = np.arange(6).reshape(3, 2) - df = pd.DataFrame(arr) - df - -.. ipython:: python - df + arr[[0], :] # 1 row, 2 columns - df + arr[:, [1]] # 1 column, 3 rows - - -.. _whatsnew_0240.api.extension: - -ExtensionType Changes -^^^^^^^^^^^^^^^^^^^^^ - -**:class:`pandas.api.extensions.ExtensionDtype` Equality and Hashability** - -Pandas now requires that extension dtypes be hashable. The base class implements -a default ``__eq__`` and ``__hash__``. If you have a parametrized dtype, you should -update the ``ExtensionDtype._metadata`` tuple to match the signature of your -``__init__`` method. See :class:`pandas.api.extensions.ExtensionDtype` for more (:issue:`22476`). - -**Other changes** - -- ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) -- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. 
``decimal`` would instantiate a registered ``DecimalDtype``; furthermore - the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) -- An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) -- Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). -- The ``ExtensionArray`` constructor, ``_from_sequence`` now takes the keyword arg ``copy=False`` (:issue:`21185`) -- Bug in :meth:`Series.get` for ``Series`` using ``ExtensionArray`` and integer index (:issue:`21257`) -- :meth:`~Series.shift` now dispatches to :meth:`ExtensionArray.shift` (:issue:`22386`) -- :meth:`Series.combine()` works correctly with :class:`~pandas.api.extensions.ExtensionArray` inside of :class:`Series` (:issue:`20825`) -- :meth:`Series.combine()` with scalar argument now works for any function type (:issue:`21248`) -- :meth:`Series.astype` and :meth:`DataFrame.astype` now dispatch to :meth:`ExtensionArray.astype` (:issue:`21185`). -- Slicing a single row of a ``DataFrame`` with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- Added :meth:`pandas.api.types.register_extension_dtype` to register an extension type with pandas (:issue:`22664`) -- Series backed by an ``ExtensionArray`` now work with :func:`util.hash_pandas_object` (:issue:`23066`) -- Updated the ``.type`` attribute for ``PeriodDtype``, ``DatetimeTZDtype``, and ``IntervalDtype`` to be instances of the dtype (``Period``, ``Timestamp``, and ``Interval`` respectively) (:issue:`22938`) -- :func:`ExtensionArray.isna` is allowed to return an ``ExtensionArray`` (:issue:`22325`). -- Support for reduction operations such as ``sum``, ``mean`` via opt-in base class method override (:issue:`22762`) - -.. 
_whatsnew_0240.api.incompatibilities: - -Series and Index Data-Dtype Incompatibilities -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -``Series`` and ``Index`` constructors now raise when the -data is incompatible with a passed ``dtype=`` (:issue:`15832`) - -Previous Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - 0 18446744073709551615 - dtype: uint64 - -Current Behavior: - -.. code-block:: ipython - - In [4]: pd.Series([-1], dtype="uint64") - Out [4]: - ... - OverflowError: Trying to coerce negative values to unsigned integers - -.. _whatsnew_0240.api.crosstab_dtypes: - -Crosstab Preserves Dtypes -^^^^^^^^^^^^^^^^^^^^^^^^^ - -:func:`crosstab` will now preserve dtypes in some cases that previously would -cast from integer dtype to floating dtype (:issue:`22019`) - -Previous Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - Out[4]: - b 3 4 - a - 1 0.5 0.0 - 2 0.5 1.0 - -Current Behavior: - -.. code-block:: ipython - - In [3]: df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], - ...: 'c': [1, 1, np.nan, 1, 1]}) - In [4]: pd.crosstab(df.a, df.b, normalize='columns') - -Datetimelike API Changes -^^^^^^^^^^^^^^^^^^^^^^^^ - -- For :class:`DatetimeIndex` and :class:`TimedeltaIndex` with non-``None`` ``freq`` attribute, addition or subtraction of integer-dtyped array or ``Index`` will return an object of the same class (:issue:`19959`) -- :class:`DateOffset` objects are now immutable. 
Attempting to alter one of these will now raise ``AttributeError`` (:issue:`21341`) -- :class:`PeriodIndex` subtraction of another ``PeriodIndex`` will now return an object-dtype :class:`Index` of :class:`DateOffset` objects instead of raising a ``TypeError`` (:issue:`20049`) -- :func:`cut` and :func:`qcut` now returns a :class:`DatetimeIndex` or :class:`TimedeltaIndex` bins when the input is datetime or timedelta dtype respectively and ``retbins=True`` (:issue:`19891`) -- :meth:`DatetimeIndex.to_period` and :meth:`Timestamp.to_period` will issue a warning when timezone information will be lost (:issue:`21333`) - -.. _whatsnew_0240.api.other: - -Other API Changes -^^^^^^^^^^^^^^^^^ - -- :class:`DatetimeIndex` now accepts :class:`Int64Index` arguments as epoch timestamps (:issue:`20997`) -- Accessing a level of a ``MultiIndex`` with a duplicate name (e.g. in - :meth:`~MultiIndex.get_level_values`) now raises a ``ValueError`` instead of - a ``KeyError`` (:issue:`21678`). -- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`) -- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`) -- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`) -- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`) -- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`) -- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`) -- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`) -- 
Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`) -- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`) - -.. _whatsnew_0240.deprecations: - -Deprecations -~~~~~~~~~~~~ - -- :meth:`DataFrame.to_stata`, :meth:`read_stata`, :class:`StataReader` and :class:`StataWriter` have deprecated the ``encoding`` argument. The encoding of a Stata dta file is determined by the file type and cannot be changed (:issue:`21244`) -- :meth:`MultiIndex.to_hierarchical` is deprecated and will be removed in a future version (:issue:`21613`) -- :meth:`Series.ptp` is deprecated. Use ``numpy.ptp`` instead (:issue:`21614`) -- :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) -- The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) -- :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) -- :func:`pandas.read_table` is deprecated. Instead, use :func:`pandas.read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) -- :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain - many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) -- :meth:`FrozenNDArray.searchsorted` has deprecated the ``v`` parameter in favor of ``value`` (:issue:`14645`) -- :func:`DatetimeIndex.shift` and :func:`PeriodIndex.shift` now accept ``periods`` argument instead of ``n`` for consistency with :func:`Index.shift` and :func:`Series.shift`. Using ``n`` throws a deprecation warning (:issue:`22458`, :issue:`22912`) - -.. 
_whatsnew_0240.prior_deprecations: - -Removal of prior version deprecations/changes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- The ``LongPanel`` and ``WidePanel`` classes have been removed (:issue:`10892`) -- :meth:`Series.repeat` has renamed the ``reps`` argument to ``repeats`` (:issue:`14645`) -- Several private functions were removed from the (non-public) module ``pandas.core.common`` (:issue:`22001`) -- Removal of the previously deprecated module ``pandas.core.datetools`` (:issue:`14105`, :issue:`14094`) -- Strings passed into :meth:`DataFrame.groupby` that refer to both column and index levels will raise a ``ValueError`` (:issue:`14432`) -- :meth:`Index.repeat` and :meth:`MultiIndex.repeat` have renamed the ``n`` argument to ``repeats`` (:issue:`14645`) -- Removal of the previously deprecated ``as_indexer`` keyword completely from ``str.match()`` (:issue:`22356`, :issue:`6581`) -- Removed the ``pandas.formats.style`` shim for :class:`pandas.io.formats.style.Styler` (:issue:`16059`) -- :meth:`Categorical.searchsorted` and :meth:`Series.searchsorted` have renamed the ``v`` argument to ``value`` (:issue:`14645`) -- :meth:`TimedeltaIndex.searchsorted`, :meth:`DatetimeIndex.searchsorted`, and :meth:`PeriodIndex.searchsorted` have renamed the ``key`` argument to ``value`` (:issue:`14645`) -- Removal of the previously deprecated module ``pandas.json`` (:issue:`19944`) - -.. _whatsnew_0240.performance: - -Performance Improvements -~~~~~~~~~~~~~~~~~~~~~~~~ - -- Very large improvement in performance of slicing when the index is a :class:`CategoricalIndex`, - both when indexing by label (using .loc) and position(.iloc). - Likewise, slicing a ``CategoricalIndex`` itself (i.e. 
``ci[100:200]``) shows similar speed improvements (:issue:`21659`) -- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`) -- Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) -- Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) -- Improved performance of membership checks in :class:`Categorical` and :class:`CategoricalIndex` - (i.e. ``x in cat``-style checks are much faster). :meth:`CategoricalIndex.contains` - is likewise much faster (:issue:`21369`, :issue:`21508`) -- Improved performance of :meth:`HDFStore.groups` (and dependent functions like - :meth:`~HDFStore.keys`; i.e. ``x in store`` checks are much faster) - (:issue:`21372`) -- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) -- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`) -- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`) - - -.. _whatsnew_0240.docs: - -Documentation Changes -~~~~~~~~~~~~~~~~~~~~~ - -- Added sphinx spelling extension, updated documentation on how to use the spell check (:issue:`21079`) -- -- - -.. _whatsnew_0240.bug_fixes: - -Bug Fixes -~~~~~~~~~ - -Categorical -^^^^^^^^^^^ - -- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``. -- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`) -- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`). 
- -Datetimelike -^^^^^^^^^^^^ - -- Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) -- Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`, :issue:`21365`) -- Bug in :func:`to_datetime` that did not consistently return an :class:`Index` when ``box=True`` was specified (:issue:`21864`) -- Bug in :class:`DatetimeIndex` comparisons where string comparisons incorrectly raises ``TypeError`` (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against ``timedelta64[ns]`` dtyped arrays; in some cases ``TypeError`` was incorrectly raised, in others it incorrectly failed to raise (:issue:`22074`) -- Bug in :class:`DatetimeIndex` comparisons when comparing against object-dtyped arrays (:issue:`22074`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``Timedelta``-like objects (:issue:`22005`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype addition and subtraction with ``DateOffset`` objects returning an ``object`` dtype instead of ``datetime64[ns]`` dtype (:issue:`21610`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype comparing against ``NaT`` incorrectly (:issue:`22242`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``Timestamp``-like object incorrectly returned ``datetime64[ns]`` dtype instead of ``timedelta64[ns]`` dtype (:issue:`8554`, :issue:`22163`) -- Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) -- Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) -- Bug in 
:class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) -- Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) -- Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) -- Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) -- Bug in :class:`DatetimeIndex` where frequency was being set if original frequency was ``None`` (:issue:`22150`) -- Bug in rounding methods of :class:`DatetimeIndex` (:meth:`~DatetimeIndex.round`, :meth:`~DatetimeIndex.ceil`, :meth:`~DatetimeIndex.floor`) and :class:`Timestamp` (:meth:`~Timestamp.round`, :meth:`~Timestamp.ceil`, :meth:`~Timestamp.floor`) could give rise to loss of precision (:issue:`22591`) -- Bug in :func:`to_datetime` with an :class:`Index` argument that would drop the ``name`` from the result (:issue:`21697`) -- Bug in :class:`PeriodIndex` where adding or subtracting a :class:`timedelta` or :class:`Tick` object produced incorrect results (:issue:`22988`) - -Timedelta -^^^^^^^^^ -- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype division by ``Timedelta``-like scalar incorrectly returning ``timedelta64[ns]`` dtype instead of ``float64`` dtype (:issue:`20088`, :issue:`22163`) -- Bug in adding a :class:`Index` with object dtype to a :class:`Series` with ``timedelta64[ns]`` dtype incorrectly raising (:issue:`22390`) -- Bug in multiplying a :class:`Series` with numeric dtype against a ``timedelta`` object (:issue:`22390`) -- Bug in :class:`Series` with numeric dtype when adding or subtracting an array or ``Series`` with ``timedelta64`` dtype (:issue:`22390`) -- Bug in :class:`Index` with numeric dtype when multiplying or dividing an array with dtype ``timedelta64`` (:issue:`22390`) -- Bug in :class:`TimedeltaIndex` 
incorrectly allowing indexing with ``Timestamp`` object (:issue:`20464`) -- Fixed bug where subtracting :class:`Timedelta` from an object-dtyped array would raise ``TypeError`` (:issue:`21980`) -- Fixed bug in adding a :class:`DataFrame` with all-`timedelta64[ns]` dtypes to a :class:`DataFrame` with all-integer dtypes returning incorrect results instead of raising ``TypeError`` (:issue:`22696`) -- - -Timezones -^^^^^^^^^ - -- Bug in :meth:`DatetimeIndex.shift` where an ``AssertionError`` would raise when shifting across DST (:issue:`8616`) -- Bug in :class:`Timestamp` constructor where passing an invalid timezone offset designator (``Z``) would not raise a ``ValueError`` (:issue:`8910`) -- Bug in :meth:`Timestamp.replace` where replacing at a DST boundary would retain an incorrect offset (:issue:`7825`) -- Bug in :meth:`Series.replace` with ``datetime64[ns, tz]`` data when replacing ``NaT`` (:issue:`11792`) -- Bug in :class:`Timestamp` when passing different string date formats with a timezone offset would produce different timezone offsets (:issue:`12064`) -- Bug when comparing a tz-naive :class:`Timestamp` to a tz-aware :class:`DatetimeIndex` which would coerce the :class:`DatetimeIndex` to tz-naive (:issue:`12601`) -- Bug in :meth:`Series.truncate` with a tz-aware :class:`DatetimeIndex` which would cause a core dump (:issue:`9243`) -- Bug in :class:`Series` constructor which would coerce tz-aware and tz-naive :class:`Timestamp` to tz-aware (:issue:`13051`) -- Bug in :class:`Index` with ``datetime64[ns, tz]`` dtype that did not localize integer data correctly (:issue:`20964`) -- Bug in :class:`DatetimeIndex` where constructing with an integer and tz would not localize correctly (:issue:`12619`) -- Fixed bug where :meth:`DataFrame.describe` and :meth:`Series.describe` on tz-aware datetimes did not show `first` and `last` result (:issue:`21328`) -- Bug in :class:`DatetimeIndex` comparisons failing to raise ``TypeError`` when comparing timezone-aware 
``DatetimeIndex`` against ``np.datetime64`` (:issue:`22074`) -- Bug in ``DataFrame`` assignment with a timezone-aware scalar (:issue:`19843`) -- Bug in :func:`DataFrame.asof` that raised a ``TypeError`` when attempting to compare tz-naive and tz-aware timestamps (:issue:`21194`) -- Bug when constructing a :class:`DatetimeIndex` with :class:`Timestamp` objects constructed with the ``replace`` method across DST (:issue:`18785`) -- Bug when setting a new value with :meth:`DataFrame.loc` with a :class:`DatetimeIndex` with a DST transition (:issue:`18308`, :issue:`20724`) -- Bug in :meth:`DatetimeIndex.unique` that did not re-localize tz-aware dates correctly (:issue:`21737`) -- Bug when indexing a :class:`Series` with a DST transition (:issue:`21846`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` where an ``AmbiguousTimeError`` or ``NonExistentTimeError`` would raise if a timezone aware timeseries ended on a DST transition (:issue:`19375`, :issue:`10117`) - -Offsets -^^^^^^^ - -- Bug in :class:`FY5253` where date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`) -- Bug in :class:`DateOffset` where keyword arguments ``week`` and ``milliseconds`` were accepted and ignored. Passing these will now raise ``ValueError`` (:issue:`19398`) -- - -Numeric -^^^^^^^ - -- Bug in :class:`Series` ``__rmatmul__`` doesn't support matrix vector multiplication (:issue:`21530`) -- Bug in :func:`factorize` fails with read-only array (:issue:`12813`) -- Fixed bug in :func:`unique` handled signed zeros inconsistently: for some inputs 0.0 and -0.0 were treated as equal and for some inputs as different. Now they are treated as equal for all inputs (:issue:`21866`) -- Bug in :meth:`DataFrame.agg`, :meth:`DataFrame.transform` and :meth:`DataFrame.apply` where, - when supplied with a list of functions and ``axis=1`` (e.g. ``df.apply(['sum', 'mean'], axis=1)``), - a ``TypeError`` was wrongly raised. 
For all three methods such calculations are now done correctly. (:issue:`16679`). -- Bug in :class:`Series` comparison against datetime-like scalars and arrays (:issue:`22074`) -- Bug in :class:`DataFrame` multiplication between boolean dtype and integer returning ``object`` dtype instead of integer dtype (:issue:`22047`, :issue:`22163`) -- Bug in :meth:`DataFrame.apply` where, when supplied with a string argument and additional positional or keyword arguments (e.g. ``df.apply('sum', min_count=1)``), a ``TypeError`` was wrongly raised (:issue:`22376`) -- Bug in :meth:`DataFrame.astype` to extension dtype may raise ``AttributeError`` (:issue:`22578`) -- Bug in :class:`DataFrame` with ``timedelta64[ns]`` dtype arithmetic operations with ``ndarray`` with integer dtype incorrectly treating the ndarray as ``timedelta64[ns]`` dtype (:issue:`23114`) - -Strings -^^^^^^^ - -- -- -- - -Interval -^^^^^^^^ - -- Bug in the :class:`IntervalIndex` constructor where the ``closed`` parameter did not always override the inferred ``closed`` (:issue:`19370`) -- Bug in the ``IntervalIndex`` repr where a trailing comma was missing after the list of intervals (:issue:`20611`) -- Bug in :class:`Interval` where scalar arithmetic operations did not retain the ``closed`` value (:issue:`22313`) -- Bug in :class:`IntervalIndex` where indexing with datetime-like values raised a ``KeyError`` (:issue:`20636`) - -Indexing -^^^^^^^^ - -- The traceback from a ``KeyError`` when asking ``.loc`` for a single missing label is now shorter and more clear (:issue:`21557`) -- When ``.ix`` is asked for a missing integer label in a :class:`MultiIndex` with a first level of integer type, it now raises a ``KeyError``, consistently with the case of a flat :class:`Int64Index`, rather than falling back to positional indexing (:issue:`21593`) -- Bug in :meth:`DatetimeIndex.reindex` when reindexing a tz-naive and tz-aware :class:`DatetimeIndex` (:issue:`8306`) -- Bug in :class:`DataFrame` when setting values with 
``.loc`` and a timezone aware :class:`DatetimeIndex` (:issue:`11365`) -- ``DataFrame.__getitem__`` now accepts dictionaries and dictionary keys as list-likes of labels, consistently with ``Series.__getitem__`` (:issue:`21294`) -- Fixed ``DataFrame[np.nan]`` when columns are non-unique (:issue:`21428`) -- Bug when indexing :class:`DatetimeIndex` with nanosecond resolution dates and timezones (:issue:`11679`) -- Bug where indexing with a NumPy array containing negative values would mutate the indexer (:issue:`21867`) -- Bug where mixed indexes wouldn't allow integers for ``.at`` (:issue:`19860`) -- ``Float64Index.get_loc`` now raises ``KeyError`` when a boolean key is passed (:issue:`19087`) -- Bug in :meth:`DataFrame.loc` when indexing with an :class:`IntervalIndex` (:issue:`19977`) -- :class:`Index` no longer mangles ``None``, ``NaN`` and ``NaT``, i.e. they are treated as three different keys. However, for numeric Index all three are still coerced to a ``NaN`` (:issue:`22332`) -- Bug in ``scalar in Index`` if scalar is a float while the ``Index`` is of integer dtype (:issue:`22085`) - -Missing -^^^^^^^ - -- Bug in :func:`DataFrame.fillna` where a ``ValueError`` would raise when one column contained a ``datetime64[ns, tz]`` dtype (:issue:`15522`) -- Bug in :func:`Series.hasnans` that could be incorrectly cached and return incorrect answers if null elements are introduced after an initial call (:issue:`19700`) -- :func:`Series.isin` now treats all NaN-floats as equal also for ``np.object``-dtype. This behavior is consistent with the behavior for float64 (:issue:`22119`) -- :func:`unique` no longer mangles NaN-floats and the ``NaT``-object for ``np.object``-dtype, i.e. ``NaT`` is no longer coerced to a NaN-value and is treated as a different entity. 
(:issue:`22295`) - - -MultiIndex -^^^^^^^^^^ - -- Removed compatibility for :class:`MultiIndex` pickles prior to version 0.8.0; compatibility with :class:`MultiIndex` pickles from version 0.13 forward is maintained (:issue:`21654`) -- :meth:`MultiIndex.get_loc_level` (and as a consequence, ``.loc`` on a :class:`MultiIndex`\ ed object) will now raise a ``KeyError``, rather than returning an empty ``slice``, if asked a label which is present in the ``levels`` but is unused (:issue:`22221`) -- Fix ``TypeError`` in Python 3 when creating :class:`MultiIndex` in which some levels have mixed types, e.g. when some labels are tuples (:issue:`15457`) - -I/O -^^^ - -- :func:`read_html()` no longer ignores all-whitespace ``<tr>`` within ``<thead>`` when considering the ``skiprows`` and ``header`` arguments. Previously, users had to decrease their ``header`` and ``skiprows`` values on such tables to work around the issue. (:issue:`21641`) -- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`) -- :func:`read_csv()` and :func:`read_table()` will throw ``UnicodeError`` and not coredump on badly encoded strings (:issue:`22748`) -- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`) -- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`) -- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`) -- :func:`read_sas()` will correctly parse sas7bdat files with data page types having also bit 7 set (so page type is 128 + 256 = 384) (:issue:`16615`) -- Bug in :meth:`detect_client_encoding` where potential ``IOError`` goes unhandled when importing in a mod_wsgi process due to restricted access to stdout. 
(:issue:`21552`) -- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) - -Plotting -^^^^^^^^ - -- Bug in :func:`DataFrame.plot.scatter` and :func:`DataFrame.plot.hexbin` caused x-axis label and ticklabels to disappear when colorbar was on in IPython inline backend (:issue:`10611`, :issue:`10678`, and :issue:`20455`) -- Bug in plotting a Series with datetimes using :func:`matplotlib.axes.Axes.scatter` (:issue:`22039`) - -Groupby/Resample/Rolling -^^^^^^^^^^^^^^^^^^^^^^^^ - -- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` with ``as_index=False`` leading to the loss of timezone information (:issue:`15884`) -- Bug in :meth:`DatetimeIndex.resample` when downsampling across a DST boundary (:issue:`8531`) -- Bug where ``ValueError`` is wrongly raised when calling :func:`~pandas.core.groupby.SeriesGroupBy.count` method of a - ``SeriesGroupBy`` when the grouping variable only contains NaNs and numpy version < 1.13 (:issue:`21956`). -- Multiple bugs in :func:`pandas.core.Rolling.min` with ``closed='left'`` and a - datetime-like index leading to incorrect results and also segfault. (:issue:`21704`) -- Bug in :meth:`Resampler.apply` when passing positional arguments to applied func (:issue:`14615`). -- Bug in :meth:`Series.resample` when passing ``numpy.timedelta64`` to ``loffset`` kwarg (:issue:`7687`). -- Bug in :meth:`Resampler.asfreq` when frequency of ``TimedeltaIndex`` is a subperiod of a new frequency (:issue:`13022`). -- Bug in :meth:`SeriesGroupBy.mean` when values were integral but could not fit inside of int64, overflowing instead. 
(:issue:`22487`) -- :func:`RollingGroupby.agg` and :func:`ExpandingGroupby.agg` now support multiple aggregation functions as parameters (:issue:`15072`) -- Bug in :meth:`DataFrame.resample` and :meth:`Series.resample` when resampling by a weekly offset (``'W'``) across a DST transition (:issue:`9119`, :issue:`21459`) - -Reshaping -^^^^^^^^^ - -- Bug in :func:`pandas.concat` when joining resampled DataFrames with timezone aware index (:issue:`13783`) -- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) -- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) -- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) -- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) -- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) -- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`) -- Bug in :meth:`DataFrame.replace` raises ``RecursionError`` when replacing empty lists (:issue:`22083`) -- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when dict is used as the ``to_replace`` value and one key in the dict is another key's value, the results were inconsistent between using integer key and using string key (:issue:`20656`) -- Bug in :meth:`DataFrame.drop_duplicates` for empty ``DataFrame`` which incorrectly raises an error (:issue:`20516`) -- Bug in :func:`pandas.wide_to_long` when a string is passed to the stubnames argument and a column name is a substring of that stubname (:issue:`22468`) -- Bug in :func:`merge` when merging ``datetime64[ns, tz]`` data that contained a DST transition (:issue:`18885`) -- Bug in :func:`merge_asof` when merging on float values within defined tolerance (:issue:`22981`) -- 
Bug in :func:`pandas.concat` when concatenating a multicolumn DataFrame with tz-aware data against a DataFrame with a different number of columns (:issue:`22796`) -- Bug in :func:`merge_asof` when merging on columns containing null values (:issue:`23189`) - -.. _whatsnew_0240.bug_fixes.sparse: - -Sparse -^^^^^^ - -- Updating a boolean, datetime, or timedelta column to be Sparse now works (:issue:`22367`) -- Bug in :meth:`Series.to_sparse` with Series already holding sparse data not constructing properly (:issue:`22389`) -- Providing a ``sparse_index`` to the SparseArray constructor no longer defaults the na-value to ``np.nan`` for all dtypes. The correct na_value for ``data.dtype`` is now used. -- Bug in ``SparseArray.nbytes`` under-reporting its memory usage by not including the size of its sparse index. -- Improved performance of :meth:`Series.shift` for non-NA ``fill_value``, as values are no longer converted to a dense array. -- Bug in ``DataFrame.groupby`` not including ``fill_value`` in the groups for non-NA ``fill_value`` when grouping by a sparse column (:issue:`5078`) -- Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) - -Build Changes -^^^^^^^^^^^^^ - -- Building pandas for development now requires ``cython >= 0.28.2`` (:issue:`21688`) -- Testing pandas now requires ``hypothesis>=3.58``. You can find `the Hypothesis docs here <https://hypothesis.readthedocs.io/en/latest/index.html>`_, and a pandas-specific introduction :ref:`in the contributing guide <using-hypothesis>`. (:issue:`22280`) -- - -Other -^^^^^ - -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. 
(:issue:`21258`) -- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) -- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) -- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) -- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. -- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) -- Bug in :meth:`DataFrame.combine_first` in which column types were unexpectedly converted to float (:issue:`20699`) From ceac501364994605951b9a1bbda2831c17b1363e Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 16 Oct 2018 07:22:24 +0100 Subject: [PATCH 13/14] move checking of error to later position and update error type --- pandas/core/reshape/merge.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 236707cf32209..20c4afb5641ab 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1390,16 +1390,21 @@ def flip(xs): self.right_join_keys[-1]) tolerance = self.tolerance - # Check null values before merge - if isnull(left_values).sum() > 0 or isnull(right_values).sum() > 0: - raise MergeError('Merge keys cannot contain null values') + # we required sortedness and non-missingness in the join keys + msg_sorted = "{side} keys must be sorted" + msg_missings = "Merge keys contain null values in {side} side" - # we required sortedness in the join keys - msg = "{side} keys must be sorted" if 
not Index(left_values).is_monotonic: - raise ValueError(msg.format(side='left')) + if isnull(left_values).sum() > 0: + raise ValueError(msg_missings.format(side='left')) + else: + raise ValueError(msg_sorted.format(side='left')) + if not Index(right_values).is_monotonic: - raise ValueError(msg.format(side='right')) + if isnull(right_values).sum() > 0: + raise ValueError(msg_missings.format(side='right')) + else: + raise ValueError(msg_sorted.format(side='right')) # initial type conversion as needed if needs_i8_conversion(left_values): From e954263fc7c9203e1e053bf3e00d11336e35c45f Mon Sep 17 00:00:00 2001 From: Sven Date: Tue, 16 Oct 2018 21:11:05 +0100 Subject: [PATCH 14/14] BUG: Fix error message on missing values for merge_asof (#23189) --- pandas/core/reshape/merge.py | 2 +- pandas/tests/reshape/merge/test_merge_asof.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 20c4afb5641ab..88b1ec7e47bbb 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -1392,7 +1392,7 @@ def flip(xs): # we required sortedness and non-missingness in the join keys msg_sorted = "{side} keys must be sorted" - msg_missings = "Merge keys contain null values in {side} side" + msg_missings = "Merge keys contain null values on {side} side" if not Index(left_values).is_monotonic: if isnull(left_values).sum() > 0: diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index ee67d589afd64..ba0cdda61a12c 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1009,7 +1009,6 @@ def test_merge_datatype_error(self): merge_asof(left, right, on='a') def test_merge_on_nans_int(self): - """ Test merging on integer columns with nans throws a correct ValueError """ # 23189 msg = "Merge keys contain null values on left side" left = pd.DataFrame({'a': [1.0, 5.0, 10.0, 12.0, 
np.nan], @@ -1021,11 +1020,12 @@ def test_merge_on_nans_int(self): merge_asof(left, right, on='a') def test_merge_on_nans_datetime(self): - """ Test merging on datetime columns with nans throws correct ValueError """ # 23189 msg = "Merge keys contain null values on right side" - left = pd.DataFrame(pd.date_range('20130101', periods=5), columns=['a']) - right = pd.DataFrame(pd.date_range('20130102', periods=5).append(pd.Index([None])), columns=['a']) + left = pd.DataFrame({"a": pd.date_range('20130101', periods=5)}) + date_vals = pd.date_range('20130102', periods=5)\ + .append(pd.Index([None])) + right = pd.DataFrame({"a": date_vals}) with tm.assert_raises_regex(ValueError, msg): merge_asof(left, right, on='a')