From 52d7d87662ab67f6f4c944088ba4849f69a44a61 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 21 Jun 2022 18:22:19 -0700 Subject: [PATCH 1/2] Backport PR #47287: DEPS: Sync environment.yml with CI dep files --- .github/workflows/code-checks.yml | 29 + .github/workflows/{posix.yml => ubuntu.yml} | 4 +- ci/deps/actions-310.yaml | 3 +- environment.yml | 158 ++--- pandas/core/algorithms.py | 7 +- pandas/core/array_algos/quantile.py | 5 +- pandas/core/arraylike.py | 6 +- pandas/core/arrays/arrow/array.py | 603 ++++++++++++++++++++ pandas/core/arrays/interval.py | 26 +- pandas/core/arrays/masked.py | 20 +- pandas/core/arrays/sparse/array.py | 17 +- pandas/core/dtypes/astype.py | 407 +++++++++++++ pandas/core/dtypes/common.py | 4 +- pandas/core/exchange/buffer.py | 81 +++ pandas/core/frame.py | 4 +- pandas/core/indexes/base.py | 15 +- pandas/core/indexes/multi.py | 4 +- pandas/core/internals/ops.py | 4 +- pandas/core/missing.py | 23 +- pandas/core/reshape/melt.py | 4 +- pandas/core/series.py | 4 +- pandas/core/window/rolling.py | 5 +- pandas/io/formats/style.py | 10 + pandas/io/parsers/c_parser_wrapper.py | 11 +- pandas/tests/extension/date/array.py | 5 +- requirements-dev.txt | 121 ++-- scripts/generate_pip_deps_from_conda.py | 2 +- 27 files changed, 1340 insertions(+), 242 deletions(-) rename .github/workflows/{posix.yml => ubuntu.yml} (98%) create mode 100644 pandas/core/arrays/arrow/array.py create mode 100644 pandas/core/dtypes/astype.py create mode 100644 pandas/core/exchange/buffer.py diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 87b80204d0c19..eaf9fafbff993 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -166,3 +166,32 @@ jobs: - name: Build image run: docker build --pull --no-cache --tag pandas-dev-env . + + requirements-dev-text-installable: + name: Test install requirements-dev.txt + runs-on: ubuntu-latest + + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-requirements-dev-text-installable + cancel-in-progress: true + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup Python + id: setup_python + uses: actions/setup-python@v3 + with: + python-version: '3.8' + cache: 'pip' + cache-dependency-path: 'requirements-dev.txt' + + - name: Install requirements-dev.txt + run: pip install -r requirements-dev.txt + + - name: Check Pip Cache Hit + run: echo ${{ steps.setup_python.outputs.cache-hit }} diff --git a/.github/workflows/posix.yml b/.github/workflows/ubuntu.yml similarity index 98% rename from .github/workflows/posix.yml rename to .github/workflows/ubuntu.yml index 35c40f2a4aa54..1a57f021e6c4c 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/ubuntu.yml @@ -1,4 +1,4 @@ -name: Posix +name: Ubuntu on: push: @@ -145,7 +145,7 @@ jobs: - name: Extra installs # xsel for clipboard tests - run: sudo apt-get update && sudo apt-get install -y libc6-dev-i386 xsel ${{ env.EXTRA_APT }} + run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} - uses: conda-incubator/setup-miniconda@v2.1.1 with: diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index 7a879b5ac9648..27a2715c20e86 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -31,8 +31,7 @@ dependencies: - jinja2 - lxml - matplotlib - # TODO: uncomment after numba supports py310 - #- numba + - numba - numexpr - openpyxl - odfpy diff --git a/environment.yml b/environment.yml index 83b00c0dd6421..c5382811a8467 100644 --- a/environment.yml +++ b/environment.yml @@ -1,21 +1,85 @@ +# Local development dependencies including docs building, website upload, ASV benchmark name: pandas-dev channels: - conda-forge dependencies: - # required - - numpy>=1.18.5 - python=3.8 - - python-dateutil>=2.8.1 + + # test dependencies + - cython=0.29.30 + - pytest>=6.0 + - pytest-cov + - pytest-xdist>=1.31 + - psutil + - pytest-asyncio>=0.17 + - boto3 + + # required dependencies + - python-dateutil + - numpy - pytz + # optional dependencies + - beautifulsoup4 + - blosc + - brotlipy + - bottleneck + - fastparquet + - fsspec + - html5lib + - hypothesis + - gcsfs + - jinja2 + - lxml + - matplotlib + - numba>=0.53.1 + - numexpr>=2.8.0 # pin for "Run checks on imported code" job + - openpyxl + - odfpy + - pandas-gbq + - psycopg2 + - pyarrow + - pymysql + - pyreadstat + - pytables + - python-snappy + - pyxlsb + - s3fs + - scipy + - sqlalchemy + - tabulate + - xarray + - xlrd + - xlsxwriter + - xlwt + - zstandard + + # downstream packages + - aiobotocore<2.0.0 # GH#44311 pinned to fix docbuild + - botocore + - cftime + - dask + - ipython + - geopandas-base + - seaborn + - scikit-learn + - statsmodels + - coverage + - pandas-datareader + - pyyaml + - py + - pytorch + + # local testing dependencies + - moto + - flask + # benchmarks - asv - # building # The compiler packages are meta-packages and install the correct compiler (activation) packages on the respective platforms. - c-compiler - cxx-compiler - - cython>=0.29.30 # code checks - black=22.3.0 @@ -24,18 +88,19 @@ dependencies: - flake8-bugbear=21.3.2 # used by flake8, find likely bugs - flake8-comprehensions=3.7.0 # used by flake8, linting of unnecessary comprehensions - isort>=5.2.1 # check that imports are in the right order - - mypy=0.930 - - pre-commit>=2.9.2 + - mypy=0.960 + - pre-commit>=2.15.0 - pycodestyle # used by flake8 - pyupgrade # documentation - gitpython # obtain contributors from git for whatsnew - gitdb - - numpydoc < 1.2 # 2021-02-09 1.2dev breaking CI + - natsort # DataFrame.sort_values doctest + - numpydoc - pandas-dev-flaker=0.4.0 - pydata-sphinx-theme=0.8.0 - - pytest-cython + - pytest-cython # doctest - sphinx - sphinx-panels - types-python-dateutil @@ -47,77 +112,14 @@ dependencies: - nbconvert>=6.4.5 - nbsphinx - pandoc - - # Dask and its dependencies (that dont install with dask) - - dask-core - - toolz>=0.7.3 - - partd>=0.3.10 - - cloudpickle>=0.2.1 - - # web (jinja2 is also needed, but it's also an optional pandas dependency) - - markdown - - feedparser - - pyyaml - - requests - - # testing - - boto3 - - botocore>=1.11 - - hypothesis>=5.5.3 - - moto # mock S3 - - flask - - pytest>=6.0 - - pytest-cov - - pytest-xdist>=1.31 - - pytest-asyncio>=0.17 - - pytest-instafail - - # downstream tests - - seaborn - - statsmodels - - # unused (required indirectly may be?) - ipywidgets - nbformat - notebook>=6.0.3 - - # optional - - blosc - - bottleneck>=1.3.1 - ipykernel - - ipython>=7.11.1 - - jinja2 # pandas.Styler - - matplotlib>=3.3.2 # pandas.plotting, Series.plot, DataFrame.plot - - numexpr>=2.7.1 - - scipy>=1.4.1 - - numba>=0.50.1 - - # optional for io - # --------------- - # pd.read_html - - beautifulsoup4>=4.8.2 - - html5lib - - lxml - - # pd.read_excel, DataFrame.to_excel, pd.ExcelWriter, pd.ExcelFile - - openpyxl - - xlrd - - xlsxwriter - - xlwt - - odfpy - - - fastparquet>=0.4.0 # pandas.read_parquet, DataFrame.to_parquet - - pyarrow>2.0.1 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - - python-snappy # required by pyarrow - - pytables>=3.6.1 # pandas.read_hdf, DataFrame.to_hdf - - s3fs>=0.4.0 # file IO when using 's3://...' path - - aiobotocore<2.0.0 # GH#44311 pinned to fix docbuild - - fsspec>=0.7.4 # for generic remote file operations - - gcsfs>=0.6.0 # file IO when using 'gcs://...' path - - sqlalchemy # pandas.read_sql, DataFrame.to_sql - - xarray<0.19 # DataFrame.to_xarray - - cftime # Needed for downstream xarray.CFTimeIndex test - - pyreadstat # pandas.read_spss - - tabulate>=0.8.3 # DataFrame.to_markdown - - natsort # DataFrame.sort_values + # web + - jinja2 # in optional dependencies, but documented here as needed + - markdown + - feedparser + - pyyaml + - requests diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 32e3e19688a63..5493f84fb0be1 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1078,12 +1078,7 @@ def checked_add_with_arr( elif arr_mask is not None: not_nan = np.logical_not(arr_mask) elif b_mask is not None: - # Argument 1 to "__call__" of "_UFunc_Nin1_Nout1" has incompatible type - # "Optional[ndarray[Any, dtype[bool_]]]"; expected - # "Union[_SupportsArray[dtype[Any]], _NestedSequence[_SupportsArray[dtype[An - # y]]], bool, int, float, complex, str, bytes, _NestedSequence[Union[bool, - # int, float, complex, str, bytes]]]" [arg-type] - not_nan = np.logical_not(b2_mask) # type: ignore[arg-type] + not_nan = np.logical_not(b2_mask) else: not_nan = np.empty(arr.shape, dtype=bool) not_nan.fill(True) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 64cd43a3e77cb..4b0db5eccb6f1 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -184,5 +184,8 @@ def _nanpercentile( return result else: return np.percentile( - values, qs, axis=1, **{np_percentile_argname: interpolation} + values, + qs, + axis=1, + **{np_percentile_argname: interpolation}, ) diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index b6e9bf1420b21..e241fc119ae02 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -265,7 +265,11 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) return result # Determine if we should defer. - no_defer = (np.ndarray.__array_ufunc__, cls.__array_ufunc__) + # error: "Type[ndarray[Any, Any]]" has no attribute "__array_ufunc__" + no_defer = ( + np.ndarray.__array_ufunc__, # type: ignore[attr-defined] + cls.__array_ufunc__, + ) for item in inputs: higher_priority = ( diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py new file mode 100644 index 0000000000000..c1380fcdbba06 --- /dev/null +++ b/pandas/core/arrays/arrow/array.py @@ -0,0 +1,603 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Any, + TypeVar, +) + +import numpy as np + +from pandas._typing import ( + Dtype, + PositionalIndexer, + TakeIndexer, + npt, +) +from pandas.compat import ( + pa_version_under1p01, + pa_version_under2p0, + pa_version_under5p0, + pa_version_under6p0, +) +from pandas.util._decorators import doc + +from pandas.core.dtypes.common import ( + is_array_like, + is_bool_dtype, + is_integer, + is_integer_dtype, + is_scalar, +) +from pandas.core.dtypes.missing import isna + +from pandas.core.arrays.base import ExtensionArray +from pandas.core.indexers import ( + check_array_indexer, + unpack_tuple_and_ellipses, + validate_indices, +) + +if not pa_version_under1p01: + import pyarrow as pa + import pyarrow.compute as pc + + from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning + from pandas.core.arrays.arrow.dtype import ArrowDtype + +if TYPE_CHECKING: + from pandas import Series + +ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray") + + +class ArrowExtensionArray(ExtensionArray): + """ + Base class for ExtensionArray backed by Arrow ChunkedArray. + """ + + _data: pa.ChunkedArray + + def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: + if pa_version_under1p01: + msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray." + raise ImportError(msg) + if isinstance(values, pa.Array): + self._data = pa.chunked_array([values]) + elif isinstance(values, pa.ChunkedArray): + self._data = values + else: + raise ValueError( + f"Unsupported type '{type(values)}' for ArrowExtensionArray" + ) + self._dtype = ArrowDtype(self._data.type) + + @classmethod + def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): + """ + Construct a new ExtensionArray from a sequence of scalars. + """ + if isinstance(dtype, ArrowDtype): + pa_dtype = dtype.pyarrow_dtype + elif dtype: + pa_dtype = pa.from_numpy_dtype(dtype) + else: + pa_dtype = None + + if isinstance(scalars, cls): + data = scalars._data + if pa_dtype: + data = data.cast(pa_dtype) + return cls(data) + else: + return cls( + pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True)) + ) + + @classmethod + def _from_sequence_of_strings( + cls, strings, *, dtype: Dtype | None = None, copy=False + ): + """ + Construct a new ExtensionArray from a sequence of strings. + """ + return cls._from_sequence(strings, dtype=dtype, copy=copy) + + def __getitem__(self, item: PositionalIndexer): + """Select a subset of self. + + Parameters + ---------- + item : int, slice, or ndarray + * int: The position in 'self' to get. + * slice: A slice object, where 'start', 'stop', and 'step' are + integers or None + * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + + Returns + ------- + item : scalar or ExtensionArray + + Notes + ----- + For scalar ``item``, return a scalar value suitable for the array's + type. This should be an instance of ``self.dtype.type``. + For slice ``key``, return an instance of ``ExtensionArray``, even + if the slice is length 0 or 1. + For a boolean mask, return an instance of ``ExtensionArray``, filtered + to the values where ``item`` is True. + """ + item = check_array_indexer(self, item) + + if isinstance(item, np.ndarray): + if not len(item): + # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] + if self._dtype.name == "string" and self._dtype.storage == "pyarrow": + pa_dtype = pa.string() + else: + pa_dtype = self._dtype.pyarrow_dtype + return type(self)(pa.chunked_array([], type=pa_dtype)) + elif is_integer_dtype(item.dtype): + return self.take(item) + elif is_bool_dtype(item.dtype): + return type(self)(self._data.filter(item)) + else: + raise IndexError( + "Only integers, slices and integer or " + "boolean arrays are valid indices." + ) + elif isinstance(item, tuple): + item = unpack_tuple_and_ellipses(item) + + # error: Non-overlapping identity check (left operand type: + # "Union[Union[int, integer[Any]], Union[slice, List[int], + # ndarray[Any, Any]]]", right operand type: "ellipsis") + if item is Ellipsis: # type: ignore[comparison-overlap] + # TODO: should be handled by pyarrow? + item = slice(None) + + if is_scalar(item) and not is_integer(item): + # e.g. "foo" or 2.5 + # exception message copied from numpy + raise IndexError( + r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " + r"(`None`) and integer or boolean arrays are valid indices" + ) + # We are not an array indexer, so maybe e.g. a slice or integer + # indexer. We dispatch to pyarrow. + value = self._data[item] + if isinstance(value, pa.ChunkedArray): + return type(self)(value) + else: + scalar = value.as_py() + if scalar is None: + return self._dtype.na_value + else: + return scalar + + def __arrow_array__(self, type=None): + """Convert myself to a pyarrow ChunkedArray.""" + return self._data + + def equals(self, other) -> bool: + if not isinstance(other, ArrowExtensionArray): + return False + # I'm told that pyarrow makes __eq__ behave like pandas' equals; + # TODO: is this documented somewhere? + return self._data == other._data + + @property + def dtype(self) -> ArrowDtype: + """ + An instance of 'ExtensionDtype'. + """ + return self._dtype + + @property + def nbytes(self) -> int: + """ + The number of bytes needed to store this object in memory. + """ + return self._data.nbytes + + def __len__(self) -> int: + """ + Length of this array. + + Returns + ------- + length : int + """ + return len(self._data) + + def isna(self) -> npt.NDArray[np.bool_]: + """ + Boolean NumPy array indicating if each value is missing. + + This should return a 1-D array the same length as 'self'. + """ + if pa_version_under2p0: + return self._data.is_null().to_pandas().values + else: + return self._data.is_null().to_numpy() + + def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: + """ + Return a shallow copy of the array. + + Underlying ChunkedArray is immutable, so a deep copy is unnecessary. + + Returns + ------- + type(self) + """ + return type(self)(self._data) + + def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: + """ + Return ArrowExtensionArray without NA values. + + Returns + ------- + ArrowExtensionArray + """ + if pa_version_under6p0: + fallback_performancewarning(version="6") + return super().dropna() + else: + return type(self)(pc.drop_null(self._data)) + + @doc(ExtensionArray.factorize) + def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: + encoded = self._data.dictionary_encode() + indices = pa.chunked_array( + [c.indices for c in encoded.chunks], type=encoded.type.index_type + ).to_pandas() + if indices.dtype.kind == "f": + indices[np.isnan(indices)] = na_sentinel + indices = indices.astype(np.int64, copy=False) + + if encoded.num_chunks: + uniques = type(self)(encoded.chunk(0).dictionary) + else: + uniques = type(self)(pa.array([], type=encoded.type.value_type)) + + return indices.values, uniques + + def reshape(self, *args, **kwargs): + raise NotImplementedError( + f"{type(self)} does not support reshape " + f"as backed by a 1D pyarrow.ChunkedArray." + ) + + def take( + self, + indices: TakeIndexer, + allow_fill: bool = False, + fill_value: Any = None, + ): + """ + Take elements from an array. + + Parameters + ---------- + indices : sequence of int or one-dimensional np.ndarray of int + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + api.extensions.take + + Notes + ----- + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. + """ + # TODO: Remove once we got rid of the (indices < 0) check + if not is_array_like(indices): + indices_array = np.asanyarray(indices) + else: + # error: Incompatible types in assignment (expression has type + # "Sequence[int]", variable has type "ndarray") + indices_array = indices # type: ignore[assignment] + + if len(self._data) == 0 and (indices_array >= 0).any(): + raise IndexError("cannot do a non-empty take") + if indices_array.size > 0 and indices_array.max() >= len(self._data): + raise IndexError("out of bounds value in 'indices'.") + + if allow_fill: + fill_mask = indices_array < 0 + if fill_mask.any(): + validate_indices(indices_array, len(self._data)) + # TODO(ARROW-9433): Treat negative indices as NULL + indices_array = pa.array(indices_array, mask=fill_mask) + result = self._data.take(indices_array) + if isna(fill_value): + return type(self)(result) + # TODO: ArrowNotImplementedError: Function fill_null has no + # kernel matching input types (array[string], scalar[string]) + result = type(self)(result) + result[fill_mask] = fill_value + return result + # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) + else: + # Nothing to fill + return type(self)(self._data.take(indices)) + else: # allow_fill=False + # TODO(ARROW-9432): Treat negative indices as indices from the right. + if (indices_array < 0).any(): + # Don't modify in-place + indices_array = np.copy(indices_array) + indices_array[indices_array < 0] += len(self._data) + return type(self)(self._data.take(indices_array)) + + def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: + """ + Compute the ArrowExtensionArray of unique values. + + Returns + ------- + ArrowExtensionArray + """ + if pa_version_under2p0: + fallback_performancewarning(version="2") + return super().unique() + else: + return type(self)(pc.unique(self._data)) + + def value_counts(self, dropna: bool = True) -> Series: + """ + Return a Series containing counts of each unique value. + + Parameters + ---------- + dropna : bool, default True + Don't include counts of missing values. + + Returns + ------- + counts : Series + + See Also + -------- + Series.value_counts + """ + from pandas import ( + Index, + Series, + ) + + vc = self._data.value_counts() + + values = vc.field(0) + counts = vc.field(1) + if dropna and self._data.null_count > 0: + mask = values.is_valid() + values = values.filter(mask) + counts = counts.filter(mask) + + # No missing values so we can adhere to the interface and return a numpy array. + counts = np.array(counts) + + index = Index(type(self)(values)) + + return Series(counts, index=index).astype("Int64") + + @classmethod + def _concat_same_type( + cls: type[ArrowExtensionArrayT], to_concat + ) -> ArrowExtensionArrayT: + """ + Concatenate multiple ArrowExtensionArrays. + + Parameters + ---------- + to_concat : sequence of ArrowExtensionArrays + + Returns + ------- + ArrowExtensionArray + """ + import pyarrow as pa + + chunks = [array for ea in to_concat for array in ea._data.iterchunks()] + arr = pa.chunked_array(chunks) + return cls(arr) + + def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: + """Set one or more values inplace. + + Parameters + ---------- + key : int, ndarray, or slice + When called from, e.g. ``Series.__setitem__``, ``key`` will be + one of + + * scalar int + * ndarray of integers. + * boolean ndarray + * slice object + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Returns + ------- + None + """ + key = check_array_indexer(self, key) + indices = self._indexing_key_to_indices(key) + value = self._maybe_convert_setitem_value(value) + + argsort = np.argsort(indices) + indices = indices[argsort] + + if is_scalar(value): + value = np.broadcast_to(value, len(self)) + elif len(indices) != len(value): + raise ValueError("Length of indexer and values mismatch") + else: + value = np.asarray(value)[argsort] + + self._data = self._set_via_chunk_iteration(indices=indices, value=value) + + def _indexing_key_to_indices( + self, key: int | slice | np.ndarray + ) -> npt.NDArray[np.intp]: + """ + Convert indexing key for self into positional indices. + + Parameters + ---------- + key : int | slice | np.ndarray + + Returns + ------- + npt.NDArray[np.intp] + """ + n = len(self) + if isinstance(key, slice): + indices = np.arange(n)[key] + elif is_integer(key): + indices = np.arange(n)[[key]] + elif is_bool_dtype(key): + key = np.asarray(key) + if len(key) != n: + raise ValueError("Length of indexer and values mismatch") + indices = key.nonzero()[0] + else: + key = np.asarray(key) + indices = np.arange(n)[key] + return indices + + def _maybe_convert_setitem_value(self, value): + """Maybe convert value to be pyarrow compatible.""" + # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value + return value + + def _set_via_chunk_iteration( + self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any] + ) -> pa.ChunkedArray: + """ + Loop through the array chunks and set the new values while + leaving the chunking layout unchanged. + + Parameters + ---------- + indices : npt.NDArray[np.intp] + Position indices for the underlying ChunkedArray. + + value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object + value or values to be set of ``key``. + + Notes + ----- + Assumes that indices is sorted. Caller is responsible for sorting. + """ + new_data = [] + stop = 0 + for chunk in self._data.iterchunks(): + start, stop = stop, stop + len(chunk) + if len(indices) == 0 or stop <= indices[0]: + new_data.append(chunk) + else: + n = int(np.searchsorted(indices, stop, side="left")) + c_ind = indices[:n] - start + indices = indices[n:] + n = len(c_ind) + c_value, value = value[:n], value[n:] + new_data.append(self._replace_with_indices(chunk, c_ind, c_value)) + return pa.chunked_array(new_data) + + @classmethod + def _replace_with_indices( + cls, + chunk: pa.Array, + indices: npt.NDArray[np.intp], + value: npt.NDArray[Any], + ) -> pa.Array: + """ + Replace items selected with a set of positional indices. + + Analogous to pyarrow.compute.replace_with_mask, except that replacement + positions are identified via indices rather than a mask. + + Parameters + ---------- + chunk : pa.Array + indices : npt.NDArray[np.intp] + value : npt.NDArray[Any] + Replacement value(s). + + Returns + ------- + pa.Array + """ + n = len(indices) + + if n == 0: + return chunk + + start, stop = indices[[0, -1]] + + if (stop - start) == (n - 1): + # fast path for a contiguous set of indices + arrays = [ + chunk[:start], + pa.array(value, type=chunk.type), + chunk[stop + 1 :], + ] + arrays = [arr for arr in arrays if len(arr)] + if len(arrays) == 1: + return arrays[0] + return pa.concat_arrays(arrays) + + mask = np.zeros(len(chunk), dtype=np.bool_) + mask[indices] = True + + if pa_version_under5p0: + arr = chunk.to_numpy(zero_copy_only=False) + arr[mask] = value + return pa.array(arr, type=chunk.type) + + if isna(value).all(): + return pc.if_else(mask, None, chunk) + + return pc.replace_with_mask(chunk, mask, value) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 9a1435c3f033d..eb2c6927f56d1 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -661,10 +661,20 @@ def __getitem__( if is_scalar(left) and isna(left): return self._fill_value return Interval(left, right, self.closed) - # error: Argument 1 to "ndim" has incompatible type "Union[ndarray, - # ExtensionArray]"; expected "Union[Union[int, float, complex, str, bytes, - # generic], Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" + # error: Argument 1 to "ndim" has incompatible type + # "Union[ndarray[Any, Any], ExtensionArray]"; expected + # "Union[Sequence[Sequence[Sequence[Sequence[Sequence[Any]]]]], + # Union[Union[_SupportsArray[dtype[Any]], + # Sequence[_SupportsArray[dtype[Any]]], + # Sequence[Sequence[_SupportsArray[dtype[Any]]]], + # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], + # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]], + # Union[bool, int, float, complex, str, bytes, + # Sequence[Union[bool, int, float, complex, str, bytes]], + # Sequence[Sequence[Union[bool, int, float, complex, str, bytes]]], + # Sequence[Sequence[Sequence[Union[bool, int, float, complex, str, bytes]]]], + # Sequence[Sequence[Sequence[Sequence[Union[bool, int, float, + # complex, str, bytes]]]]]]]]" if np.ndim(left) > 1: # type: ignore[arg-type] # GH#30588 multi-dimensional indexer disallowed raise ValueError("multi-dimensional indexing not allowed") @@ -1639,13 +1649,7 @@ def isin(self, values) -> np.ndarray: # complex128 ndarray is much more performant. left = self._combined.view("complex128") right = values._combined.view("complex128") - # Argument 1 to "in1d" has incompatible type "Union[ExtensionArray, - # ndarray[Any, Any], ndarray[Any, dtype[Any]]]"; expected - # "Union[_SupportsArray[dtype[Any]], _NestedSequence[_SupportsArray[ - # dtype[Any]]], bool, int, float, complex, str, bytes, - # _NestedSequence[Union[bool, int, float, complex, str, bytes]]]" - # [arg-type] - return np.in1d(left, right) # type: ignore[arg-type] + return np.in1d(left, right) elif needs_i8_conversion(self.left.dtype) ^ needs_i8_conversion( values.left.dtype diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 7d232654e121e..f271e6c47222e 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -140,7 +140,13 @@ class BaseMaskedArray(OpsMixin, ExtensionArray): def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): # values is supposed to already be validated in the subclass - if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_): + if not ( + isinstance(mask, np.ndarray) + and + # error: Non-overlapping equality check + # (left operand type: "dtype[bool_]", right operand type: "Type[bool_]") + mask.dtype == np.bool_ # type: ignore[comparison-overlap] + ): raise TypeError( "mask should be boolean numpy array. Use " "the 'pd.array' function instead" @@ -943,11 +949,7 @@ def any(self, *, skipna: bool = True, **kwargs): nv.validate_any((), kwargs) values = self._data.copy() - # Argument 3 to "putmask" has incompatible type "object"; expected - # "Union[_SupportsArray[dtype[Any]], _NestedSequence[ - # _SupportsArray[dtype[Any]]], bool, int, float, complex, str, bytes, _Nested - # Sequence[Union[bool, int, float, complex, str, bytes]]]" [arg-type] - np.putmask(values, self._mask, self._falsey_value) # type: ignore[arg-type] + np.putmask(values, self._mask, self._falsey_value) result = values.any() if skipna: return result @@ -1023,11 +1025,7 @@ def all(self, *, skipna: bool = True, **kwargs): nv.validate_all((), kwargs) values = self._data.copy() - # Argument 3 to "putmask" has incompatible type "object"; expected - # "Union[_SupportsArray[dtype[Any]], _NestedSequence[ - # _SupportsArray[dtype[Any]]], bool, int, float, complex, str, bytes, _Neste - # dSequence[Union[bool, int, float, complex, str, bytes]]]" [arg-type] - np.putmask(values, self._mask, self._truthy_value) # type: ignore[arg-type] + np.putmask(values, self._mask, self._truthy_value) result = values.all() if skipna: diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index ebfa769eb559d..28501b53a4d02 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -925,15 +925,7 @@ def __getitem__( if is_integer(key): return self._get_val_at(key) elif isinstance(key, tuple): - # Invalid index type "Tuple[Union[int, ellipsis], ...]" for - # "ndarray[Any, Any]"; expected type "Union[SupportsIndex, - # _SupportsArray[dtype[Union[bool_, integer[Any]]]], _NestedSequence[_Su - # pportsArray[dtype[Union[bool_, integer[Any]]]]], - # _NestedSequence[Union[bool, int]], Tuple[Union[SupportsIndex, - # _SupportsArray[dtype[Union[bool_, integer[Any]]]], - # _NestedSequence[_SupportsArray[dtype[Union[bool_, integer[Any]]]]], _N - # estedSequence[Union[bool, int]]], ...]]" [index] - data_slice = self.to_dense()[key] # type: ignore[index] + data_slice = self.to_dense()[key] elif isinstance(key, slice): # Avoid densifying when handling contiguous slices @@ -1173,9 +1165,7 @@ def _concat_same_type( data = np.concatenate(values) indices_arr = np.concatenate(indices) - # Argument 2 to "IntIndex" has incompatible type "ndarray[Any, - # dtype[signedinteger[_32Bit]]]"; expected "Sequence[int]" - sp_index = IntIndex(length, indices_arr) # type: ignore[arg-type] + sp_index = IntIndex(length, indices_arr) else: # when concatenating block indices, we don't claim that you'll @@ -1353,8 +1343,7 @@ def __setstate__(self, state): if isinstance(state, tuple): # Compat for pandas < 0.24.0 nd_state, (fill_value, sp_index) = state - # Need type annotation for "sparse_values" [var-annotated] - sparse_values = np.array([]) # type: ignore[var-annotated] + sparse_values = np.array([]) sparse_values.__setstate__(nd_state) self._sparse_values = sparse_values diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py new file mode 100644 index 0000000000000..7dc2c81746454 --- /dev/null +++ b/pandas/core/dtypes/astype.py @@ -0,0 +1,407 @@ +""" +Functions for implementing 'astype' methods according to pandas conventions, +particularly ones that differ from numpy. +""" +from __future__ import annotations + +import inspect +from typing import ( + TYPE_CHECKING, + cast, + overload, +) +import warnings + +import numpy as np + +from pandas._libs import lib +from pandas._libs.tslibs.timedeltas import array_to_timedelta64 +from pandas._typing import ( + ArrayLike, + DtypeObj, + IgnoreRaise, +) +from pandas.errors import IntCastingNaNError +from pandas.util._exceptions import find_stack_level + +from pandas.core.dtypes.common import ( + is_datetime64_dtype, + is_datetime64tz_dtype, + is_dtype_equal, + is_integer_dtype, + is_object_dtype, + is_timedelta64_dtype, + pandas_dtype, +) +from pandas.core.dtypes.dtypes import ( + DatetimeTZDtype, + ExtensionDtype, + PandasDtype, +) +from pandas.core.dtypes.missing import isna + +if TYPE_CHECKING: + from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + ) + + +_dtype_obj = np.dtype(object) + + +@overload +def astype_nansafe( + arr: np.ndarray, dtype: np.dtype, copy: bool = ..., skipna: bool = ... +) -> np.ndarray: + ... + + +@overload +def astype_nansafe( + arr: np.ndarray, dtype: ExtensionDtype, copy: bool = ..., skipna: bool = ... +) -> ExtensionArray: + ... + + +def astype_nansafe( + arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False +) -> ArrayLike: + """ + Cast the elements of an array to a given dtype a nan-safe manner. + + Parameters + ---------- + arr : ndarray + dtype : np.dtype or ExtensionDtype + copy : bool, default True + If False, a view will be attempted but may fail, if + e.g. the item sizes don't align. + skipna: bool, default False + Whether or not we should skip NaN when casting as a string-type. + + Raises + ------ + ValueError + The dtype was a datetime64/timedelta64 dtype, but it had no unit. + """ + + # We get here with 0-dim from sparse + arr = np.atleast_1d(arr) + + # dispatch on extension dtype if needed + if isinstance(dtype, ExtensionDtype): + return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy) + + elif not isinstance(dtype, np.dtype): # pragma: no cover + raise ValueError("dtype must be np.dtype or ExtensionDtype") + + if arr.dtype.kind in ["m", "M"] and ( + issubclass(dtype.type, str) or dtype == _dtype_obj + ): + from pandas.core.construction import ensure_wrapped_if_datetimelike + + arr = ensure_wrapped_if_datetimelike(arr) + return arr.astype(dtype, copy=copy) + + if issubclass(dtype.type, str): + shape = arr.shape + if arr.ndim > 1: + arr = arr.ravel() + return lib.ensure_string_array( + arr, skipna=skipna, convert_na_value=False + ).reshape(shape) + + elif is_datetime64_dtype(arr.dtype): + # error: Non-overlapping equality check (left + # operand type: "dtype[Any]", right operand type: "Type[signedinteger[Any]]") + if dtype == np.int64: # type: ignore[comparison-overlap] + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") + return arr.view(dtype) + + # allow frequency conversions + if dtype.kind == "M": + return arr.astype(dtype) + + raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") + + elif is_timedelta64_dtype(arr.dtype): + # error: Non-overlapping equality check (left + # operand type: "dtype[Any]", right operand type: "Type[signedinteger[Any]]") + if dtype == np.int64: # type: ignore[comparison-overlap] + if isna(arr).any(): + raise ValueError("Cannot convert NaT values to integer") + return arr.view(dtype) + + elif dtype.kind == "m": + return astype_td64_unit_conversion(arr, dtype, copy=copy) + + raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") + + elif np.issubdtype(arr.dtype, np.floating) and is_integer_dtype(dtype): + return _astype_float_to_int_nansafe(arr, dtype, copy) + + elif is_object_dtype(arr.dtype): + + # if we have a datetime/timedelta array of objects + # then coerce to a proper dtype and recall astype_nansafe + + if is_datetime64_dtype(dtype): + from pandas import to_datetime + + return astype_nansafe( + to_datetime(arr.ravel()).values.reshape(arr.shape), + dtype, + copy=copy, + ) + elif is_timedelta64_dtype(dtype): + # bc we know arr.dtype == object, this is equivalent to + # `np.asarray(to_timedelta(arr))`, but using a lower-level API that + # does not require a circular import. + return array_to_timedelta64(arr).view("m8[ns]").astype(dtype, copy=False) + + if dtype.name in ("datetime64", "timedelta64"): + msg = ( + f"The '{dtype.name}' dtype has no unit. Please pass in " + f"'{dtype.name}[ns]' instead." + ) + raise ValueError(msg) + + if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype): + # Explicit copy, or required since NumPy can't view from / to object. + return arr.astype(dtype, copy=True) + + return arr.astype(dtype, copy=copy) + + +def _astype_float_to_int_nansafe( + values: np.ndarray, dtype: np.dtype, copy: bool +) -> np.ndarray: + """ + astype with a check preventing converting NaN to an meaningless integer value. + """ + if not np.isfinite(values).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + if dtype.kind == "u": + # GH#45151 + if not (values >= 0).all(): + raise ValueError(f"Cannot losslessly cast from {values.dtype} to {dtype}") + return values.astype(dtype, copy=copy) + + +def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike: + """ + Cast array (ndarray or ExtensionArray) to the new dtype. + + Parameters + ---------- + values : ndarray or ExtensionArray + dtype : dtype object + copy : bool, default False + copy if indicated + + Returns + ------- + ndarray or ExtensionArray + """ + if ( + values.dtype.kind in ["m", "M"] + and dtype.kind in ["i", "u"] + and isinstance(dtype, np.dtype) + and dtype.itemsize != 8 + ): + # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced + msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]" + raise TypeError(msg) + + if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): + return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) + + if is_dtype_equal(values.dtype, dtype): + if copy: + return values.copy() + return values + + if not isinstance(values, np.ndarray): + # i.e. ExtensionArray + values = values.astype(dtype, copy=copy) + + else: + values = astype_nansafe(values, dtype, copy=copy) + + # in pandas we don't store numpy str dtypes, so convert to object + if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): + values = np.array(values, dtype=object) + + return values + + +def astype_array_safe( + values: ArrayLike, dtype, copy: bool = False, errors: IgnoreRaise = "raise" +) -> ArrayLike: + """ + Cast array (ndarray or ExtensionArray) to the new dtype. + + This basically is the implementation for DataFrame/Series.astype and + includes all custom logic for pandas (NaN-safety, converting str to object, + not allowing ) + + Parameters + ---------- + values : ndarray or ExtensionArray + dtype : str, dtype convertible + copy : bool, default False + copy if indicated + errors : str, {'raise', 'ignore'}, default 'raise' + - ``raise`` : allow exceptions to be raised + - ``ignore`` : suppress exceptions. On error return original object + + Returns + ------- + ndarray or ExtensionArray + """ + errors_legal_values = ("raise", "ignore") + + if errors not in errors_legal_values: + invalid_arg = ( + "Expected value of kwarg 'errors' to be one of " + f"{list(errors_legal_values)}. Supplied value is '{errors}'" + ) + raise ValueError(invalid_arg) + + if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): + msg = ( + f"Expected an instance of {dtype.__name__}, " + "but got the class instead. Try instantiating 'dtype'." + ) + raise TypeError(msg) + + dtype = pandas_dtype(dtype) + if isinstance(dtype, PandasDtype): + # Ensure we don't end up with a PandasArray + dtype = dtype.numpy_dtype + + try: + new_values = astype_array(values, dtype, copy=copy) + except (ValueError, TypeError): + # e.g. astype_nansafe can fail on object-dtype of strings + # trying to convert to float + if errors == "ignore": + new_values = values + else: + raise + + return new_values + + +def astype_td64_unit_conversion( + values: np.ndarray, dtype: np.dtype, copy: bool +) -> np.ndarray: + """ + By pandas convention, converting to non-nano timedelta64 + returns an int64-dtyped array with ints representing multiples + of the desired timedelta unit. This is essentially division. + + Parameters + ---------- + values : np.ndarray[timedelta64[ns]] + dtype : np.dtype + timedelta64 with unit not-necessarily nano + copy : bool + + Returns + ------- + np.ndarray + """ + if is_dtype_equal(values.dtype, dtype): + if copy: + return values.copy() + return values + + # otherwise we are converting to non-nano + result = values.astype(dtype, copy=False) # avoid double-copying + result = result.astype(np.float64) + + mask = isna(values) + np.putmask(result, mask, np.nan) + return result + + +def astype_dt64_to_dt64tz( + values: ArrayLike, dtype: DtypeObj, copy: bool, via_utc: bool = False +) -> DatetimeArray: + # GH#33401 we have inconsistent behaviors between + # Datetimeindex[naive].astype(tzaware) + # Series[dt64].astype(tzaware) + # This collects them in one place to prevent further fragmentation. + + from pandas.core.construction import ensure_wrapped_if_datetimelike + + values = ensure_wrapped_if_datetimelike(values) + values = cast("DatetimeArray", values) + aware = isinstance(dtype, DatetimeTZDtype) + + if via_utc: + # Series.astype behavior + + # caller is responsible for checking this + assert values.tz is None and aware + dtype = cast(DatetimeTZDtype, dtype) + + if copy: + # this should be the only copy + values = values.copy() + + warnings.warn( + "Using .astype to convert from timezone-naive dtype to " + "timezone-aware dtype is deprecated and will raise in a " + "future version. Use ser.dt.tz_localize instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + # GH#33401 this doesn't match DatetimeArray.astype, which + # goes through the `not via_utc` path + return values.tz_localize("UTC").tz_convert(dtype.tz) + + else: + # DatetimeArray/DatetimeIndex.astype behavior + if values.tz is None and aware: + dtype = cast(DatetimeTZDtype, dtype) + warnings.warn( + "Using .astype to convert from timezone-naive dtype to " + "timezone-aware dtype is deprecated and will raise in a " + "future version. Use obj.tz_localize instead.", + FutureWarning, + stacklevel=find_stack_level(), + ) + + return values.tz_localize(dtype.tz) + + elif aware: + # GH#18951: datetime64_tz dtype but not equal means different tz + dtype = cast(DatetimeTZDtype, dtype) + result = values.tz_convert(dtype.tz) + if copy: + result = result.copy() + return result + + elif values.tz is not None: + warnings.warn( + "Using .astype to convert from timezone-aware dtype to " + "timezone-naive dtype is deprecated and will raise in a " + "future version. Use obj.tz_localize(None) or " + "obj.tz_convert('UTC').tz_localize(None) instead", + FutureWarning, + stacklevel=find_stack_level(), + ) + + result = values.tz_convert("UTC").tz_localize(None) + if copy: + result = result.copy() + return result + + raise NotImplementedError("dtype_equal case should be handled elsewhere") diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 6776064342db0..8c3a032d93a2d 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -534,7 +534,9 @@ def is_string_or_object_np_dtype(dtype: np.dtype) -> bool: """ Faster alternative to is_string_dtype, assumes we have a np.dtype object. """ - return dtype == object or dtype.kind in "SU" + # error: Non-overlapping equality check (left operand type: + # "dtype[Any]", right operand type: "Type[object]") + return dtype == object or dtype.kind in "SU" # type: ignore[comparison-overlap] def is_string_dtype(arr_or_dtype) -> bool: diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py new file mode 100644 index 0000000000000..65f2ac6dabef5 --- /dev/null +++ b/pandas/core/exchange/buffer.py @@ -0,0 +1,81 @@ +from typing import ( + Optional, + Tuple, +) + +import numpy as np +from packaging import version + +from pandas.core.exchange.dataframe_protocol import ( + Buffer, + DlpackDeviceType, +) + +_NUMPY_HAS_DLPACK = version.parse(np.__version__) >= version.parse("1.22.0") + + +class PandasBuffer(Buffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + """ + + def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: + """ + Handle only regular columns (= numpy arrays) for now. + """ + if not x.strides == (x.dtype.itemsize,): + # The protocol does not support strided buffers, so a copy is + # necessary. If that's not allowed, we need to raise an exception. + if allow_copy: + x = x.copy() + else: + raise RuntimeError( + "Exports cannot be zero-copy in the case " + "of a non-contiguous buffer" + ) + + # Store the numpy array in which the data resides as a private + # attribute, so we can use it to retrieve the public attributes + self._x = x + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._x.size * self._x.dtype.itemsize + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._x.__array_interface__["data"][0] + + def __dlpack__(self): + """ + Represent this structure as DLPack interface. + """ + if _NUMPY_HAS_DLPACK: + # error: "ndarray[Any, Any]" has no attribute "__dlpack__" + return self._x.__dlpack__() # type: ignore[attr-defined] + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: + """ + Device type and device ID for where the data in the buffer resides. + """ + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + return ( + "PandasBuffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": self.__dlpack_device__()[0].name, + } + ) + + ")" + ) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 391c12905adae..61be23fcfb0f2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2427,9 +2427,7 @@ def to_records( if dtype_mapping is None: formats.append(v.dtype) elif isinstance(dtype_mapping, (type, np.dtype, str)): - # Argument 1 to "append" of "list" has incompatible type - # "Union[type, dtype[Any], str]"; expected "dtype[_SCT]" [arg-type] - formats.append(dtype_mapping) # type: ignore[arg-type] + formats.append(dtype_mapping) else: element = "row" if i < index_len else "column" msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}" diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7175b85e966d7..de83fe12007a2 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4552,12 +4552,7 @@ def _join_non_unique( right = other._values.take(right_idx) if isinstance(join_array, np.ndarray): - # Argument 3 to "putmask" has incompatible type "Union[ExtensionArray, - # ndarray[Any, Any]]"; expected "Union[_SupportsArray[dtype[Any]], - # _NestedSequence[_SupportsArray[dtype[Any]]], bool, int, f - # loat, complex, str, bytes, _NestedSequence[Union[bool, int, float, - # complex, str, bytes]]]" [arg-type] - np.putmask(join_array, mask, right) # type: ignore[arg-type] + np.putmask(join_array, mask, right) else: join_array._putmask(mask, right) @@ -5057,11 +5052,9 @@ def __getitem__(self, key): if result.ndim > 1: deprecate_ndim_indexing(result) if hasattr(result, "_ndarray"): - # error: Item "ndarray[Any, Any]" of "Union[ExtensionArray, - # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] # i.e. NDArrayBackedExtensionArray # Unpack to ndarray for MPL compat - return result._ndarray # type: ignore[union-attr] + return result._ndarray return result # NB: Using _constructor._simple_new would break if MultiIndex @@ -6602,9 +6595,7 @@ def insert(self, loc: int, item) -> Index: new_values = np.insert(arr, loc, casted) else: - # No overload variant of "insert" matches argument types - # "ndarray[Any, Any]", "int", "None" [call-overload] - new_values = np.insert(arr, loc, None) # type: ignore[call-overload] + new_values = np.insert(arr, loc, None) loc = loc if loc >= 0 else loc - 1 new_values[loc] = item diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 68db372ff4e51..4c65f50a444d7 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -365,9 +365,7 @@ def _validate_codes(self, level: list, code: list): """ null_mask = isna(level) if np.any(null_mask): - # Incompatible types in assignment (expression has type - # "ndarray[Any, dtype[Any]]", variable has type "List[Any]") - code = np.where(null_mask[code], -1, code) # type: ignore[assignment] + code = np.where(null_mask[code], -1, code) return code def _verify_integrity(self, codes: list | None = None, levels: list | None = None): diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 1160d3b2a8e3a..c938a018574f9 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -125,7 +125,9 @@ def _get_same_shape_values( # argument type "Tuple[Union[ndarray, slice], slice]" lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload] assert lvals.shape[0] == 1, lvals.shape - lvals = lvals[0, :] + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[int, slice]" + lvals = lvals[0, :] # type: ignore[call-overload] else: # lvals are 1D, rvals are 2D assert rvals.shape[0] == 1, rvals.shape diff --git a/pandas/core/missing.py b/pandas/core/missing.py index e09701e69c62c..d589a8fbbca70 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -333,15 +333,7 @@ def func(yvalues: np.ndarray) -> None: **kwargs, ) - # Argument 1 to "apply_along_axis" has incompatible type - # "Callable[[ndarray[Any, Any]], None]"; expected - # "Callable[..., Union[_SupportsArray[dtype[]], - # Sequence[_SupportsArray[dtype[ - # ]]], Sequence[Sequence[_SupportsArray[dtype[]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], - # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]]]]]" - # interp each column independently - np.apply_along_axis(func, axis, data) # type: ignore[arg-type] + np.apply_along_axis(func, axis, data) return @@ -779,23 +771,14 @@ def interpolate_2d( Modifies values in-place. """ if limit_area is not None: - # Argument 1 to "apply_along_axis" has incompatible type "partial[None]"; - # expected "Callable[..., Union[_SupportsArray[dtype[]], - # Sequence[_SupportsArray[dtype[]]], Sequence[Sequence - # [_SupportsArray[dtype[]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]], - # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[]]]]]]]]" - - # Argument 2 to "apply_along_axis" has incompatible type "Union[str, int]"; - # expected "SupportsIndex" [arg-type] np.apply_along_axis( partial( _interpolate_with_limit_area, method=method, limit=limit, limit_area=limit_area, - ), # type: ignore[arg-type] - axis, # type: ignore[arg-type] + ), + axis, values, ) return diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 262cd9774f694..aa426d24db75d 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -133,9 +133,7 @@ def melt( if is_extension_array_dtype(id_data): id_data = concat([id_data] * K, ignore_index=True) else: - # Incompatible types in assignment (expression has type - # "ndarray[Any, dtype[Any]]", variable has type "Series") [assignment] - id_data = np.tile(id_data._values, K) # type: ignore[assignment] + id_data = np.tile(id_data._values, K) mdata[col] = id_data mcolumns = id_vars + var_name + [value_name] diff --git a/pandas/core/series.py b/pandas/core/series.py index 43ad67d36ad4b..b957e3a238c2e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1990,9 +1990,7 @@ def count(self, level=None): lev = lev.insert(cnt, lev._na_value) obs = level_codes[notna(self._values)] - # Argument "minlength" to "bincount" has incompatible type "Optional[int]"; - # expected "SupportsIndex" [arg-type] - out = np.bincount(obs, minlength=len(lev) or None) # type: ignore[arg-type] + out = np.bincount(obs, minlength=len(lev) or None) return self._constructor(out, index=lev, dtype="int64").__finalize__( self, method="count" ) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 6d74c6db1f7ed..712495aadde56 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -354,7 +354,10 @@ def _prep_values(self, values: ArrayLike) -> np.ndarray: if inf.any(): values = np.where(inf, np.nan, values) - return values + # error: Incompatible return value type + # (got "Union[ExtensionArray, ndarray[Any, Any], + # ndarray[Any, dtype[floating[_64Bit]]]]", expected "ndarray[Any, Any]") + return values # type: ignore[return-value] def _insert_on_column(self, result: DataFrame, obj: DataFrame) -> None: # if we have an 'on' column we want to put it back into diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 4a8169c0609fd..5c0c4518bc2fb 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -3612,11 +3612,21 @@ def _highlight_between( Return an array of css props based on condition of data values within given range. """ if np.iterable(left) and not isinstance(left, str): + # error: Argument 1 to "_validate_apply_axis_arg" + # has incompatible type "Union[str, float, Period, + # Timedelta, Interval[Any], datetime64, timedelta64, + # datetime, Sequence[Any], ndarray[Any, Any], NDFrame, None]"; + # expected "Union[NDFrame, Sequence[Any], ndarray[Any, Any]]" left = _validate_apply_axis_arg( left, "left", None, data # type: ignore[arg-type] ) if np.iterable(right) and not isinstance(right, str): + # error: Argument 1 to "_validate_apply_axis_arg" + # has incompatible type "Union[str, float, Period, + # Timedelta, Interval[Any], datetime64, timedelta64, + # datetime, Sequence[Any], ndarray[Any, Any], NDFrame, None]"; + # expected "Union[NDFrame, Sequence[Any], ndarray[Any, Any]]" right = _validate_apply_axis_arg( right, "right", None, data # type: ignore[arg-type] ) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index fc0f572c79e6b..b0e9c81132ee4 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -367,7 +367,7 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: numpy_dtypes, # type: ignore[arg-type] [], ) - if common_type == object: + if common_type == np.dtype(object): warning_columns.append(str(name)) dtype = dtypes.pop() @@ -384,14 +384,7 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: arrs # type: ignore[arg-type] ) else: - # Argument 1 to "concatenate" has incompatible type - # "List[Union[ExtensionArray, ndarray[Any, Any]]]"; expected - # "Union[_SupportsArray[dtype[Any]], - # Sequence[_SupportsArray[dtype[Any]]], - # Sequence[Sequence[_SupportsArray[dtype[Any]]]], - # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], - # Sequence[Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]" - result[name] = np.concatenate(arrs) # type: ignore[arg-type] + result[name] = np.concatenate(arrs) if warning_columns: warning_names = ",".join(warning_columns) diff --git a/pandas/tests/extension/date/array.py b/pandas/tests/extension/date/array.py index b14b9921be3d3..d29ed293e71ed 100644 --- a/pandas/tests/extension/date/array.py +++ b/pandas/tests/extension/date/array.py @@ -109,10 +109,7 @@ def __init__( self._month = np.zeros(ldates, dtype=np.uint8) # 255 (1, 31) self._day = np.zeros(ldates, dtype=np.uint8) # 255 (1, 12) - # "object_" object is not iterable [misc] - for (i,), (y, m, d) in np.ndenumerate( # type: ignore[misc] - np.char.split(dates, sep="-") - ): + for (i,), (y, m, d) in np.ndenumerate(np.char.split(dates, sep="-")): self._year[i] = int(y) self._month[i] = int(m) self._day[i] = int(d) diff --git a/requirements-dev.txt b/requirements-dev.txt index 05a9f0426440d..041b35e0ef2b2 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,24 +1,80 @@ # This file is auto-generated from environment.yml, do not modify. # See that file for comments about the need/usage of each dependency. -numpy>=1.18.5 -python-dateutil>=2.8.1 +cython==0.29.30 +pytest>=6.0 +pytest-cov +pytest-xdist>=1.31 +psutil +pytest-asyncio>=0.17 +boto3 +python-dateutil +numpy pytz +beautifulsoup4 +blosc +brotlipy +bottleneck +fastparquet +fsspec +html5lib +hypothesis +gcsfs +jinja2 +lxml +matplotlib +numba>=0.53.1 +numexpr>=2.8.0 +openpyxl +odfpy +pandas-gbq +psycopg2 +pyarrow +pymysql +pyreadstat +tables +python-snappy +pyxlsb +s3fs +scipy +sqlalchemy +tabulate +xarray +xlrd +xlsxwriter +xlwt +zstandard +aiobotocore<2.0.0 +botocore +cftime +dask +ipython +geopandas +seaborn +scikit-learn +statsmodels +coverage +pandas-datareader +pyyaml +py +torch +moto +flask asv -cython>=0.29.30 black==22.3.0 cpplint flake8==4.0.1 flake8-bugbear==21.3.2 flake8-comprehensions==3.7.0 isort>=5.2.1 -mypy==0.930 -pre-commit>=2.9.2 +mypy==0.960 +pre-commit>=2.15.0 pycodestyle pyupgrade gitpython gitdb -numpydoc < 1.2 +natsort +numpydoc pandas-dev-flaker==0.4.0 pydata-sphinx-theme==0.8.0 pytest-cython @@ -31,58 +87,13 @@ types-setuptools nbconvert>=6.4.5 nbsphinx pandoc -dask -toolz>=0.7.3 -partd>=0.3.10 -cloudpickle>=0.2.1 -markdown -feedparser -pyyaml -requests -boto3 -botocore>=1.11 -hypothesis>=5.5.3 -moto -flask -pytest>=6.0 -pytest-cov -pytest-xdist>=1.31 -pytest-asyncio>=0.17 -pytest-instafail -seaborn -statsmodels ipywidgets nbformat notebook>=6.0.3 -blosc -bottleneck>=1.3.1 ipykernel -ipython>=7.11.1 jinja2 -matplotlib>=3.3.2 -numexpr>=2.7.1 -scipy>=1.4.1 -numba>=0.50.1 -beautifulsoup4>=4.8.2 -html5lib -lxml -openpyxl -xlrd -xlsxwriter -xlwt -odfpy -fastparquet>=0.4.0 -pyarrow>2.0.1 -python-snappy -tables>=3.6.1 -s3fs>=0.4.0 -aiobotocore<2.0.0 -fsspec>=0.7.4 -gcsfs>=0.6.0 -sqlalchemy -xarray<0.19 -cftime -pyreadstat -tabulate>=0.8.3 -natsort +markdown +feedparser +pyyaml +requests setuptools>=51.0.0 diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 2ea50fa3ac8d4..8cb539d3b02c8 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -21,7 +21,7 @@ import yaml EXCLUDE = {"python", "c-compiler", "cxx-compiler"} -RENAME = {"pytables": "tables", "dask-core": "dask"} +RENAME = {"pytables": "tables", "geopandas-base": "geopandas", "pytorch": "torch"} def conda_package_to_pip(package: str): From 7ca2ff40930bba223e41b348a227f60502f6fb1f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 23 Jun 2022 16:21:35 -0700 Subject: [PATCH 2/2] Remove files that shouldn't exist on 1.4.x? --- pandas/core/arrays/arrow/array.py | 603 ------------------------------ pandas/core/dtypes/astype.py | 407 -------------------- pandas/core/exchange/buffer.py | 81 ---- 3 files changed, 1091 deletions(-) delete mode 100644 pandas/core/arrays/arrow/array.py delete mode 100644 pandas/core/dtypes/astype.py delete mode 100644 pandas/core/exchange/buffer.py diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py deleted file mode 100644 index c1380fcdbba06..0000000000000 --- a/pandas/core/arrays/arrow/array.py +++ /dev/null @@ -1,603 +0,0 @@ -from __future__ import annotations - -from typing import ( - TYPE_CHECKING, - Any, - TypeVar, -) - -import numpy as np - -from pandas._typing import ( - Dtype, - PositionalIndexer, - TakeIndexer, - npt, -) -from pandas.compat import ( - pa_version_under1p01, - pa_version_under2p0, - pa_version_under5p0, - pa_version_under6p0, -) -from pandas.util._decorators import doc - -from pandas.core.dtypes.common import ( - is_array_like, - is_bool_dtype, - is_integer, - is_integer_dtype, - is_scalar, -) -from pandas.core.dtypes.missing import isna - -from pandas.core.arrays.base import ExtensionArray -from pandas.core.indexers import ( - check_array_indexer, - unpack_tuple_and_ellipses, - validate_indices, -) - -if not pa_version_under1p01: - import pyarrow as pa - import pyarrow.compute as pc - - from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning - from pandas.core.arrays.arrow.dtype import ArrowDtype - -if TYPE_CHECKING: - from pandas import Series - -ArrowExtensionArrayT = TypeVar("ArrowExtensionArrayT", bound="ArrowExtensionArray") - - -class ArrowExtensionArray(ExtensionArray): - """ - Base class for ExtensionArray backed by Arrow ChunkedArray. - """ - - _data: pa.ChunkedArray - - def __init__(self, values: pa.Array | pa.ChunkedArray) -> None: - if pa_version_under1p01: - msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray." - raise ImportError(msg) - if isinstance(values, pa.Array): - self._data = pa.chunked_array([values]) - elif isinstance(values, pa.ChunkedArray): - self._data = values - else: - raise ValueError( - f"Unsupported type '{type(values)}' for ArrowExtensionArray" - ) - self._dtype = ArrowDtype(self._data.type) - - @classmethod - def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): - """ - Construct a new ExtensionArray from a sequence of scalars. - """ - if isinstance(dtype, ArrowDtype): - pa_dtype = dtype.pyarrow_dtype - elif dtype: - pa_dtype = pa.from_numpy_dtype(dtype) - else: - pa_dtype = None - - if isinstance(scalars, cls): - data = scalars._data - if pa_dtype: - data = data.cast(pa_dtype) - return cls(data) - else: - return cls( - pa.chunked_array(pa.array(scalars, type=pa_dtype, from_pandas=True)) - ) - - @classmethod - def _from_sequence_of_strings( - cls, strings, *, dtype: Dtype | None = None, copy=False - ): - """ - Construct a new ExtensionArray from a sequence of strings. - """ - return cls._from_sequence(strings, dtype=dtype, copy=copy) - - def __getitem__(self, item: PositionalIndexer): - """Select a subset of self. - - Parameters - ---------- - item : int, slice, or ndarray - * int: The position in 'self' to get. - * slice: A slice object, where 'start', 'stop', and 'step' are - integers or None - * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' - - Returns - ------- - item : scalar or ExtensionArray - - Notes - ----- - For scalar ``item``, return a scalar value suitable for the array's - type. This should be an instance of ``self.dtype.type``. - For slice ``key``, return an instance of ``ExtensionArray``, even - if the slice is length 0 or 1. - For a boolean mask, return an instance of ``ExtensionArray``, filtered - to the values where ``item`` is True. - """ - item = check_array_indexer(self, item) - - if isinstance(item, np.ndarray): - if not len(item): - # Removable once we migrate StringDtype[pyarrow] to ArrowDtype[string] - if self._dtype.name == "string" and self._dtype.storage == "pyarrow": - pa_dtype = pa.string() - else: - pa_dtype = self._dtype.pyarrow_dtype - return type(self)(pa.chunked_array([], type=pa_dtype)) - elif is_integer_dtype(item.dtype): - return self.take(item) - elif is_bool_dtype(item.dtype): - return type(self)(self._data.filter(item)) - else: - raise IndexError( - "Only integers, slices and integer or " - "boolean arrays are valid indices." - ) - elif isinstance(item, tuple): - item = unpack_tuple_and_ellipses(item) - - # error: Non-overlapping identity check (left operand type: - # "Union[Union[int, integer[Any]], Union[slice, List[int], - # ndarray[Any, Any]]]", right operand type: "ellipsis") - if item is Ellipsis: # type: ignore[comparison-overlap] - # TODO: should be handled by pyarrow? - item = slice(None) - - if is_scalar(item) and not is_integer(item): - # e.g. "foo" or 2.5 - # exception message copied from numpy - raise IndexError( - r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis " - r"(`None`) and integer or boolean arrays are valid indices" - ) - # We are not an array indexer, so maybe e.g. a slice or integer - # indexer. We dispatch to pyarrow. - value = self._data[item] - if isinstance(value, pa.ChunkedArray): - return type(self)(value) - else: - scalar = value.as_py() - if scalar is None: - return self._dtype.na_value - else: - return scalar - - def __arrow_array__(self, type=None): - """Convert myself to a pyarrow ChunkedArray.""" - return self._data - - def equals(self, other) -> bool: - if not isinstance(other, ArrowExtensionArray): - return False - # I'm told that pyarrow makes __eq__ behave like pandas' equals; - # TODO: is this documented somewhere? - return self._data == other._data - - @property - def dtype(self) -> ArrowDtype: - """ - An instance of 'ExtensionDtype'. - """ - return self._dtype - - @property - def nbytes(self) -> int: - """ - The number of bytes needed to store this object in memory. - """ - return self._data.nbytes - - def __len__(self) -> int: - """ - Length of this array. - - Returns - ------- - length : int - """ - return len(self._data) - - def isna(self) -> npt.NDArray[np.bool_]: - """ - Boolean NumPy array indicating if each value is missing. - - This should return a 1-D array the same length as 'self'. - """ - if pa_version_under2p0: - return self._data.is_null().to_pandas().values - else: - return self._data.is_null().to_numpy() - - def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - """ - Return a shallow copy of the array. - - Underlying ChunkedArray is immutable, so a deep copy is unnecessary. - - Returns - ------- - type(self) - """ - return type(self)(self._data) - - def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - """ - Return ArrowExtensionArray without NA values. - - Returns - ------- - ArrowExtensionArray - """ - if pa_version_under6p0: - fallback_performancewarning(version="6") - return super().dropna() - else: - return type(self)(pc.drop_null(self._data)) - - @doc(ExtensionArray.factorize) - def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]: - encoded = self._data.dictionary_encode() - indices = pa.chunked_array( - [c.indices for c in encoded.chunks], type=encoded.type.index_type - ).to_pandas() - if indices.dtype.kind == "f": - indices[np.isnan(indices)] = na_sentinel - indices = indices.astype(np.int64, copy=False) - - if encoded.num_chunks: - uniques = type(self)(encoded.chunk(0).dictionary) - else: - uniques = type(self)(pa.array([], type=encoded.type.value_type)) - - return indices.values, uniques - - def reshape(self, *args, **kwargs): - raise NotImplementedError( - f"{type(self)} does not support reshape " - f"as backed by a 1D pyarrow.ChunkedArray." - ) - - def take( - self, - indices: TakeIndexer, - allow_fill: bool = False, - fill_value: Any = None, - ): - """ - Take elements from an array. - - Parameters - ---------- - indices : sequence of int or one-dimensional np.ndarray of int - Indices to be taken. - allow_fill : bool, default False - How to handle negative values in `indices`. - - * False: negative values in `indices` indicate positional indices - from the right (the default). This is similar to - :func:`numpy.take`. - - * True: negative values in `indices` indicate - missing values. These values are set to `fill_value`. Any other - other negative values raise a ``ValueError``. - - fill_value : any, optional - Fill value to use for NA-indices when `allow_fill` is True. - This may be ``None``, in which case the default NA value for - the type, ``self.dtype.na_value``, is used. - - For many ExtensionArrays, there will be two representations of - `fill_value`: a user-facing "boxed" scalar, and a low-level - physical NA value. `fill_value` should be the user-facing version, - and the implementation should handle translating that to the - physical version for processing the take if necessary. - - Returns - ------- - ExtensionArray - - Raises - ------ - IndexError - When the indices are out of bounds for the array. - ValueError - When `indices` contains negative values other than ``-1`` - and `allow_fill` is True. - - See Also - -------- - numpy.take - api.extensions.take - - Notes - ----- - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when `indices` is a sequence of values. Additionally, - it's called by :meth:`Series.reindex`, or any other method - that causes realignment, with a `fill_value`. - """ - # TODO: Remove once we got rid of the (indices < 0) check - if not is_array_like(indices): - indices_array = np.asanyarray(indices) - else: - # error: Incompatible types in assignment (expression has type - # "Sequence[int]", variable has type "ndarray") - indices_array = indices # type: ignore[assignment] - - if len(self._data) == 0 and (indices_array >= 0).any(): - raise IndexError("cannot do a non-empty take") - if indices_array.size > 0 and indices_array.max() >= len(self._data): - raise IndexError("out of bounds value in 'indices'.") - - if allow_fill: - fill_mask = indices_array < 0 - if fill_mask.any(): - validate_indices(indices_array, len(self._data)) - # TODO(ARROW-9433): Treat negative indices as NULL - indices_array = pa.array(indices_array, mask=fill_mask) - result = self._data.take(indices_array) - if isna(fill_value): - return type(self)(result) - # TODO: ArrowNotImplementedError: Function fill_null has no - # kernel matching input types (array[string], scalar[string]) - result = type(self)(result) - result[fill_mask] = fill_value - return result - # return type(self)(pc.fill_null(result, pa.scalar(fill_value))) - else: - # Nothing to fill - return type(self)(self._data.take(indices)) - else: # allow_fill=False - # TODO(ARROW-9432): Treat negative indices as indices from the right. - if (indices_array < 0).any(): - # Don't modify in-place - indices_array = np.copy(indices_array) - indices_array[indices_array < 0] += len(self._data) - return type(self)(self._data.take(indices_array)) - - def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT: - """ - Compute the ArrowExtensionArray of unique values. - - Returns - ------- - ArrowExtensionArray - """ - if pa_version_under2p0: - fallback_performancewarning(version="2") - return super().unique() - else: - return type(self)(pc.unique(self._data)) - - def value_counts(self, dropna: bool = True) -> Series: - """ - Return a Series containing counts of each unique value. - - Parameters - ---------- - dropna : bool, default True - Don't include counts of missing values. - - Returns - ------- - counts : Series - - See Also - -------- - Series.value_counts - """ - from pandas import ( - Index, - Series, - ) - - vc = self._data.value_counts() - - values = vc.field(0) - counts = vc.field(1) - if dropna and self._data.null_count > 0: - mask = values.is_valid() - values = values.filter(mask) - counts = counts.filter(mask) - - # No missing values so we can adhere to the interface and return a numpy array. - counts = np.array(counts) - - index = Index(type(self)(values)) - - return Series(counts, index=index).astype("Int64") - - @classmethod - def _concat_same_type( - cls: type[ArrowExtensionArrayT], to_concat - ) -> ArrowExtensionArrayT: - """ - Concatenate multiple ArrowExtensionArrays. - - Parameters - ---------- - to_concat : sequence of ArrowExtensionArrays - - Returns - ------- - ArrowExtensionArray - """ - import pyarrow as pa - - chunks = [array for ea in to_concat for array in ea._data.iterchunks()] - arr = pa.chunked_array(chunks) - return cls(arr) - - def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: - """Set one or more values inplace. - - Parameters - ---------- - key : int, ndarray, or slice - When called from, e.g. ``Series.__setitem__``, ``key`` will be - one of - - * scalar int - * ndarray of integers. - * boolean ndarray - * slice object - - value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object - value or values to be set of ``key``. - - Returns - ------- - None - """ - key = check_array_indexer(self, key) - indices = self._indexing_key_to_indices(key) - value = self._maybe_convert_setitem_value(value) - - argsort = np.argsort(indices) - indices = indices[argsort] - - if is_scalar(value): - value = np.broadcast_to(value, len(self)) - elif len(indices) != len(value): - raise ValueError("Length of indexer and values mismatch") - else: - value = np.asarray(value)[argsort] - - self._data = self._set_via_chunk_iteration(indices=indices, value=value) - - def _indexing_key_to_indices( - self, key: int | slice | np.ndarray - ) -> npt.NDArray[np.intp]: - """ - Convert indexing key for self into positional indices. - - Parameters - ---------- - key : int | slice | np.ndarray - - Returns - ------- - npt.NDArray[np.intp] - """ - n = len(self) - if isinstance(key, slice): - indices = np.arange(n)[key] - elif is_integer(key): - indices = np.arange(n)[[key]] - elif is_bool_dtype(key): - key = np.asarray(key) - if len(key) != n: - raise ValueError("Length of indexer and values mismatch") - indices = key.nonzero()[0] - else: - key = np.asarray(key) - indices = np.arange(n)[key] - return indices - - def _maybe_convert_setitem_value(self, value): - """Maybe convert value to be pyarrow compatible.""" - # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value - return value - - def _set_via_chunk_iteration( - self, indices: npt.NDArray[np.intp], value: npt.NDArray[Any] - ) -> pa.ChunkedArray: - """ - Loop through the array chunks and set the new values while - leaving the chunking layout unchanged. - - Parameters - ---------- - indices : npt.NDArray[np.intp] - Position indices for the underlying ChunkedArray. - - value : ExtensionDtype.type, Sequence[ExtensionDtype.type], or object - value or values to be set of ``key``. - - Notes - ----- - Assumes that indices is sorted. Caller is responsible for sorting. - """ - new_data = [] - stop = 0 - for chunk in self._data.iterchunks(): - start, stop = stop, stop + len(chunk) - if len(indices) == 0 or stop <= indices[0]: - new_data.append(chunk) - else: - n = int(np.searchsorted(indices, stop, side="left")) - c_ind = indices[:n] - start - indices = indices[n:] - n = len(c_ind) - c_value, value = value[:n], value[n:] - new_data.append(self._replace_with_indices(chunk, c_ind, c_value)) - return pa.chunked_array(new_data) - - @classmethod - def _replace_with_indices( - cls, - chunk: pa.Array, - indices: npt.NDArray[np.intp], - value: npt.NDArray[Any], - ) -> pa.Array: - """ - Replace items selected with a set of positional indices. - - Analogous to pyarrow.compute.replace_with_mask, except that replacement - positions are identified via indices rather than a mask. - - Parameters - ---------- - chunk : pa.Array - indices : npt.NDArray[np.intp] - value : npt.NDArray[Any] - Replacement value(s). - - Returns - ------- - pa.Array - """ - n = len(indices) - - if n == 0: - return chunk - - start, stop = indices[[0, -1]] - - if (stop - start) == (n - 1): - # fast path for a contiguous set of indices - arrays = [ - chunk[:start], - pa.array(value, type=chunk.type), - chunk[stop + 1 :], - ] - arrays = [arr for arr in arrays if len(arr)] - if len(arrays) == 1: - return arrays[0] - return pa.concat_arrays(arrays) - - mask = np.zeros(len(chunk), dtype=np.bool_) - mask[indices] = True - - if pa_version_under5p0: - arr = chunk.to_numpy(zero_copy_only=False) - arr[mask] = value - return pa.array(arr, type=chunk.type) - - if isna(value).all(): - return pc.if_else(mask, None, chunk) - - return pc.replace_with_mask(chunk, mask, value) diff --git a/pandas/core/dtypes/astype.py b/pandas/core/dtypes/astype.py deleted file mode 100644 index 7dc2c81746454..0000000000000 --- a/pandas/core/dtypes/astype.py +++ /dev/null @@ -1,407 +0,0 @@ -""" -Functions for implementing 'astype' methods according to pandas conventions, -particularly ones that differ from numpy. -""" -from __future__ import annotations - -import inspect -from typing import ( - TYPE_CHECKING, - cast, - overload, -) -import warnings - -import numpy as np - -from pandas._libs import lib -from pandas._libs.tslibs.timedeltas import array_to_timedelta64 -from pandas._typing import ( - ArrayLike, - DtypeObj, - IgnoreRaise, -) -from pandas.errors import IntCastingNaNError -from pandas.util._exceptions import find_stack_level - -from pandas.core.dtypes.common import ( - is_datetime64_dtype, - is_datetime64tz_dtype, - is_dtype_equal, - is_integer_dtype, - is_object_dtype, - is_timedelta64_dtype, - pandas_dtype, -) -from pandas.core.dtypes.dtypes import ( - DatetimeTZDtype, - ExtensionDtype, - PandasDtype, -) -from pandas.core.dtypes.missing import isna - -if TYPE_CHECKING: - from pandas.core.arrays import ( - DatetimeArray, - ExtensionArray, - ) - - -_dtype_obj = np.dtype(object) - - -@overload -def astype_nansafe( - arr: np.ndarray, dtype: np.dtype, copy: bool = ..., skipna: bool = ... -) -> np.ndarray: - ... - - -@overload -def astype_nansafe( - arr: np.ndarray, dtype: ExtensionDtype, copy: bool = ..., skipna: bool = ... -) -> ExtensionArray: - ... - - -def astype_nansafe( - arr: np.ndarray, dtype: DtypeObj, copy: bool = True, skipna: bool = False -) -> ArrayLike: - """ - Cast the elements of an array to a given dtype a nan-safe manner. - - Parameters - ---------- - arr : ndarray - dtype : np.dtype or ExtensionDtype - copy : bool, default True - If False, a view will be attempted but may fail, if - e.g. the item sizes don't align. - skipna: bool, default False - Whether or not we should skip NaN when casting as a string-type. - - Raises - ------ - ValueError - The dtype was a datetime64/timedelta64 dtype, but it had no unit. - """ - - # We get here with 0-dim from sparse - arr = np.atleast_1d(arr) - - # dispatch on extension dtype if needed - if isinstance(dtype, ExtensionDtype): - return dtype.construct_array_type()._from_sequence(arr, dtype=dtype, copy=copy) - - elif not isinstance(dtype, np.dtype): # pragma: no cover - raise ValueError("dtype must be np.dtype or ExtensionDtype") - - if arr.dtype.kind in ["m", "M"] and ( - issubclass(dtype.type, str) or dtype == _dtype_obj - ): - from pandas.core.construction import ensure_wrapped_if_datetimelike - - arr = ensure_wrapped_if_datetimelike(arr) - return arr.astype(dtype, copy=copy) - - if issubclass(dtype.type, str): - shape = arr.shape - if arr.ndim > 1: - arr = arr.ravel() - return lib.ensure_string_array( - arr, skipna=skipna, convert_na_value=False - ).reshape(shape) - - elif is_datetime64_dtype(arr.dtype): - # error: Non-overlapping equality check (left - # operand type: "dtype[Any]", right operand type: "Type[signedinteger[Any]]") - if dtype == np.int64: # type: ignore[comparison-overlap] - if isna(arr).any(): - raise ValueError("Cannot convert NaT values to integer") - return arr.view(dtype) - - # allow frequency conversions - if dtype.kind == "M": - return arr.astype(dtype) - - raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") - - elif is_timedelta64_dtype(arr.dtype): - # error: Non-overlapping equality check (left - # operand type: "dtype[Any]", right operand type: "Type[signedinteger[Any]]") - if dtype == np.int64: # type: ignore[comparison-overlap] - if isna(arr).any(): - raise ValueError("Cannot convert NaT values to integer") - return arr.view(dtype) - - elif dtype.kind == "m": - return astype_td64_unit_conversion(arr, dtype, copy=copy) - - raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") - - elif np.issubdtype(arr.dtype, np.floating) and is_integer_dtype(dtype): - return _astype_float_to_int_nansafe(arr, dtype, copy) - - elif is_object_dtype(arr.dtype): - - # if we have a datetime/timedelta array of objects - # then coerce to a proper dtype and recall astype_nansafe - - if is_datetime64_dtype(dtype): - from pandas import to_datetime - - return astype_nansafe( - to_datetime(arr.ravel()).values.reshape(arr.shape), - dtype, - copy=copy, - ) - elif is_timedelta64_dtype(dtype): - # bc we know arr.dtype == object, this is equivalent to - # `np.asarray(to_timedelta(arr))`, but using a lower-level API that - # does not require a circular import. - return array_to_timedelta64(arr).view("m8[ns]").astype(dtype, copy=False) - - if dtype.name in ("datetime64", "timedelta64"): - msg = ( - f"The '{dtype.name}' dtype has no unit. Please pass in " - f"'{dtype.name}[ns]' instead." - ) - raise ValueError(msg) - - if copy or is_object_dtype(arr.dtype) or is_object_dtype(dtype): - # Explicit copy, or required since NumPy can't view from / to object. - return arr.astype(dtype, copy=True) - - return arr.astype(dtype, copy=copy) - - -def _astype_float_to_int_nansafe( - values: np.ndarray, dtype: np.dtype, copy: bool -) -> np.ndarray: - """ - astype with a check preventing converting NaN to an meaningless integer value. - """ - if not np.isfinite(values).all(): - raise IntCastingNaNError( - "Cannot convert non-finite values (NA or inf) to integer" - ) - if dtype.kind == "u": - # GH#45151 - if not (values >= 0).all(): - raise ValueError(f"Cannot losslessly cast from {values.dtype} to {dtype}") - return values.astype(dtype, copy=copy) - - -def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike: - """ - Cast array (ndarray or ExtensionArray) to the new dtype. - - Parameters - ---------- - values : ndarray or ExtensionArray - dtype : dtype object - copy : bool, default False - copy if indicated - - Returns - ------- - ndarray or ExtensionArray - """ - if ( - values.dtype.kind in ["m", "M"] - and dtype.kind in ["i", "u"] - and isinstance(dtype, np.dtype) - and dtype.itemsize != 8 - ): - # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced - msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]" - raise TypeError(msg) - - if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): - return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) - - if is_dtype_equal(values.dtype, dtype): - if copy: - return values.copy() - return values - - if not isinstance(values, np.ndarray): - # i.e. ExtensionArray - values = values.astype(dtype, copy=copy) - - else: - values = astype_nansafe(values, dtype, copy=copy) - - # in pandas we don't store numpy str dtypes, so convert to object - if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): - values = np.array(values, dtype=object) - - return values - - -def astype_array_safe( - values: ArrayLike, dtype, copy: bool = False, errors: IgnoreRaise = "raise" -) -> ArrayLike: - """ - Cast array (ndarray or ExtensionArray) to the new dtype. - - This basically is the implementation for DataFrame/Series.astype and - includes all custom logic for pandas (NaN-safety, converting str to object, - not allowing ) - - Parameters - ---------- - values : ndarray or ExtensionArray - dtype : str, dtype convertible - copy : bool, default False - copy if indicated - errors : str, {'raise', 'ignore'}, default 'raise' - - ``raise`` : allow exceptions to be raised - - ``ignore`` : suppress exceptions. On error return original object - - Returns - ------- - ndarray or ExtensionArray - """ - errors_legal_values = ("raise", "ignore") - - if errors not in errors_legal_values: - invalid_arg = ( - "Expected value of kwarg 'errors' to be one of " - f"{list(errors_legal_values)}. Supplied value is '{errors}'" - ) - raise ValueError(invalid_arg) - - if inspect.isclass(dtype) and issubclass(dtype, ExtensionDtype): - msg = ( - f"Expected an instance of {dtype.__name__}, " - "but got the class instead. Try instantiating 'dtype'." - ) - raise TypeError(msg) - - dtype = pandas_dtype(dtype) - if isinstance(dtype, PandasDtype): - # Ensure we don't end up with a PandasArray - dtype = dtype.numpy_dtype - - try: - new_values = astype_array(values, dtype, copy=copy) - except (ValueError, TypeError): - # e.g. astype_nansafe can fail on object-dtype of strings - # trying to convert to float - if errors == "ignore": - new_values = values - else: - raise - - return new_values - - -def astype_td64_unit_conversion( - values: np.ndarray, dtype: np.dtype, copy: bool -) -> np.ndarray: - """ - By pandas convention, converting to non-nano timedelta64 - returns an int64-dtyped array with ints representing multiples - of the desired timedelta unit. This is essentially division. - - Parameters - ---------- - values : np.ndarray[timedelta64[ns]] - dtype : np.dtype - timedelta64 with unit not-necessarily nano - copy : bool - - Returns - ------- - np.ndarray - """ - if is_dtype_equal(values.dtype, dtype): - if copy: - return values.copy() - return values - - # otherwise we are converting to non-nano - result = values.astype(dtype, copy=False) # avoid double-copying - result = result.astype(np.float64) - - mask = isna(values) - np.putmask(result, mask, np.nan) - return result - - -def astype_dt64_to_dt64tz( - values: ArrayLike, dtype: DtypeObj, copy: bool, via_utc: bool = False -) -> DatetimeArray: - # GH#33401 we have inconsistent behaviors between - # Datetimeindex[naive].astype(tzaware) - # Series[dt64].astype(tzaware) - # This collects them in one place to prevent further fragmentation. - - from pandas.core.construction import ensure_wrapped_if_datetimelike - - values = ensure_wrapped_if_datetimelike(values) - values = cast("DatetimeArray", values) - aware = isinstance(dtype, DatetimeTZDtype) - - if via_utc: - # Series.astype behavior - - # caller is responsible for checking this - assert values.tz is None and aware - dtype = cast(DatetimeTZDtype, dtype) - - if copy: - # this should be the only copy - values = values.copy() - - warnings.warn( - "Using .astype to convert from timezone-naive dtype to " - "timezone-aware dtype is deprecated and will raise in a " - "future version. Use ser.dt.tz_localize instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - # GH#33401 this doesn't match DatetimeArray.astype, which - # goes through the `not via_utc` path - return values.tz_localize("UTC").tz_convert(dtype.tz) - - else: - # DatetimeArray/DatetimeIndex.astype behavior - if values.tz is None and aware: - dtype = cast(DatetimeTZDtype, dtype) - warnings.warn( - "Using .astype to convert from timezone-naive dtype to " - "timezone-aware dtype is deprecated and will raise in a " - "future version. Use obj.tz_localize instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - - return values.tz_localize(dtype.tz) - - elif aware: - # GH#18951: datetime64_tz dtype but not equal means different tz - dtype = cast(DatetimeTZDtype, dtype) - result = values.tz_convert(dtype.tz) - if copy: - result = result.copy() - return result - - elif values.tz is not None: - warnings.warn( - "Using .astype to convert from timezone-aware dtype to " - "timezone-naive dtype is deprecated and will raise in a " - "future version. Use obj.tz_localize(None) or " - "obj.tz_convert('UTC').tz_localize(None) instead", - FutureWarning, - stacklevel=find_stack_level(), - ) - - result = values.tz_convert("UTC").tz_localize(None) - if copy: - result = result.copy() - return result - - raise NotImplementedError("dtype_equal case should be handled elsewhere") diff --git a/pandas/core/exchange/buffer.py b/pandas/core/exchange/buffer.py deleted file mode 100644 index 65f2ac6dabef5..0000000000000 --- a/pandas/core/exchange/buffer.py +++ /dev/null @@ -1,81 +0,0 @@ -from typing import ( - Optional, - Tuple, -) - -import numpy as np -from packaging import version - -from pandas.core.exchange.dataframe_protocol import ( - Buffer, - DlpackDeviceType, -) - -_NUMPY_HAS_DLPACK = version.parse(np.__version__) >= version.parse("1.22.0") - - -class PandasBuffer(Buffer): - """ - Data in the buffer is guaranteed to be contiguous in memory. - """ - - def __init__(self, x: np.ndarray, allow_copy: bool = True) -> None: - """ - Handle only regular columns (= numpy arrays) for now. - """ - if not x.strides == (x.dtype.itemsize,): - # The protocol does not support strided buffers, so a copy is - # necessary. If that's not allowed, we need to raise an exception. - if allow_copy: - x = x.copy() - else: - raise RuntimeError( - "Exports cannot be zero-copy in the case " - "of a non-contiguous buffer" - ) - - # Store the numpy array in which the data resides as a private - # attribute, so we can use it to retrieve the public attributes - self._x = x - - @property - def bufsize(self) -> int: - """ - Buffer size in bytes. - """ - return self._x.size * self._x.dtype.itemsize - - @property - def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ - return self._x.__array_interface__["data"][0] - - def __dlpack__(self): - """ - Represent this structure as DLPack interface. - """ - if _NUMPY_HAS_DLPACK: - # error: "ndarray[Any, Any]" has no attribute "__dlpack__" - return self._x.__dlpack__() # type: ignore[attr-defined] - raise NotImplementedError("__dlpack__") - - def __dlpack_device__(self) -> Tuple[DlpackDeviceType, Optional[int]]: - """ - Device type and device ID for where the data in the buffer resides. - """ - return (DlpackDeviceType.CPU, None) - - def __repr__(self) -> str: - return ( - "PandasBuffer(" - + str( - { - "bufsize": self.bufsize, - "ptr": self.ptr, - "device": self.__dlpack_device__()[0].name, - } - ) - + ")" - )