From 5273b6469c3c602b4d696fb9faca472d65d4a7f9 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 08:57:49 +0100 Subject: [PATCH 01/38] wip --- .../dataframe_api/__init__.py | 181 ----- .../API_specification/dataframe_api/_types.py | 63 -- .../dataframe_api/column_object.py | 586 -------------- .../dataframe_api/dataframe_object.py | 754 ------------------ .../dataframe_api/groupby_object.py | 51 -- 5 files changed, 1635 deletions(-) delete mode 100644 spec/API_specification/dataframe_api/__init__.py delete mode 100644 spec/API_specification/dataframe_api/_types.py delete mode 100644 spec/API_specification/dataframe_api/column_object.py delete mode 100644 spec/API_specification/dataframe_api/dataframe_object.py delete mode 100644 spec/API_specification/dataframe_api/groupby_object.py diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py deleted file mode 100644 index b8b7014c..00000000 --- a/spec/API_specification/dataframe_api/__init__.py +++ /dev/null @@ -1,181 +0,0 @@ -""" -Function stubs and API documentation for the DataFrame API standard. -""" -from __future__ import annotations - -from typing import Mapping, Sequence, Any - -from .column_object import * -from .dataframe_object import * -from .groupby_object import * - - -__all__ = [ - "__dataframe_api_version", - "column_from_sequence", - "concat", - "dataframe_from_dict", - "is_null", - "null", - "DType", - "Int64", - "Int32", - "Int16", - "Int8", - "UInt64", - "UInt32", - "UInt16", - "UInt8", - "Float64", - "Float32", - "Bool", -] - - -__dataframe_api_version__: str = "YYYY.MM" -""" -String representing the version of the DataFrame API specification to which -the conforming implementation adheres. Set to a concrete value for a stable -implementation of the dataframe API standard. -""" - -def concat(dataframes: Sequence[DataFrame]) -> DataFrame: - """ - Concatenate DataFrames vertically. - - To concatenate horizontally, please use ``insert``. - - Parameters - ---------- - dataframes : Sequence[DataFrame] - DataFrames to concatenate. - Column names, ordering, and dtypes must match. - - Notes - ----- - The order in which the input DataFrames appear in - the output is preserved (so long as the DataFrame implementation supports row - ordering). - """ - ... - -def column_from_sequence(sequence: Sequence[object], *, dtype: DType) -> Column: - """ - Construct Column from sequence of elements. - - Parameters - ---------- - sequence : Sequence[object] - Sequence of elements. Each element must be of the specified - ``dtype``, the corresponding Python builtin scalar type, or - coercible to that Python scalar type. - dtype : DType - Dtype of result. Must be specified. - - Returns - ------- - Column - """ - ... - -def dataframe_from_dict(data: Mapping[str, Column]) -> DataFrame: - """ - Construct DataFrame from map of column names to Columns. - - Parameters - ---------- - data : Mapping[str, Column] - Column must be of the corresponding type of the DataFrame. - For example, it is only supported to build a ``LibraryXDataFrame`` using - ``LibraryXColumn`` instances. - - Returns - ------- - DataFrame - """ - ... - -class null: - """ - A `null` object to represent missing data. - - ``null`` is a scalar, and may be used when constructing a `Column` from a - Python sequence with `column_from_sequence`. It does not support ``is``, - ``==`` or ``bool``. - - Raises - ------ - TypeError - From ``__eq__`` and from ``__bool__``. 
- - For ``__eq__``: a missing value must not be compared for equality - directly. Instead, use `DataFrame.is_null` or `Column.is_null` to check - for presence of missing values. - - For ``__bool__``: truthiness of a missing value is ambiguous. - - Notes - ----- - Like for Python scalars, the ``null`` object may be duck typed so it can - reside on (e.g.) a GPU. Hence, the builtin ``is`` keyword should not be - used to check if an object *is* the ``null`` object. - - """ - ... - -def is_null(value: object, /) -> bool: - """ - Check if an object is a `null` scalar. - - Parameters - ---------- - value : object - Any input type is valid. - - Returns - ------- - bool - True if the input is a `null` object from the same library which - implements the dataframe API standard, False otherwise. - - """ - -########## -# Dtypes # -########## - -class DType: - """Base class for all dtypes.""" - -class Int64(DType): - """Integer type with 64 bits of precision.""" - -class Int32(DType): - """Integer type with 32 bits of precision.""" - -class Int16(DType): - """Integer type with 16 bits of precision.""" - -class Int8(DType): - """Integer type with 8 bits of precision.""" - -class UInt64(DType): - """Unsigned integer type with 64 bits of precision.""" - -class UInt32(DType): - """Unsigned integer type with 32 bits of precision.""" - -class UInt16(DType): - """Unsigned integer type with 16 bits of precision.""" - -class UInt8(DType): - """Unsigned integer type with 8 bits of precision.""" - -class Float64(DType): - """Floating point type with 64 bits of precision.""" - -class Float32(DType): - """Floating point type with 32 bits of precision.""" - -class Bool(DType): - """Boolean type with 8 bits of precision.""" diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py deleted file mode 100644 index 2874ba4c..00000000 --- a/spec/API_specification/dataframe_api/_types.py +++ /dev/null @@ -1,63 +0,0 @@ -""" -Types for type annotations used in the dataframe API standard. - -The type variables should be replaced with the actual types for a given -library, e.g., for Pandas TypeVar('DataFrame') would be replaced with pd.DataFrame. -""" -from __future__ import annotations - -from dataclasses import dataclass -from typing import ( - Any, - List, - Literal, - Optional, - Sequence, - Tuple, - TypeVar, - Union, - Protocol, -) -from enum import Enum - -array = TypeVar("array") -Scalar = TypeVar("Scalar") -device = TypeVar("device") -DType = TypeVar("DType") -SupportsDLPack = TypeVar("SupportsDLPack") -SupportsBufferProtocol = TypeVar("SupportsBufferProtocol") -PyCapsule = TypeVar("PyCapsule") -# ellipsis cannot actually be imported from anywhere, so include a dummy here -# to keep pyflakes happy. https://github.com/python/typeshed/issues/3556 -ellipsis = TypeVar("ellipsis") - -_T_co = TypeVar("_T_co", covariant=True) - - -class NestedSequence(Protocol[_T_co]): - def __getitem__(self, key: int, /) -> Union[_T_co, NestedSequence[_T_co]]: - ... - - def __len__(self, /) -> int: - ... 
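The ``null``/``is_null`` semantics above are easiest to see in a short usage sketch. Everything here is illustrative rather than normative: ``df_lib`` stands for any conforming implementation's namespace, and whether ``dtype`` takes a dtype class or an instance is an assumption of this sketch, not something the spec pins down::

    import df_lib  # hypothetical conforming implementation of the standard

    col = df_lib.column_from_sequence([1, df_lib.null, 3], dtype=df_lib.Int64())

    # ``null`` supports neither ``is``, ``==`` nor ``bool``, so missing values
    # are detected with the dedicated helpers instead (Column.is_null and
    # Column.get_value are specified later in this patch).
    assert df_lib.is_null(col.get_value(1))
    mask = col.is_null()  # boolean Column: [False, True, False]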
- - -__all__ = [ - "Any", - "DataFrame", - "List", - "Literal", - "NestedSequence", - "Optional", - "PyCapsule", - "SupportsBufferProtocol", - "SupportsDLPack", - "Tuple", - "Union", - "Sequence", - "array", - "device", - "DType", - "ellipsis", - "Enum", -] diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py deleted file mode 100644 index ffc4765e..00000000 --- a/spec/API_specification/dataframe_api/column_object.py +++ /dev/null @@ -1,586 +0,0 @@ -from __future__ import annotations - -from typing import Any,NoReturn, Sequence, TYPE_CHECKING, Literal - -if TYPE_CHECKING: - from ._types import Scalar - from . import DType - - -__all__ = ['Column'] - - -class Column: - """ - Column object - - Note that this column object is not meant to be instantiated directly by - users of the library implementing the dataframe API standard. Rather, use - constructor functions or an already-created dataframe object retrieved via - - """ - - def __column_namespace__( - self: Column, /, *, api_version: str | None = None - ) -> Any: - """ - Returns an object that has all the Dataframe Standard API functions on it. - - Parameters - ---------- - api_version: Optional[str] - String representing the version of the dataframe API specification - to be returned, in ``'YYYY.MM'`` form, for example, ``'2023.04'``. - If it is ``None``, it should return the namespace corresponding to - latest version of the dataframe API specification. If the given - version is invalid or not implemented for the given module, an - error should be raised. Default: ``None``. - - Returns - ------- - namespace: Any - An object representing the dataframe API namespace. It should have - every top-level function defined in the specification as an - attribute. It may contain other public names as well, but it is - recommended to only include those names that are part of the - specification. - - """ - - @property - def column(self) -> object: - """ - Return underlying (not-necessarily-Standard-compliant) column. - - If a library only implements the Standard, then this can return `self`. - """ - ... - - def __len__(self) -> int: - """ - Return the number of rows. - """ - - def __iter__(self) -> NoReturn: - """ - Iterate over elements. - - This is intentionally "poisoned" to discourage inefficient code patterns. - - Raises - ------ - NotImplementedError - """ - raise NotImplementedError("'__iter__' is intentionally not implemented.") - - @property - def dtype(self) -> DType: - """ - Return data type of column. - """ - - def get_rows(self, indices: Column[int]) -> Column: - """ - Select a subset of rows, similar to `ndarray.take`. - - Parameters - ---------- - indices : Column[int] - Positions of rows to select. - """ - ... - - def get_value(self, row_number: int) -> Scalar: - """ - Select the value at a row number, similar to `ndarray.__getitem__()`. - - Parameters - ---------- - row_number : int - Row number of value to return. - - Returns - ------- - Scalar - Depends on the dtype of the Column, and may vary - across implementations. - """ - ... - - def sorted_indices( - self, - *, - ascending: bool = True, - nulls_position: Literal['first', 'last'] = 'last', - ) -> Column[int]: - """ - Return row numbers which would sort column. - - If you need to sort the Column, you can simply do:: - - col.get_rows(col.sorted_indices()) - - Parameters - ---------- - ascending : bool - If `True`, sort in ascending order. - If `False`, sort in descending order. 
- nulls_position : ``{'first', 'last'}`` - Whether null values should be placed at the beginning - or at the end of the result. - Note that the position of NaNs is unspecified and may - vary based on the implementation. - - Returns - ------- - Column[int] - """ - ... - - def __eq__(self, other: Column | Scalar) -> Column: - """ - Compare for equality. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __ne__(self, other: Column | Scalar) -> Column: - """ - Compare for non-equality. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __ge__(self, other: Column | Scalar) -> Column: - """ - Compare for "greater than or equal to" `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __gt__(self, other: Column | Scalar) -> Column: - """ - Compare for "greater than" `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __le__(self, other: Column | Scalar) -> Column: - """ - Compare for "less than or equal to" `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __lt__(self, other: Column | Scalar) -> Column: - """ - Compare for "less than" `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __and__(self, other: Column[bool] | bool) -> Column[bool]: - """ - Apply logical 'and' to `other` Column (or scalar) and this Column. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : Column[bool] or bool - If Column, must have same length. - - Returns - ------- - Column - - Raises - ------ - ValueError - If `self` or `other` is not boolean. - """ - - def __or__(self, other: Column[bool] | bool) -> Column[bool]: - """ - Apply logical 'or' to `other` Column (or scalar) and this column. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : Column[bool] or Scalar - If Column, must have same length. - - Returns - ------- - Column[bool] - - Raises - ------ - ValueError - If `self` or `other` is not boolean. - """ - - def __add__(self, other: Column | Scalar) -> Column: - """ - Add `other` column or scalar to this column. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. 
- - Returns - ------- - Column - """ - - def __sub__(self, other: Column | Scalar) -> Column: - """ - Subtract `other` column or scalar from this column. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __mul__(self, other: Column | Scalar) -> Column: - """ - Multiply `other` column or scalar with this column. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __truediv__(self, other: Column | Scalar) -> Column: - """ - Divide this column by `other` column or scalar. True division, returns floats. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __floordiv__(self, other: Column | Scalar) -> Column: - """ - Floor-divide `other` column or scalar to this column. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __pow__(self, other: Column | Scalar) -> Column: - """ - Raise this column to the power of `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __mod__(self, other: Column | Scalar) -> Column: - """ - Returns modulus of this column by `other` (`%` operator). - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __divmod__(self, other: Column | Scalar) -> tuple[Column, Column]: - """ - Return quotient and remainder of integer division. See `divmod` builtin function. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __invert__(self) -> Column: - """ - Invert truthiness of (boolean) elements. - - Raises - ------ - ValueError - If any of the Column's columns is not boolean. - """ - - def any(self, *, skip_nulls: bool = True) -> bool: - """ - Reduction returns a bool. - - Raises - ------ - ValueError - If column is not boolean. - """ - - def all(self, *, skip_nulls: bool = True) -> bool: - """ - Reduction returns a bool. - - Raises - ------ - ValueError - If column is not boolean. - """ - - def min(self, *, skip_nulls: bool = True) -> Scalar: - """ - Reduction returns a scalar. Any data type that supports comparisons - must be supported. The returned value has the same dtype as the column. - """ - - def max(self, *, skip_nulls: bool = True) -> Scalar: - """ - Reduction returns a scalar. Any data type that supports comparisons - must be supported. The returned value has the same dtype as the column. 
- """ - - def sum(self, *, skip_nulls: bool = True) -> Scalar: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. The returned value has the same dtype as the - column. - """ - - def prod(self, *, skip_nulls: bool = True) -> Scalar: - """ - Reduction returns a scalar. Must be supported for numerical data types. - The returned value has the same dtype as the column. - """ - - def median(self, *, skip_nulls: bool = True) -> Scalar: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. Returns a float for numerical data types, and - datetime (with the appropriate timedelta format string) for datetime - dtypes. - """ - - def mean(self, *, skip_nulls: bool = True) -> Scalar: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. Returns a float for numerical data types, and - datetime (with the appropriate timedelta format string) for datetime - dtypes. - """ - - def std(self, *, skip_nulls: bool = True) -> Scalar: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. Returns a float for numerical data types, and - datetime (with the appropriate timedelta format string) for datetime - dtypes. - """ - - def var(self, *, skip_nulls: bool = True) -> Scalar: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. Returns a float for numerical data types, and - datetime (with the appropriate timedelta format string) for datetime - dtypes. - """ - - def is_null(self) -> Column: - """ - Check for 'missing' or 'null' entries. - - Returns - ------- - Column - - See also - -------- - is_nan - - Notes - ----- - Does *not* include NaN-like entries. - May optionally include 'NaT' values (if present in an implementation), - but note that the Standard makes no guarantees about them. - """ - - def is_nan(self) -> Column: - """ - Check for nan entries. - - Returns - ------- - Column - - See also - -------- - is_null - - Notes - ----- - This only checks for 'NaN'. - Does *not* include 'missing' or 'null' entries. - In particular, does not check for `np.timedelta64('NaT')`. - """ - - def is_in(self, values: Column) -> Column[bool]: - """ - Indicate whether the value at each row matches any value in `values`. - - Parameters - ---------- - values : Column - Contains values to compare against. May include ``float('nan')`` and - ``null``, in which case ``'nan'`` and ``null`` will - respectively return ``True`` even though ``float('nan') == float('nan')`` - isn't ``True``. - The dtype of ``values`` must match the current column's dtype. - - Returns - ------- - Column[bool] - """ - - def unique_indices(self, *, skip_nulls: bool = True) -> Column[int]: - """ - Return indices corresponding to unique values in Column. - - Returns - ------- - Column[int] - Indices corresponding to unique values. - - Notes - ----- - There are no ordering guarantees. In particular, if there are multiple - indices corresponding to the same unique value, there is no guarantee - about which one will appear in the result. - If the original Column contains multiple `'NaN'` values, then - only a single index corresponding to those values should be returned. - Likewise for null values (if ``skip_nulls=False``). - To get the unique values, you can do ``col.get_rows(col.unique_indices())``. - """ - ... - - def fill_nan(self, value: float | 'null', /) -> Column: - """ - Fill floating point ``nan`` values with the given fill value. 
- - Parameters - ---------- - value : float or `null` - Value used to replace any ``nan`` in the column with. Must be - of the Python scalar type matching the dtype of the column (or - be `null`). - - """ - ... diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py deleted file mode 100644 index 6dc2c787..00000000 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ /dev/null @@ -1,754 +0,0 @@ -from __future__ import annotations - -from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn - - -if TYPE_CHECKING: - from .column_object import Column - from .groupby_object import GroupBy - from ._types import Scalar - - -__all__ = ["DataFrame"] - - -class DataFrame: - """ - DataFrame object - - Note that this dataframe object is not meant to be instantiated directly by - users of the library implementing the dataframe API standard. Rather, use - constructor functions or an already-created dataframe object retrieved via - - **Python operator support** - - All arithmetic operators defined by the Python language, except for - ``__matmul__``, ``__neg__`` and ``__pos__``, must be supported for - numerical data types. - - All comparison operators defined by the Python language must be supported - by the dataframe object for all data types for which those comparisons are - supported by the builtin scalar types corresponding to a data type. - - In-place operators must not be supported. All operations on the dataframe - object are out-of-place. - - **Methods and Attributes** - - """ - def __dataframe_namespace__( - self: DataFrame, /, *, api_version: str | None = None - ) -> Any: - """ - Returns an object that has all the dataframe API functions on it. - - Parameters - ---------- - api_version: Optional[str] - String representing the version of the dataframe API specification - to be returned, in ``'YYYY.MM'`` form, for example, ``'2023.04'``. - If it is ``None``, it should return the namespace corresponding to - latest version of the dataframe API specification. If the given - version is invalid or not implemented for the given module, an - error should be raised. Default: ``None``. - - Returns - ------- - namespace: Any - An object representing the dataframe API namespace. It should have - every top-level function defined in the specification as an - attribute. It may contain other public names as well, but it is - recommended to only include those names that are part of the - specification. - - """ - - @property - def dataframe(self) -> object: - """ - Return underlying (not-necessarily-Standard-compliant) DataFrame. - - If a library only implements the Standard, then this can return `self`. - """ - ... - - def shape(self) -> tuple[int, int]: - """ - Return number of rows and number of columns. - """ - - def groupby(self, keys: Sequence[str], /) -> GroupBy: - """ - Group the DataFrame by the given columns. - - Parameters - ---------- - keys : Sequence[str] - - Returns - ------- - GroupBy - - Raises - ------ - KeyError - If any of the requested keys are not present. - - Notes - ----- - Downstream operations from this function, like aggregations, return - results for which row order is not guaranteed and is implementation - defined. - """ - ... - - def get_column_by_name(self, name: str, /) -> Column: - """ - Select a column by name. - - Parameters - ---------- - name : str - - Returns - ------- - Column - - Raises - ------ - KeyError - If the key is not present. - """ - ... 
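As a rough usage sketch of the DataFrame methods above (``df`` is any Standard-compliant DataFrame; the column names ``"store"`` and ``"sales"`` are made-up placeholders, and the group-wise ``sum`` aggregation is specified later in this patch)::

    n_rows, n_cols = df.shape()

    # Group-wise reduction; row order of the result is implementation-defined.
    per_store = df.groupby(["store"]).sum()

    # Single-column access; raises KeyError if the name is absent.
    sales = df.get_column_by_name("sales")

    # The namespace object exposes top-level functions such as concat.
    ns = df.__dataframe_namespace__()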
- - def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame: - """ - Select multiple columns by name. - - Parameters - ---------- - names : Sequence[str] - - Returns - ------- - DataFrame - - Raises - ------ - KeyError - If the any requested key is not present. - """ - ... - - def get_rows(self, indices: "Column[int]") -> DataFrame: - """ - Select a subset of rows, similar to `ndarray.take`. - - Parameters - ---------- - indices : Column[int] - Positions of rows to select. - - Returns - ------- - DataFrame - """ - ... - - def slice_rows( - self, start: int | None, stop: int | None, step: int | None - ) -> DataFrame: - """ - Select a subset of rows corresponding to a slice. - - Parameters - ---------- - start : int or None - stop : int or None - step : int or None - - Returns - ------- - DataFrame - """ - ... - - def get_rows_by_mask(self, mask: "Column[bool]") -> DataFrame: - """ - Select a subset of rows corresponding to a mask. - - Parameters - ---------- - mask : Column[bool] - - Returns - ------- - DataFrame - - Notes - ----- - Some participants preferred a weaker type Arraylike[bool] for mask, - where 'Arraylike' denotes an object adhering to the Array API standard. - """ - ... - - def insert(self, loc: int, label: str, value: Column) -> DataFrame: - """ - Insert column into DataFrame at specified location. - - Parameters - ---------- - loc : int - Insertion index. Must verify 0 <= loc <= len(columns). - label : str - Label of the inserted column. - value : Column - """ - ... - - def drop_column(self, label: str) -> DataFrame: - """ - Drop the specified column. - - Parameters - ---------- - label : str - - Returns - ------- - DataFrame - - Raises - ------ - KeyError - If the label is not present. - """ - ... - - def rename_columns(self, mapping: Mapping[str, str]) -> DataFrame: - """ - Rename columns. - - Parameters - ---------- - mapping : Mapping[str, str] - Keys are old column names, values are new column names. - - Returns - ------- - DataFrame - """ - ... - - def get_column_names(self) -> Sequence[str]: - """ - Get column names. - - Returns - ------- - Sequence[str] - """ - ... - - def sorted_indices( - self, - keys: Sequence[str], - *, - ascending: Sequence[bool] | bool = True, - nulls_position: Literal['first', 'last'] = 'last', - ) -> Column[int]: - """ - Return row numbers which would sort according to given columns. - - If you need to sort the DataFrame, you can simply do:: - - df.get_rows(df.sorted_indices(keys)) - - Parameters - ---------- - keys : Sequence[str] - Names of columns to sort by. - ascending : Sequence[bool] or bool - If `True`, sort by all keys in ascending order. - If `False`, sort by all keys in descending order. - If a sequence, it must be the same length as `keys`, - and determines the direction with which to use each - key to sort by. - nulls_position : ``{'first', 'last'}`` - Whether null values should be placed at the beginning - or at the end of the result. - Note that the position of NaNs is unspecified and may - vary based on the implementation. - - Returns - ------- - Column[int] - - Raises - ------ - ValueError - If `keys` and `ascending` are sequences of different lengths. - """ - ... - - def __eq__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Compare for equality. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. 
- "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __ne__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Compare for non-equality. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __ge__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Compare for "greater than or equal to" `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __gt__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Compare for "greater than" `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __le__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Compare for "less than or equal to" `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __lt__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Compare for "less than" `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __and__(self, other: DataFrame[bool] | bool) -> DataFrame[bool]: - """ - Apply logical 'and' to `other` DataFrame (or scalar) and this dataframe. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : DataFrame[bool] or bool - If DataFrame, must have same length. - - Returns - ------- - DataFrame[bool] - - Raises - ------ - ValueError - If `self` or `other` is not boolean. - """ - - def __or__(self, other: DataFrame[bool] | bool) -> DataFrame[bool]: - """ - Apply logical 'or' to `other` DataFrame (or scalar) and this DataFrame. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : DataFrame[bool] or bool - If DataFrame, must have same length. - - Returns - ------- - DataFrame[bool] - - Raises - ------ - ValueError - If `self` or `other` is not boolean. - """ - - def __add__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Add `other` dataframe or scalar to this dataframe. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __sub__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Subtract `other` dataframe or scalar from this dataframe. 
- - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __mul__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Multiply `other` dataframe or scalar with this dataframe. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __truediv__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Divide this dataframe by `other` dataframe or scalar. True division, returns floats. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __floordiv__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Floor-divide (returns integers) this dataframe by `other` dataframe or scalar. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __pow__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Raise this dataframe to the power of `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __mod__(self, other: DataFrame | Scalar) -> DataFrame: - """ - Return modulus of this dataframe by `other` (`%` operator). - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __divmod__(self, other: DataFrame | Scalar) -> tuple[DataFrame, DataFrame]: - """ - Return quotient and remainder of integer division. See `divmod` builtin function. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - A tuple of two DataFrame's - """ - ... - - def __invert__(self) -> DataFrame: - """ - Invert truthiness of (boolean) elements. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def __iter__(self) -> NoReturn: - """ - Iterate over elements. - - This is intentionally "poisoned" to discourage inefficient code patterns. - - Raises - ------ - NotImplementedError - """ - raise NotImplementedError("'__iter__' is intentionally not implemented.") - - def any(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... 
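A hedged sketch of how the comparison, logical, filtering and reduction methods above compose (``df`` is any conforming DataFrame; ``"price"``, ``"on_sale"`` and ``"in_stock"`` are placeholder column names, the latter two assumed boolean)::

    prices = df.get_column_by_name("price")
    on_sale = df.get_column_by_name("on_sale")

    # Element-wise comparison plus Kleene-logic combination yields a boolean Column.
    mask = (prices > 100) & ~on_sale

    # Filter rows with the mask, then reduce: ``any`` returns a 1-row DataFrame
    # and requires every remaining column to be boolean.
    flagged = df.get_rows_by_mask(mask)
    any_flagged = flagged.get_columns_by_name(["in_stock"]).any(skip_nulls=True)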
- - def all(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def any_rowwise(self, *, skip_nulls: bool = True) -> Column: - """ - Reduction returns a Column. - - Differs from ``DataFrame.any`` and that the reduction happens - for each row, rather than for each column. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def all_rowwise(self, *, skip_nulls: bool = True) -> Column: - """ - Reduction returns a Column. - - Differs from ``DataFrame.all`` and that the reduction happens - for each row, rather than for each column. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def min(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def max(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def sum(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def prod(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def median(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def mean(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def std(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def var(self, *, skip_nulls: bool = True) -> DataFrame: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def is_null(self) -> DataFrame: - """ - Check for 'missing' or 'null' entries. - - Returns - ------- - DataFrame - - See also - -------- - is_nan - - Notes - ----- - Does *not* include NaN-like entries. - May optionally include 'NaT' values (if present in an implementation), - but note that the Standard makes no guarantees about them. - """ - ... - - def is_nan(self) -> DataFrame: - """ - Check for nan entries. - - Returns - ------- - DataFrame - - See also - -------- - is_null - - Notes - ----- - This only checks for 'NaN'. - Does *not* include 'missing' or 'null' entries. - In particular, does not check for `np.timedelta64('NaT')`. - """ - ... - - def fill_nan(self, value: float | 'null', /) -> DataFrame: - """ - Fill ``nan`` values with the given fill value. - - The fill operation will apply to all columns with a floating-point - dtype. Other columns remain unchanged. - - Parameters - ---------- - value : float or `null` - Value used to replace any ``nan`` in the column with. Must be - of the Python scalar type matching the dtype of the column (or - be `null`). - - """ - ... diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py deleted file mode 100644 index cfc7bc62..00000000 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ /dev/null @@ -1,51 +0,0 @@ -from typing import TYPE_CHECKING - -if TYPE_CHECKING: - from .dataframe_object import DataFrame - - -__all__ = ['GroupBy'] - - -class GroupBy: - """ - GroupBy object. - - Note that this class is not meant to be constructed by users. - It is returned from `DataFrame.groupby`. - - **Methods** - - """ - def any(self, *, skip_nulls: bool = True) -> "DataFrame": - ... 
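For the NaN/null helpers and column-wise reductions above, a minimal sketch (assuming ``df`` holds floating-point columns; the fill value ``0.0`` is arbitrary)::

    # Only floating-point columns are affected by fill_nan; others pass through.
    cleaned = df.fill_nan(0.0)

    # Boolean DataFrames marking missing values and NaNs, respectively.
    null_mask = df.is_null()
    nan_mask = df.is_nan()

    # Each reduction collapses the frame to a single row.
    means = cleaned.mean(skip_nulls=True)
    spreads = cleaned.std(skip_nulls=True)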
- - def all(self, *, skip_nulls: bool = True) -> "DataFrame": - ... - - def min(self, *, skip_nulls: bool = True) -> "DataFrame": - ... - - def max(self, *, skip_nulls: bool = True) -> "DataFrame": - ... - - def sum(self, *, skip_nulls: bool = True) -> "DataFrame": - ... - - def prod(self, *, skip_nulls: bool = True) -> "DataFrame": - ... - - def median(self, *, skip_nulls: bool = True) -> "DataFrame": - ... - - def mean(self, *, skip_nulls: bool = True) -> "DataFrame": - ... - - def std(self, *, skip_nulls: bool = True) -> "DataFrame": - ... - - def var(self, *, skip_nulls: bool = True) -> "DataFrame": - ... - - def size(self) -> "DataFrame": - ... From da5324e7646f212b25370f0a2019d6f3e718b5e2 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 09:04:58 +0100 Subject: [PATCH 02/38] get mypy --strict passing --- .../dataframe_api/__init__.pyi | 195 +++++ .../dataframe_api/_types.pyi | 63 ++ .../dataframe_api/column_object.pyi | 586 ++++++++++++++ .../dataframe_api/dataframe_object.pyi | 756 ++++++++++++++++++ .../dataframe_api/groupby_object.pyi | 54 ++ 5 files changed, 1654 insertions(+) create mode 100644 spec/API_specification/dataframe_api/__init__.pyi create mode 100644 spec/API_specification/dataframe_api/_types.pyi create mode 100644 spec/API_specification/dataframe_api/column_object.pyi create mode 100644 spec/API_specification/dataframe_api/dataframe_object.pyi create mode 100644 spec/API_specification/dataframe_api/groupby_object.pyi diff --git a/spec/API_specification/dataframe_api/__init__.pyi b/spec/API_specification/dataframe_api/__init__.pyi new file mode 100644 index 00000000..ed0d089c --- /dev/null +++ b/spec/API_specification/dataframe_api/__init__.pyi @@ -0,0 +1,195 @@ +""" +Function stubs and API documentation for the DataFrame API standard. +""" +from __future__ import annotations + +from typing import Mapping, Sequence, Any, Generic, TypeVar + +from .column_object import * +from .dataframe_object import DataFrame +from .groupby_object import * + +T = TypeVar("T", bound=DType) + +__all__ = [ + "__dataframe_api_version", + "column_from_sequence", + "concat", + "dataframe_from_dict", + "is_null", + "null", + "DType", + "Int64", + "Int32", + "Int16", + "Int8", + "UInt64", + "UInt32", + "UInt16", + "UInt8", + "Float64", + "Float32", + "Bool", +] + + +__dataframe_api_version__: str = "YYYY.MM" +""" +String representing the version of the DataFrame API specification to which +the conforming implementation adheres. Set to a concrete value for a stable +implementation of the dataframe API standard. +""" + +def concat(dataframes: Sequence[DataFrame[Any]]) -> DataFrame[Any]: + """ + Concatenate DataFrames vertically. + + To concatenate horizontally, please use ``insert``. + + Parameters + ---------- + dataframes : Sequence[DataFrame] + DataFrames to concatenate. + Column names, ordering, and dtypes must match. + + Notes + ----- + The order in which the input DataFrames appear in + the output is preserved (so long as the DataFrame implementation supports row + ordering). + """ + ... + +def column_from_sequence(sequence: Sequence[Scalar[DType]], *, dtype: DType) -> Column[DType]: + """ + Construct Column from sequence of elements. + + Parameters + ---------- + sequence : Sequence[object] + Sequence of elements. Each element must be of the specified + ``dtype``, the corresponding Python builtin scalar type, or + coercible to that Python scalar type. + dtype : DType + Dtype of result. Must be specified. 
+ + Returns + ------- + Column + """ + ... + +def dataframe_from_dict(data: Mapping[str, Column[Any]]) -> DataFrame[Any]: + """ + Construct DataFrame from map of column names to Columns. + + Parameters + ---------- + data : Mapping[str, Column] + Column must be of the corresponding type of the DataFrame. + For example, it is only supported to build a ``LibraryXDataFrame`` using + ``LibraryXColumn`` instances. + + Returns + ------- + DataFrame + """ + ... + +class null: + """ + A `null` object to represent missing data. + + ``null`` is a scalar, and may be used when constructing a `Column` from a + Python sequence with `column_from_sequence`. It does not support ``is``, + ``==`` or ``bool``. + + Raises + ------ + TypeError + From ``__eq__`` and from ``__bool__``. + + For ``__eq__``: a missing value must not be compared for equality + directly. Instead, use `DataFrame.is_null` or `Column.is_null` to check + for presence of missing values. + + For ``__bool__``: truthiness of a missing value is ambiguous. + + Notes + ----- + Like for Python scalars, the ``null`` object may be duck typed so it can + reside on (e.g.) a GPU. Hence, the builtin ``is`` keyword should not be + used to check if an object *is* the ``null`` object. + + """ + ... + +def is_null(value: object, /) -> bool: + """ + Check if an object is a `null` scalar. + + Parameters + ---------- + value : object + Any input type is valid. + + Returns + ------- + bool + True if the input is a `null` object from the same library which + implements the dataframe API standard, False otherwise. + + """ + +########## +# Dtypes # +########## + +class DType: + """Base class for all dtypes.""" + +class IntDType(DType): + """Base class for all integer dtypes.""" + +class FloatDType(DType): + """Base class for all float dtypes.""" + +class Int64(IntDType): + """Integer type with 64 bits of precision.""" + +class Int32(IntDType): + """Integer type with 32 bits of precision.""" + +class Int16(IntDType): + """Integer type with 16 bits of precision.""" + +class Int8(IntDType): + """Integer type with 8 bits of precision.""" + +class UInt64(IntDType): + """Unsigned integer type with 64 bits of precision.""" + +class UInt32(IntDType): + """Unsigned integer type with 32 bits of precision.""" + +class UInt16(IntDType): + """Unsigned integer type with 16 bits of precision.""" + +class UInt8(IntDType): + """Unsigned integer type with 8 bits of precision.""" + +class Float64(FloatDType): + """Floating point type with 64 bits of precision.""" + +class Float32(FloatDType): + """Floating point type with 32 bits of precision.""" + +class Bool(DType): + """Boolean type with 8 bits of precision.""" + +########## +# Scalar # +########## + +class Scalar(Generic[T]): + ... diff --git a/spec/API_specification/dataframe_api/_types.pyi b/spec/API_specification/dataframe_api/_types.pyi new file mode 100644 index 00000000..2874ba4c --- /dev/null +++ b/spec/API_specification/dataframe_api/_types.pyi @@ -0,0 +1,63 @@ +""" +Types for type annotations used in the dataframe API standard. + +The type variables should be replaced with the actual types for a given +library, e.g., for Pandas TypeVar('DataFrame') would be replaced with pd.DataFrame. 
+""" +from __future__ import annotations + +from dataclasses import dataclass +from typing import ( + Any, + List, + Literal, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + Protocol, +) +from enum import Enum + +array = TypeVar("array") +Scalar = TypeVar("Scalar") +device = TypeVar("device") +DType = TypeVar("DType") +SupportsDLPack = TypeVar("SupportsDLPack") +SupportsBufferProtocol = TypeVar("SupportsBufferProtocol") +PyCapsule = TypeVar("PyCapsule") +# ellipsis cannot actually be imported from anywhere, so include a dummy here +# to keep pyflakes happy. https://github.com/python/typeshed/issues/3556 +ellipsis = TypeVar("ellipsis") + +_T_co = TypeVar("_T_co", covariant=True) + + +class NestedSequence(Protocol[_T_co]): + def __getitem__(self, key: int, /) -> Union[_T_co, NestedSequence[_T_co]]: + ... + + def __len__(self, /) -> int: + ... + + +__all__ = [ + "Any", + "DataFrame", + "List", + "Literal", + "NestedSequence", + "Optional", + "PyCapsule", + "SupportsBufferProtocol", + "SupportsDLPack", + "Tuple", + "Union", + "Sequence", + "array", + "device", + "DType", + "ellipsis", + "Enum", +] diff --git a/spec/API_specification/dataframe_api/column_object.pyi b/spec/API_specification/dataframe_api/column_object.pyi new file mode 100644 index 00000000..a06c993a --- /dev/null +++ b/spec/API_specification/dataframe_api/column_object.pyi @@ -0,0 +1,586 @@ +from __future__ import annotations + +from typing import Any,NoReturn, Sequence, TYPE_CHECKING, Literal, Generic, TypeVar + +if TYPE_CHECKING: + from . import DType, IntDType, FloatDType, Bool, null, Scalar + +T = TypeVar('T', bound=DType) + +__all__ = ['Column'] + + +class Column(Generic[T]): + """ + Column object + + Note that this column object is not meant to be instantiated directly by + users of the library implementing the dataframe API standard. Rather, use + constructor functions or an already-created dataframe object retrieved via + + """ + + def __column_namespace__( + self, /, *, api_version: str | None = None + ) -> Any: + """ + Returns an object that has all the Dataframe Standard API functions on it. + + Parameters + ---------- + api_version: Optional[str] + String representing the version of the dataframe API specification + to be returned, in ``'YYYY.MM'`` form, for example, ``'2023.04'``. + If it is ``None``, it should return the namespace corresponding to + latest version of the dataframe API specification. If the given + version is invalid or not implemented for the given module, an + error should be raised. Default: ``None``. + + Returns + ------- + namespace: Any + An object representing the dataframe API namespace. It should have + every top-level function defined in the specification as an + attribute. It may contain other public names as well, but it is + recommended to only include those names that are part of the + specification. + + """ + + @property + def column(self) -> object: + """ + Return underlying (not-necessarily-Standard-compliant) column. + + If a library only implements the Standard, then this can return `self`. + """ + ... + + def __len__(self) -> int: + """ + Return the number of rows. + """ + + def __iter__(self) -> NoReturn: + """ + Iterate over elements. + + This is intentionally "poisoned" to discourage inefficient code patterns. + + Raises + ------ + NotImplementedError + """ + raise NotImplementedError("'__iter__' is intentionally not implemented.") + + @property + def dtype(self) -> DType: + """ + Return data type of column. 
+ """ + + def get_rows(self, indices: Column[IntDType]) -> Column[T]: + """ + Select a subset of rows, similar to `ndarray.take`. + + Parameters + ---------- + indices : Column[IntDType] + Positions of rows to select. + """ + ... + + def get_value(self, row_number: int) -> Scalar[T]: + """ + Select the value at a row number, similar to `ndarray.__getitem__()`. + + Parameters + ---------- + row_number : int + Row number of value to return. + + Returns + ------- + Scalar + Depends on the dtype of the Column, and may vary + across implementations. + """ + ... + + def sorted_indices( + self, + *, + ascending: bool = True, + nulls_position: Literal['first', 'last'] = 'last', + ) -> Column[IntDType]: + """ + Return row numbers which would sort column. + + If you need to sort the Column, you can simply do:: + + col.get_rows(col.sorted_indices()) + + Parameters + ---------- + ascending : bool + If `True`, sort in ascending order. + If `False`, sort in descending order. + nulls_position : ``{'first', 'last'}`` + Whether null values should be placed at the beginning + or at the end of the result. + Note that the position of NaNs is unspecified and may + vary based on the implementation. + + Returns + ------- + Column[IntDType] + """ + ... + + def __eq__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: ignore[override] + """ + Compare for equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __ne__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: ignore[override] + """ + Compare for non-equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __ge__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Compare for "greater than or equal to" `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __gt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Compare for "greater than" `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __le__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Compare for "less than or equal to" `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __lt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Compare for "less than" `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. 
+ + Returns + ------- + Column + """ + + def __and__(self, other: Column[Bool] | bool) -> Column[Bool]: + """ + Apply logical 'and' to `other` Column (or scalar) and this Column. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : Column[bool] or bool + If Column, must have same length. + + Returns + ------- + Column + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __or__(self, other: Column[Bool] | bool) -> Column[Bool]: + """ + Apply logical 'or' to `other` Column (or scalar) and this column. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : Column[bool] or Scalar + If Column, must have same length. + + Returns + ------- + Column[bool] + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __add__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Add `other` column or scalar to this column. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __sub__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Subtract `other` column or scalar from this column. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __mul__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Multiply `other` column or scalar with this column. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __truediv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Divide this column by `other` column or scalar. True division, returns floats. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Floor-divide `other` column or scalar to this column. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __pow__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Raise this column to the power of `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __mod__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Returns modulus of this column by `other` (`%` operator). + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. 
+ + Returns + ------- + Column + """ + + def __divmod__(self, other: Column[Any] | Scalar[Any]) -> tuple[Column[IntDType], Column[IntDType]]: + """ + Return quotient and remainder of integer division. See `divmod` builtin function. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __invert__(self) -> Column[Bool]: + """ + Invert truthiness of (boolean) elements. + + Raises + ------ + ValueError + If any of the Column's columns is not boolean. + """ + + def any(self, *, skip_nulls: bool = True) -> bool: + """ + Reduction returns a bool. + + Raises + ------ + ValueError + If column is not boolean. + """ + + def all(self, *, skip_nulls: bool = True) -> bool: + """ + Reduction returns a bool. + + Raises + ------ + ValueError + If column is not boolean. + """ + + def min(self, *, skip_nulls: bool = True) -> Scalar[T]: + """ + Reduction returns a scalar. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the column. + """ + + def max(self, *, skip_nulls: bool = True) -> Scalar[T]: + """ + Reduction returns a scalar. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the column. + """ + + def sum(self, *, skip_nulls: bool = True) -> Scalar[T]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. The returned value has the same dtype as the + column. + """ + + def prod(self, *, skip_nulls: bool = True) -> Scalar[T]: + """ + Reduction returns a scalar. Must be supported for numerical data types. + The returned value has the same dtype as the column. + """ + + def median(self, *, skip_nulls: bool = True) -> Scalar[Any]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def mean(self, *, skip_nulls: bool = True) -> Scalar[Any]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def std(self, *, skip_nulls: bool = True) -> Scalar[Any]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def var(self, *, skip_nulls: bool = True) -> Scalar[Any]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def is_null(self) -> Column[Bool]: + """ + Check for 'missing' or 'null' entries. + + Returns + ------- + Column + + See also + -------- + is_nan + + Notes + ----- + Does *not* include NaN-like entries. + May optionally include 'NaT' values (if present in an implementation), + but note that the Standard makes no guarantees about them. + """ + + def is_nan(self) -> Column[Bool]: + """ + Check for nan entries. + + Returns + ------- + Column + + See also + -------- + is_null + + Notes + ----- + This only checks for 'NaN'. 
+ Does *not* include 'missing' or 'null' entries. + In particular, does not check for `np.timedelta64('NaT')`. + """ + + def is_in(self, values: Column[T]) -> Column[Bool]: + """ + Indicate whether the value at each row matches any value in `values`. + + Parameters + ---------- + values : Column + Contains values to compare against. May include ``float('nan')`` and + ``null``, in which case ``'nan'`` and ``null`` will + respectively return ``True`` even though ``float('nan') == float('nan')`` + isn't ``True``. + The dtype of ``values`` must match the current column's dtype. + + Returns + ------- + Column[bool] + """ + + def unique_indices(self, *, skip_nulls: bool = True) -> Column[IntDType]: + """ + Return indices corresponding to unique values in Column. + + Returns + ------- + Column[IntDType] + Indices corresponding to unique values. + + Notes + ----- + There are no ordering guarantees. In particular, if there are multiple + indices corresponding to the same unique value, there is no guarantee + about which one will appear in the result. + If the original Column contains multiple `'NaN'` values, then + only a single index corresponding to those values should be returned. + Likewise for null values (if ``skip_nulls=False``). + To get the unique values, you can do ``col.get_rows(col.unique_indices())``. + """ + ... + + def fill_nan(self, value: float | 'null', /) -> Column[T]: + """ + Fill floating point ``nan`` values with the given fill value. + + Parameters + ---------- + value : float or `null` + Value used to replace any ``nan`` in the column with. Must be + of the Python scalar type matching the dtype of the column (or + be `null`). + + """ + ... diff --git a/spec/API_specification/dataframe_api/dataframe_object.pyi b/spec/API_specification/dataframe_api/dataframe_object.pyi new file mode 100644 index 00000000..14a1f29c --- /dev/null +++ b/spec/API_specification/dataframe_api/dataframe_object.pyi @@ -0,0 +1,756 @@ +from __future__ import annotations + +from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn, TypeVar, Generic + + +if TYPE_CHECKING: + from .column_object import Column + from .groupby_object import GroupBy + from . import DType, IntDType, FloatDType, Bool, null, Scalar + + +__all__ = ["DataFrame"] + +T = TypeVar("T", bound=DType) + + +class DataFrame(Generic[T]): + """ + DataFrame object + + Note that this dataframe object is not meant to be instantiated directly by + users of the library implementing the dataframe API standard. Rather, use + constructor functions or an already-created dataframe object retrieved via + + **Python operator support** + + All arithmetic operators defined by the Python language, except for + ``__matmul__``, ``__neg__`` and ``__pos__``, must be supported for + numerical data types. + + All comparison operators defined by the Python language must be supported + by the dataframe object for all data types for which those comparisons are + supported by the builtin scalar types corresponding to a data type. + + In-place operators must not be supported. All operations on the dataframe + object are out-of-place. + + **Methods and Attributes** + + """ + def __dataframe_namespace__( + self, /, *, api_version: str | None = None + ) -> Any: + """ + Returns an object that has all the dataframe API functions on it. + + Parameters + ---------- + api_version: Optional[str] + String representing the version of the dataframe API specification + to be returned, in ``'YYYY.MM'`` form, for example, ``'2023.04'``. 
+ If it is ``None``, it should return the namespace corresponding to + latest version of the dataframe API specification. If the given + version is invalid or not implemented for the given module, an + error should be raised. Default: ``None``. + + Returns + ------- + namespace: Any + An object representing the dataframe API namespace. It should have + every top-level function defined in the specification as an + attribute. It may contain other public names as well, but it is + recommended to only include those names that are part of the + specification. + + """ + + @property + def dataframe(self) -> object: + """ + Return underlying (not-necessarily-Standard-compliant) DataFrame. + + If a library only implements the Standard, then this can return `self`. + """ + ... + + def shape(self) -> tuple[int, int]: + """ + Return number of rows and number of columns. + """ + + def groupby(self, keys: Sequence[str], /) -> GroupBy: + """ + Group the DataFrame by the given columns. + + Parameters + ---------- + keys : Sequence[str] + + Returns + ------- + GroupBy + + Raises + ------ + KeyError + If any of the requested keys are not present. + + Notes + ----- + Downstream operations from this function, like aggregations, return + results for which row order is not guaranteed and is implementation + defined. + """ + ... + + def get_column_by_name(self, name: str, /) -> Column[T]: + """ + Select a column by name. + + Parameters + ---------- + name : str + + Returns + ------- + Column + + Raises + ------ + KeyError + If the key is not present. + """ + ... + + def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame[T]: + """ + Select multiple columns by name. + + Parameters + ---------- + names : Sequence[str] + + Returns + ------- + DataFrame + + Raises + ------ + KeyError + If the any requested key is not present. + """ + ... + + def get_rows(self, indices: Column[IntDType]) -> DataFrame[T]: + """ + Select a subset of rows, similar to `ndarray.take`. + + Parameters + ---------- + indices : Column[IntDType] + Positions of rows to select. + + Returns + ------- + DataFrame + """ + ... + + def slice_rows( + self, start: int | None, stop: int | None, step: int | None + ) -> DataFrame[T]: + """ + Select a subset of rows corresponding to a slice. + + Parameters + ---------- + start : int or None + stop : int or None + step : int or None + + Returns + ------- + DataFrame + """ + ... + + def get_rows_by_mask(self, mask: Column[Bool]) -> DataFrame[T]: + """ + Select a subset of rows corresponding to a mask. + + Parameters + ---------- + mask : Column[bool] + + Returns + ------- + DataFrame + + Notes + ----- + Some participants preferred a weaker type Arraylike[bool] for mask, + where 'Arraylike' denotes an object adhering to the Array API standard. + """ + ... + + def insert(self, loc: int, label: str, value: Column[Any]) -> DataFrame[Any]: + """ + Insert column into DataFrame at specified location. + + Parameters + ---------- + loc : int + Insertion index. Must verify 0 <= loc <= len(columns). + label : str + Label of the inserted column. + value : Column + """ + ... + + def drop_column(self, label: str) -> DataFrame[T]: + """ + Drop the specified column. + + Parameters + ---------- + label : str + + Returns + ------- + DataFrame + + Raises + ------ + KeyError + If the label is not present. + """ + ... + + def rename_columns(self, mapping: Mapping[str, str]) -> DataFrame[T]: + """ + Rename columns. 
+ + Parameters + ---------- + mapping : Mapping[str, str] + Keys are old column names, values are new column names. + + Returns + ------- + DataFrame + """ + ... + + def get_column_names(self) -> Sequence[str]: + """ + Get column names. + + Returns + ------- + Sequence[str] + """ + ... + + def sorted_indices( + self, + keys: Sequence[str], + *, + ascending: Sequence[bool] | bool = True, + nulls_position: Literal['first', 'last'] = 'last', + ) -> Column[IntDType]: + """ + Return row numbers which would sort according to given columns. + + If you need to sort the DataFrame, you can simply do:: + + df.get_rows(df.sorted_indices(keys)) + + Parameters + ---------- + keys : Sequence[str] + Names of columns to sort by. + ascending : Sequence[bool] or bool + If `True`, sort by all keys in ascending order. + If `False`, sort by all keys in descending order. + If a sequence, it must be the same length as `keys`, + and determines the direction with which to use each + key to sort by. + nulls_position : ``{'first', 'last'}`` + Whether null values should be placed at the beginning + or at the end of the result. + Note that the position of NaNs is unspecified and may + vary based on the implementation. + + Returns + ------- + Column[IntDType] + + Raises + ------ + ValueError + If `keys` and `ascending` are sequences of different lengths. + """ + ... + + def __eq__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: # type: ignore[override] + """ + Compare for equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __ne__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: # type: ignore[override] + """ + Compare for non-equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __ge__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + """ + Compare for "greater than or equal to" `other`. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __gt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + """ + Compare for "greater than" `other`. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __le__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + """ + Compare for "less than or equal to" `other`. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... 
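
As a usage sketch of how the selection and comparison methods above are meant to compose (``df`` stands for any Standard-compliant DataFrame and the column name ``'price'`` is purely illustrative, not part of the spec)::

    # build a boolean mask from a single column, then filter rows with it
    mask = df.get_column_by_name("price") > 0
    positive_prices = df.get_rows_by_mask(mask)

Note that ``get_rows_by_mask`` expects a boolean ``Column``, so the mask is built from a column-level comparison; how null entries in the mask are treated is not pinned down by this excerpt.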
+ + def __lt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + """ + Compare for "less than" `other`. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __and__(self, other: DataFrame[Bool] | bool) -> DataFrame[Bool]: + """ + Apply logical 'and' to `other` DataFrame (or scalar) and this dataframe. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : DataFrame[bool] or bool + If DataFrame, must have same length. + + Returns + ------- + DataFrame[bool] + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __or__(self, other: DataFrame[Bool] | bool) -> DataFrame[Bool]: + """ + Apply logical 'or' to `other` DataFrame (or scalar) and this DataFrame. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : DataFrame[bool] or bool + If DataFrame, must have same length. + + Returns + ------- + DataFrame[bool] + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __add__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Add `other` dataframe or scalar to this dataframe. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __sub__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Subtract `other` dataframe or scalar from this dataframe. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __mul__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Multiply `other` dataframe or scalar with this dataframe. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __truediv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Divide this dataframe by `other` dataframe or scalar. True division, returns floats. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __floordiv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Floor-divide (returns integers) this dataframe by `other` dataframe or scalar. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __pow__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Raise this dataframe to the power of `other`. 
+ + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __mod__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Return modulus of this dataframe by `other` (`%` operator). + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __divmod__(self, other: DataFrame[Any] | Scalar[Any]) -> tuple[DataFrame[Any], DataFrame[Any]]: + """ + Return quotient and remainder of integer division. See `divmod` builtin function. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + A tuple of two DataFrame's + """ + ... + + def __invert__(self: DataFrame[Bool]) -> DataFrame[Bool]: + """ + Invert truthiness of (boolean) elements. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def __iter__(self) -> NoReturn: + """ + Iterate over elements. + + This is intentionally "poisoned" to discourage inefficient code patterns. + + Raises + ------ + NotImplementedError + """ + raise NotImplementedError("'__iter__' is intentionally not implemented.") + + def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + """ + Reduction returns a 1-row DataFrame. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + """ + Reduction returns a 1-row DataFrame. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def any_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: + """ + Reduction returns a Column. + + Differs from ``DataFrame.any`` and that the reduction happens + for each row, rather than for each column. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def all_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: + """ + Reduction returns a Column. + + Differs from ``DataFrame.all`` and that the reduction happens + for each row, rather than for each column. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def min(self, *, skip_nulls: bool = True) -> DataFrame[T]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def max(self, *, skip_nulls: bool = True) -> DataFrame[T]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... 
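
The row-wise reductions above combine naturally with the comparison operators and ``get_rows_by_mask``. A minimal sketch, assuming every column supports the comparison (all operations are out-of-place, so the results are bound to new names)::

    # keep only the rows in which every column is non-negative
    non_negative = (df >= 0).all_rowwise()   # Column[Bool], one entry per row
    df_filtered = df.get_rows_by_mask(non_negative)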
+ + def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def is_null(self) -> DataFrame[Bool]: + """ + Check for 'missing' or 'null' entries. + + Returns + ------- + DataFrame + + See also + -------- + is_nan + + Notes + ----- + Does *not* include NaN-like entries. + May optionally include 'NaT' values (if present in an implementation), + but note that the Standard makes no guarantees about them. + """ + ... + + def is_nan(self) -> DataFrame[Bool]: + """ + Check for nan entries. + + Returns + ------- + DataFrame + + See also + -------- + is_null + + Notes + ----- + This only checks for 'NaN'. + Does *not* include 'missing' or 'null' entries. + In particular, does not check for `np.timedelta64('NaT')`. + """ + ... + + def fill_nan(self, value: float | 'null', /) -> DataFrame[T]: + """ + Fill ``nan`` values with the given fill value. + + The fill operation will apply to all columns with a floating-point + dtype. Other columns remain unchanged. + + Parameters + ---------- + value : float or `null` + Value used to replace any ``nan`` in the column with. Must be + of the Python scalar type matching the dtype of the column (or + be `null`). + + """ + ... diff --git a/spec/API_specification/dataframe_api/groupby_object.pyi b/spec/API_specification/dataframe_api/groupby_object.pyi new file mode 100644 index 00000000..6ca0a600 --- /dev/null +++ b/spec/API_specification/dataframe_api/groupby_object.pyi @@ -0,0 +1,54 @@ +from typing import TYPE_CHECKING, TypeVar, Generic, Any + +if TYPE_CHECKING: + from .dataframe_object import DataFrame + from . import IntDType, DType, Bool + + +__all__ = ['GroupBy'] + +T = TypeVar('T', bound=DType) + + +class GroupBy: + """ + GroupBy object. + + Note that this class is not meant to be constructed by users. + It is returned from `DataFrame.groupby`. + + **Methods** + + """ + def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + ... + + def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + ... + + def min(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def max(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def size(self) -> DataFrame[IntDType]: + ... 
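
Because ``DataFrame.groupby`` documents that row order in downstream aggregations is implementation-defined, a deterministic result is obtained by sorting afterwards. A hedged sketch of the intended flow (the column names ``'store'`` and ``'sales'`` are invented for illustration, and whether the group keys appear as ordinary columns in the aggregated result is not spelled out in this excerpt)::

    totals = df.get_columns_by_name(["store", "sales"]).groupby(["store"]).sum()
    # row order after an aggregation is implementation-defined; sort explicitly
    totals = totals.get_rows(totals.sorted_indices(["store"]))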
From 436b819ffcd5e7b4539ef10a8ab25f61ae15d5aa Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 09:08:47 +0100 Subject: [PATCH 03/38] add CI --- .github/workflows/mypy.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/mypy.yml diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml new file mode 100644 index 00000000..bc7d478e --- /dev/null +++ b/.github/workflows/mypy.yml @@ -0,0 +1,32 @@ +name: mypy + +on: + pull_request: + push: + branches: [main] + +jobs: + tox: + strategy: + matrix: + python-version: ["3.8", "3.11"] + os: [ubuntu-latest] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Cache multiple paths + uses: actions/cache@v3 + with: + path: | + ~/.cache/pip + $RUNNER_TOOL_CACHE/Python/* + ~\AppData\Local\pip\Cache + key: ${{ runner.os }}-build-${{ matrix.python-version }} + - name: install-reqs + run: python -m pip install --upgrade mypy==1.4.0 + - name: run mypy + run: cd spec/API_specifications && mypy dataframe_api --strict From e8f5ca723038102a6611cab56ad43a283884da0d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 09:16:24 +0100 Subject: [PATCH 04/38] typo --- .github/workflows/mypy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index bc7d478e..1a60bec6 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -29,4 +29,4 @@ jobs: - name: install-reqs run: python -m pip install --upgrade mypy==1.4.0 - name: run mypy - run: cd spec/API_specifications && mypy dataframe_api --strict + run: cd spec/API_specification && mypy dataframe_api --strict From 8ce48df389f4a8a605202191072614583b3c2066 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 09:21:17 +0100 Subject: [PATCH 05/38] wip --- spec/API_specification/dataframe_api/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 spec/API_specification/dataframe_api/__init__.py diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py new file mode 100644 index 00000000..1d2bc426 --- /dev/null +++ b/spec/API_specification/dataframe_api/__init__.py @@ -0,0 +1 @@ +from . 
import DataFrame, Column From 38e027677b883b7eb42341b924f03c80e439b27f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 10:20:15 +0100 Subject: [PATCH 06/38] keep to .py, but disable empty-body code --- .github/workflows/mypy.yml | 2 +- .../dataframe_api/__init__.py | 1 - .../dataframe_api/__init__.pyi | 195 ----- .../dataframe_api/{_types.pyi => _types.py} | 0 .../dataframe_api/column_object.pyi | 586 -------------- .../dataframe_api/dataframe_object.pyi | 756 ------------------ .../dataframe_api/groupby_object.pyi | 54 -- 7 files changed, 1 insertion(+), 1593 deletions(-) delete mode 100644 spec/API_specification/dataframe_api/__init__.py delete mode 100644 spec/API_specification/dataframe_api/__init__.pyi rename spec/API_specification/dataframe_api/{_types.pyi => _types.py} (100%) delete mode 100644 spec/API_specification/dataframe_api/column_object.pyi delete mode 100644 spec/API_specification/dataframe_api/dataframe_object.pyi delete mode 100644 spec/API_specification/dataframe_api/groupby_object.pyi diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index 1a60bec6..4c7f436b 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -29,4 +29,4 @@ jobs: - name: install-reqs run: python -m pip install --upgrade mypy==1.4.0 - name: run mypy - run: cd spec/API_specification && mypy dataframe_api --strict + run: cd spec/API_specification && mypy dataframe_api --strict --disable-error-code=empty-body diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py deleted file mode 100644 index 1d2bc426..00000000 --- a/spec/API_specification/dataframe_api/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from . import DataFrame, Column diff --git a/spec/API_specification/dataframe_api/__init__.pyi b/spec/API_specification/dataframe_api/__init__.pyi deleted file mode 100644 index ed0d089c..00000000 --- a/spec/API_specification/dataframe_api/__init__.pyi +++ /dev/null @@ -1,195 +0,0 @@ -""" -Function stubs and API documentation for the DataFrame API standard. -""" -from __future__ import annotations - -from typing import Mapping, Sequence, Any, Generic, TypeVar - -from .column_object import * -from .dataframe_object import DataFrame -from .groupby_object import * - -T = TypeVar("T", bound=DType) - -__all__ = [ - "__dataframe_api_version", - "column_from_sequence", - "concat", - "dataframe_from_dict", - "is_null", - "null", - "DType", - "Int64", - "Int32", - "Int16", - "Int8", - "UInt64", - "UInt32", - "UInt16", - "UInt8", - "Float64", - "Float32", - "Bool", -] - - -__dataframe_api_version__: str = "YYYY.MM" -""" -String representing the version of the DataFrame API specification to which -the conforming implementation adheres. Set to a concrete value for a stable -implementation of the dataframe API standard. -""" - -def concat(dataframes: Sequence[DataFrame[Any]]) -> DataFrame[Any]: - """ - Concatenate DataFrames vertically. - - To concatenate horizontally, please use ``insert``. - - Parameters - ---------- - dataframes : Sequence[DataFrame] - DataFrames to concatenate. - Column names, ordering, and dtypes must match. - - Notes - ----- - The order in which the input DataFrames appear in - the output is preserved (so long as the DataFrame implementation supports row - ordering). - """ - ... - -def column_from_sequence(sequence: Sequence[Scalar[DType]], *, dtype: DType) -> Column[DType]: - """ - Construct Column from sequence of elements. 
- - Parameters - ---------- - sequence : Sequence[object] - Sequence of elements. Each element must be of the specified - ``dtype``, the corresponding Python builtin scalar type, or - coercible to that Python scalar type. - dtype : DType - Dtype of result. Must be specified. - - Returns - ------- - Column - """ - ... - -def dataframe_from_dict(data: Mapping[str, Column[Any]]) -> DataFrame[Any]: - """ - Construct DataFrame from map of column names to Columns. - - Parameters - ---------- - data : Mapping[str, Column] - Column must be of the corresponding type of the DataFrame. - For example, it is only supported to build a ``LibraryXDataFrame`` using - ``LibraryXColumn`` instances. - - Returns - ------- - DataFrame - """ - ... - -class null: - """ - A `null` object to represent missing data. - - ``null`` is a scalar, and may be used when constructing a `Column` from a - Python sequence with `column_from_sequence`. It does not support ``is``, - ``==`` or ``bool``. - - Raises - ------ - TypeError - From ``__eq__`` and from ``__bool__``. - - For ``__eq__``: a missing value must not be compared for equality - directly. Instead, use `DataFrame.is_null` or `Column.is_null` to check - for presence of missing values. - - For ``__bool__``: truthiness of a missing value is ambiguous. - - Notes - ----- - Like for Python scalars, the ``null`` object may be duck typed so it can - reside on (e.g.) a GPU. Hence, the builtin ``is`` keyword should not be - used to check if an object *is* the ``null`` object. - - """ - ... - -def is_null(value: object, /) -> bool: - """ - Check if an object is a `null` scalar. - - Parameters - ---------- - value : object - Any input type is valid. - - Returns - ------- - bool - True if the input is a `null` object from the same library which - implements the dataframe API standard, False otherwise. - - """ - -########## -# Dtypes # -########## - -class DType: - """Base class for all dtypes.""" - -class IntDType(DType): - """Base class for all integer dtypes.""" - -class FloatDType(DType): - """Base class for all float dtypes.""" - -class Int64(IntDType): - """Integer type with 64 bits of precision.""" - -class Int32(IntDType): - """Integer type with 32 bits of precision.""" - -class Int16(IntDType): - """Integer type with 16 bits of precision.""" - -class Int8(IntDType): - """Integer type with 8 bits of precision.""" - -class UInt64(IntDType): - """Unsigned integer type with 64 bits of precision.""" - -class UInt32(IntDType): - """Unsigned integer type with 32 bits of precision.""" - -class UInt16(IntDType): - """Unsigned integer type with 16 bits of precision.""" - -class UInt8(IntDType): - """Unsigned integer type with 8 bits of precision.""" - -class Float64(FloatDType): - """Floating point type with 64 bits of precision.""" - -class Float32(FloatDType): - """Floating point type with 32 bits of precision.""" - -class Bool(DType): - """Boolean type with 8 bits of precision.""" - -########## -# Scalar # -########## - -class Scalar(Generic[T]): - ... 
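
The constructor functions and dtype hierarchy in this ``__init__`` module (relocated, otherwise unchanged, in the following commits) are meant to be reached through the namespace rather than a hard-coded import. A sketch under stated assumptions: the namespace is obtained from an existing object, and passing a dtype instance such as ``Int64()`` is an assumption of the example, since the excerpt only says "Dtype of result"::

    ns = df.__dataframe_namespace__()          # implementing library's namespace
    col = ns.column_from_sequence([1, 2, ns.null], dtype=ns.Int64())
    new_df = ns.dataframe_from_dict({"a": col})
    ns.is_null(col.get_value(2))               # True: the third element is null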
diff --git a/spec/API_specification/dataframe_api/_types.pyi b/spec/API_specification/dataframe_api/_types.py similarity index 100% rename from spec/API_specification/dataframe_api/_types.pyi rename to spec/API_specification/dataframe_api/_types.py diff --git a/spec/API_specification/dataframe_api/column_object.pyi b/spec/API_specification/dataframe_api/column_object.pyi deleted file mode 100644 index a06c993a..00000000 --- a/spec/API_specification/dataframe_api/column_object.pyi +++ /dev/null @@ -1,586 +0,0 @@ -from __future__ import annotations - -from typing import Any,NoReturn, Sequence, TYPE_CHECKING, Literal, Generic, TypeVar - -if TYPE_CHECKING: - from . import DType, IntDType, FloatDType, Bool, null, Scalar - -T = TypeVar('T', bound=DType) - -__all__ = ['Column'] - - -class Column(Generic[T]): - """ - Column object - - Note that this column object is not meant to be instantiated directly by - users of the library implementing the dataframe API standard. Rather, use - constructor functions or an already-created dataframe object retrieved via - - """ - - def __column_namespace__( - self, /, *, api_version: str | None = None - ) -> Any: - """ - Returns an object that has all the Dataframe Standard API functions on it. - - Parameters - ---------- - api_version: Optional[str] - String representing the version of the dataframe API specification - to be returned, in ``'YYYY.MM'`` form, for example, ``'2023.04'``. - If it is ``None``, it should return the namespace corresponding to - latest version of the dataframe API specification. If the given - version is invalid or not implemented for the given module, an - error should be raised. Default: ``None``. - - Returns - ------- - namespace: Any - An object representing the dataframe API namespace. It should have - every top-level function defined in the specification as an - attribute. It may contain other public names as well, but it is - recommended to only include those names that are part of the - specification. - - """ - - @property - def column(self) -> object: - """ - Return underlying (not-necessarily-Standard-compliant) column. - - If a library only implements the Standard, then this can return `self`. - """ - ... - - def __len__(self) -> int: - """ - Return the number of rows. - """ - - def __iter__(self) -> NoReturn: - """ - Iterate over elements. - - This is intentionally "poisoned" to discourage inefficient code patterns. - - Raises - ------ - NotImplementedError - """ - raise NotImplementedError("'__iter__' is intentionally not implemented.") - - @property - def dtype(self) -> DType: - """ - Return data type of column. - """ - - def get_rows(self, indices: Column[IntDType]) -> Column[T]: - """ - Select a subset of rows, similar to `ndarray.take`. - - Parameters - ---------- - indices : Column[IntDType] - Positions of rows to select. - """ - ... - - def get_value(self, row_number: int) -> Scalar[T]: - """ - Select the value at a row number, similar to `ndarray.__getitem__()`. - - Parameters - ---------- - row_number : int - Row number of value to return. - - Returns - ------- - Scalar - Depends on the dtype of the Column, and may vary - across implementations. - """ - ... - - def sorted_indices( - self, - *, - ascending: bool = True, - nulls_position: Literal['first', 'last'] = 'last', - ) -> Column[IntDType]: - """ - Return row numbers which would sort column. 
- - If you need to sort the Column, you can simply do:: - - col.get_rows(col.sorted_indices()) - - Parameters - ---------- - ascending : bool - If `True`, sort in ascending order. - If `False`, sort in descending order. - nulls_position : ``{'first', 'last'}`` - Whether null values should be placed at the beginning - or at the end of the result. - Note that the position of NaNs is unspecified and may - vary based on the implementation. - - Returns - ------- - Column[IntDType] - """ - ... - - def __eq__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: ignore[override] - """ - Compare for equality. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __ne__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: ignore[override] - """ - Compare for non-equality. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __ge__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Compare for "greater than or equal to" `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __gt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Compare for "greater than" `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __le__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Compare for "less than or equal to" `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __lt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Compare for "less than" `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __and__(self, other: Column[Bool] | bool) -> Column[Bool]: - """ - Apply logical 'and' to `other` Column (or scalar) and this Column. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : Column[bool] or bool - If Column, must have same length. - - Returns - ------- - Column - - Raises - ------ - ValueError - If `self` or `other` is not boolean. - """ - - def __or__(self, other: Column[Bool] | bool) -> Column[Bool]: - """ - Apply logical 'or' to `other` Column (or scalar) and this column. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : Column[bool] or Scalar - If Column, must have same length. - - Returns - ------- - Column[bool] - - Raises - ------ - ValueError - If `self` or `other` is not boolean. 
- """ - - def __add__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Add `other` column or scalar to this column. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __sub__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Subtract `other` column or scalar from this column. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __mul__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Multiply `other` column or scalar with this column. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __truediv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Divide this column by `other` column or scalar. True division, returns floats. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Floor-divide `other` column or scalar to this column. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __pow__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Raise this column to the power of `other`. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __mod__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: - """ - Returns modulus of this column by `other` (`%` operator). - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __divmod__(self, other: Column[Any] | Scalar[Any]) -> tuple[Column[IntDType], Column[IntDType]]: - """ - Return quotient and remainder of integer division. See `divmod` builtin function. - - Parameters - ---------- - other : Column or Scalar - If Column, must have same length. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - Column - """ - - def __invert__(self) -> Column[Bool]: - """ - Invert truthiness of (boolean) elements. - - Raises - ------ - ValueError - If any of the Column's columns is not boolean. - """ - - def any(self, *, skip_nulls: bool = True) -> bool: - """ - Reduction returns a bool. - - Raises - ------ - ValueError - If column is not boolean. - """ - - def all(self, *, skip_nulls: bool = True) -> bool: - """ - Reduction returns a bool. 
- - Raises - ------ - ValueError - If column is not boolean. - """ - - def min(self, *, skip_nulls: bool = True) -> Scalar[T]: - """ - Reduction returns a scalar. Any data type that supports comparisons - must be supported. The returned value has the same dtype as the column. - """ - - def max(self, *, skip_nulls: bool = True) -> Scalar[T]: - """ - Reduction returns a scalar. Any data type that supports comparisons - must be supported. The returned value has the same dtype as the column. - """ - - def sum(self, *, skip_nulls: bool = True) -> Scalar[T]: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. The returned value has the same dtype as the - column. - """ - - def prod(self, *, skip_nulls: bool = True) -> Scalar[T]: - """ - Reduction returns a scalar. Must be supported for numerical data types. - The returned value has the same dtype as the column. - """ - - def median(self, *, skip_nulls: bool = True) -> Scalar[Any]: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. Returns a float for numerical data types, and - datetime (with the appropriate timedelta format string) for datetime - dtypes. - """ - - def mean(self, *, skip_nulls: bool = True) -> Scalar[Any]: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. Returns a float for numerical data types, and - datetime (with the appropriate timedelta format string) for datetime - dtypes. - """ - - def std(self, *, skip_nulls: bool = True) -> Scalar[Any]: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. Returns a float for numerical data types, and - datetime (with the appropriate timedelta format string) for datetime - dtypes. - """ - - def var(self, *, skip_nulls: bool = True) -> Scalar[Any]: - """ - Reduction returns a scalar. Must be supported for numerical and - datetime data types. Returns a float for numerical data types, and - datetime (with the appropriate timedelta format string) for datetime - dtypes. - """ - - def is_null(self) -> Column[Bool]: - """ - Check for 'missing' or 'null' entries. - - Returns - ------- - Column - - See also - -------- - is_nan - - Notes - ----- - Does *not* include NaN-like entries. - May optionally include 'NaT' values (if present in an implementation), - but note that the Standard makes no guarantees about them. - """ - - def is_nan(self) -> Column[Bool]: - """ - Check for nan entries. - - Returns - ------- - Column - - See also - -------- - is_null - - Notes - ----- - This only checks for 'NaN'. - Does *not* include 'missing' or 'null' entries. - In particular, does not check for `np.timedelta64('NaT')`. - """ - - def is_in(self, values: Column[T]) -> Column[Bool]: - """ - Indicate whether the value at each row matches any value in `values`. - - Parameters - ---------- - values : Column - Contains values to compare against. May include ``float('nan')`` and - ``null``, in which case ``'nan'`` and ``null`` will - respectively return ``True`` even though ``float('nan') == float('nan')`` - isn't ``True``. - The dtype of ``values`` must match the current column's dtype. - - Returns - ------- - Column[bool] - """ - - def unique_indices(self, *, skip_nulls: bool = True) -> Column[IntDType]: - """ - Return indices corresponding to unique values in Column. - - Returns - ------- - Column[IntDType] - Indices corresponding to unique values. - - Notes - ----- - There are no ordering guarantees. 
In particular, if there are multiple - indices corresponding to the same unique value, there is no guarantee - about which one will appear in the result. - If the original Column contains multiple `'NaN'` values, then - only a single index corresponding to those values should be returned. - Likewise for null values (if ``skip_nulls=False``). - To get the unique values, you can do ``col.get_rows(col.unique_indices())``. - """ - ... - - def fill_nan(self, value: float | 'null', /) -> Column[T]: - """ - Fill floating point ``nan`` values with the given fill value. - - Parameters - ---------- - value : float or `null` - Value used to replace any ``nan`` in the column with. Must be - of the Python scalar type matching the dtype of the column (or - be `null`). - - """ - ... diff --git a/spec/API_specification/dataframe_api/dataframe_object.pyi b/spec/API_specification/dataframe_api/dataframe_object.pyi deleted file mode 100644 index 14a1f29c..00000000 --- a/spec/API_specification/dataframe_api/dataframe_object.pyi +++ /dev/null @@ -1,756 +0,0 @@ -from __future__ import annotations - -from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn, TypeVar, Generic - - -if TYPE_CHECKING: - from .column_object import Column - from .groupby_object import GroupBy - from . import DType, IntDType, FloatDType, Bool, null, Scalar - - -__all__ = ["DataFrame"] - -T = TypeVar("T", bound=DType) - - -class DataFrame(Generic[T]): - """ - DataFrame object - - Note that this dataframe object is not meant to be instantiated directly by - users of the library implementing the dataframe API standard. Rather, use - constructor functions or an already-created dataframe object retrieved via - - **Python operator support** - - All arithmetic operators defined by the Python language, except for - ``__matmul__``, ``__neg__`` and ``__pos__``, must be supported for - numerical data types. - - All comparison operators defined by the Python language must be supported - by the dataframe object for all data types for which those comparisons are - supported by the builtin scalar types corresponding to a data type. - - In-place operators must not be supported. All operations on the dataframe - object are out-of-place. - - **Methods and Attributes** - - """ - def __dataframe_namespace__( - self, /, *, api_version: str | None = None - ) -> Any: - """ - Returns an object that has all the dataframe API functions on it. - - Parameters - ---------- - api_version: Optional[str] - String representing the version of the dataframe API specification - to be returned, in ``'YYYY.MM'`` form, for example, ``'2023.04'``. - If it is ``None``, it should return the namespace corresponding to - latest version of the dataframe API specification. If the given - version is invalid or not implemented for the given module, an - error should be raised. Default: ``None``. - - Returns - ------- - namespace: Any - An object representing the dataframe API namespace. It should have - every top-level function defined in the specification as an - attribute. It may contain other public names as well, but it is - recommended to only include those names that are part of the - specification. - - """ - - @property - def dataframe(self) -> object: - """ - Return underlying (not-necessarily-Standard-compliant) DataFrame. - - If a library only implements the Standard, then this can return `self`. - """ - ... - - def shape(self) -> tuple[int, int]: - """ - Return number of rows and number of columns. 
- """ - - def groupby(self, keys: Sequence[str], /) -> GroupBy: - """ - Group the DataFrame by the given columns. - - Parameters - ---------- - keys : Sequence[str] - - Returns - ------- - GroupBy - - Raises - ------ - KeyError - If any of the requested keys are not present. - - Notes - ----- - Downstream operations from this function, like aggregations, return - results for which row order is not guaranteed and is implementation - defined. - """ - ... - - def get_column_by_name(self, name: str, /) -> Column[T]: - """ - Select a column by name. - - Parameters - ---------- - name : str - - Returns - ------- - Column - - Raises - ------ - KeyError - If the key is not present. - """ - ... - - def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame[T]: - """ - Select multiple columns by name. - - Parameters - ---------- - names : Sequence[str] - - Returns - ------- - DataFrame - - Raises - ------ - KeyError - If the any requested key is not present. - """ - ... - - def get_rows(self, indices: Column[IntDType]) -> DataFrame[T]: - """ - Select a subset of rows, similar to `ndarray.take`. - - Parameters - ---------- - indices : Column[IntDType] - Positions of rows to select. - - Returns - ------- - DataFrame - """ - ... - - def slice_rows( - self, start: int | None, stop: int | None, step: int | None - ) -> DataFrame[T]: - """ - Select a subset of rows corresponding to a slice. - - Parameters - ---------- - start : int or None - stop : int or None - step : int or None - - Returns - ------- - DataFrame - """ - ... - - def get_rows_by_mask(self, mask: Column[Bool]) -> DataFrame[T]: - """ - Select a subset of rows corresponding to a mask. - - Parameters - ---------- - mask : Column[bool] - - Returns - ------- - DataFrame - - Notes - ----- - Some participants preferred a weaker type Arraylike[bool] for mask, - where 'Arraylike' denotes an object adhering to the Array API standard. - """ - ... - - def insert(self, loc: int, label: str, value: Column[Any]) -> DataFrame[Any]: - """ - Insert column into DataFrame at specified location. - - Parameters - ---------- - loc : int - Insertion index. Must verify 0 <= loc <= len(columns). - label : str - Label of the inserted column. - value : Column - """ - ... - - def drop_column(self, label: str) -> DataFrame[T]: - """ - Drop the specified column. - - Parameters - ---------- - label : str - - Returns - ------- - DataFrame - - Raises - ------ - KeyError - If the label is not present. - """ - ... - - def rename_columns(self, mapping: Mapping[str, str]) -> DataFrame[T]: - """ - Rename columns. - - Parameters - ---------- - mapping : Mapping[str, str] - Keys are old column names, values are new column names. - - Returns - ------- - DataFrame - """ - ... - - def get_column_names(self) -> Sequence[str]: - """ - Get column names. - - Returns - ------- - Sequence[str] - """ - ... - - def sorted_indices( - self, - keys: Sequence[str], - *, - ascending: Sequence[bool] | bool = True, - nulls_position: Literal['first', 'last'] = 'last', - ) -> Column[IntDType]: - """ - Return row numbers which would sort according to given columns. - - If you need to sort the DataFrame, you can simply do:: - - df.get_rows(df.sorted_indices(keys)) - - Parameters - ---------- - keys : Sequence[str] - Names of columns to sort by. - ascending : Sequence[bool] or bool - If `True`, sort by all keys in ascending order. - If `False`, sort by all keys in descending order. 
- If a sequence, it must be the same length as `keys`, - and determines the direction with which to use each - key to sort by. - nulls_position : ``{'first', 'last'}`` - Whether null values should be placed at the beginning - or at the end of the result. - Note that the position of NaNs is unspecified and may - vary based on the implementation. - - Returns - ------- - Column[IntDType] - - Raises - ------ - ValueError - If `keys` and `ascending` are sequences of different lengths. - """ - ... - - def __eq__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: # type: ignore[override] - """ - Compare for equality. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __ne__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: # type: ignore[override] - """ - Compare for non-equality. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __ge__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: - """ - Compare for "greater than or equal to" `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __gt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: - """ - Compare for "greater than" `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __le__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: - """ - Compare for "less than or equal to" `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __lt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: - """ - Compare for "less than" `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __and__(self, other: DataFrame[Bool] | bool) -> DataFrame[Bool]: - """ - Apply logical 'and' to `other` DataFrame (or scalar) and this dataframe. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : DataFrame[bool] or bool - If DataFrame, must have same length. - - Returns - ------- - DataFrame[bool] - - Raises - ------ - ValueError - If `self` or `other` is not boolean. 
- """ - - def __or__(self, other: DataFrame[Bool] | bool) -> DataFrame[Bool]: - """ - Apply logical 'or' to `other` DataFrame (or scalar) and this DataFrame. - - Nulls should follow Kleene Logic. - - Parameters - ---------- - other : DataFrame[bool] or bool - If DataFrame, must have same length. - - Returns - ------- - DataFrame[bool] - - Raises - ------ - ValueError - If `self` or `other` is not boolean. - """ - - def __add__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: - """ - Add `other` dataframe or scalar to this dataframe. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __sub__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: - """ - Subtract `other` dataframe or scalar from this dataframe. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __mul__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: - """ - Multiply `other` dataframe or scalar with this dataframe. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __truediv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: - """ - Divide this dataframe by `other` dataframe or scalar. True division, returns floats. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __floordiv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: - """ - Floor-divide (returns integers) this dataframe by `other` dataframe or scalar. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __pow__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: - """ - Raise this dataframe to the power of `other`. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... - - def __mod__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: - """ - Return modulus of this dataframe by `other` (`%` operator). - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - DataFrame - """ - ... 
- - def __divmod__(self, other: DataFrame[Any] | Scalar[Any]) -> tuple[DataFrame[Any], DataFrame[Any]]: - """ - Return quotient and remainder of integer division. See `divmod` builtin function. - - Parameters - ---------- - other : DataFrame or Scalar - If DataFrame, must have same length and matching columns. - "Scalar" here is defined implicitly by what scalar types are allowed - for the operation by the underling dtypes. - - Returns - ------- - A tuple of two DataFrame's - """ - ... - - def __invert__(self: DataFrame[Bool]) -> DataFrame[Bool]: - """ - Invert truthiness of (boolean) elements. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def __iter__(self) -> NoReturn: - """ - Iterate over elements. - - This is intentionally "poisoned" to discourage inefficient code patterns. - - Raises - ------ - NotImplementedError - """ - raise NotImplementedError("'__iter__' is intentionally not implemented.") - - def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: - """ - Reduction returns a 1-row DataFrame. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: - """ - Reduction returns a 1-row DataFrame. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def any_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: - """ - Reduction returns a Column. - - Differs from ``DataFrame.any`` and that the reduction happens - for each row, rather than for each column. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def all_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: - """ - Reduction returns a Column. - - Differs from ``DataFrame.all`` and that the reduction happens - for each row, rather than for each column. - - Raises - ------ - ValueError - If any of the DataFrame's columns is not boolean. - """ - ... - - def min(self, *, skip_nulls: bool = True) -> DataFrame[T]: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def max(self, *, skip_nulls: bool = True) -> DataFrame[T]: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - """ - Reduction returns a 1-row DataFrame. - """ - ... - - def is_null(self) -> DataFrame[Bool]: - """ - Check for 'missing' or 'null' entries. - - Returns - ------- - DataFrame - - See also - -------- - is_nan - - Notes - ----- - Does *not* include NaN-like entries. - May optionally include 'NaT' values (if present in an implementation), - but note that the Standard makes no guarantees about them. - """ - ... - - def is_nan(self) -> DataFrame[Bool]: - """ - Check for nan entries. - - Returns - ------- - DataFrame - - See also - -------- - is_null - - Notes - ----- - This only checks for 'NaN'. 
- Does *not* include 'missing' or 'null' entries. - In particular, does not check for `np.timedelta64('NaT')`. - """ - ... - - def fill_nan(self, value: float | 'null', /) -> DataFrame[T]: - """ - Fill ``nan`` values with the given fill value. - - The fill operation will apply to all columns with a floating-point - dtype. Other columns remain unchanged. - - Parameters - ---------- - value : float or `null` - Value used to replace any ``nan`` in the column with. Must be - of the Python scalar type matching the dtype of the column (or - be `null`). - - """ - ... diff --git a/spec/API_specification/dataframe_api/groupby_object.pyi b/spec/API_specification/dataframe_api/groupby_object.pyi deleted file mode 100644 index 6ca0a600..00000000 --- a/spec/API_specification/dataframe_api/groupby_object.pyi +++ /dev/null @@ -1,54 +0,0 @@ -from typing import TYPE_CHECKING, TypeVar, Generic, Any - -if TYPE_CHECKING: - from .dataframe_object import DataFrame - from . import IntDType, DType, Bool - - -__all__ = ['GroupBy'] - -T = TypeVar('T', bound=DType) - - -class GroupBy: - """ - GroupBy object. - - Note that this class is not meant to be constructed by users. - It is returned from `DataFrame.groupby`. - - **Methods** - - """ - def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: - ... - - def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: - ... - - def min(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - ... - - def max(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - ... - - def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - ... - - def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - ... - - def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - ... - - def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - ... - - def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - ... - - def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: - ... - - def size(self) -> DataFrame[IntDType]: - ... From 8f460098b67ff76d38f5eda10e705e81f2cfb96d Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 10:21:00 +0100 Subject: [PATCH 07/38] add missing files --- spec/API_specification/README.md | 7 + .../dataframe_api/__init__.py | 195 +++++ .../dataframe_api/column_object.py | 586 ++++++++++++++ .../dataframe_api/dataframe_object.py | 756 ++++++++++++++++++ .../dataframe_api/groupby_object.py | 54 ++ 5 files changed, 1598 insertions(+) create mode 100644 spec/API_specification/README.md create mode 100644 spec/API_specification/dataframe_api/__init__.py create mode 100644 spec/API_specification/dataframe_api/column_object.py create mode 100644 spec/API_specification/dataframe_api/dataframe_object.py create mode 100644 spec/API_specification/dataframe_api/groupby_object.py diff --git a/spec/API_specification/README.md b/spec/API_specification/README.md new file mode 100644 index 00000000..0f2a16db --- /dev/null +++ b/spec/API_specification/README.md @@ -0,0 +1,7 @@ +# API Specification + +To type-check the spec, please install `mypy==1.4.0` and run + +```console +mypy dataframe_api --strict +``` \ No newline at end of file diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py new file mode 100644 index 00000000..1aa4e945 --- /dev/null +++ b/spec/API_specification/dataframe_api/__init__.py @@ -0,0 +1,195 @@ +""" +Function stubs and API documentation for the DataFrame API standard. 
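+
+A minimal, illustrative sketch (not normative) of how the constructors
+compose; the column name ``"a"`` and the ``Int64()`` spelling are
+assumptions made only for this example::
+
+    col = column_from_sequence([1, 2, null], dtype=Int64())
+    df = dataframe_from_dict({"a": col})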
+""" +from __future__ import annotations + +from typing import Mapping, Sequence, Any, Generic, TypeVar + +from .column_object import * +from .dataframe_object import DataFrame +from .groupby_object import * + +T = TypeVar("T", bound="DType") + +__all__ = [ + "__dataframe_api_version", + "column_from_sequence", + "concat", + "dataframe_from_dict", + "is_null", + "null", + "DType", + "Int64", + "Int32", + "Int16", + "Int8", + "UInt64", + "UInt32", + "UInt16", + "UInt8", + "Float64", + "Float32", + "Bool", +] + + +__dataframe_api_version__: str = "YYYY.MM" +""" +String representing the version of the DataFrame API specification to which +the conforming implementation adheres. Set to a concrete value for a stable +implementation of the dataframe API standard. +""" + +def concat(dataframes: Sequence[DataFrame[Any]]) -> DataFrame[Any]: + """ + Concatenate DataFrames vertically. + + To concatenate horizontally, please use ``insert``. + + Parameters + ---------- + dataframes : Sequence[DataFrame] + DataFrames to concatenate. + Column names, ordering, and dtypes must match. + + Notes + ----- + The order in which the input DataFrames appear in + the output is preserved (so long as the DataFrame implementation supports row + ordering). + """ + ... + +def column_from_sequence(sequence: Sequence[Scalar[DType]], *, dtype: DType) -> Column[DType]: + """ + Construct Column from sequence of elements. + + Parameters + ---------- + sequence : Sequence[object] + Sequence of elements. Each element must be of the specified + ``dtype``, the corresponding Python builtin scalar type, or + coercible to that Python scalar type. + dtype : DType + Dtype of result. Must be specified. + + Returns + ------- + Column + """ + ... + +def dataframe_from_dict(data: Mapping[str, Column[Any]]) -> DataFrame[Any]: + """ + Construct DataFrame from map of column names to Columns. + + Parameters + ---------- + data : Mapping[str, Column] + Column must be of the corresponding type of the DataFrame. + For example, it is only supported to build a ``LibraryXDataFrame`` using + ``LibraryXColumn`` instances. + + Returns + ------- + DataFrame + """ + ... + +class null: + """ + A `null` object to represent missing data. + + ``null`` is a scalar, and may be used when constructing a `Column` from a + Python sequence with `column_from_sequence`. It does not support ``is``, + ``==`` or ``bool``. + + Raises + ------ + TypeError + From ``__eq__`` and from ``__bool__``. + + For ``__eq__``: a missing value must not be compared for equality + directly. Instead, use `DataFrame.is_null` or `Column.is_null` to check + for presence of missing values. + + For ``__bool__``: truthiness of a missing value is ambiguous. + + Notes + ----- + Like for Python scalars, the ``null`` object may be duck typed so it can + reside on (e.g.) a GPU. Hence, the builtin ``is`` keyword should not be + used to check if an object *is* the ``null`` object. + + """ + ... + +def is_null(value: object, /) -> bool: + """ + Check if an object is a `null` scalar. + + Parameters + ---------- + value : object + Any input type is valid. + + Returns + ------- + bool + True if the input is a `null` object from the same library which + implements the dataframe API standard, False otherwise. 
+ + """ + +########## +# Dtypes # +########## + +class DType: + """Base class for all dtypes.""" + +class IntDType(DType): + """Base class for all integer dtypes.""" + +class FloatDType(DType): + """Base class for all float dtypes.""" + +class Int64(IntDType): + """Integer type with 64 bits of precision.""" + +class Int32(IntDType): + """Integer type with 32 bits of precision.""" + +class Int16(IntDType): + """Integer type with 16 bits of precision.""" + +class Int8(IntDType): + """Integer type with 8 bits of precision.""" + +class UInt64(IntDType): + """Unsigned integer type with 64 bits of precision.""" + +class UInt32(IntDType): + """Unsigned integer type with 32 bits of precision.""" + +class UInt16(IntDType): + """Unsigned integer type with 16 bits of precision.""" + +class UInt8(IntDType): + """Unsigned integer type with 8 bits of precision.""" + +class Float64(FloatDType): + """Floating point type with 64 bits of precision.""" + +class Float32(FloatDType): + """Floating point type with 32 bits of precision.""" + +class Bool(DType): + """Boolean type with 8 bits of precision.""" + +########## +# Scalar # +########## + +class Scalar(Generic[T]): + ... diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py new file mode 100644 index 00000000..a06c993a --- /dev/null +++ b/spec/API_specification/dataframe_api/column_object.py @@ -0,0 +1,586 @@ +from __future__ import annotations + +from typing import Any,NoReturn, Sequence, TYPE_CHECKING, Literal, Generic, TypeVar + +if TYPE_CHECKING: + from . import DType, IntDType, FloatDType, Bool, null, Scalar + +T = TypeVar('T', bound=DType) + +__all__ = ['Column'] + + +class Column(Generic[T]): + """ + Column object + + Note that this column object is not meant to be instantiated directly by + users of the library implementing the dataframe API standard. Rather, use + constructor functions or an already-created dataframe object retrieved via + + """ + + def __column_namespace__( + self, /, *, api_version: str | None = None + ) -> Any: + """ + Returns an object that has all the Dataframe Standard API functions on it. + + Parameters + ---------- + api_version: Optional[str] + String representing the version of the dataframe API specification + to be returned, in ``'YYYY.MM'`` form, for example, ``'2023.04'``. + If it is ``None``, it should return the namespace corresponding to + latest version of the dataframe API specification. If the given + version is invalid or not implemented for the given module, an + error should be raised. Default: ``None``. + + Returns + ------- + namespace: Any + An object representing the dataframe API namespace. It should have + every top-level function defined in the specification as an + attribute. It may contain other public names as well, but it is + recommended to only include those names that are part of the + specification. + + """ + + @property + def column(self) -> object: + """ + Return underlying (not-necessarily-Standard-compliant) column. + + If a library only implements the Standard, then this can return `self`. + """ + ... + + def __len__(self) -> int: + """ + Return the number of rows. + """ + + def __iter__(self) -> NoReturn: + """ + Iterate over elements. + + This is intentionally "poisoned" to discourage inefficient code patterns. + + Raises + ------ + NotImplementedError + """ + raise NotImplementedError("'__iter__' is intentionally not implemented.") + + @property + def dtype(self) -> DType: + """ + Return data type of column. 
+ """ + + def get_rows(self, indices: Column[IntDType]) -> Column[T]: + """ + Select a subset of rows, similar to `ndarray.take`. + + Parameters + ---------- + indices : Column[IntDType] + Positions of rows to select. + """ + ... + + def get_value(self, row_number: int) -> Scalar[T]: + """ + Select the value at a row number, similar to `ndarray.__getitem__()`. + + Parameters + ---------- + row_number : int + Row number of value to return. + + Returns + ------- + Scalar + Depends on the dtype of the Column, and may vary + across implementations. + """ + ... + + def sorted_indices( + self, + *, + ascending: bool = True, + nulls_position: Literal['first', 'last'] = 'last', + ) -> Column[IntDType]: + """ + Return row numbers which would sort column. + + If you need to sort the Column, you can simply do:: + + col.get_rows(col.sorted_indices()) + + Parameters + ---------- + ascending : bool + If `True`, sort in ascending order. + If `False`, sort in descending order. + nulls_position : ``{'first', 'last'}`` + Whether null values should be placed at the beginning + or at the end of the result. + Note that the position of NaNs is unspecified and may + vary based on the implementation. + + Returns + ------- + Column[IntDType] + """ + ... + + def __eq__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: ignore[override] + """ + Compare for equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __ne__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: ignore[override] + """ + Compare for non-equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __ge__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Compare for "greater than or equal to" `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __gt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Compare for "greater than" `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __le__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Compare for "less than or equal to" `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __lt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Compare for "less than" `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. 
+ + Returns + ------- + Column + """ + + def __and__(self, other: Column[Bool] | bool) -> Column[Bool]: + """ + Apply logical 'and' to `other` Column (or scalar) and this Column. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : Column[bool] or bool + If Column, must have same length. + + Returns + ------- + Column + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __or__(self, other: Column[Bool] | bool) -> Column[Bool]: + """ + Apply logical 'or' to `other` Column (or scalar) and this column. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : Column[bool] or Scalar + If Column, must have same length. + + Returns + ------- + Column[bool] + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __add__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Add `other` column or scalar to this column. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __sub__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Subtract `other` column or scalar from this column. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __mul__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Multiply `other` column or scalar with this column. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __truediv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Divide this column by `other` column or scalar. True division, returns floats. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Floor-divide `other` column or scalar to this column. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __pow__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Raise this column to the power of `other`. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __mod__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + """ + Returns modulus of this column by `other` (`%` operator). + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. 
+ + Returns + ------- + Column + """ + + def __divmod__(self, other: Column[Any] | Scalar[Any]) -> tuple[Column[IntDType], Column[IntDType]]: + """ + Return quotient and remainder of integer division. See `divmod` builtin function. + + Parameters + ---------- + other : Column or Scalar + If Column, must have same length. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + Column + """ + + def __invert__(self) -> Column[Bool]: + """ + Invert truthiness of (boolean) elements. + + Raises + ------ + ValueError + If any of the Column's columns is not boolean. + """ + + def any(self, *, skip_nulls: bool = True) -> bool: + """ + Reduction returns a bool. + + Raises + ------ + ValueError + If column is not boolean. + """ + + def all(self, *, skip_nulls: bool = True) -> bool: + """ + Reduction returns a bool. + + Raises + ------ + ValueError + If column is not boolean. + """ + + def min(self, *, skip_nulls: bool = True) -> Scalar[T]: + """ + Reduction returns a scalar. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the column. + """ + + def max(self, *, skip_nulls: bool = True) -> Scalar[T]: + """ + Reduction returns a scalar. Any data type that supports comparisons + must be supported. The returned value has the same dtype as the column. + """ + + def sum(self, *, skip_nulls: bool = True) -> Scalar[T]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. The returned value has the same dtype as the + column. + """ + + def prod(self, *, skip_nulls: bool = True) -> Scalar[T]: + """ + Reduction returns a scalar. Must be supported for numerical data types. + The returned value has the same dtype as the column. + """ + + def median(self, *, skip_nulls: bool = True) -> Scalar[Any]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def mean(self, *, skip_nulls: bool = True) -> Scalar[Any]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def std(self, *, skip_nulls: bool = True) -> Scalar[Any]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def var(self, *, skip_nulls: bool = True) -> Scalar[Any]: + """ + Reduction returns a scalar. Must be supported for numerical and + datetime data types. Returns a float for numerical data types, and + datetime (with the appropriate timedelta format string) for datetime + dtypes. + """ + + def is_null(self) -> Column[Bool]: + """ + Check for 'missing' or 'null' entries. + + Returns + ------- + Column + + See also + -------- + is_nan + + Notes + ----- + Does *not* include NaN-like entries. + May optionally include 'NaT' values (if present in an implementation), + but note that the Standard makes no guarantees about them. + """ + + def is_nan(self) -> Column[Bool]: + """ + Check for nan entries. + + Returns + ------- + Column + + See also + -------- + is_null + + Notes + ----- + This only checks for 'NaN'. 
+ Does *not* include 'missing' or 'null' entries. + In particular, does not check for `np.timedelta64('NaT')`. + """ + + def is_in(self, values: Column[T]) -> Column[Bool]: + """ + Indicate whether the value at each row matches any value in `values`. + + Parameters + ---------- + values : Column + Contains values to compare against. May include ``float('nan')`` and + ``null``, in which case ``'nan'`` and ``null`` will + respectively return ``True`` even though ``float('nan') == float('nan')`` + isn't ``True``. + The dtype of ``values`` must match the current column's dtype. + + Returns + ------- + Column[bool] + """ + + def unique_indices(self, *, skip_nulls: bool = True) -> Column[IntDType]: + """ + Return indices corresponding to unique values in Column. + + Returns + ------- + Column[IntDType] + Indices corresponding to unique values. + + Notes + ----- + There are no ordering guarantees. In particular, if there are multiple + indices corresponding to the same unique value, there is no guarantee + about which one will appear in the result. + If the original Column contains multiple `'NaN'` values, then + only a single index corresponding to those values should be returned. + Likewise for null values (if ``skip_nulls=False``). + To get the unique values, you can do ``col.get_rows(col.unique_indices())``. + """ + ... + + def fill_nan(self, value: float | 'null', /) -> Column[T]: + """ + Fill floating point ``nan`` values with the given fill value. + + Parameters + ---------- + value : float or `null` + Value used to replace any ``nan`` in the column with. Must be + of the Python scalar type matching the dtype of the column (or + be `null`). + + """ + ... diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py new file mode 100644 index 00000000..14a1f29c --- /dev/null +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -0,0 +1,756 @@ +from __future__ import annotations + +from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn, TypeVar, Generic + + +if TYPE_CHECKING: + from .column_object import Column + from .groupby_object import GroupBy + from . import DType, IntDType, FloatDType, Bool, null, Scalar + + +__all__ = ["DataFrame"] + +T = TypeVar("T", bound=DType) + + +class DataFrame(Generic[T]): + """ + DataFrame object + + Note that this dataframe object is not meant to be instantiated directly by + users of the library implementing the dataframe API standard. Rather, use + constructor functions or an already-created dataframe object retrieved via + + **Python operator support** + + All arithmetic operators defined by the Python language, except for + ``__matmul__``, ``__neg__`` and ``__pos__``, must be supported for + numerical data types. + + All comparison operators defined by the Python language must be supported + by the dataframe object for all data types for which those comparisons are + supported by the builtin scalar types corresponding to a data type. + + In-place operators must not be supported. All operations on the dataframe + object are out-of-place. + + **Methods and Attributes** + + """ + def __dataframe_namespace__( + self, /, *, api_version: str | None = None + ) -> Any: + """ + Returns an object that has all the dataframe API functions on it. + + Parameters + ---------- + api_version: Optional[str] + String representing the version of the dataframe API specification + to be returned, in ``'YYYY.MM'`` form, for example, ``'2023.04'``. 
+ If it is ``None``, it should return the namespace corresponding to + latest version of the dataframe API specification. If the given + version is invalid or not implemented for the given module, an + error should be raised. Default: ``None``. + + Returns + ------- + namespace: Any + An object representing the dataframe API namespace. It should have + every top-level function defined in the specification as an + attribute. It may contain other public names as well, but it is + recommended to only include those names that are part of the + specification. + + """ + + @property + def dataframe(self) -> object: + """ + Return underlying (not-necessarily-Standard-compliant) DataFrame. + + If a library only implements the Standard, then this can return `self`. + """ + ... + + def shape(self) -> tuple[int, int]: + """ + Return number of rows and number of columns. + """ + + def groupby(self, keys: Sequence[str], /) -> GroupBy: + """ + Group the DataFrame by the given columns. + + Parameters + ---------- + keys : Sequence[str] + + Returns + ------- + GroupBy + + Raises + ------ + KeyError + If any of the requested keys are not present. + + Notes + ----- + Downstream operations from this function, like aggregations, return + results for which row order is not guaranteed and is implementation + defined. + """ + ... + + def get_column_by_name(self, name: str, /) -> Column[T]: + """ + Select a column by name. + + Parameters + ---------- + name : str + + Returns + ------- + Column + + Raises + ------ + KeyError + If the key is not present. + """ + ... + + def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame[T]: + """ + Select multiple columns by name. + + Parameters + ---------- + names : Sequence[str] + + Returns + ------- + DataFrame + + Raises + ------ + KeyError + If the any requested key is not present. + """ + ... + + def get_rows(self, indices: Column[IntDType]) -> DataFrame[T]: + """ + Select a subset of rows, similar to `ndarray.take`. + + Parameters + ---------- + indices : Column[IntDType] + Positions of rows to select. + + Returns + ------- + DataFrame + """ + ... + + def slice_rows( + self, start: int | None, stop: int | None, step: int | None + ) -> DataFrame[T]: + """ + Select a subset of rows corresponding to a slice. + + Parameters + ---------- + start : int or None + stop : int or None + step : int or None + + Returns + ------- + DataFrame + """ + ... + + def get_rows_by_mask(self, mask: Column[Bool]) -> DataFrame[T]: + """ + Select a subset of rows corresponding to a mask. + + Parameters + ---------- + mask : Column[bool] + + Returns + ------- + DataFrame + + Notes + ----- + Some participants preferred a weaker type Arraylike[bool] for mask, + where 'Arraylike' denotes an object adhering to the Array API standard. + """ + ... + + def insert(self, loc: int, label: str, value: Column[Any]) -> DataFrame[Any]: + """ + Insert column into DataFrame at specified location. + + Parameters + ---------- + loc : int + Insertion index. Must verify 0 <= loc <= len(columns). + label : str + Label of the inserted column. + value : Column + """ + ... + + def drop_column(self, label: str) -> DataFrame[T]: + """ + Drop the specified column. + + Parameters + ---------- + label : str + + Returns + ------- + DataFrame + + Raises + ------ + KeyError + If the label is not present. + """ + ... + + def rename_columns(self, mapping: Mapping[str, str]) -> DataFrame[T]: + """ + Rename columns. 
+ + Parameters + ---------- + mapping : Mapping[str, str] + Keys are old column names, values are new column names. + + Returns + ------- + DataFrame + """ + ... + + def get_column_names(self) -> Sequence[str]: + """ + Get column names. + + Returns + ------- + Sequence[str] + """ + ... + + def sorted_indices( + self, + keys: Sequence[str], + *, + ascending: Sequence[bool] | bool = True, + nulls_position: Literal['first', 'last'] = 'last', + ) -> Column[IntDType]: + """ + Return row numbers which would sort according to given columns. + + If you need to sort the DataFrame, you can simply do:: + + df.get_rows(df.sorted_indices(keys)) + + Parameters + ---------- + keys : Sequence[str] + Names of columns to sort by. + ascending : Sequence[bool] or bool + If `True`, sort by all keys in ascending order. + If `False`, sort by all keys in descending order. + If a sequence, it must be the same length as `keys`, + and determines the direction with which to use each + key to sort by. + nulls_position : ``{'first', 'last'}`` + Whether null values should be placed at the beginning + or at the end of the result. + Note that the position of NaNs is unspecified and may + vary based on the implementation. + + Returns + ------- + Column[IntDType] + + Raises + ------ + ValueError + If `keys` and `ascending` are sequences of different lengths. + """ + ... + + def __eq__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: # type: ignore[override] + """ + Compare for equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __ne__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: # type: ignore[override] + """ + Compare for non-equality. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __ge__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + """ + Compare for "greater than or equal to" `other`. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __gt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + """ + Compare for "greater than" `other`. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __le__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + """ + Compare for "less than or equal to" `other`. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... 
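+
+    # Illustrative, non-normative sketch: the comparison operators return a
+    # boolean DataFrame, which can then drive the row-wise reductions and
+    # masking defined further below, e.g.
+    #
+    #     mask = (df >= 0).all_rowwise()
+    #     non_negative_rows = df.get_rows_by_mask(mask)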
+ + def __lt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + """ + Compare for "less than" `other`. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __and__(self, other: DataFrame[Bool] | bool) -> DataFrame[Bool]: + """ + Apply logical 'and' to `other` DataFrame (or scalar) and this dataframe. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : DataFrame[bool] or bool + If DataFrame, must have same length. + + Returns + ------- + DataFrame[bool] + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __or__(self, other: DataFrame[Bool] | bool) -> DataFrame[Bool]: + """ + Apply logical 'or' to `other` DataFrame (or scalar) and this DataFrame. + + Nulls should follow Kleene Logic. + + Parameters + ---------- + other : DataFrame[bool] or bool + If DataFrame, must have same length. + + Returns + ------- + DataFrame[bool] + + Raises + ------ + ValueError + If `self` or `other` is not boolean. + """ + + def __add__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Add `other` dataframe or scalar to this dataframe. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __sub__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Subtract `other` dataframe or scalar from this dataframe. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __mul__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Multiply `other` dataframe or scalar with this dataframe. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __truediv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Divide this dataframe by `other` dataframe or scalar. True division, returns floats. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __floordiv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Floor-divide (returns integers) this dataframe by `other` dataframe or scalar. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __pow__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Raise this dataframe to the power of `other`. 
+ + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __mod__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + """ + Return modulus of this dataframe by `other` (`%` operator). + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + DataFrame + """ + ... + + def __divmod__(self, other: DataFrame[Any] | Scalar[Any]) -> tuple[DataFrame[Any], DataFrame[Any]]: + """ + Return quotient and remainder of integer division. See `divmod` builtin function. + + Parameters + ---------- + other : DataFrame or Scalar + If DataFrame, must have same length and matching columns. + "Scalar" here is defined implicitly by what scalar types are allowed + for the operation by the underling dtypes. + + Returns + ------- + A tuple of two DataFrame's + """ + ... + + def __invert__(self: DataFrame[Bool]) -> DataFrame[Bool]: + """ + Invert truthiness of (boolean) elements. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def __iter__(self) -> NoReturn: + """ + Iterate over elements. + + This is intentionally "poisoned" to discourage inefficient code patterns. + + Raises + ------ + NotImplementedError + """ + raise NotImplementedError("'__iter__' is intentionally not implemented.") + + def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + """ + Reduction returns a 1-row DataFrame. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + """ + Reduction returns a 1-row DataFrame. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def any_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: + """ + Reduction returns a Column. + + Differs from ``DataFrame.any`` and that the reduction happens + for each row, rather than for each column. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def all_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: + """ + Reduction returns a Column. + + Differs from ``DataFrame.all`` and that the reduction happens + for each row, rather than for each column. + + Raises + ------ + ValueError + If any of the DataFrame's columns is not boolean. + """ + ... + + def min(self, *, skip_nulls: bool = True) -> DataFrame[T]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def max(self, *, skip_nulls: bool = True) -> DataFrame[T]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... 
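+
+    # Illustrative, non-normative sketch: each reduction returns a 1-row
+    # DataFrame, so a single aggregated value can be read back out through
+    # the Column API (the column name 'price' is only an example):
+    #
+    #     avg = df.mean().get_column_by_name('price').get_value(0)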
+ + def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + """ + Reduction returns a 1-row DataFrame. + """ + ... + + def is_null(self) -> DataFrame[Bool]: + """ + Check for 'missing' or 'null' entries. + + Returns + ------- + DataFrame + + See also + -------- + is_nan + + Notes + ----- + Does *not* include NaN-like entries. + May optionally include 'NaT' values (if present in an implementation), + but note that the Standard makes no guarantees about them. + """ + ... + + def is_nan(self) -> DataFrame[Bool]: + """ + Check for nan entries. + + Returns + ------- + DataFrame + + See also + -------- + is_null + + Notes + ----- + This only checks for 'NaN'. + Does *not* include 'missing' or 'null' entries. + In particular, does not check for `np.timedelta64('NaT')`. + """ + ... + + def fill_nan(self, value: float | 'null', /) -> DataFrame[T]: + """ + Fill ``nan`` values with the given fill value. + + The fill operation will apply to all columns with a floating-point + dtype. Other columns remain unchanged. + + Parameters + ---------- + value : float or `null` + Value used to replace any ``nan`` in the column with. Must be + of the Python scalar type matching the dtype of the column (or + be `null`). + + """ + ... diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py new file mode 100644 index 00000000..6ca0a600 --- /dev/null +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -0,0 +1,54 @@ +from typing import TYPE_CHECKING, TypeVar, Generic, Any + +if TYPE_CHECKING: + from .dataframe_object import DataFrame + from . import IntDType, DType, Bool + + +__all__ = ['GroupBy'] + +T = TypeVar('T', bound=DType) + + +class GroupBy: + """ + GroupBy object. + + Note that this class is not meant to be constructed by users. + It is returned from `DataFrame.groupby`. + + **Methods** + + """ + def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + ... + + def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + ... + + def min(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def max(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + ... + + def size(self) -> DataFrame[IntDType]: + ... From 69c32833bb06663b6bb939f457294dc4b1f664eb Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 10:41:15 +0100 Subject: [PATCH 08/38] fixup! --- spec/API_specification/dataframe_api/column_object.py | 2 +- spec/API_specification/dataframe_api/dataframe_object.py | 2 +- spec/API_specification/dataframe_api/groupby_object.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index a06c993a..71f14119 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -5,7 +5,7 @@ if TYPE_CHECKING: from . 
import DType, IntDType, FloatDType, Bool, null, Scalar -T = TypeVar('T', bound=DType) +T = TypeVar('T', bound="DType") __all__ = ['Column'] diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 14a1f29c..a920a7f3 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -11,7 +11,7 @@ __all__ = ["DataFrame"] -T = TypeVar("T", bound=DType) +T = TypeVar("T", bound="DType") class DataFrame(Generic[T]): diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index 6ca0a600..eb06f27d 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from typing import TYPE_CHECKING, TypeVar, Generic, Any if TYPE_CHECKING: @@ -7,7 +9,7 @@ __all__ = ['GroupBy'] -T = TypeVar('T', bound=DType) +T = TypeVar('T', bound="DType") class GroupBy: From 08f085ea6186c5d319fc1c2ca798fced2cf40468 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 10:48:05 +0100 Subject: [PATCH 09/38] fixup some types --- .../dataframe_api/column_object.py | 20 +++++++++---------- .../dataframe_api/dataframe_object.py | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 71f14119..58e748bd 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -137,7 +137,7 @@ def sorted_indices( """ ... - def __eq__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: ignore[override] + def __eq__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: # type: ignore[override] """ Compare for equality. @@ -155,7 +155,7 @@ def __eq__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: i Column """ - def __ne__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: ignore[override] + def __ne__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: # type: ignore[override] """ Compare for non-equality. @@ -173,7 +173,7 @@ def __ne__(self, other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: # type: i Column """ - def __ge__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __ge__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: """ Compare for "greater than or equal to" `other`. @@ -189,7 +189,7 @@ def __ge__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __gt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __gt__(self, other: Column[T] | Scalar[T]) -> Column[Any]: """ Compare for "greater than" `other`. @@ -205,7 +205,7 @@ def __gt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __le__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __le__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: """ Compare for "less than or equal to" `other`. @@ -221,7 +221,7 @@ def __le__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __lt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __lt__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: """ Compare for "less than" `other`. 
@@ -237,7 +237,7 @@ def __lt__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __and__(self, other: Column[Bool] | bool) -> Column[Bool]: + def __and__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: """ Apply logical 'and' to `other` Column (or scalar) and this Column. @@ -258,7 +258,7 @@ def __and__(self, other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __or__(self, other: Column[Bool] | bool) -> Column[Bool]: + def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: """ Apply logical 'or' to `other` Column (or scalar) and this column. @@ -343,7 +343,7 @@ def __truediv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[T]: """ Floor-divide `other` column or scalar to this column. @@ -359,7 +359,7 @@ def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __pow__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __pow__(self, other: Column[Any] | Scalar[Any]) -> Column[T]: """ Raise this column to the power of `other`. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index a920a7f3..dce657a7 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -293,7 +293,7 @@ def sorted_indices( """ ... - def __eq__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: # type: ignore[override] + def __eq__(self, other: DataFrame[T] | Scalar[T]) -> DataFrame[Bool]: # type: ignore[override] """ Compare for equality. @@ -312,7 +312,7 @@ def __eq__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: # typ """ ... - def __ne__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: # type: ignore[override] + def __ne__(self, other: DataFrame[T] | Scalar[T]) -> DataFrame[Bool]: # type: ignore[override] """ Compare for non-equality. From 4465e674b8e9be9b4af3d8d2d8d8408678bf5a6b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 10:55:48 +0100 Subject: [PATCH 10/38] some corrections --- .../dataframe_api/__init__.py | 2 +- .../dataframe_api/column_object.py | 44 +++++++++---------- .../dataframe_api/dataframe_object.py | 24 +++++----- .../dataframe_api/groupby_object.py | 20 ++++----- spec/design_topics/python_builtin_types.md | 2 +- 5 files changed, 46 insertions(+), 46 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 1aa4e945..da5b4312 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -124,7 +124,7 @@ class null: """ ... -def is_null(value: object, /) -> bool: +def is_null(value: object, /) -> Scalar[Bool]: """ Check if an object is a `null` scalar. diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 58e748bd..3eafd194 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -90,7 +90,7 @@ def get_rows(self, indices: Column[IntDType]) -> Column[T]: """ ... 
- def get_value(self, row_number: int) -> Scalar[T]: + def get_value(self, row_number: Scalar[IntDType]) -> Scalar[T]: """ Select the value at a row number, similar to `ndarray.__getitem__()`. @@ -110,7 +110,7 @@ def get_value(self, row_number: int) -> Scalar[T]: def sorted_indices( self, *, - ascending: bool = True, + ascending: Scalar[Bool] = True, nulls_position: Literal['first', 'last'] = 'last', ) -> Column[IntDType]: """ @@ -122,7 +122,7 @@ def sorted_indices( Parameters ---------- - ascending : bool + ascending : Scalar[Bool] If `True`, sort in ascending order. If `False`, sort in descending order. nulls_position : ``{'first', 'last'}`` @@ -189,7 +189,7 @@ def __ge__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: Column """ - def __gt__(self, other: Column[T] | Scalar[T]) -> Column[Any]: + def __gt__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: """ Compare for "greater than" `other`. @@ -279,7 +279,7 @@ def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __add__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __add__(self, other: Column[T] | Scalar[T]) -> Column[T]: """ Add `other` column or scalar to this column. @@ -295,7 +295,7 @@ def __add__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __sub__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __sub__(self, other: Column[T] | Scalar[T]) -> Column[T]: """ Subtract `other` column or scalar from this column. @@ -327,7 +327,7 @@ def __mul__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __truediv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __truediv__(self, other: Column[Any] | Scalar[Any]) -> Column[FloatDType]: """ Divide this column by `other` column or scalar. True division, returns floats. @@ -343,7 +343,7 @@ def __truediv__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[T]: + def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[IntDType]: """ Floor-divide `other` column or scalar to this column. @@ -359,7 +359,7 @@ def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[T]: Column """ - def __pow__(self, other: Column[Any] | Scalar[Any]) -> Column[T]: + def __pow__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: """ Raise this column to the power of `other`. @@ -391,7 +391,7 @@ def __mod__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __divmod__(self, other: Column[Any] | Scalar[Any]) -> tuple[Column[IntDType], Column[IntDType]]: + def __divmod__(self, other: Column[Any] | Scalar[Any]) -> tuple[Column[IntDType], Column[FloatDType]]: """ Return quotient and remainder of integer division. See `divmod` builtin function. @@ -407,7 +407,7 @@ def __divmod__(self, other: Column[Any] | Scalar[Any]) -> tuple[Column[IntDType] Column """ - def __invert__(self) -> Column[Bool]: + def __invert__(self: Column[Bool]) -> Column[Bool]: """ Invert truthiness of (boolean) elements. @@ -417,7 +417,7 @@ def __invert__(self) -> Column[Bool]: If any of the Column's columns is not boolean. """ - def any(self, *, skip_nulls: bool = True) -> bool: + def any(self: Column[Bool], *, skip_nulls: Scalar[Bool] = True) -> Scalar[Bool]: """ Reduction returns a bool. @@ -427,7 +427,7 @@ def any(self, *, skip_nulls: bool = True) -> bool: If column is not boolean. 
""" - def all(self, *, skip_nulls: bool = True) -> bool: + def all(self: Column[Bool], *, skip_nulls: Scalar[Bool] = True) -> Scalar[Bool]: """ Reduction returns a bool. @@ -437,32 +437,32 @@ def all(self, *, skip_nulls: bool = True) -> bool: If column is not boolean. """ - def min(self, *, skip_nulls: bool = True) -> Scalar[T]: + def min(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[T]: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def max(self, *, skip_nulls: bool = True) -> Scalar[T]: + def max(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[T]: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def sum(self, *, skip_nulls: bool = True) -> Scalar[T]: + def sum(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[T]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. The returned value has the same dtype as the column. """ - def prod(self, *, skip_nulls: bool = True) -> Scalar[T]: + def prod(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical data types. The returned value has the same dtype as the column. """ - def median(self, *, skip_nulls: bool = True) -> Scalar[Any]: + def median(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -470,7 +470,7 @@ def median(self, *, skip_nulls: bool = True) -> Scalar[Any]: dtypes. """ - def mean(self, *, skip_nulls: bool = True) -> Scalar[Any]: + def mean(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -478,7 +478,7 @@ def mean(self, *, skip_nulls: bool = True) -> Scalar[Any]: dtypes. """ - def std(self, *, skip_nulls: bool = True) -> Scalar[Any]: + def std(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -486,7 +486,7 @@ def std(self, *, skip_nulls: bool = True) -> Scalar[Any]: dtypes. """ - def var(self, *, skip_nulls: bool = True) -> Scalar[Any]: + def var(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -550,7 +550,7 @@ def is_in(self, values: Column[T]) -> Column[Bool]: Column[bool] """ - def unique_indices(self, *, skip_nulls: bool = True) -> Column[IntDType]: + def unique_indices(self, *, skip_nulls: Scalar[Bool] = True) -> Column[IntDType]: """ Return indices corresponding to unique values in Column. 
diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index dce657a7..8e9cbc73 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -600,7 +600,7 @@ def __iter__(self) -> NoReturn: """ raise NotImplementedError("'__iter__' is intentionally not implemented.") - def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + def any(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: """ Reduction returns a 1-row DataFrame. @@ -611,7 +611,7 @@ def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: """ ... - def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + def all(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: """ Reduction returns a 1-row DataFrame. @@ -622,7 +622,7 @@ def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: """ ... - def any_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: + def any_rowwise(self, *, skip_nulls: Scalar[Bool] = True) -> Column[Bool]: """ Reduction returns a Column. @@ -636,7 +636,7 @@ def any_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: """ ... - def all_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: + def all_rowwise(self, *, skip_nulls: Scalar[Bool] = True) -> Column[Bool]: """ Reduction returns a Column. @@ -650,49 +650,49 @@ def all_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: """ ... - def min(self, *, skip_nulls: bool = True) -> DataFrame[T]: + def min(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[T]: """ Reduction returns a 1-row DataFrame. """ ... - def max(self, *, skip_nulls: bool = True) -> DataFrame[T]: + def max(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[T]: """ Reduction returns a 1-row DataFrame. """ ... - def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def sum(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def prod(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def median(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def mean(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def std(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def var(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index eb06f27d..f6e845ad 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -22,34 +22,34 @@ class GroupBy: **Methods** """ - def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + def any(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: ... 
- def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + def all(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: ... - def min(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def min(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: ... - def max(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def max(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: ... - def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def sum(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: ... - def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def prod(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: ... - def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def median(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: ... - def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def mean(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: ... - def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def std(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: ... - def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def var(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: ... def size(self) -> DataFrame[IntDType]: diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 567baca3..04194bf7 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -18,7 +18,7 @@ class DataFrame: ... class Column: - def mean(self, skip_nulls: bool = True) -> float: + def mean(self, skip_nulls: Scalar[Bool] = True) -> float: ... larger = df2 > df1.get_column_by_name('foo').mean() From 51f542550cbd164d947f94aa1eac2142cc27878e Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:01:41 +0100 Subject: [PATCH 11/38] fixup again --- spec/API_specification/.mypy.ini | 3 +++ .../dataframe_api/column_object.py | 22 ++++++++--------- .../dataframe_api/dataframe_object.py | 24 +++++++++---------- .../dataframe_api/groupby_object.py | 20 ++++++++-------- spec/design_topics/python_builtin_types.md | 2 +- 5 files changed, 37 insertions(+), 34 deletions(-) create mode 100644 spec/API_specification/.mypy.ini diff --git a/spec/API_specification/.mypy.ini b/spec/API_specification/.mypy.ini new file mode 100644 index 00000000..eef0ed08 --- /dev/null +++ b/spec/API_specification/.mypy.ini @@ -0,0 +1,3 @@ +[mypy] +strict=True +disable_error_code=empty-body diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 3eafd194..cd574f81 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -417,7 +417,7 @@ def __invert__(self: Column[Bool]) -> Column[Bool]: If any of the Column's columns is not boolean. """ - def any(self: Column[Bool], *, skip_nulls: Scalar[Bool] = True) -> Scalar[Bool]: + def any(self: Column[Bool], *, skip_nulls: bool = True) -> Scalar[Bool]: """ Reduction returns a bool. @@ -427,7 +427,7 @@ def any(self: Column[Bool], *, skip_nulls: Scalar[Bool] = True) -> Scalar[Bool]: If column is not boolean. """ - def all(self: Column[Bool], *, skip_nulls: Scalar[Bool] = True) -> Scalar[Bool]: + def all(self: Column[Bool], *, skip_nulls: bool = True) -> Scalar[Bool]: """ Reduction returns a bool. 
@@ -437,32 +437,32 @@ def all(self: Column[Bool], *, skip_nulls: Scalar[Bool] = True) -> Scalar[Bool]: If column is not boolean. """ - def min(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[T]: + def min(self, *, skip_nulls: bool = True) -> Scalar[T]: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def max(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[T]: + def max(self, *, skip_nulls: bool = True) -> Scalar[T]: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def sum(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[T]: + def sum(self, *, skip_nulls: bool = True) -> Scalar[T]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. The returned value has the same dtype as the column. """ - def prod(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: + def prod(self, *, skip_nulls: bool = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical data types. The returned value has the same dtype as the column. """ - def median(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: + def median(self, *, skip_nulls: bool = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -470,7 +470,7 @@ def median(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: dtypes. """ - def mean(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: + def mean(self, *, skip_nulls: bool = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -478,7 +478,7 @@ def mean(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: dtypes. """ - def std(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: + def std(self, *, skip_nulls: bool = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -486,7 +486,7 @@ def std(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: dtypes. """ - def var(self, *, skip_nulls: Scalar[Bool] = True) -> Scalar[Any]: + def var(self, *, skip_nulls: bool = True) -> Scalar[Any]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -550,7 +550,7 @@ def is_in(self, values: Column[T]) -> Column[Bool]: Column[bool] """ - def unique_indices(self, *, skip_nulls: Scalar[Bool] = True) -> Column[IntDType]: + def unique_indices(self, *, skip_nulls: bool = True) -> Column[IntDType]: """ Return indices corresponding to unique values in Column. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 8e9cbc73..dce657a7 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -600,7 +600,7 @@ def __iter__(self) -> NoReturn: """ raise NotImplementedError("'__iter__' is intentionally not implemented.") - def any(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: + def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: """ Reduction returns a 1-row DataFrame. 
@@ -611,7 +611,7 @@ def any(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: """ ... - def all(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: + def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: """ Reduction returns a 1-row DataFrame. @@ -622,7 +622,7 @@ def all(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: """ ... - def any_rowwise(self, *, skip_nulls: Scalar[Bool] = True) -> Column[Bool]: + def any_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: """ Reduction returns a Column. @@ -636,7 +636,7 @@ def any_rowwise(self, *, skip_nulls: Scalar[Bool] = True) -> Column[Bool]: """ ... - def all_rowwise(self, *, skip_nulls: Scalar[Bool] = True) -> Column[Bool]: + def all_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: """ Reduction returns a Column. @@ -650,49 +650,49 @@ def all_rowwise(self, *, skip_nulls: Scalar[Bool] = True) -> Column[Bool]: """ ... - def min(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[T]: + def min(self, *, skip_nulls: bool = True) -> DataFrame[T]: """ Reduction returns a 1-row DataFrame. """ ... - def max(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[T]: + def max(self, *, skip_nulls: bool = True) -> DataFrame[T]: """ Reduction returns a 1-row DataFrame. """ ... - def sum(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def prod(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def median(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def mean(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def std(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ ... - def var(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: """ Reduction returns a 1-row DataFrame. """ diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index f6e845ad..eb06f27d 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -22,34 +22,34 @@ class GroupBy: **Methods** """ - def any(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: + def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: ... - def all(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Bool]: + def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: ... - def min(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def min(self, *, skip_nulls: bool = True) -> DataFrame[Any]: ... - def max(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def max(self, *, skip_nulls: bool = True) -> DataFrame[Any]: ... - def sum(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: ... 
- def prod(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: ... - def median(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: ... - def mean(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: ... - def std(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: ... - def var(self, *, skip_nulls: Scalar[Bool] = True) -> DataFrame[Any]: + def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: ... def size(self) -> DataFrame[IntDType]: diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 04194bf7..567baca3 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -18,7 +18,7 @@ class DataFrame: ... class Column: - def mean(self, skip_nulls: Scalar[Bool] = True) -> float: + def mean(self, skip_nulls: bool = True) -> float: ... larger = df2 > df1.get_column_by_name('foo').mean() From b654dadeef0f4e950faba68a3470fe22800a1565 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:16:05 +0100 Subject: [PATCH 12/38] wip --- .../dataframe_api/__init__.py | 5 ++- .../API_specification/dataframe_api/_types.py | 6 +++- .../dataframe_api/column_object.py | 35 +++++++++---------- .../dataframe_api/dataframe_object.py | 29 ++++++++------- 4 files changed, 38 insertions(+), 37 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index da5b4312..7f0cc513 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -8,8 +8,7 @@ from .column_object import * from .dataframe_object import DataFrame from .groupby_object import * - -T = TypeVar("T", bound="DType") +from ._types import DTypeT __all__ = [ "__dataframe_api_version", @@ -191,5 +190,5 @@ class Bool(DType): # Scalar # ########## -class Scalar(Generic[T]): +class Scalar(Generic[DTypeT]): ... diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py index 2874ba4c..f01e6163 100644 --- a/spec/API_specification/dataframe_api/_types.py +++ b/spec/API_specification/dataframe_api/_types.py @@ -17,13 +17,17 @@ TypeVar, Union, Protocol, + TYPE_CHECKING, ) from enum import Enum +if TYPE_CHECKING: + from . import DType + array = TypeVar("array") Scalar = TypeVar("Scalar") device = TypeVar("device") -DType = TypeVar("DType") +DTypeT = TypeVar("DTypeT", bound="DType") SupportsDLPack = TypeVar("SupportsDLPack") SupportsBufferProtocol = TypeVar("SupportsBufferProtocol") PyCapsule = TypeVar("PyCapsule") diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index cd574f81..5cf4bbc1 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -4,13 +4,12 @@ if TYPE_CHECKING: from . import DType, IntDType, FloatDType, Bool, null, Scalar - -T = TypeVar('T', bound="DType") + from ._types import DTypeT __all__ = ['Column'] -class Column(Generic[T]): +class Column(Generic[DTypeT]): """ Column object @@ -79,7 +78,7 @@ def dtype(self) -> DType: Return data type of column. 
""" - def get_rows(self, indices: Column[IntDType]) -> Column[T]: + def get_rows(self, indices: Column[IntDType]) -> Column[DTypeT]: """ Select a subset of rows, similar to `ndarray.take`. @@ -90,7 +89,7 @@ def get_rows(self, indices: Column[IntDType]) -> Column[T]: """ ... - def get_value(self, row_number: Scalar[IntDType]) -> Scalar[T]: + def get_value(self, row_number: Scalar[IntDType]) -> Scalar[DTypeT]: """ Select the value at a row number, similar to `ndarray.__getitem__()`. @@ -137,7 +136,7 @@ def sorted_indices( """ ... - def __eq__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: # type: ignore[override] + def __eq__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: # type: ignore[override] """ Compare for equality. @@ -155,7 +154,7 @@ def __eq__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: # type: ignore[ Column """ - def __ne__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: # type: ignore[override] + def __ne__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: # type: ignore[override] """ Compare for non-equality. @@ -173,7 +172,7 @@ def __ne__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: # type: ignore[ Column """ - def __ge__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: + def __ge__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: """ Compare for "greater than or equal to" `other`. @@ -189,7 +188,7 @@ def __ge__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: Column """ - def __gt__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: + def __gt__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: """ Compare for "greater than" `other`. @@ -205,7 +204,7 @@ def __gt__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: Column """ - def __le__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: + def __le__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: """ Compare for "less than or equal to" `other`. @@ -221,7 +220,7 @@ def __le__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: Column """ - def __lt__(self, other: Column[T] | Scalar[T]) -> Column[Bool]: + def __lt__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: """ Compare for "less than" `other`. @@ -279,7 +278,7 @@ def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __add__(self, other: Column[T] | Scalar[T]) -> Column[T]: + def __add__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[DTypeT]: """ Add `other` column or scalar to this column. @@ -295,7 +294,7 @@ def __add__(self, other: Column[T] | Scalar[T]) -> Column[T]: Column """ - def __sub__(self, other: Column[T] | Scalar[T]) -> Column[T]: + def __sub__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[DTypeT]: """ Subtract `other` column or scalar from this column. @@ -437,19 +436,19 @@ def all(self: Column[Bool], *, skip_nulls: bool = True) -> Scalar[Bool]: If column is not boolean. """ - def min(self, *, skip_nulls: bool = True) -> Scalar[T]: + def min(self, *, skip_nulls: bool = True) -> Scalar[DTypeT]: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def max(self, *, skip_nulls: bool = True) -> Scalar[T]: + def max(self, *, skip_nulls: bool = True) -> Scalar[DTypeT]: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. 
""" - def sum(self, *, skip_nulls: bool = True) -> Scalar[T]: + def sum(self, *, skip_nulls: bool = True) -> Scalar[DTypeT]: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. The returned value has the same dtype as the @@ -532,7 +531,7 @@ def is_nan(self) -> Column[Bool]: In particular, does not check for `np.timedelta64('NaT')`. """ - def is_in(self, values: Column[T]) -> Column[Bool]: + def is_in(self, values: Column[DTypeT]) -> Column[Bool]: """ Indicate whether the value at each row matches any value in `values`. @@ -571,7 +570,7 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Column[IntDType]: """ ... - def fill_nan(self, value: float | 'null', /) -> Column[T]: + def fill_nan(self, value: float | 'null', /) -> Column[DTypeT]: """ Fill floating point ``nan`` values with the given fill value. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index dce657a7..6d076499 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -7,14 +7,13 @@ from .column_object import Column from .groupby_object import GroupBy from . import DType, IntDType, FloatDType, Bool, null, Scalar + from ._types import DTypeT __all__ = ["DataFrame"] -T = TypeVar("T", bound="DType") - -class DataFrame(Generic[T]): +class DataFrame(Generic[DTypeT]): """ DataFrame object @@ -104,7 +103,7 @@ def groupby(self, keys: Sequence[str], /) -> GroupBy: """ ... - def get_column_by_name(self, name: str, /) -> Column[T]: + def get_column_by_name(self, name: str, /) -> Column[DTypeT]: """ Select a column by name. @@ -123,7 +122,7 @@ def get_column_by_name(self, name: str, /) -> Column[T]: """ ... - def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame[T]: + def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame[DTypeT]: """ Select multiple columns by name. @@ -142,7 +141,7 @@ def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame[T]: """ ... - def get_rows(self, indices: Column[IntDType]) -> DataFrame[T]: + def get_rows(self, indices: Column[IntDType]) -> DataFrame[DTypeT]: """ Select a subset of rows, similar to `ndarray.take`. @@ -159,7 +158,7 @@ def get_rows(self, indices: Column[IntDType]) -> DataFrame[T]: def slice_rows( self, start: int | None, stop: int | None, step: int | None - ) -> DataFrame[T]: + ) -> DataFrame[DTypeT]: """ Select a subset of rows corresponding to a slice. @@ -175,7 +174,7 @@ def slice_rows( """ ... - def get_rows_by_mask(self, mask: Column[Bool]) -> DataFrame[T]: + def get_rows_by_mask(self, mask: Column[Bool]) -> DataFrame[DTypeT]: """ Select a subset of rows corresponding to a mask. @@ -208,7 +207,7 @@ def insert(self, loc: int, label: str, value: Column[Any]) -> DataFrame[Any]: """ ... - def drop_column(self, label: str) -> DataFrame[T]: + def drop_column(self, label: str) -> DataFrame[DTypeT]: """ Drop the specified column. @@ -227,7 +226,7 @@ def drop_column(self, label: str) -> DataFrame[T]: """ ... - def rename_columns(self, mapping: Mapping[str, str]) -> DataFrame[T]: + def rename_columns(self, mapping: Mapping[str, str]) -> DataFrame[DTypeT]: """ Rename columns. @@ -293,7 +292,7 @@ def sorted_indices( """ ... - def __eq__(self, other: DataFrame[T] | Scalar[T]) -> DataFrame[Bool]: # type: ignore[override] + def __eq__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: # type: ignore[override] """ Compare for equality. 
@@ -312,7 +311,7 @@ def __eq__(self, other: DataFrame[T] | Scalar[T]) -> DataFrame[Bool]: # type: i """ ... - def __ne__(self, other: DataFrame[T] | Scalar[T]) -> DataFrame[Bool]: # type: ignore[override] + def __ne__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: # type: ignore[override] """ Compare for non-equality. @@ -650,13 +649,13 @@ def all_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: """ ... - def min(self, *, skip_nulls: bool = True) -> DataFrame[T]: + def min(self, *, skip_nulls: bool = True) -> DataFrame[DTypeT]: """ Reduction returns a 1-row DataFrame. """ ... - def max(self, *, skip_nulls: bool = True) -> DataFrame[T]: + def max(self, *, skip_nulls: bool = True) -> DataFrame[DTypeT]: """ Reduction returns a 1-row DataFrame. """ @@ -738,7 +737,7 @@ def is_nan(self) -> DataFrame[Bool]: """ ... - def fill_nan(self, value: float | 'null', /) -> DataFrame[T]: + def fill_nan(self, value: float | 'null', /) -> DataFrame[DTypeT]: """ Fill ``nan`` values with the given fill value. From a096091902ce35f4b39b901a084e14cfed2c1535 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:18:40 +0100 Subject: [PATCH 13/38] getting there? --- spec/API_specification/dataframe_api/__init__.py | 1 + spec/API_specification/dataframe_api/column_object.py | 3 ++- spec/API_specification/dataframe_api/dataframe_object.py | 3 ++- spec/API_specification/index.rst | 1 + 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 7f0cc513..bf5ebdd9 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -29,6 +29,7 @@ "Float64", "Float32", "Bool", + "DTypeT", ] diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 5cf4bbc1..69d8cb30 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -4,7 +4,8 @@ if TYPE_CHECKING: from . import DType, IntDType, FloatDType, Bool, null, Scalar - from ._types import DTypeT + +from ._types import DTypeT __all__ = ['Column'] diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 6d076499..729a57c4 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -2,12 +2,13 @@ from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn, TypeVar, Generic +from ._types import DTypeT if TYPE_CHECKING: from .column_object import Column from .groupby_object import GroupBy from . import DType, IntDType, FloatDType, Bool, null, Scalar - from ._types import DTypeT + __all__ = ["DataFrame"] diff --git a/spec/API_specification/index.rst b/spec/API_specification/index.rst index 32b81a12..100659f6 100644 --- a/spec/API_specification/index.rst +++ b/spec/API_specification/index.rst @@ -28,6 +28,7 @@ of objects and functions in the top-level namespace. 
The latter are: Float64 Float32 Bool + DTypeT The ``DataFrame``, ``Column`` and ``GroupBy`` objects have the following methods and attributes: From b3847d0743cc424cedcf1351a6363979eeb789e9 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:19:57 +0100 Subject: [PATCH 14/38] fixup --- spec/API_specification/dataframe_api/column_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 69d8cb30..2929539f 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -110,7 +110,7 @@ def get_value(self, row_number: Scalar[IntDType]) -> Scalar[DTypeT]: def sorted_indices( self, *, - ascending: Scalar[Bool] = True, + ascending: bool = True, nulls_position: Literal['first', 'last'] = 'last', ) -> Column[IntDType]: """ From ffcd9e0cdecf3984a80bbb98513635b241bebdb4 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:27:04 +0100 Subject: [PATCH 15/38] getting there? --- spec/API_specification/dataframe_api/__init__.py | 2 ++ spec/API_specification/index.rst | 2 ++ spec/conf.py | 1 + 3 files changed, 5 insertions(+) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index bf5ebdd9..f0c46353 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -30,6 +30,8 @@ "Float32", "Bool", "DTypeT", + "FloatDType", + "IntDType", ] diff --git a/spec/API_specification/index.rst b/spec/API_specification/index.rst index 100659f6..165266a4 100644 --- a/spec/API_specification/index.rst +++ b/spec/API_specification/index.rst @@ -29,6 +29,8 @@ of objects and functions in the top-level namespace. The latter are: Float32 Bool DTypeT + IntDType + FloatDType The ``DataFrame``, ``Column`` and ``GroupBy`` objects have the following methods and attributes: diff --git a/spec/conf.py b/spec/conf.py index 8d3d7800..8b002d06 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -82,6 +82,7 @@ ('py:class', 'enum.Enum'), ('py:class', 'ellipsis'), ('py:class', 'Scalar'), + ('py:class', 'DTypeT'), ] # NOTE: this alias handling isn't used yet - added in anticipation of future # need based on dataframe API aliases. 
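A note on the `DTypeT` plumbing introduced and exported over the last few patches: the pattern used in `_types.py` gives the type variable a real bound for the checker while keeping the stub package free of runtime import cycles. A condensed standalone illustration follows; the absolute `dataframe_api` import path is assumed here, whereas the stubs themselves use a relative import:

```python
from __future__ import annotations

from typing import TYPE_CHECKING, TypeVar

if TYPE_CHECKING:
    # Seen only by the type checker, so evaluating this module at runtime
    # never imports (or cycles back into) the dataframe_api package.
    from dataframe_api import DType

# The bound is given as a string, so it is not evaluated at runtime; mypy
# still only solves DTypeT to DType and its subclasses (Int64, Bool, ...).
DTypeT = TypeVar("DTypeT", bound="DType")
```

`Column` and `DataFrame` can then be parametrised over this shared `DTypeT` instead of each module defining its own local `T = TypeVar("T", bound="DType")`, which is what the preceding diffs do.
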
From a474d59a8003bd0959fbac522937bed0236a0f3b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:37:39 +0100 Subject: [PATCH 16/38] export DataFrame and Column --- spec/API_specification/dataframe_api/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index f0c46353..1db776c5 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -11,6 +11,8 @@ from ._types import DTypeT __all__ = [ + "DataFrame", + "Column", "__dataframe_api_version", "column_from_sequence", "concat", From 38259abfdfbd51c2f055af42f437541eb487973b Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 11:40:20 +0100 Subject: [PATCH 17/38] ignore some nitpicks for now --- spec/conf.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spec/conf.py b/spec/conf.py index 8b002d06..da73887b 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -83,6 +83,9 @@ ('py:class', 'ellipsis'), ('py:class', 'Scalar'), ('py:class', 'DTypeT'), + ('py:class', 'Bool'), + ('py:class', 'IntDType'), + ('py:class', 'FloatDType'), ] # NOTE: this alias handling isn't used yet - added in anticipation of future # need based on dataframe API aliases. From a4d81fc337340772e07b26775c22d6fc4fe3508f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:01:58 +0100 Subject: [PATCH 18/38] more fixups --- spec/API_specification/dataframe_api/column_object.py | 6 +++--- spec/API_specification/dataframe_api/dataframe_object.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 2929539f..b24ef339 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -48,7 +48,7 @@ def __column_namespace__( """ @property - def column(self) -> object: + def column(self) -> Any: """ Return underlying (not-necessarily-Standard-compliant) column. @@ -237,7 +237,7 @@ def __lt__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: Column """ - def __and__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: + def __and__(self: Column[Bool], other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: """ Apply logical 'and' to `other` Column (or scalar) and this Column. @@ -258,7 +258,7 @@ def __and__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: + def __or__(self: Column[Bool], other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: """ Apply logical 'or' to `other` Column (or scalar) and this column. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 729a57c4..8ba39d6c 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -399,7 +399,7 @@ def __lt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: """ ... 
- def __and__(self, other: DataFrame[Bool] | bool) -> DataFrame[Bool]: + def __and__(self, other: DataFrame[Bool] | Scalar[Bool]) -> DataFrame[Bool]: """ Apply logical 'and' to `other` DataFrame (or scalar) and this dataframe. @@ -420,7 +420,7 @@ def __and__(self, other: DataFrame[Bool] | bool) -> DataFrame[Bool]: If `self` or `other` is not boolean. """ - def __or__(self, other: DataFrame[Bool] | bool) -> DataFrame[Bool]: + def __or__(self, other: DataFrame[Bool] | Scalar[Bool]) -> DataFrame[Bool]: """ Apply logical 'or' to `other` DataFrame (or scalar) and this DataFrame. From cb2f1e4515b010874155ade5c647ca38f070c89a Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:03:08 +0100 Subject: [PATCH 19/38] remove unnecessary file --- spec/API_specification/README.md | 7 ------- 1 file changed, 7 deletions(-) delete mode 100644 spec/API_specification/README.md diff --git a/spec/API_specification/README.md b/spec/API_specification/README.md deleted file mode 100644 index 0f2a16db..00000000 --- a/spec/API_specification/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# API Specification - -To type-check the spec, please install `mypy==1.4.0` and run - -```console -mypy dataframe_api --strict -``` \ No newline at end of file From b7c00fb983c0e69d37ad16eee069695ab8273665 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 14:03:09 +0100 Subject: [PATCH 20/38] few more corrections --- spec/API_specification/.mypy.ini | 2 +- spec/API_specification/dataframe_api/__init__.py | 2 +- spec/API_specification/dataframe_api/dataframe_object.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spec/API_specification/.mypy.ini b/spec/API_specification/.mypy.ini index eef0ed08..a0e568d5 100644 --- a/spec/API_specification/.mypy.ini +++ b/spec/API_specification/.mypy.ini @@ -1,3 +1,3 @@ -[mypy] +[mypy-dataframe_api.*] strict=True disable_error_code=empty-body diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 1db776c5..0c57adda 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -64,7 +64,7 @@ def concat(dataframes: Sequence[DataFrame[Any]]) -> DataFrame[Any]: """ ... -def column_from_sequence(sequence: Sequence[Scalar[DType]], *, dtype: DType) -> Column[DType]: +def column_from_sequence(sequence: Sequence[Scalar[DType]], *, dtype: DType) -> Column[DTypeT]: """ Construct Column from sequence of elements. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 8ba39d6c..e9c98487 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -331,7 +331,7 @@ def __ne__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ ... - def __ge__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + def __ge__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ Compare for "greater than or equal to" `other`. @@ -348,7 +348,7 @@ def __ge__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: """ ... - def __gt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + def __gt__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ Compare for "greater than" `other`. 
@@ -365,7 +365,7 @@ def __gt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: """ ... - def __le__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + def __le__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ Compare for "less than or equal to" `other`. @@ -382,7 +382,7 @@ def __le__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: """ ... - def __lt__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Bool]: + def __lt__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ Compare for "less than" `other`. From cfaafa5d4cde5c964a0687d575de0456ad46805c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 24 Jun 2023 14:14:41 +0100 Subject: [PATCH 21/38] use mypy.ini --- .github/workflows/mypy.yml | 2 +- spec/API_specification/.mypy.ini | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml index 4c7f436b..a11bde7a 100644 --- a/.github/workflows/mypy.yml +++ b/.github/workflows/mypy.yml @@ -29,4 +29,4 @@ jobs: - name: install-reqs run: python -m pip install --upgrade mypy==1.4.0 - name: run mypy - run: cd spec/API_specification && mypy dataframe_api --strict --disable-error-code=empty-body + run: cd spec/API_specification && mypy dataframe_api diff --git a/spec/API_specification/.mypy.ini b/spec/API_specification/.mypy.ini index a0e568d5..b165602a 100644 --- a/spec/API_specification/.mypy.ini +++ b/spec/API_specification/.mypy.ini @@ -1,3 +1,5 @@ -[mypy-dataframe_api.*] +[mypy] strict=True + +[mypy-dataframe_api.*] disable_error_code=empty-body From e92a784f336447ef52a8ac844a6b34c1af48129f Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 27 Jun 2023 12:50:05 +0100 Subject: [PATCH 22/38] revert making DataFrame generic --- .../dataframe_api/__init__.py | 4 +- .../dataframe_api/dataframe_object.py | 76 +++++++++---------- .../dataframe_api/groupby_object.py | 22 +++--- 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 0c57adda..2de0f599 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -44,7 +44,7 @@ implementation of the dataframe API standard. """ -def concat(dataframes: Sequence[DataFrame[Any]]) -> DataFrame[Any]: +def concat(dataframes: Sequence[DataFrame]) -> DataFrame: """ Concatenate DataFrames vertically. @@ -83,7 +83,7 @@ def column_from_sequence(sequence: Sequence[Scalar[DType]], *, dtype: DType) -> """ ... -def dataframe_from_dict(data: Mapping[str, Column[Any]]) -> DataFrame[Any]: +def dataframe_from_dict(data: Mapping[str, Column[Any]]) -> DataFrame: """ Construct DataFrame from map of column names to Columns. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index e9c98487..51c68fc7 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -14,7 +14,7 @@ __all__ = ["DataFrame"] -class DataFrame(Generic[DTypeT]): +class DataFrame: """ DataFrame object @@ -123,7 +123,7 @@ def get_column_by_name(self, name: str, /) -> Column[DTypeT]: """ ... 
- def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame[DTypeT]: + def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame: """ Select multiple columns by name. @@ -142,7 +142,7 @@ def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame[DTypeT]: """ ... - def get_rows(self, indices: Column[IntDType]) -> DataFrame[DTypeT]: + def get_rows(self, indices: Column[IntDType]) -> DataFrame: """ Select a subset of rows, similar to `ndarray.take`. @@ -159,7 +159,7 @@ def get_rows(self, indices: Column[IntDType]) -> DataFrame[DTypeT]: def slice_rows( self, start: int | None, stop: int | None, step: int | None - ) -> DataFrame[DTypeT]: + ) -> DataFrame: """ Select a subset of rows corresponding to a slice. @@ -175,7 +175,7 @@ def slice_rows( """ ... - def get_rows_by_mask(self, mask: Column[Bool]) -> DataFrame[DTypeT]: + def get_rows_by_mask(self, mask: Column[Bool]) -> DataFrame: """ Select a subset of rows corresponding to a mask. @@ -194,7 +194,7 @@ def get_rows_by_mask(self, mask: Column[Bool]) -> DataFrame[DTypeT]: """ ... - def insert(self, loc: int, label: str, value: Column[Any]) -> DataFrame[Any]: + def insert(self, loc: int, label: str, value: Column[Any]) -> DataFrame: """ Insert column into DataFrame at specified location. @@ -208,7 +208,7 @@ def insert(self, loc: int, label: str, value: Column[Any]) -> DataFrame[Any]: """ ... - def drop_column(self, label: str) -> DataFrame[DTypeT]: + def drop_column(self, label: str) -> DataFrame: """ Drop the specified column. @@ -227,7 +227,7 @@ def drop_column(self, label: str) -> DataFrame[DTypeT]: """ ... - def rename_columns(self, mapping: Mapping[str, str]) -> DataFrame[DTypeT]: + def rename_columns(self, mapping: Mapping[str, str]) -> DataFrame: """ Rename columns. @@ -293,7 +293,7 @@ def sorted_indices( """ ... - def __eq__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: # type: ignore[override] + def __eq__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: # type: ignore[override] """ Compare for equality. @@ -312,7 +312,7 @@ def __eq__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ ... - def __ne__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: # type: ignore[override] + def __ne__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: # type: ignore[override] """ Compare for non-equality. @@ -331,7 +331,7 @@ def __ne__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ ... - def __ge__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: + def __ge__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: """ Compare for "greater than or equal to" `other`. @@ -348,7 +348,7 @@ def __ge__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ ... - def __gt__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: + def __gt__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: """ Compare for "greater than" `other`. @@ -365,7 +365,7 @@ def __gt__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ ... - def __le__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: + def __le__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: """ Compare for "less than or equal to" `other`. @@ -382,7 +382,7 @@ def __le__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ ... 
- def __lt__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: + def __lt__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: """ Compare for "less than" `other`. @@ -399,7 +399,7 @@ def __lt__(self, other: DataFrame[DTypeT] | Scalar[DTypeT]) -> DataFrame[Bool]: """ ... - def __and__(self, other: DataFrame[Bool] | Scalar[Bool]) -> DataFrame[Bool]: + def __and__(self, other: DataFrame | Scalar[Bool]) -> DataFrame: """ Apply logical 'and' to `other` DataFrame (or scalar) and this dataframe. @@ -420,7 +420,7 @@ def __and__(self, other: DataFrame[Bool] | Scalar[Bool]) -> DataFrame[Bool]: If `self` or `other` is not boolean. """ - def __or__(self, other: DataFrame[Bool] | Scalar[Bool]) -> DataFrame[Bool]: + def __or__(self, other: DataFrame | Scalar[Bool]) -> DataFrame: """ Apply logical 'or' to `other` DataFrame (or scalar) and this DataFrame. @@ -441,7 +441,7 @@ def __or__(self, other: DataFrame[Bool] | Scalar[Bool]) -> DataFrame[Bool]: If `self` or `other` is not boolean. """ - def __add__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + def __add__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ Add `other` dataframe or scalar to this dataframe. @@ -458,7 +458,7 @@ def __add__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: """ ... - def __sub__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + def __sub__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ Subtract `other` dataframe or scalar from this dataframe. @@ -475,7 +475,7 @@ def __sub__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: """ ... - def __mul__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + def __mul__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ Multiply `other` dataframe or scalar with this dataframe. @@ -492,7 +492,7 @@ def __mul__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: """ ... - def __truediv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + def __truediv__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ Divide this dataframe by `other` dataframe or scalar. True division, returns floats. @@ -509,7 +509,7 @@ def __truediv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: """ ... - def __floordiv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + def __floordiv__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ Floor-divide (returns integers) this dataframe by `other` dataframe or scalar. @@ -526,7 +526,7 @@ def __floordiv__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: """ ... - def __pow__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + def __pow__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ Raise this dataframe to the power of `other`. @@ -543,7 +543,7 @@ def __pow__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: """ ... - def __mod__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: + def __mod__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ Return modulus of this dataframe by `other` (`%` operator). @@ -560,7 +560,7 @@ def __mod__(self, other: DataFrame[Any] | Scalar[Any]) -> DataFrame[Any]: """ ... - def __divmod__(self, other: DataFrame[Any] | Scalar[Any]) -> tuple[DataFrame[Any], DataFrame[Any]]: + def __divmod__(self, other: DataFrame | Scalar[Any]) -> tuple[DataFrame, DataFrame]: """ Return quotient and remainder of integer division. See `divmod` builtin function. 
@@ -577,7 +577,7 @@ def __divmod__(self, other: DataFrame[Any] | Scalar[Any]) -> tuple[DataFrame[Any """ ... - def __invert__(self: DataFrame[Bool]) -> DataFrame[Bool]: + def __invert__(self: DataFrame) -> DataFrame: """ Invert truthiness of (boolean) elements. @@ -600,7 +600,7 @@ def __iter__(self) -> NoReturn: """ raise NotImplementedError("'__iter__' is intentionally not implemented.") - def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + def any(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. @@ -611,7 +611,7 @@ def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: """ ... - def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + def all(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. @@ -650,55 +650,55 @@ def all_rowwise(self, *, skip_nulls: bool = True) -> Column[Bool]: """ ... - def min(self, *, skip_nulls: bool = True) -> DataFrame[DTypeT]: + def min(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. """ ... - def max(self, *, skip_nulls: bool = True) -> DataFrame[DTypeT]: + def max(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. """ ... - def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def sum(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. """ ... - def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def prod(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. """ ... - def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def median(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. """ ... - def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def mean(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. """ ... - def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def std(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. """ ... - def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def var(self, *, skip_nulls: bool = True) -> DataFrame: """ Reduction returns a 1-row DataFrame. """ ... - def is_null(self) -> DataFrame[Bool]: + def is_null(self) -> DataFrame: """ Check for 'missing' or 'null' entries. @@ -718,7 +718,7 @@ def is_null(self) -> DataFrame[Bool]: """ ... - def is_nan(self) -> DataFrame[Bool]: + def is_nan(self) -> DataFrame: """ Check for nan entries. @@ -738,7 +738,7 @@ def is_nan(self) -> DataFrame[Bool]: """ ... - def fill_nan(self, value: float | 'null', /) -> DataFrame[DTypeT]: + def fill_nan(self, value: float | 'null', /) -> DataFrame: """ Fill ``nan`` values with the given fill value. diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index eb06f27d..43ffa9b1 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -22,35 +22,35 @@ class GroupBy: **Methods** """ - def any(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + def any(self, *, skip_nulls: bool = True) -> DataFrame: ... - def all(self, *, skip_nulls: bool = True) -> DataFrame[Bool]: + def all(self, *, skip_nulls: bool = True) -> DataFrame: ... - def min(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def min(self, *, skip_nulls: bool = True) -> DataFrame: ... 
- def max(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def max(self, *, skip_nulls: bool = True) -> DataFrame: ... - def sum(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def sum(self, *, skip_nulls: bool = True) -> DataFrame: ... - def prod(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def prod(self, *, skip_nulls: bool = True) -> DataFrame: ... - def median(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def median(self, *, skip_nulls: bool = True) -> DataFrame: ... - def mean(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def mean(self, *, skip_nulls: bool = True) -> DataFrame: ... - def std(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def std(self, *, skip_nulls: bool = True) -> DataFrame: ... - def var(self, *, skip_nulls: bool = True) -> DataFrame[Any]: + def var(self, *, skip_nulls: bool = True) -> DataFrame: ... - def size(self) -> DataFrame[IntDType]: + def size(self) -> DataFrame: ... From b67398e79fedf506a0f68c97eba421c086e26267 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 28 Jun 2023 14:21:17 +0100 Subject: [PATCH 23/38] revert making Scalar generic --- .../dataframe_api/__init__.py | 6 +- .../dataframe_api/column_object.py | 56 +++++++++---------- .../dataframe_api/dataframe_object.py | 32 +++++------ 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 2de0f599..5a65b756 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -64,7 +64,7 @@ def concat(dataframes: Sequence[DataFrame]) -> DataFrame: """ ... -def column_from_sequence(sequence: Sequence[Scalar[DType]], *, dtype: DType) -> Column[DTypeT]: +def column_from_sequence(sequence: Sequence[Scalar], *, dtype: DType) -> Column[DTypeT]: """ Construct Column from sequence of elements. @@ -128,7 +128,7 @@ class null: """ ... -def is_null(value: object, /) -> Scalar[Bool]: +def is_null(value: object, /) -> bool: """ Check if an object is a `null` scalar. @@ -195,5 +195,5 @@ class Bool(DType): # Scalar # ########## -class Scalar(Generic[DTypeT]): +class Scalar: ... diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index b24ef339..d37032cf 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -90,7 +90,7 @@ def get_rows(self, indices: Column[IntDType]) -> Column[DTypeT]: """ ... - def get_value(self, row_number: Scalar[IntDType]) -> Scalar[DTypeT]: + def get_value(self, row_number: int) -> Scalar: """ Select the value at a row number, similar to `ndarray.__getitem__()`. @@ -122,7 +122,7 @@ def sorted_indices( Parameters ---------- - ascending : Scalar[Bool] + ascending : bool If `True`, sort in ascending order. If `False`, sort in descending order. nulls_position : ``{'first', 'last'}`` @@ -137,7 +137,7 @@ def sorted_indices( """ ... - def __eq__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: # type: ignore[override] + def __eq__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: # type: ignore[override] """ Compare for equality. 
@@ -155,7 +155,7 @@ def __eq__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: # typ Column """ - def __ne__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: # type: ignore[override] + def __ne__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: # type: ignore[override] """ Compare for non-equality. @@ -173,7 +173,7 @@ def __ne__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: # typ Column """ - def __ge__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: + def __ge__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: """ Compare for "greater than or equal to" `other`. @@ -189,7 +189,7 @@ def __ge__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: Column """ - def __gt__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: + def __gt__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: """ Compare for "greater than" `other`. @@ -205,7 +205,7 @@ def __gt__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: Column """ - def __le__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: + def __le__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: """ Compare for "less than or equal to" `other`. @@ -221,7 +221,7 @@ def __le__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: Column """ - def __lt__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: + def __lt__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: """ Compare for "less than" `other`. @@ -237,7 +237,7 @@ def __lt__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[Bool]: Column """ - def __and__(self: Column[Bool], other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: + def __and__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: """ Apply logical 'and' to `other` Column (or scalar) and this Column. @@ -258,7 +258,7 @@ def __and__(self: Column[Bool], other: Column[Bool] | Scalar[Bool]) -> Column[Bo If `self` or `other` is not boolean. """ - def __or__(self: Column[Bool], other: Column[Bool] | Scalar[Bool]) -> Column[Bool]: + def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: """ Apply logical 'or' to `other` Column (or scalar) and this column. @@ -279,7 +279,7 @@ def __or__(self: Column[Bool], other: Column[Bool] | Scalar[Bool]) -> Column[Boo If `self` or `other` is not boolean. """ - def __add__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[DTypeT]: + def __add__(self, other: Column[DTypeT] | Scalar) -> Column[DTypeT]: """ Add `other` column or scalar to this column. @@ -295,7 +295,7 @@ def __add__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[DTypeT]: Column """ - def __sub__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[DTypeT]: + def __sub__(self, other: Column[DTypeT] | Scalar) -> Column[DTypeT]: """ Subtract `other` column or scalar from this column. @@ -311,7 +311,7 @@ def __sub__(self, other: Column[DTypeT] | Scalar[DTypeT]) -> Column[DTypeT]: Column """ - def __mul__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __mul__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Multiply `other` column or scalar with this column. @@ -327,7 +327,7 @@ def __mul__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __truediv__(self, other: Column[Any] | Scalar[Any]) -> Column[FloatDType]: + def __truediv__(self, other: Column[Any] | Scalar) -> Column[FloatDType]: """ Divide this column by `other` column or scalar. True division, returns floats. 
@@ -343,7 +343,7 @@ def __truediv__(self, other: Column[Any] | Scalar[Any]) -> Column[FloatDType]: Column """ - def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[IntDType]: + def __floordiv__(self, other: Column[Any] | Scalar) -> Column[IntDType]: """ Floor-divide `other` column or scalar to this column. @@ -359,7 +359,7 @@ def __floordiv__(self, other: Column[Any] | Scalar[Any]) -> Column[IntDType]: Column """ - def __pow__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __pow__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Raise this column to the power of `other`. @@ -375,7 +375,7 @@ def __pow__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __mod__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: + def __mod__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Returns modulus of this column by `other` (`%` operator). @@ -391,7 +391,7 @@ def __mod__(self, other: Column[Any] | Scalar[Any]) -> Column[Any]: Column """ - def __divmod__(self, other: Column[Any] | Scalar[Any]) -> tuple[Column[IntDType], Column[FloatDType]]: + def __divmod__(self, other: Column[Any] | Scalar) -> tuple[Column[IntDType], Column[FloatDType]]: """ Return quotient and remainder of integer division. See `divmod` builtin function. @@ -417,7 +417,7 @@ def __invert__(self: Column[Bool]) -> Column[Bool]: If any of the Column's columns is not boolean. """ - def any(self: Column[Bool], *, skip_nulls: bool = True) -> Scalar[Bool]: + def any(self: Column[Bool], *, skip_nulls: bool = True) -> bool: """ Reduction returns a bool. @@ -427,7 +427,7 @@ def any(self: Column[Bool], *, skip_nulls: bool = True) -> Scalar[Bool]: If column is not boolean. """ - def all(self: Column[Bool], *, skip_nulls: bool = True) -> Scalar[Bool]: + def all(self: Column[Bool], *, skip_nulls: bool = True) -> bool: """ Reduction returns a bool. @@ -437,32 +437,32 @@ def all(self: Column[Bool], *, skip_nulls: bool = True) -> Scalar[Bool]: If column is not boolean. """ - def min(self, *, skip_nulls: bool = True) -> Scalar[DTypeT]: + def min(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def max(self, *, skip_nulls: bool = True) -> Scalar[DTypeT]: + def max(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def sum(self, *, skip_nulls: bool = True) -> Scalar[DTypeT]: + def sum(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. The returned value has the same dtype as the column. """ - def prod(self, *, skip_nulls: bool = True) -> Scalar[Any]: + def prod(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical data types. The returned value has the same dtype as the column. """ - def median(self, *, skip_nulls: bool = True) -> Scalar[Any]: + def median(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -470,7 +470,7 @@ def median(self, *, skip_nulls: bool = True) -> Scalar[Any]: dtypes. 
""" - def mean(self, *, skip_nulls: bool = True) -> Scalar[Any]: + def mean(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -478,7 +478,7 @@ def mean(self, *, skip_nulls: bool = True) -> Scalar[Any]: dtypes. """ - def std(self, *, skip_nulls: bool = True) -> Scalar[Any]: + def std(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -486,7 +486,7 @@ def std(self, *, skip_nulls: bool = True) -> Scalar[Any]: dtypes. """ - def var(self, *, skip_nulls: bool = True) -> Scalar[Any]: + def var(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 51c68fc7..5627cd63 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -293,7 +293,7 @@ def sorted_indices( """ ... - def __eq__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: # type: ignore[override] + def __eq__(self, other: DataFrame | Scalar) -> DataFrame: # type: ignore[override] """ Compare for equality. @@ -312,7 +312,7 @@ def __eq__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: # type: ignor """ ... - def __ne__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: # type: ignore[override] + def __ne__(self, other: DataFrame | Scalar) -> DataFrame: # type: ignore[override] """ Compare for non-equality. @@ -331,7 +331,7 @@ def __ne__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: # type: ignor """ ... - def __ge__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: + def __ge__(self, other: DataFrame | Scalar) -> DataFrame: """ Compare for "greater than or equal to" `other`. @@ -348,7 +348,7 @@ def __ge__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: """ ... - def __gt__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: + def __gt__(self, other: DataFrame | Scalar) -> DataFrame: """ Compare for "greater than" `other`. @@ -365,7 +365,7 @@ def __gt__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: """ ... - def __le__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: + def __le__(self, other: DataFrame | Scalar) -> DataFrame: """ Compare for "less than or equal to" `other`. @@ -382,7 +382,7 @@ def __le__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: """ ... - def __lt__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: + def __lt__(self, other: DataFrame | Scalar) -> DataFrame: """ Compare for "less than" `other`. @@ -399,7 +399,7 @@ def __lt__(self, other: DataFrame | Scalar[DTypeT]) -> DataFrame: """ ... - def __and__(self, other: DataFrame | Scalar[Bool]) -> DataFrame: + def __and__(self, other: DataFrame | bool) -> DataFrame: """ Apply logical 'and' to `other` DataFrame (or scalar) and this dataframe. @@ -420,7 +420,7 @@ def __and__(self, other: DataFrame | Scalar[Bool]) -> DataFrame: If `self` or `other` is not boolean. """ - def __or__(self, other: DataFrame | Scalar[Bool]) -> DataFrame: + def __or__(self, other: DataFrame | bool) -> DataFrame: """ Apply logical 'or' to `other` DataFrame (or scalar) and this DataFrame. 
@@ -441,7 +441,7 @@ def __or__(self, other: DataFrame | Scalar[Bool]) -> DataFrame: If `self` or `other` is not boolean. """ - def __add__(self, other: DataFrame | Scalar[Any]) -> DataFrame: + def __add__(self, other: DataFrame | Scalar) -> DataFrame: """ Add `other` dataframe or scalar to this dataframe. @@ -458,7 +458,7 @@ def __add__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ ... - def __sub__(self, other: DataFrame | Scalar[Any]) -> DataFrame: + def __sub__(self, other: DataFrame | Scalar) -> DataFrame: """ Subtract `other` dataframe or scalar from this dataframe. @@ -475,7 +475,7 @@ def __sub__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ ... - def __mul__(self, other: DataFrame | Scalar[Any]) -> DataFrame: + def __mul__(self, other: DataFrame | Scalar) -> DataFrame: """ Multiply `other` dataframe or scalar with this dataframe. @@ -492,7 +492,7 @@ def __mul__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ ... - def __truediv__(self, other: DataFrame | Scalar[Any]) -> DataFrame: + def __truediv__(self, other: DataFrame | Scalar) -> DataFrame: """ Divide this dataframe by `other` dataframe or scalar. True division, returns floats. @@ -509,7 +509,7 @@ def __truediv__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ ... - def __floordiv__(self, other: DataFrame | Scalar[Any]) -> DataFrame: + def __floordiv__(self, other: DataFrame | Scalar) -> DataFrame: """ Floor-divide (returns integers) this dataframe by `other` dataframe or scalar. @@ -526,7 +526,7 @@ def __floordiv__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ ... - def __pow__(self, other: DataFrame | Scalar[Any]) -> DataFrame: + def __pow__(self, other: DataFrame | Scalar) -> DataFrame: """ Raise this dataframe to the power of `other`. @@ -543,7 +543,7 @@ def __pow__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ ... - def __mod__(self, other: DataFrame | Scalar[Any]) -> DataFrame: + def __mod__(self, other: DataFrame | Scalar) -> DataFrame: """ Return modulus of this dataframe by `other` (`%` operator). @@ -560,7 +560,7 @@ def __mod__(self, other: DataFrame | Scalar[Any]) -> DataFrame: """ ... - def __divmod__(self, other: DataFrame | Scalar[Any]) -> tuple[DataFrame, DataFrame]: + def __divmod__(self, other: DataFrame | Scalar) -> tuple[DataFrame, DataFrame]: """ Return quotient and remainder of integer division. See `divmod` builtin function. 
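The signatures in the patch above accept either another column/dataframe or a plain Python scalar. A minimal sketch of that intent follows; `ToyColumn` is an illustrative stand-in, not part of the standard.

```python
# Illustrative sketch only: ``ToyColumn`` stands in for an implementation's Column.
# It shows what the ``Column | Scalar`` / ``DataFrame | Scalar`` signatures above
# mean for callers: ``other`` may be another column of equal length, or a plain
# Python scalar that is broadcast elementwise.
from __future__ import annotations
from typing import Any, Sequence

Scalar = Any  # a plain Python scalar (bool, int, float, ...)

class ToyColumn:
    def __init__(self, data: Sequence[Any]) -> None:
        self._data = list(data)

    def __gt__(self, other: ToyColumn | Scalar) -> ToyColumn:
        rhs = other._data if isinstance(other, ToyColumn) else [other] * len(self._data)
        return ToyColumn([a > b for a, b in zip(self._data, rhs)])

mask = ToyColumn([1, 5, 3]) > 2          # the scalar 2 is broadcast elementwise
assert mask._data == [False, True, True]
```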
From f28ddcd496f2a2af568b6ef94aa841f5dcfa7690 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 08:42:18 +0100 Subject: [PATCH 24/38] remove DType class --- .../dataframe_api/__init__.py | 37 +++++++---------- .../API_specification/dataframe_api/_types.py | 4 +- .../dataframe_api/column_object.py | 40 +++++++++---------- .../dataframe_api/dataframe_object.py | 10 ++--- .../dataframe_api/groupby_object.py | 2 - 5 files changed, 40 insertions(+), 53 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 5a65b756..24fd0d69 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -8,7 +8,7 @@ from .column_object import * from .dataframe_object import DataFrame from .groupby_object import * -from ._types import DTypeT +from ._types import DType __all__ = [ "DataFrame", @@ -31,7 +31,7 @@ "Float64", "Float32", "Bool", - "DTypeT", + "DType", "FloatDType", "IntDType", ] @@ -64,7 +64,7 @@ def concat(dataframes: Sequence[DataFrame]) -> DataFrame: """ ... -def column_from_sequence(sequence: Sequence[Scalar], *, dtype: DType) -> Column[DTypeT]: +def column_from_sequence(sequence: Sequence[Scalar], *, dtype: Any) -> Column[Any]: """ Construct Column from sequence of elements. @@ -149,46 +149,37 @@ def is_null(value: object, /) -> bool: # Dtypes # ########## -class DType: - """Base class for all dtypes.""" - -class IntDType(DType): - """Base class for all integer dtypes.""" - -class FloatDType(DType): - """Base class for all float dtypes.""" - -class Int64(IntDType): +class Int64: """Integer type with 64 bits of precision.""" -class Int32(IntDType): +class Int32: """Integer type with 32 bits of precision.""" -class Int16(IntDType): +class Int16: """Integer type with 16 bits of precision.""" -class Int8(IntDType): +class Int8: """Integer type with 8 bits of precision.""" -class UInt64(IntDType): +class UInt64: """Unsigned integer type with 64 bits of precision.""" -class UInt32(IntDType): +class UInt32: """Unsigned integer type with 32 bits of precision.""" -class UInt16(IntDType): +class UInt16: """Unsigned integer type with 16 bits of precision.""" -class UInt8(IntDType): +class UInt8: """Unsigned integer type with 8 bits of precision.""" -class Float64(FloatDType): +class Float64: """Floating point type with 64 bits of precision.""" -class Float32(FloatDType): +class Float32: """Floating point type with 32 bits of precision.""" -class Bool(DType): +class Bool: """Boolean type with 8 bits of precision.""" ########## diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py index f01e6163..320e1c2c 100644 --- a/spec/API_specification/dataframe_api/_types.py +++ b/spec/API_specification/dataframe_api/_types.py @@ -21,13 +21,11 @@ ) from enum import Enum -if TYPE_CHECKING: - from . 
import DType array = TypeVar("array") Scalar = TypeVar("Scalar") device = TypeVar("device") -DTypeT = TypeVar("DTypeT", bound="DType") +DType = TypeVar("DType") SupportsDLPack = TypeVar("SupportsDLPack") SupportsBufferProtocol = TypeVar("SupportsBufferProtocol") PyCapsule = TypeVar("PyCapsule") diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index d37032cf..d81fab62 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -3,14 +3,14 @@ from typing import Any,NoReturn, Sequence, TYPE_CHECKING, Literal, Generic, TypeVar if TYPE_CHECKING: - from . import DType, IntDType, FloatDType, Bool, null, Scalar + from . import Bool, null, Scalar + from ._types import DType -from ._types import DTypeT __all__ = ['Column'] -class Column(Generic[DTypeT]): +class Column(Generic[DType]): """ Column object @@ -74,12 +74,12 @@ def __iter__(self) -> NoReturn: raise NotImplementedError("'__iter__' is intentionally not implemented.") @property - def dtype(self) -> DType: + def dtype(self) -> Any: """ Return data type of column. """ - def get_rows(self, indices: Column[IntDType]) -> Column[DTypeT]: + def get_rows(self: Column[DType], indices: Column[Any]) -> Column[DType]: """ Select a subset of rows, similar to `ndarray.take`. @@ -112,7 +112,7 @@ def sorted_indices( *, ascending: bool = True, nulls_position: Literal['first', 'last'] = 'last', - ) -> Column[IntDType]: + ) -> Column[Any]: """ Return row numbers which would sort column. @@ -137,7 +137,7 @@ def sorted_indices( """ ... - def __eq__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: # type: ignore[override] + def __eq__(self, other: Column[DType] | Scalar) -> Column[Bool]: # type: ignore[override] """ Compare for equality. @@ -155,7 +155,7 @@ def __eq__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: # type: ignor Column """ - def __ne__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: # type: ignore[override] + def __ne__(self, other: Column[DType] | Scalar) -> Column[Bool]: # type: ignore[override] """ Compare for non-equality. @@ -173,7 +173,7 @@ def __ne__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: # type: ignor Column """ - def __ge__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: + def __ge__(self, other: Column[DType] | Scalar) -> Column[Bool]: """ Compare for "greater than or equal to" `other`. @@ -189,7 +189,7 @@ def __ge__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: Column """ - def __gt__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: + def __gt__(self, other: Column[DType] | Scalar) -> Column[Bool]: """ Compare for "greater than" `other`. @@ -205,7 +205,7 @@ def __gt__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: Column """ - def __le__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: + def __le__(self, other: Column[DType] | Scalar) -> Column[Bool]: """ Compare for "less than or equal to" `other`. @@ -221,7 +221,7 @@ def __le__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: Column """ - def __lt__(self, other: Column[DTypeT] | Scalar) -> Column[Bool]: + def __lt__(self, other: Column[DType] | Scalar) -> Column[Bool]: """ Compare for "less than" `other`. @@ -279,7 +279,7 @@ def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. 
""" - def __add__(self, other: Column[DTypeT] | Scalar) -> Column[DTypeT]: + def __add__(self, other: Column[DType] | Scalar) -> Column[DType]: """ Add `other` column or scalar to this column. @@ -295,7 +295,7 @@ def __add__(self, other: Column[DTypeT] | Scalar) -> Column[DTypeT]: Column """ - def __sub__(self, other: Column[DTypeT] | Scalar) -> Column[DTypeT]: + def __sub__(self, other: Column[DType] | Scalar) -> Column[DType]: """ Subtract `other` column or scalar from this column. @@ -327,7 +327,7 @@ def __mul__(self, other: Column[Any] | Scalar) -> Column[Any]: Column """ - def __truediv__(self, other: Column[Any] | Scalar) -> Column[FloatDType]: + def __truediv__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Divide this column by `other` column or scalar. True division, returns floats. @@ -343,7 +343,7 @@ def __truediv__(self, other: Column[Any] | Scalar) -> Column[FloatDType]: Column """ - def __floordiv__(self, other: Column[Any] | Scalar) -> Column[IntDType]: + def __floordiv__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Floor-divide `other` column or scalar to this column. @@ -391,7 +391,7 @@ def __mod__(self, other: Column[Any] | Scalar) -> Column[Any]: Column """ - def __divmod__(self, other: Column[Any] | Scalar) -> tuple[Column[IntDType], Column[FloatDType]]: + def __divmod__(self, other: Column[Any] | Scalar) -> tuple[Column[Any], Column[Any]]: """ Return quotient and remainder of integer division. See `divmod` builtin function. @@ -532,7 +532,7 @@ def is_nan(self) -> Column[Bool]: In particular, does not check for `np.timedelta64('NaT')`. """ - def is_in(self, values: Column[DTypeT]) -> Column[Bool]: + def is_in(self, values: Column[DType]) -> Column[Bool]: """ Indicate whether the value at each row matches any value in `values`. @@ -550,7 +550,7 @@ def is_in(self, values: Column[DTypeT]) -> Column[Bool]: Column[bool] """ - def unique_indices(self, *, skip_nulls: bool = True) -> Column[IntDType]: + def unique_indices(self, *, skip_nulls: bool = True) -> Column[Any]: """ Return indices corresponding to unique values in Column. @@ -571,7 +571,7 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Column[IntDType]: """ ... - def fill_nan(self, value: float | 'null', /) -> Column[DTypeT]: + def fill_nan(self, value: float | 'null', /) -> Column[DType]: """ Fill floating point ``nan`` values with the given fill value. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 5627cd63..1c6dd9ec 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -2,12 +2,12 @@ from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn, TypeVar, Generic -from ._types import DTypeT +from ._types import DType if TYPE_CHECKING: from .column_object import Column from .groupby_object import GroupBy - from . import DType, IntDType, FloatDType, Bool, null, Scalar + from . import Bool, null, Scalar @@ -104,7 +104,7 @@ def groupby(self, keys: Sequence[str], /) -> GroupBy: """ ... - def get_column_by_name(self, name: str, /) -> Column[DTypeT]: + def get_column_by_name(self, name: str, /) -> Column[DType]: """ Select a column by name. @@ -142,7 +142,7 @@ def get_columns_by_name(self, names: Sequence[str], /) -> DataFrame: """ ... - def get_rows(self, indices: Column[IntDType]) -> DataFrame: + def get_rows(self, indices: Column[Any]) -> DataFrame: """ Select a subset of rows, similar to `ndarray.take`. 
@@ -258,7 +258,7 @@ def sorted_indices( *, ascending: Sequence[bool] | bool = True, nulls_position: Literal['first', 'last'] = 'last', - ) -> Column[IntDType]: + ) -> Column[Any]: """ Return row numbers which would sort according to given columns. diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index 43ffa9b1..096c4e11 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -4,12 +4,10 @@ if TYPE_CHECKING: from .dataframe_object import DataFrame - from . import IntDType, DType, Bool __all__ = ['GroupBy'] -T = TypeVar('T', bound="DType") class GroupBy: From 06f8aa8e41b874bf9e15e6568e0f107802c950d7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 08:44:33 +0100 Subject: [PATCH 25/38] fixup --- spec/API_specification/dataframe_api/__init__.py | 5 ----- spec/API_specification/dataframe_api/_types.py | 1 - 2 files changed, 6 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 24fd0d69..9dbaf30a 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -19,7 +19,6 @@ "dataframe_from_dict", "is_null", "null", - "DType", "Int64", "Int32", "Int16", @@ -30,10 +29,6 @@ "UInt8", "Float64", "Float32", - "Bool", - "DType", - "FloatDType", - "IntDType", ] diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py index 320e1c2c..0d90c75e 100644 --- a/spec/API_specification/dataframe_api/_types.py +++ b/spec/API_specification/dataframe_api/_types.py @@ -23,7 +23,6 @@ array = TypeVar("array") -Scalar = TypeVar("Scalar") device = TypeVar("device") DType = TypeVar("DType") SupportsDLPack = TypeVar("SupportsDLPack") From 7a26f3a712c25ca28b131d2b58f3b2ba68d4b765 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 08:51:54 +0100 Subject: [PATCH 26/38] dont export Scalar, replace it with Any --- .../dataframe_api/__init__.py | 9 +--- .../API_specification/dataframe_api/_types.py | 2 +- .../dataframe_api/column_object.py | 48 +++++++++---------- .../dataframe_api/dataframe_object.py | 30 ++++++------ spec/design_topics/python_builtin_types.md | 2 +- 5 files changed, 42 insertions(+), 49 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 9dbaf30a..08015abb 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -59,7 +59,7 @@ def concat(dataframes: Sequence[DataFrame]) -> DataFrame: """ ... -def column_from_sequence(sequence: Sequence[Scalar], *, dtype: Any) -> Column[Any]: +def column_from_sequence(sequence: Sequence[Any], *, dtype: Any) -> Column[Any]: """ Construct Column from sequence of elements. @@ -176,10 +176,3 @@ class Float32: class Bool: """Boolean type with 8 bits of precision.""" - -########## -# Scalar # -########## - -class Scalar: - ... 
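With the placeholder `Scalar` class dropped from the top-level namespace, `column_from_sequence` above is annotated against plain Python objects. A toy sketch of that intent follows; `ToyInt64` and `toy_column_from_sequence` are hypothetical stand-ins, and whether implementations take a dtype instance or the dtype class itself is not settled here.

```python
# Hypothetical sketch of the ``column_from_sequence(sequence, *, dtype)`` intent:
# the sequence holds plain Python scalars, and ``dtype`` is whatever dtype object
# the implementation exposes (annotated only as ``Any``).
from typing import Any, List, Sequence

class ToyInt64:
    """Stand-in for an implementation's Int64 dtype object."""

def toy_column_from_sequence(sequence: Sequence[Any], *, dtype: Any) -> List[int]:
    # A real implementation returns its own Column type; a list suffices here.
    return [int(x) for x in sequence]

col = toy_column_from_sequence([1, 2, 3], dtype=ToyInt64())
assert col == [1, 2, 3]
```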
diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py index 0d90c75e..030d920c 100644 --- a/spec/API_specification/dataframe_api/_types.py +++ b/spec/API_specification/dataframe_api/_types.py @@ -21,8 +21,8 @@ ) from enum import Enum - array = TypeVar("array") +Scalar = TypeVar("Scalar") device = TypeVar("device") DType = TypeVar("DType") SupportsDLPack = TypeVar("SupportsDLPack") diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index d81fab62..9da43f74 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -3,7 +3,7 @@ from typing import Any,NoReturn, Sequence, TYPE_CHECKING, Literal, Generic, TypeVar if TYPE_CHECKING: - from . import Bool, null, Scalar + from . import Bool, null from ._types import DType @@ -90,7 +90,7 @@ def get_rows(self: Column[DType], indices: Column[Any]) -> Column[DType]: """ ... - def get_value(self, row_number: int) -> Scalar: + def get_value(self, row_number: int) -> Any: """ Select the value at a row number, similar to `ndarray.__getitem__()`. @@ -137,7 +137,7 @@ def sorted_indices( """ ... - def __eq__(self, other: Column[DType] | Scalar) -> Column[Bool]: # type: ignore[override] + def __eq__(self, other: Column[DType] | Any) -> Column[Bool]: # type: ignore[override] """ Compare for equality. @@ -155,7 +155,7 @@ def __eq__(self, other: Column[DType] | Scalar) -> Column[Bool]: # type: ignore Column """ - def __ne__(self, other: Column[DType] | Scalar) -> Column[Bool]: # type: ignore[override] + def __ne__(self, other: Column[DType] | Any) -> Column[Bool]: # type: ignore[override] """ Compare for non-equality. @@ -173,7 +173,7 @@ def __ne__(self, other: Column[DType] | Scalar) -> Column[Bool]: # type: ignore Column """ - def __ge__(self, other: Column[DType] | Scalar) -> Column[Bool]: + def __ge__(self, other: Column[DType] | Any) -> Column[Bool]: """ Compare for "greater than or equal to" `other`. @@ -189,7 +189,7 @@ def __ge__(self, other: Column[DType] | Scalar) -> Column[Bool]: Column """ - def __gt__(self, other: Column[DType] | Scalar) -> Column[Bool]: + def __gt__(self, other: Column[DType] | Any) -> Column[Bool]: """ Compare for "greater than" `other`. @@ -205,7 +205,7 @@ def __gt__(self, other: Column[DType] | Scalar) -> Column[Bool]: Column """ - def __le__(self, other: Column[DType] | Scalar) -> Column[Bool]: + def __le__(self, other: Column[DType] | Any) -> Column[Bool]: """ Compare for "less than or equal to" `other`. @@ -221,7 +221,7 @@ def __le__(self, other: Column[DType] | Scalar) -> Column[Bool]: Column """ - def __lt__(self, other: Column[DType] | Scalar) -> Column[Bool]: + def __lt__(self, other: Column[DType] | Any) -> Column[Bool]: """ Compare for "less than" `other`. @@ -279,7 +279,7 @@ def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __add__(self, other: Column[DType] | Scalar) -> Column[DType]: + def __add__(self, other: Column[DType] | Any) -> Column[DType]: """ Add `other` column or scalar to this column. @@ -295,7 +295,7 @@ def __add__(self, other: Column[DType] | Scalar) -> Column[DType]: Column """ - def __sub__(self, other: Column[DType] | Scalar) -> Column[DType]: + def __sub__(self, other: Column[DType] | Any) -> Column[DType]: """ Subtract `other` column or scalar from this column. 
@@ -311,7 +311,7 @@ def __sub__(self, other: Column[DType] | Scalar) -> Column[DType]: Column """ - def __mul__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __mul__(self, other: Column[Any] | Any) -> Column[Any]: """ Multiply `other` column or scalar with this column. @@ -327,7 +327,7 @@ def __mul__(self, other: Column[Any] | Scalar) -> Column[Any]: Column """ - def __truediv__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __truediv__(self, other: Column[Any] | Any) -> Column[Any]: """ Divide this column by `other` column or scalar. True division, returns floats. @@ -343,7 +343,7 @@ def __truediv__(self, other: Column[Any] | Scalar) -> Column[Any]: Column """ - def __floordiv__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __floordiv__(self, other: Column[Any] | Any) -> Column[Any]: """ Floor-divide `other` column or scalar to this column. @@ -359,7 +359,7 @@ def __floordiv__(self, other: Column[Any] | Scalar) -> Column[Any]: Column """ - def __pow__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __pow__(self, other: Column[Any] | Any) -> Column[Any]: """ Raise this column to the power of `other`. @@ -375,7 +375,7 @@ def __pow__(self, other: Column[Any] | Scalar) -> Column[Any]: Column """ - def __mod__(self, other: Column[Any] | Scalar) -> Column[Any]: + def __mod__(self, other: Column[Any] | Any) -> Column[Any]: """ Returns modulus of this column by `other` (`%` operator). @@ -391,7 +391,7 @@ def __mod__(self, other: Column[Any] | Scalar) -> Column[Any]: Column """ - def __divmod__(self, other: Column[Any] | Scalar) -> tuple[Column[Any], Column[Any]]: + def __divmod__(self, other: Column[Any] | Any) -> tuple[Column[Any], Column[Any]]: """ Return quotient and remainder of integer division. See `divmod` builtin function. @@ -437,32 +437,32 @@ def all(self: Column[Bool], *, skip_nulls: bool = True) -> bool: If column is not boolean. """ - def min(self, *, skip_nulls: bool = True) -> Scalar: + def min(self, *, skip_nulls: bool = True) -> Any: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def max(self, *, skip_nulls: bool = True) -> Scalar: + def max(self, *, skip_nulls: bool = True) -> Any: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def sum(self, *, skip_nulls: bool = True) -> Scalar: + def sum(self, *, skip_nulls: bool = True) -> Any: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. The returned value has the same dtype as the column. """ - def prod(self, *, skip_nulls: bool = True) -> Scalar: + def prod(self, *, skip_nulls: bool = True) -> Any: """ Reduction returns a scalar. Must be supported for numerical data types. The returned value has the same dtype as the column. """ - def median(self, *, skip_nulls: bool = True) -> Scalar: + def median(self, *, skip_nulls: bool = True) -> Any: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -470,7 +470,7 @@ def median(self, *, skip_nulls: bool = True) -> Scalar: dtypes. """ - def mean(self, *, skip_nulls: bool = True) -> Scalar: + def mean(self, *, skip_nulls: bool = True) -> Any: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. 
Returns a float for numerical data types, and @@ -478,7 +478,7 @@ def mean(self, *, skip_nulls: bool = True) -> Scalar: dtypes. """ - def std(self, *, skip_nulls: bool = True) -> Scalar: + def std(self, *, skip_nulls: bool = True) -> Any: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -486,7 +486,7 @@ def std(self, *, skip_nulls: bool = True) -> Scalar: dtypes. """ - def var(self, *, skip_nulls: bool = True) -> Scalar: + def var(self, *, skip_nulls: bool = True) -> Any: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 1c6dd9ec..04131ff8 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -7,7 +7,7 @@ if TYPE_CHECKING: from .column_object import Column from .groupby_object import GroupBy - from . import Bool, null, Scalar + from . import Bool, null @@ -293,7 +293,7 @@ def sorted_indices( """ ... - def __eq__(self, other: DataFrame | Scalar) -> DataFrame: # type: ignore[override] + def __eq__(self, other: DataFrame | Any) -> DataFrame: # type: ignore[override] """ Compare for equality. @@ -312,7 +312,7 @@ def __eq__(self, other: DataFrame | Scalar) -> DataFrame: # type: ignore[overri """ ... - def __ne__(self, other: DataFrame | Scalar) -> DataFrame: # type: ignore[override] + def __ne__(self, other: DataFrame | Any) -> DataFrame: # type: ignore[override] """ Compare for non-equality. @@ -331,7 +331,7 @@ def __ne__(self, other: DataFrame | Scalar) -> DataFrame: # type: ignore[overri """ ... - def __ge__(self, other: DataFrame | Scalar) -> DataFrame: + def __ge__(self, other: DataFrame | Any) -> DataFrame: """ Compare for "greater than or equal to" `other`. @@ -348,7 +348,7 @@ def __ge__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __gt__(self, other: DataFrame | Scalar) -> DataFrame: + def __gt__(self, other: DataFrame | Any) -> DataFrame: """ Compare for "greater than" `other`. @@ -365,7 +365,7 @@ def __gt__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __le__(self, other: DataFrame | Scalar) -> DataFrame: + def __le__(self, other: DataFrame | Any) -> DataFrame: """ Compare for "less than or equal to" `other`. @@ -382,7 +382,7 @@ def __le__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __lt__(self, other: DataFrame | Scalar) -> DataFrame: + def __lt__(self, other: DataFrame | Any) -> DataFrame: """ Compare for "less than" `other`. @@ -441,7 +441,7 @@ def __or__(self, other: DataFrame | bool) -> DataFrame: If `self` or `other` is not boolean. """ - def __add__(self, other: DataFrame | Scalar) -> DataFrame: + def __add__(self, other: DataFrame | Any) -> DataFrame: """ Add `other` dataframe or scalar to this dataframe. @@ -458,7 +458,7 @@ def __add__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __sub__(self, other: DataFrame | Scalar) -> DataFrame: + def __sub__(self, other: DataFrame | Any) -> DataFrame: """ Subtract `other` dataframe or scalar from this dataframe. @@ -475,7 +475,7 @@ def __sub__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __mul__(self, other: DataFrame | Scalar) -> DataFrame: + def __mul__(self, other: DataFrame | Any) -> DataFrame: """ Multiply `other` dataframe or scalar with this dataframe. 
@@ -492,7 +492,7 @@ def __mul__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __truediv__(self, other: DataFrame | Scalar) -> DataFrame: + def __truediv__(self, other: DataFrame | Any) -> DataFrame: """ Divide this dataframe by `other` dataframe or scalar. True division, returns floats. @@ -509,7 +509,7 @@ def __truediv__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __floordiv__(self, other: DataFrame | Scalar) -> DataFrame: + def __floordiv__(self, other: DataFrame | Any) -> DataFrame: """ Floor-divide (returns integers) this dataframe by `other` dataframe or scalar. @@ -526,7 +526,7 @@ def __floordiv__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __pow__(self, other: DataFrame | Scalar) -> DataFrame: + def __pow__(self, other: DataFrame | Any) -> DataFrame: """ Raise this dataframe to the power of `other`. @@ -543,7 +543,7 @@ def __pow__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __mod__(self, other: DataFrame | Scalar) -> DataFrame: + def __mod__(self, other: DataFrame | Any) -> DataFrame: """ Return modulus of this dataframe by `other` (`%` operator). @@ -560,7 +560,7 @@ def __mod__(self, other: DataFrame | Scalar) -> DataFrame: """ ... - def __divmod__(self, other: DataFrame | Scalar) -> tuple[DataFrame, DataFrame]: + def __divmod__(self, other: DataFrame | Any) -> tuple[DataFrame, DataFrame]: """ Return quotient and remainder of integer division. See `divmod` builtin function. diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 567baca3..1c78dd9a 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -12,7 +12,7 @@ the `float` it is documented to return, in combination with the `__gt__` method ```python class DataFrame: - def __gt__(self, other: DataFrame | Scalar) -> DataFrame: + def __gt__(self, other: DataFrame | Any) -> DataFrame: ... def get_column_by_name(self, name: str, /) -> Column: ... From 56c62935c953c5928d22fd05ccc7d00431a0eb7c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 08:53:31 +0100 Subject: [PATCH 27/38] make self: Column[DType] explicit --- .../dataframe_api/column_object.py | 20 +++++++++---------- .../dataframe_api/dataframe_object.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 9da43f74..5231bcec 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -137,7 +137,7 @@ def sorted_indices( """ ... - def __eq__(self, other: Column[DType] | Any) -> Column[Bool]: # type: ignore[override] + def __eq__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: # type: ignore[override] """ Compare for equality. @@ -155,7 +155,7 @@ def __eq__(self, other: Column[DType] | Any) -> Column[Bool]: # type: ignore[ov Column """ - def __ne__(self, other: Column[DType] | Any) -> Column[Bool]: # type: ignore[override] + def __ne__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: # type: ignore[override] """ Compare for non-equality. 
@@ -173,7 +173,7 @@ def __ne__(self, other: Column[DType] | Any) -> Column[Bool]: # type: ignore[ov Column """ - def __ge__(self, other: Column[DType] | Any) -> Column[Bool]: + def __ge__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: """ Compare for "greater than or equal to" `other`. @@ -189,7 +189,7 @@ def __ge__(self, other: Column[DType] | Any) -> Column[Bool]: Column """ - def __gt__(self, other: Column[DType] | Any) -> Column[Bool]: + def __gt__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: """ Compare for "greater than" `other`. @@ -205,7 +205,7 @@ def __gt__(self, other: Column[DType] | Any) -> Column[Bool]: Column """ - def __le__(self, other: Column[DType] | Any) -> Column[Bool]: + def __le__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: """ Compare for "less than or equal to" `other`. @@ -221,7 +221,7 @@ def __le__(self, other: Column[DType] | Any) -> Column[Bool]: Column """ - def __lt__(self, other: Column[DType] | Any) -> Column[Bool]: + def __lt__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: """ Compare for "less than" `other`. @@ -279,7 +279,7 @@ def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __add__(self, other: Column[DType] | Any) -> Column[DType]: + def __add__(self: Column[DType], other: Column[DType] | Any) -> Column[DType]: """ Add `other` column or scalar to this column. @@ -295,7 +295,7 @@ def __add__(self, other: Column[DType] | Any) -> Column[DType]: Column """ - def __sub__(self, other: Column[DType] | Any) -> Column[DType]: + def __sub__(self: Column[DType], other: Column[DType] | Any) -> Column[DType]: """ Subtract `other` column or scalar from this column. @@ -532,7 +532,7 @@ def is_nan(self) -> Column[Bool]: In particular, does not check for `np.timedelta64('NaT')`. """ - def is_in(self, values: Column[DType]) -> Column[Bool]: + def is_in(self: Column[DType], values: Column[DType]) -> Column[Bool]: """ Indicate whether the value at each row matches any value in `values`. @@ -571,7 +571,7 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Column[Any]: """ ... - def fill_nan(self, value: float | 'null', /) -> Column[DType]: + def fill_nan(self: Column[DType], value: float | 'null', /) -> Column[DType]: """ Fill floating point ``nan`` values with the given fill value. diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 04131ff8..ac0e0373 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -104,7 +104,7 @@ def groupby(self, keys: Sequence[str], /) -> GroupBy: """ ... - def get_column_by_name(self, name: str, /) -> Column[DType]: + def get_column_by_name(self, name: str, /) -> Column[Any]: """ Select a column by name. 
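A short sketch of what the explicit `self: Column[DType]` annotations above express: the dtype parameter of `self` is tied to the parameters of the other arguments, so a checker can flag mismatches. `ToyColumn` below is illustrative only.

```python
# Minimal, illustrative sketch (``ToyColumn`` is not the standard's Column):
# annotating ``self`` with the class's type variable ties the dtype parameter of
# ``values`` to the dtype parameter of ``self``.
from __future__ import annotations
from typing import Generic, TypeVar

DType = TypeVar("DType")

class ToyColumn(Generic[DType]):
    def is_in(self: ToyColumn[DType], values: ToyColumn[DType]) -> ToyColumn[bool]:
        # Placeholder body -- only the annotations matter for this sketch.
        return ToyColumn()

ints: ToyColumn[int] = ToyColumn()
floats: ToyColumn[float] = ToyColumn()

mask = ints.is_in(ints)     # OK: both sides share DType = int
# ints.is_in(floats)        # a type checker reports a dtype-parameter mismatch
```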
From 314ab42fb7f1626f47ab4ca974c065367e2b6167 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 08:57:24 +0100 Subject: [PATCH 28/38] preserve Column[int] in docs --- spec/API_specification/dataframe_api/column_object.py | 6 +++--- spec/API_specification/dataframe_api/dataframe_object.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 5231bcec..48659618 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -85,7 +85,7 @@ def get_rows(self: Column[DType], indices: Column[Any]) -> Column[DType]: Parameters ---------- - indices : Column[IntDType] + indices : Column[int] Positions of rows to select. """ ... @@ -133,7 +133,7 @@ def sorted_indices( Returns ------- - Column[IntDType] + Column[int] """ ... @@ -556,7 +556,7 @@ def unique_indices(self, *, skip_nulls: bool = True) -> Column[Any]: Returns ------- - Column[IntDType] + Column[int] Indices corresponding to unique values. Notes diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index ac0e0373..18d4e8dd 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -148,7 +148,7 @@ def get_rows(self, indices: Column[Any]) -> DataFrame: Parameters ---------- - indices : Column[IntDType] + indices : Column[int] Positions of rows to select. Returns @@ -284,7 +284,7 @@ def sorted_indices( Returns ------- - Column[IntDType] + Column[int] Raises ------ From 4ddbae123beddf3af0e882df09615b991be161e4 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 09:02:46 +0100 Subject: [PATCH 29/38] simplify further --- spec/API_specification/dataframe_api/column_object.py | 4 ++-- spec/conf.py | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 48659618..8f33de48 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -279,7 +279,7 @@ def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __add__(self: Column[DType], other: Column[DType] | Any) -> Column[DType]: + def __add__(self: Column[Any], other: Column[Any] | Any) -> Column[Any]: """ Add `other` column or scalar to this column. @@ -295,7 +295,7 @@ def __add__(self: Column[DType], other: Column[DType] | Any) -> Column[DType]: Column """ - def __sub__(self: Column[DType], other: Column[DType] | Any) -> Column[DType]: + def __sub__(self: Column[Any], other: Column[Any] | Any) -> Column[Any]: """ Subtract `other` column or scalar from this column. diff --git a/spec/conf.py b/spec/conf.py index da73887b..86ae0c06 100644 --- a/spec/conf.py +++ b/spec/conf.py @@ -82,10 +82,7 @@ ('py:class', 'enum.Enum'), ('py:class', 'ellipsis'), ('py:class', 'Scalar'), - ('py:class', 'DTypeT'), ('py:class', 'Bool'), - ('py:class', 'IntDType'), - ('py:class', 'FloatDType'), ] # NOTE: this alias handling isn't used yet - added in anticipation of future # need based on dataframe API aliases. 
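The docstrings above keep `Column[int]` as the description of row indices even though the annotations are now `Column[Any]`. A plain-Python illustration of the contract they describe, using lists rather than the standard's types:

```python
# Plain-Python illustration (lists, not the standard's Column) of the contract in
# the docstrings above: ``sorted_indices`` yields the integer row positions that
# would sort the column, and ``get_rows`` takes rows by position, like
# ``ndarray.take``.
data = [30, 10, 20]

sorted_idx = sorted(range(len(data)), key=data.__getitem__)   # ~ sorted_indices()
assert sorted_idx == [1, 2, 0]

taken = [data[i] for i in sorted_idx]                         # ~ get_rows(sorted_idx)
assert taken == [10, 20, 30]
```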
From ea7a81a5dc6bd6000e0e63cc00778643eeeefd83 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 09:07:32 +0100 Subject: [PATCH 30/38] reduce diff --- spec/API_specification/dataframe_api/__init__.py | 1 + spec/API_specification/dataframe_api/dataframe_object.py | 2 +- spec/API_specification/dataframe_api/groupby_object.py | 1 - spec/API_specification/index.rst | 4 ---- 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 08015abb..6e9b3e7a 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -29,6 +29,7 @@ "UInt8", "Float64", "Float32", + "Bool", ] diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 18d4e8dd..1ffc7988 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -577,7 +577,7 @@ def __divmod__(self, other: DataFrame | Any) -> tuple[DataFrame, DataFrame]: """ ... - def __invert__(self: DataFrame) -> DataFrame: + def __invert__(self) -> DataFrame: """ Invert truthiness of (boolean) elements. diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index 096c4e11..5a4d2a73 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -9,7 +9,6 @@ __all__ = ['GroupBy'] - class GroupBy: """ GroupBy object. diff --git a/spec/API_specification/index.rst b/spec/API_specification/index.rst index 165266a4..b90d3320 100644 --- a/spec/API_specification/index.rst +++ b/spec/API_specification/index.rst @@ -16,7 +16,6 @@ of objects and functions in the top-level namespace. The latter are: __dataframe_api_version__ is_null null - DType Int64 Int32 Int16 @@ -28,9 +27,6 @@ of objects and functions in the top-level namespace. 
The latter are: Float64 Float32 Bool - DTypeT - IntDType - FloatDType The ``DataFrame``, ``Column`` and ``GroupBy`` objects have the following methods and attributes: From 4a740b931b382db7868146257665224004d6e4cd Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 09:14:17 +0100 Subject: [PATCH 31/38] reduce diff --- spec/API_specification/dataframe_api/_types.py | 1 - spec/API_specification/dataframe_api/dataframe_object.py | 5 +---- spec/API_specification/dataframe_api/groupby_object.py | 2 +- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py index 030d920c..2874ba4c 100644 --- a/spec/API_specification/dataframe_api/_types.py +++ b/spec/API_specification/dataframe_api/_types.py @@ -17,7 +17,6 @@ TypeVar, Union, Protocol, - TYPE_CHECKING, ) from enum import Enum diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 1ffc7988..759f602f 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -1,8 +1,6 @@ from __future__ import annotations -from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn, TypeVar, Generic - -from ._types import DType +from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn if TYPE_CHECKING: from .column_object import Column @@ -10,7 +8,6 @@ from . import Bool, null - __all__ = ["DataFrame"] diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index 5a4d2a73..8787edf2 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import TYPE_CHECKING, TypeVar, Generic, Any +from typing import TYPE_CHECKING if TYPE_CHECKING: from .dataframe_object import DataFrame From d6a6e87fde416024a568d609f0c3d48120f9745c Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 09:23:39 +0100 Subject: [PATCH 32/38] get docs building again --- spec/API_specification/dataframe_api/__init__.py | 2 +- spec/API_specification/dataframe_api/column_object.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 6e9b3e7a..9071939c 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -13,7 +13,7 @@ __all__ = [ "DataFrame", "Column", - "__dataframe_api_version", + "__dataframe_api_version__", "column_from_sequence", "concat", "dataframe_from_dict", diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 8f33de48..06c9a5b7 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -2,9 +2,10 @@ from typing import Any,NoReturn, Sequence, TYPE_CHECKING, Literal, Generic, TypeVar +from ._types import DType + if TYPE_CHECKING: from . 
import Bool, null - from ._types import DType __all__ = ['Column'] From 526a5d7cc33ab4bfd1a4488de4af77a6becd44e7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 29 Jun 2023 09:35:51 +0100 Subject: [PATCH 33/38] further reduce diff --- spec/API_specification/dataframe_api/__init__.py | 4 ++-- spec/API_specification/dataframe_api/column_object.py | 2 +- spec/API_specification/dataframe_api/dataframe_object.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/spec/API_specification/dataframe_api/__init__.py b/spec/API_specification/dataframe_api/__init__.py index 9071939c..ed664363 100644 --- a/spec/API_specification/dataframe_api/__init__.py +++ b/spec/API_specification/dataframe_api/__init__.py @@ -3,7 +3,7 @@ """ from __future__ import annotations -from typing import Mapping, Sequence, Any, Generic, TypeVar +from typing import Mapping, Sequence, Any from .column_object import * from .dataframe_object import DataFrame @@ -11,9 +11,9 @@ from ._types import DType __all__ = [ + "__dataframe_api_version__", "DataFrame", "Column", - "__dataframe_api_version__", "column_from_sequence", "concat", "dataframe_from_dict", diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 06c9a5b7..3880406d 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any,NoReturn, Sequence, TYPE_CHECKING, Literal, Generic, TypeVar +from typing import Any,NoReturn, Sequence, TYPE_CHECKING, Literal, Generic from ._types import DType diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 759f602f..f2afaf83 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -2,6 +2,7 @@ from typing import Any, Literal, Mapping, Sequence, Union, TYPE_CHECKING, NoReturn + if TYPE_CHECKING: from .column_object import Column from .groupby_object import GroupBy From 5486e22ea916a8e8298dde5e0c3647c3a8494660 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 7 Jul 2023 13:30:09 +0100 Subject: [PATCH 34/38] introduce Scalar type alias --- .../API_specification/dataframe_api/_types.py | 5 +++- .../dataframe_api/column_object.py | 21 +++++++------- .../dataframe_api/dataframe_object.py | 29 ++++++++++--------- spec/design_topics/python_builtin_types.md | 2 +- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/spec/API_specification/dataframe_api/_types.py b/spec/API_specification/dataframe_api/_types.py index 2874ba4c..dde7795a 100644 --- a/spec/API_specification/dataframe_api/_types.py +++ b/spec/API_specification/dataframe_api/_types.py @@ -20,8 +20,11 @@ ) from enum import Enum +# Type alias: Mypy needs Any, but for readability we need to make clear this +# is a Python scalar (i.e., an instance of `bool`, `int`, `float`, `str`, etc.) 
+Scalar = Any + array = TypeVar("array") -Scalar = TypeVar("Scalar") device = TypeVar("device") DType = TypeVar("DType") SupportsDLPack = TypeVar("SupportsDLPack") diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index f394a7f8..7d8300b8 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -6,6 +6,7 @@ if TYPE_CHECKING: from . import Bool, null + from ._types import Scalar __all__ = ['Column'] @@ -91,7 +92,7 @@ def get_rows(self: Column[DType], indices: Column[Any]) -> Column[DType]: """ ... - def get_value(self, row_number: int) -> Any: + def get_value(self, row_number: int) -> Scalar: """ Select the value at a row number, similar to `ndarray.__getitem__()`. @@ -138,7 +139,7 @@ def sorted_indices( """ ... - def __eq__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: # type: ignore[override] + def __eq__(self, other: Column[Any] | Scalar) -> Column[Bool]: # type: ignore[override] """ Compare for equality. @@ -280,7 +281,7 @@ def __or__(self: Column[Bool], other: Column[Bool] | bool) -> Column[Bool]: If `self` or `other` is not boolean. """ - def __add__(self: Column[Any], other: Column[Any] | Any) -> Column[Any]: + def __add__(self: Column[Any], other: Column[Any] | Scalar) -> Column[Any]: """ Add `other` column or scalar to this column. @@ -296,7 +297,7 @@ def __add__(self: Column[Any], other: Column[Any] | Any) -> Column[Any]: Column """ - def __sub__(self: Column[Any], other: Column[Any] | Any) -> Column[Any]: + def __sub__(self: Column[Any], other: Column[Any] | Scalar) -> Column[Any]: """ Subtract `other` column or scalar from this column. @@ -312,7 +313,7 @@ def __sub__(self: Column[Any], other: Column[Any] | Any) -> Column[Any]: Column """ - def __mul__(self, other: Column[Any] | Any) -> Column[Any]: + def __mul__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Multiply `other` column or scalar with this column. @@ -328,7 +329,7 @@ def __mul__(self, other: Column[Any] | Any) -> Column[Any]: Column """ - def __truediv__(self, other: Column[Any] | Any) -> Column[Any]: + def __truediv__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Divide this column by `other` column or scalar. True division, returns floats. @@ -344,7 +345,7 @@ def __truediv__(self, other: Column[Any] | Any) -> Column[Any]: Column """ - def __floordiv__(self, other: Column[Any] | Any) -> Column[Any]: + def __floordiv__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Floor-divide `other` column or scalar to this column. @@ -360,7 +361,7 @@ def __floordiv__(self, other: Column[Any] | Any) -> Column[Any]: Column """ - def __pow__(self, other: Column[Any] | Any) -> Column[Any]: + def __pow__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Raise this column to the power of `other`. @@ -380,7 +381,7 @@ def __pow__(self, other: Column[Any] | Any) -> Column[Any]: Column """ - def __mod__(self, other: Column[Any] | Any) -> Column[Any]: + def __mod__(self, other: Column[Any] | Scalar) -> Column[Any]: """ Returns modulus of this column by `other` (`%` operator). @@ -396,7 +397,7 @@ def __mod__(self, other: Column[Any] | Any) -> Column[Any]: Column """ - def __divmod__(self, other: Column[Any] | Any) -> tuple[Column[Any], Column[Any]]: + def __divmod__(self, other: Column[Any] | Scalar) -> tuple[Column[Any], Column[Any]]: """ Return quotient and remainder of integer division. See `divmod` builtin function. 
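The `Scalar` alias used above is just `Any`, so it documents intent ("this argument or return value is a plain Python scalar") without changing what a type checker accepts. A small sketch with a hypothetical helper follows; `toy_fill_null` is not the standard's `Column.fill_null`, and missing values are represented by `None` purely for illustration.

```python
# Sketch of how the ``Scalar = Any`` alias reads in a signature: purely a
# readability aid, with no runtime or type-checking effect beyond ``Any``.
from typing import Any, List

Scalar = Any  # same alias as in ``_types.py``

def toy_fill_null(values: List[object], fill: Scalar) -> List[object]:
    """Replace missing entries (represented here by ``None``) with ``fill``."""
    return [fill if v is None else v for v in values]

assert toy_fill_null([1, None, 3], 0) == [1, 0, 3]
```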
diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index dbc84ac4..827d0f4c 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -7,6 +7,7 @@ from .column_object import Column from .groupby_object import GroupBy from . import Bool, null + from ._types import Scalar __all__ = ["DataFrame"] @@ -291,7 +292,7 @@ def sorted_indices( """ ... - def __eq__(self, other: DataFrame | Any) -> DataFrame: # type: ignore[override] + def __eq__(self, other: DataFrame | Scalar) -> DataFrame: # type: ignore[override] """ Compare for equality. @@ -310,7 +311,7 @@ def __eq__(self, other: DataFrame | Any) -> DataFrame: # type: ignore[override] """ ... - def __ne__(self, other: DataFrame | Any) -> DataFrame: # type: ignore[override] + def __ne__(self, other: DataFrame | Scalar) -> DataFrame: # type: ignore[override] """ Compare for non-equality. @@ -329,7 +330,7 @@ def __ne__(self, other: DataFrame | Any) -> DataFrame: # type: ignore[override] """ ... - def __ge__(self, other: DataFrame | Any) -> DataFrame: + def __ge__(self, other: DataFrame | Scalar) -> DataFrame: """ Compare for "greater than or equal to" `other`. @@ -346,7 +347,7 @@ def __ge__(self, other: DataFrame | Any) -> DataFrame: """ ... - def __gt__(self, other: DataFrame | Any) -> DataFrame: + def __gt__(self, other: DataFrame | Scalar) -> DataFrame: """ Compare for "greater than" `other`. @@ -363,7 +364,7 @@ def __gt__(self, other: DataFrame | Any) -> DataFrame: """ ... - def __le__(self, other: DataFrame | Any) -> DataFrame: + def __le__(self, other: DataFrame | Scalar) -> DataFrame: """ Compare for "less than or equal to" `other`. @@ -380,7 +381,7 @@ def __le__(self, other: DataFrame | Any) -> DataFrame: """ ... - def __lt__(self, other: DataFrame | Any) -> DataFrame: + def __lt__(self, other: DataFrame | Scalar) -> DataFrame: """ Compare for "less than" `other`. @@ -439,7 +440,7 @@ def __or__(self, other: DataFrame | bool) -> DataFrame: If `self` or `other` is not boolean. """ - def __add__(self, other: DataFrame | Any) -> DataFrame: + def __add__(self, other: DataFrame | Scalar) -> DataFrame: """ Add `other` dataframe or scalar to this dataframe. @@ -456,7 +457,7 @@ def __add__(self, other: DataFrame | Any) -> DataFrame: """ ... - def __sub__(self, other: DataFrame | Any) -> DataFrame: + def __sub__(self, other: DataFrame | Scalar) -> DataFrame: """ Subtract `other` dataframe or scalar from this dataframe. @@ -473,7 +474,7 @@ def __sub__(self, other: DataFrame | Any) -> DataFrame: """ ... - def __mul__(self, other: DataFrame | Any) -> DataFrame: + def __mul__(self, other: DataFrame | Scalar) -> DataFrame: """ Multiply `other` dataframe or scalar with this dataframe. @@ -490,7 +491,7 @@ def __mul__(self, other: DataFrame | Any) -> DataFrame: """ ... - def __truediv__(self, other: DataFrame | Any) -> DataFrame: + def __truediv__(self, other: DataFrame | Scalar) -> DataFrame: """ Divide this dataframe by `other` dataframe or scalar. True division, returns floats. @@ -507,7 +508,7 @@ def __truediv__(self, other: DataFrame | Any) -> DataFrame: """ ... - def __floordiv__(self, other: DataFrame | Any) -> DataFrame: + def __floordiv__(self, other: DataFrame | Scalar) -> DataFrame: """ Floor-divide (returns integers) this dataframe by `other` dataframe or scalar. @@ -524,7 +525,7 @@ def __floordiv__(self, other: DataFrame | Any) -> DataFrame: """ ... 
- def __pow__(self, other: DataFrame | Any) -> DataFrame: + def __pow__(self, other: DataFrame | Scalar) -> DataFrame: """ Raise this dataframe to the power of `other`. @@ -545,7 +546,7 @@ def __pow__(self, other: DataFrame | Any) -> DataFrame: """ ... - def __mod__(self, other: DataFrame | Any) -> DataFrame: + def __mod__(self, other: DataFrame | Scalar) -> DataFrame: """ Return modulus of this dataframe by `other` (`%` operator). @@ -562,7 +563,7 @@ def __mod__(self, other: DataFrame | Any) -> DataFrame: """ ... - def __divmod__(self, other: DataFrame | Any) -> tuple[DataFrame, DataFrame]: + def __divmod__(self, other: DataFrame | Scalar) -> tuple[DataFrame, DataFrame]: """ Return quotient and remainder of integer division. See `divmod` builtin function. diff --git a/spec/design_topics/python_builtin_types.md b/spec/design_topics/python_builtin_types.md index 1c78dd9a..567baca3 100644 --- a/spec/design_topics/python_builtin_types.md +++ b/spec/design_topics/python_builtin_types.md @@ -12,7 +12,7 @@ the `float` it is documented to return, in combination with the `__gt__` method ```python class DataFrame: - def __gt__(self, other: DataFrame | Any) -> DataFrame: + def __gt__(self, other: DataFrame | Scalar) -> DataFrame: ... def get_column_by_name(self, name: str, /) -> Column: ... From 6a8a428bfc591bba52480ef466820edd3ca955fd Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 7 Jul 2023 13:33:24 +0100 Subject: [PATCH 35/38] fixup --- .../API_specification/dataframe_api/groupby_object.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/spec/API_specification/dataframe_api/groupby_object.py b/spec/API_specification/dataframe_api/groupby_object.py index 9e41631a..c020be9d 100644 --- a/spec/API_specification/dataframe_api/groupby_object.py +++ b/spec/API_specification/dataframe_api/groupby_object.py @@ -43,17 +43,10 @@ def median(self, *, skip_nulls: bool = True) -> DataFrame: def mean(self, *, skip_nulls: bool = True) -> DataFrame: ... -<<<<<<< HEAD - def std(self, *, skip_nulls: bool = True) -> DataFrame: + def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> DataFrame: ... - def var(self, *, skip_nulls: bool = True) -> DataFrame: -======= - def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> "DataFrame": - ... - - def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> "DataFrame": ->>>>>>> upstream/main + def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> DataFrame: ... def size(self) -> DataFrame: From e2d3068cc390a77d55de330dcd90e1006cced393 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 7 Jul 2023 13:34:45 +0100 Subject: [PATCH 36/38] fixup mypy --- spec/API_specification/dataframe_api/column_object.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 7d8300b8..674d6d4f 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -527,26 +527,26 @@ def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Any: Whether to skip null values. """ - def cumulative_max(self) -> Column: + def cumulative_max(self: Column[DType]) -> Column[DType]: """ Reduction returns a Column. Any data type that supports comparisons must be supported. 
The returned value has the same dtype as the column. """ - def cumulative_min(self) -> Column: + def cumulative_min(self: Column[DType]) -> Column[DType]: """ Reduction returns a Column. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def cumulative_sum(self) -> Column: + def cumulative_sum(self: Column[DType]) -> Column[DType]: """ Reduction returns a Column. Must be supported for numerical and datetime data types. The returned value has the same dtype as the column. """ - def cumulative_prod(self) -> Column: + def cumulative_prod(self: Column[DType]) -> Column[DType]: """ Reduction returns a Column. Must be supported for numerical and datetime data types. The returned value has the same dtype as the @@ -644,7 +644,7 @@ def fill_nan(self: Column[DType], value: float | 'null', /) -> Column[DType]: """ ... - def fill_null(self, value: Scalar, /) -> Column: + def fill_null(self: Column[DType], value: Scalar, /) -> Column[DType]: """ Fill null values with the given fill value. From 59140c28d0d7af73d7dad1a6096315a9b7f57887 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 7 Jul 2023 13:36:13 +0100 Subject: [PATCH 37/38] reduce diff --- spec/API_specification/dataframe_api/column_object.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 674d6d4f..c4d7656b 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -157,7 +157,7 @@ def __eq__(self, other: Column[Any] | Scalar) -> Column[Bool]: # type: ignore[o Column """ - def __ne__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: # type: ignore[override] + def __ne__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: # type: ignore[override] """ Compare for non-equality. @@ -175,7 +175,7 @@ def __ne__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: # Column """ - def __ge__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: + def __ge__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: """ Compare for "greater than or equal to" `other`. @@ -191,7 +191,7 @@ def __ge__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: Column """ - def __gt__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: + def __gt__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: """ Compare for "greater than" `other`. @@ -207,7 +207,7 @@ def __gt__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: Column """ - def __le__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: + def __le__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: """ Compare for "less than or equal to" `other`. @@ -223,7 +223,7 @@ def __le__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: Column """ - def __lt__(self: Column[DType], other: Column[DType] | Any) -> Column[Bool]: + def __lt__(self: Column[DType], other: Column[DType] | Scalar) -> Column[Bool]: """ Compare for "less than" `other`. 
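Patches 36 and 37 above tighten the generic annotations: the cumulative methods take `self: Column[DType]` and return `Column[DType]`, and the comparisons accept `Column[DType] | Scalar`, so the element type stays visible to type checkers while scalars remain accepted. A hedged sketch of the dtype-preserving pattern follows, assuming a toy generic, list-backed `Column` rather than any particular implementation.

```python
from __future__ import annotations

from itertools import accumulate
from typing import Any, Generic, TypeVar

DType = TypeVar("DType")


class Column(Generic[DType]):
    """Toy generic column; only here to show the dtype-preserving annotations."""

    def __init__(self, values: list[Any]) -> None:
        self._values = values

    def cumulative_sum(self: Column[DType]) -> Column[DType]:
        # Running total; annotating `self` carries the DType parameter through,
        # so a Column[Int64] input is still seen as Column[Int64] on the way out.
        return Column(list(accumulate(self._values)))

    def cumulative_max(self: Column[DType]) -> Column[DType]:
        # Running maximum; same dtype-preservation idea.
        return Column(list(accumulate(self._values, max)))
```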
From 9fd840a2cc2e3070553e6650c6e861dbd1343442 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Fri, 7 Jul 2023 13:37:28 +0100 Subject: [PATCH 38/38] fix return types; --- .../dataframe_api/column_object.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index c4d7656b..ffcaca31 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -443,32 +443,32 @@ def all(self: Column[Bool], *, skip_nulls: bool = True) -> bool: If column is not boolean. """ - def min(self, *, skip_nulls: bool = True) -> Any: + def min(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def max(self, *, skip_nulls: bool = True) -> Any: + def max(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Any data type that supports comparisons must be supported. The returned value has the same dtype as the column. """ - def sum(self, *, skip_nulls: bool = True) -> Any: + def sum(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. The returned value has the same dtype as the column. """ - def prod(self, *, skip_nulls: bool = True) -> Any: + def prod(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical data types. The returned value has the same dtype as the column. """ - def median(self, *, skip_nulls: bool = True) -> Any: + def median(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -476,7 +476,7 @@ def median(self, *, skip_nulls: bool = True) -> Any: dtypes. """ - def mean(self, *, skip_nulls: bool = True) -> Any: + def mean(self, *, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -484,7 +484,7 @@ def mean(self, *, skip_nulls: bool = True) -> Any: dtypes. """ - def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Any: + def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and @@ -510,7 +510,7 @@ def std(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Any: Whether to skip null values. """ - def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Any: + def var(self, *, correction: int | float = 1, skip_nulls: bool = True) -> Scalar: """ Reduction returns a scalar. Must be supported for numerical and datetime data types. Returns a float for numerical data types, and
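For context on the `correction` keyword that `std` and `var` gained above: assuming it follows the array API convention of a degrees-of-freedom adjustment, the sum of squared deviations is divided by `N - correction`, so the default of `1` is Bessel's correction (unbiased sample variance) and `0` gives the population variance. A small self-contained sketch of that semantics, not tied to any dataframe implementation:

```python
def var(values: list[float], *, correction: float = 1) -> float:
    # Divide by N - correction: correction=1 -> sample variance, 0 -> population.
    mean = sum(values) / len(values)
    return sum((v - mean) ** 2 for v in values) / (len(values) - correction)


data = [1.0, 2.0, 3.0, 4.0]
assert abs(var(data) - 5.0 / 3.0) < 1e-12           # N - 1 in the denominator
assert abs(var(data, correction=0) - 1.25) < 1e-12  # N in the denominator
```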