Skip to content

Commit 0642028

Browse files
committed
replace column with expression
1 parent b30416b commit 0642028

File tree

6 files changed

+355
-463
lines changed

6 files changed

+355
-463
lines changed

spec/API_specification/dataframe_api/__init__.py

Lines changed: 95 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,20 @@
33
"""
44
from __future__ import annotations
55

6-
from typing import Mapping, Sequence, Any
6+
from typing import Mapping, Sequence, Any, Literal
77

8-
from .column_object import *
8+
from .expression_object import *
99
from .dataframe_object import DataFrame
1010
from .groupby_object import *
1111
from ._types import DType
1212

1313
__all__ = [
1414
"__dataframe_api_version__",
1515
"DataFrame",
16-
"Column",
17-
"column_from_sequence",
18-
"column_from_1d_array",
16+
"col",
1917
"concat",
20-
"dataframe_from_dict",
18+
"sorted_indices",
19+
"unique_indices",
2120
"dataframe_from_2d_array",
2221
"is_null",
2322
"null",
@@ -43,6 +42,21 @@
4342
implementation of the dataframe API standard.
4443
"""
4544

45+
def col(name: str) -> Expression:
46+
"""
47+
Instantiate an Expression which selects given column by name.
48+
49+
For example, to select column 'species' and then use it to filter
50+
a DataFrame, you could do:
51+
52+
.. code-block:: python
53+
54+
df: DataFrame
55+
namespace = df.__dataframe_namespace__()
56+
df.get_rows_by_mask(namespace.col('species') == 'setosa')
57+
"""
58+
...
59+
4660
def concat(dataframes: Sequence[DataFrame]) -> DataFrame:
4761
"""
4862
Concatenate DataFrames vertically.
@@ -63,104 +77,116 @@ def concat(dataframes: Sequence[DataFrame]) -> DataFrame:
6377
"""
6478
...
6579

66-
def column_from_sequence(sequence: Sequence[Any], *, dtype: Any, name: str = '', api_version: str | None = None) -> Column[Any]:
80+
def any_rowwise(keys: list[str] | None = None, *, skip_nulls: bool = True) -> Expression:
6781
"""
68-
Construct Column from sequence of elements.
82+
Reduction returns an Expression.
83+
84+
Differs from ``DataFrame.any`` in that the reduction happens
85+
for each row, rather than for each column.
6986
7087
Parameters
7188
----------
72-
sequence : Sequence[object]
73-
Sequence of elements. Each element must be of the specified
74-
``dtype``, the corresponding Python builtin scalar type, or
75-
coercible to that Python scalar type.
76-
name : str, optional
77-
Name of column.
78-
dtype : DType
79-
Dtype of result. Must be specified.
80-
api_version: str | None
81-
A string representing the version of the dataframe API specification
82-
in ``'YYYY.MM'`` form, for example, ``'2023.04'``.
83-
If it is ``None``, it should return an object corresponding to
84-
latest version of the dataframe API specification. If the given
85-
version is invalid or not implemented for the given module, an
86-
error should be raised. Default: ``None``.
89+
keys : list[str], optional
90+
Column names to consider. If `None`, all columns are considered.
8791
88-
Returns
89-
-------
90-
Column
92+
Raises
93+
------
94+
ValueError
95+
If any of the DataFrame's columns is not boolean.
9196
"""
9297
...
9398

94-
def dataframe_from_dict(data: Mapping[str, Column[Any]], *, api_version: str | None = None) -> DataFrame:
99+
def all_rowwise(keys: list[str] | None = None, *, skip_nulls: bool = True) -> Expression:
95100
"""
96-
Construct DataFrame from map of column names to Columns.
101+
Reduction returns an Expression.
102+
103+
Differs from ``DataFrame.all`` in that the reduction happens
104+
for each row, rather than for each column.
97105
98106
Parameters
99107
----------
100-
data : Mapping[str, Column]
101-
Column must be of the corresponding type of the DataFrame.
102-
For example, it is only supported to build a ``LibraryXDataFrame`` using
103-
``LibraryXColumn`` instances.
104-
api_version: str | None
105-
A string representing the version of the dataframe API specification
106-
in ``'YYYY.MM'`` form, for example, ``'2023.04'``.
107-
If it is ``None``, it should return an object corresponding to
108-
latest version of the dataframe API specification. If the given
109-
version is invalid or not implemented for the given module, an
110-
error should be raised. Default: ``None``.
108+
keys : list[str], optional
109+
Column names to consider. If `None`, all columns are considered.
111110
112-
Returns
113-
-------
114-
DataFrame
115-
116111
Raises
117112
------
118113
ValueError
119-
If any of the columns already has a name, and the corresponding key
120-
in `data` doesn't match.
121-
114+
If any of the DataFrame's columns is not boolean.
122115
"""
123116
...
124117

118+
def sorted_indices(
119+
keys: str | list[str] | None = None,
120+
*,
121+
ascending: Sequence[bool] | bool = True,
122+
nulls_position: Literal['first', 'last'] = 'last',
123+
) -> Expression:
124+
"""
125+
Return row numbers which would sort according to given columns.
126+
127+
If you need to sort the DataFrame, use :meth:`DataFrame.sort`.
125128
126-
def column_from_1d_array(array: Any, *, dtype: Any, name: str = '', api_version: str | None = None) -> Column[Any]:
129+
Parameters
130+
----------
131+
keys : str | list[str], optional
132+
Names of columns to sort by.
133+
If `None`, sort by all columns.
134+
ascending : Sequence[bool] or bool
135+
If `True`, sort by all keys in ascending order.
136+
If `False`, sort by all keys in descending order.
137+
If a sequence, it must be the same length as `keys`,
138+
and determines the direction with which to use each
139+
key to sort by.
140+
nulls_position : ``{'first', 'last'}``
141+
Whether null values should be placed at the beginning
142+
or at the end of the result.
143+
Note that the position of NaNs is unspecified and may
144+
vary based on the implementation.
145+
146+
Returns
147+
-------
148+
Expression
149+
150+
Raises
151+
------
152+
ValueError
153+
If `keys` and `ascending` are sequences of different lengths.
127154
"""
128-
Construct Column from 1D array.
155+
...
129156

130-
See `dataframe_from_2d_array` for related 2D function.
131157

132-
Only Array-API-compliant 1D arrays are supported.
133-
Cross-kind casting is undefined and may vary across implementations.
134-
Downcasting is disallowed.
158+
def unique_indices(keys: str | list[str] | None = None, *, skip_nulls: bool = True) -> Expression:
159+
"""
160+
Return indices corresponding to unique values across selected columns.
135161
136162
Parameters
137163
----------
138-
array : array
139-
array-API compliant 1D array
140-
name : str, optional
141-
Name to give columns.
142-
dtype : DType
143-
Dtype of column.
144-
api_version: str | None
145-
A string representing the version of the dataframe API specification
146-
in ``'YYYY.MM'`` form, for example, ``'2023.04'``.
147-
If it is ``None``, it should return an object corresponding to
148-
latest version of the dataframe API specification. If the given
149-
version is invalid or not implemented for the given module, an
150-
error should be raised. Default: ``None``.
164+
keys : str | list[str], optional
165+
Column names to consider when finding unique values.
166+
If `None`, all columns are considered.
151167
152168
Returns
153169
-------
154-
Column
170+
Expression
171+
Indices corresponding to unique values.
172+
173+
Notes
174+
-----
175+
There are no ordering guarantees. In particular, if there are multiple
176+
indices corresponding to the same unique value(s), there is no guarantee
177+
about which one will appear in the result.
178+
If the original column(s) contain multiple `'NaN'` values, then
179+
only a single index corresponding to those values will be returned.
180+
Likewise for null values (if ``skip_nulls=False``).
181+
To get the unique values, you can do ``df.get_rows(namespace.unique_indices(keys))``, where ``namespace = df.__dataframe_namespace__()``.
155182
"""
156183
...
157184

158-
def dataframe_from_2d_array(array: Any, *, names: Sequence[str], dtypes: Mapping[str, Any], api_version: str | None = None) -> DataFrame:
185+
186+
def dataframe_from_2d_array(array: Any, *, names: Sequence[str], dtypes: Mapping[str, Any]) -> DataFrame:
159187
"""
160188
Construct DataFrame from 2D array.
161189
162-
See `column_from_1d_array` for related 1D function.
163-
164190
Only Array-API-compliant 2D arrays are supported.
165191
Cross-kind casting is undefined and may vary across implementations.
166192
Downcasting is disallowed.

0 commit comments

Comments
 (0)