From 69fe58e22471134e9f8d7409ad04412cdd4931d7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Fri, 21 Apr 2023 18:16:44 +0100 Subject: [PATCH 1/6] add unique --- .../dataframe_api/column_object.py | 18 +++++++++++++++++- .../dataframe_api/dataframe_object.py | 19 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 31b610b7..528e94e5 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -1,2 +1,18 @@ +from __future__ import annotations + +from typing import Sequence + class Column: - pass + def unique(self) -> Column: + """ + Return a Column with a row for each unique value. + + Returns + ------- + Column + + Notes + ----- + There are no ordering guarantees. + """ + ... diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 45269423..dda5e635 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -599,3 +599,22 @@ def isnan(self) -> DataFrame: Does *not* include 'missing' or 'null' entries. """ ... + + def unique(self, keys: Sequence[str]) -> DataFrame: + """ + Return a DataFrame with a row for each unique combination of `keys`. + + Parameters + ---------- + keys : Sequence[str] + Columns to use to find unique values. + + Returns + ------- + DataFrame + + Notes + ----- + There are no ordering guarantees. + """ + ... From 0530e3269f34d4527af0d1b1c72681f9c7f87cb7 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 2 May 2023 10:44:42 +0100 Subject: [PATCH 2/6] fixup --- spec/API_specification/dataframe_api/column_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 5d39a1da..23ce8c84 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -399,7 +399,7 @@ def unique_indices(self, *, skip_nulls: bool) -> Column[int]: There are no ordering guarantees. If the original Column contains multiple `'NaN'` values, then only a single index corresponding to those values should be returned. - Likewise, for null values (if ``skip_nulls=False``). + Likewise for null values (if ``skip_nulls=False``). To get the unique values, you can do ``df.get_rows(df.unique_indices())``. """ ... From d7aaa339de7522cf36fee7431a904b1360d72dd1 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Tue, 2 May 2023 10:45:12 +0100 Subject: [PATCH 3/6] punt on DataFrame.unique for now --- .../dataframe_api/dataframe_object.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/spec/API_specification/dataframe_api/dataframe_object.py b/spec/API_specification/dataframe_api/dataframe_object.py index 64a1bee0..29dcf5fa 100644 --- a/spec/API_specification/dataframe_api/dataframe_object.py +++ b/spec/API_specification/dataframe_api/dataframe_object.py @@ -680,22 +680,3 @@ def isnan(self) -> DataFrame: In particular, does not check for `np.timedelta64('NaT')`. """ ... - - def unique(self, keys: Sequence[str]) -> DataFrame: - """ - Return a DataFrame with a row for each unique combination of `keys`. - - Parameters - ---------- - keys : Sequence[str] - Columns to use to find unique values. - - Returns - ------- - DataFrame - - Notes - ----- - There are no ordering guarantees. - """ - ... From 160c264ad4c2f4ca7314c85fe8f44dfb01f15b75 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Wed, 3 May 2023 12:24:54 +0100 Subject: [PATCH 4/6] clarify that there are really absolutely no ordering guarantees whatsoever --- spec/API_specification/dataframe_api/column_object.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 23ce8c84..9f33e388 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -396,7 +396,9 @@ def unique_indices(self, *, skip_nulls: bool) -> Column[int]: Notes ----- - There are no ordering guarantees. + There are no ordering guarantees. In particular, if there are multiple + indices corresponding to the same unique value, there is no guarantee + about which one will appear in the result. If the original Column contains multiple `'NaN'` values, then only a single index corresponding to those values should be returned. Likewise for null values (if ``skip_nulls=False``). From ab74976eddf543201048ba3791cf3d62168e84d1 Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 4 May 2023 10:21:25 +0100 Subject: [PATCH 5/6] fixup example --- spec/API_specification/dataframe_api/column_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 9f33e388..52eaabad 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -402,6 +402,6 @@ def unique_indices(self, *, skip_nulls: bool) -> Column[int]: If the original Column contains multiple `'NaN'` values, then only a single index corresponding to those values should be returned. Likewise for null values (if ``skip_nulls=False``). - To get the unique values, you can do ``df.get_rows(df.unique_indices())``. + To get the unique values, you can do ``col.get_rows(col.unique_indices())``. """ ... From c0cefa2ead9282e4756c9bddbeffd3872f996ccd Mon Sep 17 00:00:00 2001 From: MarcoGorelli <> Date: Thu, 4 May 2023 10:24:49 +0100 Subject: [PATCH 6/6] make skip_nulls default true --- spec/API_specification/dataframe_api/column_object.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/API_specification/dataframe_api/column_object.py b/spec/API_specification/dataframe_api/column_object.py index 52eaabad..924ef94a 100644 --- a/spec/API_specification/dataframe_api/column_object.py +++ b/spec/API_specification/dataframe_api/column_object.py @@ -385,7 +385,7 @@ def isnan(self) -> Column: In particular, does not check for `np.timedelta64('NaT')`. """ - def unique_indices(self, *, skip_nulls: bool) -> Column[int]: + def unique_indices(self, *, skip_nulls: bool = True) -> Column[int]: """ Return indices corresponding to unique values in Column.