Skip to content

Commit 7cbf949

Browse files
authored
ENH: explicit filters parameter in pd.read_parquet (#53212)
* filters parameter in pd.read_parquet * linter * docstring validation * test for filter args in pd.read_parquet * black * addressing reviews
1 parent 67984e9 commit 7cbf949

File tree

3 files changed

+48
-3
lines changed

3 files changed

+48
-3
lines changed

doc/source/whatsnew/v2.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,7 @@ I/O
669669
^^^
670670
- :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`)
671671
- :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`)
672+
- Added ``filters`` parameter to :func:`read_parquet` to filter out data, compatible with both ``engines`` (:issue:`53212`)
672673
- Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`)
673674
- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)
674675
- Bug in :func:`read_csv`, with ``engine="pyarrow"`` erroring when specifying a ``dtype`` with ``index_col`` (:issue:`53229`)

pandas/io/parquet.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ def read(
228228
self,
229229
path,
230230
columns=None,
231+
filters=None,
231232
use_nullable_dtypes: bool = False,
232233
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
233234
storage_options: StorageOptions | None = None,
@@ -257,7 +258,11 @@ def read(
257258
)
258259
try:
259260
pa_table = self.api.parquet.read_table(
260-
path_or_handle, columns=columns, filesystem=filesystem, **kwargs
261+
path_or_handle,
262+
columns=columns,
263+
filesystem=filesystem,
264+
filters=filters,
265+
**kwargs,
261266
)
262267
result = pa_table.to_pandas(**to_pandas_kwargs)
263268

@@ -335,6 +340,7 @@ def read(
335340
self,
336341
path,
337342
columns=None,
343+
filters=None,
338344
storage_options: StorageOptions | None = None,
339345
filesystem=None,
340346
**kwargs,
@@ -375,7 +381,7 @@ def read(
375381

376382
try:
377383
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
378-
return parquet_file.to_pandas(columns=columns, **kwargs)
384+
return parquet_file.to_pandas(columns=columns, filters=filters, **kwargs)
379385
finally:
380386
if handles is not None:
381387
handles.close()
@@ -487,6 +493,7 @@ def read_parquet(
487493
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
488494
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
489495
filesystem: Any = None,
496+
filters: list[tuple] | list[list[tuple]] | None = None,
490497
**kwargs,
491498
) -> DataFrame:
492499
"""
@@ -517,7 +524,6 @@ def read_parquet(
517524
if you wish to use its implementation.
518525
columns : list, default=None
519526
If not None, only these columns will be read from the file.
520-
521527
{storage_options}
522528
523529
.. versionadded:: 1.3.0
@@ -550,6 +556,24 @@ def read_parquet(
550556
551557
.. versionadded:: 2.1.0
552558
559+
filters : List[Tuple] or List[List[Tuple]], default None
560+
To filter out data.
561+
Filter syntax: [[(column, op, val), ...],...]
562+
where op is [==, =, >, >=, <, <=, !=, in, not in]
563+
The innermost tuples are transposed into a set of filters applied
564+
through an `AND` operation.
565+
The outer list combines these sets of filters through an `OR`
566+
operation.
567+
A single list of tuples can also be used, meaning that no `OR`
568+
operation between set of filters is to be conducted.
569+
570+
Using this argument will NOT result in row-wise filtering of the final
571+
partitions unless ``engine="pyarrow"`` is also specified. For
572+
other engines, filtering is only performed at the partition level, that is,
573+
to prevent the loading of some row-groups and/or files.
574+
575+
.. versionadded:: 2.1.0
576+
553577
**kwargs
554578
Any additional kwargs are passed to the engine.
555579
@@ -632,6 +656,7 @@ def read_parquet(
632656
return impl.read(
633657
path,
634658
columns=columns,
659+
filters=filters,
635660
storage_options=storage_options,
636661
use_nullable_dtypes=use_nullable_dtypes,
637662
dtype_backend=dtype_backend,

pandas/tests/io/test_parquet.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,25 @@ def test_read_columns(self, engine):
426426
df, engine, expected=expected, read_kwargs={"columns": ["string"]}
427427
)
428428

429+
def test_read_filters(self, engine, tmp_path):
430+
df = pd.DataFrame(
431+
{
432+
"int": list(range(4)),
433+
"part": list("aabb"),
434+
}
435+
)
436+
437+
expected = pd.DataFrame({"int": [0, 1]})
438+
check_round_trip(
439+
df,
440+
engine,
441+
path=tmp_path,
442+
expected=expected,
443+
write_kwargs={"partition_cols": ["part"]},
444+
read_kwargs={"filters": [("part", "==", "a")], "columns": ["int"]},
445+
repeat=1,
446+
)
447+
429448
def test_write_index(self, engine, using_copy_on_write, request):
430449
check_names = engine != "fastparquet"
431450
if using_copy_on_write and engine == "fastparquet":

0 commit comments

Comments
 (0)