diff --git a/pandas/io/common.py b/pandas/io/common.py
index 1a9e6b472463d..dd8d97101e736 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -10,6 +10,7 @@
 from collections import defaultdict
 from collections.abc import (
     Hashable,
+    Iterable,
     Mapping,
     Sequence,
 )
@@ -26,7 +27,10 @@
 )
 import mmap
 import os
-from pathlib import Path
+from pathlib import (
+    Path,
+    PurePosixPath,
+)
 import re
 import tarfile
 from typing import (
@@ -55,6 +59,7 @@
     BaseBuffer,
     ReadCsvBuffer,
 )
+from pandas.compat import is_platform_windows
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
 from pandas.util._exceptions import find_stack_level
@@ -1282,3 +1287,130 @@ def dedup_names(
         counts[col] = cur_count + 1
 
     return names
+
+
+def _infer_protocol(path: str) -> str:
+    # Treat Windows drive letters like C:\ as local file paths
+    if is_platform_windows() and re.match(r"^[a-zA-Z]:[\\/]", path):
+        return "file"
+
+    if is_fsspec_url(path):
+        parsed = parse_url(path)
+        return parsed.scheme
+    return "file"
+
+
+def _match_file(
+    path: Path | PurePosixPath, extensions: set[str] | None, glob: str | None
+) -> bool:
+    """
+    Check if the file matches the given extensions and glob pattern.
+
+    Parameters
+    ----------
+    path : Path or PurePosixPath
+        The file path to check.
+    extensions : set[str] or None
+        A set of lowercase file extensions (including the leading dot) to
+        match against. If None, any extension matches.
+    glob : str or None
+        A glob pattern to match against. If None, any name matches.
+
+    Returns
+    -------
+    bool
+        True if the file matches the extensions and glob pattern, False otherwise.
+    """
+    return (extensions is None or path.suffix.lower() in extensions) and (
+        glob is None or path.match(glob)
+    )
+
+
+def iterdir(
+    path: FilePath | BaseBuffer,
+    extensions: str | Iterable[str] | None = None,
+    glob: str | None = None,
+) -> list[Path | PurePosixPath] | BaseBuffer:
+    """Return file paths in a directory (no nesting allowed).
+
+    Supports:
+    - Local paths (str, os.PathLike)
+    - file:// URLs
+    - Remote paths (e.g., s3://) via fsspec (if installed)
+
+    Parameters
+    ----------
+    path : FilePath or BaseBuffer
+        Path to the directory (local or remote). Buffers are returned
+        unchanged.
+    extensions : str or iterable of str, optional
+        Only return files with the given extension(s). Case-insensitive.
+        If None, all files are returned.
+    glob : str, optional
+        Only return files matching the given glob pattern.
+        If None, all files are returned.
+
+    Returns
+    -------
+    list of pathlib.Path or pathlib.PurePosixPath, or BaseBuffer
+        File paths within the directory, or the buffer that was passed in.
+
+    Raises
+    ------
+    NotADirectoryError
+        If the given path is not a directory.
+    ImportError
+        If fsspec is required but not installed.
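+
+    Examples
+    --------
+    A sketch with hypothetical files ``data/a.csv`` and ``data/b.csv``:
+
+    >>> iterdir("data/", extensions=".csv")  # doctest: +SKIP
+    [PosixPath('data/a.csv'), PosixPath('data/b.csv')]
+    >>> iterdir("data/", glob="a*")  # doctest: +SKIP
+    [PosixPath('data/a.csv')]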
+ """ + if hasattr(path, "read") or hasattr(path, "write"): + return path + + if extensions is not None: + if isinstance(extensions, str): + extensions = {extensions.lower()} + else: + extensions = {ext.lower() for ext in extensions} + + path_str = os.fspath(path) + scheme = _infer_protocol(path_str) + + if scheme == "file": + resolved_path = Path(path_str) + if resolved_path.is_file(): + if _match_file( + resolved_path, + extensions, + glob, + ): + return [resolved_path] + + result = [] + for entry in resolved_path.iterdir(): + if entry.is_file(): + if _match_file( + entry, + extensions, + glob, + ): + result.append(entry) + return result + + # Remote paths + fsspec = import_optional_dependency("fsspec", extra=scheme) + fs = fsspec.filesystem(scheme) + path_without_scheme = fsspec.core.strip_protocol(path_str) + if fs.isfile(path_without_scheme): + if _match_file( + path_without_scheme, + extensions, + glob, + ): + return [PurePosixPath(path_without_scheme)] + + result = [] + for file in fs.ls(path_without_scheme, detail=True): + if file["type"] == "file": + path_obj = PurePosixPath(file["name"]) + if _match_file( + path_obj, + extensions, + glob, + ): + result.append(path_obj) + return result diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 67193f930b4dc..4b73d290d86b7 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -55,6 +55,7 @@ from pandas.io.common import ( IOHandles, get_handle, + iterdir, stringify_path, validate_header_arg, ) @@ -73,6 +74,7 @@ if TYPE_CHECKING: from collections.abc import ( Callable, + Generator, Hashable, Iterable, Mapping, @@ -668,9 +670,23 @@ def _validate_names(names: Sequence[Hashable] | None) -> None: raise ValueError("Names should be an ordered collection.") +def _multi_file_generator( + list_of_files: list[str], kwds +) -> Generator[DataFrame] | Generator[TextFileReader]: + """Generator for multiple files.""" + for file in list_of_files: + parser = TextFileReader(file, **kwds) + + if kwds.get("chunksize", None) or kwds.get("iterator", False): + yield parser + else: + with parser: + yield parser.read(kwds.get("nrows", None)) + + def _read( filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds -) -> DataFrame | TextFileReader: +) -> DataFrame | TextFileReader | Generator[DataFrame] | Generator[TextFileReader]: """Generic reader of line files.""" # if we pass a date_format and parse_dates=False, we should not parse the # dates GH#44366 @@ -709,14 +725,25 @@ def _read( # Check for duplicates in names. _validate_names(kwds.get("names", None)) - # Create the parser. 
-    parser = TextFileReader(filepath_or_buffer, **kwds)
+    extensions = kwds.get("extensions", None)
+    glob = kwds.get("glob", None)
+    files = iterdir(filepath_or_buffer, extensions, glob)
+    if not isinstance(files, list):
+        # iterdir passes buffers through unchanged; treat as a single input
+        files = [files]
+
+    if len(files) == 0:
+        raise FileNotFoundError(
+            f"No files found in {filepath_or_buffer} "
+            f"with extension(s) {extensions} and glob pattern {glob}"
+        )
+    elif len(files) == 1:
+        parser = TextFileReader(files[0], **kwds)
 
-    if chunksize or iterator:
-        return parser
+        if chunksize or iterator:
+            return parser
 
-    with parser:
-        return parser.read(nrows)
+        with parser:
+            return parser.read(nrows)
+    else:
+        return _multi_file_generator(files, kwds)
 
 
 @overload
@@ -932,10 +959,9 @@ def read_table(
     skipfooter: int = 0,
     nrows: int | None = None,
     # NA and Missing Data Handling
-    na_values: Hashable
-    | Iterable[Hashable]
-    | Mapping[Hashable, Iterable[Hashable]]
-    | None = None,
+    na_values: (
+        Hashable | Iterable[Hashable] | Mapping[Hashable, Iterable[Hashable]] | None
+    ) = None,
     keep_default_na: bool = True,
     na_filter: bool = True,
     skip_blank_lines: bool = True,
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index a5ddda9d66e7a..dbe8fed345d2e 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -223,3 +223,32 @@ def compression_format(request):
 @pytest.fixture(params=_compression_formats_params)
 def compression_ext(request):
     return request.param[0]
+
+
+@pytest.fixture
+def local_csv_directory(tmp_path):
+    """
+    Fixture to create a directory with dummy CSV files for testing.
+    """
+    for i in range(3):
+        file_path = tmp_path / f"{i}.csv"
+        file_path.touch()
+    return tmp_path
+
+
+@pytest.fixture
+def remote_csv_directory(monkeypatch):
+    pytest.importorskip("fsspec", reason="fsspec is required for remote tests")
+
+    from fsspec.implementations.memory import MemoryFileSystem
+
+    fs = MemoryFileSystem()
+    fs.store.clear()
+
+    dir_name = "remote-bucket"
+    fs.pipe(f"{dir_name}/a.csv", b"a,b,c\n1,2,3\n")
+    fs.pipe(f"{dir_name}/b.csv", b"a,b,c\n4,5,6\n")
+    fs.pipe(f"{dir_name}/nested/ignored.csv", b"x,y,z\n")
+
+    # Route every fsspec protocol (e.g. "s3") to the in-memory filesystem
+    monkeypatch.setattr("fsspec.filesystem", lambda _: fs)
+    return f"s3://{dir_name}"
diff --git a/pandas/tests/io/parser/test_directory.py b/pandas/tests/io/parser/test_directory.py
new file mode 100644
index 0000000000000..84edc58570036
--- /dev/null
+++ b/pandas/tests/io/parser/test_directory.py
@@ -0,0 +1,37 @@
+from csv import (
+    DictWriter,
+    reader as csv_reader,
+)
+
+import pytest
+
+
+@pytest.fixture
+def directory_data():
+    return ["a", "b", "c"], [
+        {"first": {"a": 1, "b": 2, "c": 3}},
+        {"second": {"a": 4, "b": 5, "c": 6}},
+        {"third": {"a": 7, "b": 8, "c": 9}},
+    ]
+
+
+@pytest.fixture
+def directory_data_to_file(tmp_path, directory_data):
+    field_names, data_list = directory_data
+    for data in data_list:
+        file_name = next(iter(data.keys()))
+        path = tmp_path / f"{file_name}.csv"
+        with path.open("w", newline="", encoding="utf-8") as file:
+            writer = DictWriter(file, fieldnames=field_names)
+            writer.writeheader()
+            writer.writerow(data[file_name])
+    return tmp_path
+
+
+def test_directory_data(directory_data_to_file):
+    assert len(list(directory_data_to_file.iterdir())) == 3
+    for file in directory_data_to_file.iterdir():
+        with file.open(encoding="utf-8") as f:
+            reader = csv_reader(f)
+            header = next(reader)
+            assert header == ["a", "b", "c"]
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 4a5e41397b59d..7f59b715207de 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -695,3 +695,42 @@ def test_pyarrow_read_csv_datetime_dtype():
     expect = pd.DataFrame({"date": expect_data})
 
     tm.assert_frame_equal(expect, result)
+
+
+def test_iterdir_local(local_csv_directory):
+    for file in icom.iterdir(local_csv_directory):
+        assert file.is_file()
+        assert file.suffix == ".csv"
+
+
+def test_remote_csv_directory(remote_csv_directory):
+    import fsspec
+    from fsspec.implementations.memory import MemoryFileSystem
+
+    fs = fsspec.filesystem("s3")
+    assert isinstance(fs, MemoryFileSystem)
+
+    assert fs.exists("remote-bucket")
+    assert fs.isdir("remote-bucket")
+
+    files = fs.ls("remote-bucket", detail=True)
+
+    file_names = sorted(f["name"] for f in files if f["type"] == "file")
+    assert file_names == ["/remote-bucket/a.csv", "/remote-bucket/b.csv"]
+
+    dir_names = [f["name"] for f in files if f["type"] == "directory"]
+    assert "/remote-bucket/nested" in dir_names
+
+    nested_files = fs.ls("remote-bucket/nested", detail=True)
+    assert nested_files[0]["name"] == "/remote-bucket/nested/ignored.csv"
+
+
+def test_iterdir_remote(remote_csv_directory):
+    import fsspec
+
+    fs = fsspec.filesystem("s3")
+    for file in icom.iterdir(remote_csv_directory):
+        # For fsspec<2024.5.0, fs.isfile(PurePosixPath) returns False
+        assert fs.exists(str(file))
+        assert file.suffix == ".csv"
+        assert fs.isfile(str(file))
diff --git a/web/pandas/static/img/books/pandas_cookbook_3.jpeg b/web/pandas/static/img/books/pandas_cookbook_3.jpeg
new file mode 100644
index 0000000000000..cf1c27037de68
Binary files /dev/null and b/web/pandas/static/img/books/pandas_cookbook_3.jpeg differ
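A minimal end-to-end sketch of the behavior this diff enables, assuming
read_csv forwards the new "extensions" and "glob" keywords into kwds as
_read expects (that signature change is not shown in this diff):

    import pandas as pd

    # Hypothetical directory of CSV files. With more than one matching
    # file, _read returns a generator yielding one DataFrame per file.
    frames = pd.read_csv("data_dir/", extensions=".csv")
    combined = pd.concat(frames, ignore_index=True)

    # A single matching file behaves as before and returns a DataFrame.
    df = pd.read_csv("data_dir/first.csv")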