From fe18dcb01e632544424330c8d96955bcb3b59416 Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 4 Apr 2023 13:22:32 -0700
Subject: [PATCH 1/6] PERF: lazify IO imports

---
 pandas/__init__.py                 | 120 ++++++++++++++++++++---------
 pandas/api/interchange/__init__.py |  15 +++-
 pandas/util/__init__.py            |  35 ++++++---
 3 files changed, 121 insertions(+), 49 deletions(-)

diff --git a/pandas/__init__.py b/pandas/__init__.py
index 1a549c09d22f7..b93259fc39787 100644
--- a/pandas/__init__.py
+++ b/pandas/__init__.py
@@ -115,8 +115,6 @@
 from pandas.tseries.api import infer_freq
 from pandas.tseries import offsets
 
-from pandas.core.computation.api import eval
-
 from pandas.core.reshape.api import (
     concat,
     lreshape,
@@ -135,43 +133,8 @@
 )
 
 from pandas import api, arrays, errors, io, plotting, tseries
-from pandas import testing
 from pandas.util._print_versions import show_versions
 
-from pandas.io.api import (
-    # excel
-    ExcelFile,
-    ExcelWriter,
-    read_excel,
-    # parsers
-    read_csv,
-    read_fwf,
-    read_table,
-    # pickle
-    read_pickle,
-    to_pickle,
-    # pytables
-    HDFStore,
-    read_hdf,
-    # sql
-    read_sql,
-    read_sql_query,
-    read_sql_table,
-    # misc
-    read_clipboard,
-    read_parquet,
-    read_orc,
-    read_feather,
-    read_gbq,
-    read_html,
-    read_xml,
-    read_json,
-    read_stata,
-    read_sas,
-    read_spss,
-)
-
-from pandas.io.json._normalize import json_normalize
 
 from pandas.util._tester import test
 
@@ -184,6 +147,89 @@
     del get_versions, v
 
 
+def __getattr__(key: str):
+    # lazy imports to speed up 'import pandas as pd'
+    if key == "eval":
+        from pandas.core.computation.api import eval
+
+        return eval
+    elif key == "testing":
+        import pandas.testing
+
+        return pandas.testing
+
+    elif key in {
+        "ExcelFile",
+        "ExcelWriter",
+        "read_excel",
+        "read_csv",
+        "read_fwf",
+        "read_table",
+        "read_pickle",
+        "to_pickle",
+        "HDFStore",
+        "read_hdf",
+        "read_sql",
+        "read_sql_query",
+        "read_sql_table",
+        "read_clipboard",
+        "read_parquet",
+        "read_orc",
+        "read_feather",
+        "read_gbq",
+        "read_html",
+        "read_xml",
+        "read_json",
+        "read_stata",
+        "read_sas",
+        "read_spss",
+    }:
+        import pandas.io.api
+
+        return getattr(pandas.io.api, key)
+    elif key == "json_normalize":
+        from pandas.io.json._normalize import json_normalize
+
+        return json_normalize
+    raise AttributeError(f"module 'pandas' has no attribute '{key}'")
+
+
+def __dir__() -> list[str]:
+    # include lazy imports defined in __getattr__ in dir()
+    base = list(globals().keys())
+    result = (
+        base
+        + [
+            "ExcelFile",
+            "ExcelWriter",
+            "read_excel",
+            "read_csv",
+            "read_fwf",
+            "read_table",
+            "read_pickle",
+            "to_pickle",
+            "HDFStore",
+            "read_hdf",
+            "read_sql",
+            "read_sql_query",
+            "read_sql_table",
+            "read_clipboard",
+            "read_parquet",
+            "read_orc",
+            "read_feather",
+            "read_gbq",
+            "read_html",
+            "read_xml",
+            "read_json",
+            "read_stata",
+            "read_sas",
+            "read_spss",
+        ]
+        + ["eval", "json_normalize", "testing"]
+    )
+    return result
+
+
 # module level doc-string
 __doc__ = """
 pandas - a powerful data analysis and manipulation library for Python
diff --git a/pandas/api/interchange/__init__.py b/pandas/api/interchange/__init__.py
index 2f3a73bc46b31..02409c9302063 100644
--- a/pandas/api/interchange/__init__.py
+++ b/pandas/api/interchange/__init__.py
@@ -2,7 +2,18 @@
 Public API for DataFrame interchange protocol.
 """
-from pandas.core.interchange.dataframe_protocol import DataFrame
-from pandas.core.interchange.from_dataframe import from_dataframe
+
+def __getattr__(key: str):
+    # lazy imports to speed 'import pandas as pd'
+    if key == "DataFrame":
+        from pandas.core.interchange.dataframe_protocol import DataFrame
+
+        return DataFrame
+    elif key == "from_dataframe":
+        from pandas.core.interchange.from_dataframe import from_dataframe
+
+        return from_dataframe
+    raise AttributeError(key)
+
 
 __all__ = ["from_dataframe", "DataFrame"]
diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py
index aa31c024fe338..230c8a98d79cd 100644
--- a/pandas/util/__init__.py
+++ b/pandas/util/__init__.py
@@ -1,11 +1,26 @@
 # pyright: reportUnusedImport = false
-from pandas.util._decorators import (  # noqa:F401
-    Appender,
-    Substitution,
-    cache_readonly,
-)
-
-from pandas.core.util.hashing import (  # noqa:F401
-    hash_array,
-    hash_pandas_object,
-)
+
+
+def __getattr__(key: str):
+    # lazify imports to speed 'import pandas as pd'
+    if key == "Appender":
+        from pandas.util._decorators import Appender
+
+        return Appender
+    if key == "Substitution":
+        from pandas.util._decorators import Substitution
+
+        return Substitution
+    if key == "cache_readonly":
+        from pandas.util._decorators import cache_readonly
+
+        return cache_readonly
+    if key == "hash_array":
+        from pandas.core.util.hashing import hash_array
+
+        return hash_array
+    if key == "hash_pandas_object":
+        from pandas.core.util.hashing import hash_pandas_object
+
+        return hash_pandas_object
+    raise AttributeError(f"module 'pandas.util' has no attribute '{key}'")
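A note on the mechanism: patch 1 works by giving each package a module-level
__getattr__ and __dir__, the hooks standardized in PEP 562 (Python 3.7+).
A minimal self-contained sketch of the same pattern, separate from the pandas
diff (the module name "lazymod" and the "loads" attribute are illustrative
stand-ins, not part of the change):

    # lazymod.py - illustrative sketch of the PEP 562 pattern used above
    def __getattr__(key: str):
        # Called only when normal module-attribute lookup fails, so the
        # expensive import runs on first access rather than at import time.
        if key == "loads":
            from json import loads  # stand-in for a costly import

            return loads
        raise AttributeError(f"module 'lazymod' has no attribute '{key}'")


    def __dir__() -> list[str]:
        # Advertise the lazy attribute so dir(lazymod) and tab completion see it.
        return sorted([*globals().keys(), "loads"])

The first access to lazymod.loads triggers the inner import; later accesses
call __getattr__ again, but the import itself is cached in sys.modules, so
only the attribute dispatch is repeated.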
""" -from pandas.core.interchange.dataframe_protocol import DataFrame -from pandas.core.interchange.from_dataframe import from_dataframe + +def __getattr__(key: str): + # lazy imports to speed 'import pandas as pd' + if key == "DataFrame": + from pandas.core.interchange.dataframe_protocol import DataFrame + + return DataFrame + elif key == "from_dataframe": + from pandas.core.interchange.from_dataframe import from_dataframe + + return from_dataframe + raise AttributeError(key) + __all__ = ["from_dataframe", "DataFrame"] diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py index aa31c024fe338..230c8a98d79cd 100644 --- a/pandas/util/__init__.py +++ b/pandas/util/__init__.py @@ -1,11 +1,26 @@ # pyright: reportUnusedImport = false -from pandas.util._decorators import ( # noqa:F401 - Appender, - Substitution, - cache_readonly, -) - -from pandas.core.util.hashing import ( # noqa:F401 - hash_array, - hash_pandas_object, -) + + +def __getattr__(key: str): + # lazify imports to speed 'import pandas as pd' + if key == "Appender": + from pandas.util._decorators import Appender + + return Appender + if key == "Substitution": + from pandas.util._decorators import Substitution + + return Substitution + if key == "cache_readonly": + from pandas.util._decorators import cache_readonly + + return cache_readonly + if key == "hash_array": + from pandas.core.util.hashing import hash_array + + return hash_array + if key == "hash_pandas_object": + from pandas.core.util.hashing import hash_pandas_object + + return hash_pandas_object + raise AttributeError(f"module 'pandas.util' has no attribute '{key}'") From e8800c151b593d5aa3502d80fb98d79e522eb5d7 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 4 Apr 2023 16:43:35 -0700 Subject: [PATCH 2/6] lint fixup --- pandas/__init__.py | 5 +++-- pandas/api/interchange/__init__.py | 6 +++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index b93259fc39787..5787241cb23e3 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -1,3 +1,4 @@ +# pylint: disable=undefined-all-variable from __future__ import annotations __docformat__ = "restructuredtext" @@ -154,7 +155,7 @@ def __getattr__(key: str): return eval elif key == "testing": - import pandas.testing + import pandas.testing # pylint: disable=redefined-outer-name return pandas.testing @@ -274,7 +275,7 @@ def __dir__() -> list[str]: # Use __all__ to let type checkers know what is part of the public API. # Pandas is not (yet) a py.typed library: the public API is determined # based on the documentation. -__all__ = [ +__all__ = [ # pyright: ignore[reportUnsupportedDunderAll] "ArrowDtype", "BooleanDtype", "Categorical", diff --git a/pandas/api/interchange/__init__.py b/pandas/api/interchange/__init__.py index 02409c9302063..bbbae02de6b70 100644 --- a/pandas/api/interchange/__init__.py +++ b/pandas/api/interchange/__init__.py @@ -1,3 +1,4 @@ +# pylint: disable=undefined-all-variable """ Public API for DataFrame interchange protocol. 
""" @@ -16,4 +17,7 @@ def __getattr__(key: str): raise AttributeError(key) -__all__ = ["from_dataframe", "DataFrame"] +__all__ = [ # pyright: ignore[reportUnsupportedDunderAll] + "from_dataframe", + "DataFrame", +] From 42a3c8f62a5d044b479076a0dd89f81eae35a895 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 4 Apr 2023 19:43:32 -0700 Subject: [PATCH 3/6] lint fixup --- pandas/__init__.py | 61 ++++++++++++++++-------------- pandas/api/interchange/__init__.py | 6 +-- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 5787241cb23e3..36bcb0ef24edb 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -185,9 +185,12 @@ def __getattr__(key: str): "read_sas", "read_spss", }: - import pandas.io.api + # Workaround to avoid false-positive in "inconsistent-namespace-usage" + # complaining "Found both 'pandas.io' and 'io' " + import importlib - return getattr(pandas.io.api, key) + namespace = importlib.import_module(".io.api", package="pandas") + return getattr(namespace, key) elif key == "json_normalize": from pandas.io.json._normalize import json_normalize @@ -285,13 +288,13 @@ def __dir__() -> list[str]: "DateOffset", "DatetimeIndex", "DatetimeTZDtype", - "ExcelFile", - "ExcelWriter", + "ExcelFile", # pyright: ignore[reportUnsupportedDunderAll] + "ExcelWriter", # pyright: ignore[reportUnsupportedDunderAll] "Flags", "Float32Dtype", "Float64Dtype", "Grouper", - "HDFStore", + "HDFStore", # pyright: ignore[reportUnsupportedDunderAll] "Index", "IndexSlice", "Int16Dtype", @@ -329,7 +332,7 @@ def __dir__() -> list[str]: "date_range", "describe_option", "errors", - "eval", + "eval", # pyright: ignore[reportUnsupportedDunderAll] "factorize", "get_dummies", "from_dummies", @@ -339,7 +342,7 @@ def __dir__() -> list[str]: "io", "isna", "isnull", - "json_normalize", + "json_normalize", # pyright: ignore[reportUnsupportedDunderAll] "lreshape", "melt", "merge", @@ -355,36 +358,36 @@ def __dir__() -> list[str]: "pivot_table", "plotting", "qcut", - "read_clipboard", - "read_csv", - "read_excel", - "read_feather", - "read_fwf", - "read_gbq", - "read_hdf", - "read_html", - "read_json", - "read_orc", - "read_parquet", - "read_pickle", - "read_sas", - "read_spss", - "read_sql", - "read_sql_query", - "read_sql_table", - "read_stata", - "read_table", - "read_xml", + "read_clipboard", # pyright: ignore[reportUnsupportedDunderAll] + "read_csv", # pyright: ignore[reportUnsupportedDunderAll] + "read_excel", # pyright: ignore[reportUnsupportedDunderAll] + "read_feather", # pyright: ignore[reportUnsupportedDunderAll] + "read_fwf", # pyright: ignore[reportUnsupportedDunderAll] + "read_gbq", # pyright: ignore[reportUnsupportedDunderAll] + "read_hdf", # pyright: ignore[reportUnsupportedDunderAll] + "read_html", # pyright: ignore[reportUnsupportedDunderAll] + "read_json", # pyright: ignore[reportUnsupportedDunderAll] + "read_orc", # pyright: ignore[reportUnsupportedDunderAll] + "read_parquet", # pyright: ignore[reportUnsupportedDunderAll] + "read_pickle", # pyright: ignore[reportUnsupportedDunderAll] + "read_sas", # pyright: ignore[reportUnsupportedDunderAll] + "read_spss", # pyright: ignore[reportUnsupportedDunderAll] + "read_sql", # pyright: ignore[reportUnsupportedDunderAll] + "read_sql_query", # pyright: ignore[reportUnsupportedDunderAll] + "read_sql_table", # pyright: ignore[reportUnsupportedDunderAll] + "read_stata", # pyright: ignore[reportUnsupportedDunderAll] + "read_table", # pyright: ignore[reportUnsupportedDunderAll] + "read_xml", # pyright: 
ignore[reportUnsupportedDunderAll] "reset_option", "set_eng_float_format", "set_option", "show_versions", "test", - "testing", + "testing", # pyright: ignore[reportUnsupportedDunderAll] "timedelta_range", "to_datetime", "to_numeric", - "to_pickle", + "to_pickle", # pyright: ignore[reportUnsupportedDunderAll] "to_timedelta", "tseries", "unique", diff --git a/pandas/api/interchange/__init__.py b/pandas/api/interchange/__init__.py index bbbae02de6b70..dd54a893310c1 100644 --- a/pandas/api/interchange/__init__.py +++ b/pandas/api/interchange/__init__.py @@ -17,7 +17,7 @@ def __getattr__(key: str): raise AttributeError(key) -__all__ = [ # pyright: ignore[reportUnsupportedDunderAll] - "from_dataframe", - "DataFrame", +__all__ = [ + "from_dataframe", # pyright: ignore[reportUnsupportedDunderAll] + "DataFrame", # pyright: ignore[reportUnsupportedDunderAll] ] From 09f1c7b542b31330605aad91ab5ed17a9b00cdea Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 4 Apr 2023 20:37:57 -0700 Subject: [PATCH 4/6] mypy fixup --- pandas/io/parsers/c_parser_wrapper.py | 78 +++++++++++---------------- pandas/io/parsers/python_parser.py | 29 +++------- 2 files changed, 40 insertions(+), 67 deletions(-) diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index a6647df947961..e91aa467bcd71 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -64,10 +64,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: self.low_memory = kwds.pop("low_memory", False) # #2442 - # error: Cannot determine type of 'index_col' - kwds["allow_leading_cols"] = ( - self.index_col is not False # type: ignore[has-type] - ) + kwds["allow_leading_cols"] = self.index_col is not False # GH20529, validate usecol arg before TextReader kwds["usecols"] = self.usecols @@ -93,27 +90,23 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: self.unnamed_cols = self._reader.unnamed_cols - # error: Cannot determine type of 'names' - passed_names = self.names is None # type: ignore[has-type] + passed_names = self.names is None if self._reader.header is None: self.names = None else: - # error: Cannot determine type of 'names' - # error: Cannot determine type of 'index_names' ( - self.names, # type: ignore[has-type] + self.names, self.index_names, self.col_names, passed_names, ) = self._extract_multi_indexer_columns( self._reader.header, - self.index_names, # type: ignore[has-type] + self.index_names, passed_names, ) - # error: Cannot determine type of 'names' - if self.names is None: # type: ignore[has-type] + if self.names is None: self.names = list(range(self._reader.table_width)) # gh-9755 @@ -124,8 +117,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: # # once names has been filtered, we will # then set orig_names again to names - # error: Cannot determine type of 'names' - self.orig_names = self.names[:] # type: ignore[has-type] + self.orig_names = self.names[:] if self.usecols: usecols = self._evaluate_usecols(self.usecols, self.orig_names) @@ -138,47 +130,34 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: ): self._validate_usecols_names(usecols, self.orig_names) - # error: Cannot determine type of 'names' - if len(self.names) > len(usecols): # type: ignore[has-type] - # error: Cannot determine type of 'names' - self.names = [ # type: ignore[has-type] + if len(self.names) > len(usecols): + self.names = [ n - # error: Cannot determine type of 'names' - for i, n in enumerate(self.names) # type: ignore[has-type] + for i, n 
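On the importlib indirection introduced in patch 3: per the in-diff comment,
importlib.import_module(".io.api", package="pandas") is functionally the same
as "import pandas.io.api" followed by attribute access; the only purpose of
the indirection is that the source no longer contains the literal "pandas.io"
dotted name that the inconsistent-namespace-usage check flags. A standalone
equivalent (assuming pandas is installed):

    import importlib

    # import_module returns the leaf module, the same object that
    # "import pandas.io.api" would make reachable as pandas.io.api
    namespace = importlib.import_module(".io.api", package="pandas")
    assert namespace is importlib.import_module("pandas.io.api")
    read_csv = getattr(namespace, "read_csv")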
From 09f1c7b542b31330605aad91ab5ed17a9b00cdea Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 4 Apr 2023 20:37:57 -0700
Subject: [PATCH 4/6] mypy fixup

---
 pandas/io/parsers/c_parser_wrapper.py | 78 +++++++++++----------------
 pandas/io/parsers/python_parser.py    | 29 +++-------
 2 files changed, 40 insertions(+), 67 deletions(-)

diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index a6647df947961..e91aa467bcd71 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -64,10 +64,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
         self.low_memory = kwds.pop("low_memory", False)
 
         # #2442
-        # error: Cannot determine type of 'index_col'
-        kwds["allow_leading_cols"] = (
-            self.index_col is not False  # type: ignore[has-type]
-        )
+        kwds["allow_leading_cols"] = self.index_col is not False
 
         # GH20529, validate usecol arg before TextReader
         kwds["usecols"] = self.usecols
@@ -93,27 +90,23 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
 
         self.unnamed_cols = self._reader.unnamed_cols
 
-        # error: Cannot determine type of 'names'
-        passed_names = self.names is None  # type: ignore[has-type]
+        passed_names = self.names is None
 
         if self._reader.header is None:
             self.names = None
         else:
-            # error: Cannot determine type of 'names'
-            # error: Cannot determine type of 'index_names'
             (
-                self.names,  # type: ignore[has-type]
+                self.names,
                 self.index_names,
                 self.col_names,
                 passed_names,
             ) = self._extract_multi_indexer_columns(
                 self._reader.header,
-                self.index_names,  # type: ignore[has-type]
+                self.index_names,
                 passed_names,
             )
 
-        # error: Cannot determine type of 'names'
-        if self.names is None:  # type: ignore[has-type]
+        if self.names is None:
             self.names = list(range(self._reader.table_width))
 
         # gh-9755
@@ -124,8 +117,7 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
         #
         # once names has been filtered, we will
         # then set orig_names again to names
-        # error: Cannot determine type of 'names'
-        self.orig_names = self.names[:]  # type: ignore[has-type]
+        self.orig_names = self.names[:]
 
         if self.usecols:
             usecols = self._evaluate_usecols(self.usecols, self.orig_names)
@@ -138,47 +130,34 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
             ):
                 self._validate_usecols_names(usecols, self.orig_names)
 
-            # error: Cannot determine type of 'names'
-            if len(self.names) > len(usecols):  # type: ignore[has-type]
-                # error: Cannot determine type of 'names'
-                self.names = [  # type: ignore[has-type]
+            if len(self.names) > len(usecols):
+                self.names = [
                     n
-                    # error: Cannot determine type of 'names'
-                    for i, n in enumerate(self.names)  # type: ignore[has-type]
+                    for i, n in enumerate(self.names)
                     if (i in usecols or n in usecols)
                 ]
 
-                # error: Cannot determine type of 'names'
-                if len(self.names) < len(usecols):  # type: ignore[has-type]
-                    # error: Cannot determine type of 'names'
+                if len(self.names) < len(usecols):
                     self._validate_usecols_names(
                         usecols,
-                        self.names,  # type: ignore[has-type]
+                        self.names,
                     )
 
-        # error: Cannot determine type of 'names'
-        self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
+        self._validate_parse_dates_presence(self.names)
         self._set_noconvert_columns()
 
-        # error: Cannot determine type of 'names'
-        self.orig_names = self.names  # type: ignore[has-type]
+        self.orig_names = self.names
 
         if not self._has_complex_date_col:
-            # error: Cannot determine type of 'index_col'
-            if self._reader.leading_cols == 0 and is_index_col(
-                self.index_col  # type: ignore[has-type]
-            ):
+            if self._reader.leading_cols == 0 and is_index_col(self.index_col):
                 self._name_processed = True
                 (
                     index_names,
-                    # error: Cannot determine type of 'names'
-                    self.names,  # type: ignore[has-type]
+                    self.names,
                     self.index_col,
                 ) = self._clean_index_names(
-                    # error: Cannot determine type of 'names'
-                    self.names,  # type: ignore[has-type]
-                    # error: Cannot determine type of 'index_col'
-                    self.index_col,  # type: ignore[has-type]
+                    self.names,
+                    self.index_col,
                 )
 
                 if self.index_names is None:
@@ -205,15 +184,13 @@ def _set_noconvert_columns(self) -> None:
         undergo such conversions.
         """
         assert self.orig_names is not None
-        # error: Cannot determine type of 'names'
 
         # much faster than using orig_names.index(x) xref GH#44106
         names_dict = {x: i for i, x in enumerate(self.orig_names)}
-        col_indices = [names_dict[x] for x in self.names]  # type: ignore[has-type]
-        # error: Cannot determine type of 'names'
+        col_indices = [names_dict[x] for x in self.names]
         noconvert_columns = self._set_noconvert_dtype_columns(
             col_indices,
-            self.names,  # type: ignore[has-type]
+            self.names,
         )
         for col in noconvert_columns:
             self._reader.set_noconvert(col)
@@ -239,9 +216,19 @@ def read(
         except StopIteration:
             if self._first_chunk:
                 self._first_chunk = False
+
+                # error: Argument 1 to "is_potential_multi_index" has incompatible
+                # type "Optional[Sequence[Hashable]]"; expected
+                # "Union[Sequence[Hashable], MultiIndex]"
+                may_be_mi = is_potential_multi_index(
+                    self.orig_names, self.index_col  # type: ignore[arg-type]
+                )
+
+                # error: Argument 1 to "dedup_names" has incompatible type
+                # "Optional[Sequence[Hashable]]"; expected "Sequence[Hashable]"
                 names = dedup_names(
-                    self.orig_names,
-                    is_potential_multi_index(self.orig_names, self.index_col),
+                    self.orig_names,  # type: ignore[arg-type]
+                    may_be_mi,
                 )
                 index, columns, col_dict = self._get_empty_meta(
                     names,
@@ -265,8 +252,7 @@ def read(
             # Done with first read, next time raise StopIteration
             self._first_chunk = False
 
-        # error: Cannot determine type of 'names'
-        names = self.names  # type: ignore[has-type]
+        names = self.names
 
         if self._reader.leading_cols:
             if self._has_complex_date_col:
diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 0a39d7299d1bf..d850049dd683e 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -130,7 +130,6 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
 
         # Now self.columns has the set of columns that we will process.
        # The original set is stored in self.original_columns.
-        # error: Cannot determine type of 'index_names'
         (
             self.columns,
             self.index_names,
@@ -138,7 +137,7 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
             _,
         ) = self._extract_multi_indexer_columns(
             columns,
-            self.index_names,  # type: ignore[has-type]
+            self.index_names,
         )
 
         # get popped off for index
@@ -262,18 +261,16 @@ def read(
         columns: Sequence[Hashable] = list(self.orig_names)
         if not len(content):  # pragma: no cover
             # DataFrame with the right metadata, even though it's length 0
-            # error: Cannot determine type of 'index_col'
             names = dedup_names(
                 self.orig_names,
                 is_potential_multi_index(
                     self.orig_names,
-                    self.index_col,  # type: ignore[has-type]
+                    self.index_col,
                 ),
             )
-            # error: Cannot determine type of 'index_col'
             index, columns, col_dict = self._get_empty_meta(
                 names,
-                self.index_col,  # type: ignore[has-type]
+                self.index_col,
                 self.index_names,
                 self.dtype,
             )
@@ -303,19 +300,17 @@ def _exclude_implicit_index(
         self,
         alldata: list[np.ndarray],
     ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
-        # error: Cannot determine type of 'index_col'
         names = dedup_names(
             self.orig_names,
             is_potential_multi_index(
                 self.orig_names,
-                self.index_col,  # type: ignore[has-type]
+                self.index_col,
             ),
         )
 
         offset = 0
         if self._implicit_index:
-            # error: Cannot determine type of 'index_col'
-            offset = len(self.index_col)  # type: ignore[has-type]
+            offset = len(self.index_col)
 
         len_alldata = len(alldata)
         self._check_data_length(names, alldata)
@@ -481,8 +476,7 @@ def _infer_columns(
                 # line for the rest of the parsing code
                 if hr == header[-1]:
                     lc = len(this_columns)
-                    # error: Cannot determine type of 'index_col'
-                    sic = self.index_col  # type: ignore[has-type]
+                    sic = self.index_col
                     ic = len(sic) if sic is not None else 0
                     unnamed_count = len(this_unnamed_cols)
 
@@ -947,8 +941,7 @@ def _get_index_name(
         if line is not None:
             # leave it 0, #2442
             # Case 1
-            # error: Cannot determine type of 'index_col'
-            index_col = self.index_col  # type: ignore[has-type]
+            index_col = self.index_col
             if index_col is not False:
                 implicit_first_cols = len(line) - self.num_original_columns
 
@@ -998,13 +991,7 @@ def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
             # Check that there are no rows with too many
             # elements in their row (rows with too few
             # elements are padded with NaN).
-            # error: Non-overlapping identity check (left operand type: "List[int]",
-            # right operand type: "Literal[False]")
-            if (
-                max_len > col_len
-                and self.index_col is not False  # type: ignore[comparison-overlap]
-                and self.usecols is None
-            ):
+            if max_len > col_len and self.index_col is not False and self.usecols is None:
                 footers = self.skipfooter if self.skipfooter else 0
                 bad_lines = []
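Background for the mypy fixup: the deleted "# type: ignore[has-type]" comments
all suppressed "Cannot determine type of ..." errors, which mypy emits when it
has to check a use of an attribute before the attribute's type has been
resolved; that situation is typically a symptom of import cycles, and the
cycles through the top-level IO imports go away once those imports are lazy.
What remains after this patch are ordinary arg-type mismatches, made explicit
through the new may_be_mi local. A standalone illustration of that remaining
category (not pandas code):

    from typing import Hashable, Optional, Sequence


    def dedup(names: Sequence[Hashable]) -> list[Hashable]:
        # drop duplicates while preserving order
        return list(dict.fromkeys(names))


    maybe_names: Optional[Sequence[Hashable]] = ["a", "a", "b"]
    # mypy: Argument 1 to "dedup" has incompatible type
    # "Optional[Sequence[Hashable]]"; expected "Sequence[Hashable]"  [arg-type]
    deduped = dedup(maybe_names)  # type: ignore[arg-type]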
From a9e1f39686ea7ce6f3e5ab7073a03841dcab353d Mon Sep 17 00:00:00 2001
From: Brock
Date: Fri, 21 Apr 2023 12:41:09 -0700
Subject: [PATCH 5/6] lazy io imports in pd.api.typing

---
 pandas/api/typing/__init__.py | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py
index 4c535bf81d3b6..ea8cc6a1766a5 100644
--- a/pandas/api/typing/__init__.py
+++ b/pandas/api/typing/__init__.py
@@ -1,3 +1,4 @@
+# pylint: disable=undefined-all-variable
 """
 Public API classes that store intermediate results useful for type-hinting.
 """
@@ -25,8 +26,27 @@
 # TODO: Can't import Styler without importing jinja2
 # from pandas.io.formats.style import Styler
-from pandas.io.json._json import JsonReader
-from pandas.io.stata import StataReader
+
+
+def __getattr__(key: str):
+    if key == "JsonReader":
+        from pandas.io.json._json import JsonReader
+
+        return JsonReader
+    elif key == "StataReader":
+        from pandas.io.stata import StataReader
+
+        return StataReader
+    else:
+        raise AttributeError(f"module 'pandas.api.typing' has no attribute '{key}'")
+
+
+def __dir__() -> list[str]:
+    # include lazy imports defined in __getattr__ in dir()
+    base = list(globals().keys())
+    result = base + ["JsonReader", "StataReader"]
+    return sorted(result)
+
 
 __all__ = [
     "DataFrameGroupBy",
@@ -35,13 +55,13 @@
     "ExpandingGroupby",
     "ExponentialMovingWindow",
     "ExponentialMovingWindowGroupby",
-    "JsonReader",
+    "JsonReader",  # pyright: ignore[reportUnsupportedDunderAll]
     "PeriodIndexResamplerGroupby",
     "Resampler",
     "Rolling",
     "RollingGroupby",
     "SeriesGroupBy",
-    "StataReader",
+    "StataReader",  # pyright: ignore[reportUnsupportedDunderAll]
     # See TODO above
     # "Styler",
     "TimedeltaIndexResamplerGroupby",
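The follow-up below exists because patch 5 wrote "def __dir__() -> list[str]"
in a module that did not yet defer annotation evaluation: function annotations
are evaluated at definition time, and subscripting the built-in list raises
TypeError ("'type' object is not subscriptable") on Python 3.8, which pandas
still supported at the time. "from __future__ import annotations" turns the
annotation into a string and sidesteps the problem; pandas/__init__.py already
had the future import, as the context lines in patch 2 show. A minimal
demonstration:

    from __future__ import annotations


    def __dir__() -> list[str]:  # safe on 3.7+ with the future import
        return sorted(globals())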
""" @@ -25,8 +26,27 @@ # TODO: Can't import Styler without importing jinja2 # from pandas.io.formats.style import Styler -from pandas.io.json._json import JsonReader -from pandas.io.stata import StataReader + + +def __getattr__(key: str): + if key == "JsonReader": + from pandas.io.json._json import JsonReader + + return JsonReader + elif key == "StataReader": + from pandas.io.stata import StataReader + + return StataReader + else: + raise AttributeError(f"module 'pandas.api.typing' has no attribute '{key}'") + + +def __dir__() -> list[str]: + # include lazy imports defined in __getattr__ in dir() + base = list(globals().keys()) + result = base + ["JsonReader", "StataReader"] + return sorted(result) + __all__ = [ "DataFrameGroupBy", @@ -35,13 +55,13 @@ "ExpandingGroupby", "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", - "JsonReader", + "JsonReader", # pyright: ignore[reportUnsupportedDunderAll] "PeriodIndexResamplerGroupby", "Resampler", "Rolling", "RollingGroupby", "SeriesGroupBy", - "StataReader", + "StataReader", # pyright: ignore[reportUnsupportedDunderAll] # See TODO above # "Styler", "TimedeltaIndexResamplerGroupby", From 1cce118f47c193d028ffda5bb85cfe0d15c98b57 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 21 Apr 2023 13:57:01 -0700 Subject: [PATCH 6/6] __future__ annotations --- pandas/api/typing/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index ea8cc6a1766a5..93b01a814d51a 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -2,6 +2,7 @@ """ Public API classes that store intermediate results useful for type-hinting. """ +from __future__ import annotations from pandas.core.groupby import ( DataFrameGroupBy,