From 3e82166957d2a50407378a5c421a7c79dfd6c7bf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 4 Jan 2022 18:43:10 -0800 Subject: [PATCH 01/10] Start script to check sync of min versions --- scripts/validate_min_versions_in_sync.py | 35 ++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100755 scripts/validate_min_versions_in_sync.py diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py new file mode 100755 index 0000000000000..f559dec42c1d7 --- /dev/null +++ b/scripts/validate_min_versions_in_sync.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +""" +Check pandas required and optional dependencies are synced across: + +doc/source/getting_started/install.rst +ci/deps/actions-.*-minimum_versions.yaml +pandas/compat/_optional.py + +This is meant to be run as a pre-commit hook - to run it manually, you can do: + + pre-commit run validate-min-versions-in-sync --all-files +""" +# import argparse +import ast +import pathlib + +# import re +# import sys + +DOC_PATH = pathlib.Path("doc/source/getting_started/install.rst").resolve() +CI_PATH = next( + pathlib.Path("ci/deps").absolute().glob("actions-*-minimum_versions.yaml") +) +CODE_PATH = pathlib.Path("pandas/compat/_optional.py").resolve() + + +def get_versions_from_optional(content: str) -> dict[str, str]: + for node in ast.walk(ast.parse(content)): + if isinstance(node, ast.Dict): + # version_dict = node + break + + +if __name__ == "__main__": + pass From 4f996f31319f37320a41ee315220c252501f4f3f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 4 Jan 2022 19:17:26 -0800 Subject: [PATCH 02/10] Complete get_versions_from_optional --- scripts/validate_min_versions_in_sync.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index f559dec42c1d7..ae8d19b3ff57a 100755 --- a/scripts/validate_min_versions_in_sync.py +++ 
b/scripts/validate_min_versions_in_sync.py @@ -10,6 +10,8 @@ pre-commit run validate-min-versions-in-sync --all-files """ +from __future__ import annotations + # import argparse import ast import pathlib @@ -25,10 +27,22 @@ def get_versions_from_optional(content: str) -> dict[str, str]: + num_dicts = 0 for node in ast.walk(ast.parse(content)): if isinstance(node, ast.Dict): - # version_dict = node - break + if num_dicts == 0: + version_dict_ast = node + num_dicts += 1 + elif num_dicts == 1: + install_map_ast = node + break + install_map = { + k.value: v.value for k, v in zip(install_map_ast.keys, install_map_ast.values) + } + return { + install_map.get(k.value, k.value): v.value + for k, v in zip(version_dict_ast.keys, version_dict_ast.values) + } if __name__ == "__main__": From b4e417d08127054ce61b8470899f2d4ec5c23004 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 4 Jan 2022 20:36:20 -0800 Subject: [PATCH 03/10] Pin min pyreadstat, parser for CI file --- ci/deps/actions-38-minimum_versions.yaml | 2 +- doc/source/getting_started/install.rst | 2 +- pandas/compat/_optional.py | 1 + scripts/validate_min_versions_in_sync.py | 25 ++++++++++++++++++------ 4 files changed, 22 insertions(+), 8 deletions(-) diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index 8505dad542239..329dbd6a4e7fc 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -39,7 +39,7 @@ dependencies: - pymysql=0.10.1 - pytables=3.6.1 - pyarrow=1.0.1 - - pyreadstat + - pyreadstat=1.1.0 - pyxlsb=1.0.6 - s3fs=0.4.0 - scipy=1.4.1 diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index 05c47d5cdf4f7..25d7bc407b3b3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -361,7 +361,7 @@ blosc 1.20.1 Compression for HDF5 zlib Compression for HDF5 fastparquet 0.4.0 Parquet reading / writing pyarrow 1.0.1 Parquet, ORC, 
and feather reading / writing -pyreadstat SPSS files (.sav) reading +pyreadstat 1.1.0 SPSS files (.sav) reading ========================= ================== ============================================================= .. _install.warn_orc: diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index a2be663504abe..24c21c91e7a0e 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -22,6 +22,7 @@ "openpyxl": "3.0.3", "pandas_gbq": "0.14.0", "pyarrow": "1.0.1", + "pyreadstat": "1.1.0", "pytest": "6.0", "pyxlsb": "1.0.6", "s3fs": "0.4.0", diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index ae8d19b3ff57a..378fb2c17c5de 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -12,10 +12,11 @@ """ from __future__ import annotations -# import argparse import ast import pathlib +import yaml + # import re # import sys @@ -26,7 +27,7 @@ CODE_PATH = pathlib.Path("pandas/compat/_optional.py").resolve() -def get_versions_from_optional(content: str) -> dict[str, str]: +def get_versions_from_code(content: str) -> dict[str, str]: num_dicts = 0 for node in ast.walk(ast.parse(content)): if isinstance(node, ast.Dict): @@ -34,16 +35,28 @@ def get_versions_from_optional(content: str) -> dict[str, str]: version_dict_ast = node num_dicts += 1 elif num_dicts == 1: - install_map_ast = node + install_map = {k.value: v.value for k, v in zip(node.keys, node.values)} break - install_map = { - k.value: v.value for k, v in zip(install_map_ast.keys, install_map_ast.values) - } return { install_map.get(k.value, k.value): v.value for k, v in zip(version_dict_ast.keys, version_dict_ast.values) } +def get_versions_from_ci(fle) -> dict[str, str]: + yml_content = yaml.safe_load(fle) + yml_version = {} + for dependency in reversed(yml_content["dependencies"]): + if "=" not in dependency: + break + package, version = dependency.split("=") + yml_version[package] = version 
+ return yml_version + + if __name__ == "__main__": + with open(CODE_PATH) as f: + code_versions = get_versions_from_code(f.read()) + with open(CI_PATH) as f: + yml_content = get_versions_from_ci(f) pass From 85f0e3d047a16bf6c9f87fd433254abb82bf1592 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 4 Jan 2022 21:45:29 -0800 Subject: [PATCH 04/10] Don't parse via yaml --- ci/deps/actions-38-minimum_versions.yaml | 2 +- pandas/compat/_optional.py | 10 ++++- scripts/validate_min_versions_in_sync.py | 52 ++++++++++++++++-------- 3 files changed, 43 insertions(+), 21 deletions(-) diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-38-minimum_versions.yaml index 329dbd6a4e7fc..467402bb6ef7f 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-38-minimum_versions.yaml @@ -32,8 +32,8 @@ dependencies: - matplotlib=3.3.2 - numba=0.50.1 - numexpr=2.7.1 - - openpyxl=3.0.3 - odfpy=1.4.1 + - openpyxl=3.0.3 - pandas-gbq=0.14.0 - psycopg2=2.8.4 - pymysql=0.10.1 diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 24c21c91e7a0e..6e8c792192d8e 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -11,16 +11,22 @@ VERSIONS = { "bs4": "4.8.2", + "blosc": "1.20.1", "bottleneck": "1.3.1", - "fsspec": "0.7.4", "fastparquet": "0.4.0", + "fsspec": "0.7.4", + "html5lib": "1.1", "gcsfs": "0.6.0", + "jinja2": "2.11", "lxml.etree": "4.5.0", "matplotlib": "3.3.2", + "numba": "0.50.1", "numexpr": "2.7.1", "odfpy": "1.4.1", "openpyxl": "3.0.3", "pandas_gbq": "0.14.0", + "psycopg2": "2.8.4", + "pymysql": "0.10.1", "pyarrow": "1.0.1", "pyreadstat": "1.1.0", "pytest": "6.0", @@ -34,7 +40,6 @@ "xlrd": "2.0.1", "xlwt": "1.3.0", "xlsxwriter": "1.2.2", - "numba": "0.50.1", "zstandard": "0.15.2", } @@ -47,6 +52,7 @@ "lxml.etree": "lxml", "odf": "odfpy", "pandas_gbq": "pandas-gbq", + "tables": "pytables", "sqlalchemy": "SQLAlchemy", "jinja2": "Jinja2", } diff --git a/scripts/validate_min_versions_in_sync.py 
b/scripts/validate_min_versions_in_sync.py index 378fb2c17c5de..d63f3942f6fd5 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -14,11 +14,7 @@ import ast import pathlib - -import yaml - -# import re -# import sys +import sys DOC_PATH = pathlib.Path("doc/source/getting_started/install.rst").resolve() CI_PATH = next( @@ -43,20 +39,40 @@ def get_versions_from_code(content: str) -> dict[str, str]: } -def get_versions_from_ci(fle) -> dict[str, str]: - yml_content = yaml.safe_load(fle) - yml_version = {} - for dependency in reversed(yml_content["dependencies"]): - if "=" not in dependency: - break - package, version = dependency.split("=") - yml_version[package] = version - return yml_version +def get_versions_from_ci(content: list[str]) -> tuple[dict[str, str], dict[str, str]]: + # Don't parse with pyyaml because it ignores comments we're looking for + seen_required = False + seen_optional = False + required_deps = {} + optional_deps = {} + for line in content: + if "# required dependencies" in line: + seen_required = True + elif "# optional dependencies" in line: + seen_optional = True + elif seen_required and line.strip(): + package, version = line.strip().split("=") + package = package[2:] + if not seen_optional: + required_deps[package] = version + else: + optional_deps[package] = version + return required_deps, optional_deps -if __name__ == "__main__": +def get_versions_from_doc(content: str, ci_verions: dict[str, str]) -> dict[str, str]: + pass + + +def main(): + # The CI file is our source of truth since it's what we're testing. 
+ with open(CI_PATH) as f: + ci_required, ci_optional = get_versions_from_ci(f.readlines()) with open(CODE_PATH) as f: code_versions = get_versions_from_code(f.read()) - with open(CI_PATH) as f: - yml_content = get_versions_from_ci(f) - pass + with open(DOC_PATH) as f: + doc_verions = get_versions_from_doc(f.read(), ci_versions) + + +if __name__ == "__main__": + main() From 976bdb8b7823e9b7c15597471ad79e6a31a162cc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 5 Jan 2022 21:22:14 -0800 Subject: [PATCH 05/10] Only handle import_optional_dependency and CI for now --- .pre-commit-config.yaml | 4 +++ scripts/validate_min_versions_in_sync.py | 33 +++++++++++++----------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 854b7b2e4fe63..187761f603063 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -181,3 +181,7 @@ repos: entry: 'pg8000' files: ^ci/deps types: [yaml] + - id: validate-min-versions-in-sync + name: Check minimum version of dependencies are aligned + entry: python scripts/validate_min_versions_in_sync.py + language: python diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index d63f3942f6fd5..19728061e84a0 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -2,10 +2,11 @@ """ Check pandas required and optional dependencies are synced across: -doc/source/getting_started/install.rst ci/deps/actions-.*-minimum_versions.yaml pandas/compat/_optional.py +TODO: doc/source/getting_started/install.rst + This is meant to be run as a pre-commit hook - to run it manually, you can do: pre-commit run validate-min-versions-in-sync --all-files @@ -32,11 +33,11 @@ def get_versions_from_code(content: str) -> dict[str, str]: num_dicts += 1 elif num_dicts == 1: install_map = {k.value: v.value for k, v in zip(node.keys, node.values)} - break - return { - install_map.get(k.value, 
k.value): v.value - for k, v in zip(version_dict_ast.keys, version_dict_ast.values) - } + return { + install_map.get(k.value, k.value).casefold(): v.value + for k, v in zip(version_dict_ast.keys, version_dict_ast.values) + if k.value != "pytest" + } def get_versions_from_ci(content: list[str]) -> tuple[dict[str, str], dict[str, str]]: @@ -60,18 +61,20 @@ def get_versions_from_ci(content: list[str]) -> tuple[dict[str, str], dict[str, return required_deps, optional_deps -def get_versions_from_doc(content: str, ci_verions: dict[str, str]) -> dict[str, str]: - pass - - def main(): - # The CI file is our source of truth since it's what we're testing. with open(CI_PATH) as f: - ci_required, ci_optional = get_versions_from_ci(f.readlines()) + _, ci_optional = get_versions_from_ci(f.readlines()) with open(CODE_PATH) as f: - code_versions = get_versions_from_code(f.read()) - with open(DOC_PATH) as f: - doc_verions = get_versions_from_doc(f.read(), ci_versions) + code_optional = get_versions_from_code(f.read()) + diff = set(ci_optional.items()).symmetric_difference(code_optional.items()) + if diff: + sys.stdout.write( + f"The follow minimum version differences were found between " + f"{CI_PATH} and {CODE_PATH}. 
Please ensure these are aligned: " + f"{diff}" + ) + sys.exit(1) + sys.exit(0) if __name__ == "__main__": From 83d4f0982a5fab290f863105a5b12e7f66b3e3a5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 6 Jan 2022 12:53:10 -0800 Subject: [PATCH 06/10] Fix edge case with psycopg2 --- pandas/compat/_optional.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 6e8c792192d8e..d407e6eedf584 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -25,7 +25,7 @@ "odfpy": "1.4.1", "openpyxl": "3.0.3", "pandas_gbq": "0.14.0", - "psycopg2": "2.8.4", + "psycopg2": "2.8.4", # (dt dec pq3 ext lo64) "pymysql": "0.10.1", "pyarrow": "1.0.1", "pyreadstat": "1.1.0", @@ -66,7 +66,8 @@ def get_version(module: types.ModuleType) -> str: if version is None: raise ImportError(f"Can't determine version for {module.__name__}") - return version + # psycopg2 appends " (dt dec pq3 ext lo64)" to it's version + return version.split()[0] def import_optional_dependency( From 82de0b78c2073bf96a586ccf6d6298fa20544f93 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 7 Jan 2022 18:55:26 -0800 Subject: [PATCH 07/10] Special case for psycopg2 --- pandas/compat/_optional.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index d407e6eedf584..a26bc94ab883e 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -66,8 +66,10 @@ def get_version(module: types.ModuleType) -> str: if version is None: raise ImportError(f"Can't determine version for {module.__name__}") - # psycopg2 appends " (dt dec pq3 ext lo64)" to it's version - return version.split()[0] + if module.__name__ == "psycopg2": + # psycopg2 appends " (dt dec pq3 ext lo64)" to it's version + version = version.split()[0] + return version def import_optional_dependency( From 3a76a5264b713d1034c95ed5315fecc22d568735 Mon Sep 17 
00:00:00 2001 From: Matthew Roeschke Date: Tue, 11 Jan 2022 15:54:11 -0800 Subject: [PATCH 08/10] Add to sys path instead of using ast --- scripts/validate_min_versions_in_sync.py | 34 ++++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index 19728061e84a0..87f427a6b9324 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -13,7 +13,6 @@ """ from __future__ import annotations -import ast import pathlib import sys @@ -22,22 +21,24 @@ pathlib.Path("ci/deps").absolute().glob("actions-*-minimum_versions.yaml") ) CODE_PATH = pathlib.Path("pandas/compat/_optional.py").resolve() +# pandas package is not available +# in pre-commit environment +sys.path.append("pandas/compat") +sys.path.append("pandas/util") +import version +sys.modules["pandas.util.version"] = version +import _optional -def get_versions_from_code(content: str) -> dict[str, str]: - num_dicts = 0 - for node in ast.walk(ast.parse(content)): - if isinstance(node, ast.Dict): - if num_dicts == 0: - version_dict_ast = node - num_dicts += 1 - elif num_dicts == 1: - install_map = {k.value: v.value for k, v in zip(node.keys, node.values)} - return { - install_map.get(k.value, k.value).casefold(): v.value - for k, v in zip(version_dict_ast.keys, version_dict_ast.values) - if k.value != "pytest" - } + +def get_versions_from_code() -> dict[str, str]: + install_map = _optional.INSTALL_MAPPING + versions = _optional.VERSIONS + return { + install_map.get(k, k).casefold(): v + for k, v in versions.items() + if k != "pytest" + } def get_versions_from_ci(content: list[str]) -> tuple[dict[str, str], dict[str, str]]: @@ -64,8 +65,7 @@ def get_versions_from_ci(content: list[str]) -> tuple[dict[str, str], dict[str, def main(): with open(CI_PATH) as f: _, ci_optional = get_versions_from_ci(f.readlines()) - with open(CODE_PATH) as f: - code_optional = 
get_versions_from_code(f.read()) + code_optional = get_versions_from_code() diff = set(ci_optional.items()).symmetric_difference(code_optional.items()) if diff: sys.stdout.write( From d9bdc6af2e8cd083f7a85e994d35c1c97a6e497c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 12 Jan 2022 11:13:42 -0800 Subject: [PATCH 09/10] Scope to specific files --- .pre-commit-config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4f02cc8b529b3..cc2356873a8da 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -192,3 +192,4 @@ repos: name: Check minimum version of dependencies are aligned entry: python scripts/validate_min_versions_in_sync.py language: python + files: ^(ci/deps/actions-*-minimum_versions\.yaml|pandas/compat/_optional\.py)$ From 3c9f463cde2c6180200f3c07cd18014459200abb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 16 Jan 2022 15:37:15 -0800 Subject: [PATCH 10/10] Address review --- .pre-commit-config.yaml | 2 +- scripts/validate_min_versions_in_sync.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc2356873a8da..f72450a18312e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -192,4 +192,4 @@ repos: name: Check minimum version of dependencies are aligned entry: python scripts/validate_min_versions_in_sync.py language: python - files: ^(ci/deps/actions-*-minimum_versions\.yaml|pandas/compat/_optional\.py)$ + files: ^(ci/deps/actions-.*-minimum_versions\.yaml|pandas/compat/_optional\.py)$ diff --git a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index 87f427a6b9324..4dbf6a4cdcef8 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -63,7 +63,7 @@ def get_versions_from_ci(content: list[str]) -> tuple[dict[str, str], dict[str, def main(): - with open(CI_PATH) as f: + with 
open(CI_PATH, encoding="utf-8") as f: _, ci_optional = get_versions_from_ci(f.readlines()) code_optional = get_versions_from_code() diff = set(ci_optional.items()).symmetric_difference(code_optional.items()) @@ -71,7 +71,7 @@ def main(): sys.stdout.write( f"The follow minimum version differences were found between " f"{CI_PATH} and {CODE_PATH}. Please ensure these are aligned: " - f"{diff}" + f"{diff}\n" ) sys.exit(1) sys.exit(0)