ENH: Correct typing for read/to_html (#236)

bashtage · Kevin Sheppard · web-flow · commit 0822888d2d3b · 2022-09-02T08:31:53.000-04:00
* ENH: Correct typing for read/to_html

* TYP: Remove redundant and improve types

Co-authored-by: Kevin Sheppard <kevin.sheppard@gmail.com>
diff --git a/pandas-stubs/_typing.pyi b/pandas-stubs/_typing.pyi
@@ -236,5 +236,6 @@ CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
 
 HDFCompLib = Literal["zlib", "lzo", "bzip2", "blosc"]
 ParquetEngine = Literal["auto", "pyarrow", "fastparquet"]
+ColspaceArgType = str | int | Sequence[int | str] | Mapping[Hashable, str | int]
 
 __all__ = ["npt", "type_t"]
diff --git a/pandas-stubs/core/frame.pyi b/pandas-stubs/core/frame.pyi
@@ -49,6 +49,7 @@ from pandas._typing import (
     Axes,
     Axis,
     AxisType,
+    ColspaceArgType,
     CompressionOptions,
     Dtype,
     DtypeNp,
@@ -326,23 +327,39 @@ class DataFrame(NDFrame, OpsMixin):
     @overload
     def to_html(
         self,
-        buf: FilePathOrBuffer | None,
-        columns: Sequence[_str] | None = ...,
-        col_space: int | list[int] | dict[_str | int, int] | None = ...,
+        buf: FilePath | WriteBuffer[str],
+        columns: list[HashableT] | None = ...,
+        col_space: ColspaceArgType | None = ...,
         header: _bool = ...,
         index: _bool = ...,
         na_rep: _str = ...,
-        formatters=...,
-        float_format=...,
+        formatters: list[Callable[[object], str]]
+        | tuple[Callable[[object], str], ...]
+        | Mapping[Hashable, Callable[[object], str]]
+        | None = ...,
+        float_format: Callable[[float], str] | None = ...,
         sparsify: _bool | None = ...,
         index_names: _bool = ...,
-        justify: _str | None = ...,
+        justify: Literal[
+            "left",
+            "right",
+            "center",
+            "justify",
+            "justify-all",
+            "start",
+            "end",
+            "inherit",
+            "match-parent",
+            "initial",
+            "unset",
+        ]
+        | None = ...,
         max_rows: int | None = ...,
         max_cols: int | None = ...,
         show_dimensions: _bool = ...,
         decimal: _str = ...,
         bold_rows: _bool = ...,
-        classes: _str | list | tuple | None = ...,
+        classes: Sequence[str] | None = ...,
         escape: _bool = ...,
         notebook: _bool = ...,
         border: int | None = ...,
@@ -353,22 +370,39 @@ class DataFrame(NDFrame, OpsMixin):
     @overload
     def to_html(
         self,
-        columns: Sequence[_str] | None = ...,
-        col_space: int | list[int] | dict[_str | int, int] | None = ...,
+        buf: None = ...,
+        columns: Sequence[HashableT] | None = ...,
+        col_space: ColspaceArgType | None = ...,
         header: _bool = ...,
         index: _bool = ...,
         na_rep: _str = ...,
-        formatters=...,
-        float_format=...,
+        formatters: list[Callable[[object], str]]
+        | tuple[Callable[[object], str], ...]
+        | Mapping[Hashable, Callable[[object], str]]
+        | None = ...,
+        float_format: Callable[[float], str] | None = ...,
         sparsify: _bool | None = ...,
         index_names: _bool = ...,
-        justify: _str | None = ...,
+        justify: Literal[
+            "left",
+            "right",
+            "center",
+            "justify",
+            "justify-all",
+            "start",
+            "end",
+            "inherit",
+            "match-parent",
+            "initial",
+            "unset",
+        ]
+        | None = ...,
         max_rows: int | None = ...,
         max_cols: int | None = ...,
         show_dimensions: _bool = ...,
         decimal: _str = ...,
         bold_rows: _bool = ...,
-        classes: _str | list | tuple | None = ...,
+        classes: Sequence[str] | None = ...,
         escape: _bool = ...,
         notebook: _bool = ...,
         border: int | None = ...,
diff --git a/pandas-stubs/io/html.pyi b/pandas-stubs/io/html.pyi
@@ -1,46 +1,45 @@
 from typing import (
     Any,
     Callable,
-    Iterable,
+    Hashable,
+    Literal,
     Mapping,
+    Pattern,
     Sequence,
 )
 
 from pandas.core.frame import DataFrame
 
-from pandas._typing import FilePathOrBuffer
-
-class _HtmlFrameParser:
-    io = ...
-    match = ...
-    attrs = ...
-    encoding = ...
-    displayed_only = ...
-    def __init__(self, io, match, attrs, encoding, displayed_only) -> None: ...
-    def parse_tables(self): ...
-
-class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
-    def __init__(self, *args, **kwargs) -> None: ...
-
-class _LxmlFrameParser(_HtmlFrameParser):
-    def __init__(self, *args, **kwargs) -> None: ...
+from pandas._typing import (
+    FilePath,
+    HashableT,
+    ReadBuffer,
+)
 
 def read_html(
-    io: FilePathOrBuffer,
-    match: str = ...,
+    io: FilePath | ReadBuffer[str],
+    match: str | Pattern = ...,
     flavor: str | None = ...,
     header: int | Sequence[int] | None = ...,
-    index_col: int | Sequence[Any] | None = ...,
-    skiprows: int | Sequence[Any] | slice | None = ...,
-    attrs: Mapping[str, str] | None = ...,
+    index_col: int | Sequence[int] | list[HashableT] | None = ...,
+    skiprows: int | Sequence[int] | slice | None = ...,
+    attrs: dict[str, str] | None = ...,
     parse_dates: bool
-    | Sequence[int | str | Sequence[int | str]]
-    | dict[str, Sequence[int | str]] = ...,
+    | Sequence[int]
+    | list[HashableT]  # Cannot be Sequence[Hashable] to prevent str
+    | Sequence[Sequence[Hashable]]
+    | dict[str, Sequence[int]]
+    | dict[str, list[HashableT]] = ...,
     thousands: str = ...,
     encoding: str | None = ...,
     decimal: str = ...,
-    converters: Mapping[int | str, Callable] | None = ...,
-    na_values: Iterable[Any] | None = ...,
+    converters: Mapping[int | HashableT, Callable[[str], Any]] | None = ...,
+    na_values: str
+    | list[str]
+    | dict[HashableT, str]
+    | dict[HashableT, list[str]]
+    | None = ...,
     keep_default_na: bool = ...,
     displayed_only: bool = ...,
+    extract_links: Literal["header", "footer", "body", "all"] | None = ...,
 ) -> list[DataFrame]: ...
diff --git a/tests/test_io.py b/tests/test_io.py
@@ -1,5 +1,4 @@
 import io
-import os
 import os.path
 import pathlib
 from pathlib import Path
@@ -18,6 +17,7 @@
     read_clipboard,
     read_feather,
     read_hdf,
+    read_html,
     read_json,
     read_orc,
     read_parquet,
@@ -337,3 +337,10 @@ def test_feather():
     check(assert_type(DF.to_feather(bio), None), type(None))
     bio.seek(0)
     check(assert_type(read_feather(bio), DataFrame), DataFrame)
+
+
+def test_read_html():
+    check(assert_type(DF.to_html(), str), str)
+    with ensure_clean() as path:
+        check(assert_type(DF.to_html(path), None), type(None))
+        check(assert_type(read_html(path), List[DataFrame]), list)