TYP/CLN: mostly in io/html.py (#43958)

twoertwein · web-flow · commit 4f2b88a1ac95 · 2021-10-17T17:50:05.000-04:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -22,7 +22,6 @@
     IO,
     TYPE_CHECKING,
     Any,
-    AnyStr,
     Callable,
     Hashable,
     Iterable,
@@ -2598,7 +2597,7 @@ def to_stata(
         writer.write_file()
 
     @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
-    def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None:
+    def to_feather(self, path: FilePathOrBuffer[bytes], **kwargs) -> None:
         """
         Write a DataFrame to the binary Feather format.
 
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
@@ -1,6 +1,10 @@
 """ feather-format compat """
+from __future__ import annotations
 
-from typing import AnyStr
+from typing import (
+    Hashable,
+    Sequence,
+)
 
 from pandas._typing import (
     FilePathOrBuffer,
@@ -22,7 +26,7 @@
 @doc(storage_options=generic._shared_docs["storage_options"])
 def to_feather(
     df: DataFrame,
-    path: FilePathOrBuffer[AnyStr],
+    path: FilePathOrBuffer[bytes],
     storage_options: StorageOptions = None,
     **kwargs,
 ):
@@ -89,7 +93,10 @@ def to_feather(
 
 @doc(storage_options=generic._shared_docs["storage_options"])
 def read_feather(
-    path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None
+    path: FilePathOrBuffer[bytes],
+    columns: Sequence[Hashable] | None = None,
+    use_threads: bool = True,
+    storage_options: StorageOptions = None,
 ):
     """
     Load a feather-format object from the file path.
diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -8,7 +8,6 @@
 
 from collections import abc
 import numbers
-import os
 import re
 from typing import (
     Pattern,
@@ -29,6 +28,8 @@
 from pandas.core.frame import DataFrame
 
 from pandas.io.common import (
+    file_exists,
+    get_handle,
     is_url,
     stringify_path,
     urlopen,
@@ -70,7 +71,7 @@ def _importers():
 _RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
 
 
-def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str:
+def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
     """
     Replace extra whitespace inside of a string with a single space.
 
@@ -89,7 +90,7 @@ def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str:
     return regex.sub(" ", s.strip())
 
 
-def _get_skiprows(skiprows):
+def _get_skiprows(skiprows: int | Sequence[int] | slice | None):
     """
     Get an iterator given an integer, slice or container.
 
@@ -118,7 +119,7 @@ def _get_skiprows(skiprows):
     raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
 
 
-def _read(obj):
+def _read(obj: bytes | FilePathOrBuffer, encoding: str | None) -> str | bytes:
     """
     Try to read from a url, file or string.
 
@@ -130,22 +131,26 @@ def _read(obj):
     -------
     raw_text : str
     """
-    if is_url(obj):
-        with urlopen(obj) as url:
-            text = url.read()
-    elif hasattr(obj, "read"):
-        text = obj.read()
+    if (
+        is_url(obj)
+        or hasattr(obj, "read")
+        or (isinstance(obj, str) and file_exists(obj))
+    ):
+        # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
+        # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
+        # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
+        # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
+        with get_handle(
+            obj, "r", encoding=encoding  # type: ignore[arg-type]
+        ) as handles:
+            text = handles.handle.read()
     elif isinstance(obj, (str, bytes)):
         text = obj
-        try:
-            if os.path.isfile(text):
-                with open(text, "rb") as f:
-                    return f.read()
-        except (TypeError, ValueError):
-            pass
     else:
         raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
-    return text
+    # error: Incompatible return value type (got "Union[Any, bytes, None, str]",
+    # expected "Union[str, bytes]")
+    return text  # type: ignore[return-value]
 
 
 class _HtmlFrameParser:
@@ -204,7 +209,14 @@ class _HtmlFrameParser:
     functionality.
     """
 
-    def __init__(self, io, match, attrs, encoding, displayed_only):
+    def __init__(
+        self,
+        io: FilePathOrBuffer,
+        match: str | Pattern,
+        attrs: dict[str, str] | None,
+        encoding: str,
+        displayed_only: bool,
+    ):
         self.io = io
         self.match = match
         self.attrs = attrs
@@ -590,7 +602,7 @@ def _parse_tfoot_tr(self, table):
         return table.select("tfoot tr")
 
     def _setup_build_doc(self):
-        raw_text = _read(self.io)
+        raw_text = _read(self.io, self.encoding)
         if not raw_text:
             raise ValueError(f"No text parsed from document: {self.io}")
         return raw_text
@@ -653,9 +665,6 @@ class _LxmlFrameParser(_HtmlFrameParser):
     :class:`_HtmlFrameParser`.
     """
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
     def _text_getter(self, obj):
         return obj.text_content()
 
@@ -818,7 +827,7 @@ def _data_to_frame(**kwargs):
 }
 
 
-def _parser_dispatch(flavor):
+def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
     """
     Choose the parser based on the input flavor.
 
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
@@ -1233,6 +1233,10 @@ def seek(self, offset):
             def seekable(self):
                 return True
 
+            def __iter__(self):
+                # to fool `is_file_like`, should never end up here
+                assert False
+
         good = MockFile("<table><tr><td>spam<br />eggs</td></tr></table>")
         bad = MockFile("<table><tr><td>spam<foobr />eggs</td></tr></table>")