Skip to content

Commit 4f2b88a

Browse files
authored
TYP/CLN: mostly in io/html.py (#43958)
1 parent 9a29391 commit 4f2b88a

File tree

4 files changed

+46
-27
lines changed

4 files changed

+46
-27
lines changed

pandas/core/frame.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,6 @@
2222
IO,
2323
TYPE_CHECKING,
2424
Any,
25-
AnyStr,
2625
Callable,
2726
Hashable,
2827
Iterable,
@@ -2598,7 +2597,7 @@ def to_stata(
25982597
writer.write_file()
25992598

26002599
@deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
2601-
def to_feather(self, path: FilePathOrBuffer[AnyStr], **kwargs) -> None:
2600+
def to_feather(self, path: FilePathOrBuffer[bytes], **kwargs) -> None:
26022601
"""
26032602
Write a DataFrame to the binary Feather format.
26042603

pandas/io/feather_format.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,10 @@
11
""" feather-format compat """
2+
from __future__ import annotations
23

3-
from typing import AnyStr
4+
from typing import (
5+
Hashable,
6+
Sequence,
7+
)
48

59
from pandas._typing import (
610
FilePathOrBuffer,
@@ -22,7 +26,7 @@
2226
@doc(storage_options=generic._shared_docs["storage_options"])
2327
def to_feather(
2428
df: DataFrame,
25-
path: FilePathOrBuffer[AnyStr],
29+
path: FilePathOrBuffer[bytes],
2630
storage_options: StorageOptions = None,
2731
**kwargs,
2832
):
@@ -89,7 +93,10 @@ def to_feather(
8993

9094
@doc(storage_options=generic._shared_docs["storage_options"])
9195
def read_feather(
92-
path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None
96+
path: FilePathOrBuffer[bytes],
97+
columns: Sequence[Hashable] | None = None,
98+
use_threads: bool = True,
99+
storage_options: StorageOptions = None,
93100
):
94101
"""
95102
Load a feather-format object from the file path.

pandas/io/html.py

Lines changed: 31 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
88

99
from collections import abc
1010
import numbers
11-
import os
1211
import re
1312
from typing import (
1413
Pattern,
@@ -29,6 +28,8 @@
2928
from pandas.core.frame import DataFrame
3029

3130
from pandas.io.common import (
31+
file_exists,
32+
get_handle,
3233
is_url,
3334
stringify_path,
3435
urlopen,
@@ -70,7 +71,7 @@ def _importers():
7071
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")
7172

7273

73-
def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str:
74+
def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
7475
"""
7576
Replace extra whitespace inside of a string with a single space.
7677
@@ -89,7 +90,7 @@ def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str:
8990
return regex.sub(" ", s.strip())
9091

9192

92-
def _get_skiprows(skiprows):
93+
def _get_skiprows(skiprows: int | Sequence[int] | slice | None):
9394
"""
9495
Get an iterator given an integer, slice or container.
9596
@@ -118,7 +119,7 @@ def _get_skiprows(skiprows):
118119
raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")
119120

120121

121-
def _read(obj):
122+
def _read(obj: bytes | FilePathOrBuffer, encoding: str | None) -> str | bytes:
122123
"""
123124
Try to read from a url, file or string.
124125
@@ -130,22 +131,26 @@ def _read(obj):
130131
-------
131132
raw_text : str
132133
"""
133-
if is_url(obj):
134-
with urlopen(obj) as url:
135-
text = url.read()
136-
elif hasattr(obj, "read"):
137-
text = obj.read()
134+
if (
135+
is_url(obj)
136+
or hasattr(obj, "read")
137+
or (isinstance(obj, str) and file_exists(obj))
138+
):
139+
# error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
140+
# Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
141+
# expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
142+
# BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
143+
with get_handle(
144+
obj, "r", encoding=encoding # type: ignore[arg-type]
145+
) as handles:
146+
text = handles.handle.read()
138147
elif isinstance(obj, (str, bytes)):
139148
text = obj
140-
try:
141-
if os.path.isfile(text):
142-
with open(text, "rb") as f:
143-
return f.read()
144-
except (TypeError, ValueError):
145-
pass
146149
else:
147150
raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
148-
return text
151+
# error: Incompatible return value type (got "Union[Any, bytes, None, str]",
152+
# expected "Union[str, bytes]")
153+
return text # type: ignore[return-value]
149154

150155

151156
class _HtmlFrameParser:
@@ -204,7 +209,14 @@ class _HtmlFrameParser:
204209
functionality.
205210
"""
206211

207-
def __init__(self, io, match, attrs, encoding, displayed_only):
212+
def __init__(
213+
self,
214+
io: FilePathOrBuffer,
215+
match: str | Pattern,
216+
attrs: dict[str, str] | None,
217+
encoding: str,
218+
displayed_only: bool,
219+
):
208220
self.io = io
209221
self.match = match
210222
self.attrs = attrs
@@ -590,7 +602,7 @@ def _parse_tfoot_tr(self, table):
590602
return table.select("tfoot tr")
591603

592604
def _setup_build_doc(self):
593-
raw_text = _read(self.io)
605+
raw_text = _read(self.io, self.encoding)
594606
if not raw_text:
595607
raise ValueError(f"No text parsed from document: {self.io}")
596608
return raw_text
@@ -653,9 +665,6 @@ class _LxmlFrameParser(_HtmlFrameParser):
653665
:class:`_HtmlFrameParser`.
654666
"""
655667

656-
def __init__(self, *args, **kwargs):
657-
super().__init__(*args, **kwargs)
658-
659668
def _text_getter(self, obj):
660669
return obj.text_content()
661670

@@ -818,7 +827,7 @@ def _data_to_frame(**kwargs):
818827
}
819828

820829

821-
def _parser_dispatch(flavor):
830+
def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
822831
"""
823832
Choose the parser based on the input flavor.
824833

pandas/tests/io/test_html.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1233,6 +1233,10 @@ def seek(self, offset):
12331233
def seekable(self):
12341234
return True
12351235

1236+
def __iter__(self):
1237+
# to fool `is_file_like`, should never end up here
1238+
assert False
1239+
12361240
good = MockFile("<table><tr><td>spam<br />eggs</td></tr></table>")
12371241
bad = MockFile("<table><tr><td>spam<foobr />eggs</td></tr></table>")
12381242

0 commit comments

Comments
 (0)