8
8
9
9
from collections import abc
10
10
import numbers
11
- import os
12
11
import re
13
12
from typing import (
14
13
Pattern ,
29
28
from pandas .core .frame import DataFrame
30
29
31
30
from pandas .io .common import (
31
+ file_exists ,
32
+ get_handle ,
32
33
is_url ,
33
34
stringify_path ,
34
35
urlopen ,
@@ -70,7 +71,7 @@ def _importers():
70
71
_RE_WHITESPACE = re .compile (r"[\r\n]+|\s{2,}" )
71
72
72
73
73
- def _remove_whitespace (s : str , regex = _RE_WHITESPACE ) -> str :
74
+ def _remove_whitespace (s : str , regex : Pattern = _RE_WHITESPACE ) -> str :
74
75
"""
75
76
Replace extra whitespace inside of a string with a single space.
76
77
@@ -89,7 +90,7 @@ def _remove_whitespace(s: str, regex=_RE_WHITESPACE) -> str:
89
90
return regex .sub (" " , s .strip ())
90
91
91
92
92
- def _get_skiprows (skiprows ):
93
+ def _get_skiprows (skiprows : int | Sequence [ int ] | slice | None ):
93
94
"""
94
95
Get an iterator given an integer, slice or container.
95
96
@@ -118,7 +119,7 @@ def _get_skiprows(skiprows):
118
119
raise TypeError (f"{ type (skiprows ).__name__ } is not a valid type for skipping rows" )
119
120
120
121
121
- def _read (obj ) :
122
+ def _read (obj : bytes | FilePathOrBuffer , encoding : str | None ) -> str | bytes :
122
123
"""
123
124
Try to read from a url, file or string.
124
125
@@ -130,22 +131,26 @@ def _read(obj):
130
131
-------
131
132
raw_text : str
132
133
"""
133
- if is_url (obj ):
134
- with urlopen (obj ) as url :
135
- text = url .read ()
136
- elif hasattr (obj , "read" ):
137
- text = obj .read ()
134
+ if (
135
+ is_url (obj )
136
+ or hasattr (obj , "read" )
137
+ or (isinstance (obj , str ) and file_exists (obj ))
138
+ ):
139
+ # error: Argument 1 to "get_handle" has incompatible type "Union[str, bytes,
140
+ # Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]";
141
+ # expected "Union[PathLike[str], Union[str, Union[IO[Any], RawIOBase,
142
+ # BufferedIOBase, TextIOBase, TextIOWrapper, mmap]]]"
143
+ with get_handle (
144
+ obj , "r" , encoding = encoding # type: ignore[arg-type]
145
+ ) as handles :
146
+ text = handles .handle .read ()
138
147
elif isinstance (obj , (str , bytes )):
139
148
text = obj
140
- try :
141
- if os .path .isfile (text ):
142
- with open (text , "rb" ) as f :
143
- return f .read ()
144
- except (TypeError , ValueError ):
145
- pass
146
149
else :
147
150
raise TypeError (f"Cannot read object of type '{ type (obj ).__name__ } '" )
148
- return text
151
+ # error: Incompatible return value type (got "Union[Any, bytes, None, str]",
152
+ # expected "Union[str, bytes]")
153
+ return text # type: ignore[return-value]
149
154
150
155
151
156
class _HtmlFrameParser :
@@ -204,7 +209,14 @@ class _HtmlFrameParser:
204
209
functionality.
205
210
"""
206
211
207
- def __init__ (self , io , match , attrs , encoding , displayed_only ):
212
+ def __init__ (
213
+ self ,
214
+ io : FilePathOrBuffer ,
215
+ match : str | Pattern ,
216
+ attrs : dict [str , str ] | None ,
217
+ encoding : str ,
218
+ displayed_only : bool ,
219
+ ):
208
220
self .io = io
209
221
self .match = match
210
222
self .attrs = attrs
@@ -590,7 +602,7 @@ def _parse_tfoot_tr(self, table):
590
602
return table .select ("tfoot tr" )
591
603
592
604
def _setup_build_doc (self ):
593
- raw_text = _read (self .io )
605
+ raw_text = _read (self .io , self . encoding )
594
606
if not raw_text :
595
607
raise ValueError (f"No text parsed from document: { self .io } " )
596
608
return raw_text
@@ -653,9 +665,6 @@ class _LxmlFrameParser(_HtmlFrameParser):
653
665
:class:`_HtmlFrameParser`.
654
666
"""
655
667
656
- def __init__ (self , * args , ** kwargs ):
657
- super ().__init__ (* args , ** kwargs )
658
-
659
668
def _text_getter (self , obj ):
660
669
return obj .text_content ()
661
670
@@ -818,7 +827,7 @@ def _data_to_frame(**kwargs):
818
827
}
819
828
820
829
821
- def _parser_dispatch (flavor ) :
830
+ def _parser_dispatch (flavor : str | None ) -> type [ _HtmlFrameParser ] :
822
831
"""
823
832
Choose the parser based on the input flavor.
824
833
0 commit comments