pandas-dev · ocefpaf · Oct 9, 2019 · Oct 9, 2019 · Oct 10, 2019 · Oct 10, 2019
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -183,13 +183,46 @@ def is_gcs_url(url) -> bool:
     return parse_url(url).scheme in ["gcs", "gs"]
 
 
+def _urlopen(*args, **kwargs):
+    """
+    Fetch a URL and return its payload as an in-memory buffer.
+
+    ``requests`` is used when it can be imported; otherwise the stdlib based
+    ``urlopen`` wrapper is used as a fallback.
+
+    Parameters
+    ----------
+    *args
+        Positional arguments; the first one is the URL to fetch.
+    **kwargs
+        ``session`` (optional) is a ``requests.sessions.Session`` used to
+        perform the request.  The remaining keyword arguments are only
+        forwarded to ``urlopen`` in the fallback path.
+
+    Returns
+    -------
+    reader : BytesIO
+        Buffer holding the response body.
+    compression : str or None
+        ``"gzip"`` when the fallback response carries a
+        ``Content-Encoding: gzip`` header, otherwise None.
+
+    Raises
+    ------
+    ValueError
+        If ``session`` is truthy but is not a ``requests.sessions.Session``.
+    """
+    # Pop ``session`` up front so both code paths see the same kwargs.
+    session = kwargs.pop("session", None)
+    compression = None
+
+    # Keep the ``try`` minimal: only a missing ``requests`` install should
+    # trigger the fallback, not an ImportError raised while fetching.
+    try:
+        import requests
+    except ImportError:
+        requests = None
+
+    if requests is None:
+        # ``session`` is passed explicitly so the wrapper can discard it.
+        r = urlopen(*args, session=session, **kwargs)
+        try:
+            content = r.read()
+            content_encoding = r.headers.get("Content-Encoding", None)
+        finally:
+            r.close()
+        if content_encoding == "gzip":
+            # Override compression based on Content-Encoding header.
+            compression = "gzip"
+    else:
+        url = args[0]
+        if session:
+            if not isinstance(session, requests.sessions.Session):
+                raise ValueError(
+                    "Expected a requests.sessions.Session object, "
+                    "got {!r}".format(session)
+                )
+            r = session.get(url)
+        else:
+            r = requests.get(url)
+        try:
+            r.raise_for_status()
+            # NOTE(review): requests is documented to decode gzip-encoded
+            # bodies itself, so ``compression`` is left as None here.
+            content = r.content
+        finally:
+            # Release the connection even when the status check raises.
+            r.close()
+
+    return BytesIO(content), compression
+
+
 def urlopen(*args, **kwargs):
     """
     Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
     the stdlib.
     """
     import urllib.request
 
+    # ``session`` is only meaningful to the requests-based code path, so it
+    # must never reach urllib.  Default to None because not every caller
+    # passes the keyword; a bare ``pop("session")`` would raise KeyError.
+    kwargs.pop("session", None)
+
     return urllib.request.urlopen(*args, **kwargs)
 
 
@@ -198,6 +231,7 @@ def get_filepath_or_buffer(
     encoding: Optional[str] = None,
     compression: Optional[str] = None,
     mode: Optional[str] = None,
+    session=None,
 ):
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
@@ -221,13 +255,7 @@ def get_filepath_or_buffer(
     filepath_or_buffer = _stringify_path(filepath_or_buffer)
 
     if isinstance(filepath_or_buffer, str) and _is_url(filepath_or_buffer):
-        req = urlopen(filepath_or_buffer)
-        content_encoding = req.headers.get("Content-Encoding", None)
-        if content_encoding == "gzip":
-            # Override compression based on Content-Encoding header
-            compression = "gzip"
-        reader = BytesIO(req.read())
-        req.close()
+        reader, url_compression = _urlopen(filepath_or_buffer, session=session)
+        if url_compression is not None:
+            # Only override the caller's ``compression`` when the response
+            # reports one; otherwise keep the value that was passed in.
+            compression = url_compression
         return reader, encoding, compression, True
 
     if is_s3_url(filepath_or_buffer):

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
@@ -20,7 +20,7 @@
     _stringify_path,
     _validate_header_arg,
     get_filepath_or_buffer,
-    urlopen,
+    _urlopen,
 )
 from pandas.io.excel._util import (
     _fill_mi_header,
@@ -336,10 +336,10 @@ def read_excel(
 
 
 class _BaseExcelReader(metaclass=abc.ABCMeta):
-    def __init__(self, filepath_or_buffer):
+    def __init__(self, filepath_or_buffer, session=None):
         # If filepath_or_buffer is a url, load the data into a BytesIO
         if _is_url(filepath_or_buffer):
-            filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
+            filepath_or_buffer, _ = _urlopen(filepath_or_buffer, session=session)
         elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
             filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)
 

diff --git a/pandas/io/html.py b/pandas/io/html.py
@@ -17,7 +17,7 @@
 
 from pandas import Series
 
-from pandas.io.common import _is_url, _validate_header_arg, urlopen
+from pandas.io.common import _is_url, _urlopen, _validate_header_arg
 from pandas.io.formats.printing import pprint_thing
 from pandas.io.parsers import TextParser
 
@@ -109,7 +109,7 @@ def _get_skiprows(skiprows):
     )
 
 
-def _read(obj):
+def _read(obj, session=None):
     """
     Try to read from a url, file or string.
 
@@ -122,8 +122,7 @@ def _read(obj):
     raw_text : str
     """
     if _is_url(obj):
-        with urlopen(obj) as url:
-            text = url.read()
+        reader, _ = _urlopen(obj, session=session)
+        # Return the raw payload, not the buffer object, exactly as the
+        # urlopen-based code did; callers test the result for emptiness.
+        text = reader.read()
     elif hasattr(obj, "read"):
         text = obj.read()
     elif isinstance(obj, (str, bytes)):
@@ -199,12 +198,13 @@ class _HtmlFrameParser:
     functionality.
     """
 
-    def __init__(self, io, match, attrs, encoding, displayed_only):
+    def __init__(self, io, match, attrs, encoding, displayed_only, session=None):
+        # Store the parsing options on the instance.  ``session`` is optional
+        # (default None) and is read back as ``self.session`` when the
+        # document is fetched.
         self.io = io
         self.match = match
         self.attrs = attrs
         self.encoding = encoding
         self.displayed_only = displayed_only
+        self.session = session
 
     def parse_tables(self):
         """
@@ -588,7 +588,7 @@ def _parse_tfoot_tr(self, table):
         return table.select("tfoot tr")
 
     def _setup_build_doc(self):
+        # Load the raw document from ``self.io`` via ``_read``, passing along
+        # the optional session, and fail early when nothing was read.
-        raw_text = _read(self.io)
+        raw_text = _read(self.io, self.session)
         if not raw_text:
             raise ValueError("No text parsed from document: {doc}".format(doc=self.io))
         return raw_text
@@ -714,7 +714,7 @@ def _build_doc(self):
 
         try:
             if _is_url(self.io):
-                with urlopen(self.io) as f:
+                # ``_urlopen`` returns a ``(buffer, compression)`` tuple, which
+                # is not a context manager; unpack it and manage the buffer.
+                reader, _ = _urlopen(self.io, session=self.session)
+                with reader as f:
                     r = parse(f, parser=parser)
             else:
                 # try to parse the input in the simplest way
@@ -891,9 +891,10 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
 
     # hack around python 3 deleting the exception variable
     retained = None
+    session = kwargs.get("session", None)
     for flav in flavor:
         parser = _parser_dispatch(flav)
-        p = parser(io, compiled_match, attrs, encoding, displayed_only)
+        p = parser(io, compiled_match, attrs, encoding, displayed_only, session)
 
         try:
             tables = p.parse_tables()
@@ -943,6 +944,7 @@ def read_html(
     na_values=None,
     keep_default_na=True,
     displayed_only=True,
+    session=None,
 ):
     r"""
     Read HTML tables into a ``list`` of ``DataFrame`` objects.

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
@@ -366,6 +366,7 @@ def read_json(
     lines=False,
     chunksize=None,
     compression="infer",
+    session=None,
 ):
     """
     Convert a JSON string to pandas object.
@@ -582,7 +583,7 @@ def read_json(
 
     compression = _infer_compression(path_or_buf, compression)
     filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
-        path_or_buf, encoding=encoding, compression=compression
+        path_or_buf, encoding=encoding, compression=compression, session=session
     )
 
     json_reader = JsonReader(

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -342,6 +342,9 @@
     values. The options are `None` for the ordinary converter,
     `high` for the high-precision converter, and `round_trip` for the
     round-trip converter.
+session : requests.Session, optional
+    A requests session whose configuration is used when fetching a remote
+    file (requires the requests library).
 
 Returns
 -------
@@ -423,6 +426,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
         encoding = re.sub("_", "-", encoding).lower()
         kwds["encoding"] = encoding
 
+    session = kwds.get("session", None)
     compression = kwds.get("compression", "infer")
     compression = _infer_compression(filepath_or_buffer, compression)
 
@@ -431,7 +435,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
     # though mypy handling of conditional imports is difficult.
     # See https://github.com/python/mypy/issues/1297
     fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
-        filepath_or_buffer, encoding, compression
+        filepath_or_buffer, encoding, compression, session=session
     )
     kwds["compression"] = compression
 
@@ -588,6 +592,7 @@ def parser_f(
         low_memory=_c_parser_defaults["low_memory"],
         memory_map=False,
         float_precision=None,
+        session=None,
     ):
 
         # gh-23761
@@ -674,6 +679,7 @@ def parser_f(
             mangle_dupe_cols=mangle_dupe_cols,
             infer_datetime_format=infer_datetime_format,
             skip_blank_lines=skip_blank_lines,
+            session=session,
         )
 
         return _read(filepath_or_buffer, kwds)