From d16c631248aaf57771beff1b565d7168069ba680 Mon Sep 17 00:00:00 2001 From: Patrick O'Brien Date: Tue, 30 Apr 2013 22:37:43 -0400 Subject: [PATCH 1/3] Support for s3 file handling --- pandas/io/parsers.py | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 161e7a521b997..d8f99c083e832 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -188,6 +188,12 @@ def _is_url(url): except: return False +def _is_s3_url(url): + """ Check for an s3 url """ + try: + return urlparse.urlparse(url).scheme == 's3' + except: + return False def _read(filepath_or_buffer, kwds): "Generic reader of line files." @@ -196,17 +202,32 @@ def _read(filepath_or_buffer, kwds): if skipfooter is not None: kwds['skip_footer'] = skipfooter - if isinstance(filepath_or_buffer, basestring) and _is_url(filepath_or_buffer): - from urllib2 import urlopen - filepath_or_buffer = urlopen(filepath_or_buffer) - if py3compat.PY3: # pragma: no cover - if encoding: - errors = 'strict' - else: - errors = 'replace' - encoding = 'utf-8' - bytes = filepath_or_buffer.read() - filepath_or_buffer = StringIO(bytes.decode(encoding, errors)) + if isinstance(filepath_or_buffer, basestring): + if _is_url(filepath_or_buffer): + from urllib2 import urlopen + filepath_or_buffer = urlopen(filepath_or_buffer) + if py3compat.PY3: # pragma: no cover + if encoding: + errors = 'strict' + else: + errors = 'replace' + encoding = 'utf-8' + bytes = filepath_or_buffer.read() + filepath_or_buffer = StringIO(bytes.decode(encoding, errors)) + + if _is_s3_url(filepath_or_buffer): + try: + import boto + except: + raise ImportError("boto is required to handle s3 files") + # Assuming AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY + # are environment variables + parsed_url = urlparse.urlparse(filepath_or_buffer) + conn = boto.connect_s3() + b = conn.get_bucket(parsed_url.netloc) + k = boto.s3.key.Key(b) + k.key = parsed_url.path 
+ filepath_or_buffer = StringIO(k.get_contents_as_string()) if kwds.get('date_parser', None) is not None: if isinstance(kwds['parse_dates'], bool): From e93838601d6ae106702b4b5f90ddf30ea0c03d39 Mon Sep 17 00:00:00 2001 From: Patrick O'Brien Date: Mon, 6 May 2013 15:10:27 -0400 Subject: [PATCH 2/3] Update README and RELEASE for new S3 support --- README.rst | 1 + RELEASE.rst | 1 + 2 files changed, 2 insertions(+) diff --git a/README.rst b/README.rst index c9b70f07b0862..ea713006c7189 100644 --- a/README.rst +++ b/README.rst @@ -90,6 +90,7 @@ Optional dependencies * openpyxl version 1.6.1 or higher, for writing .xlsx files * xlrd >= 0.9.0 * Needed for Excel I/O + * `boto <https://pypi.python.org/pypi/boto>`__: necessary for Amazon S3 access. Installation from sources diff --git a/RELEASE.rst b/RELEASE.rst index 77e8e85db6a76..0f52babf26ff0 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -32,6 +32,7 @@ pandas 0.11.1 - pd.read_html() can now parse HTML string, files or urls and return dataframes courtesy of @cpcloud. (GH3477_) + - Support for reading Amazon S3 files. (GH3504_) **Improvements to existing features** From f06b43c5c2039ea150cc18603b3ff833ea1e1d25 Mon Sep 17 00:00:00 2001 From: Patrick O'Brien Date: Fri, 10 May 2013 10:05:08 -0400 Subject: [PATCH 3/3] Update docs noting handling of s3 locations. --- doc/source/io.rst | 5 +++-- pandas/io/parsers.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 9001ae393d552..8da3d422c50be 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -40,8 +40,9 @@ for some advanced strategies They can take a number of arguments: - - ``filepath_or_buffer``: Either a string path to a file, or any object with a - ``read`` method (such as an open file or ``StringIO``). + - ``filepath_or_buffer``: Either a string path to a file, url + (including http, ftp, and s3 locations), or any object with a ``read`` + method (such as an open file or ``StringIO``).
- ``sep`` or ``delimiter``: A delimiter / separator to split fields on. `read_csv` is capable of inferring the delimiter automatically in some cases by "sniffing." The separator may be specified as a regular diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index d8f99c083e832..1430843998843 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -34,7 +34,7 @@ class DateConversionError(Exception): Parameters ---------- filepath_or_buffer : string or file handle / StringIO. The string could be - a URL. Valid URL schemes include http, ftp, and file. For file URLs, a host + a URL. Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.csv %s