ENH: add gzip/bz2 decompression. tweak buffer_lines heuristic

wesm · wesm · commit 0f5fb558ef7c · 2012-10-31T15:53:22.000-04:00
diff --git a/.travis.yml b/.travis.yml
@@ -1,14 +1,12 @@
 language: python
 
 python:
-  - 2.5
   - 2.6
   - 2.7
   - 3.1
   - 3.2
 
 install:
-  - "if [[ $TRAVIS_PYTHON_VERSION == '2.5' ]]; then pip install --use-mirrors simplejson; fi"
   - pip install --use-mirrors cython numpy nose pytz python-dateutil
 
 script:
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -41,6 +41,8 @@ class DateConversionError(Exception):
     is expected. For instance, a local file could be
     file ://localhost/path/to/table.csv
 %s
+compression : {'gzip', 'bz2', None}, default None
+    For on-the-fly decompression of on-disk data
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details
@@ -172,15 +174,6 @@ def _read(filepath_or_buffer, kwds):
             bytes = filepath_or_buffer.read()
             filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
 
-    if hasattr(filepath_or_buffer, 'read'):
-        f = filepath_or_buffer
-    else:
-        try:
-            # universal newline mode
-            f = com._get_handle(filepath_or_buffer, 'U', encoding=encoding)
-        except Exception: # pragma: no cover
-            f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding)
-
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
             kwds['parse_dates'] = True
@@ -191,7 +184,7 @@ def _read(filepath_or_buffer, kwds):
     chunksize = kwds.get('chunksize', None)
 
     # Create the parser.
-    parser = TextFileReader(f, **kwds)
+    parser = TextFileReader(filepath_or_buffer, **kwds)
 
     if nrows is not None:
         return parser.read(nrows)
@@ -243,12 +236,14 @@ def _read(filepath_or_buffer, kwds):
     'compact_ints': False,
     'use_unsigned': False,
     'low_memory': True,
+    'memory_map': False,
     'buffer_lines': None,
     'error_bad_lines': True,
     'warn_bad_lines': True,
     'factorize': True,
     'dtype': None,
     'usecols': None,
+    'compression': None
 }
 
 _fwf_defaults = {
@@ -265,6 +260,7 @@ def _make_parser_function(name, sep=','):
     def parser_f(filepath_or_buffer,
                  sep=sep,
                  dialect=None,
+                 compression=None,
 
                  doublequote=True,
                  escapechar=None,
@@ -304,6 +300,7 @@ def parser_f(filepath_or_buffer,
                  dayfirst=False,
                  date_parser=None,
 
+                 memory_map=False,
                  nrows=None,
                  iterator=False,
                  chunksize=None,
@@ -319,6 +316,7 @@ def parser_f(filepath_or_buffer,
         kwds = dict(delimiter=delimiter,
                     engine=engine,
                     dialect=dialect,
+                    compression=compression,
 
                     doublequote=doublequote,
                     escapechar=escapechar,
@@ -350,6 +348,7 @@ def parser_f(filepath_or_buffer,
                     verbose=verbose,
                     encoding=encoding,
                     squeeze=squeeze,
+                    memory_map=memory_map,
 
                     na_filter=na_filter,
                     compact_ints=compact_ints,
@@ -1011,6 +1010,14 @@ def __init__(self, f, **kwds):
         self.comment = kwds['comment']
         self._comment_lines = []
 
+
+        if isinstance(f, basestring):
+            try:
+                # universal newline mode
+                f = com._get_handle(f, 'U', encoding=self.encoding)
+            except Exception: # pragma: no cover
+                f = com._get_handle(f, 'r', encoding=self.encoding)
+
         if hasattr(f, 'readline'):
             self._make_reader(f)
         else:
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1702,6 +1702,41 @@ def test_pure_python_failover(self):
         expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
         tm.assert_frame_equal(result, expected)
 
+    def test_decompression(self):
+        # gzip- and bz2-compressed copies of csv1 must parse to the same
+        # frame as the plain file when read with compression=...
+        import bz2, gzip, os
+
+        with open(self.csv1, 'rb') as fh:
+            data = fh.read()
+        expected = self.read_csv(self.csv1)
+
+        try:
+            tmp = gzip.GzipFile('__tmp__', mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv('__tmp__', compression='gzip')
+            tm.assert_frame_equal(result, expected)
+        finally:
+            # a gzip failure skips the bz2 stage, so clean up here too
+            try:
+                os.remove('__tmp__')
+            except OSError:
+                pass
+
+        try:
+            tmp = bz2.BZ2File('__tmp__', mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv('__tmp__', compression='bz2')
+            tm.assert_frame_equal(result, expected)
+        finally:
+            try:
+                os.remove('__tmp__')
+            except OSError:
+                pass
+
+
 class TestParseSQL(unittest.TestCase):
 
     def test_convert_sql_column_floats(self):
diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx
@@ -241,6 +241,7 @@ cdef class TextReader:
         object compact_ints, use_unsigned
         object dtype
         object encoding
+        object compression
         set noconvert, usecols
 
     def __cinit__(self, source,
@@ -253,6 +254,8 @@ cdef class TextReader:
                   tokenize_chunksize=DEFAULT_CHUNKSIZE,
                   delim_whitespace=False,
 
+                  compression=None,
+
                   converters=None,
 
                   factorize=True,
@@ -290,6 +293,7 @@ cdef class TextReader:
         # For timekeeping
         self.clocks = []
 
+        self.compression = compression
         self._setup_parser_source(source)
         parser_set_default_options(self.parser)
 
@@ -410,7 +414,7 @@ cdef class TextReader:
         self.header, self.table_width = self._get_header()
 
         # compute buffer_lines as function of table width
-        heuristic = 2**18 // self.table_width
+        heuristic = 2**20 // self.table_width
         self.buffer_lines = 1
         while self.buffer_lines * 2< heuristic:
             self.buffer_lines *= 2
@@ -436,6 +440,17 @@ cdef class TextReader:
             int status
             void *ptr
 
+        if isinstance(source, basestring) and self.compression:
+            if self.compression == 'gzip':
+                import gzip
+                source = gzip.GzipFile(source, 'rb')
+            elif self.compression == 'bz2':
+                import bz2
+                source = bz2.BZ2File(source, 'rb')
+            else:
+                raise ValueError('Unrecognized compression type: %s' %
+                                 self.compression)
+
         if isinstance(source, basestring):
             if not isinstance(source, bytes):
                 source = source.encode('utf-8')