Skip to content

Commit f2651b9

Browse files
committed
ENH: can ungzip data from file handle in parsers. close #2593
1 parent 7399f63 commit f2651b9

File tree

3 files changed

+28
-4
lines changed

3 files changed

+28
-4
lines changed

pandas/io/parsers.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,6 +1085,16 @@ def TextParser(*args, **kwds):
10851085
def count_empty_vals(vals):
    """Return the number of entries in ``vals`` that are '' or None."""
    n_empty = 0
    for val in vals:
        # Same test (and same evaluation order) as before: empty string
        # first, then the None identity check.
        if val == '' or val is None:
            n_empty += 1
    return n_empty
10871087

1088+
def _wrap_compressed(f, compression):
1089+
compression = compression.lower()
1090+
if compression == 'gzip':
1091+
import gzip
1092+
return gzip.GzipFile(fileobj=f)
1093+
elif compression == 'bz2':
1094+
raise ValueError('Python cannot read bz2 data from file handle')
1095+
else:
1096+
raise ValueError('do not recognize compression method %s'
1097+
% compression)
10881098

10891099
class PythonParser(ParserBase):
10901100

@@ -1130,6 +1140,8 @@ def __init__(self, f, **kwds):
11301140
if isinstance(f, basestring):
11311141
f = com._get_handle(f, 'r', encoding=self.encoding,
11321142
compression=self.compression)
1143+
elif self.compression:
1144+
f = _wrap_compressed(f, self.compression)
11331145

11341146
if hasattr(f, 'readline'):
11351147
self._make_reader(f)

pandas/io/tests/test_parsers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1861,6 +1861,9 @@ def test_decompression(self):
18611861

18621862
result = self.read_csv('__tmp__', compression='gzip')
18631863
tm.assert_frame_equal(result, expected)
1864+
1865+
result = self.read_csv(open('__tmp__', 'rb'), compression='gzip')
1866+
tm.assert_frame_equal(result, expected)
18641867
finally:
18651868
# try:
18661869
# os.remove('__tmp__')
@@ -1876,6 +1879,9 @@ def test_decompression(self):
18761879
result = self.read_csv('__tmp__', compression='bz2')
18771880
tm.assert_frame_equal(result, expected)
18781881

1882+
# result = self.read_csv(open('__tmp__', 'rb'), compression='bz2')
1883+
# tm.assert_frame_equal(result, expected)
1884+
18791885
self.assertRaises(ValueError, self.read_csv,
18801886
'__tmp__', compression='bz3')
18811887
finally:

pandas/src/parser.pyx

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -469,13 +469,20 @@ cdef class TextReader:
469469
self.parser.cb_io = NULL
470470
self.parser.cb_cleanup = NULL
471471

472-
if isinstance(source, basestring) and self.compression:
472+
if self.compression:
473473
if self.compression == 'gzip':
474474
import gzip
475-
source = gzip.GzipFile(source, 'rb')
475+
if isinstance(source, basestring):
476+
source = gzip.GzipFile(source, 'rb')
477+
else:
478+
source = gzip.GzipFile(fileobj=source)
476479
elif self.compression == 'bz2':
477480
import bz2
478-
source = bz2.BZ2File(source, 'rb')
481+
if isinstance(source, basestring):
482+
source = bz2.BZ2File(source, 'rb')
483+
else:
484+
raise ValueError('Python cannot read bz2 from open file '
485+
'handle')
479486
else:
480487
raise ValueError('Unrecognized compression type: %s' %
481488
self.compression)
@@ -1789,4 +1796,3 @@ def _maybe_encode(values):
17891796
if values is None:
17901797
return []
17911798
return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values]
1792-

0 commit comments

Comments
 (0)