Skip to content

Commit 0f5fb55

Browse files
committed
ENH: add gzip/bz2 decompression. tweak buffer_lines heuristic
1 parent e9bc234 commit 0f5fb55

File tree

4 files changed

+68
-13
lines changed

4 files changed

+68
-13
lines changed

.travis.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,12 @@
11
language: python
22

33
python:
4-
- 2.5
54
- 2.6
65
- 2.7
76
- 3.1
87
- 3.2
98

109
install:
11-
- "if [[ $TRAVIS_PYTHON_VERSION == '2.5' ]]; then pip install --use-mirrors simplejson; fi"
1210
- pip install --use-mirrors cython numpy nose pytz python-dateutil
1311

1412
script:

pandas/io/parsers.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ class DateConversionError(Exception):
4141
is expected. For instance, a local file could be
4242
file://localhost/path/to/table.csv
4343
%s
44+
compression : {'gzip', 'bz2', None}, default None
45+
For on-the-fly decompression of on-disk data
4446
dialect : string or csv.Dialect instance, default None
4547
If None defaults to Excel dialect. Ignored if sep longer than 1 char
4648
See csv.Dialect documentation for more details
@@ -172,15 +174,6 @@ def _read(filepath_or_buffer, kwds):
172174
bytes = filepath_or_buffer.read()
173175
filepath_or_buffer = StringIO(bytes.decode(encoding, errors))
174176

175-
if hasattr(filepath_or_buffer, 'read'):
176-
f = filepath_or_buffer
177-
else:
178-
try:
179-
# universal newline mode
180-
f = com._get_handle(filepath_or_buffer, 'U', encoding=encoding)
181-
except Exception: # pragma: no cover
182-
f = com._get_handle(filepath_or_buffer, 'r', encoding=encoding)
183-
184177
if kwds.get('date_parser', None) is not None:
185178
if isinstance(kwds['parse_dates'], bool):
186179
kwds['parse_dates'] = True
@@ -191,7 +184,7 @@ def _read(filepath_or_buffer, kwds):
191184
chunksize = kwds.get('chunksize', None)
192185

193186
# Create the parser.
194-
parser = TextFileReader(f, **kwds)
187+
parser = TextFileReader(filepath_or_buffer, **kwds)
195188

196189
if nrows is not None:
197190
return parser.read(nrows)
@@ -243,12 +236,14 @@ def _read(filepath_or_buffer, kwds):
243236
'compact_ints': False,
244237
'use_unsigned': False,
245238
'low_memory': True,
239+
'memory_map': False,
246240
'buffer_lines': None,
247241
'error_bad_lines': True,
248242
'warn_bad_lines': True,
249243
'factorize': True,
250244
'dtype': None,
251245
'usecols': None,
246+
'compression': None
252247
}
253248

254249
_fwf_defaults = {
@@ -265,6 +260,7 @@ def _make_parser_function(name, sep=','):
265260
def parser_f(filepath_or_buffer,
266261
sep=sep,
267262
dialect=None,
263+
compression=None,
268264

269265
doublequote=True,
270266
escapechar=None,
@@ -304,6 +300,7 @@ def parser_f(filepath_or_buffer,
304300
dayfirst=False,
305301
date_parser=None,
306302

303+
memory_map=False,
307304
nrows=None,
308305
iterator=False,
309306
chunksize=None,
@@ -319,6 +316,7 @@ def parser_f(filepath_or_buffer,
319316
kwds = dict(delimiter=delimiter,
320317
engine=engine,
321318
dialect=dialect,
319+
compression=compression,
322320

323321
doublequote=doublequote,
324322
escapechar=escapechar,
@@ -350,6 +348,7 @@ def parser_f(filepath_or_buffer,
350348
verbose=verbose,
351349
encoding=encoding,
352350
squeeze=squeeze,
351+
memory_map=memory_map,
353352

354353
na_filter=na_filter,
355354
compact_ints=compact_ints,
@@ -1011,6 +1010,14 @@ def __init__(self, f, **kwds):
10111010
self.comment = kwds['comment']
10121011
self._comment_lines = []
10131012

1013+
1014+
if isinstance(f, basestring):
1015+
try:
1016+
# universal newline mode
1017+
f = com._get_handle(f, 'U', encoding=self.encoding)
1018+
except Exception: # pragma: no cover
1019+
f = com._get_handle(f, 'r', encoding=self.encoding)
1020+
10141021
if hasattr(f, 'readline'):
10151022
self._make_reader(f)
10161023
else:

pandas/io/tests/test_parsers.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1702,6 +1702,41 @@ def test_pure_python_failover(self):
17021702
expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
17031703
tm.assert_frame_equal(result, expected)
17041704

1705+
def test_decompression(self):
1706+
data = open(self.csv1, 'rb').read()
1707+
1708+
expected = self.read_csv(self.csv1)
1709+
1710+
import gzip, bz2
1711+
1712+
try:
1713+
tmp = gzip.GzipFile('__tmp__', mode='wb')
1714+
tmp.write(data)
1715+
tmp.close()
1716+
1717+
result = self.read_csv('__tmp__', compression='gzip')
1718+
tm.assert_frame_equal(result, expected)
1719+
finally:
1720+
# try:
1721+
# os.remove('__tmp__')
1722+
# except:
1723+
# pass
1724+
pass
1725+
1726+
try:
1727+
tmp = bz2.BZ2File('__tmp__', mode='wb')
1728+
tmp.write(data)
1729+
tmp.close()
1730+
1731+
result = self.read_csv('__tmp__', compression='bz2')
1732+
tm.assert_frame_equal(result, expected)
1733+
finally:
1734+
try:
1735+
os.remove('__tmp__')
1736+
except:
1737+
pass
1738+
1739+
17051740
class TestParseSQL(unittest.TestCase):
17061741

17071742
def test_convert_sql_column_floats(self):

pandas/src/parser.pyx

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,7 @@ cdef class TextReader:
241241
object compact_ints, use_unsigned
242242
object dtype
243243
object encoding
244+
object compression
244245
set noconvert, usecols
245246

246247
def __cinit__(self, source,
@@ -253,6 +254,8 @@ cdef class TextReader:
253254
tokenize_chunksize=DEFAULT_CHUNKSIZE,
254255
delim_whitespace=False,
255256

257+
compression=None,
258+
256259
converters=None,
257260

258261
factorize=True,
@@ -290,6 +293,7 @@ cdef class TextReader:
290293
# For timekeeping
291294
self.clocks = []
292295

296+
self.compression = compression
293297
self._setup_parser_source(source)
294298
parser_set_default_options(self.parser)
295299

@@ -410,7 +414,7 @@ cdef class TextReader:
410414
self.header, self.table_width = self._get_header()
411415

412416
# compute buffer_lines as function of table width
413-
heuristic = 2**18 // self.table_width
417+
heuristic = 2**20 // self.table_width
414418
self.buffer_lines = 1
415419
while self.buffer_lines * 2< heuristic:
416420
self.buffer_lines *= 2
@@ -436,6 +440,17 @@ cdef class TextReader:
436440
int status
437441
void *ptr
438442

443+
if isinstance(source, basestring) and self.compression:
444+
if self.compression == 'gzip':
445+
import gzip
446+
source = gzip.GzipFile(source, 'rb')
447+
elif self.compression == 'bz2':
448+
import bz2
449+
source = bz2.BZ2File(source, 'rb')
450+
else:
451+
raise ValueError('Unrecognized compression type: %s' %
452+
self.compression)
453+
439454
if isinstance(source, basestring):
440455
if not isinstance(source, bytes):
441456
source = source.encode('utf-8')

0 commit comments

Comments
 (0)