Skip to content

Rewrite internal _GzipReader implementation in C for extra speed. #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Dec 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ Changelog

version 0.4.0-dev
-----------------
+ The internal ``gzip_ng._GzipReader`` has been rewritten in C. As a result the
overhead of decompressing files has been significantly reduced.
+ The ``gzip_ng._GzipReader`` in C is now used in ``gzip_ng.decompress``. The
``_GzipReader`` can also read from objects that support the buffer protocol.
This has reduced overhead significantly.
+ Fix some unclosed buffer errors in the gzip_ng CLI.

version 0.3.0
Expand Down
156 changes: 10 additions & 146 deletions src/zlib_ng/gzip_ng.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
import struct
import sys
import time
import _compression # noqa: I201 # Not third-party

from . import zlib_ng
from .zlib_ng import _GzipReader

__all__ = ["GzipFile", "open", "compress", "decompress", "BadGzipFile",
"READ_BUFFER_SIZE"]
Expand All @@ -36,19 +36,14 @@
_COMPRESS_LEVEL_TRADEOFF = zlib_ng.Z_DEFAULT_COMPRESSION
_COMPRESS_LEVEL_BEST = zlib_ng.Z_BEST_COMPRESSION

#: The amount of data that is read in at once when decompressing a file.
#: Increasing this value may increase performance.
#: 128K is also the size used by pigz and cat to read files from the
# filesystem.
READ_BUFFER_SIZE = 128 * 1024
# The amount of data that is read in at once when decompressing a file.
# Increasing this value may increase performance.
READ_BUFFER_SIZE = 512 * 1024

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
READ, WRITE = 1, 2

try:
BadGzipFile = gzip.BadGzipFile # type: ignore
except AttributeError: # Versions lower than 3.8 do not have BadGzipFile
BadGzipFile = OSError # type: ignore
BadGzipFile = gzip.BadGzipFile # type: ignore


# The open method was copied from the CPython source with minor adjustments.
Expand Down Expand Up @@ -149,7 +144,7 @@ def __init__(self, filename=None, mode=None,
zlib_ng.DEF_MEM_LEVEL,
0)
if self.mode == READ:
raw = _GzipNGReader(self.fileobj)
raw = _GzipReader(self.fileobj, READ_BUFFER_SIZE)
self._buffer = io.BufferedReader(raw)

def __repr__(self):
Expand Down Expand Up @@ -180,124 +175,9 @@ def write(self, data):
return length


class _GzipNGReader(gzip._GzipReader):
    """Pure-Python reader for (possibly multi-member) gzip streams.

    Subclasses ``gzip._GzipReader`` but plugs in zlib-ng's
    ``_ZlibDecompressor`` (raw deflate, ``wbits=-MAX_WBITS``) so the
    actual inflation is done by zlib-ng instead of stdlib zlib.
    """

    def __init__(self, fp):
        # Call the init method of gzip._GzipReader's parent here.
        # It is not very invasive and allows us to override _PaddedFile
        _compression.DecompressReader.__init__(
            self, gzip._PaddedFile(fp), zlib_ng._ZlibDecompressor,
            wbits=-zlib_ng.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def read(self, size=-1):
        """Return up to *size* decompressed bytes (all data if size < 0).

        Transparently crosses gzip member boundaries: when one member's
        deflate stream ends, the CRC32/ISIZE trailer is verified via
        ``_read_eof`` and the next member's header is parsed before
        decompression continues.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    # No further member: remember the final stream size.
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            if self._decompressor.needs_input:
                buf = self._fp.read(READ_BUFFER_SIZE)
                uncompress = self._decompressor.decompress(buf, size)
            else:
                uncompress = self._decompressor.decompress(b"", size)
            if self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            # NOTE(review): `buf` is only bound when needs_input was true
            # on this iteration; this mirrors CPython's gzip module, which
            # assumes decompress(b"", size) on buffered input never yields
            # b"" before EOF — confirm against upstream.
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        # Running CRC/size bookkeeping used by _read_eof() to validate
        # the member trailer.
        self._crc = zlib_ng.crc32(uncompress, self._crc)
        self._stream_size += len(uncompress)
        self._pos += len(uncompress)
        return uncompress


# Aliases for improved compatibility with CPython gzip module.
# Callers that expect the stdlib names (gzip.GzipFile / gzip._GzipReader)
# can use these drop-in equivalents.
GzipFile = GzipNGFile
_GzipReader = _GzipNGReader


def _read_exact(fp, n):
'''Read exactly *n* bytes from `fp`
This method is required because fp may be unbuffered,
i.e. return short reads.
'''
data = fp.read(n)
while len(data) < n:
b = fp.read(n - len(data))
if not b:
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
data += b
return data


def _read_gzip_header(fp):
    """Read a gzip header from `fp` and advance past the end of the header.

    Returns the member's last mtime if a header was present, or None when
    `fp` is already at EOF.  Raises BadGzipFile on a bad magic number or
    an unknown compression method.
    """
    magic = fp.read(2)
    if magic == b'':
        # Clean EOF: no further gzip member.
        return None

    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    # Fixed part of the header: CM, FLG, MTIME (XFL and OS are skipped).
    method, flag, last_mtime = struct.unpack("<BBIxx", _read_exact(fp, 8))
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    if flag & FEXTRA:
        # Read & discard the extra field, if present.
        extra_len, = struct.unpack("<H", _read_exact(fp, 2))
        _read_exact(fp, extra_len)
    # FNAME and FCOMMENT are zero-terminated strings; skip each if present.
    for text_flag in (FNAME, FCOMMENT):
        if flag & text_flag:
            while True:
                char = fp.read(1)
                if not char or char == b'\000':
                    break
    if flag & FHCRC:
        # Read & discard the 16-bit header CRC.
        _read_exact(fp, 2)
    return last_mtime
_GzipNGReader = _GzipReader


def _create_simple_gzip_header(compresslevel: int,
Expand Down Expand Up @@ -342,25 +222,9 @@ def decompress(data):
"""Decompress a gzip compressed string in one shot.
Return the decompressed string.
"""
decompressed_members = []
while True:
fp = io.BytesIO(data)
if _read_gzip_header(fp) is None:
return b"".join(decompressed_members)
# Use a zlib raw deflate compressor
do = zlib_ng.decompressobj(wbits=-zlib_ng.MAX_WBITS)
# Read all the data except the header
decompressed = do.decompress(data[fp.tell():])
if not do.eof or len(do.unused_data) < 8:
raise EOFError("Compressed file ended before the end-of-stream "
"marker was reached")
crc, length = struct.unpack("<II", do.unused_data[:8])
if crc != zlib_ng.crc32(decompressed):
raise BadGzipFile("CRC check failed")
if length != (len(decompressed) & 0xffffffff):
raise BadGzipFile("Incorrect length of data produced")
decompressed_members.append(decompressed)
data = do.unused_data[8:].lstrip(b"\x00")
fp = io.BytesIO(data)
reader = _GzipReader(fp, max(len(data), 16))
return reader.readall()


def _argument_parser():
Expand Down
15 changes: 15 additions & 0 deletions src/zlib_ng/zlib_ng.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# This file is part of python-zlib-ng which is distributed under the
# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2.

import typing

MAX_WBITS: int
DEFLATED: int
DEF_MEM_LEVEL: int
Expand Down Expand Up @@ -77,3 +79,16 @@ class _ZlibDecompressor:
zdict=None): ...

def decompress(self, __data, max_length=-1) -> bytes: ...

class _GzipReader:
    """Type stub for the C-implemented gzip stream reader.

    Presents a read-only, file-like interface over the decompressed
    contents of the gzip data read from *fp*.
    """
    # buffersize: amount of compressed data fetched from `fp` per read —
    # presumably a performance knob only; verify against the C source.
    def __init__(self, fp: typing.BinaryIO, buffersize: int = 32 * 1024): ...
    def readinto(self, obj) -> int: ...
    def readable(self) -> bool: ...
    def writable(self) -> bool: ...
    def seekable(self) -> bool: ...
    def tell(self) -> int: ...
    def seek(self, offset: int, whence: int): ...
    def close(self): ...
    def readall(self) -> bytes: ...
    def read(self, __size: int): ...
    def flush(self): ...
Loading