Skip to content

Rewrite internal _GzipReader implementation in C for extra speed. #26

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Dec 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ Changelog

version 0.4.0-dev
-----------------
+ The internal ``gzip_ng._GzipReader`` has been rewritten in C. As a result the
overhead of decompressing files has been significantly reduced.
+ The ``gzip_ng._GzipReader`` in C is now used in ``gzip_ng.decompress``. The
``_GzipReader`` can also read from objects that support the buffer protocol.
This has reduced overhead significantly.
+ Fix some unclosed buffer errors in the gzip_ng CLI.

version 0.3.0
Expand Down
156 changes: 10 additions & 146 deletions src/zlib_ng/gzip_ng.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
import struct
import sys
import time
import _compression # noqa: I201 # Not third-party

from . import zlib_ng
from .zlib_ng import _GzipReader

__all__ = ["GzipFile", "open", "compress", "decompress", "BadGzipFile",
"READ_BUFFER_SIZE"]
Expand All @@ -36,19 +36,14 @@
_COMPRESS_LEVEL_TRADEOFF = zlib_ng.Z_DEFAULT_COMPRESSION
_COMPRESS_LEVEL_BEST = zlib_ng.Z_BEST_COMPRESSION

#: The amount of data that is read in at once when decompressing a file.
#: Increasing this value may increase performance.
#: 128K is also the size used by pigz and cat to read files from the
# filesystem.
READ_BUFFER_SIZE = 128 * 1024
# The amount of data that is read in at once when decompressing a file.
# Increasing this value may increase performance.
READ_BUFFER_SIZE = 512 * 1024

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
READ, WRITE = 1, 2

try:
BadGzipFile = gzip.BadGzipFile # type: ignore
except AttributeError: # Versions lower than 3.8 do not have BadGzipFile
BadGzipFile = OSError # type: ignore
BadGzipFile = gzip.BadGzipFile # type: ignore


# The open method was copied from the CPython source with minor adjustments.
Expand Down Expand Up @@ -149,7 +144,7 @@ def __init__(self, filename=None, mode=None,
zlib_ng.DEF_MEM_LEVEL,
0)
if self.mode == READ:
raw = _GzipNGReader(self.fileobj)
raw = _GzipReader(self.fileobj, READ_BUFFER_SIZE)
self._buffer = io.BufferedReader(raw)

def __repr__(self):
Expand Down Expand Up @@ -180,124 +175,9 @@ def write(self, data):
return length


class _GzipNGReader(gzip._GzipReader):
    """Pure-Python reader for (possibly multi-member) gzip streams.

    Subclasses ``gzip._GzipReader`` but plugs in zlib-ng's
    ``_ZlibDecompressor`` (raw deflate, ``wbits=-MAX_WBITS``) so the
    actual inflation is done by zlib-ng instead of stdlib zlib.
    """

    def __init__(self, fp):
        # Call the init method of gzip._GzipReader's parent here.
        # It is not very invasive and allows us to override _PaddedFile
        _compression.DecompressReader.__init__(
            self, gzip._PaddedFile(fp), zlib_ng._ZlibDecompressor,
            wbits=-zlib_ng.MAX_WBITS)
        # Set flag indicating start of a new member
        self._new_member = True
        self._last_mtime = None

    def read(self, size=-1):
        """Return up to *size* decompressed bytes (all data if size < 0).

        Transparently crosses gzip member boundaries: when one member's
        deflate stream ends, the CRC32/ISIZE trailer is verified via
        ``_read_eof`` and the next member's header is parsed before
        decompression continues.
        """
        if size < 0:
            return self.readall()
        # size=0 is special because decompress(max_length=0) is not supported
        if not size:
            return b""

        # For certain input data, a single
        # call to decompress() may not return
        # any data. In this case, retry until we get some data or reach EOF.
        while True:
            if self._decompressor.eof:
                # Ending case: we've come to the end of a member in the file,
                # so finish up this member, and read a new gzip header.
                # Check the CRC and file size, and set the flag so we read
                # a new member
                self._read_eof()
                self._new_member = True
                self._decompressor = self._decomp_factory(
                    **self._decomp_args)

            if self._new_member:
                # If the _new_member flag is set, we have to
                # jump to the next member, if there is one.
                self._init_read()
                if not self._read_gzip_header():
                    # No further member: remember the final stream size.
                    self._size = self._pos
                    return b""
                self._new_member = False

            # Read a chunk of data from the file
            if self._decompressor.needs_input:
                buf = self._fp.read(READ_BUFFER_SIZE)
                uncompress = self._decompressor.decompress(buf, size)
            else:
                uncompress = self._decompressor.decompress(b"", size)
            if self._decompressor.unused_data != b"":
                # Prepend the already read bytes to the fileobj so they can
                # be seen by _read_eof() and _read_gzip_header()
                self._fp.prepend(self._decompressor.unused_data)

            if uncompress != b"":
                break
            # NOTE(review): `buf` is only bound when needs_input was true
            # on this iteration; this mirrors CPython's gzip module, which
            # assumes decompress(b"", size) on buffered input never yields
            # b"" before EOF — confirm against upstream.
            if buf == b"":
                raise EOFError("Compressed file ended before the "
                               "end-of-stream marker was reached")

        # Running CRC/size bookkeeping used by _read_eof() to validate
        # the member trailer.
        self._crc = zlib_ng.crc32(uncompress, self._crc)
        self._stream_size += len(uncompress)
        self._pos += len(uncompress)
        return uncompress


# Aliases for improved compatibility with CPython gzip module.
# Callers that expect the stdlib names (gzip.GzipFile / gzip._GzipReader)
# can use these drop-in equivalents.
GzipFile = GzipNGFile
_GzipReader = _GzipNGReader


def _read_exact(fp, n):
'''Read exactly *n* bytes from `fp`
This method is required because fp may be unbuffered,
i.e. return short reads.
'''
data = fp.read(n)
while len(data) < n:
b = fp.read(n - len(data))
if not b:
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
data += b
return data


def _read_gzip_header(fp):
    """Read a gzip header from `fp` and advance past the end of the header.

    Returns the member's last mtime if a header was present, or None when
    `fp` is already at EOF.  Raises BadGzipFile on a bad magic number or
    an unknown compression method.
    """
    magic = fp.read(2)
    if magic == b'':
        # Clean EOF: no further gzip member.
        return None

    if magic != b'\037\213':
        raise BadGzipFile('Not a gzipped file (%r)' % magic)

    # Fixed part of the header: CM, FLG, MTIME (XFL and OS are skipped).
    method, flag, last_mtime = struct.unpack("<BBIxx", _read_exact(fp, 8))
    if method != 8:
        raise BadGzipFile('Unknown compression method')

    if flag & FEXTRA:
        # Read & discard the extra field, if present.
        extra_len, = struct.unpack("<H", _read_exact(fp, 2))
        _read_exact(fp, extra_len)
    # FNAME and FCOMMENT are zero-terminated strings; skip each if present.
    for text_flag in (FNAME, FCOMMENT):
        if flag & text_flag:
            while True:
                char = fp.read(1)
                if not char or char == b'\000':
                    break
    if flag & FHCRC:
        # Read & discard the 16-bit header CRC.
        _read_exact(fp, 2)
    return last_mtime
_GzipNGReader = _GzipReader


def _create_simple_gzip_header(compresslevel: int,
Expand Down Expand Up @@ -342,25 +222,9 @@ def decompress(data):
"""Decompress a gzip compressed string in one shot.
Return the decompressed string.
"""
decompressed_members = []
while True:
fp = io.BytesIO(data)
if _read_gzip_header(fp) is None:
return b"".join(decompressed_members)
# Use a zlib raw deflate compressor
do = zlib_ng.decompressobj(wbits=-zlib_ng.MAX_WBITS)
# Read all the data except the header
decompressed = do.decompress(data[fp.tell():])
if not do.eof or len(do.unused_data) < 8:
raise EOFError("Compressed file ended before the end-of-stream "
"marker was reached")
crc, length = struct.unpack("<II", do.unused_data[:8])
if crc != zlib_ng.crc32(decompressed):
raise BadGzipFile("CRC check failed")
if length != (len(decompressed) & 0xffffffff):
raise BadGzipFile("Incorrect length of data produced")
decompressed_members.append(decompressed)
data = do.unused_data[8:].lstrip(b"\x00")
fp = io.BytesIO(data)
reader = _GzipReader(fp, max(len(data), 16))
return reader.readall()


def _argument_parser():
Expand Down
15 changes: 15 additions & 0 deletions src/zlib_ng/zlib_ng.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# This file is part of python-zlib-ng which is distributed under the
# PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2.

import typing

MAX_WBITS: int
DEFLATED: int
DEF_MEM_LEVEL: int
Expand Down Expand Up @@ -77,3 +79,16 @@ class _ZlibDecompressor:
zdict=None): ...

def decompress(self, __data, max_length=-1) -> bytes: ...

class _GzipReader:
    """Type stub for the C-implemented gzip stream reader.

    Presents a read-only, file-like interface over the decompressed
    contents of the gzip data read from *fp*.
    """
    # buffersize: amount of compressed data fetched from `fp` per read —
    # presumably a performance knob only; verify against the C source.
    def __init__(self, fp: typing.BinaryIO, buffersize: int = 32 * 1024): ...
    def readinto(self, obj) -> int: ...
    def readable(self) -> bool: ...
    def writable(self) -> bool: ...
    def seekable(self) -> bool: ...
    def tell(self) -> int: ...
    def seek(self, offset: int, whence: int): ...
    def close(self): ...
    def readall(self) -> bytes: ...
    def read(self, __size: int): ...
    def flush(self): ...
Loading