diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f38c87e..4ac880b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,11 @@ Changelog version 0.4.0-dev ----------------- ++ The internal ``gzip_ng._GzipReader`` has been rewritten in C. As a result the + overhead of decompressing files has significantly been reduced. ++ The ``gzip_ng._GzipReader`` in C is now used in ``gzip_ng.decompress``. The + ``_GzipReader`` also can read from objects that support the buffer protocol. + This has reduced overhead significantly. + Fix some unclosed buffer errors in the gzip_ng CLI. version 0.3.0 diff --git a/src/zlib_ng/gzip_ng.py b/src/zlib_ng/gzip_ng.py index f29c24a..33db4be 100644 --- a/src/zlib_ng/gzip_ng.py +++ b/src/zlib_ng/gzip_ng.py @@ -25,9 +25,9 @@ import struct import sys import time -import _compression # noqa: I201 # Not third-party from . import zlib_ng +from .zlib_ng import _GzipReader __all__ = ["GzipFile", "open", "compress", "decompress", "BadGzipFile", "READ_BUFFER_SIZE"] @@ -36,19 +36,14 @@ _COMPRESS_LEVEL_TRADEOFF = zlib_ng.Z_DEFAULT_COMPRESSION _COMPRESS_LEVEL_BEST = zlib_ng.Z_BEST_COMPRESSION -#: The amount of data that is read in at once when decompressing a file. -#: Increasing this value may increase performance. -#: 128K is also the size used by pigz and cat to read files from the -# filesystem. -READ_BUFFER_SIZE = 128 * 1024 +# The amount of data that is read in at once when decompressing a file. +# Increasing this value may increase performance. +READ_BUFFER_SIZE = 512 * 1024 FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16 READ, WRITE = 1, 2 -try: - BadGzipFile = gzip.BadGzipFile # type: ignore -except AttributeError: # Versions lower than 3.8 do not have BadGzipFile - BadGzipFile = OSError # type: ignore +BadGzipFile = gzip.BadGzipFile # type: ignore # The open method was copied from the CPython source with minor adjustments. @@ -149,7 +144,7 @@ def __init__(self, filename=None, mode=None, zlib_ng.DEF_MEM_LEVEL, 0) if self.mode == READ: - raw = _GzipNGReader(self.fileobj) + raw = _GzipReader(self.fileobj, READ_BUFFER_SIZE) self._buffer = io.BufferedReader(raw) def __repr__(self): @@ -180,124 +175,9 @@ def write(self, data): return length -class _GzipNGReader(gzip._GzipReader): - def __init__(self, fp): - # Call the init method of gzip._GzipReader's parent here. - # It is not very invasive and allows us to override _PaddedFile - _compression.DecompressReader.__init__( - self, gzip._PaddedFile(fp), zlib_ng._ZlibDecompressor, - wbits=-zlib_ng.MAX_WBITS) - # Set flag indicating start of a new member - self._new_member = True - self._last_mtime = None - - def read(self, size=-1): - if size < 0: - return self.readall() - # size=0 is special because decompress(max_length=0) is not supported - if not size: - return b"" - - # For certain input data, a single - # call to decompress() may not return - # any data. In this case, retry until we get some data or reach EOF. - while True: - if self._decompressor.eof: - # Ending case: we've come to the end of a member in the file, - # so finish up this member, and read a new gzip header. - # Check the CRC and file size, and set the flag so we read - # a new member - self._read_eof() - self._new_member = True - self._decompressor = self._decomp_factory( - **self._decomp_args) - - if self._new_member: - # If the _new_member flag is set, we have to - # jump to the next member, if there is one. - self._init_read() - if not self._read_gzip_header(): - self._size = self._pos - return b"" - self._new_member = False - - # Read a chunk of data from the file - if self._decompressor.needs_input: - buf = self._fp.read(READ_BUFFER_SIZE) - uncompress = self._decompressor.decompress(buf, size) - else: - uncompress = self._decompressor.decompress(b"", size) - if self._decompressor.unused_data != b"": - # Prepend the already read bytes to the fileobj so they can - # be seen by _read_eof() and _read_gzip_header() - self._fp.prepend(self._decompressor.unused_data) - - if uncompress != b"": - break - if buf == b"": - raise EOFError("Compressed file ended before the " - "end-of-stream marker was reached") - - self._crc = zlib_ng.crc32(uncompress, self._crc) - self._stream_size += len(uncompress) - self._pos += len(uncompress) - return uncompress - - # Aliases for improved compatibility with CPython gzip module. GzipFile = GzipNGFile -_GzipReader = _GzipNGReader - - -def _read_exact(fp, n): - '''Read exactly *n* bytes from `fp` - This method is required because fp may be unbuffered, - i.e. return short reads. - ''' - data = fp.read(n) - while len(data) < n: - b = fp.read(n - len(data)) - if not b: - raise EOFError("Compressed file ended before the " - "end-of-stream marker was reached") - data += b - return data - - -def _read_gzip_header(fp): - '''Read a gzip header from `fp` and progress to the end of the header. - Returns last mtime if header was present or None otherwise. - ''' - magic = fp.read(2) - if magic == b'': - return None - - if magic != b'\037\213': - raise BadGzipFile('Not a gzipped file (%r)' % magic) - - (method, flag, last_mtime) = struct.unpack(" bytes: ... + +class _GzipReader: + def __init__(self, fp: typing.BinaryIO, buffersize: int = 32 * 1024): ... + def readinto(self, obj) -> int: ... + def readable(self) -> bool: ... + def writable(self) -> bool: ... + def seekable(self) -> bool: ... + def tell(self) -> int: ... + def seek(self, offset: int, whence: int): ... + def close(self): ... + def readall(self) -> bytes: ... + def read(self, __size: int): ... + def flush(self): ... diff --git a/src/zlib_ng/zlib_ngmodule.c b/src/zlib_ng/zlib_ngmodule.c index 5ecedb3..b3cb943 100644 --- a/src/zlib_ng/zlib_ngmodule.c +++ b/src/zlib_ng/zlib_ngmodule.c @@ -13,6 +13,19 @@ #error "At least zlib-ng version 2.0.7 is required" #endif +/* PyPy quirks: no Py_UNREACHABLE and requires PyBUF_READ and PyBUF_WRITE set + in memoryviews that enter a "readinto" call. CPython requires that only + PyBUF_WRITE is set. + (Both implementations are wrong because the state of the READ bit should + not matter.) +*/ +#ifdef PYPY_VERSION +#define Py_UNREACHABLE() Py_FatalError("Reached unreachable state") +#define MEMORYVIEW_READINTO_FLAGS (PyBUF_READ | PyBUF_WRITE) +#else +#define MEMORYVIEW_READINTO_FLAGS PyBUF_WRITE +#endif + #define ENTER_ZLIB(obj) do { \ if (!PyThread_acquire_lock((obj)->lock, 0)) { \ Py_BEGIN_ALLOW_THREADS \ @@ -1959,6 +1972,728 @@ static PyTypeObject ZlibDecompressorType = { }; +#define GzipReader_HEADER 1 +#define GzipReader_DEFLATE_BLOCK 2 +#define GzipReader_TRAILER 3 +#define GzipReader_NULL_BYTES 4 + +typedef struct _GzipReaderStruct { + PyObject_HEAD + uint8_t *input_buffer; + size_t buffer_size; + const uint8_t *current_pos; + const uint8_t *buffer_end; + int64_t _pos; + int64_t _size; + PyObject *fp; + Py_buffer *memview; + char stream_phase; + char all_bytes_read; + char closed; + uint32_t crc; + uint32_t stream_out; + uint32_t _last_mtime; + PyThread_type_lock lock; + zng_stream zst; +} GzipReader; + +static void GzipReader_dealloc(GzipReader *self) +{ + if (self->memview == NULL) { + PyMem_Free(self->input_buffer); + } else { + PyBuffer_Release(self->memview); + PyMem_Free(self->memview); + } + Py_XDECREF(self->fp); + PyThread_free_lock(self->lock); + zng_inflateEnd(&self->zst); + Py_TYPE(self)->tp_free(self); +} + +PyDoc_STRVAR(GzipReader__new____doc__, +"_GzipReader(fp, /, buffersize=32*1024)\n" +"--\n" +"\n" +"Return a _GzipReader object.\n" +"\n" +" fp\n" +" can be a file-like binary IO object or a bytes-like object.\n" +" For file-like objects _GzipReader's internal buffer is filled using \n" +" fp's readinto method during reading. For bytes-like objects, the \n" +" buffer protocol is used which allows _GzipReader to use the object \n" +" itself as read buffer. " +" buffersize\n" +" Size of the internal buffer. Only used when fp is a file-like object. \n" +" The buffer is automatically resized to fit the largest gzip header \n" +" upon use of the _GzipReader object.\n" +); + +static PyObject * +GzipReader__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) +{ + PyObject *fp = NULL; + Py_ssize_t buffer_size = 32 * 1024; + static char *keywords[] = {"fp", "buffersize", NULL}; + static char *format = "O|n:GzipReader"; + if (!PyArg_ParseTupleAndKeywords( + args, kwargs, format, keywords, &fp, &buffer_size)) { + return NULL; + } + if (buffer_size < 1) { + PyErr_Format( + PyExc_ValueError, + "buffersize must be at least 1, got %zd", buffer_size + ); + return NULL; + } + GzipReader *self = PyObject_New(GzipReader, type); + if (PyObject_HasAttrString(fp, "read")) { + self->memview = NULL; + self->buffer_size = buffer_size; + self->input_buffer = PyMem_Malloc(self->buffer_size); + if (self->input_buffer == NULL) { + Py_DECREF(self); + return PyErr_NoMemory(); + } + self->buffer_end = self->input_buffer; + self->all_bytes_read = 0; + } else { + self->memview = PyMem_Malloc(sizeof(Py_buffer)); + if (self->memview == NULL) { + return PyErr_NoMemory(); + } + if (PyObject_GetBuffer(fp, self->memview, PyBUF_SIMPLE) < 0) { + Py_DECREF(self); + return NULL; + } + self->buffer_size = self->memview->len; + self->input_buffer = self->memview->buf; + self->buffer_end = self->input_buffer + self->buffer_size; + self->all_bytes_read = 1; + } + self->current_pos = self->input_buffer; + self->_pos = 0; + self->_size = -1; + Py_INCREF(fp); + self->fp = fp; + self->stream_phase = GzipReader_HEADER; + self->closed = 0; + self->_last_mtime = 0; + self->crc = 0; + self->lock = PyThread_allocate_lock(); + if (self->lock == NULL) { + Py_DECREF(self); + PyErr_SetString(PyExc_MemoryError, "Unable to allocate lock"); + return NULL; + } + self->zst.zalloc = PyZlib_Malloc; + self->zst.zfree = PyZlib_Free; + self->zst.next_in = NULL; + self->zst.avail_in = 0; + self->zst.opaque = NULL; + int err = zng_inflateInit2(&(self->zst), -MAX_WBITS); + switch (err) { + case Z_OK: + return (PyObject *)self; + case Z_STREAM_ERROR: + Py_DECREF(self); + PyErr_SetString(PyExc_ValueError, "Invalid initialization option"); + return NULL; + case Z_MEM_ERROR: + Py_DECREF(self); + PyErr_SetString(PyExc_MemoryError, + "Can't allocate memory for decompression object"); + return NULL; + default: + zlib_error(self->zst, err, "while creating decompression object"); + Py_DECREF(self); + return NULL; + } +} + +static inline Py_ssize_t +GzipReader_read_from_file(GzipReader *self) +{ + + const uint8_t *current_pos = self->current_pos; + const uint8_t *buffer_end = self->buffer_end; + size_t remaining = buffer_end - current_pos; + if (remaining == self->buffer_size) { + /* Buffer is full but a new read request was issued. This will be due + to the header being bigger than the header. Enlarge the buffer + to accommodate the hzip header. */ + size_t new_buffer_size = self->buffer_size * 2; + uint8_t *tmp_buffer = PyMem_Realloc(self->input_buffer, new_buffer_size); + if (tmp_buffer == NULL) { + PyErr_NoMemory(); + return -1; + } + self->input_buffer = tmp_buffer; + self->buffer_size = new_buffer_size; + } else if (remaining > 0) { + memmove(self->input_buffer, current_pos, remaining); + } + uint8_t *input_buffer = self->input_buffer; + current_pos = input_buffer; + buffer_end = input_buffer + remaining; + size_t read_in_size = self->buffer_size - remaining; + PyObject *bufview = PyMemoryView_FromMemory( + (char *)buffer_end, read_in_size, MEMORYVIEW_READINTO_FLAGS); + if (bufview == NULL) { + return -1; + } + PyObject *new_size_obj = PyObject_CallMethod(self->fp, "readinto", "O", bufview); + Py_DECREF(bufview); + if (new_size_obj == NULL) { + return -1; + } + Py_ssize_t new_size = PyLong_AsSsize_t(new_size_obj); + Py_DECREF(new_size_obj); + if (new_size < 0) { + return -1; + } + if (new_size == 0) { + self->all_bytes_read = 1; + } + buffer_end += new_size; + self->current_pos = current_pos; + self->buffer_end = buffer_end; + return 0; +} + +#define FTEXT 1 +#define FHCRC 2 +#define FEXTRA 4 +#define FNAME 8 +#define FCOMMENT 16 + +static PyObject *BadGzipFile; // Import BadGzipFile error for consistency + +static inline uint32_t load_u32_le(const void *address) { + #if PY_BIG_ENDIAN + uint8_t *mem = address; + return mem[0] | (mem[1] << 8) | (mem[2] << 16) | (mem[3] << 24); + #else + return *(uint32_t *)address; + #endif +} + +static inline uint16_t load_u16_le(const void *address) { + #if PY_BIG_ENDIAN + uint8_t *mem = address; + return mem[0] | (mem[1] << 8) | (mem[2] << 16) | (mem[3] << 24); + #else + return *(uint16_t *)address; + #endif +} + +static Py_ssize_t +GzipReader_read_into_buffer(GzipReader *self, uint8_t *out_buffer, size_t out_buffer_size) +{ + Py_ssize_t bytes_written = 0; + /* Outer loop is the file read in loop */ + while (1) { + const uint8_t *current_pos = self->current_pos; + const uint8_t *buffer_end = self->buffer_end; + /* Inner loop fills the out buffer, with multiple gzip blocks if + necessary. Allow escaping the GIL except when throwing errors. + This makes a big difference for BGZF format gzip blocks. + Threads are blocked when the loop is exited. */ + PyThreadState *_save; + Py_UNBLOCK_THREADS + while(1) { + switch(self->stream_phase) { + size_t remaining; // Must be before labels. + case GzipReader_HEADER: + remaining = buffer_end - current_pos; + if (remaining == 0 && self->all_bytes_read) { + // Reached EOF + self->_size = self->_pos; + self->current_pos = current_pos; + Py_BLOCK_THREADS; + return bytes_written; + } + if ((remaining) < 10) { + break; + } + uint8_t magic1 = current_pos[0]; + uint8_t magic2 = current_pos[1]; + + if (!(magic1 == 0x1f && magic2 == 0x8b)) { + Py_BLOCK_THREADS; + PyObject *magic_obj = PyBytes_FromStringAndSize((char *)current_pos, 2); + PyErr_Format(BadGzipFile, + "Not a gzipped file (%R)", + magic_obj); + Py_DECREF(magic_obj); + return -1; + }; + uint8_t method = current_pos[2]; + if (method != 8) { + Py_BLOCK_THREADS; + PyErr_SetString(BadGzipFile, "Unknown compression method"); + return -1; + } + uint8_t flags = current_pos[3]; + self->_last_mtime = load_u32_le(current_pos + 4); + // Skip XFL and header flag + const uint8_t *header_cursor = current_pos + 10; + if (flags & FEXTRA) { + // Read the extra field and discard it. + if (header_cursor + 2 >= buffer_end) { + break; + } + uint16_t flength = load_u16_le(header_cursor); + header_cursor += 2; + if (header_cursor + flength >= buffer_end) { + break; + } + header_cursor += flength; + } + if (flags & FNAME) { + header_cursor = memchr(header_cursor, 0, buffer_end - header_cursor); + if (header_cursor == NULL) { + break; + } + // skip over the 0 value; + header_cursor +=1; + } + if (flags & FCOMMENT) { + header_cursor = memchr(header_cursor, 0, buffer_end - header_cursor); + if (header_cursor == NULL) { + break; + } + // skip over the 0 value; + header_cursor +=1; + } + if (flags & FHCRC) { + if (header_cursor + 2 >= buffer_end) { + break; + } + uint16_t header_crc = load_u16_le(header_cursor); + uint16_t crc = zng_crc32_z( + 0, current_pos, header_cursor - current_pos) & 0xFFFF; + if (header_crc != crc) { + Py_BLOCK_THREADS; + PyErr_Format( + BadGzipFile, + "Corrupted gzip header. Checksums do not " + "match: %04x != %04x", + crc, header_crc + ); + return -1; + } + header_cursor += 2; + } + current_pos = header_cursor; + int reset_err = zng_inflateReset(&(self->zst)); + if (reset_err != Z_OK) { + Py_BLOCK_THREADS; + zlib_error(self->zst, reset_err, "while initializing inflate stream."); + return -1; + } + self->crc = 0; + self->stream_phase = GzipReader_DEFLATE_BLOCK; + case GzipReader_DEFLATE_BLOCK: + self->zst.next_in = current_pos; + self->zst.avail_in = Py_MIN((buffer_end -current_pos), UINT32_MAX); + self->zst.next_out = out_buffer; + self->zst.avail_out = Py_MIN(out_buffer_size, UINT32_MAX); + int ret; + ret = zng_inflate(&self->zst, Z_SYNC_FLUSH); + switch (ret) { + case Z_OK: + case Z_BUF_ERROR: + case Z_STREAM_END: + break; + case Z_MEM_ERROR: + Py_BLOCK_THREADS; + PyErr_SetString(PyExc_MemoryError, + "Out of memory while decompressing data"); + return -1; + default: + Py_BLOCK_THREADS; + zlib_error(self->zst, ret, "while decompressing data"); + return -1; + } + size_t current_bytes_written = self->zst.next_out - out_buffer; + self->crc = zng_crc32_z(self->crc, out_buffer, current_bytes_written); + bytes_written += current_bytes_written; + self->_pos += current_bytes_written; + out_buffer = self->zst.next_out; + out_buffer_size -= current_bytes_written; + current_pos = self->zst.next_in; + if (!(ret == Z_STREAM_END)) { + if (out_buffer_size > 0) { + if (current_pos == buffer_end) { + // Need fresh bytes + break; + } + // Not all input data decompressed. + continue; + } + self->current_pos = current_pos; + Py_BLOCK_THREADS; + return bytes_written; + } + // Block done check trailer. + self->stream_phase = GzipReader_TRAILER; + case GzipReader_TRAILER: + if (buffer_end - current_pos < 8) { + break; + } + uint32_t crc = load_u32_le(current_pos); + current_pos += 4; + if (crc != self->crc) { + Py_BLOCK_THREADS; + PyErr_Format( + BadGzipFile, + "CRC check failed %u != %u", + crc, self->crc + ); + return -1; + } + uint32_t length = load_u32_le(current_pos); + current_pos += 4; + if (length != self->zst.total_out) { + Py_BLOCK_THREADS; + PyErr_SetString(BadGzipFile, "Incorrect length of data produced"); + return -1; + } + self->stream_phase = GzipReader_NULL_BYTES; + case GzipReader_NULL_BYTES: + // There maybe NULL bytes between gzip members + while (current_pos < buffer_end && *current_pos == 0) { + current_pos += 1; + } + if (current_pos == buffer_end) { + /* Not all NULL bytes may have been read, refresh the buffer.*/ + break; + } + self->stream_phase = GzipReader_HEADER; + continue; + default: + Py_UNREACHABLE(); + } + break; + } + Py_BLOCK_THREADS; + // If buffer_end is reached, nothing was returned and all bytes are + // read we have an EOFError. + if (self->all_bytes_read) { + if (self->stream_phase == GzipReader_NULL_BYTES) { + self->_size = self->_pos; + self->current_pos = current_pos; + return bytes_written; + } + PyErr_SetString( + PyExc_EOFError, + "Compressed file ended before the end-of-stream marker was reached" + ); + return -1; + } + self->current_pos = current_pos; + if (GzipReader_read_from_file(self) < 0) { + return -1; + } + } +} + +static PyObject * +GzipReader_readinto(GzipReader *self, PyObject *buffer_obj) +{ + Py_buffer view; + if (PyObject_GetBuffer(buffer_obj, &view, PyBUF_SIMPLE) < 0) { + return NULL; + } + uint8_t *buffer = view.buf; + size_t buffer_size = view.len; + ENTER_ZLIB(self); + Py_ssize_t written_size = GzipReader_read_into_buffer(self, buffer, buffer_size); + LEAVE_ZLIB(self); + PyBuffer_Release(&view); + if (written_size < 0) { + return NULL; + } + return PyLong_FromSsize_t((Py_ssize_t)written_size); +} + +static PyObject * +GzipReader_seek(GzipReader *self, PyObject *args, PyObject *kwargs) +{ + Py_ssize_t offset; + Py_ssize_t whence = SEEK_SET; + static char *keywords[] = {"offset", "whence", NULL}; + static char format[] = {"n|n:GzipReader.seek"}; + if (PyArg_ParseTupleAndKeywords(args, kwargs, format, keywords, &offset, &whence) < 0) { + return NULL; + } + // Recalculate offset as an absolute file position. + if (whence == SEEK_SET) { + ; + } else if (whence == SEEK_CUR) { + offset = self->_pos + offset; + } else if (whence == SEEK_END) { + // Seeking relative to EOF - we need to know the file's size. + if (self->_size < 0) { + size_t tmp_buffer_size = 8 * 1024; + uint8_t *tmp_buffer = PyMem_Malloc(tmp_buffer_size); + if (tmp_buffer == NULL) { + return PyErr_NoMemory(); + } + while (1) { + /* Simply overwrite the tmp buffer over and over */ + Py_ssize_t written_bytes = GzipReader_read_into_buffer( + self, tmp_buffer, tmp_buffer_size + ); + if (written_bytes < 0) { + PyMem_FREE(tmp_buffer); + return NULL; + } + if (written_bytes == 0) { + break; + } + } + assert(self->_size >= 0); + PyMem_Free(tmp_buffer); + } + offset = self->_size + offset; + } else { + PyErr_Format( + PyExc_ValueError, + "Invalid format for whence: %zd", whence + ); + return NULL; + } + + // Make it so that offset is the number of bytes to skip forward. + if (offset < self->_pos) { + PyObject *seek_result = PyObject_CallMethod(self->fp, "seek", "n", 0); + if (seek_result == NULL) { + return NULL; + } + self->stream_phase = GzipReader_HEADER; + self->_pos = 0; + self->all_bytes_read = 0; + int ret = zng_inflateReset(&self->zst); + if (ret != Z_OK) { + zlib_error(self->zst, ret, "while seeking"); + return NULL; + } + } else { + offset -= self->_pos; + } + + // Read and discard data until we reach the desired position. + if (offset > 0) { + Py_ssize_t tmp_buffer_size = 8 * 1024; + uint8_t *tmp_buffer = PyMem_Malloc(tmp_buffer_size); + if (tmp_buffer == NULL) { + return PyErr_NoMemory(); + } + while (offset > 0) { + Py_ssize_t bytes_written = GzipReader_read_into_buffer( + self, tmp_buffer, Py_MIN(tmp_buffer_size, offset)); + if (bytes_written < 0) { + PyMem_FREE(tmp_buffer); + return NULL; + } + if (bytes_written == 0) { + break; + } + offset -= bytes_written; + } + PyMem_Free(tmp_buffer); + } + return PyLong_FromLongLong(self->_pos); +} + +static PyObject * +GzipReader_readall(GzipReader *self, PyObject *Py_UNUSED(ignore)) +{ + /* Try to consume the entire buffer without too much overallocation */ + Py_ssize_t chunk_size = self->buffer_size * 4; + /* Rather than immediately creating a list, read one chunk first and + only create a list when more read operations are necessary. */ + PyObject *first_chunk = PyBytes_FromStringAndSize(NULL, chunk_size); + if (first_chunk == NULL) { + return NULL; + } + ENTER_ZLIB(self); + Py_ssize_t written_size = GzipReader_read_into_buffer( + self, (uint8_t *)PyBytes_AS_STRING(first_chunk), chunk_size); + LEAVE_ZLIB(self); + if (written_size < 0) { + Py_DECREF(first_chunk); + return NULL; + } + if (written_size < chunk_size) { + if (_PyBytes_Resize(&first_chunk, written_size) < 0) { + return NULL; + } + return first_chunk; + } + + PyObject *chunk_list = PyList_New(1); + if (chunk_list == NULL) { + return NULL; + } + PyList_SET_ITEM(chunk_list, 0, first_chunk); + while (1) { + PyObject *chunk = PyBytes_FromStringAndSize(NULL, chunk_size); + if (chunk == NULL) { + Py_DECREF(chunk_list); + return NULL; + } + ENTER_ZLIB(self); + written_size = GzipReader_read_into_buffer( + self, (uint8_t *)PyBytes_AS_STRING(chunk), chunk_size); + LEAVE_ZLIB(self); + if (written_size < 0) { + Py_DECREF(chunk); + Py_DECREF(chunk_list); + return NULL; + } + if (written_size == 0) { + Py_DECREF(chunk); + break; + } + if (_PyBytes_Resize(&chunk, written_size) < 0) { + Py_DECREF(chunk_list); + return NULL; + } + int ret = PyList_Append(chunk_list, chunk); + Py_DECREF(chunk); + if (ret < 0) { + Py_DECREF(chunk_list); + return NULL; + } + } + PyObject *empty_bytes = PyBytes_FromStringAndSize(NULL, 0); + if (empty_bytes == NULL) { + Py_DECREF(chunk_list); + return NULL; + } + PyObject *ret = _PyBytes_Join(empty_bytes, chunk_list); + Py_DECREF(empty_bytes); + Py_DECREF(chunk_list); + return ret; +} + +static PyObject * +GzipReader_read(GzipReader *self, PyObject *args) +{ + Py_ssize_t size = -1; + if (PyArg_ParseTuple(args, "|n:GzipReader.read", &size) < 0) { + return NULL; + } + if (size < 0) { + return GzipReader_readall(self, NULL); + } + if (size == 0) { + return PyBytes_FromStringAndSize(NULL, 0); + } + Py_ssize_t answer_size = Py_MIN((Py_ssize_t)self->buffer_size * 10, size); + PyObject *answer = PyBytes_FromStringAndSize(NULL, answer_size); + if (answer == NULL) { + return NULL; + } + ENTER_ZLIB(self); + Py_ssize_t written_bytes = GzipReader_read_into_buffer(self, (uint8_t *)PyBytes_AS_STRING(answer), answer_size); + LEAVE_ZLIB(self); + if (written_bytes < 0) { + Py_DECREF(answer); + return NULL; + } + if (_PyBytes_Resize(&answer, written_bytes) < 0) { + return NULL; + } + return answer; +} + +static PyObject * +GzipReader_close(GzipReader *self, PyObject *Py_UNUSED(ignore)) { + if (!self->closed) { + self->closed = 1; + } + Py_RETURN_NONE; +} + +static PyObject * +GzipReader_readable(GzipReader *self, PyObject *Py_UNUSED(ignore)) +{ + Py_RETURN_TRUE; +} + +static PyObject * +GzipReader_writable(GzipReader *self, PyObject *Py_UNUSED(ignore)) +{ + Py_RETURN_TRUE; +} + +static PyObject * +GzipReader_seekable(GzipReader *self, PyObject *Py_UNUSED(ignore)) { + return PyObject_CallMethod(self->fp, "seekable", NULL); +} + +static PyObject * +GzipReader_tell(GzipReader *self, PyObject *Py_UNUSED(ignore)) { + return PyLong_FromLongLong(self->_pos); +} + +static PyObject * +GzipReader_flush(GzipReader *self, PyObject *Py_UNUSED(ignore)) { + Py_RETURN_NONE; +} + +static PyObject * +GzipReader_get_last_mtime(GzipReader *self, void *Py_UNUSED(closure)) +{ + if (self->_last_mtime) { + return PyLong_FromUnsignedLong(self->_last_mtime); + } + Py_RETURN_NONE; +} + +static PyObject * +GzipReader_get_closed(GzipReader *self, void *Py_UNUSED(closure)) +{ + return PyBool_FromLong(self->closed); +} + +static PyMethodDef GzipReader_methods[] = { + {"readinto", (PyCFunction)GzipReader_readinto, METH_O, NULL}, + {"readable", (PyCFunction)GzipReader_readable, METH_NOARGS, NULL}, + {"writable", (PyCFunction)GzipReader_writable, METH_NOARGS, NULL}, + {"seekable", (PyCFunction)GzipReader_seekable, METH_NOARGS, NULL}, + {"tell", (PyCFunction)GzipReader_tell, METH_NOARGS, NULL}, + {"seek", (PyCFunction)GzipReader_seek, METH_VARARGS | METH_KEYWORDS, NULL}, + {"close", (PyCFunction)GzipReader_close, METH_NOARGS, NULL}, + {"readall", (PyCFunction)GzipReader_readall, METH_NOARGS, NULL}, + {"flush", (PyCFunction)GzipReader_flush, METH_NOARGS, NULL}, + {"read", (PyCFunction)GzipReader_read, METH_VARARGS, NULL}, + {NULL}, +}; + +static PyGetSetDef GzipReader_properties[] = { + {"closed", (getter)GzipReader_get_closed, NULL, NULL, NULL}, + {"_last_mtime", (getter)GzipReader_get_last_mtime, NULL, NULL, NULL}, + {NULL}, +}; + +static PyTypeObject GzipReader_Type = { + .tp_name = "isal_zlib._GzipReader", + .tp_basicsize = sizeof(GzipReader), + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_dealloc = (destructor)GzipReader_dealloc, + .tp_new = (newfunc)(GzipReader__new__), + .tp_doc = GzipReader__new____doc__, + .tp_methods = GzipReader_methods, + .tp_getset = GzipReader_properties, +}; + + static PyMethodDef zlib_methods[] = { ZLIB_ADLER32_METHODDEF, @@ -2020,12 +2755,32 @@ PyInit_zlib_ng(void) Py_INCREF(ZlibDecompressorType_obj); PyModule_AddObject(m, "_ZlibDecompressor", ZlibDecompressorType_obj); + if (PyType_Ready(&GzipReader_Type) != 0) { + return NULL; + } + Py_INCREF(&GzipReader_Type); + if (PyModule_AddObject(m, "_GzipReader", (PyObject *)&GzipReader_Type) < 0) { + return NULL; + } + ZlibError = PyErr_NewException("zlib_ng.error", NULL, NULL); if (ZlibError == NULL) { return NULL; } Py_INCREF(ZlibError); PyModule_AddObject(m, "error", ZlibError); + + PyObject *gzip_module = PyImport_ImportModule("gzip"); + if (gzip_module == NULL) { + return NULL; + } + + BadGzipFile = PyObject_GetAttrString(gzip_module, "BadGzipFile"); + if (BadGzipFile == NULL) { + return NULL; + } + Py_INCREF(BadGzipFile); + PyModule_AddIntMacro(m, MAX_WBITS); PyModule_AddIntMacro(m, DEFLATED); PyModule_AddIntMacro(m, DEF_MEM_LEVEL); diff --git a/tests/data/test.fastq.bgzip.gz b/tests/data/test.fastq.bgzip.gz new file mode 100644 index 0000000..8ef5eb5 Binary files /dev/null and b/tests/data/test.fastq.bgzip.gz differ diff --git a/tests/test_gzip_compliance.py b/tests/test_gzip_compliance.py index f51c867..d938966 100644 --- a/tests/test_gzip_compliance.py +++ b/tests/test_gzip_compliance.py @@ -613,8 +613,7 @@ def test_read_truncated(self): with gzip.GzipFile(fileobj=io.BytesIO(truncated)) as f: self.assertRaises(EOFError, f.read) with gzip.GzipFile(fileobj=io.BytesIO(truncated)) as f: - self.assertEqual(f.read(len(data)), data) - self.assertRaises(EOFError, f.read, 1) + self.assertRaises(EOFError, f.read) # Incomplete 10-byte header. for i in range(2, 10): with gzip.GzipFile(fileobj=io.BytesIO(truncated[:i])) as f: @@ -628,13 +627,6 @@ def test_read_with_extra(self): with gzip.GzipFile(fileobj=io.BytesIO(gzdata)) as f: self.assertEqual(f.read(), b'Test') - def test_prepend_error(self): - # See issue #20875 - with gzip.open(self.filename, "wb") as f: - f.write(data1) - with gzip.open(self.filename, "rb") as f: - f._buffer.raw._fp.prepend() - def test_issue44439(self): q = array.array('Q', [1, 2, 3, 4, 5]) LENGTH = len(q) * q.itemsize diff --git a/tests/test_gzip_ng.py b/tests/test_gzip_ng.py index fa6b68c..a751d8e 100644 --- a/tests/test_gzip_ng.py +++ b/tests/test_gzip_ng.py @@ -22,7 +22,7 @@ import pytest -from zlib_ng import gzip_ng +from zlib_ng import gzip_ng, zlib_ng DATA = b'This is a simple test with gzip_ng' COMPRESSED_DATA = gzip.compress(DATA) @@ -373,9 +373,76 @@ def test_truncated_header(trunc): gzip_ng.decompress(trunc) +def test_very_long_header_in_data(): + # header with a very long filename. + header = (b"\x1f\x8b\x08\x08\x00\x00\x00\x00\x00\xff" + 256 * 1024 * b"A" + + b"\x00") + compressed = header + zlib_ng.compress(b"", 3, -15) + 8 * b"\00" + assert gzip_ng.decompress(compressed) == b"" + + +def test_very_long_header_in_file(): + # header with a very long filename. + header = (b"\x1f\x8b\x08\x08\x00\x00\x00\x00\x00\xff" + + gzip_ng.READ_BUFFER_SIZE * 2 * b"A" + + b"\x00") + compressed = header + zlib_ng.compress(b"", 3, -15) + 8 * b"\00" + f = io.BytesIO(compressed) + with gzip_ng.open(f) as gzip_file: + assert gzip_file.read() == b"" + + def test_concatenated_gzip(): concat = Path(__file__).parent / "data" / "concatenated.fastq.gz" data = gzip.decompress(concat.read_bytes()) with gzip_ng.open(concat, "rb") as gzip_ng_h: result = gzip_ng_h.read() assert data == result + + +def test_seek(): + from io import SEEK_CUR, SEEK_END, SEEK_SET + with tempfile.NamedTemporaryFile("wb", delete=False) as tmpfile: + tmpfile.write(gzip.compress(b"X" * 500 + b"A" + b"X" * 499)) + tmpfile.write(gzip.compress(b"X" * 500 + b"B" + b"X" * 499)) + tmpfile.write(gzip.compress(b"X" * 500 + b"C" + b"X" * 499)) + tmpfile.write(gzip.compress(b"X" * 500 + b"D" + b"X" * 499)) + with gzip_ng.open(tmpfile.name, "rb") as gzip_file: + # Start testing forward seek + gzip_file.seek(500) + assert gzip_file.read(1) == b"A" + gzip_file.seek(1500) + assert gzip_file.read(1) == b"B" + # Test reverse + gzip_file.seek(500) + assert gzip_file.read(1) == b"A" + # Again, but with explicit SEEK_SET + gzip_file.seek(500, SEEK_SET) + assert gzip_file.read(1) == b"A" + gzip_file.seek(1500, SEEK_SET) + assert gzip_file.read(1) == b"B" + gzip_file.seek(500, SEEK_SET) + assert gzip_file.read(1) == b"A" + # Seeking from current position + gzip_file.seek(500) + gzip_file.seek(2000, SEEK_CUR) + assert gzip_file.read(1) == b"C" + gzip_file.seek(-1001, SEEK_CUR) + assert gzip_file.read(1) == b"B" + # Seeking from end + # Any positive number should end up at the end + gzip_file.seek(200, SEEK_END) + assert gzip_file.read(1) == b"" + gzip_file.seek(-1500, SEEK_END) + assert gzip_file.read(1) == b"C" + os.remove(tmpfile.name) + + +def test_bgzip(): + bgzip_file = Path(__file__).parent / "data" / "test.fastq.bgzip.gz" + gzip_file = Path(__file__).parent / "data" / "test.fastq.gz" + with gzip_ng.open(bgzip_file, "rb") as bgz: + bgz_data = bgz.read() + with gzip_ng.open(gzip_file, "rb") as gz: + gz_data = gz.read() + assert bgz_data == gz_data