From 688d551f027d8b12bf7268c1b9b54a54446c1c2e Mon Sep 17 00:00:00 2001 From: abhishekbhakat Date: Tue, 20 Aug 2024 15:50:43 +0000 Subject: [PATCH 1/3] base support for webp type. --- src/docx/image/__init__.py | 2 + src/docx/image/constants.py | 1 + src/docx/image/webp.py | 108 ++++++++++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+) create mode 100644 src/docx/image/webp.py diff --git a/src/docx/image/__init__.py b/src/docx/image/__init__.py index d28033ef1..24b55fbfa 100644 --- a/src/docx/image/__init__.py +++ b/src/docx/image/__init__.py @@ -9,6 +9,7 @@ from docx.image.jpeg import Exif, Jfif from docx.image.png import Png from docx.image.tiff import Tiff +from docx.image.webp import Webp SIGNATURES = ( # class, offset, signature_bytes @@ -20,4 +21,5 @@ (Tiff, 0, b"MM\x00*"), # big-endian (Motorola) TIFF (Tiff, 0, b"II*\x00"), # little-endian (Intel) TIFF (Bmp, 0, b"BM"), + (Webp, 0, b"RIFF"), ) diff --git a/src/docx/image/constants.py b/src/docx/image/constants.py index 729a828b2..c5c5bf38d 100644 --- a/src/docx/image/constants.py +++ b/src/docx/image/constants.py @@ -105,6 +105,7 @@ class MIME_TYPE: JPEG = "image/jpeg" PNG = "image/png" TIFF = "image/tiff" + WEBP = "image/webp" class PNG_CHUNK_TYPE: diff --git a/src/docx/image/webp.py b/src/docx/image/webp.py new file mode 100644 index 000000000..87c882394 --- /dev/null +++ b/src/docx/image/webp.py @@ -0,0 +1,108 @@ +"""Objects related to parsing headers of WEBP image streams.""" + +import io +from struct import unpack + +from docx.image.constants import MIME_TYPE +from docx.image.helpers import BIG_ENDIAN, StreamReader +from docx.image.image import BaseImageHeader + +class Webp(BaseImageHeader): + """Image header parser for WEBP image format.""" + + @classmethod + def from_stream(cls, stream): + """Return |Webp| instance having header properties parsed from WEBP image in + `stream`.""" + stream.seek(0) + stream_reader = StreamReader(stream, BIG_ENDIAN) + + # Skip RIFF header + 
stream_reader.skip(12) + + # Read VP8 header + vp8_header = stream_reader.read(4) + + if vp8_header == b'VP8 ': + # Simple WebP + stream_reader.skip(6) + width, height = unpack('> 6) + else: + raise ValueError('Unsupported WebP format') + + # WebP doesn't store DPI information, so we use default 72 DPI + horz_dpi = vert_dpi = 72 + + return cls(width, height, horz_dpi, vert_dpi) + + @property + def content_type(self): + """MIME content type for this image.""" + return MIME_TYPE.WEBP + + @property + def default_ext(self): + """Default filename extension, always 'webp' for WEBP images.""" + return 'webp' + +class _WebpParser: + """Parser for WebP image binary data.""" + + def __init__(self, stream): + self._stream = stream + self._stream_rdr = StreamReader(stream, BIG_ENDIAN) + self._width = None + self._height = None + + @property + def px_width(self): + self._parse_dimensions() + return self._width + + @property + def px_height(self): + self._parse_dimensions() + return self._height + + @property + def horz_dpi(self): + return 72 + + @property + def vert_dpi(self): + return 72 + + def _parse_dimensions(self): + if self._width is not None: + return + + self._stream_rdr.seek(12) # Skip RIFF header + vp8_header = self._stream_rdr.read(4) + + if vp8_header == b'VP8 ': + self._parse_simple_webp() + elif vp8_header == b'VP8L': + self._parse_lossless_webp() + else: + raise ValueError('Unsupported WebP format') + + def _parse_simple_webp(self): + self._stream_rdr.skip(6) + self._width, self._height = unpack('> 6) + + @classmethod + def parse(cls, stream): + parser = cls(stream) + return parser From 5e70252b2c729e0e6fd7b77202eb5d23961db9a2 Mon Sep 17 00:00:00 2001 From: abhishekbhakat Date: Tue, 20 Aug 2024 15:51:08 +0000 Subject: [PATCH 2/3] unused import --- src/docx/image/webp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/docx/image/webp.py b/src/docx/image/webp.py index 87c882394..17328ea01 100644 --- a/src/docx/image/webp.py +++ b/src/docx/image/webp.py @@ 
-1,6 +1,5 @@ """Objects related to parsing headers of WEBP image streams.""" -import io from struct import unpack from docx.image.constants import MIME_TYPE From 314304d2d0fbd7f7ca0505d52bbe3372ba51aa09 Mon Sep 17 00:00:00 2001 From: abhishekbhakat Date: Tue, 20 Aug 2024 19:00:10 +0000 Subject: [PATCH 3/3] webp stream reading to get proper image --- src/docx/image/webp.py | 194 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 172 insertions(+), 22 deletions(-) diff --git a/src/docx/image/webp.py b/src/docx/image/webp.py index 17328ea01..c455d3f66 100644 --- a/src/docx/image/webp.py +++ b/src/docx/image/webp.py @@ -1,4 +1,100 @@ -"""Objects related to parsing headers of WEBP image streams.""" +"""Objects related to parsing headers of WEBP image streams. + +Docs: https://developers.google.com/speed/webp/docs/riff_container + +VP8: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'R' | 'I' | 'F' | 'F' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| File Size | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'W' | 'E' | 'B' | 'P' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| ChunkHeader('VP8 ') | +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +: VP8 data : ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +1. Data begins with string RIFF +2. Little-endian 32-bit file size +3. String WEBP +4. String VP8 (with space) +5. Little-endian 32-bit chunk size +6. 3-byte frame tag for interframes, or 10-byte frame tag for keyframes +7. Compressed data partitions containing: + - Frame header + - Macroblock prediction data + - DCT/WHT coefficient data +The frame dimensions are encoded in the frame header within the first compressed data partition, +not in a fixed position like VP8L. 
+ +VP8L: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'R' | 'I' | 'F' | 'F' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| File Size | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'W' | 'E' | 'B' | 'P' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| ChunkHeader('VP8L') | +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +: VP8L data : ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +1. Data begins with the string RIFF +2. A little endian 32 bit value +3. String WEBP +4. String VP8L +5. A little endian 32 bit value +6. 1-byte signature 0x2f, and then the next 28 bits contain the width and the height (14 bits each, stored minus one) + +VP8X: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'R' | 'I' | 'F' | 'F' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| File Size | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| 'W' | 'E' | 'B' | 'P' | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| ChunkHeader('VP8X') | +| | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +|Rsv|I|L|E|X|A|R| Reserved | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Canvas Width Minus One | ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +... Canvas Height Minus One | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + +1. Data begins with string RIFF +2. Little-endian 32-bit file size +3. String WEBP +4. String VP8X +5. Little-endian 32-bit chunk size +6. Reserved (Rsv): 2 bits (MUST be 0. Readers MUST ignore this field.) +7. ICC profile (I): 1 bit (Set if the file contains an 'ICCP' Chunk.) +8. Alpha (L): 1 bit (Set if any of the frames of the image contain transparency information ("alpha").) 
+9. Exif metadata (E): 1 bit (Set if the file contains Exif metadata.) +10. XMP metadata (X): 1 bit (Set if the file contains XMP metadata.) +11. Animation (A): 1 bit (Set if this is an animated image. Data in 'ANIM' and 'ANMF' Chunks should be used to control the animation.) +12. Reserved (R): 1 bit (MUST be 0. Readers MUST ignore this field.) +13. Reserved: 24 bits (MUST be 0. Readers MUST ignore this field.) +14. Canvas Width Minus One: 24 bits (1-based width of the canvas in pixels. The actual canvas width is 1 + Canvas Width Minus One.) +15. Canvas Height Minus One: 24 bits (1-based height of the canvas in pixels. The actual canvas height is 1 + Canvas Height Minus One.) +16. The product of Canvas Width and Canvas Height MUST be at most 2^32 - 1. +""" from struct import unpack @@ -6,6 +102,7 @@ from docx.image.helpers import BIG_ENDIAN, StreamReader from docx.image.image import BaseImageHeader + class Webp(BaseImageHeader): """Image header parser for WEBP image format.""" @@ -14,31 +111,66 @@ def from_stream(cls, stream): """Return |Webp| instance having header properties parsed from WEBP image in `stream`.""" stream.seek(0) - stream_reader = StreamReader(stream, BIG_ENDIAN) + if stream.read(4) != b'RIFF': + raise ValueError("Not a valid WebP file") - # Skip RIFF header - stream_reader.skip(12) + _ = stream.read(4) # File size, we can skip this - # Read VP8 header - vp8_header = stream_reader.read(4) + if stream.read(4) != b'WEBP': + raise ValueError("Not a valid WebP file") - if vp8_header == b'VP8 ': - # Simple WebP - stream_reader.skip(6) - width, height = unpack('> 6) + chunk_header = stream.read(4) + + if chunk_header == b'VP8L': + width, height = cls._parse_lossless(stream) + elif chunk_header == b'VP8X': + width, height = cls._parse_extended(stream) + elif chunk_header == b'VP8 ': + width, height = cls._parse_simple(stream) else: - raise ValueError('Unsupported WebP format') + raise ValueError("Unsupported WebP format") + + return cls(width, height, 72, 
72) + + @staticmethod + def _parse_lossless(stream): + _ = unpack('> 14) & 0x3FFF) + 1 + + return w, h - # WebP doesn't store DPI information, so we use default 72 DPI - horz_dpi = vert_dpi = 72 + @staticmethod + def _parse_extended(stream): + _ = unpack('> 6) + def _parse_extended_webp(self): + self._stream_rdr.skip(8) # Skip chunk size and flags + width_minus_one = int.from_bytes(self._stream_rdr.read(3), 'little') + height_minus_one = int.from_bytes(self._stream_rdr.read(3), 'little') + self._width = width_minus_one + 1 + self._height = height_minus_one + 1 + @classmethod def parse(cls, stream): parser = cls(stream)