Skip to content

Commit 0ec1e6d

Browse files
authored
Merge branch 'master' into PYTHON-4636
2 parents c8b8395 + c0f7810 commit 0ec1e6d

File tree

14 files changed

+545
-12
lines changed

14 files changed

+545
-12
lines changed

.evergreen/resync-specs.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ do
7676
atlas-data-lake-testing|data_lake)
7777
cpjson atlas-data-lake-testing/tests/ data_lake
7878
;;
79+
bson-binary-vector|bson_binary_vector)
80+
cpjson bson-binary-vector/tests/ bson_binary_vector
81+
;;
7982
bson-corpus|bson_corpus)
8083
cpjson bson-corpus/tests/ bson_corpus
8184
;;

bson/binary.py

Lines changed: 146 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@
1313
# limitations under the License.
1414
from __future__ import annotations
1515

16-
from typing import TYPE_CHECKING, Any, Tuple, Type, Union
16+
import struct
17+
from dataclasses import dataclass
18+
from enum import Enum
19+
from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
1720
from uuid import UUID
1821

1922
"""Tools for representing BSON binary data.
@@ -191,21 +194,75 @@ class UuidRepresentation:
191194
"""
192195

193196

197+
VECTOR_SUBTYPE = 9
198+
"""**(BETA)** BSON binary subtype for densely packed vector data.
199+
200+
.. versionadded:: 4.10
201+
"""
202+
203+
194204
USER_DEFINED_SUBTYPE = 128
195205
"""BSON binary subtype for any user defined structure.
196206
"""
197207

198208

209+
class BinaryVectorDtype(Enum):
    """**(BETA)** Element datatypes available to the vector binary subtype.

    Every member's value is a :class:`bytes` object of length one; it is the
    tag byte that identifies how the vector's elements are packed.

    - ``INT8`` (0x03): a list of :class:`int` in [-128, 127], packed as signed int8.
    - ``FLOAT32`` (0x27): a list of :class:`float`, packed as float32.
    - ``PACKED_BIT`` (0x10): a list of :class:`int` in [0, 255], packed as uint8.

    ``PACKED_BIT`` is a special case: the vector's elements can each only be
    0 or 1, but they are stored eight to a byte. In Python these bytes are
    displayed as ints in the range [0, 255].

    .. versionadded:: 4.10
    """

    # Declaration order is kept as-is because it determines iteration order.
    INT8 = b"\x03"
    FLOAT32 = b"\x27"
    PACKED_BIT = b"\x10"
228+
229+
230+
@dataclass
class BinaryVector:
    """**(BETA)** Vector of numbers along with metadata for binary interoperability.

    Two instances compare equal when they are of the same class and their
    ``data``, ``dtype`` and ``padding`` are all equal.

    .. versionadded:: 4.10
    """

    __slots__ = ("data", "dtype", "padding")

    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
        """
        :param data: Sequence of numbers representing the mathematical vector.
        :param dtype: The data type stored in binary
        :param padding: The number of bits in the final byte that are to be ignored
          when a vector element's size is less than a byte
          and the length of the vector is not a multiple of 8.
        """
        self.data = data
        self.dtype = dtype
        self.padding = padding

    # The class declares no annotated fields, so the methods @dataclass would
    # generate see zero fields: its __repr__ prints "BinaryVector()" and its
    # __eq__ treats every pair of instances as equal. Define both explicitly;
    # @dataclass leaves methods that already exist on the class untouched.
    def __repr__(self) -> str:
        """Return a representation showing data, dtype and padding."""
        return (
            f"{type(self).__name__}(data={self.data!r}, "
            f"dtype={self.dtype!r}, padding={self.padding!r})"
        )

    def __eq__(self, other: object) -> bool:
        """Compare field by field; defer to ``other`` when the classes differ."""
        if other.__class__ is not self.__class__:
            return NotImplemented
        return (self.data, self.dtype, self.padding) == (
            other.data,
            other.dtype,
            other.padding,
        )
249+
250+
199251
class Binary(bytes):
200252
"""Representation of BSON binary data.
201253
202-
This is necessary because we want to represent Python strings as
203-
the BSON string type. We need to wrap binary data so we can tell
254+
We want to represent Python strings as the BSON string type.
255+
We need to wrap binary data so that we can tell
204256
the difference between what should be considered binary data and
205257
what should be considered a string when we encode to BSON.
206258
207-
Raises TypeError if `data` is not an instance of :class:`bytes`
208-
or `subtype` is not an instance of :class:`int`.
259+
**(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
260+
Its data is prepended with two bytes of metadata.
261+
The first (dtype) describes its data type, such as float32 or int8.
262+
The second (padding) prescribes the number of bits to ignore in the final byte.
263+
This is relevant when a vector element is smaller than a byte and the number of elements is not a multiple of 8.
264+
265+
Raises TypeError if `subtype` is not an instance of :class:`int`.
209266
Raises ValueError if `subtype` is not in [0, 256).
210267
211268
.. note::
@@ -218,7 +275,10 @@ class Binary(bytes):
218275
to use
219276
220277
.. versionchanged:: 3.9
221-
Support any bytes-like type that implements the buffer protocol.
278+
Support any bytes-like type that implements the buffer protocol.
279+
280+
.. versionchanged:: 4.10
281+
**(BETA)** Addition of vector subtype.
222282
"""
223283

224284
_type_marker = 5
@@ -337,6 +397,86 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI
337397
f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}"
338398
)
339399

400+
@classmethod
def from_vector(
    cls: Type[Binary],
    vector: Sequence[float | int],
    dtype: BinaryVectorDtype,
    padding: int = 0,
) -> Binary:
    """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.

    To interpret the representation of the numbers, a data type must be included.
    See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.

    The dtype and padding are prepended to the binary data's value.
    All values are packed little-endian, regardless of the host byte order.

    :param vector: List of values
    :param dtype: Data type of the values
    :param padding: For fractional bytes, number of bits to ignore at end of vector.
      Only meaningful for ``PACKED_BIT``; must be in [0, 7].
    :return: Binary packed data identified by dtype and padding.
    :raises ValueError: If ``padding`` is non-zero for a dtype other than
      ``PACKED_BIT``, or is outside [0, 7].
    :raises NotImplementedError: If ``dtype`` is not a supported datatype.
    :raises struct.error: If a value cannot be packed as ``dtype``
      (for example an out-of-range int, or a float given for an integer dtype).

    .. versionadded:: 4.10
    """
    if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
        format_str = "b"
    elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
        format_str = "B"
    elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
        format_str = "f"
    else:
        raise NotImplementedError("%s not yet supported" % dtype)

    # Padding counts ignored bits in the last byte, which only makes sense
    # when several elements share a byte.
    if padding and dtype != BinaryVectorDtype.PACKED_BIT:
        raise ValueError(f"padding does not apply to {dtype=}")
    # A byte has 8 bits, so at most 7 of them can be padding.
    if not 0 <= padding <= 7:
        raise ValueError(f"{padding=}. It must be in [0, 7].")

    metadata = struct.pack("<sB", dtype.value, padding)
    # "<" forces little-endian standard sizes; without it struct uses the
    # host's native byte order and float32 bytes would differ on big-endian
    # machines.
    data = struct.pack(f"<{len(vector)}{format_str}", *vector)
    return cls(metadata + data, subtype=VECTOR_SUBTYPE)
437+
438+
def as_vector(self) -> BinaryVector:
    """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding.

    The first two bytes are read as metadata (dtype tag, then padding); the
    remaining bytes are decoded little-endian according to the dtype.

    :return: BinaryVector
    :raises ValueError: If this Binary's subtype is not the vector subtype,
      if the dtype byte is not a known :class:`~bson.binary.BinaryVectorDtype`,
      or if a float32 payload is not a multiple of 4 bytes long.
    :raises NotImplementedError: If the dtype is recognised but has no decoder here.

    .. versionadded:: 4.10
    """

    if self.subtype != VECTOR_SUBTYPE:
        raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")

    # Two metadata bytes precede the values: the dtype tag and the padding.
    # struct raises struct.error here if fewer than two bytes are present.
    dtype_tag, padding = struct.unpack_from("<sB", self, 0)
    position = 2
    dtype = BinaryVectorDtype(dtype_tag)
    n_bytes = len(self) - position

    if dtype == BinaryVectorDtype.INT8:
        format_string = f"<{n_bytes}b"
    elif dtype == BinaryVectorDtype.FLOAT32:
        if n_bytes % 4:
            raise ValueError(
                "Corrupt data. N bytes for a float32 vector must be a multiple of 4."
            )
        # "<" matters here: float32 values are stored little-endian, and a
        # bare "f" would decode them in the host's native byte order.
        format_string = f"<{n_bytes // 4}f"
    elif dtype == BinaryVectorDtype.PACKED_BIT:
        # data packed as uint8
        format_string = f"<{n_bytes}B"
    else:
        raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)

    vector = list(struct.unpack_from(format_string, self, position))
    return BinaryVector(vector, dtype, padding)
479+
340480
@property
341481
def subtype(self) -> int:
342482
"""Subtype of this binary data."""

doc/api/bson/binary.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@
2121
.. autoclass:: UuidRepresentation
2222
:members:
2323

24+
.. autoclass:: BinaryVectorDtype
25+
:members:
26+
:show-inheritance:
27+
28+
.. autoclass:: BinaryVector
29+
:members:
30+
31+
2432
.. autoclass:: Binary(data, subtype=BINARY_SUBTYPE)
2533
:members:
2634
:show-inheritance:

doc/async-tutorial.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
Async Tutorial
22
==============
33

4+
.. warning:: This API is currently in beta, meaning the classes, methods,
5+
and behaviors described within may change before the full release.
6+
If you come across any bugs during your use of this API,
7+
please file a Jira ticket in the "Python Driver" project at https://jira.mongodb.org/browse/PYTHON.
8+
49
.. code-block:: pycon
510
611
from pymongo import AsyncMongoClient

doc/changelog.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,24 @@
11
Changelog
22
=========
33

4+
Changes in Version 4.10.0
5+
-------------------------
6+
7+
- Added provisional **(BETA)** support for a new Binary BSON subtype (9) used for efficient storage and retrieval of vectors:
8+
densely packed arrays of numbers, all of the same type.
9+
This includes new methods :meth:`~bson.binary.Binary.from_vector` and :meth:`~bson.binary.Binary.as_vector`.
10+
- Added C extension use to client metadata, for example: ``{"driver": {"name": "PyMongo|c", "version": "4.10.0"}, ...}``
11+
- Fixed a bug where :class:`~pymongo.asynchronous.mongo_client.AsyncMongoClient` could deadlock.
12+
- Fixed a bug where PyMongo could fail to import on Windows if ``asyncio`` is misconfigured.
13+
14+
Issues Resolved
15+
...............
16+
17+
See the `PyMongo 4.10 release notes in JIRA`_ for the list of resolved issues
18+
in this release.
19+
20+
.. _PyMongo 4.10 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=40553
21+
422
Changes in Version 4.9.0
523
-------------------------
624

pymongo/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import re
1919
from typing import List, Tuple, Union
2020

21-
__version__ = "4.10.0.dev0"
21+
__version__ = "4.11.0.dev0"
2222

2323

2424
def get_version_tuple(version: str) -> Tuple[Union[int, str], ...]:

test/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ def _init_client(self):
313313
params = self.cmd_line["parsed"].get("setParameter", {})
314314
if params.get("enableTestCommands") == "1":
315315
self.test_commands_enabled = True
316-
self.has_ipv6 = self._server_started_with_ipv6()
316+
self.has_ipv6 = self._server_started_with_ipv6()
317317

318318
self.is_mongos = (self.hello).get("msg") == "isdbgrid"
319319
if self.is_mongos:

test/asynchronous/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -313,7 +313,7 @@ async def _init_client(self):
313313
params = self.cmd_line["parsed"].get("setParameter", {})
314314
if params.get("enableTestCommands") == "1":
315315
self.test_commands_enabled = True
316-
self.has_ipv6 = await self._server_started_with_ipv6()
316+
self.has_ipv6 = await self._server_started_with_ipv6()
317317

318318
self.is_mongos = (await self.hello).get("msg") == "isdbgrid"
319319
if self.is_mongos:

test/bson_binary_vector/float32.json

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"description": "Tests of Binary subtype 9, Vectors, with dtype FLOAT32",
3+
"test_key": "vector",
4+
"tests": [
5+
{
6+
"description": "Simple Vector FLOAT32",
7+
"valid": true,
8+
"vector": [127.0, 7.0],
9+
"dtype_hex": "0x27",
10+
"dtype_alias": "FLOAT32",
11+
"padding": 0,
12+
"canonical_bson": "1C00000005766563746F72000A0000000927000000FE420000E04000"
13+
},
14+
{
15+
"description": "Empty Vector FLOAT32",
16+
"valid": true,
17+
"vector": [],
18+
"dtype_hex": "0x27",
19+
"dtype_alias": "FLOAT32",
20+
"padding": 0,
21+
"canonical_bson": "1400000005766563746F72000200000009270000"
22+
},
23+
{
24+
"description": "Infinity Vector FLOAT32",
25+
"valid": true,
26+
"vector": ["-inf", 0.0, "inf"],
27+
"dtype_hex": "0x27",
28+
"dtype_alias": "FLOAT32",
29+
"padding": 0,
30+
"canonical_bson": "2000000005766563746F72000E000000092700000080FF000000000000807F00"
31+
},
32+
{
33+
"description": "FLOAT32 with padding",
34+
"valid": false,
35+
"vector": [127.0, 7.0],
36+
"dtype_hex": "0x27",
37+
"dtype_alias": "FLOAT32",
38+
"padding": 3
39+
}
40+
]
41+
}
42+

test/bson_binary_vector/int8.json

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
{
2+
"description": "Tests of Binary subtype 9, Vectors, with dtype INT8",
3+
"test_key": "vector",
4+
"tests": [
5+
{
6+
"description": "Simple Vector INT8",
7+
"valid": true,
8+
"vector": [127, 7],
9+
"dtype_hex": "0x03",
10+
"dtype_alias": "INT8",
11+
"padding": 0,
12+
"canonical_bson": "1600000005766563746F7200040000000903007F0700"
13+
},
14+
{
15+
"description": "Empty Vector INT8",
16+
"valid": true,
17+
"vector": [],
18+
"dtype_hex": "0x03",
19+
"dtype_alias": "INT8",
20+
"padding": 0,
21+
"canonical_bson": "1400000005766563746F72000200000009030000"
22+
},
23+
{
24+
"description": "Overflow Vector INT8",
25+
"valid": false,
26+
"vector": [128],
27+
"dtype_hex": "0x03",
28+
"dtype_alias": "INT8",
29+
"padding": 0
30+
},
31+
{
32+
"description": "Underflow Vector INT8",
33+
"valid": false,
34+
"vector": [-129],
35+
"dtype_hex": "0x03",
36+
"dtype_alias": "INT8",
37+
"padding": 0
38+
},
39+
{
40+
"description": "INT8 with padding",
41+
"valid": false,
42+
"vector": [127, 7],
43+
"dtype_hex": "0x03",
44+
"dtype_alias": "INT8",
45+
"padding": 3
46+
},
47+
{
48+
"description": "INT8 with float inputs",
49+
"valid": false,
50+
"vector": [127.77, 7.77],
51+
"dtype_hex": "0x03",
52+
"dtype_alias": "INT8",
53+
"padding": 0
54+
}
55+
]
56+
}
57+

0 commit comments

Comments
 (0)