Skip to content

Commit 49e656c

Browse files
committed
Merge branch 'master' into PYTHON-4786
2 parents edca524 + c0f7810 commit 49e656c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+1827
-234
lines changed

.evergreen/resync-specs.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ do
7676
atlas-data-lake-testing|data_lake)
7777
cpjson atlas-data-lake-testing/tests/ data_lake
7878
;;
79+
bson-binary-vector|bson_binary_vector)
80+
cpjson bson-binary-vector/tests/ bson_binary_vector
81+
;;
7982
bson-corpus|bson_corpus)
8083
cpjson bson-corpus/tests/ bson_corpus
8184
;;

bson/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1324,7 +1324,7 @@ def decode_iter(
13241324
elements = data[position : position + obj_size]
13251325
position += obj_size
13261326

1327-
yield _bson_to_dict(elements, opts) # type:ignore[misc, type-var]
1327+
yield _bson_to_dict(elements, opts) # type:ignore[misc]
13281328

13291329

13301330
@overload
@@ -1370,7 +1370,7 @@ def decode_file_iter(
13701370
raise InvalidBSON("cut off in middle of objsize")
13711371
obj_size = _UNPACK_INT_FROM(size_data, 0)[0] - 4
13721372
elements = size_data + file_obj.read(max(0, obj_size))
1373-
yield _bson_to_dict(elements, opts) # type:ignore[type-var, arg-type, misc]
1373+
yield _bson_to_dict(elements, opts) # type:ignore[arg-type, misc]
13741374

13751375

13761376
def is_valid(bson: bytes) -> bool:

bson/binary.py

Lines changed: 146 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@
1313
# limitations under the License.
1414
from __future__ import annotations
1515

16-
from typing import TYPE_CHECKING, Any, Tuple, Type, Union
16+
import struct
17+
from dataclasses import dataclass
18+
from enum import Enum
19+
from typing import TYPE_CHECKING, Any, Sequence, Tuple, Type, Union
1720
from uuid import UUID
1821

1922
"""Tools for representing BSON binary data.
@@ -191,21 +194,75 @@ class UuidRepresentation:
191194
"""
192195

193196

197+
VECTOR_SUBTYPE = 9
198+
"""**(BETA)** BSON binary subtype for densely packed vector data.
199+
200+
.. versionadded:: 4.10
201+
"""
202+
203+
194204
USER_DEFINED_SUBTYPE = 128
195205
"""BSON binary subtype for any user defined structure.
196206
"""
197207

198208

209+
class BinaryVectorDtype(Enum):
    """**(BETA)** Datatypes of vector subtype.

    :param FLOAT32: (0x27) Pack list of :class:`float` as float32
    :param INT8: (0x03) Pack list of :class:`int` in [-128, 127] as signed int8
    :param PACKED_BIT: (0x10) Pack list of :class:`int` in [0, 255] as unsigned uint8

    The `PACKED_BIT` value represents a special case where vector values themselves
    can only be of two values (0 or 1) but these are packed together into groups of 8,
    a byte. In Python, these are displayed as ints in range [0, 255].

    Each member's value is a :class:`bytes` object of length one.

    .. versionadded:: 4.10
    """

    INT8 = b"\x03"
    FLOAT32 = b"\x27"
    PACKED_BIT = b"\x10"


@dataclass
class BinaryVector:
    """**(BETA)** Vector of numbers along with metadata for binary interoperability.

    Two instances compare equal when their ``dtype``, ``padding`` and ``data``
    are all equal.

    .. versionadded:: 4.10
    """

    __slots__ = ("data", "dtype", "padding")

    def __init__(self, data: Sequence[float | int], dtype: BinaryVectorDtype, padding: int = 0):
        """
        :param data: Sequence of numbers representing the mathematical vector.
        :param dtype:  The data type stored in binary
        :param padding: The number of bits in the final byte that are to be ignored
          when a vector element's size is less than a byte
          and the length of the vector is not a multiple of 8.
        """
        self.data = data
        self.dtype = dtype
        self.padding = padding

    # This class declares no annotated fields, so the __repr__/__eq__ that
    # @dataclass would generate operate on zero fields: every instance would
    # print as "BinaryVector()" and compare equal to every other instance.
    # Defining them explicitly makes @dataclass leave them alone.
    def __repr__(self) -> str:
        """Return a string showing the dtype, padding and data of the vector."""
        return f"BinaryVector(dtype={self.dtype}, padding={self.padding}, data={self.data})"

    def __eq__(self, other: object) -> bool:
        """Compare dtype, padding and data with another :class:`BinaryVector`.

        Returns ``NotImplemented`` for objects of any other class so that
        Python can fall back to the reflected comparison.
        """
        if other.__class__ is not self.__class__:
            return NotImplemented
        return (
            self.dtype == other.dtype
            and self.padding == other.padding
            and self.data == other.data
        )
249+
250+
199251
class Binary(bytes):
200252
"""Representation of BSON binary data.
201253
202-
This is necessary because we want to represent Python strings as
203-
the BSON string type. We need to wrap binary data so we can tell
254+
We want to represent Python strings as the BSON string type.
255+
We need to wrap binary data so that we can tell
204256
the difference between what should be considered binary data and
205257
what should be considered a string when we encode to BSON.
206258
207-
Raises TypeError if `data` is not an instance of :class:`bytes`
208-
or `subtype` is not an instance of :class:`int`.
259+
**(BETA)** Subtype 9 provides a space-efficient representation of 1-dimensional vector data.
260+
Its data is prepended with two bytes of metadata.
261+
The first (dtype) describes its data type, such as float32 or int8.
262+
The second (padding) prescribes the number of bits to ignore in the final byte.
263+
This is relevant when the element size of the dtype is not a multiple of 8.
264+
265+
Raises TypeError if `subtype` is not an instance of :class:`int`.
209266
Raises ValueError if `subtype` is not in [0, 256).
210267
211268
.. note::
@@ -218,7 +275,10 @@ class Binary(bytes):
218275
to use
219276
220277
.. versionchanged:: 3.9
221-
Support any bytes-like type that implements the buffer protocol.
278+
Support any bytes-like type that implements the buffer protocol.
279+
280+
.. versionchanged:: 4.10
281+
**(BETA)** Addition of vector subtype.
222282
"""
223283

224284
_type_marker = 5
@@ -337,6 +397,86 @@ def as_uuid(self, uuid_representation: int = UuidRepresentation.STANDARD) -> UUI
337397
f"cannot decode subtype {self.subtype} to {UUID_REPRESENTATION_NAMES[uuid_representation]}"
338398
)
339399

400+
@classmethod
def from_vector(
    cls: Type[Binary],
    vector: Sequence[float | int],
    dtype: BinaryVectorDtype,
    padding: int = 0,
) -> Binary:
    """**(BETA)** Create a BSON :class:`~bson.binary.Binary` of Vector subtype from a list of Numbers.

    To interpret the representation of the numbers, a data type must be included.
    See :class:`~bson.binary.BinaryVectorDtype` for available types and descriptions.

    The dtype and padding are prepended to the binary data's value.

    :param vector: Sequence of values
    :param dtype: Data type of the values
    :param padding: For fractional bytes, number of bits to ignore at end of vector.
    :return: Binary packed data identified by dtype and padding.
    :raises ValueError: if a non-zero `padding` is given for INT8 or FLOAT32.
    :raises NotImplementedError: if `dtype` is not one of the handled types.

    .. versionadded:: 4.10
    """
    if dtype == BinaryVectorDtype.INT8:  # pack ints in [-128, 127] as signed int8
        format_str = "b"
        if padding:
            raise ValueError(f"padding does not apply to {dtype=}")
    elif dtype == BinaryVectorDtype.PACKED_BIT:  # pack ints in [0, 255] as unsigned uint8
        format_str = "B"
    elif dtype == BinaryVectorDtype.FLOAT32:  # pack floats as float32
        format_str = "f"
        if padding:
            raise ValueError(f"padding does not apply to {dtype=}")
    else:
        raise NotImplementedError("%s not yet supported" % dtype)

    metadata = struct.pack("<sB", dtype.value, padding)
    # "<" forces little-endian, standard-size packing, matching the metadata
    # header above. Without a prefix, struct uses the host's native byte
    # order, so float32 payloads would differ between platforms.
    data = struct.pack(f"<{len(vector)}{format_str}", *vector)
    return cls(metadata + data, subtype=VECTOR_SUBTYPE)
437+
438+
def as_vector(self) -> BinaryVector:
    """**(BETA)** From the Binary, create a list of numbers, along with dtype and padding.

    The first two bytes are read as metadata (dtype byte, then padding byte);
    the remaining bytes are decoded according to the dtype.

    :return: BinaryVector
    :raises ValueError: if this Binary's subtype is not the vector subtype,
        if the dtype byte is not a member of :class:`~bson.binary.BinaryVectorDtype`,
        or if a float32 payload's length is not a multiple of 4.
    :raises NotImplementedError: if the dtype is a known member that this
        method does not decode.

    .. versionadded:: 4.10
    """

    if self.subtype != VECTOR_SUBTYPE:
        raise ValueError(f"Cannot decode subtype {self.subtype} as a vector.")

    position = 0
    dtype, padding = struct.unpack_from("<sB", self, position)
    position += 2
    dtype = BinaryVectorDtype(dtype)
    n_bytes = len(self) - position

    # Every format string below carries an explicit "<" so the payload is
    # always decoded as little-endian with standard sizes, consistent with
    # the metadata header, instead of using the host's native byte order.
    if dtype == BinaryVectorDtype.INT8:
        # One signed byte per element.
        vector = list(struct.unpack_from(f"<{n_bytes}b", self, position))
        return BinaryVector(vector, dtype, padding)

    elif dtype == BinaryVectorDtype.FLOAT32:
        n_values = n_bytes // 4
        if n_bytes % 4:
            raise ValueError(
                "Corrupt data. N bytes for a float32 vector must be a multiple of 4."
            )
        vector = list(struct.unpack_from(f"<{n_values}f", self, position))
        return BinaryVector(vector, dtype, padding)

    elif dtype == BinaryVectorDtype.PACKED_BIT:
        # data packed as uint8
        unpacked_uint8s = list(struct.unpack_from(f"<{n_bytes}B", self, position))
        return BinaryVector(unpacked_uint8s, dtype, padding)

    else:
        raise NotImplementedError("Binary Vector dtype %s not yet supported" % dtype.name)
479+
340480
@property
341481
def subtype(self) -> int:
342482
"""Subtype of this binary data."""

bson/decimal128.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def __init__(self, value: _VALUE_OPTIONS) -> None:
223223
"from list or tuple. Must have exactly 2 "
224224
"elements."
225225
)
226-
self.__high, self.__low = value # type: ignore
226+
self.__high, self.__low = value
227227
else:
228228
raise TypeError(f"Cannot convert {value!r} to Decimal128")
229229

bson/json_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@ def __new__(
324324
"JSONOptions.datetime_representation must be one of LEGACY, "
325325
"NUMBERLONG, or ISO8601 from DatetimeRepresentation."
326326
)
327-
self = cast(JSONOptions, super().__new__(cls, *args, **kwargs)) # type:ignore[arg-type]
327+
self = cast(JSONOptions, super().__new__(cls, *args, **kwargs))
328328
if json_mode not in (JSONMode.LEGACY, JSONMode.RELAXED, JSONMode.CANONICAL):
329329
raise ValueError(
330330
"JSONOptions.json_mode must be one of LEGACY, RELAXED, "

bson/son.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def __init__(
6868
self.update(kwargs)
6969

7070
def __new__(cls: Type[SON[_Key, _Value]], *args: Any, **kwargs: Any) -> SON[_Key, _Value]:
71-
instance = super().__new__(cls, *args, **kwargs) # type: ignore[type-var]
71+
instance = super().__new__(cls, *args, **kwargs)
7272
instance.__keys = []
7373
return instance
7474

doc/api/bson/binary.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,14 @@
2121
.. autoclass:: UuidRepresentation
2222
:members:
2323

24+
.. autoclass:: BinaryVectorDtype
25+
:members:
26+
:show-inheritance:
27+
28+
.. autoclass:: BinaryVector
29+
:members:
30+
31+
2432
.. autoclass:: Binary(data, subtype=BINARY_SUBTYPE)
2533
:members:
2634
:show-inheritance:

doc/async-tutorial.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
Async Tutorial
22
==============
33

4+
.. warning:: This API is currently in beta, meaning the classes, methods,
5+
and behaviors described within may change before the full release.
6+
If you come across any bugs during your use of this API,
7+
please file a Jira ticket in the "Python Driver" project at https://jira.mongodb.org/browse/PYTHON.
8+
49
.. code-block:: pycon
510
611
from pymongo import AsyncMongoClient

doc/changelog.rst

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,24 @@
11
Changelog
22
=========
33

4+
Changes in Version 4.10.0
5+
-------------------------
6+
7+
- Added provisional **(BETA)** support for a new Binary BSON subtype (9) used for efficient storage and retrieval of vectors:
8+
densely packed arrays of numbers, all of the same type.
9+
This includes new methods :meth:`~bson.binary.Binary.from_vector` and :meth:`~bson.binary.Binary.as_vector`.
10+
- Added an indication of whether the C extensions are in use to the client metadata, for example: ``{"driver": {"name": "PyMongo|c", "version": "4.10.0"}, ...}``
11+
- Fixed a bug where :class:`~pymongo.asynchronous.mongo_client.AsyncMongoClient` could deadlock.
12+
- Fixed a bug where PyMongo could fail to import on Windows if ``asyncio`` is misconfigured.
13+
14+
Issues Resolved
15+
...............
16+
17+
See the `PyMongo 4.10 release notes in JIRA`_ for the list of resolved issues
18+
in this release.
19+
20+
.. _PyMongo 4.10 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=40553
21+
422
Changes in Version 4.9.0
523
-------------------------
624

hatch.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ features = ["docs","test"]
1313
test = "sphinx-build -E -b doctest doc ./doc/_build/doctest"
1414

1515
[envs.typing]
16-
features = ["encryption", "ocsp", "zstd", "aws"]
17-
dependencies = ["mypy==1.2.0","pyright==1.1.290", "certifi", "typing_extensions"]
16+
pre-install-commands = [
17+
"pip install -q -r requirements/typing.txt",
18+
]
1819
[envs.typing.scripts]
1920
check-mypy = [
2021
"mypy --install-types --non-interactive bson gridfs tools pymongo",

pymongo/__init__.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@
8888

8989
from pymongo import _csot
9090
from pymongo._version import __version__, get_version_string, version_tuple
91-
from pymongo.common import MAX_SUPPORTED_WIRE_VERSION, MIN_SUPPORTED_WIRE_VERSION
91+
from pymongo.common import MAX_SUPPORTED_WIRE_VERSION, MIN_SUPPORTED_WIRE_VERSION, has_c
9292
from pymongo.cursor import CursorType
9393
from pymongo.operations import (
9494
DeleteMany,
@@ -116,16 +116,6 @@
116116
"""Current version of PyMongo."""
117117

118118

119-
def has_c() -> bool:
120-
"""Is the C extension installed?"""
121-
try:
122-
from pymongo import _cmessage # type: ignore[attr-defined] # noqa: F401
123-
124-
return True
125-
except ImportError:
126-
return False
127-
128-
129119
def timeout(seconds: Optional[float]) -> ContextManager[None]:
130120
"""**(Provisional)** Apply the given timeout for a block of operations.
131121

pymongo/_csot.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,13 @@ def __init__(self, timeout: Optional[float]):
7575
self._timeout = timeout
7676
self._tokens: Optional[tuple[Token[Optional[float]], Token[float], Token[float]]] = None
7777

78-
def __enter__(self) -> _TimeoutContext:
78+
def __enter__(self) -> None:
7979
timeout_token = TIMEOUT.set(self._timeout)
8080
prev_deadline = DEADLINE.get()
8181
next_deadline = time.monotonic() + self._timeout if self._timeout else float("inf")
8282
deadline_token = DEADLINE.set(min(prev_deadline, next_deadline))
8383
rtt_token = RTT.set(0.0)
8484
self._tokens = (timeout_token, deadline_token, rtt_token)
85-
return self
8685

8786
def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
8887
if self._tokens:

pymongo/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import re
1919
from typing import List, Tuple, Union
2020

21-
__version__ = "4.10.0.dev0"
21+
__version__ = "4.11.0.dev0"
2222

2323

2424
def get_version_tuple(version: str) -> Tuple[Union[int, str], ...]:

0 commit comments

Comments
 (0)