From ec0102e96d31a2518047eb1cf3ffd707ffb347f7 Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Fri, 17 Jul 2020 01:32:23 -0700 Subject: [PATCH 1/8] PYTHON-2252 Add examples and documentation for new UUID behavior --- doc/examples/uuid.rst | 290 ++++++++++++++++++++++++++++++++++++++++ pymongo/mongo_client.py | 5 +- 2 files changed, 293 insertions(+), 2 deletions(-) create mode 100644 doc/examples/uuid.rst diff --git a/doc/examples/uuid.rst b/doc/examples/uuid.rst new file mode 100644 index 0000000000..ec748c2db9 --- /dev/null +++ b/doc/examples/uuid.rst @@ -0,0 +1,290 @@ +.. _Handling UUID Data: + +Handling UUID Data +================== + +PyMongo ships with built-in support for dealing with UUID types. +It is trivially simple to store native :class:`uuid.UUID` objects +to MongoDB and retrieve them as native :class:`uuid.UUID`s:: + + from pymongo import MongoClient + from bson.binary import UuidRepresentation + from uuid import uuid4 + + # use the 'standard' representation for cross-language compatibility. + client = MongoClient(uuid_representation=UuidRepresentation.STANDARD) + collection = client.get_database('uuid_db').get_collection('uuid_coll') + + # remove all documents from collection + collection.delete_many({}) + + # create a native uuid object + uuid_obj = uuid4() + + # save the native uuid object to MongoDB + collection.insert_one({'uuid': uuid_obj}) + + # retrieve the stored uuid object from MongoDB + document = collection.find_one({}) + + # check that the retrieved document matches the inserted document + assert document['uuid'] == uuid_obj + +Native :class:`uuid.UUID`s can also be used as part of MongoDB queries:: + + document = collection.find({'uuid': uuid_obj}) + assert document['uuid'] == uuid_obj + +The above examples illustrate the simplest of use-cases - one where the +UUID is generated by, and used in the same application. 
However, +the situation can be significantly more complex when dealing with a MongoDB +deployment that contains UUIDs created by other drivers as the Java and CSharp +drivers have historically encoded UUIDs using a byte-order that is different +from the one used by PyMongo. Applications that require interoperability across +these drivers must specify the appropriate +:class:`~bson.binary.UuidRepresentation`. + +In the following sections, we describe how drivers have historically differed +in their encoding of UUIDs, and how applications can use the +:class:`~bson.binary.UuidRepresentation` configuration option to maintain +cross-language compatibility. + +.. attention:: Applications that do not share a MongoDB deployment with + any other application and that have never stored UUIDs in MongoDB + should use the ``standard`` UUID representation for cross-language + compatibility. See :ref:`configuring-uuid-representation` for details + on how to configure the :class:`~bson.binary.UuidRepresentation`. + +.. _example-legacy-uuid: + +Legacy Handling of UUID Data +---------------------------- + +Historically, MongoDB Drivers have used different byte-ordering +while serializing UUID types to :class:`~bson.binary.Binary`. +Consider, for instance, a UUID with the following canonical textual +representation:: + + 00112233-4455-6677-8899-aabbccddeeff + +This UUID would historically be serialized by the Python driver as:: + + 00112233-4455-6677-8899-aabbccddeeff + +The same UUID would historically be serialized by the C# driver as:: + + 33221100-5544-7766-8899-aabbccddeeff + +Finally, the same UUID would historically be serialized by the Java driver as:: + + 77665544-3322-1100-ffee-ddccbbaa9988 + +.. note:: For in-depth information about the byte-order historically + used by different drivers, see the `Handling of Native UUID Types + Specification + `_. 
+ +This difference in the byte-order of UUIDs encoded by different drivers +resulted in highly unintuitive behavior in deployments where more than +one of these drivers was in use. As example, consider the following situation: + +* Application ``M`` written in C# generates a UUID and uses it as the ``_id`` + of a document that it proceeds to insert into the ``uuid_test`` collection of + the ``example_db`` database. Let's assume that the canonical textual + representation of the generated UUID is:: + + 00112233-4455-6677-8899-aabbccddeeff + +* Application ``N`` written in Python attempts to ``find`` the document + written by application ``M`` in the following manner:: + + from uuid import UUID + collection = client.example_db.uuid_test + results = collection.find({'_id': UUID('00112233-4455-6677-8899-aabbccddeeff')}) + + In this instance, the ``results`` cursor will never contain the document that + was inserted by application ``M`` in the previous step. This is because of + the different byte-order used by the C# driver for representing UUIDs as + BSON Binary. The following query, on the other hand, will successfully find + this document:: + + results = collection.find({'_id': UUID('33221100-5544-7766-8899-aabbccddeeff')}) + +As this example demonstrates, differing byte-order can hamper +interoperability between applications that use different drivers. To workaround +this problem, users can configure their ``MongoClient`` with the appropriate +:class:`~bson.binary.UuidRepresentation`. + +.. _configuring-uuid-representation: + +Configuring a UUID Representation +--------------------------------- + +Setting a :class:`~bson.binary.UuidRepresentation` configures +PyMongo's behavior while encoding/decoding native UUID types to/from BSON. +Applications can set the UUID representation in one of the following ways: + +#. 
Using the ``uuidRepresentation`` URI option:: + + client = MongoClient("mongodb://a:27107/?uuidRepresentation=javaLegacy") + + Valid values are ``pythonLegacy``, ``javaLegacy``, ``csharpLegacy``, + ``standard`` and ``unspecified``. + +#. Using the ``uuid_representation`` kwarg option:: + + from bson.binary import UuidRepresentation + client = MongoClient(uuid_representation=UuidRepresentation.PYTHON_LEGACY) + +#. By supplying a suitable :class:`~bson.codec_options.CodecOptions` instance:: + + from bson.codec_options import CodecOptions + csharp_opts = CodecOptions(uuid_representation=UuidRepresentation.CSHARP_LEGACY) + csharp_database = client.get_database('csharp_db', codec_options=csharp_opts) + csharp_collection = client.testdb.get_collection('csharp_coll', codec_options=csharp_opts) + + +We now detail the behavior and use-case for each support UUID representation. + +The ``PYTHON_LEGACY`` UUID Representation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. attention:: This uuid representation should be used when reading UUIDs + written to MongoDB by existing applications that use the Python driver + and don't explicitly set a UUID representation. + +.. attention:: As of PyMongo 3.11.0, + :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` + is the default uuid representation used by PyMongo. + +The :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` representation +corresponds to the legacy representation of UUIDs used by PyMongo. This +representation conforms with +`RFC 4122 Section 4.1.2 `_. 
+ +This is illustrated with the following example:: + + from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS + from bson.binary import UuidRepresentation + + # No configured UUID representation (legacy + collection = client.python_legacy.get_collection('test', codec_options=DEFAULT_CODEC_OPTIONS) + + # Using UuidRepresentation.PYTHON_LEGACY + pylegacy_opts = CodecOptions(uuid_representation=UuidRepresentation.PYTHON_LEGACY) + pylegacy_collection = client.python_legacy.get_collection('test', codec_options=pylegacy_opts) + + # UUIDs written by PyMongo with no UuidRepresentation configured can be queried using PYTHON_LEGACY + uuid_1 = uuid4() + collection.insert_one({'uuid': uuid_1}) + document = pylegacy_collection.find_one({'uuid': uuid_1}) + + # UUIDs written using PYTHON_LEGACY can be read by PyMongo with no UuidRepresentation configured + uuid_2 = uuid4() + pylegacy_collection.insert_one({'uuid': uuid_2}) + document = collection.find_one({'uuid': uuid_2}) + +``PYTHON_LEGACY`` encodes native :class:`uuid.UUID`s to :class:`~bson.binary.Binary` +subtype 3 objects, preserving the same byte-order as :class:`uuid.UUID.bytes`:: + + from bson.binary import Binary + + document = collection.find_one({'uuid': Binary(uuid_2.bytes, subtype=3)}) + assert document['uuid'] == uuid_2 + +The ``JAVA_LEGACY`` UUID Representation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. attention:: This uuid representation should be used when reading UUIDs + written to MongoDB by the Java driver without an explicitly configured UUID + representation. + +The :data:`~bson.binary.UuidRepresentation.JAVA_LEGACY` representation +corresponds to the legacy representation of UUIDs used by the MongoDB Java +Driver. + +.. note:: The ``JAVA_LEGACY`` representation reverses the order of bytes 0-7, + and bytes 8-15. + +As an example, consider the same UUID described in :ref:`example-legacy-uuid`. 
+Let us assume that an application used the Java driver without an explicitly +specified UUID representation to insert the example UUID +``00112233-4455-6677-8899-aabbccddeeff``into MongoDB. If we try to read this +value using PyMongo with no UUID representation specified, we end up with an +entirely different UUID:: + + UUID('77665544-3322-1100-ffee-ddccbbaa9988') + +However, if we explicitly set the representation to +:data:`~bson.binary.UuidRepresentation.JAVA_LEGACY`, we get the correct result:: + + UUID('00112233-4455-6677-8899-aabbccddeeff') + +PyMongo uses the specified UUID representation to reorder the BSON bytes and +load them correctly. ``JAVA_LEGACY`` encodes native :class:`uuid.UUID`s to +:class:`~bson.binary.Binary` subtype 3 objects, while performing the same +byte-reordering as the legacy Java driver's UUID to BSON encoder. + +The ``CSHARP_LEGACY`` UUID Representation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. attention:: This uuid representation should be used when reading UUIDs + written to MongoDB by the C# driver without an explicitly configured UUID + representation. + +The :data:`~bson.binary.UuidRepresentation.CSHARP_LEGACY` representation +corresponds to the legacy representation of UUIDs used by the MongoDB C# +Driver. + +.. note:: The ``CSHARP_LEGACY`` representation reverses the order of bytes 0-3, + bytes 4-5, and bytes 6-7. + +As an example, consider the same UUID described in :ref:`example-legacy-uuid`. +Let us assume that an application used the C# driver without an explicitly +specified UUID representation to insert the example UUID +``00112233-4455-6677-8899-aabbccddeeff``into MongoDB. 
If we try to read this +value using PyMongo with no UUID representation specified, we end up with an +entirely different UUID:: + + UUID('33221100-5544-7766-8899-aabbccddeeff') + +However, if we explicitly set the representation to +:data:`~bson.binary.UuidRepresentation.CSHARP_LEGACY`, we get the correct result:: + + UUID('00112233-4455-6677-8899-aabbccddeeff') + +PyMongo uses the specified UUID representation to reorder the BSON bytes and +load them correctly. ``CSHARP_LEGACY`` encodes native :class:`uuid.UUID`s to +:class:`~bson.binary.Binary` subtype 3 objects, while performing the same +byte-reordering as the legacy C# driver's UUID to BSON encoder. + +The ``STANDARD`` UUID Representation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. attention:: This uuid representation should be used by new applications + that have never stored UUIDs in MongoDB. + +The :data:`~bson.binary.UuidRepresentation.STANDARD` representation +enables cross-language compatibility by ensuring the same byte-ordering +when encoding UUIDs from all drivers. UUIDs written by a driver with this +representation configured can be read by every other driver correctly provided +it is configured with the ``STANDARD`` representation. + +``STANDARD`` encodes native :class:`uuid.UUID`s to +:class:`~bson.binary.Binary` subtype 4 objects. + + +The ``UNSPECIFIED`` UUID Representation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. attention:: Starting in PyMongo 4.0, + :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` will be the default + UUID representation used by PyMongo. + +The :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` representation +prevents the incorrect interpretation of UUID bytes by stopping short of +automatically converting UUID fields in BSON to native UUID types. Loading +a UUID When using this representation returns a :data:`~bson.binary.Binary` +object instead. 
Users can explicitly convert the :data:`~bson.binary.Binary` +objects into native UUIDs in the appropriate representation by using the +:meth:`~bson.binary.Binary.as_uuid` method. diff --git a/pymongo/mongo_client.py b/pymongo/mongo_client.py index 2a6e9d180f..41eb7f28ef 100644 --- a/pymongo/mongo_client.py +++ b/pymongo/mongo_client.py @@ -339,8 +339,9 @@ def __init__( - `uuidRepresentation`: The BSON representation to use when encoding from and decoding to instances of :class:`~uuid.UUID`. Valid values are `pythonLegacy` (the default), `javaLegacy`, - `csharpLegacy` and `standard`. New applications should consider - setting this to `standard` for cross language compatibility. + `csharpLegacy`, `standard` and `unspecified`. New applications + should consider setting this to `standard` for cross language + compatibility. | **Write Concern options:** | (Only set if passed. No default values.) From 1101ea2a6ee25c2b88e579facf773e040b87a2f8 Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Tue, 21 Jul 2020 01:52:03 -0700 Subject: [PATCH 2/8] address review comments --- bson/binary.py | 12 +++++ bson/codec_options.py | 5 +- doc/examples/index.rst | 1 + doc/examples/uuid.rst | 103 ++++++++++++++++++++++++++++++---------- pymongo/mongo_client.py | 2 +- 5 files changed, 97 insertions(+), 26 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index cb89c69da2..cc66444698 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -69,6 +69,8 @@ class UuidRepresentation: code. When decoding a BSON binary field with a UUID subtype, a :class:`~bson.binary.Binary` instance will be returned instead of a :class:`uuid.UUID` instance. + + See :ref:`unspecified-representation-details` for details. .. versionadded:: 3.11 """ @@ -79,6 +81,8 @@ class UuidRepresentation: :class:`uuid.UUID` instances will automatically be encoded to and decoded from BSON binary, using RFC-4122 byte order with binary subtype :data:`UUID_SUBTYPE`. + + See :ref:`standard-representation-details` for details. 
.. versionadded:: 3.11 """ @@ -89,6 +93,8 @@ class UuidRepresentation: :class:`uuid.UUID` instances will automatically be encoded to and decoded from BSON binary, using RFC-4122 byte order with binary subtype :data:`OLD_UUID_SUBTYPE`. + + See :ref:`python-legacy-representation-details` for details. .. versionadded:: 3.11 """ @@ -99,6 +105,8 @@ class UuidRepresentation: :class:`uuid.UUID` instances will automatically be encoded to and decoded from BSON binary subtype :data:`OLD_UUID_SUBTYPE`, using the Java driver's legacy byte order. + + See :ref:`java-legacy-representation-details` for details. .. versionadded:: 3.11 """ @@ -109,6 +117,8 @@ class UuidRepresentation: :class:`uuid.UUID` instances will automatically be encoded to and decoded from BSON binary subtype :data:`OLD_UUID_SUBTYPE`, using the C# driver's legacy byte order. + + See :ref:`csharp-legacy-representation-details` for details. .. versionadded:: 3.11 """ @@ -220,6 +230,7 @@ def from_uuid(cls, uuid, uuid_representation=UuidRepresentation.STANDARD): - `uuid_representation`: A member of :class:`~bson.binary.UuidRepresentation`. Default: :const:`~bson.binary.UuidRepresentation.STANDARD`. + See :ref:`handling-uuid-data-example` for details. .. versionadded:: 3.11 """ @@ -266,6 +277,7 @@ def as_uuid(self, uuid_representation=UuidRepresentation.STANDARD): - `uuid_representation`: A member of :class:`~bson.binary.UuidRepresentation`. Default: :const:`~bson.binary.UuidRepresentation.STANDARD`. + See :ref:`handling-uuid-data-example` for details. .. versionadded:: 3.11 """ diff --git a/bson/codec_options.py b/bson/codec_options.py index a514cc92d0..4ffcdb0a59 100644 --- a/bson/codec_options.py +++ b/bson/codec_options.py @@ -218,7 +218,10 @@ class CodecOptions(_options_base): naive. Defaults to ``False``. - `uuid_representation`: The BSON representation to use when encoding and decoding instances of :class:`~uuid.UUID`. Defaults to - :data:`~bson.binary.PYTHON_LEGACY`. 
+ :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY`. New + applications should consider setting this to + :data:`~bson.binary.UuidRepresentation.STANDARD` for cross language + compatibility. See :ref:`handling-uuid-data-example` for details. - `unicode_decode_error_handler`: The error handler to apply when a Unicode-related error occurs during BSON decoding that would otherwise raise :exc:`UnicodeDecodeError`. Valid options include diff --git a/doc/examples/index.rst b/doc/examples/index.rst index baadd74464..f8828cdfd7 100644 --- a/doc/examples/index.rst +++ b/doc/examples/index.rst @@ -32,3 +32,4 @@ MongoDB, you can start it like so: tailable tls encryption + uuid diff --git a/doc/examples/uuid.rst b/doc/examples/uuid.rst index ec748c2db9..1dedfaa19e 100644 --- a/doc/examples/uuid.rst +++ b/doc/examples/uuid.rst @@ -1,11 +1,11 @@ -.. _Handling UUID Data: +.. _handling-uuid-data-example: Handling UUID Data ================== PyMongo ships with built-in support for dealing with UUID types. It is trivially simple to store native :class:`uuid.UUID` objects -to MongoDB and retrieve them as native :class:`uuid.UUID`s:: +to MongoDB and retrieve them as native :class:`uuid.UUID` objects:: from pymongo import MongoClient from bson.binary import UuidRepresentation @@ -30,7 +30,8 @@ to MongoDB and retrieve them as native :class:`uuid.UUID`s:: # check that the retrieved document matches the inserted document assert document['uuid'] == uuid_obj -Native :class:`uuid.UUID`s can also be used as part of MongoDB queries:: +Native :class:`uuid.UUID` objects can also be used as part of MongoDB +queries:: document = collection.find({'uuid': uuid_obj}) assert document['uuid'] == uuid_obj @@ -100,21 +101,22 @@ one of these drivers was in use. 
As example, consider the following situation: from uuid import UUID collection = client.example_db.uuid_test - results = collection.find({'_id': UUID('00112233-4455-6677-8899-aabbccddeeff')}) + result = collection.find_one({'_id': UUID('00112233-4455-6677-8899-aabbccddeeff')}) - In this instance, the ``results`` cursor will never contain the document that + In this instance, ``result`` will never be the document that was inserted by application ``M`` in the previous step. This is because of the different byte-order used by the C# driver for representing UUIDs as BSON Binary. The following query, on the other hand, will successfully find this document:: - results = collection.find({'_id': UUID('33221100-5544-7766-8899-aabbccddeeff')}) + result = collection.find_one({'_id': UUID('33221100-5544-7766-8899-aabbccddeeff')}) As this example demonstrates, differing byte-order can hamper interoperability between applications that use different drivers. To workaround this problem, users can configure their ``MongoClient`` with the appropriate :class:`~bson.binary.UuidRepresentation`. + .. _configuring-uuid-representation: Configuring a UUID Representation @@ -126,26 +128,28 @@ Applications can set the UUID representation in one of the following ways: #. Using the ``uuidRepresentation`` URI option:: - client = MongoClient("mongodb://a:27107/?uuidRepresentation=javaLegacy") + client = MongoClient("mongodb://a:27107/?uuidRepresentation=javaLegacy") Valid values are ``pythonLegacy``, ``javaLegacy``, ``csharpLegacy``, ``standard`` and ``unspecified``. #. Using the ``uuid_representation`` kwarg option:: - from bson.binary import UuidRepresentation - client = MongoClient(uuid_representation=UuidRepresentation.PYTHON_LEGACY) + from bson.binary import UuidRepresentation + client = MongoClient(uuid_representation=UuidRepresentation.PYTHON_LEGACY) #. 
By supplying a suitable :class:`~bson.codec_options.CodecOptions` instance:: - from bson.codec_options import CodecOptions - csharp_opts = CodecOptions(uuid_representation=UuidRepresentation.CSHARP_LEGACY) - csharp_database = client.get_database('csharp_db', codec_options=csharp_opts) - csharp_collection = client.testdb.get_collection('csharp_coll', codec_options=csharp_opts) + from bson.codec_options import CodecOptions + csharp_opts = CodecOptions(uuid_representation=UuidRepresentation.CSHARP_LEGACY) + csharp_database = client.get_database('csharp_db', codec_options=csharp_opts) + csharp_collection = client.testdb.get_collection('csharp_coll', codec_options=csharp_opts) We now detail the behavior and use-case for each support UUID representation. +.. _python-legacy-representation-details: + The ``PYTHON_LEGACY`` UUID Representation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -153,9 +157,8 @@ The ``PYTHON_LEGACY`` UUID Representation written to MongoDB by existing applications that use the Python driver and don't explicitly set a UUID representation. -.. attention:: As of PyMongo 3.11.0, - :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` - is the default uuid representation used by PyMongo. +.. attention:: :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` + has been the default uuid representation since PyMongo 2.9. The :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` representation corresponds to the legacy representation of UUIDs used by PyMongo. 
This @@ -167,7 +170,7 @@ This is illustrated with the following example:: from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS from bson.binary import UuidRepresentation - # No configured UUID representation (legacy + # No configured UUID representation collection = client.python_legacy.get_collection('test', codec_options=DEFAULT_CODEC_OPTIONS) # Using UuidRepresentation.PYTHON_LEGACY @@ -192,6 +195,8 @@ subtype 3 objects, preserving the same byte-order as :class:`uuid.UUID.bytes`:: document = collection.find_one({'uuid': Binary(uuid_2.bytes, subtype=3)}) assert document['uuid'] == uuid_2 +.. _java-legacy-representation-details: + The ``JAVA_LEGACY`` UUID Representation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -209,7 +214,7 @@ Driver. As an example, consider the same UUID described in :ref:`example-legacy-uuid`. Let us assume that an application used the Java driver without an explicitly specified UUID representation to insert the example UUID -``00112233-4455-6677-8899-aabbccddeeff``into MongoDB. If we try to read this +``00112233-4455-6677-8899-aabbccddeeff`` into MongoDB. If we try to read this value using PyMongo with no UUID representation specified, we end up with an entirely different UUID:: @@ -225,6 +230,8 @@ load them correctly. ``JAVA_LEGACY`` encodes native :class:`uuid.UUID`s to :class:`~bson.binary.Binary` subtype 3 objects, while performing the same byte-reordering as the legacy Java driver's UUID to BSON encoder. +.. _csharp-legacy-representation-details: + The ``CSHARP_LEGACY`` UUID Representation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -242,7 +249,7 @@ Driver. As an example, consider the same UUID described in :ref:`example-legacy-uuid`. Let us assume that an application used the C# driver without an explicitly specified UUID representation to insert the example UUID -``00112233-4455-6677-8899-aabbccddeeff``into MongoDB. If we try to read this +``00112233-4455-6677-8899-aabbccddeeff`` into MongoDB. 
If we try to read this value using PyMongo with no UUID representation specified, we end up with an entirely different UUID:: @@ -258,6 +265,8 @@ load them correctly. ``CSHARP_LEGACY`` encodes native :class:`uuid.UUID`s to :class:`~bson.binary.Binary` subtype 3 objects, while performing the same byte-reordering as the legacy C# driver's UUID to BSON encoder. +.. _standard-representation-details: + The ``STANDARD`` UUID Representation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -267,12 +276,13 @@ The ``STANDARD`` UUID Representation The :data:`~bson.binary.UuidRepresentation.STANDARD` representation enables cross-language compatibility by ensuring the same byte-ordering when encoding UUIDs from all drivers. UUIDs written by a driver with this -representation configured can be read by every other driver correctly provided -it is configured with the ``STANDARD`` representation. +representation configured will be handled correctly by every other driver provided +it is also configured with the ``STANDARD`` representation. ``STANDARD`` encodes native :class:`uuid.UUID`s to :class:`~bson.binary.Binary` subtype 4 objects. +.. _unspecified-representation-details: The ``UNSPECIFIED`` UUID Representation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -284,7 +294,52 @@ The ``UNSPECIFIED`` UUID Representation The :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` representation prevents the incorrect interpretation of UUID bytes by stopping short of automatically converting UUID fields in BSON to native UUID types. Loading -a UUID When using this representation returns a :data:`~bson.binary.Binary` -object instead. Users can explicitly convert the :data:`~bson.binary.Binary` +a UUID When using this representation returns a :class:`~bson.binary.Binary` +object instead. Users can explicitly convert the :class:`~bson.binary.Binary` objects into native UUIDs in the appropriate representation by using the -:meth:`~bson.binary.Binary.as_uuid` method. +:meth:`~bson.binary.Binary.as_uuid` method. 
The following example shows +what this might look like for a UUID stored by the C# driver:: + + from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS + from bson.binary import Binary, UuidRepresentation + from uuid import uuid4 + + # Using UuidRepresentation.CSHARP_LEGACY + csharp_opts = CodecOptions(uuid_representation=UuidRepresentation.CSHARP_LEGACY) + + # Store a C#-formatted UUID + input_uuid = uuid4() + collection = client.testdb.get_collection('test', codec_options=csharp_opts) + collection.insert_one({'_id': 'foo', 'uuid': input_uuid}) + + # Using UuidRepresentation.UNSPECIFIED + unspec_opts = CodecOptions(uuid_representation=UuidRepresentation.UNSPECIFIED) + unspec_collection = client.testdb.get_collection('test', codec_options=unspec_opts) + + # UUID fields are decoded as Binary when UuidRepresentation.UNSPECIFIED is configured + uuid_1 = uuid4() + document = unspec_collection.find_one({'_id': 'foo'}) + decoded_field = document['uuid'] + assert isinstance(decoded_field, Binary) + + # Binary.as_uuid() can be used to coerce the decoded value to a native UUID + decoded_uuid = decoded_field.as_uuid(UuidRepresentation.CSHARP_LEGACY) + assert decoded_uuid == input_uuid + +Native :class:`uuid.UUID`s cannot directly be encoded to +:class:`~bson.binary.Binary` when the UUID representation is ``UNSPECIFIED`` +and attempting to do so will result in an exception:: + + unspec_collection.insert_one({'_id': 'bar', 'uuid': uuid4()}) + Traceback (most recent call last): + ... + ValueError: cannot encode native uuid.UUID with UuidRepresentation.UNSPECIFIED. UUIDs can be manually converted to bson.Binary instances using bson.Binary.from_uuid() or a different UuidRepresentation can be configured. 
+ +Instead, applications using :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` +must explicitly coerce a native UUID using the +:meth:`~bson.binary.Binary.from_uuid` method:: + + explicit_binary = Binary.from_uuid(uuid4(), UuidRepresentation.PYTHON_LEGACY) + unspec_collection.insert_one({'_id': 'bar', 'uuid': explicit_binary}) + + diff --git a/pymongo/mongo_client.py b/pymongo/mongo_client.py index 41eb7f28ef..c625ccbede 100644 --- a/pymongo/mongo_client.py +++ b/pymongo/mongo_client.py @@ -341,7 +341,7 @@ def __init__( values are `pythonLegacy` (the default), `javaLegacy`, `csharpLegacy`, `standard` and `unspecified`. New applications should consider setting this to `standard` for cross language - compatibility. + compatibility. See :ref:`handling-uuid-data-example` for details. | **Write Concern options:** | (Only set if passed. No default values.) From 4692ce9791ac4f071eeb6040b9b0a252c3c69598 Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Tue, 21 Jul 2020 10:02:51 -0700 Subject: [PATCH 3/8] cleanup --- doc/examples/uuid.rst | 49 +++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/doc/examples/uuid.rst b/doc/examples/uuid.rst index 1dedfaa19e..d08124c555 100644 --- a/doc/examples/uuid.rst +++ b/doc/examples/uuid.rst @@ -87,7 +87,7 @@ Finally, the same UUID would historically be serialized by the Java driver as:: This difference in the byte-order of UUIDs encoded by different drivers resulted in highly unintuitive behavior in deployments where more than -one of these drivers was in use. As example, consider the following situation: +one of these drivers was in use. For example, consider the following situation: * Application ``M`` written in C# generates a UUID and uses it as the ``_id`` of a document that it proceeds to insert into the ``uuid_test`` collection of @@ -112,9 +112,9 @@ one of these drivers was in use. 
As example, consider the following situation: result = collection.find_one({'_id': UUID('33221100-5544-7766-8899-aabbccddeeff')}) As this example demonstrates, differing byte-order can hamper -interoperability between applications that use different drivers. To workaround -this problem, users can configure their ``MongoClient`` with the appropriate -:class:`~bson.binary.UuidRepresentation`. +interoperability between applications that use different drivers but share a +MongoDB deployment. To workaround this problem, users should configure their +``MongoClient`` with the appropriate :class:`~bson.binary.UuidRepresentation`. .. _configuring-uuid-representation: @@ -126,27 +126,29 @@ Setting a :class:`~bson.binary.UuidRepresentation` configures PyMongo's behavior while encoding/decoding native UUID types to/from BSON. Applications can set the UUID representation in one of the following ways: -#. Using the ``uuidRepresentation`` URI option:: +#. Using the ``uuidRepresentation`` URI option, e.g.:: - client = MongoClient("mongodb://a:27107/?uuidRepresentation=javaLegacy") + client = MongoClient("mongodb://a:27107/?uuidRepresentation=javaLegacy") - Valid values are ``pythonLegacy``, ``javaLegacy``, ``csharpLegacy``, - ``standard`` and ``unspecified``. + Valid values are ``pythonLegacy``, ``javaLegacy``, ``csharpLegacy``, + ``standard`` and ``unspecified``. -#. Using the ``uuid_representation`` kwarg option:: +#. Using the ``uuid_representation`` kwarg option, e.g.:: - from bson.binary import UuidRepresentation - client = MongoClient(uuid_representation=UuidRepresentation.PYTHON_LEGACY) + from bson.binary import UuidRepresentation + client = MongoClient(uuid_representation=UuidRepresentation.PYTHON_LEGACY) -#. By supplying a suitable :class:`~bson.codec_options.CodecOptions` instance:: +#. 
By supplying a suitable :class:`~bson.codec_options.CodecOptions` + instance, e.g.:: - from bson.codec_options import CodecOptions - csharp_opts = CodecOptions(uuid_representation=UuidRepresentation.CSHARP_LEGACY) - csharp_database = client.get_database('csharp_db', codec_options=csharp_opts) - csharp_collection = client.testdb.get_collection('csharp_coll', codec_options=csharp_opts) + from bson.codec_options import CodecOptions + csharp_opts = CodecOptions(uuid_representation=UuidRepresentation.CSHARP_LEGACY) + csharp_database = client.get_database('csharp_db', codec_options=csharp_opts) + csharp_collection = client.testdb.get_collection('csharp_coll', codec_options=csharp_opts) -We now detail the behavior and use-case for each support UUID representation. +We now detail the behavior and use-case for each supported UUID +representation. .. _python-legacy-representation-details: @@ -154,8 +156,8 @@ The ``PYTHON_LEGACY`` UUID Representation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. attention:: This uuid representation should be used when reading UUIDs - written to MongoDB by existing applications that use the Python driver - and don't explicitly set a UUID representation. + generated by existing applications that use the Python driver + but **don't** explicitly set a UUID representation. .. attention:: :data:`~bson.binary.UuidRepresentation.PYTHON_LEGACY` has been the default uuid representation since PyMongo 2.9. @@ -165,7 +167,7 @@ corresponds to the legacy representation of UUIDs used by PyMongo. This representation conforms with `RFC 4122 Section 4.1.2 `_. 
-This is illustrated with the following example:: +The following example illustrates the use of this representation:: from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS from bson.binary import UuidRepresentation @@ -295,9 +297,10 @@ The :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` representation prevents the incorrect interpretation of UUID bytes by stopping short of automatically converting UUID fields in BSON to native UUID types. Loading a UUID When using this representation returns a :class:`~bson.binary.Binary` -object instead. Users can explicitly convert the :class:`~bson.binary.Binary` -objects into native UUIDs in the appropriate representation by using the -:meth:`~bson.binary.Binary.as_uuid` method. The following example shows +object instead. If required, users can coerce the decoded +:class:`~bson.binary.Binary` objects into native UUIDs using the +:meth:`~bson.binary.Binary.as_uuid` method and specifying the appropriate +representation format. The following example shows what this might look like for a UUID stored by the C# driver:: from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS From c38b40ae025188d044861eff2a14c3102e940db3 Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Tue, 21 Jul 2020 23:27:40 -0700 Subject: [PATCH 4/8] more improvements --- doc/examples/uuid.rst | 117 +++++++++++++++++++++++++++++++++--------- 1 file changed, 93 insertions(+), 24 deletions(-) diff --git a/doc/examples/uuid.rst b/doc/examples/uuid.rst index d08124c555..2afb36502c 100644 --- a/doc/examples/uuid.rst +++ b/doc/examples/uuid.rst @@ -27,7 +27,7 @@ to MongoDB and retrieve them as native :class:`uuid.UUID` objects:: # retrieve the stored uuid object from MongoDB document = collection.find_one({}) - # check that the retrieved document matches the inserted document + # check that the retrieved UUID matches the inserted UUID assert document['uuid'] == uuid_obj Native :class:`uuid.UUID` objects can also be used as part of MongoDB 
@@ -116,7 +116,6 @@ interoperability between applications that use different drivers but share a MongoDB deployment. To workaround this problem, users should configure their ``MongoClient`` with the appropriate :class:`~bson.binary.UuidRepresentation`. - .. _configuring-uuid-representation: Configuring a UUID Representation @@ -146,14 +145,55 @@ Applications can set the UUID representation in one of the following ways: csharp_database = client.get_database('csharp_db', codec_options=csharp_opts) csharp_collection = client.testdb.get_collection('csharp_coll', codec_options=csharp_opts) +Supported UUID Representations +------------------------------ + +.. list-table:: + :header-rows: 1 + + * - UUID Representation + - Default? + - Encode :class:`uuid.UUID` to + - Decode :class:`~bson.binary.Binary` subtype 4 to + - Decode :class:`~bson.binary.Binary` subtype 3 to + + * - :ref:`python-legacy-representation-details` + - Yes, in PyMongo>=2.9,<4 + - :class:`~bson.binary.Binary` subtype 3 with standard byte-order + - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` + + * - :ref:`java-legacy-representation-details` + - No + - :class:`~bson.binary.Binary` subtype 3 with Java legacy byte-order + - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` + + * - :ref:`csharp-legacy-representation-details` + - No + - :class:`~bson.binary.Binary` subtype 3 with C# legacy byte-order + - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` + + * - :ref:`standard-representation-details` + - No + - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` + - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 3 in PyMongo>=4 + + * - :ref:`unspecified-representation-details` + - Yes, in PyMongo>=4 + - Raise :exc:`ValueError` + - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 3 in PyMongo>=4 We now detail the behavior and use-case for each supported UUID representation. 
.. _python-legacy-representation-details: -The ``PYTHON_LEGACY`` UUID Representation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``PYTHON_LEGACY`` +^^^^^^^^^^^^^^^^^ .. attention:: This uuid representation should be used when reading UUIDs generated by existing applications that use the Python driver @@ -177,7 +217,7 @@ The following example illustrates the use of this representation:: # Using UuidRepresentation.PYTHON_LEGACY pylegacy_opts = CodecOptions(uuid_representation=UuidRepresentation.PYTHON_LEGACY) - pylegacy_collection = client.python_legacy..get_collection('test', codec_options=pylegacy_opts) + pylegacy_collection = client.python_legacy.get_collection('test', codec_options=pylegacy_opts) # UUIDs written by PyMongo with no UuidRepresentation configured can be queried using PYTHON_LEGACY uuid_1 = uuid4() @@ -189,8 +229,9 @@ The following example illustrates the use of this representation:: pylegacy_collection.insert_one({'uuid': uuid_2}) document = collection.find_one({'uuid': uuid_2}) -``PYTHON_LEGACY`` encodes native :class:`uuid.UUID`s to :class:`~bson.binary.Binary` -subtype 3 objects, preserving the same byte-order as :class:`uuid.UUID.bytes`:: +``PYTHON_LEGACY`` encodes native :class:`uuid.UUID` objects to +:class:`~bson.binary.Binary` subtype 3 objects, preserving the same +byte-order as :attr:`~uuid.UUID.bytes`:: from bson.binary import Binary @@ -199,8 +240,8 @@ subtype 3 objects, preserving the same byte-order as :class:`uuid.UUID.bytes`:: .. _java-legacy-representation-details: -The ``JAVA_LEGACY`` UUID Representation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``JAVA_LEGACY`` +^^^^^^^^^^^^^^^ .. 
attention:: This uuid representation should be used when reading UUIDs written to MongoDB by the Java driver without an explicitly configured UUID @@ -228,14 +269,14 @@ However, if we explicitly set the representation to UUID('00112233-4455-6677-8899-aabbccddeeff') PyMongo uses the specified UUID representation to reorder the BSON bytes and -load them correctly. ``JAVA_LEGACY`` encodes native :class:`uuid.UUID`s to -:class:`~bson.binary.Binary` subtype 3 objects, while performing the same +load them correctly. ``JAVA_LEGACY`` encodes native :class:`uuid.UUID` objects +to :class:`~bson.binary.Binary` subtype 3 objects, while performing the same byte-reordering as the legacy Java driver's UUID to BSON encoder. .. _csharp-legacy-representation-details: -The ``CSHARP_LEGACY`` UUID Representation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``CSHARP_LEGACY`` +^^^^^^^^^^^^^^^^^ .. attention:: This uuid representation should be used when reading UUIDs written to MongoDB by the C# driver without an explicitly configured UUID @@ -263,14 +304,14 @@ However, if we explicitly set the representation to UUID('00112233-4455-6677-8899-aabbccddeeff') PyMongo uses the specified UUID representation to reorder the BSON bytes and -load them correctly. ``CSHARP_LEGACY`` encodes native :class:`uuid.UUID`s to -:class:`~bson.binary.Binary` subtype 3 objects, while performing the same -byte-reordering as the legacy C# driver's UUID to BSON encoder. +load them correctly. ``CSHARP_LEGACY`` encodes native :class:`uuid.UUID` +objects to :class:`~bson.binary.Binary` subtype 3 objects, while performing +the same byte-reordering as the legacy C# driver's UUID to BSON encoder. .. _standard-representation-details: -The ``STANDARD`` UUID Representation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``STANDARD`` +^^^^^^^^^^^^ .. attention:: This uuid representation should be used by new applications that have never stored UUIDs in MongoDB. @@ -281,13 +322,43 @@ when encoding UUIDs from all drivers. 
UUIDs written by a driver with this representation configured will be handled correctly by every other driver provided it is also configured with the ``STANDARD`` representation. -``STANDARD`` encodes native :class:`uuid.UUID`s to +``STANDARD`` encodes native :class:`uuid.UUID` objects to :class:`~bson.binary.Binary` subtype 4 objects. +.. attention:: In PyMongo 3.x, applications can end up inadvertently changing + the :class:`~bson.binary.Binary` subtype of a UUID field when round-tripping + documents with legacy-encoded UUIDs. This happens because in PyMongo 3.x, + the ``STANDARD`` representation decodes all UUIDs (subtypes 3 and 4) as + native :class:`uuid.UUID` objects, and encodes all native UUIDs as a + subtype 4. For example:: + + from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS + from bson.binary import Binary, UuidRepresentation + from uuid import uuid4 + + # Using UuidRepresentation.PYTHON_LEGACY stores a Binary subtype-3 UUID + python_opts = CodecOptions(uuid_representation=UuidRepresentation.PYTHON_LEGACY) + input_uuid = uuid4() + collection = client.testdb.get_collection('test', codec_options=python_opts) + collection.insert_one({'_id': 'foo', 'uuid': input_uuid}) + assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)})['_id'] == 'foo' + + # Retrieving this document using UuidRepresentation.STANDARD returns a native UUID + std_opts = CodecOptions(uuid_representation=UuidRepresentation.STANDARD) + std_collection = client.testdb.get_collection('test', codec_options=std_opts) + doc = std_collection.find_one({'_id': 'foo'}) + assert doc['uuid'] == input_uuid + + # Round-tripping the retrieved document silently changes the Binary subtype to 4 + std_collection.replace_one({'_id': 'foo'}, doc) + assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)}) is None + round_tripped_doc = collection.find_one({'uuid': Binary(input_uuid.bytes, 4)}) + assert doc == round_tripped_doc + ..
_unspecified-representation-details: -The ``UNSPECIFIED`` UUID Representation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``UNSPECIFIED`` +^^^^^^^^^^^^^^^ .. attention:: Starting in PyMongo 4.0, :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` will be the default @@ -329,7 +400,7 @@ what this might look like for a UUID stored by the C# driver:: decoded_uuid = decoded_field.as_uuid(UuidRepresentation.CSHARP_LEGACY) assert decoded_uuid == input_uuid -Native :class:`uuid.UUID`s cannot directly be encoded to +Native :class:`uuid.UUID` objects cannot directly be encoded to :class:`~bson.binary.Binary` when the UUID representation is ``UNSPECIFIED`` and attempting to do so will result in an exception:: @@ -344,5 +415,3 @@ must explicitly coerce a native UUID using the explicit_binary = Binary.from_uuid(uuid4(), UuidRepresentation.PYTHON_LEGACY) unspec_collection.insert_one({'_id': 'bar', 'uuid': explicit_binary}) - - From a57f0a0dd1aff4400e85788d75c62aa86a03a05a Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Wed, 22 Jul 2020 19:41:55 -0700 Subject: [PATCH 5/8] final changes --- doc/examples/uuid.rst | 147 +++++++++++++++++++++++++++++------------- 1 file changed, 101 insertions(+), 46 deletions(-) diff --git a/doc/examples/uuid.rst b/doc/examples/uuid.rst index 2afb36502c..8befedfe19 100644 --- a/doc/examples/uuid.rst +++ b/doc/examples/uuid.rst @@ -85,9 +85,14 @@ Finally, the same UUID would historically be serialized by the Java driver as:: Specification `_. -This difference in the byte-order of UUIDs encoded by different drivers -resulted in highly unintuitive behavior in deployments where more than -one of these drivers was in use. For example, consider the following situation: +This difference in the byte-order of UUIDs encoded by different drivers can +result in highly unintuitive behavior in some scenarios. We detail two such +scenarios in the next sections. 
+ +Scenario 1: Applications Share a MongoDB Deployment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Consider the following situation: * Application ``M`` written in C# generates a UUID and uses it as the ``_id`` of a document that it proceeds to insert into the ``uuid_test`` collection of @@ -111,26 +116,106 @@ one of these drivers was in use. For example, consider the following situation: result = collection.find_one({'_id': UUID('33221100-5544-7766-8899-aabbccddeeff')}) -As this example demonstrates, differing byte-order can hamper -interoperability between applications that use different drivers but share a -MongoDB deployment. To workaround this problem, users should configure their -``MongoClient`` with the appropriate :class:`~bson.binary.UuidRepresentation`. +This example demonstrates how the differing byte-order used by different +drivers can hamper interoperability. To workaround this problem, users should +configure their ``MongoClient`` with the appropriate +:class:`~bson.binary.UuidRepresentation` (in this case, ``client`` in application +``M`` can be configured to use the +:data:`~bson.binary.UuidRepresentation.CSHARP_LEGACY` representation to +avoid the unintuitive behavior) as described in +:ref:`configuring-uuid-representation`. + +Scenario 2: Round-Tripping UUIDs +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In this scenario, we see how using a misconfigured +:class:`~bson.binary.UuidRepresentation` can cause an application +to inadvertently change the :class:`~bson.binary.Binary` subtype of a UUID +field when round-tripping documents containing UUIDs. 
+ +Consider the following situation:: + + from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS + from bson.binary import Binary, UuidRepresentation + from uuid import uuid4 + + # Using UuidRepresentation.PYTHON_LEGACY stores a Binary subtype-3 UUID + python_opts = CodecOptions(uuid_representation=UuidRepresentation.PYTHON_LEGACY) + input_uuid = uuid4() + collection = client.testdb.get_collection('test', codec_options=python_opts) + collection.insert_one({'_id': 'foo', 'uuid': input_uuid}) + assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)})['_id'] == 'foo' + + # Retrieving this document using UuidRepresentation.STANDARD returns a native UUID + std_opts = CodecOptions(uuid_representation=UuidRepresentation.STANDARD) + std_collection = client.testdb.get_collection('test', codec_options=std_opts) + doc = std_collection.find_one({'_id': 'foo'}) + assert doc['uuid'] == input_uuid + + # Round-tripping the retrieved document silently changes the Binary subtype to 4 + std_collection.replace_one({'_id': 'foo'}, doc) + assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)}) is None + round_tripped_doc = collection.find_one({'uuid': Binary(input_uuid.bytes, 4)}) + assert doc == round_tripped_doc + + +In this example, round-tripping the document using the incorrect +:class:`~bson.binary.UuidRepresentation` (``STANDARD`` instead of +``PYTHON_LEGACY``) changes the :class:`~bson.binary.Binary` subtype as a +side-effect. **Note that this can also happen when the situation is reversed - +i.e. when the original document is written using ``STANDARD`` representation +and then round-tripped using the ``PYTHON_LEGACY`` representation. Note also +that replacing ``PYTHON_LEGACY`` by ``JAVA_LEGACY`` or ``CSHARP_LEGACY`` in +the above example produces the same behavior.** + +.. 
note:: This will not be an issue in PyMongo>=4 as starting in that version, + the ``STANDARD`` representation will decode Binary subtype 3 fields as + :class:`~bson.binary.Binary` objects of subtype 3 (instead of + :class:`uuid.UUID`), and each of the ``LEGACY_*`` representations will + decode Binary subtype 4 fields to :class:`~bson.binary.Binary` objects of + subtype 4 (instead of :class:`uuid.UUID`). This will prevent + .. _configuring-uuid-representation: Configuring a UUID Representation --------------------------------- -Setting a :class:`~bson.binary.UuidRepresentation` configures -PyMongo's behavior while encoding/decoding native UUID types to/from BSON. +Users can workaround the problems described above by configuring their +applications with the appropriate :class:`~bson.binary.UuidRepresentation`. +Configuring the representation modifies PyMongo's behavior while +encoding :class:`uuid.UUID` objects to BSON and decoding +Binary subtype 3 and 4 fields from BSON. + Applications can set the UUID representation in one of the following ways: -#. Using the ``uuidRepresentation`` URI option, e.g.:: +#. At the ``MongoClient`` level using the ``uuidRepresentation`` URI option, + e.g.:: client = MongoClient("mongodb://a:27107/?uuidRepresentation=javaLegacy") - Valid values are ``pythonLegacy``, ``javaLegacy``, ``csharpLegacy``, - ``standard`` and ``unspecified``. + Valid values are: + + .. list-table:: + :header-rows: 1 + + * - Value + - UUID Representation + + * - ``pythonLegacy`` + - :ref:`python-legacy-representation-details` + + * - ``javaLegacy`` + - :ref:`java-legacy-representation-details` + + * - ``csharpLegacy`` + - :ref:`csharp-legacy-representation-details` + + * - ``standard`` + - :ref:`standard-representation-details` + + * - ``unspecified`` + - :ref:`unspecified-representation-details` #. 
Using the ``uuid_representation`` kwarg option, e.g.:: @@ -160,19 +245,19 @@ Supported UUID Representations * - :ref:`python-legacy-representation-details` - Yes, in PyMongo>=2.9,<4 - :class:`~bson.binary.Binary` subtype 3 with standard byte-order - - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 4 in PyMongo>=4 - :class:`uuid.UUID` * - :ref:`java-legacy-representation-details` - No - :class:`~bson.binary.Binary` subtype 3 with Java legacy byte-order - - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 4 in PyMongo>=4 - :class:`uuid.UUID` * - :ref:`csharp-legacy-representation-details` - No - :class:`~bson.binary.Binary` subtype 3 with C# legacy byte-order - - :class:`~bson.binary.Binary` subtype 4 + - :class:`uuid.UUID` in PyMongo<4; :class:`~bson.binary.Binary` subtype 4 in PyMongo>=4 - :class:`uuid.UUID` * - :ref:`standard-representation-details` @@ -313,7 +398,7 @@ the same byte-reordering as the legacy C# driver's UUID to BSON encoder. ``STANDARD`` ^^^^^^^^^^^^ -.. attention:: This uuid representation should be used by new applications +.. attention:: This UUID representation should be used by new applications that have never stored UUIDs in MongoDB. The :data:`~bson.binary.UuidRepresentation.STANDARD` representation @@ -325,36 +410,6 @@ it is also configured with the ``STANDARD`` representation. ``STANDARD`` encodes native :class:`uuid.UUID` objects to :class:`~bson.binary.Binary` subtype 4 objects. -.. attention:: In PyMongo 3.x, applications can end up inadvertently changing - the :class:`~bson.binary.Binary` subtype of a UUID field when round-tripping - documents with legacy-encoded UUIDs. This happens because in PyMongo 3.x, - the ``STANDARD`` representation decodes all UUIDs (subtypes 3 and 4) as - native :class:`uuid.UUID` objects, and encodes all native UUIDs as a - subtype 4. 
For example:: - - from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS - from bson.binary import Binary, UuidRepresentation - from uuid import uuid4 - - # Using UuidRepresentation.PYTHON_LEGACY stores a Binary subtype-3 UUID - python_opts = CodecOptions(uuid_representation=UuidRepresentation.PYTHON_LEGACY) - input_uuid = uuid4() - collection = client.testdb.get_collection('test', codec_options=python_opts) - collection.insert_one({'_id': 'foo', 'uuid': input_uuid}) - assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)})['_id'] == 'foo' - - # Retrieving this document using UuidRepresentation.STANDARD returns a native UUID - std_opts = CodecOptions(uuid_representation=UuidRepresentation.STANDARD) - std_collection = client.testdb.get_collection('test', codec_options=std_opts) - doc = std_collection.find_one({'_id': 'foo'}) - assert doc['uuid'] == input_uuid - - # Round-tripping the retrieved document silently changes the Binary subtype to 4 - std_collection.replace_one({'_id': 'foo'}, doc) - assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)}) is None - round_tripped_doc = collection.find_one({'uuid': Binary(input_uuid.bytes, 4)}) - assert doc == round_tripped_doc - .. _unspecified-representation-details: ``UNSPECIFIED`` From 9a20e7664d4c8907911cbe7a9ac844c2cdd81902 Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Fri, 24 Jul 2020 10:51:23 -0700 Subject: [PATCH 6/8] address review comments --- doc/examples/uuid.rst | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/doc/examples/uuid.rst b/doc/examples/uuid.rst index 8befedfe19..6859f92c17 100644 --- a/doc/examples/uuid.rst +++ b/doc/examples/uuid.rst @@ -4,7 +4,7 @@ Handling UUID Data ================== PyMongo ships with built-in support for dealing with UUID types. 
-It is trivially simple to store native :class:`uuid.UUID` objects +It is straightforward to store native :class:`uuid.UUID` objects to MongoDB and retrieve them as native :class:`uuid.UUID` objects:: from pymongo import MongoClient @@ -50,7 +50,7 @@ in their encoding of UUIDs, and how applications can use the :class:`~bson.binary.UuidRepresentation` configuration option to maintain cross-language compatibility. -.. attention:: Applications that do not share a MongoDB deployment with +.. attention:: New applications that do not share a MongoDB deployment with any other application and that have never stored UUIDs in MongoDB should use the ``standard`` UUID representation for cross-language compatibility. See :ref:`configuring-uuid-representation` for details @@ -94,22 +94,22 @@ Scenario 1: Applications Share a MongoDB Deployment Consider the following situation: -* Application ``M`` written in C# generates a UUID and uses it as the ``_id`` +* Application ``C`` written in C# generates a UUID and uses it as the ``_id`` of a document that it proceeds to insert into the ``uuid_test`` collection of the ``example_db`` database. Let's assume that the canonical textual representation of the generated UUID is:: 00112233-4455-6677-8899-aabbccddeeff -* Application ``N`` written in Python attempts to ``find`` the document - written by application ``M`` in the following manner:: +* Application ``P`` written in Python attempts to ``find`` the document + written by application ``C`` in the following manner:: from uuid import UUID collection = client.example_db.uuid_test result = collection.find_one({'_id': UUID('00112233-4455-6677-8899-aabbccddeeff')}) In this instance, ``result`` will never be the document that - was inserted by application ``M`` in the previous step. This is because of + was inserted by application ``C`` in the previous step. This is because of the different byte-order used by the C# driver for representing UUIDs as BSON Binary. 
The following query, on the other hand, will successfully find this document:: @@ -120,7 +120,7 @@ This example demonstrates how the differing byte-order used by different drivers can hamper interoperability. To workaround this problem, users should configure their ``MongoClient`` with the appropriate :class:`~bson.binary.UuidRepresentation` (in this case, ``client`` in application -``M`` can be configured to use the +``P`` can be configured to use the :data:`~bson.binary.UuidRepresentation.CSHARP_LEGACY` representation to avoid the unintuitive behavior) as described in :ref:`configuring-uuid-representation`. @@ -168,13 +168,12 @@ and then round-tripped using the ``PYTHON_LEGACY`` representation. Note also that replacing ``PYTHON_LEGACY`` by ``JAVA_LEGACY`` or ``CSHARP_LEGACY`` in the above example produces the same behavior.** -.. note:: This will not be an issue in PyMongo>=4 as starting in that version, +.. note:: Starting in PyMongo 4.0, this issue will be resolved as the ``STANDARD`` representation will decode Binary subtype 3 fields as :class:`~bson.binary.Binary` objects of subtype 3 (instead of :class:`uuid.UUID`), and each of the ``LEGACY_*`` representations will decode Binary subtype 4 fields to :class:`~bson.binary.Binary` objects of - subtype 4 (instead of :class:`uuid.UUID`). This will prevent - + subtype 4 (instead of :class:`uuid.UUID`). .. _configuring-uuid-representation: @@ -328,9 +327,9 @@ byte-order as :attr:`~uuid.UUID.bytes`:: ``JAVA_LEGACY`` ^^^^^^^^^^^^^^^ -.. attention:: This uuid representation should be used when reading UUIDs - written to MongoDB by the Java driver without an explicitly configured UUID - representation. +.. attention:: This UUID representation should be used when reading UUIDs + written to MongoDB by the legacy applications (i.e. applications that don't + use the ``STANDARD`` representation) using the Java driver. 
The :data:`~bson.binary.UuidRepresentation.JAVA_LEGACY` representation corresponds to the legacy representation of UUIDs used by the MongoDB Java @@ -363,9 +362,9 @@ byte-reordering as the legacy Java driver's UUID to BSON encoder. ``CSHARP_LEGACY`` ^^^^^^^^^^^^^^^^^ -.. attention:: This uuid representation should be used when reading UUIDs - written to MongoDB by the C# driver without an explicitly configured UUID - representation. +.. attention:: This UUID representation should be used when reading UUIDs + written to MongoDB by the legacy applications (i.e. applications that don't + use the ``STANDARD`` representation) using the C# driver. The :data:`~bson.binary.UuidRepresentation.CSHARP_LEGACY` representation corresponds to the legacy representation of UUIDs used by the MongoDB C# @@ -422,10 +421,10 @@ it is also configured with the ``STANDARD`` representation. The :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` representation prevents the incorrect interpretation of UUID bytes by stopping short of automatically converting UUID fields in BSON to native UUID types. Loading -a UUID When using this representation returns a :class:`~bson.binary.Binary` +a UUID when using this representation returns a :class:`~bson.binary.Binary` object instead. If required, users can coerce the decoded :class:`~bson.binary.Binary` objects into native UUIDs using the -:meth:`~bson.binary.Binary.as_uuid` method and specifyin the appropriate +:meth:`~bson.binary.Binary.as_uuid` method and specifying the appropriate representation format.
The following example shows what this might look like for a UUID stored by the C# driver:: @@ -436,7 +435,7 @@ what this might look like for a UUID stored by the C# driver:: # Using UuidRepresentation.CSHARP_LEGACY csharp_opts = CodecOptions(uuid_representation=UuidRepresentation.CSHARP_LEGACY) - # Store a C#-formatted UUID + # Store a legacy C#-formatted UUID input_uuid = uuid4() collection = client.testdb.get_collection('test', codec_options=csharp_opts) collection.insert_one({'_id': 'foo', 'uuid': input_uuid}) @@ -446,7 +445,6 @@ what this might look like for a UUID stored by the C# driver:: unspec_collection = client.testdb.get_collection('test', codec_options=unspec_opts) # UUID fields are decoded as Binary when UuidRepresentation.UNSPECIFIED is configured - uuid_1 = uuid4() document = unspec_collection.find_one({'_id': 'foo'}) decoded_field = document['uuid'] assert isinstance(decoded_field, Binary) From 46635e299d5a8e431d7a97f7890fc8bd9148838a Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Tue, 28 Jul 2020 19:18:15 -0700 Subject: [PATCH 7/8] review changes --- bson/binary.py | 3 ++- doc/examples/uuid.rst | 54 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/bson/binary.py b/bson/binary.py index cc66444698..d1f5aae7d2 100644 --- a/bson/binary.py +++ b/bson/binary.py @@ -247,7 +247,8 @@ def from_uuid(cls, uuid, uuid_representation=UuidRepresentation.STANDARD): "UuidRepresentation.UNSPECIFIED. UUIDs can be manually " "converted to bson.Binary instances using " "bson.Binary.from_uuid() or a different UuidRepresentation " - "can be configured.") + "can be configured. 
See the documentation for " + "UuidRepresentation for more information.") subtype = OLD_UUID_SUBTYPE if uuid_representation == UuidRepresentation.PYTHON_LEGACY: diff --git a/doc/examples/uuid.rst b/doc/examples/uuid.rst index 6859f92c17..70652ca198 100644 --- a/doc/examples/uuid.rst +++ b/doc/examples/uuid.rst @@ -128,10 +128,11 @@ avoid the unintuitive behavior) as described in Scenario 2: Round-Tripping UUIDs ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -In this scenario, we see how using a misconfigured +In the following examples, we see how using a misconfigured :class:`~bson.binary.UuidRepresentation` can cause an application -to inadvertently change the :class:`~bson.binary.Binary` subtype of a UUID -field when round-tripping documents containing UUIDs. +to inadvertently change the :class:`~bson.binary.Binary` subtype, and in some +cases, the bytes of the :class:`~bson.binary.Binary` field itself when +round-tripping documents containing UUIDs. Consider the following situation:: @@ -164,11 +165,48 @@ In this example, round-tripping the document using the incorrect ``PYTHON_LEGACY``) changes the :class:`~bson.binary.Binary` subtype as a side-effect. **Note that this can also happen when the situation is reversed - i.e. when the original document is written using ``STANDARD`` representation -and then round-tripped using the ``PYTHON_LEGACY`` representation. Note also -that replacing ``PYTHON_LEGACY`` by ``JAVA_LEGACY`` or ``CSHARP_LEGACY`` in -the above example produces the same behavior.** +and then round-tripped using the ``PYTHON_LEGACY`` representation.** -.. 
note:: Starting in PyMongo 4.0, this issue will be resolved as +In the next example, we see the consequences of incorrectly using a +representation that modifies byte-order (``CSHARP_LEGACY`` or ``JAVA_LEGACY``) +when round-tripping documents:: + + from bson.codec_options import CodecOptions, DEFAULT_CODEC_OPTIONS + from bson.binary import Binary, UuidRepresentation + from uuid import uuid4 + + # Using UuidRepresentation.STANDARD stores a Binary subtype-4 UUID + std_opts = CodecOptions(uuid_representation=UuidRepresentation.STANDARD) + input_uuid = uuid4() + collection = client.testdb.get_collection('test', codec_options=std_opts) + collection.insert_one({'_id': 'baz', 'uuid': input_uuid}) + assert collection.find_one({'uuid': Binary(input_uuid.bytes, 4)})['_id'] == 'baz' + + # Retrieving this document using UuidRepresentation.JAVA_LEGACY returns a native UUID + # without modifying the UUID byte-order + java_opts = CodecOptions(uuid_representation=UuidRepresentation.JAVA_LEGACY) + java_collection = client.testdb.get_collection('test', codec_options=java_opts) + doc = java_collection.find_one({'_id': 'baz'}) + assert doc['uuid'] == input_uuid + + # Round-tripping the retrieved document silently changes the Binary bytes and subtype + java_collection.replace_one({'_id': 'baz'}, doc) + assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)}) is None + round_tripped_doc = collection.find_one({'_id': 'baz'}) + assert round_tripped_doc['uuid'] == Binary(input_uuid.bytes, 3).as_uuid(UuidRepresentation.JAVA_LEGACY) + + +In this case, using the incorrect :class:`~bson.binary.UuidRepresentation` +(``JAVA_LEGACY`` instead of ``STANDARD``) changes the +:class:`~bson.binary.Binary` bytes and subtype as a side-effect. +**Note that this happens when any representation that +manipulates byte-order (``CSHARP_LEGACY`` or ``JAVA_LEGACY``) is incorrectly +used to round-trip UUIDs written with ``STANDARD``. When the situation is +reversed - i.e. 
when the original document is written using ``STANDARD`` +and then round-tripped using ``CSHARP_LEGACY`` or ``JAVA_LEGACY`` - +only the :class:`~bson.binary.Binary` subtype is changed.** + +.. note:: Starting in PyMongo 4.0, these issues will be resolved as the ``STANDARD`` representation will decode Binary subtype 3 fields as :class:`~bson.binary.Binary` objects of subtype 3 (instead of :class:`uuid.UUID`), and each of the ``LEGACY_*`` representations will @@ -460,7 +498,7 @@ and attempting to do so will result in an exception:: unspec_collection.insert_one({'_id': 'bar', 'uuid': uuid4()}) Traceback (most recent call last): ... - ValueError: cannot encode native uuid.UUID with UuidRepresentation.UNSPECIFIED. UUIDs can be manually converted to bson.Binary instances using bson.Binary.from_uuid() or a different UuidRepresentation can be configured. + ValueError: cannot encode native uuid.UUID with UuidRepresentation.UNSPECIFIED. UUIDs can be manually converted to bson.Binary instances using bson.Binary.from_uuid() or a different UuidRepresentation can be configured. See the documentation for UuidRepresentation for more information.
Instead, applications using :data:`~bson.binary.UuidRepresentation.UNSPECIFIED` must explicitly coerce a native UUID using the From 97b45837543444f53f98f390adf6ce6168191e5e Mon Sep 17 00:00:00 2001 From: Prashant Mital Date: Wed, 29 Jul 2020 14:34:27 -0700 Subject: [PATCH 8/8] review changes --- doc/examples/uuid.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/examples/uuid.rst b/doc/examples/uuid.rst index 70652ca198..9b6762dc88 100644 --- a/doc/examples/uuid.rst +++ b/doc/examples/uuid.rst @@ -192,6 +192,7 @@ when round-tripping documents:: # Round-tripping the retrieved document silently changes the Binary bytes and subtype java_collection.replace_one({'_id': 'baz'}, doc) assert collection.find_one({'uuid': Binary(input_uuid.bytes, 3)}) is None + assert collection.find_one({'uuid': Binary(input_uuid.bytes, 4)}) is None round_tripped_doc = collection.find_one({'_id': 'baz'}) assert round_tripped_doc['uuid'] == Binary(input_uuid.bytes, 3).as_uuid(UuidRepresentation.JAVA_LEGACY) @@ -202,8 +203,8 @@ In this case, using the incorrect :class:`~bson.binary.UuidRepresentation` **Note that this happens when any representation that manipulates byte-order (``CSHARP_LEGACY`` or ``JAVA_LEGACY``) is incorrectly used to round-trip UUIDs written with ``STANDARD``. When the situation is -reversed - i.e. when the original document is written using ``STANDARD`` -and then round-tripped using ``CSHARP_LEGACY`` or ``JAVA_LEGACY`` - +reversed - i.e. when the original document is written using ``CSHARP_LEGACY`` +or ``JAVA_LEGACY`` and then round-tripped using ``STANDARD`` - only the :class:`~bson.binary.Binary` subtype is changed.** .. note:: Starting in PyMongo 4.0, these issues will be resolved as