From c6386e8f46ce5d7bbc328ec29c99ea4e25b55f50 Mon Sep 17 00:00:00 2001 From: Chris Cho Date: Wed, 26 Jan 2022 15:34:27 -0500 Subject: [PATCH 1/4] DOCSP-20056: UTF-8 validation options --- source/fundamentals.txt | 1 + source/fundamentals/utf8-validation.txt | 116 ++++++++++++++++++++++ source/includes/fundamentals-sections.rst | 2 + source/whats-new.txt | 2 +- 4 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 source/fundamentals/utf8-validation.txt diff --git a/source/fundamentals.txt b/source/fundamentals.txt index da947880b..03325856e 100644 --- a/source/fundamentals.txt +++ b/source/fundamentals.txt @@ -19,5 +19,6 @@ Fundamentals /fundamentals/gridfs /fundamentals/time-series /fundamentals/typescript + /fundamentals/utf8-validation .. include:: /includes/fundamentals-sections.rst diff --git a/source/fundamentals/utf8-validation.txt b/source/fundamentals/utf8-validation.txt new file mode 100644 index 000000000..411006b01 --- /dev/null +++ b/source/fundamentals/utf8-validation.txt @@ -0,0 +1,116 @@ +.. _nodejs-utf-8-validation: + +================ +UTF-8 Validation +================ + +.. default-domain:: mongodb + +.. contents:: On this page + :local: + :backlinks: none + :depth: 2 + :class: singlecol + +Overview +-------- + +In this guide, you can learn how to enable or disable the {+driver-short+}'s +UTF-8 validation feature. By default, your driver checks documents for any +characters that are not encoded in a valid UTF-8 format when it transfers data +between your application and MongoDB. + +**UTF-8** is a character encoding specification that is compatible with +most operating systems, applications, and language character sets. See the +Wikipedia entry on :wikipedia:`UTF-8 ` for more information on this +standard. + +If you enable validation, the driver substitutes invalid UTF-8 characters +with valid UTF-8 ones when transferring data between your application and +MongoDB. 
The validation adds processing overhead since it needs to verify the +data and to make substitutions when necessary. + +If you disable validation, your application can avoid the processing +overhead caused by validation, but cannot guarantee consistent presentation of +invalid UTF-8 data. + +.. note:: + + The {+driver-short+} automatically substitutes invalid UTF-8 characters + in your application before sending them to MongoDB regardless of whether + you enable validation. You can only retrieve invalid UTF-8 characters + from MongoDB. + +.. _nodejs-specify-utf-8-validation: + +Specify the UTF-8 Validation Setting +------------------------------------ + +You can specify whether the driver should perform UTF-8 validation by +defining the ``enableUtf8Validation`` setting in the options parameter +when you create a client, reference a database or collection, or call a +CRUD operation. By default, the {+driver-short+} enables UTF-8 validation. + +.. code-block:: javascript + + // disable UTF-8 validation on the client + new MongoClient('<connection uri>', { enableUtf8Validation: false }); + + // disable UTF-8 validation on the database + client.db('<database name>', { enableUtf8Validation: false }); + + // disable UTF-8 validation on the collection + db.collection('<collection name>', { enableUtf8Validation: false }); + + // disable UTF-8 validation on a specific operation call + await collection.findOne({ title: 'Cam Jansen'}, { enableUtf8Validation: false }); + +If your application reads invalid UTF-8 from MongoDB while the +``enableUtf8Validation`` option is enabled, the driver throws a ``BSONError`` that +contains the following message: + +.. code-block:: + + Invalid UTF-8 string in BSON document + +.. _nodejs-utf-8-validation-scope: + +Set the Validation Scope +~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``enableUtf8Validation`` setting automatically applies to the scope of the +object instance on which you included it, and any other objects created by +calls on that instance. 
+ +For example, if you include the option on the call to instantiate a database +object, any collection instance you construct from that object inherits +the setting. Any operations you call on that collection instance also +inherit the setting. + +.. code-block:: javascript + + const database = client.db('books', { enableUtf8Validation: false }); + + // The collection inherits the UTF-8 validation disabled setting from the database + const collection = database.collection('mystery'); + + // CRUD operation runs with UTF-8 validation disabled + await collection.findOne({ title: 'Encyclopedia Brown' }); + +You can override the setting at any level of scope by including it when +constructing the object instance or when calling an operation. + +For example, if you disable validation on the collection object, you can +override the setting in individual CRUD operation calls on that +collection. + +.. code-block:: javascript + + const collection = database.collection('mystery', { enableUtf8Validation: false }); + + // CRUD operation runs with UTF-8 validation enabled + await collection.findOne({ title: 'Trixie Belden' }, { enableUtf8Validation: true }); + + // CRUD operation runs with UTF-8 validation disabled + await collection.findOne({ title: 'Enola Holmes' }); + diff --git a/source/includes/fundamentals-sections.rst b/source/includes/fundamentals-sections.rst index 5cd61e690..07257825a 100644 --- a/source/includes/fundamentals-sections.rst +++ b/source/includes/fundamentals-sections.rst @@ -11,3 +11,5 @@ Fundamentals section: - :doc:`Log Events in the Driver </fundamentals/logging>` - :doc:`Monitor Driver Events </fundamentals/monitoring>` - :doc:`Store and Retrieve Large Files in MongoDB </fundamentals/gridfs>` +- :doc:`Use TypeScript Types with the Driver </fundamentals/typescript>` +- :doc:`Specify UTF-8 Validation Settings </fundamentals/utf8-validation>` diff --git a/source/whats-new.txt b/source/whats-new.txt index 5d17c1cd4..4da495071 100644 --- a/source/whats-new.txt +++ b/source/whats-new.txt @@ -26,7 +26,7 @@ What's New in 4.3 New features of the 4.3 Node.js driver release include: - 
SOCKS5 support -- Option to disable UTF-8 validation +- Option to :ref:`disable UTF-8 validation <nodejs-utf-8-validation>` - Type inference for nested documents .. _version-4.2: From 498e0cc9c13bd4e1f18f772e6f620448cab20e35 Mon Sep 17 00:00:00 2001 From: Chris Cho Date: Wed, 26 Jan 2022 15:45:19 -0500 Subject: [PATCH 2/4] rearrange order of intro sentences --- source/fundamentals/utf8-validation.txt | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/source/fundamentals/utf8-validation.txt b/source/fundamentals/utf8-validation.txt index 411006b01..d1a53ba7e 100644 --- a/source/fundamentals/utf8-validation.txt +++ b/source/fundamentals/utf8-validation.txt @@ -16,14 +16,11 @@ Overview -------- In this guide, you can learn how to enable or disable the {+driver-short+}'s -UTF-8 validation feature. By default, your driver checks documents for any -characters that are not encoded in a valid UTF-8 format when it transfers data -between your application and MongoDB. - -**UTF-8** is a character encoding specification that is compatible with -most operating systems, applications, and language character sets. See the -Wikipedia entry on :wikipedia:`UTF-8 ` for more information on this -standard. +**UTF-8** validation feature. **UTF-8** is a character encoding specification +that ensures compatibility and consistent presentation across most operating +systems, applications, and language character sets. By default, the driver +checks documents for any characters that are not encoded in a valid UTF-8 +format when it transfers data between your application and MongoDB. If you enable validation, the driver substitutes invalid UTF-8 characters with valid UTF-8 ones when transferring data between your application and MongoDB. The validation adds processing overhead since it needs to verify the data and to make substitutions when necessary. If you disable validation, your application can avoid the processing overhead caused by validation, but cannot guarantee consistent presentation of invalid UTF-8 data. 
+Read the sections below to learn how to set UTF-8 validation using the +{+driver-short+}. + .. note:: The {+driver-short+} automatically substitutes invalid UTF-8 characters From 9f56e3aeeabd0dcae771a52bf53f8846b1c76935 Mon Sep 17 00:00:00 2001 From: Chris Cho Date: Wed, 26 Jan 2022 19:57:16 -0500 Subject: [PATCH 3/4] PR review fixes --- source/fundamentals/utf8-validation.txt | 41 ++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/source/fundamentals/utf8-validation.txt b/source/fundamentals/utf8-validation.txt index d1a53ba7e..451db450f 100644 --- a/source/fundamentals/utf8-validation.txt +++ b/source/fundamentals/utf8-validation.txt @@ -16,30 +16,32 @@ Overview -------- In this guide, you can learn how to enable or disable the {+driver-short+}'s -**UTF-8** validation feature. **UTF-8** is a character encoding specification +**UTF-8** validation feature. UTF-8 is a character encoding specification that ensures compatibility and consistent presentation across most operating -systems, applications, and language character sets. By default, the driver -checks documents for any characters that are not encoded in a valid UTF-8 -format when it transfers data between your application and MongoDB. +systems, applications, and language character sets. -If you enable validation, the driver substitutes invalid UTF-8 characters -with valid UTF-8 ones when transferring data between your application and -MongoDB. The validation adds processing overhead since it needs to verify the -data and to make substitutions when necessary. +If you *enable* validation, the driver throws an error when it attempts to +convert data that contains invalid UTF-8 characters. The validation adds +processing overhead since it needs to check the data. -If you disable validation, your application can avoid the processing -overhead caused by validation, but cannot guarantee consistent presentation of -invalid UTF-8 data. 
+If you *disable* validation, your application avoids the validation processing +overhead, but cannot guarantee consistent presentation of invalid UTF-8 data. -Read the sections below to learn how to set UTF-8 validation using the -{+driver-short+}. +The driver enables UTF-8 validation by default. It checks documents for any +characters that are not encoded in a valid UTF-8 format when it transfers data +between your application. If it encounters invalid UTF-8 characters, it +throws an error. .. note:: - The {+driver-short+} automatically substitutes invalid UTF-8 characters - in your application before sending them to MongoDB regardless of whether - you enable validation. You can only retrieve invalid UTF-8 characters - from MongoDB. + The current version of the {+driver-short+} automatically substitutes + invalid UTF-8 characters with alternate valid UTF-8 ones prior to + validation when you send data to MongoDB. Therefore, the validation + only throws an error when the setting is enabled and the driver + receives invalid UTF-8 document data from MongoDB. + +Read the sections below to learn how to set UTF-8 validation using the +{+driver-short+}. .. _nodejs-specify-utf-8-validation: @@ -49,7 +51,10 @@ Specify the UTF-8 Validation Setting You can specify whether the driver should perform UTF-8 validation by defining the ``enableUtf8Validation`` setting in the options parameter when you create a client, reference a database or collection, or call a -CRUD operation. By default, the {+driver-short+} enables UTF-8 validation. +CRUD operation. If you omit the setting, the driver enables UTF-8 validation. + +See the following for code examples that demonstrate how to disable UTF-8 +validation on the client, database, collection, or CRUD operation: .. 
code-block:: javascript From 70acb931e71dfdf3d389facbff8727f283a780c3 Mon Sep 17 00:00:00 2001 From: Chris Cho Date: Thu, 27 Jan 2022 11:27:11 -0500 Subject: [PATCH 4/4] PR review fixes --- source/fundamentals/utf8-validation.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/source/fundamentals/utf8-validation.txt b/source/fundamentals/utf8-validation.txt index 451db450f..361e2e8c8 100644 --- a/source/fundamentals/utf8-validation.txt +++ b/source/fundamentals/utf8-validation.txt @@ -29,8 +29,7 @@ overhead, but cannot guarantee consistent presentation of invalid UTF-8 data. The driver enables UTF-8 validation by default. It checks documents for any characters that are not encoded in a valid UTF-8 format when it transfers data -between your application. If it encounters invalid UTF-8 characters, it -throws an error. +between your application and MongoDB. .. note::