From d87e0f3b5685306cb00d913c36a27c27b8baf48f Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 16 Jan 2024 21:09:53 +0100 Subject: [PATCH 1/5] Cleanup test_serializer.py --- test_elasticsearch/test_serializer.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/test_elasticsearch/test_serializer.py b/test_elasticsearch/test_serializer.py index efd56d28f..9dedf31d2 100644 --- a/test_elasticsearch/test_serializer.py +++ b/test_elasticsearch/test_serializer.py @@ -36,7 +36,7 @@ from elasticsearch.serializer import JSONSerializer, TextSerializer requires_numpy_and_pandas = pytest.mark.skipif( - np is None or pd is None, reason="Test requires numpy or pandas to be available" + np is None or pd is None, reason="Test requires numpy and pandas to be available" ) @@ -46,11 +46,8 @@ def test_datetime_serialization(): ) +@requires_numpy_and_pandas def test_decimal_serialization(): - requires_numpy_and_pandas() - - if sys.version_info[:2] == (2, 6): - pytest.skip("Float rounding is broken in 2.6.") assert b'{"d":3.8}' == JSONSerializer().dumps({"d": Decimal("3.8")}) From 48682701d7c2bb5d1b53d08818cda0276b1b7c25 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 26 Mar 2024 16:54:30 +0400 Subject: [PATCH 2/5] Add optional orjson serializer support --- dev-requirements.txt | 1 + docs/guide/configuration.asciidoc | 14 +++++ elasticsearch/__init__.py | 6 ++ elasticsearch/serializer.py | 11 ++++ test_elasticsearch/test_serializer.py | 82 ++++++++++++++------------- 5 files changed, 75 insertions(+), 39 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index b5b92c283..77ca77ced 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -18,6 +18,7 @@ nox numpy pandas +orjson # Testing the 'search_mvt' API response mapbox-vector-tile diff --git a/docs/guide/configuration.asciidoc b/docs/guide/configuration.asciidoc index cb467dc0d..3215b35e3 100644 --- a/docs/guide/configuration.asciidoc +++ 
b/docs/guide/configuration.asciidoc @@ -381,6 +381,20 @@ es = Elasticsearch( ) ------------------------------------ +If the `orjson` package is installed, you can use the faster ``OrjsonSerializer`` for the default mimetype (``application/json``): + +[source,python] +------------------------------------ +from elasticsearch import Elasticsearch, OrjsonSerializer + +es = Elasticsearch( + ..., + serializer=OrjsonSerializer() +) +------------------------------------ + +It is particularly beneficial to serialize vectors. This will be the default in a future release. + [discrete] [[nodes]] diff --git a/elasticsearch/__init__.py b/elasticsearch/__init__.py index 8b38b3934..0a85f5fb8 100644 --- a/elasticsearch/__init__.py +++ b/elasticsearch/__init__.py @@ -62,6 +62,10 @@ UnsupportedProductError, ) from .serializer import JSONSerializer, JsonSerializer +try: + from .serializer import OrjsonSerializer +except ModuleNotFoundError: + OrjsonSerializer = None # Only raise one warning per deprecation message so as not # to spam up the user if the same action is done multiple times. 
@@ -86,6 +90,8 @@ "UnsupportedProductError", "ElasticsearchWarning", ] +if OrjsonSerializer is not None: + __all__.append("OrjsonSerializer") fixup_module_metadata(__name__, globals()) del fixup_module_metadata diff --git a/elasticsearch/serializer.py b/elasticsearch/serializer.py index 758c6b730..baf4e1581 100644 --- a/elasticsearch/serializer.py +++ b/elasticsearch/serializer.py @@ -41,6 +41,12 @@ "MapboxVectorTileSerializer", ] +try: + from elastic_transport import OrjsonSerializer as _OrjsonSerializer + __all__.append("OrjsonSerializer") +except ModuleNotFoundError: + _OrjsonSerializer = None + class JsonSerializer(_JsonSerializer): mimetype: ClassVar[str] = "application/json" @@ -73,6 +79,11 @@ def default(self, data: Any) -> Any: raise TypeError(f"Unable to serialize {data!r} (type: {type(data)})") +if _OrjsonSerializer is not None: + class OrjsonSerializer(JsonSerializer, _OrjsonSerializer): + def default(self, data: Any) -> Any: + return JsonSerializer.default(self, data) + class NdjsonSerializer(JsonSerializer, _NdjsonSerializer): mimetype: ClassVar[str] = "application/x-ndjson" diff --git a/test_elasticsearch/test_serializer.py b/test_elasticsearch/test_serializer.py index 9dedf31d2..c94c62677 100644 --- a/test_elasticsearch/test_serializer.py +++ b/test_elasticsearch/test_serializer.py @@ -33,38 +33,41 @@ from elasticsearch import Elasticsearch from elasticsearch.exceptions import SerializationError -from elasticsearch.serializer import JSONSerializer, TextSerializer +from elasticsearch.serializer import JSONSerializer, OrjsonSerializer, TextSerializer requires_numpy_and_pandas = pytest.mark.skipif( np is None or pd is None, reason="Test requires numpy and pandas to be available" ) +@pytest.fixture(params=[JSONSerializer, OrjsonSerializer]) +def json_serializer(request: pytest.FixtureRequest): + yield request.param() -def test_datetime_serialization(): - assert b'{"d":"2010-10-01T02:30:00"}' == JSONSerializer().dumps( + +def 
test_datetime_serialization(json_serializer): + assert b'{"d":"2010-10-01T02:30:00"}' == json_serializer.dumps( {"d": datetime(2010, 10, 1, 2, 30)} ) @requires_numpy_and_pandas -def test_decimal_serialization(): - assert b'{"d":3.8}' == JSONSerializer().dumps({"d": Decimal("3.8")}) +def test_decimal_serialization(json_serializer): + assert b'{"d":3.8}' == json_serializer.dumps({"d": Decimal("3.8")}) -def test_uuid_serialization(): - assert b'{"d":"00000000-0000-0000-0000-000000000003"}' == JSONSerializer().dumps( +def test_uuid_serialization(json_serializer): + assert b'{"d":"00000000-0000-0000-0000-000000000003"}' == json_serializer.dumps( {"d": uuid.UUID("00000000-0000-0000-0000-000000000003")} ) @requires_numpy_and_pandas -def test_serializes_numpy_bool(): - assert b'{"d":true}' == JSONSerializer().dumps({"d": np.bool_(True)}) +def test_serializes_numpy_bool(json_serializer): + assert b'{"d":true}' == json_serializer.dumps({"d": np.bool_(True)}) @requires_numpy_and_pandas -def test_serializes_numpy_integers(): - ser = JSONSerializer() +def test_serializes_numpy_integers(json_serializer): for np_type in ( np.int_, np.int8, @@ -72,7 +75,7 @@ def test_serializes_numpy_integers(): np.int32, np.int64, ): - assert ser.dumps({"d": np_type(-1)}) == b'{"d":-1}' + assert json_serializer.dumps({"d": np_type(-1)}) == b'{"d":-1}' for np_type in ( np.uint8, @@ -80,91 +83,92 @@ def test_serializes_numpy_integers(): np.uint32, np.uint64, ): - assert ser.dumps({"d": np_type(1)}) == b'{"d":1}' + assert json_serializer.dumps({"d": np_type(1)}) == b'{"d":1}' @requires_numpy_and_pandas -def test_serializes_numpy_floats(): - ser = JSONSerializer() +def test_serializes_numpy_floats(json_serializer): for np_type in ( np.float_, np.float32, np.float64, ): - assert re.search(rb'^{"d":1\.2[\d]*}$', ser.dumps({"d": np_type(1.2)})) + assert re.search(rb'^{"d":1\.2[\d]*}$', json_serializer.dumps({"d": np_type(1.2)})) @requires_numpy_and_pandas -def test_serializes_numpy_datetime(): - assert 
b'{"d":"2010-10-01T02:30:00"}' == JSONSerializer().dumps( +def test_serializes_numpy_datetime(json_serializer): + assert b'{"d":"2010-10-01T02:30:00"}' == json_serializer.dumps( {"d": np.datetime64("2010-10-01T02:30:00")} ) @requires_numpy_and_pandas -def test_serializes_numpy_ndarray(): - assert b'{"d":[0,0,0,0,0]}' == JSONSerializer().dumps( +def test_serializes_numpy_ndarray(json_serializer): + assert b'{"d":[0,0,0,0,0]}' == json_serializer.dumps( {"d": np.zeros((5,), dtype=np.uint8)} ) # This isn't useful for Elasticsearch, just want to make sure it works. - assert b'{"d":[[0,0],[0,0]]}' == JSONSerializer().dumps( + assert b'{"d":[[0,0],[0,0]]}' == json_serializer.dumps( {"d": np.zeros((2, 2), dtype=np.uint8)} ) @requires_numpy_and_pandas def test_serializes_numpy_nan_to_nan(): - assert b'{"d":NaN}' == JSONSerializer().dumps({"d": np.nan}) + assert b'{"d":NaN}' == JSONSerializer().dumps({"d": float("NaN")}) + # NaN is invalid JSON, and orjson silently converts it to null + assert b'{"d":null}' == OrjsonSerializer().dumps({"d": float("NaN")}) @requires_numpy_and_pandas -def test_serializes_pandas_timestamp(): - assert b'{"d":"2010-10-01T02:30:00"}' == JSONSerializer().dumps( +def test_serializes_pandas_timestamp(json_serializer): + assert b'{"d":"2010-10-01T02:30:00"}' == json_serializer.dumps( {"d": pd.Timestamp("2010-10-01T02:30:00")} ) @requires_numpy_and_pandas -def test_serializes_pandas_series(): - assert b'{"d":["a","b","c","d"]}' == JSONSerializer().dumps( +def test_serializes_pandas_series(json_serializer): + assert b'{"d":["a","b","c","d"]}' == json_serializer.dumps( {"d": pd.Series(["a", "b", "c", "d"])} ) @requires_numpy_and_pandas @pytest.mark.skipif(not hasattr(pd, "NA"), reason="pandas.NA is required") -def test_serializes_pandas_na(): - assert b'{"d":null}' == JSONSerializer().dumps({"d": pd.NA}) +def test_serializes_pandas_na(json_serializer): + assert b'{"d":null}' == json_serializer.dumps({"d": pd.NA}) @requires_numpy_and_pandas 
@pytest.mark.skipif(not hasattr(pd, "NaT"), reason="pandas.NaT required") -def test_raises_serialization_error_pandas_nat(): +def test_raises_serialization_error_pandas_nat(json_serializer): with pytest.raises(SerializationError): - JSONSerializer().dumps({"d": pd.NaT}) + json_serializer.dumps({"d": pd.NaT}) @requires_numpy_and_pandas -def test_serializes_pandas_category(): +def test_serializes_pandas_category(json_serializer): cat = pd.Categorical(["a", "c", "b", "a"], categories=["a", "b", "c"]) - assert b'{"d":["a","c","b","a"]}' == JSONSerializer().dumps({"d": cat}) + assert b'{"d":["a","c","b","a"]}' == json_serializer.dumps({"d": cat}) cat = pd.Categorical([1, 2, 3], categories=[1, 2, 3]) - assert b'{"d":[1,2,3]}' == JSONSerializer().dumps({"d": cat}) + assert b'{"d":[1,2,3]}' == json_serializer.dumps({"d": cat}) -def test_json_raises_serialization_error_on_dump_error(): +def test_json_raises_serialization_error_on_dump_error(json_serializer): with pytest.raises(SerializationError): - JSONSerializer().dumps(object()) + json_serializer.dumps(object()) -def test_raises_serialization_error_on_load_error(): +def test_raises_serialization_error_on_load_error(json_serializer): with pytest.raises(SerializationError): - JSONSerializer().loads(object()) + json_serializer.loads(object()) with pytest.raises(SerializationError): - JSONSerializer().loads("") + json_serializer.loads("") with pytest.raises(SerializationError): - JSONSerializer().loads("{{") + json_serializer.loads("{{") def test_strings_are_left_untouched(): From 72ea9951ec4caa935a159b3fe567df8accf6f211 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 26 Mar 2024 17:13:57 +0400 Subject: [PATCH 3/5] Fix lint --- elasticsearch/__init__.py | 3 ++- elasticsearch/serializer.py | 5 ++++- noxfile.py | 2 +- setup.py | 1 + test_elasticsearch/test_serializer.py | 6 ++++-- 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/elasticsearch/__init__.py b/elasticsearch/__init__.py index 
0a85f5fb8..f4ac43ca7 100644 --- a/elasticsearch/__init__.py +++ b/elasticsearch/__init__.py @@ -62,10 +62,11 @@ UnsupportedProductError, ) from .serializer import JSONSerializer, JsonSerializer + try: from .serializer import OrjsonSerializer -except ModuleNotFoundError: - OrjsonSerializer = None +except ImportError: + OrjsonSerializer = None # type: ignore[assignment,misc] # Only raise one warning per deprecation message so as not # to spam up the user if the same action is done multiple times. diff --git a/elasticsearch/serializer.py b/elasticsearch/serializer.py index baf4e1581..4e6122fff 100644 --- a/elasticsearch/serializer.py +++ b/elasticsearch/serializer.py @@ -43,9 +43,10 @@ try: from elastic_transport import OrjsonSerializer as _OrjsonSerializer + __all__.append("OrjsonSerializer") -except ModuleNotFoundError: - _OrjsonSerializer = None +except ImportError: + _OrjsonSerializer = None # type: ignore[assignment,misc] class JsonSerializer(_JsonSerializer): @@ -80,10 +81,12 @@ def default(self, data: Any) -> Any: if _OrjsonSerializer is not None: + class OrjsonSerializer(JsonSerializer, _OrjsonSerializer): def default(self, data: Any) -> Any: return JsonSerializer.default(self, data) + class NdjsonSerializer(JsonSerializer, _NdjsonSerializer): mimetype: ClassVar[str] = "application/x-ndjson" diff --git a/noxfile.py b/noxfile.py index b166bf79a..54d8adcff 100644 --- a/noxfile.py +++ b/noxfile.py @@ -92,7 +92,7 @@ def lint(session): session.run("python", "utils/license-headers.py", "check", *SOURCE_FILES) # Workaround to make '-r' to still work despite uninstalling aiohttp below. - session.install(".[async,requests]", env=INSTALL_ENV) + session.install(".[async,requests,orjson]", env=INSTALL_ENV) # Run mypy on the package and then the type examples separately for # the two different mypy use-cases, ourselves and our users. 
diff --git a/setup.py b/setup.py index 7a146da9b..6104768d5 100644 --- a/setup.py +++ b/setup.py @@ -97,5 +97,6 @@ extras_require={ "requests": ["requests>=2.4.0, <3.0.0"], "async": async_requires, + "orjson": ["orjson>=3"], }, ) diff --git a/test_elasticsearch/test_serializer.py b/test_elasticsearch/test_serializer.py index c94c62677..9dc5f31e9 100644 --- a/test_elasticsearch/test_serializer.py +++ b/test_elasticsearch/test_serializer.py @@ -16,7 +16,6 @@ # specific language governing permissions and limitations # under the License. -import sys import uuid from datetime import datetime from decimal import Decimal @@ -39,6 +38,7 @@ np is None or pd is None, reason="Test requires numpy and pandas to be available" ) + @pytest.fixture(params=[JSONSerializer, OrjsonSerializer]) def json_serializer(request: pytest.FixtureRequest): yield request.param() @@ -93,7 +93,9 @@ def test_serializes_numpy_floats(json_serializer): np.float32, np.float64, ): - assert re.search(rb'^{"d":1\.2[\d]*}$', json_serializer.dumps({"d": np_type(1.2)})) + assert re.search( + rb'^{"d":1\.2[\d]*}$', json_serializer.dumps({"d": np_type(1.2)}) + ) @requires_numpy_and_pandas From 56236008a1db5d3336579719c0afe96c2a6d296d Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 26 Mar 2024 17:14:06 +0400 Subject: [PATCH 4/5] Improve docs --- docs/guide/configuration.asciidoc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/guide/configuration.asciidoc b/docs/guide/configuration.asciidoc index 3215b35e3..06964dac9 100644 --- a/docs/guide/configuration.asciidoc +++ b/docs/guide/configuration.asciidoc @@ -393,8 +393,12 @@ es = Elasticsearch( ) ------------------------------------ -It is particularly beneficial to serialize vectors. This will be the default in a future release. +orjson is particularly fast when serializing vectors as it has native numpy support. This will be the default in a future release. 
Note that you can install orjson with the `orjson` extra: +[source,sh] +-------------------------------------------- +$ python -m pip install elasticsearch[orjson] +-------------------------------------------- [discrete] [[nodes]] From b3ffd73ca0a912d022dfd1994f4a5f7447a64774 Mon Sep 17 00:00:00 2001 From: Quentin Pradet Date: Tue, 26 Mar 2024 17:40:21 +0400 Subject: [PATCH 5/5] Add orjson extra in tests --- noxfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/noxfile.py b/noxfile.py index 54d8adcff..bf88249bd 100644 --- a/noxfile.py +++ b/noxfile.py @@ -48,7 +48,7 @@ def pytest_argv(): @nox.session(python=["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"]) def test(session): - session.install(".[async,requests]", env=INSTALL_ENV, silent=False) + session.install(".[async,requests,orjson]", env=INSTALL_ENV, silent=False) session.install("-r", "dev-requirements.txt", silent=False) session.run(*pytest_argv())