From c7798018c2b050708db24f971be64e0b8e2d0313 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 15 Jan 2025 23:57:17 +0100 Subject: [PATCH] Marshalling: Use `orjson` to improve JSON serialization performance https://github.com/ijl/orjson --- CHANGES.rst | 11 +++++++ setup.py | 1 + src/crate/client/http.py | 60 +++++++++++++++++++++------------------ tests/client/test_http.py | 14 ++++----- 4 files changed, 52 insertions(+), 34 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 2cf9c9dd..598ce6a5 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,6 +5,17 @@ Changes for crate Unreleased ========== +- Switched JSON encoder to use the `orjson`_ library, to improve JSON + marshalling performance. Thanks, @widmogrod. + orjson is fast and in some spots even more correct when compared against + Python's stdlib ``json`` module. Contrary to the stdlib variant, orjson + will serialize to ``bytes`` instead of ``str``. Please also note it + will not deserialize to dataclasses, UUIDs, decimals, etc., or support + ``object_hook``. Within ``crate-python``, it is applied with an encoder + function that adds type support for Python's ``Decimal`` type and + freezegun's ``FakeDateTime`` type. + +.. _orjson: https://github.com/ijl/orjson 2024/11/23 1.0.1 ================ diff --git a/setup.py b/setup.py index 4da8be94..74d21fde 100644 --- a/setup.py +++ b/setup.py @@ -54,6 +54,7 @@ def read(path): packages=find_namespace_packages("src"), package_dir={"": "src"}, install_requires=[ + "orjson<4", "urllib3", "verlib2", ], diff --git a/src/crate/client/http.py b/src/crate/client/http.py index e2c164d9..65c4114c 100644 --- a/src/crate/client/http.py +++ b/src/crate/client/http.py @@ -20,23 +20,21 @@ # software solely pursuant to the terms of the relevant commercial agreement. 
-import calendar import heapq import io -import json import logging import os import re import socket import ssl import threading +import typing as t from base64 import b64encode -from datetime import date, datetime, timezone from decimal import Decimal from time import time from urllib.parse import urlparse -from uuid import UUID +import orjson import urllib3 from urllib3 import connection_from_url from urllib3.connection import HTTPConnection @@ -86,25 +84,33 @@ def super_len(o): return None -class CrateJsonEncoder(json.JSONEncoder): - epoch_aware = datetime(1970, 1, 1, tzinfo=timezone.utc) - epoch_naive = datetime(1970, 1, 1) - - def default(self, o): - if isinstance(o, (Decimal, UUID)): - return str(o) - if isinstance(o, datetime): - if o.tzinfo is not None: - delta = o - self.epoch_aware - else: - delta = o - self.epoch_naive - return int( - delta.microseconds / 1000.0 - + (delta.seconds + delta.days * 24 * 3600) * 1000.0 - ) - if isinstance(o, date): - return calendar.timegm(o.timetuple()) * 1000 - return json.JSONEncoder.default(self, o) +def cratedb_json_encoder(obj: t.Any) -> str: + """ + Encoder function for orjson, with additional type support. + + - Python's `Decimal` type. + - Objects providing `isoformat()`, e.g. freezegun's `FakeDateTime`. + + https://github.com/ijl/orjson#default + """ + if isinstance(obj, Decimal): + return str(obj) + elif hasattr(obj, "isoformat"): + return obj.isoformat() + raise TypeError(f"Type is not JSON serializable: {type(obj).__name__}") + + +def json_dumps(obj: t.Any) -> bytes: + """ + Serialize to JSON format, using `orjson`, with additional type support. 
+ + https://github.com/ijl/orjson + """ + return orjson.dumps( + obj, + default=cratedb_json_encoder, + option=orjson.OPT_SERIALIZE_NUMPY | orjson.OPT_NON_STR_KEYS, + ) class Server: @@ -180,7 +186,7 @@ def close(self): def _json_from_response(response): try: - return json.loads(response.data.decode("utf-8")) + return orjson.loads(response.data) except ValueError as ex: raise ProgrammingError( "Invalid server response of content-type '{}':\n{}".format( @@ -223,7 +229,7 @@ def _raise_for_status_real(response): if response.status == 503: raise ConnectionError(message) if response.headers.get("content-type", "").startswith("application/json"): - data = json.loads(response.data.decode("utf-8")) + data = orjson.loads(response.data) error = data.get("error", {}) error_trace = data.get("error_trace", None) if "results" in data: @@ -323,7 +329,7 @@ def _update_pool_kwargs_for_ssl_minimum_version(server, kwargs): kwargs["ssl_minimum_version"] = ssl.TLSVersion.MINIMUM_SUPPORTED -def _create_sql_payload(stmt, args, bulk_args): +def _create_sql_payload(stmt, args, bulk_args) -> bytes: if not isinstance(stmt, str): raise ValueError("stmt is not a string") if args and bulk_args: @@ -334,7 +340,7 @@ def _create_sql_payload(stmt, args, bulk_args): data["args"] = args if bulk_args: data["bulk_args"] = bulk_args - return json.dumps(data, cls=CrateJsonEncoder) + return json_dumps(data) def _get_socket_opts( diff --git a/tests/client/test_http.py b/tests/client/test_http.py index 610197a8..554fbe5f 100644 --- a/tests/client/test_http.py +++ b/tests/client/test_http.py @@ -49,9 +49,9 @@ ) from crate.client.http import ( Client, - CrateJsonEncoder, _get_socket_opts, _remove_certs_for_non_https, + json_dumps, ) REQUEST = "crate.client.http.Server.request" @@ -318,7 +318,7 @@ def test_datetime_is_converted_to_ts(self, request): # convert string to dict # because the order of the keys isn't deterministic data = json.loads(request.call_args[1]["data"]) - self.assertEqual(data["args"], [1425108700000]) + 
self.assertEqual(data["args"], ["2015-02-28T07:31:40"]) client.close() @patch(REQUEST, autospec=True) @@ -329,7 +329,7 @@ def test_date_is_converted_to_ts(self, request): day = dt.date(2016, 4, 21) client.sql("insert into users (dt) values (?)", (day,)) data = json.loads(request.call_args[1]["data"]) - self.assertEqual(data["args"], [1461196800000]) + self.assertEqual(data["args"], ["2016-04-21"]) client.close() def test_socket_options_contain_keepalive(self): @@ -724,10 +724,10 @@ def test_username(self): class TestCrateJsonEncoder(TestCase): def test_naive_datetime(self): data = dt.datetime.fromisoformat("2023-06-26T09:24:00.123") - result = json.dumps(data, cls=CrateJsonEncoder) - self.assertEqual(result, "1687771440123") + result = json_dumps(data) + self.assertEqual(result, b'"2023-06-26T09:24:00.123000"') def test_aware_datetime(self): data = dt.datetime.fromisoformat("2023-06-26T09:24:00.123+02:00") - result = json.dumps(data, cls=CrateJsonEncoder) - self.assertEqual(result, "1687764240123") + result = json_dumps(data) + self.assertEqual(result, b'"2023-06-26T09:24:00.123000+02:00"')