From f832852198806d9e79f9607cae8608c5b31bfaf3 Mon Sep 17 00:00:00 2001 From: Ray Walker Date: Fri, 19 Jun 2026 23:33:34 +1000 Subject: [PATCH 1/2] feat: make orjson an optional dependency (cachekit[json]) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit orjson backed only the non-default OrjsonSerializer, yet was a hard, eagerly-imported dependency — unlike its structural twin ArrowSerializer, which is the optional + lazy `cachekit[data]`. So `import cachekit` pulled orjson even when only the default (MessagePack) serializer was ever used. orjson now mirrors Arrow exactly: - Moved from core dependencies to a new `json` optional extra (cachekit[json]); kept in the dev dependency-group so tests/doctests/markdown-docs still resolve it. - Lazy-loaded: SERIALIZER_REGISTRY["orjson"] is None; get_serializer("orjson") and `from cachekit.serializers import OrjsonSerializer` resolve it on demand via _get_orjson_serializer() + module __getattr__. - orjson_serializer.py fails fast with a helpful install hint when orjson is absent (mirrors arrow_serializer.py's pyarrow guard). - get_serializer_info()'s optional-dep ImportError branch generalized (was hardcoded to ArrowSerializer). `import cachekit` no longer imports orjson — verified by a subprocess regression guard; the default/std/auto path never needed it. Docs updated to note the [json] extra (mirroring the [data] treatment). BREAKING CHANGE: orjson is no longer installed by `pip install cachekit`. To use the orjson serializer (serializer="orjson" or OrjsonSerializer), install `cachekit[json]`. Without it, get_serializer("orjson") raises an ImportError with an actionable install hint.
--- DEVELOPMENT.md | 2 +- README.md | 2 +- docs/api-reference.md | 2 +- docs/getting-started.md | 2 +- docs/serializers/README.md | 2 + docs/serializers/orjson.md | 10 +++ pyproject.toml | 8 ++- src/cachekit/serializers/__init__.py | 36 ++++++++-- src/cachekit/serializers/orjson_serializer.py | 7 +- tests/unit/test_orjson_serializer.py | 8 ++- tests/unit/test_serializer_lazy_loading.py | 71 +++++++++++++++++++ uv.lock | 10 ++- 12 files changed, 143 insertions(+), 17 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 146b83f..cbee65d 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -41,7 +41,7 @@ cachekit prioritizes explicit, predictable behavior. | **StandardSerializer** | Language-agnostic MessagePack | Default, works everywhere | | **AutoSerializer** | Python-optimized (NumPy, pandas, datetime) | Named "Auto" to be transparent | | **ArrowSerializer** | Apache Arrow for DataFrames | 60%+ faster for pandas | -| **OrjsonSerializer** | JSON via orjson | JSON compatibility | +| **OrjsonSerializer** | JSON via orjson — requires `pip install 'cachekit[json]'` | JSON compatibility | > [!IMPORTANT] > **NO auto-detection of business logic**: Pydantic models, SQLAlchemy ORM objects, and custom classes require explicit conversion to dict. 
diff --git a/README.md b/README.md index ee6e51c..d9a9c83 100644 --- a/README.md +++ b/README.md @@ -249,7 +249,7 @@ def test_cached_function(): | Serializer | Speed | Use Case | |:-----------|:-----:|:---------| | **StandardSerializer** | ★★★★☆ | General Python types, NumPy, Pandas | -| **OrjsonSerializer** | ★★★★★ | JSON APIs (2-5x faster than stdlib) | +| **OrjsonSerializer** | ★★★★★ | JSON APIs (2-5x faster than stdlib) — requires `cachekit[json]` | | **ArrowSerializer** | ★★★★★ | Large DataFrames (6-23x faster for 10K+ rows) | | **EncryptionWrapper** | ★★★★☆ | Wraps any serializer with AES-256-GCM | diff --git a/docs/api-reference.md b/docs/api-reference.md index 3365a30..2f82120 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -489,7 +489,7 @@ result = process_numpy_data() ### Using OrjsonSerializer (JSON-Optimized) -Use OrjsonSerializer for JSON-heavy workloads and APIs: +Use OrjsonSerializer for JSON-heavy workloads and APIs. Requires the `[json]` extra: `pip install 'cachekit[json]'` (or `uv add 'cachekit[json]'`). ```python notest from cachekit import cache diff --git a/docs/getting-started.md b/docs/getting-started.md index 0ebfb25..b5ce0bb 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -126,7 +126,7 @@ def analyze_data(): import numpy as np return np.array([1, 2, 3, 4, 5]) -# For JSON APIs: OrjsonSerializer (2-5x faster) +# For JSON APIs: OrjsonSerializer (2-5x faster) — requires: pip install 'cachekit[json]' @cache(ttl=900, serializer="orjson") def get_api_response(): return {"status": "ok", "data": "response"} diff --git a/docs/serializers/README.md b/docs/serializers/README.md index 7243516..ad5a683 100644 --- a/docs/serializers/README.md +++ b/docs/serializers/README.md @@ -19,6 +19,8 @@ Each serializer integrates transparently with the `@cache` decorator. 
You can co | [EncryptionWrapper](encryption.md) | Adds ~3-5 μs | Zero-knowledge caching, GDPR/HIPAA/PCI-DSS compliance | | [Custom Serializers](custom.md) | Varies | Specialized data types not covered above | +> **OrjsonSerializer** requires the `[json]` extra: `pip install 'cachekit[json]'` (or `uv add 'cachekit[json]'`). + For caching Pydantic models, see [Caching Pydantic Models](pydantic.md). ## Decision Matrix diff --git a/docs/serializers/orjson.md b/docs/serializers/orjson.md index 0f9135b..67a927c 100644 --- a/docs/serializers/orjson.md +++ b/docs/serializers/orjson.md @@ -4,6 +4,16 @@ **JSON-optimized serializer** — Fast JSON serialization powered by Rust (orjson library). Ideal for JSON-heavy workloads and API response caching. +**Requires the `[json]` extra** — orjson is an optional dependency: + +```bash +pip install 'cachekit[json]' +# or +uv add 'cachekit[json]' +``` + +Without orjson installed, `get_serializer("orjson")` raises `ImportError: orjson is not installed. OrjsonSerializer requires the [json] extra: pip install 'cachekit[json]'`. + ## Overview **Best for:** diff --git a/pyproject.toml b/pyproject.toml index 7cb8cc8..db0d30b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,6 @@ dependencies = [ # Serialization and hashing "blake3>=1.0.5", "msgpack>=1.2.1", - "orjson>=3.9.0", "xxhash>=3.5.0", # HTTP client for SaaS backend (cachekit.io) "httpx[http2]>=0.28.1", @@ -78,6 +77,10 @@ data = [ "pandas>=1.3.0", "pyarrow>=21.0.0", ] +json = [ + # OrjsonSerializer (serializer="orjson") — fast JSON via orjson + "orjson>=3.9.0", +] memcached = [ "pymemcache>=4.0.0", ] @@ -228,6 +231,9 @@ dev = [ "numpy>=2.0.2", "pandas>=1.3.0", "pyarrow>=21.0.0", + # OrjsonSerializer support — now the [json] optional extra; kept here so the + # orjson tests, doctests, and markdown-docs still resolve it in dev/CI. 
+ "orjson>=3.9.0", "pytest-xdist>=3.8.0", "time-machine>=2.19.0", ] diff --git a/src/cachekit/serializers/__init__.py b/src/cachekit/serializers/__init__.py index 0312b09..d6f8c30 100644 --- a/src/cachekit/serializers/__init__.py +++ b/src/cachekit/serializers/__init__.py @@ -15,11 +15,11 @@ SerializerProtocol, ) from .encryption_wrapper import EncryptionWrapper -from .orjson_serializer import OrjsonSerializer from .standard_serializer import StandardSerializer if TYPE_CHECKING: from .arrow_serializer import ArrowSerializer + from .orjson_serializer import OrjsonSerializer logger = logging.getLogger(__name__) @@ -37,6 +37,20 @@ def _get_arrow_serializer() -> type: return _ArrowSerializer +# Lazy import for optional OrjsonSerializer (requires orjson from [json] extra) +_OrjsonSerializer: type | None = None + + +def _get_orjson_serializer() -> type: + """Lazy-load OrjsonSerializer. Raises ImportError if orjson not installed.""" + global _OrjsonSerializer + if _OrjsonSerializer is None: + from .orjson_serializer import OrjsonSerializer + + _OrjsonSerializer = OrjsonSerializer + return _OrjsonSerializer + + # Validate ByteStorage works correctly test_storage = ByteStorage("msgpack") test_data = b"test validation data" @@ -57,7 +71,7 @@ def _get_arrow_serializer() -> type: "default": StandardSerializer, # Language-agnostic MessagePack for multi-language caches "std": StandardSerializer, # Explicit StandardSerializer alias "arrow": None, # Lazy-loaded: requires pyarrow from [data] extra - "orjson": OrjsonSerializer, + "orjson": None, # Lazy-loaded: requires orjson from [json] extra "encrypted": EncryptionWrapper, # StandardSerializer + AES-256-GCM encryption } @@ -116,9 +130,11 @@ def get_serializer(name: str, enable_integrity_checking: bool = True) -> Seriali f"@cache(serializer=MySerializer())" ) - # Get serializer class (lazy-load arrow if needed) + # Get serializer class (lazy-load optional serializers if needed) if name == "arrow": serializer_class = 
_get_arrow_serializer() + elif name == "orjson": + serializer_class = _get_orjson_serializer() else: serializer_class = SERIALIZER_REGISTRY[name] @@ -177,9 +193,15 @@ def get_serializer_info() -> dict[str, dict[str, Any]]: if hasattr(instance, "get_info"): info[name].update(instance.get_info()) # type: ignore[attr-defined] except ImportError as e: + # Optional serializer whose backing dependency (pyarrow / orjson) is absent. + optional_modules = { + "arrow": ("ArrowSerializer", "cachekit.serializers.arrow_serializer"), + "orjson": ("OrjsonSerializer", "cachekit.serializers.orjson_serializer"), + } + cls, module = optional_modules.get(name, ("Unknown", "unknown")) info[name] = { - "class": "ArrowSerializer" if name == "arrow" else "Unknown", - "module": "cachekit.serializers.arrow_serializer", + "class": cls, + "module": module, "available": False, "error": str(e), } @@ -194,9 +216,11 @@ def get_serializer_info() -> dict[str, dict[str, Any]]: def __getattr__(name: str) -> Any: - """Lazy attribute access for optional ArrowSerializer.""" + """Lazy attribute access for optional ArrowSerializer / OrjsonSerializer.""" if name == "ArrowSerializer": return _get_arrow_serializer() + if name == "OrjsonSerializer": + return _get_orjson_serializer() raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/src/cachekit/serializers/orjson_serializer.py b/src/cachekit/serializers/orjson_serializer.py index b10400a..db1c62e 100644 --- a/src/cachekit/serializers/orjson_serializer.py +++ b/src/cachekit/serializers/orjson_serializer.py @@ -14,7 +14,12 @@ from typing import Any, ClassVar -import orjson +# Required dependency: orjson (fail-fast with install hint when the [json] extra is absent) +try: + import orjson +except ImportError as e: + raise ImportError("orjson is not installed. 
OrjsonSerializer requires the [json] extra: pip install 'cachekit[json]'") from e + import xxhash from .base import SerializationError, SerializationFormat, SerializationMetadata diff --git a/tests/unit/test_orjson_serializer.py b/tests/unit/test_orjson_serializer.py index 42dfac5..9524bf6 100644 --- a/tests/unit/test_orjson_serializer.py +++ b/tests/unit/test_orjson_serializer.py @@ -375,8 +375,12 @@ def test_factory_caches_orjson_serializer(self): assert serializer1 is serializer2 def test_orjson_serializer_in_registry(self): - """Test that OrjsonSerializer is registered.""" + """orjson is registered as a lazy (None) placeholder — like arrow. + + orjson moved to the optional [json] extra, so the registry holds None and + get_serializer('orjson') resolves the class on demand (covered above). + """ from cachekit.serializers import SERIALIZER_REGISTRY assert "orjson" in SERIALIZER_REGISTRY - assert SERIALIZER_REGISTRY["orjson"] == OrjsonSerializer + assert SERIALIZER_REGISTRY["orjson"] is None diff --git a/tests/unit/test_serializer_lazy_loading.py b/tests/unit/test_serializer_lazy_loading.py index 25a5a72..ca240a3 100644 --- a/tests/unit/test_serializer_lazy_loading.py +++ b/tests/unit/test_serializer_lazy_loading.py @@ -6,11 +6,15 @@ from __future__ import annotations +import subprocess +import sys + import pytest from cachekit.serializers import ( SERIALIZER_REGISTRY, _get_arrow_serializer, + _get_orjson_serializer, benchmark_serializers, get_available_serializers, get_serializer, @@ -18,6 +22,7 @@ ) from cachekit.serializers.arrow_serializer import ArrowSerializer from cachekit.serializers.base import SerializerProtocol +from cachekit.serializers.orjson_serializer import OrjsonSerializer class TestLazyArrowSerializerLoading: @@ -129,3 +134,69 @@ def test_arrow_is_none_in_registry(self): """Arrow entry is None in the raw registry (lazy placeholder).""" available = get_available_serializers() assert available["arrow"] is None + + +class 
TestLazyOrjsonSerializerLoading: + """Test lazy loading mechanism for OrjsonSerializer (optional [json] extra).""" + + def test_registry_has_none_for_orjson(self): + """SERIALIZER_REGISTRY stores None for orjson (lazy placeholder).""" + assert "orjson" in SERIALIZER_REGISTRY + assert SERIALIZER_REGISTRY["orjson"] is None + + def test_get_orjson_serializer_returns_class(self): + """_get_orjson_serializer() returns the OrjsonSerializer class.""" + assert _get_orjson_serializer() is OrjsonSerializer + + def test_get_orjson_serializer_caches_result(self): + """_get_orjson_serializer() caches the imported class.""" + assert _get_orjson_serializer() is _get_orjson_serializer() + + def test_get_serializer_orjson_returns_instance(self): + """get_serializer('orjson') returns an OrjsonSerializer instance.""" + serializer = get_serializer("orjson") + assert isinstance(serializer, OrjsonSerializer) + assert isinstance(serializer, SerializerProtocol) + + def test_module_getattr_returns_orjson_serializer(self): + """Module __getattr__ returns OrjsonSerializer for lazy access.""" + from cachekit import serializers + + assert serializers.OrjsonSerializer is OrjsonSerializer + + def test_get_serializer_info_includes_orjson(self): + """get_serializer_info() reports orjson as available with the right class.""" + info = get_serializer_info() + assert info["orjson"]["available"] is True + assert info["orjson"]["class"] == "OrjsonSerializer" + + +class TestOrjsonIsOptional: + """orjson is an optional dependency (the [json] extra): it must not be pulled + eagerly, and when absent it must yield a helpful install error while the rest of + cachekit keeps working. Verified in fresh subprocesses because sys.modules is + shared across the test session (orjson is installed in the dev environment). 
+ """ + + def test_import_cachekit_does_not_pull_orjson(self): + """Importing cachekit must NOT eagerly import orjson (the optionality regression guard).""" + code = "import cachekit, sys; assert 'orjson' not in sys.modules, 'orjson was imported eagerly'" + result = subprocess.run([sys.executable, "-c", code], capture_output=True, text=True) # noqa: S603 (trusted: sys.executable + literal code) + assert result.returncode == 0, result.stderr + + def test_orjson_absent_raises_helpful_error(self): + """Without orjson, cachekit + the default serializer still work, and requesting + the orjson serializer raises a helpful, actionable [json]-extra ImportError.""" + code = ( + 'import sys; sys.modules["orjson"] = None\n' + "import cachekit\n" + "from cachekit.serializers import get_serializer\n" + 'assert type(get_serializer("default")).__name__ == "StandardSerializer"\n' + "try:\n" + ' get_serializer("orjson")\n' + ' raise SystemExit("expected ImportError")\n' + "except ImportError as e:\n" + ' assert "[json] extra" in str(e), str(e)\n' + ) + result = subprocess.run([sys.executable, "-c", code], capture_output=True, text=True) # noqa: S603 (trusted: sys.executable + literal code) + assert result.returncode == 0, result.stderr diff --git a/uv.lock b/uv.lock index 0408893..7c89d16 100644 --- a/uv.lock +++ b/uv.lock @@ -236,7 +236,6 @@ dependencies = [ { name = "blake3" }, { name = "httpx", extra = ["http2"] }, { name = "msgpack" }, - { name = "orjson" }, { name = "prometheus-client" }, { name = "psutil" }, { name = "pydantic" }, @@ -253,6 +252,9 @@ data = [ { name = "pandas" }, { name = "pyarrow" }, ] +json = [ + { name = "orjson" }, +] memcached = [ { name = "pymemcache" }, ] @@ -269,6 +271,7 @@ dev = [ { name = "hypothesis" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.3.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version 
>= '3.11'" }, + { name = "orjson" }, { name = "pandas" }, { name = "pip-audit" }, { name = "psutil" }, @@ -297,7 +300,7 @@ requires-dist = [ { name = "httpx", extras = ["http2"], specifier = ">=0.28.1" }, { name = "msgpack", specifier = ">=1.2.1" }, { name = "numpy", marker = "extra == 'data'", specifier = ">=2.0.2" }, - { name = "orjson", specifier = ">=3.9.0" }, + { name = "orjson", marker = "extra == 'json'", specifier = ">=3.9.0" }, { name = "pandas", marker = "extra == 'data'", specifier = ">=1.3.0" }, { name = "prometheus-client", specifier = ">=0.22.1" }, { name = "psutil", specifier = ">=7.0.0" }, @@ -309,7 +312,7 @@ requires-dist = [ { name = "tenacity", specifier = ">=8.0.0" }, { name = "xxhash", specifier = ">=3.5.0" }, ] -provides-extras = ["data", "memcached"] +provides-extras = ["data", "json", "memcached"] [package.metadata.requires-dev] dev = [ @@ -322,6 +325,7 @@ dev = [ { name = "httpx", specifier = ">=0.28.1" }, { name = "hypothesis", specifier = ">=6.0.0" }, { name = "numpy", specifier = ">=2.0.2" }, + { name = "orjson", specifier = ">=3.9.0" }, { name = "pandas", specifier = ">=1.3.0" }, { name = "pip-audit", specifier = ">=2.7.0" }, { name = "psutil", specifier = ">=5.9.0" }, From dee842827f1e5c5f0d4967424e6ea6cc08c0fa03 Mon Sep 17 00:00:00 2001 From: Ray Walker Date: Sat, 20 Jun 2026 09:14:34 +1000 Subject: [PATCH 2/2] test: cover orjson optional-dependency branches MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch coverage flagged the two optional-dep-absent paths the in-process suite cannot reach while orjson is installed: - get_serializer_info()'s generalized ImportError branch — now covered by a monkeypatch test asserting a missing orjson is labeled OrjsonSerializer (the mislabeling the generalization fixed), not the old hardcoded ArrowSerializer. 
- orjson_serializer.py's import guard — marked `# pragma: no cover`: it is only reachable without the [json] extra, and its behavior is already verified by the subprocess regression test in test_serializer_lazy_loading.py (matches how arrow_serializer.py's pyarrow guard is left uncovered). --- src/cachekit/serializers/orjson_serializer.py | 2 +- tests/unit/test_serializer_lazy_loading.py | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/cachekit/serializers/orjson_serializer.py b/src/cachekit/serializers/orjson_serializer.py index db1c62e..d70939d 100644 --- a/src/cachekit/serializers/orjson_serializer.py +++ b/src/cachekit/serializers/orjson_serializer.py @@ -17,7 +17,7 @@ # Required dependency: orjson (fail-fast with install hint when the [json] extra is absent) try: import orjson -except ImportError as e: +except ImportError as e: # pragma: no cover - only reachable without the [json] extra (behavior tested via subprocess) raise ImportError("orjson is not installed. OrjsonSerializer requires the [json] extra: pip install 'cachekit[json]'") from e import xxhash diff --git a/tests/unit/test_serializer_lazy_loading.py b/tests/unit/test_serializer_lazy_loading.py index ca240a3..e22bf43 100644 --- a/tests/unit/test_serializer_lazy_loading.py +++ b/tests/unit/test_serializer_lazy_loading.py @@ -170,6 +170,27 @@ def test_get_serializer_info_includes_orjson(self): assert info["orjson"]["available"] is True assert info["orjson"]["class"] == "OrjsonSerializer" + def test_get_serializer_info_reports_orjson_unavailable(self, monkeypatch): + """When orjson is absent, get_serializer_info() labels it OrjsonSerializer/unavailable. + + Guards the generalized optional-dep branch — before it was hardcoded to + ArrowSerializer and would have mislabeled a missing orjson. + """ + import cachekit.serializers as serializers_mod + + def _missing() -> type: + raise ImportError("orjson is not installed. 
OrjsonSerializer requires the [json] extra") + + monkeypatch.setattr(serializers_mod, "_get_orjson_serializer", _missing) + # Bypass the factory cache so get_serializer re-resolves orjson and the + # ImportError reaches get_serializer_info's except branch. + monkeypatch.delitem(serializers_mod._serializer_cache, "orjson:True", raising=False) + + info = serializers_mod.get_serializer_info() + assert info["orjson"]["available"] is False + assert info["orjson"]["class"] == "OrjsonSerializer" + assert info["orjson"]["module"] == "cachekit.serializers.orjson_serializer" + class TestOrjsonIsOptional: """orjson is an optional dependency (the [json] extra): it must not be pulled