superlocalmemory - npm Package Compare versions

+111

src/superlocalmemory/mcp/agent_context.py

		"""Per-HTTP-request agent ID resolution — ContextVar home.

		Kept in a standalone module so both tools_core and tools_active can import
		it without creating circular dependencies (server.py → tools_core → here,
		and unified_daemon.py → here independently).

		Priority chain (HTTP-first, stdio-fallback):
		1. ContextVar set by _AgentIDExtractorASGI middleware from /mcp/{agent_id} URL path.
		2. SLM_AGENT_ID environment variable (stdio transport legacy).
		3. Hard-coded "mcp_client" sentinel.
		"""
		from __future__ import annotations

		import contextvars
		import os
		import re

		_current_agent_id: contextvars.ContextVar[str] = contextvars.ContextVar(
		"slm_agent_id", default="mcp_client"
		)

		# Agent ids arrive from an untrusted URL path segment. They are ATTRIBUTION
		# metadata, never an authenticated principal — but they reach loggers, the
		# agent registry, and SQL-bound attribution columns, so we hard-restrict the
		# charset at the single extraction chokepoint. This neutralises log-injection
		# (CRLF / ANSI), oversized ids, and any path-ish characters in one place.
		_AGENT_ID_SANITIZE = re.compile(r"[^A-Za-z0-9._-]")
		_AGENT_ID_MAX_LEN = 64


		def sanitize_agent_id(raw: str) -> str:
		"""Coerce an untrusted agent-id segment to a safe, bounded token."""
		return _AGENT_ID_SANITIZE.sub("_", raw)[:_AGENT_ID_MAX_LEN]


		def get_current_agent_id(env_fallback: bool = True) -> str:
		"""Return the agent_id for the current asyncio task.

		For HTTP transport the ASGI wrapper sets the ContextVar from the URL path
		before the request reaches any MCP tool, so this returns the URL-derived id.
		For stdio transport the ContextVar holds its default ("mcp_client") and we
		fall through to the SLM_AGENT_ID env var instead.
		"""
		ctx_id = _current_agent_id.get()
		if ctx_id != "mcp_client":
		return ctx_id
		if env_fallback:
		return os.environ.get("SLM_AGENT_ID", "mcp_client")
		return "mcp_client"


		class AgentIDExtractorASGI:
		"""ASGI wrapper that maps ``/mcp/{agent_id}`` → the agent-id ContextVar.

		Mounted at ``/mcp`` in unified_daemon. IMPORTANT: Starlette's ``Mount``
		(≥0.35 / 1.x) does NOT strip the mount prefix from ``scope["path"]`` — it
		records the prefix in ``scope["root_path"]`` and leaves ``path`` as the full
		request path (e.g. ``/mcp/claude`` with ``root_path == "/mcp"``). So we
		compute the mount-relative sub-path ourselves as ``path[len(root_path):]``.

		Flow for ``POST /mcp/claude``:
		sub-path ``/claude`` → agent id ``claude`` → set ContextVar → rewrite the
		scope path to ``{root_path}/`` so the inner FastMCP app (Starlette, route
		``/``) sees the same mount-relative ``/`` it sees for a bare ``/mcp/``.

		Backward compatible: bare ``/mcp/`` has sub-path ``/`` → no agent segment →
		the request passes through untouched and the ContextVar keeps its
		``"mcp_client"`` default.

		Per-request isolation is guaranteed by ContextVar + ``reset(token)`` in a
		``finally``, so concurrent HTTP sessions never see each other's agent id.
		"""

		__slots__ = ("_app",)

		def __init__(self, inner) -> None:
		self._app = inner

		async def __call__(self, scope, receive, send):
		if scope.get("type") == "http":
		root_path: str = scope.get("root_path", "")
		full_path: str = scope.get("path", "/")
		# Mount-relative sub-path (what comes AFTER /mcp). When a root_path
		# is present the request path MUST start with it (Starlette Mount
		# guarantees this); if it somehow does not, treat it as no-agent and
		# pass through untouched rather than mis-parsing the full path.
		if root_path:
		if not full_path.startswith(root_path):
		await self._app(scope, receive, send)
		return
		subpath = full_path[len(root_path):]
		else:
		subpath = full_path
		first = subpath.lstrip("/").split("/")[0]
		if first:
		first = sanitize_agent_id(first)
		token = _current_agent_id.set(first)
		# Rewrite the path so the inner app sees the bare mount root,
		# exactly as it would for a no-agent /mcp/ request.
		new_full = (root_path + "/") if root_path else "/"
		new_scope = {
		**scope,
		"path": new_full,
		"raw_path": new_full.encode(),
		}
		try:
		await self._app(new_scope, receive, send)
		finally:
		_current_agent_id.reset(token)
		return
		await self._app(scope, receive, send)

+243

src/superlocalmemory/optimize/proxy/capture.py

		"""capture.py — Lossless shadow-capture of real proxy traffic (v3.6.10, plan §7).

		Purpose: build a dogfood corpus of real {request, response, model, tokens,
		content_type} pairs so the cache + compression benchmark (benchmarks/optimize/)
		can be replayed against authentic traffic instead of only synthetic prompts.

		Activation: set ``SLM_OPTIMIZE_CAPTURE=1`` in the daemon's environment. When on:
		* the proxy runs in PURE PASSTHROUGH — cache + compression hooks are disabled
		at load time (see server._load_hooks), so capture never observes a mutated
		request or a cache hit; every line is a genuine upstream exchange.
		* each completed exchange is appended as one JSON line to
		``~/.superlocalmemory/optimize_capture.jsonl`` (0600, gitignored).

		ISOLATION GUARANTEE: this module writes ONLY to optimize_capture.jsonl. It never
		opens memory.db, llmcache.db, or any SLM memory store. (Plan §9 hard rule.)

		FAIL-OPEN: a capture failure (disk full, permission, encode error) is logged and
		swallowed — it MUST NOT break the proxied request the user is waiting on.
		"""

		from __future__ import annotations

		import asyncio
		import json
		import logging
		import os
		import threading
		from pathlib import Path
		from typing import Any

		logger = logging.getLogger("slm.optimize.proxy.capture")

		_CAPTURE_DIRNAME = ".superlocalmemory"
		_CAPTURE_FILENAME = "optimize_capture.jsonl"
		_CAPTURE_ENV = "SLM_OPTIMIZE_CAPTURE"
		_TRUTHY = frozenset({"1", "true", "yes", "on"})

		# Cap a single captured body so a pathological 10 MB request can't bloat the
		# corpus line beyond what the replay harness will read back. Bodies above this
		# are recorded truncated with a marker (capture is for benchmarking, not audit).
		_MAX_CAPTURE_BODY_BYTES = 1 * 1024 * 1024 # 1 MB per side

		# Providers whose response bodies follow the OpenAI usage schema
		# ({"usage": {"prompt_tokens", "completion_tokens"}, "model"}).
		_OPENAI_FORMAT_PROVIDERS = frozenset({"openai", "gemini-openai-compat"})


		def capture_enabled() -> bool:
		"""True iff ``SLM_OPTIMIZE_CAPTURE`` is set to a truthy value."""
		return os.environ.get(_CAPTURE_ENV, "").strip().lower() in _TRUTHY


		def _capture_path() -> Path:
		return Path.home() / _CAPTURE_DIRNAME / _CAPTURE_FILENAME


		class ShadowCapture:
		"""Thread-safe append-only JSONL writer for proxy exchanges (singleton)."""

		_instance: "ShadowCapture \| None" = None
		_instance_lock = threading.Lock()

		def __init__(self, path: Path \| None = None) -> None:
		self._path = path or _capture_path()
		self._write_lock = threading.Lock()
		self._count = 0

		@classmethod
		def get_instance(cls) -> "ShadowCapture":
		# Double-checked locking: cheap fast-path after first construction.
		if cls._instance is None:
		with cls._instance_lock:
		if cls._instance is None:
		cls._instance = cls()
		return cls._instance

		@classmethod
		def reset_instance(cls) -> None:
		"""Test hook — drop the singleton so a fresh path can be injected."""
		with cls._instance_lock:
		cls._instance = None

		@property
		def path(self) -> Path:
		return self._path

		@property
		def count(self) -> int:
		return self._count

		def record(self, entry: dict[str, Any]) -> bool:
		"""Append one capture entry as a JSON line. Returns True on success.

		Fail-open: any error is logged and False is returned; never raised.

		Security: opens with a single ``os.open`` carrying ``O_CREAT \|
		O_APPEND \| O_NOFOLLOW`` and mode ``0o600`` on EVERY write. O_NOFOLLOW
		refuses a symlink pre-placed at the path (symlink-append attack), and
		the unconditional 0600-on-create removes the stat/exists TOCTOU that
		could otherwise drop the file to the process umask.
		"""
		try:
		line = json.dumps(entry, ensure_ascii=False, separators=(",", ":"))
		except (TypeError, ValueError) as exc:
		logger.warning("capture: entry not JSON-serialisable, dropped: %r", exc)
		return False

		try:
		with self._write_lock:
		self._path.parent.mkdir(parents=True, exist_ok=True)
		flags = os.O_CREAT \| os.O_WRONLY \| os.O_APPEND \| getattr(os, "O_NOFOLLOW", 0)
		fd = os.open(self._path, flags, 0o600)
		with os.fdopen(fd, "a", encoding="utf-8") as fh:
		fh.write(line + "\n")
		self._count += 1
		return True
		except OSError as exc:
		# PermissionError / symlink-refusal (ELOOP) are security-relevant —
		# surface the errno but still fail open so the request is never blocked.
		logger.warning("capture: write failed (fail-open): %r", exc)
		return False


		def _truncate(raw: bytes) -> tuple[str, bool]:
		"""Decode bytes for storage; truncate beyond the per-side cap."""
		truncated = len(raw) > _MAX_CAPTURE_BODY_BYTES
		head = raw[:_MAX_CAPTURE_BODY_BYTES] if truncated else raw
		return head.decode("utf-8", errors="replace"), truncated


		def build_entry(
		*,
		provider: str,
		model: str,
		request_body: bytes,
		response_body: bytes,
		content_type: str,
		input_tokens: int,
		output_tokens: int,
		status_code: int,
		stream: bool,
		) -> dict[str, Any]:
		"""Construct a capture entry dict from a completed exchange.

		No timestamp is stamped here (Date.now is intentionally avoided in some
		runtimes); the replay harness keys on content, not time, and the file's own
		line order preserves arrival sequence.
		"""
		req_str, req_trunc = _truncate(request_body)
		resp_str, resp_trunc = _truncate(response_body)
		return {
		"provider": provider,
		"model": model,
		"content_type": content_type,
		"stream": stream,
		"status_code": status_code,
		"input_tokens": int(input_tokens),
		"output_tokens": int(output_tokens),
		"request": req_str,
		"response": resp_str,
		"request_truncated": req_trunc,
		"response_truncated": resp_trunc,
		}


		def extract_usage(provider: str, body: bytes \| None) -> tuple[int, int, str]:
		"""Best-effort (input_tokens, output_tokens, model) from a provider JSON body.

		Works on the normalised JSON the SSE parsers emit AND on non-streaming
		upstream JSON. Returns (0, 0, "") when the body is missing/unparseable —
		capture must never fail because usage couldn't be read.
		"""
		if not body:
		return 0, 0, ""
		try:
		data = json.loads(body)
		except (json.JSONDecodeError, ValueError, TypeError):
		return 0, 0, ""
		if not isinstance(data, dict):
		return 0, 0, ""

		if provider == "gemini":
		usage = data.get("usageMetadata") or {}
		return (
		int(usage.get("promptTokenCount", 0) or 0),
		int(usage.get("candidatesTokenCount", 0) or 0),
		str(data.get("modelVersion", "") or ""),
		)

		usage = data.get("usage") or {}
		if provider == "anthropic":
		return (
		int(usage.get("input_tokens", 0) or 0),
		int(usage.get("output_tokens", 0) or 0),
		str(data.get("model", "") or ""),
		)
		# OpenAI-format providers (explicit allowlist so a future provider variant
		# is not silently parsed with the wrong schema — it warns + returns zeros).
		if provider not in _OPENAI_FORMAT_PROVIDERS:
		logger.warning(
		"capture.extract_usage: unknown provider %r — recording zero tokens", provider
		)
		return 0, 0, str(data.get("model", "") or "")
		return (
		int(usage.get("prompt_tokens", 0) or 0),
		int(usage.get("completion_tokens", 0) or 0),
		str(data.get("model", "") or ""),
		)


		def record_exchange(
		*,
		provider: str,
		model: str,
		request_body: bytes,
		response_body: bytes,
		content_type: str = "application/json",
		input_tokens: int = 0,
		output_tokens: int = 0,
		status_code: int = 200,
		stream: bool = False,
		) -> bool:
		"""Build + append a capture entry. Fail-open. Returns True on success."""
		entry = build_entry(
		provider=provider,
		model=model,
		request_body=request_body,
		response_body=response_body,
		content_type=content_type,
		input_tokens=input_tokens,
		output_tokens=output_tokens,
		status_code=status_code,
		stream=stream,
		)
		return ShadowCapture.get_instance().record(entry)


		async def record_exchange_async(**kwargs: Any) -> bool:
		"""Async wrapper for ``record_exchange`` that offloads the synchronous file
		write to a worker thread so it never blocks the proxy event loop — relevant
		when many streaming responses complete in the same loop iteration.
		"""
		return await asyncio.to_thread(lambda: record_exchange(**kwargs))

+1

-1

package.json

		{
		"name": "superlocalmemory",
		"version": "3.6.9",
		"version": "3.6.10",
		"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",
		@@ -5,0 +5,0 @@ "keywords": [

+6

-1

pyproject.toml

		[project]
		name = "superlocalmemory"
		version = "3.6.9"
		version = "3.6.10"
		description = "Information-geometric agent memory with mathematical guarantees"
		@@ -72,2 +72,7 @@ readme = "README.md"
		"sqlite-vec==0.1.9",
		# v3.6.10: LLMLingua-2 prose compression (aggressive mode, opt-in at runtime).
		# Hard dependency so `pip install superlocalmemory` ships compression-ready;
		# the ~560MB model downloads on first setup/warmup (fail-open). Verified the
		# pin does NOT disturb the transformers/torch/numpy pins above.
		"llmlingua==0.2.2",
		]
		@@ -74,0 +79,0 @@

+8

-4

README.md

		@@ -36,3 +36,5 @@ <p align="center">

		> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved), SHRINKS prompts 60-95% (compress: extractive + LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.
		> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved on a hit), SHRINKS prose prompts (compress: lossless-by-default, opt-in LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.
		>
		> v3.6.10: cache and compression are now independent runtime switches (cache-only, compress-only, both, or neither — toggle live from the dashboard, no restart). Compression was rebuilt to be lossless by default (the old string/array/code truncation is gone); aggressive mode adds LLMLingua-2 for prose only — never code, numbers, structured data, or the current turn.

		@@ -43,4 +45,4 @@ ### The Three Levers
		\|-------\|-----------\|:------:\|:---------------:\|
		\| Cache \| Skip repeat calls — exact-match SQLite lookup, vCache-gated semantic (opt-in) \| 100% on a hit (input + output) \| Cache ON, Semantic OFF \|
		\| Compress \| Shrink prompts — extractive JSON/code (lossless) + LLMLingua-2 prose (opt-in) \| 60–95% on a miss (input only) \| Safe mode ON, Aggressive OFF \|
		\| Cache \| Skip repeat calls — exact-match SQLite lookup (zero false hits), vCache-gated semantic (opt-in) \| 100% on a hit (input + output) \| Cache ON, Semantic OFF \|
		\| Compress \| Shrink prompts — safe = lossless normalization; aggressive = LLMLingua-2 prose only (opt-in) \| Safe: small + lossless · Aggressive: large on prose \| Safe mode, Aggressive OFF \|
		\| Align \| Stabilize prefix — maximize provider prefix-cache discounts \| Lossless extra \| ON when compression is ON \|
		@@ -50,2 +52,4 @@

		> Independent at runtime: enable caching only, compression only, both, or neither — from the dashboard Optimize tab, applied live (no restart). Each AI client can also get its own memory identity over HTTP MCP via `http://127.0.0.1:8765/mcp/{agent_id}`.

		### Quick Start
		@@ -66,3 +70,3 @@
		\| `slm cache status\\|clear\\|invalidate\\|ttl\\|semantic` \| Cache sub-control — exact + semantic tiers \|
		\| `slm compress status\\|mode\\|code\\|prose\\|ccr\\|align` \| Compression control — per-channel toggles \|
		\| `slm compress status\\|mode\\|prose` \| Compression control — safe (lossless) / aggressive (LLMLingua-2 prose) \|
		\| `slm proxy [--port] [--provider]` \| Start the interception proxy (port 8765) \|
		@@ -69,0 +73,0 @@ \| `slm wrap <agent>` \| Proxy-activate an agent — one command to start saving \|

+10

-5

src/superlocalmemory.egg-info/PKG-INFO

		Metadata-Version: 2.4
		Name: superlocalmemory
		Version: 3.6.9
		Version: 3.6.10
		Summary: Information-geometric agent memory with mathematical guarantees
		@@ -61,2 +61,3 @@ Author-email: Varun Pratap Bhardwaj <admin@superlocalmemory.com>
		Requires-Dist: sqlite-vec==0.1.9
		Requires-Dist: llmlingua==0.2.2
		Provides-Extra: search
		@@ -129,3 +130,5 @@ Requires-Dist: sentence-transformers==5.3.0; extra == "search"

		> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved), SHRINKS prompts 60-95% (compress: extractive + LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.
		> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved on a hit), SHRINKS prose prompts (compress: lossless-by-default, opt-in LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.
		>
		> v3.6.10: cache and compression are now independent runtime switches (cache-only, compress-only, both, or neither — toggle live from the dashboard, no restart). Compression was rebuilt to be lossless by default (the old string/array/code truncation is gone); aggressive mode adds LLMLingua-2 for prose only — never code, numbers, structured data, or the current turn.

		@@ -136,4 +139,4 @@ ### The Three Levers
		\|-------\|-----------\|:------:\|:---------------:\|
		\| Cache \| Skip repeat calls — exact-match SQLite lookup, vCache-gated semantic (opt-in) \| 100% on a hit (input + output) \| Cache ON, Semantic OFF \|
		\| Compress \| Shrink prompts — extractive JSON/code (lossless) + LLMLingua-2 prose (opt-in) \| 60–95% on a miss (input only) \| Safe mode ON, Aggressive OFF \|
		\| Cache \| Skip repeat calls — exact-match SQLite lookup (zero false hits), vCache-gated semantic (opt-in) \| 100% on a hit (input + output) \| Cache ON, Semantic OFF \|
		\| Compress \| Shrink prompts — safe = lossless normalization; aggressive = LLMLingua-2 prose only (opt-in) \| Safe: small + lossless · Aggressive: large on prose \| Safe mode, Aggressive OFF \|
		\| Align \| Stabilize prefix — maximize provider prefix-cache discounts \| Lossless extra \| ON when compression is ON \|
		@@ -143,2 +146,4 @@

		> Independent at runtime: enable caching only, compression only, both, or neither — from the dashboard Optimize tab, applied live (no restart). Each AI client can also get its own memory identity over HTTP MCP via `http://127.0.0.1:8765/mcp/{agent_id}`.

		### Quick Start
		@@ -159,3 +164,3 @@
		\| `slm cache status\\|clear\\|invalidate\\|ttl\\|semantic` \| Cache sub-control — exact + semantic tiers \|
		\| `slm compress status\\|mode\\|code\\|prose\\|ccr\\|align` \| Compression control — per-channel toggles \|
		\| `slm compress status\\|mode\\|prose` \| Compression control — safe (lossless) / aggressive (LLMLingua-2 prose) \|
		\| `slm proxy [--port] [--provider]` \| Start the interception proxy (port 8765) \|
		@@ -162,0 +167,0 @@ \| `slm wrap <agent>` \| Proxy-activate an agent — one command to start saving \|

+1

-0

src/superlocalmemory.egg-info/requires.txt

		@@ -32,2 +32,3 @@ httpx==0.28.1
		sqlite-vec==0.1.9
		llmlingua==0.2.2

		@@ -34,0 +35,0 @@ [dev]

+2

-2

src/superlocalmemory.egg-info/SOURCES.txt

		@@ -274,2 +274,3 @@ AUTHORS.md
		src/superlocalmemory/mcp/_stdin_guard.py
		src/superlocalmemory/mcp/agent_context.py
		src/superlocalmemory/mcp/resources.py
		@@ -313,4 +314,2 @@ src/superlocalmemory/mcp/server.py
		src/superlocalmemory/optimize/compress/ccr.py
		src/superlocalmemory/optimize/compress/extractive_code.py
		src/superlocalmemory/optimize/compress/extractive_json.py
		src/superlocalmemory/optimize/compress/prose_llmlingua.py
		@@ -330,2 +329,3 @@ src/superlocalmemory/optimize/compress/router.py
		src/superlocalmemory/optimize/proxy/anthropic_surface.py
		src/superlocalmemory/optimize/proxy/capture.py
		src/superlocalmemory/optimize/proxy/gemini_surface.py
		@@ -332,0 +332,0 @@ src/superlocalmemory/optimize/proxy/lifecycle.py

+1

-1

src/superlocalmemory/__init__.py

		@@ -35,3 +35,3 @@ """SuperLocalMemory — information-geometric agent memory.

		__version__ = "3.6.9"
		__version__ = "3.6.10"

		@@ -38,0 +38,0 @@ _REQUIRED_VERSIONS = {

+32

-70

src/superlocalmemory/cli/compress_cmd.py

		@@ -5,3 +5,3 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar

		"""Handlers for ``slm compress status\|mode\|code\|prose\|ccr``."""
		"""Handlers for ``slm compress status\|mode\|prose``."""

		@@ -23,7 +23,2 @@ from __future__ import annotations

		def _get_cache_db():
		from superlocalmemory.optimize.storage.db import CacheDB
		return CacheDB()


		def _write_config(**fields) -> None:
		@@ -49,7 +44,8 @@ """5-step immutable config-write."""
		"status": cmd_compress_status,
		"mode": cmd_compress_mode,
		"code": cmd_compress_code,
		"prose": cmd_compress_prose,
		"ccr": cmd_compress_ccr,
		"align": cmd_compress_align,
		"mode": cmd_compress_mode,
		"prose": cmd_compress_prose,
		# removed in v3.6.10: code, ccr, align (extractive compressors removed)
		"code": _cmd_compress_removed("code"),
		"ccr": _cmd_compress_removed("ccr"),
		"align": _cmd_compress_removed("align"),
		}
		@@ -60,6 +56,18 @@ handler = _dispatch.get(sub or "")
		else:
		print("Usage: slm compress status\|mode\|code\|prose\|ccr [options]")
		print("Usage: slm compress status\|mode\|prose [options]")
		sys.exit(0)


		def _cmd_compress_removed(name: str):
		def handler(args: Namespace) -> None:
		print(
		f"slm compress {name}: removed in SLM v3.6.10. "
		f"Extractive {name} compression has been replaced by "
		f"Layer 1 (lossless whitespace) + Layer 2 (LLMLingua-2 prose). "
		f"Use 'slm compress prose on' to enable prose compression."
		)
		sys.exit(0)
		return handler


		def cmd_compress_status(args: Namespace) -> None:
		@@ -75,5 +83,4 @@ """Print compression status."""
		"compress_mode": cfg.compress_mode,
		"compress_code": cfg.compress_code,
		"compress_prose": cfg.compress_prose,
		"compress_ccr": cfg.compress_ccr,
		"compress_protect_recent": cfg.compress_protect_recent,
		}
		@@ -84,9 +91,8 @@ print(json.dumps(data, indent=2))
		print("Compression status:")
		print(f" Enabled: {'yes' if cfg.compress_enabled else 'no'}")
		print(f" Mode: {cfg.compress_mode}")
		print(f" Code: {'ON' if cfg.compress_code else 'OFF'}"
		" (extractive JSON/code — lossless structure)")
		print(f" Prose: {'ON' if cfg.compress_prose else 'OFF'}")
		print(f" CCR: {'ON' if cfg.compress_ccr else 'OFF'}"
		" (reversible context retrieval)")
		print(f" Enabled: {'yes' if cfg.compress_enabled else 'no'}")
		print(f" Mode: {cfg.compress_mode}")
		print(f" Prose (Layer 2): {'ON' if cfg.compress_prose else 'OFF'}"
		" (LLMLingua-2, aggressive mode only)")
		print(f" Protect recent: {cfg.compress_protect_recent} user turns")
		print(" Layer 1 (lossless whitespace normalization) is always ON when enabled.")

		@@ -112,19 +118,4 @@

		def cmd_compress_code(args: Namespace) -> None:
		"""Enable or disable code/JSON compression."""
		use_json = getattr(args, "json", False)
		value = getattr(args, "code_value", "on")

		_write_config(compress_code=(value == "on"))

		if use_json:
		print(json.dumps({"status": "ok", "compress_code": value == "on"}))
		return

		print(f"Code compression: {'ENABLED' if value == 'on' else 'DISABLED'}.")
		print("Daemon hot-reload: active within 2s.")


		def cmd_compress_prose(args: Namespace) -> None:
		"""Enable or disable prose compression."""
		"""Enable or disable prose compression (Layer 2, LLMLingua-2)."""
		use_json = getattr(args, "json", False)
		@@ -151,37 +142,8 @@ value = getattr(args, "prose_value", "off")

		print(f"Prose compression: {'ENABLED' if value == 'on' else 'DISABLED'}.")
		print(f"Prose compression (LLMLingua-2): {'ENABLED' if value == 'on' else 'DISABLED'}.")
		if value == "on":
		print(" Requires: compress_mode=aggressive and llmlingua package installed.")
		print(" Run: slm compress mode aggressive (if not already set)")
		if value == "on" and "compress_enabled" in fields:
		print(" (also enabled global compress)")
		print("Daemon hot-reload: active within 2s.")


		def cmd_compress_align(args: Namespace) -> None:
		"""Enable or disable alignment compression."""
		use_json = getattr(args, "json", False)
		value = getattr(args, "align_value", "on")

		_write_config(compress_align=(value == "on"))

		if use_json:
		print(json.dumps({"status": "ok", "compress_align": value == "on"}))
		return

		print(f"Alignment compression: {'ENABLED' if value == 'on' else 'DISABLED'}.")
		print("Daemon hot-reload: active within 2s.")


		def cmd_compress_ccr(args: Namespace) -> None:
		"""Enable or disable CCR (Compressed Context Retrieval)."""
		use_json = getattr(args, "json", False)
		value = getattr(args, "ccr_value", "off")

		_write_config(compress_ccr=(value == "on"))

		if use_json:
		print(json.dumps({"status": "ok", "compress_ccr": value == "on"}))
		return

		print(f"CCR (Compressed Context Retrieval): {'ENABLED' if value == 'on' else 'DISABLED'}.")
		if value == "on":
		print("Originals stored in llmcache.db for reversible retrieval.")
		print("Daemon hot-reload: active within 2s.")

+1

-3

src/superlocalmemory/cli/optimize_cmd.py

		@@ -100,5 +100,3 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
		f" (mode: {cfg.compress_mode},"
		f" code: {'ON' if cfg.compress_code else 'OFF'},"
		f" prose: {'ON' if cfg.compress_prose else 'OFF'},"
		f" CCR: {'ON' if cfg.compress_ccr else 'OFF'})")
		f" prose/L2: {'ON' if cfg.compress_prose else 'OFF'})")
		proxy_status = f"running on :{OPTIMIZE_DEFAULT_PORT}" if proxy_running else "not running"
		@@ -105,0 +103,0 @@ print(f" Proxy: {proxy_status}")

+49

-0

src/superlocalmemory/cli/setup_wizard.py

		@@ -35,2 +35,4 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
		_RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
		# v3.6.10: compulsory LLMLingua-2 prose compression model (~560MB, aggressive mode).
		_COMPRESSOR_MODEL = "microsoft/llmlingua-2-xlm-roberta-large-meetingbank"

		@@ -182,2 +184,45 @@

		def _download_compressor(model_name: str) -> bool:
		"""Download the LLMLingua-2 prose compression model (v3.6.10).

		Mirrors _download_reranker: a subprocess forces the HF download with visible
		progress. Fail-open — a network hiccup must NOT break setup; the model also
		lazy-downloads on first use in prose_llmlingua.py.
		"""
		print(f"\n Downloading compression model: {model_name}")
		print(f" (LLMLingua-2 prose compressor, ~560MB — aggressive mode only)\n")

		script = (
		"from llmlingua import PromptCompressor; "
		f"PromptCompressor(model_name='{model_name}', use_llmlingua2=True, "
		"device_map='cpu'); "
		"print('OK')"
		)

		try:
		result = subprocess.run(
		[sys.executable, "-c", script],
		timeout=900, # 560MB on a slow link can exceed 5 min
		capture_output=False,
		text=True,
		env={
		**os.environ,
		"CUDA_VISIBLE_DEVICES": "",
		"TOKENIZERS_PARALLELISM": "false",
		"TORCH_DEVICE": "cpu",
		},
		)
		if result.returncode == 0:
		print(f" ✓ Compression model ready")
		return True
		print(f" ✗ Compression model download failed (will lazy-download on first use)")
		return False
		except ImportError:
		print(f" ⚠ llmlingua not installed — compression model will download on first use")
		return False
		except Exception as exc:
		print(f" ✗ Compression model error: {exc}")
		return False


		# ---------------------------------------------------------------------------
		@@ -398,2 +443,6 @@ # Verification

		print()
		print("─── Step 4c/10: Download Compression Model (LLMLingua-2) ───")
		_download_compressor(_COMPRESSOR_MODEL)

		# -- Step 5: Daemon Configuration (v3.4.3) --
		@@ -400,0 +449,0 @@ print()

+7

-8

src/superlocalmemory/mcp/tools_active.py

		@@ -112,11 +112,10 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar

		Each MCP client (Claude Code, Codex, Gemini CLI, Kimi, etc.) can set
		the ``SLM_AGENT_ID`` env var in its MCP server config so that memories,
		observations, and registry entries are tagged with the actual source
		agent — not the legacy ``"mcp_client"`` default.

		v3.4.39+: enables proper per-agent attribution in ``session_init``,
		``observe``, and event emissions.
		Priority chain (v3.6.10+):
		1. ContextVar set by HTTP URL path (/mcp/{agent_id}) — HTTP transport.
		2. SLM_AGENT_ID env var — stdio transport per-process identity.
		3. Provided default (legacy "mcp_client").
		"""
		return os.environ.get("SLM_AGENT_ID", default)
		from superlocalmemory.mcp.agent_context import get_current_agent_id
		resolved = get_current_agent_id(env_fallback=True)
		return resolved if resolved != "mcp_client" else default

		@@ -123,0 +122,0 @@

+16

-0

src/superlocalmemory/mcp/tools_core.py

		@@ -113,2 +113,6 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
		"""
		# v3.6.10: resolve "mcp_client" sentinel → URL path (HTTP) or env var (stdio)
		if agent_id == "mcp_client":
		from superlocalmemory.mcp.agent_context import get_current_agent_id
		agent_id = get_current_agent_id()
		meta = {
		@@ -175,2 +179,6 @@ "project": project,
		"""
		# v3.6.10: resolve "mcp_client" sentinel → URL path (HTTP) or env var (stdio)
		if agent_id == "mcp_client":
		from superlocalmemory.mcp.agent_context import get_current_agent_id
		agent_id = get_current_agent_id()
		import asyncio
		@@ -490,2 +498,6 @@ try:
		"""
		# v3.6.10: resolve "mcp_client" sentinel → URL path (HTTP) or env var (stdio)
		if agent_id == "mcp_client":
		from superlocalmemory.mcp.agent_context import get_current_agent_id
		agent_id = get_current_agent_id()
		try:
		@@ -525,2 +537,6 @@ from superlocalmemory.core.worker_pool import WorkerPool
		"""
		# v3.6.10: resolve "mcp_client" sentinel → URL path (HTTP) or env var (stdio)
		if agent_id == "mcp_client":
		from superlocalmemory.mcp.agent_context import get_current_agent_id
		agent_id = get_current_agent_id()
		try:
		@@ -527,0 +543,0 @@ if not content or not content.strip():

+23

-9

src/superlocalmemory/optimize/cache/boundary_store.py

		@@ -96,2 +96,3 @@ # optimize/cache/boundary_store.py
		epsilon_grid: tuple[float, ...] = (0.01, 0.02, 0.05, 0.10),
		return_threshold: float = 1.0,
		) -> float:
		@@ -101,11 +102,14 @@ """Compute τ̂ — the vCache exploration probability (Eq. 11).
		Args:
		query_sim: Cosine similarity s(x) ∈ [0, 1] for the incoming query.
		delta: δ — user-defined maximum error rate.
		Theorem 4.1 guarantee: Pr(correct) ≥ 1 - δ.
		epsilon_grid: ε values for the Eq. 11 min sweep.
		Distinct from δ; controls CI conservativeness.
		query_sim: Cosine similarity s(x) ∈ [0, 1] for the incoming query.
		delta: δ — user-defined maximum error rate.
		Theorem 4.1 guarantee: Pr(correct) ≥ 1 - δ.
		epsilon_grid: ε values for the Eq. 11 min sweep.
		Distinct from δ; controls CI conservativeness.
		return_threshold: Semantic return threshold from config (semantic_return_threshold).
		C-03 fix: during cold start, exploit directly when
		query_sim >= return_threshold instead of always exploring.

		Returns:
		τ̂ ∈ [0.0, 1.0]. Lower = more exploitation.
		Cold start (n < 3): returns 1.0 (always explore).
		Cold start (n < 3): 0.0 if query_sim >= return_threshold, else 1.0.

		@@ -122,3 +126,8 @@ Eq. 11 derivation (from the paper):
		if n < 3:
		return 1.0 # cold start — always explore
		# ARCH-02 note: this function serves dual purpose — (a) warm-phase vCache
		# Eq. 11 tau computation and (b) cold-start similarity gate. The cold-start
		# branch (n < 3) is intentionally simple: if the query is already above the
		# return threshold, serve it (tau=0.0 → exploit); otherwise explore (tau=1.0).
		# C-03: honor return_threshold — avoids 100% miss until 3 samples are accumulated.
		return 0.0 if query_sim >= return_threshold else 1.0

		@@ -150,3 +159,8 @@ # Step 1: Fisher-information SE

		def should_explore(self, query_sim: float, delta: float = 0.05) -> bool:
		def should_explore(
		self,
		query_sim: float,
		delta: float = 0.05,
		return_threshold: float = 1.0,
		) -> bool:
		"""Return True (explore = LLM call) or False (exploit = serve cache).
		@@ -156,3 +170,3 @@
		"""
		tau = self.compute_tau(query_sim, delta=delta)
		tau = self.compute_tau(query_sim, delta=delta, return_threshold=return_threshold)
		return _RNG.random() <= tau
		@@ -159,0 +173,0 @@

+7

-4

src/superlocalmemory/optimize/cache/exact.py

		@@ -6,2 +6,3 @@ """exact.py — SQLite-backed exact-match get/set, cacheable-response guard."""
		import json
		import logging
		import time
		@@ -12,2 +13,3 @@ from typing import Any

		logger = logging.getLogger(__name__)

		@@ -25,2 +27,3 @@ _NON_CACHEABLE_FINISH_REASONS: frozenset[str] = frozenset({
		if finish in _NON_CACHEABLE_FINISH_REASONS:
		logger.debug("exact: skip cache (finish_reason=%r)", finish)
		return False
		@@ -31,8 +34,11 @@ choices = response.get("choices") or []
		if fr in _NON_CACHEABLE_FINISH_REASONS:
		logger.debug("exact: skip cache (choice.finish_reason=%r)", fr)
		return False
		msg = choice.get("message") or {}
		if msg.get("tool_calls"):
		logger.debug("exact: skip cache (choice.message.tool_calls present)")
		return False
		for block in response.get("content") or []:
		if isinstance(block, dict) and block.get("type") == "tool_use":
		logger.debug("exact: skip cache (tool_use content block)")
		return False
		@@ -53,5 +59,2 @@ return True
		return None
		if row.ttl_expires is not None and row.ttl_expires < time.time():
		self._db.delete(key, tenant_id)
		return None
		return json.loads(row.value.decode("utf-8"))
		@@ -82,3 +85,3 @@
		ttl_expires=expires_at,
		tags=[],
		tags=tags,
		)
		@@ -85,0 +88,0 @@ return True

+13

-0

src/superlocalmemory/optimize/cache/key_builder.py

		@@ -7,2 +7,3 @@ """key_builder.py — deterministic SHA-256 cache key, tenant-scoped."""
		import json
		import logging
		import re
		@@ -12,2 +13,4 @@ from dataclasses import dataclass, field

		logger = logging.getLogger(__name__)

		DETERMINISTIC_PARAMS: frozenset[str] = frozenset({
		@@ -67,2 +70,12 @@ "max_tokens", "stop", "stop_sequences", "top_p", "top_k",
		if temperature != 0 and not self._config.allow_nonzero_temperature_cache:
		logger.debug(
		"cache skip: temperature=%.2f allow_nonzero=%s",
		temperature,
		self._config.allow_nonzero_temperature_cache,
		)
		try:
		from superlocalmemory.optimize.metrics.counters import MetricsCollector
		MetricsCollector.get_instance().increment_skipped_temperature()
		except Exception:
		pass
		return None
		@@ -69,0 +82,0 @@

+70

-8

src/superlocalmemory/optimize/cache/manager.py

		@@ -5,2 +5,4 @@ """manager.py — CacheManager orchestrator (singleton, fail-open, stampede-shielded)."""

		import hashlib as _hashlib
		import json as _json_mod
		import logging
		@@ -12,2 +14,5 @@ import threading

		# C-08: precomputed hash for the common "default" tenant — avoids sha256 on every request
		_DEFAULT_TENANT_HASH: str = _hashlib.sha256(b"default").hexdigest()

		from superlocalmemory.optimize.cache.exact import ExactCache
		@@ -235,2 +240,22 @@ from superlocalmemory.optimize.cache.invalidation import InvalidationEngine
		self._metrics.exact_misses += 1

		# C-01: semantic fallback on exact miss (only when explicitly enabled)
		if self._semantic.is_enabled():
		try:
		sem_result = self._semantic.lookup(req, tenant_id, None)
		if sem_result is not None:
		self._metrics.semantic_hits += 1
		if isinstance(sem_result, CachedResponse):
		return sem_result
		# VCacheSemantic may return a response dict — wrap it
		return CachedResponse(
		hit=True,
		data=_json_mod.dumps(sem_result, ensure_ascii=False).encode(),
		cache_key=key,
		ttl_seconds=0,
		)
		except Exception as exc:
		logger.warning("SemanticTier.lookup raised (fail-open): %s", exc)
		self._metrics.semantic_misses += 1

		# Return miss WITH the key so callers can use it for cache storage.
		@@ -264,2 +289,9 @@ return CachedResponse(hit=False, data=None, cache_key=key, ttl_seconds=0)

		# C-02: index in semantic tier after exact write (fail-open)
		if self._semantic.is_enabled():
		try:
		self._semantic.index_entry(req, tenant_id, None, response_dict)
		except Exception as exc:
		logger.warning("SemanticTier.index_entry raised (fail-open): %s", exc)

		# ---- CacheHook protocol implementation (INTERFACE-CONTRACT §3) ----
		@@ -276,3 +308,3 @@
		try:
		result = self.get(req, tenant_id="default")
		result = self.get(req, tenant_id=_DEFAULT_TENANT_HASH)
		if result is not None and not result.hit:
		@@ -288,3 +320,3 @@ MetricsCollector.get_instance().on_miss()
		try:
		self.set(req, resp, tenant_id="default")
		self.set(req, resp, tenant_id=_DEFAULT_TENANT_HASH)
		except Exception as exc:
		@@ -295,11 +327,41 @@ logger.warning("CacheManager.store raised (fail-open): %s", exc)
		"""CacheHook.on_hit() — forward token savings to MetricsCollector.

		Contract §7: cache skip saves BOTH input+output tokens (whole call avoided).
		tokens_saved = input tokens saved (from request).
		Output tokens estimated from response body size (~4 bytes per token).

		M-01: compute input tokens from request body when caller passes 0.
		M-02: parse real output tokens from cached response usage field.
		All counts are estimates for display — not billing-accurate.
		"""
		import json as _json

		# M-01: estimate input tokens from message content
		if tokens_saved == 0 and isinstance(req, ProxyRequest):
		try:
		body = req.body or {}
		total_chars = 0
		for m in (body.get("messages") or []):
		c = m.get("content", "")
		if isinstance(c, str):
		total_chars += len(c)
		elif isinstance(c, list):
		for blk in c:
		if isinstance(blk, dict):
		total_chars += len(blk.get("text", "") or "")
		total_chars += len(body.get("system", "") or "")
		tokens_saved = max(0, total_chars // 4)
		except Exception:
		pass

		# M-02: parse real output tokens from stored response
		output_tokens = 0
		if resp:
		# Rough estimate: 4 bytes per token for English text
		output_tokens = len(resp) // 4
		try:
		data = _json.loads(resp)
		usage = data.get("usage") or {}
		output_tokens = (
		usage.get("output_tokens")
		or usage.get("completion_tokens")
		or 0
		)
		except Exception:
		output_tokens = len(resp) // 4 # fallback byte-estimate

		MetricsCollector.get_instance().on_hit(
		@@ -306,0 +368,0 @@ tokens_saved_input=tokens_saved,

+10

-5

src/superlocalmemory/optimize/cache/semantic.py

		@@ -297,3 +297,4 @@ # optimize/cache/semantic.py
		delta = float(getattr(cfg, "semantic_error_target", _DEFAULT_ERROR_TARGET))
		if record.should_explore(best_score, delta=delta):
		sem_return_threshold = float(getattr(cfg, "semantic_return_threshold", _DEFAULT_RETURN_THRESHOLD))
		if record.should_explore(best_score, delta=delta, return_threshold=sem_return_threshold):
		logger.debug(
		@@ -367,7 +368,7 @@ "VCacheSemantic: explore (score=%.4f entry=%s t_hat=%.4f)",
		entries: list[tuple[str, str, np.ndarray]] = []
		for entry_id, blob in rows:
		for entry_id, blob, ctx_fp in rows: # C-10: unpack persisted context_fp
		try:
		v = np.frombuffer(blob, dtype=np.float32).copy()
		if v.shape[0] == _EMBED_DIM:
		entries.append((entry_id, "", v))
		entries.append((entry_id, ctx_fp, v))
		except Exception:
		@@ -427,3 +428,3 @@ continue

		# Store vector in DB
		# Store vector in DB — C-10: persist context_fp alongside the vector
		vec_bytes = vec.tobytes()
		@@ -434,3 +435,7 @@ self._db.vec_add(
		vector=vec_bytes,
		meta={"model": "nomic-ai/nomic-embed-text-v1.5", "dim": _EMBED_DIM},
		meta={
		"model": "nomic-ai/nomic-embed-text-v1.5",
		"dim": _EMBED_DIM,
		"context_fp": context_fp,
		},
		)
		@@ -437,0 +442,0 @@

+1

-7

src/superlocalmemory/optimize/compress/prose_llmlingua.py

		@@ -35,12 +35,6 @@ # compress/prose_llmlingua.py
		self,
		model_name: str = _MODEL_BERT,
		model_name: str = _MODEL_XLM,
		device_map: str = "cpu",
		rate: float = _DEFAULT_RATE,
		) -> None:
		import os
		if os.environ.get("SLM_DISABLE_HF_DOWNLOAD", "0") == "1":
		raise ImportError(
		"LLMLingua-2 model download blocked: SLM_DISABLE_HF_DOWNLOAD=1. "
		"Set compress_llmlingua_allow_download=true in optimize.json."
		)
		try:
		@@ -47,0 +41,0 @@ from llmlingua import PromptCompressor # type: ignore[import]

+81

-86

src/superlocalmemory/optimize/compress/router.py

		@@ -29,3 +29,2 @@ # compress/router.py
		_MIN_CHARS_FOR_COMPRESSION: int = 500
		_MIN_RATIO_STRUCTURED: float = 0.60

		@@ -54,4 +53,2 @@
		def __init__(self) -> None:
		self._json_compressor: "JSONCompressor \| None" = None
		self._code_compressor: "CodeCompressor \| None" = None
		self._llmlingua_compressor: "LLMLinguaCompressor \| None" = None
		@@ -108,8 +105,13 @@ self._ccr_store: "CCRStore \| None" = None

		if tokens_after >= tokens_before:
		return req # no improvement

		# S-01/Stage-9 fix: build new_bytes BEFORE the improvement guard.
		# Layer 1 normalization saves characters (not word-count tokens), so
		# tokens_after == tokens_before for "normalize" strategy. Checking byte
		# length of the serialized body correctly detects Layer 1 savings.
		body["messages"] = new_messages
		new_bytes = json.dumps(body, ensure_ascii=False, separators=(",", ":")).encode()

		bytes_saved = len(req.body_bytes) - len(new_bytes)
		if bytes_saved <= 0 and tokens_after >= tokens_before:
		return req # neither bytes nor tokens improved

		# CONTRACT §3: fire on_compress metrics callback
		@@ -166,3 +168,8 @@ lossy = strategy == "llmlingua2_prose"

		protect_indices = set(range(max(0, len(messages) - protect_recent), len(messages)))
		# K-05: protect last N user turns, not last N messages of any role
		user_indices = [i for i, m in enumerate(messages) if m.get("role") == "user"]
		protect_indices: set[int] = set(user_indices[-protect_recent:]) if protect_recent > 0 else set()
		# Always protect the very last message (current turn, any role)
		if messages:
		protect_indices.add(len(messages) - 1)

		@@ -255,92 +262,73 @@ for idx, msg in enumerate(messages):

		# JSON detection
		if len(text) < _MIN_CHARS_FOR_COMPRESSION:
		return text, tokens_before, tokens_before, "none"

		# K-01/K-02/K-03: NEVER compress structured content (JSON or code)
		stripped = text.strip()
		if stripped.startswith(("{", "[")):
		# PERF-02: for large content, structural bracket-match avoids O(n) json.loads().
		# Conservative: matching outer brackets → treat as JSON and skip compression.
		# K-01 mandate is safety-first: false-positive (non-JSON treated as JSON) is
		# safe; false-negative (JSON compressed) would be a correctness violation.
		_last = stripped[-1] if stripped else ""
		if len(stripped) > 8192 and (
		(stripped[0] == "{" and _last == "}") or (stripped[0] == "[" and _last == "]")
		):
		return text, tokens_before, tokens_before, "none" # large JSON → passthrough
		try:
		parsed = json.loads(stripped)
		compressor = self._get_json_compressor()
		compressed = compressor.compress(parsed)
		tokens_after = _token_estimate_structured(compressed)
		tokens_before_adj = _token_estimate_structured(text)
		ratio = tokens_after / tokens_before_adj if tokens_before_adj else 1.0
		if ratio < _MIN_RATIO_STRUCTURED:
		# B-03: store-before-compress
		ccr_id = self._ccr_store_original(text.encode(), model, tenant_id)
		if ccr_id:
		try:
		obj = json.loads(compressed)
		if isinstance(obj, dict):
		obj["__slm_ccr__"] = ccr_id
		compressed = json.dumps(obj, ensure_ascii=False, separators=(",", ":"))
		elif isinstance(obj, list):
		# RB-02: list-root embedding
		wrapper = {"__slm_ccr__": ccr_id, "__slm_data__": obj}
		compressed = json.dumps(wrapper, ensure_ascii=False, separators=(",", ":"))
		except Exception:
		pass
		self._ccr_update_compressed(ccr_id, compressed.encode())
		logger.debug("[%s] JSON compressed %.2f ratio ccr_id=%s", request_id, ratio, ccr_id)
		return compressed, tokens_before_adj, tokens_after, "extractive_json"
		return text, tokens_before, tokens_before, "none"
		except (json.JSONDecodeError, Exception):
		pass
		json.loads(stripped)
		return text, tokens_before, tokens_before, "none" # valid JSON → passthrough
		except json.JSONDecodeError:
		pass # not valid JSON — treat as prose
		except Exception as exc:
		logger.warning("compress: unexpected error probing JSON content: %s", exc)

		# Code detection
		lang = _detect_language(text)
		if lang is not None:
		compressor = self._get_code_compressor()
		# RB-03: compress first, compute ratio, then store CCR only if beneficial
		compressed_probe = compressor.compress(text, language=lang, ccr_id="")
		tokens_after = _token_estimate(compressed_probe)
		ratio = tokens_after / tokens_before if tokens_before else 1.0
		if ratio < _MIN_RATIO_STRUCTURED:
		# B-03: store-before-compress
		ccr_id = self._ccr_store_original(text.encode(), model, tenant_id)
		compressed = compressor.compress(text, language=lang, ccr_id=ccr_id)
		if ccr_id:
		self._ccr_update_compressed(ccr_id, compressed.encode())
		logger.debug("[%s] Code compressed lang=%s ratio=%.2f ccr_id=%s",
		request_id, lang, ratio, ccr_id)
		return compressed, tokens_before, tokens_after, "extractive_code"
		return text, tokens_before, tokens_before, "none"
		if _detect_language(text) is not None:
		return text, tokens_before, tokens_before, "none" # code → passthrough

		# Prose: only if aggressive mode AND compress_prose is enabled
		# (Phase 3 — opt-in prose tier; off by default; gated by config
		# field compress_prose added in LLD-04 INTERFACE-CONTRACT v2.)
		# Layer 1 — lossless whitespace normalization (always-on, safe)
		normalized = self._normalize_whitespace(text)
		tokens_after_l1 = _token_estimate(normalized)

		# Layer 2 — LLMLingua-2 prose compression (aggressive + opt-in only)
		cfg = self._get_config()
		prose_enabled = bool(getattr(cfg, "compress_prose", False))
		if aggressive and prose_enabled:
		if aggressive and prose_enabled: # pragma: no cover — LLMLingua optional dep
		compressor = self._get_llmlingua_compressor()
		if compressor is not None:
		# B-03: store-before-compress
		# B-03: store original BEFORE lossy compression
		ccr_id = self._ccr_store_original(text.encode(), model, tenant_id)
		compressed = compressor.compress(text)
		compressed = compressor.compress(normalized)
		if ccr_id:
		self._ccr_update_compressed(ccr_id, compressed.encode())
		tokens_after = _token_estimate(compressed)
		logger.info(
		"[%s] LLMLingua-2 prose compressed rate=%.2f ccr_id=%s (LOSSY)",
		request_id,
		tokens_after / tokens_before if tokens_before else 1.0,
		ccr_id,
		)
		return compressed, tokens_before, tokens_after, "llmlingua2_prose"
		tokens_after_l2 = _token_estimate(compressed)
		if tokens_after_l2 < tokens_before:
		logger.info(
		"[%s] LLMLingua-2 prose compressed rate=%.2f ccr_id=%s (LOSSY)",
		request_id,
		tokens_after_l2 / tokens_before if tokens_before else 1.0,
		ccr_id,
		)
		return compressed, tokens_before, tokens_after_l2, "llmlingua2_prose"

		# S-01 fix: compare character length, not word count.
		# _token_estimate() is word-count — whitespace normalization saves characters/bytes
		# but never removes words, so token counts are identical before and after Layer 1.
		# Character comparison correctly detects when normalization reduced the body size.
		if len(normalized) < len(text):
		return normalized, tokens_before, tokens_after_l1, "normalize"

		return text, tokens_before, tokens_before, "none"

		@staticmethod
		def _normalize_whitespace(text: str) -> str:
		"""Layer 1 lossless: collapse excess blank lines, strip trailing spaces per line."""
		import re
		text = re.sub(r"\n{3,}", "\n\n", text)
		lines = [line.rstrip() for line in text.split("\n")]
		return "\n".join(lines)

		# ── Lazy loaders ─────────────────────────────────────────────────────

		def _get_json_compressor(self) -> "JSONCompressor":
		if self._json_compressor is None:
		from superlocalmemory.optimize.compress.extractive_json import JSONCompressor
		self._json_compressor = JSONCompressor()
		return self._json_compressor

		def _get_code_compressor(self) -> "CodeCompressor":
		if self._code_compressor is None:
		from superlocalmemory.optimize.compress.extractive_code import CodeCompressor
		self._code_compressor = CodeCompressor()
		return self._code_compressor

		def _get_llmlingua_compressor(self) -> "LLMLinguaCompressor \| None":
		def _get_llmlingua_compressor(self) -> "LLMLinguaCompressor \| None": # pragma: no cover
		if self._llmlingua_compressor is None:
		@@ -404,8 +392,11 @@ try:
		tokens_before=tb, tokens_after=ta,
		lossy=strat == "llmlingua2_prose",
		)
		except Exception as exc:
		logger.debug("compress_text failed (non-fatal): %s", exc)
		t = _token_estimate(text)
		return CompressTextResult(
		compressed_text=text, strategy="none",
		tokens_before=len(text.split()), tokens_after=len(text.split()),
		tokens_before=t, tokens_after=t,
		lossy=False,
		)
		@@ -416,6 +407,14 @@
		class CompressTextResult:
		"""Result of a compress_text() call.

		UX-02 note: lossy=True only when strategy="llmlingua2_prose" (Layer 2).
		In the default install (LLMLingua optional dep not installed), lossy is
		always False — install `llmlingua>=0.2.0` and set compress_prose=True +
		compress_mode="aggressive" to activate lossy compression.
		"""
		compressed_text: str
		strategy: str # "extractive_json" \| "extractive_code" \| "llmlingua2_prose" \| "none"
		strategy: str # "normalize" \| "llmlingua2_prose" \| "none"
		tokens_before: int
		tokens_after: int
		lossy: bool = False # K-10: True only for llmlingua2_prose (Layer 2)

		@@ -429,6 +428,2 @@

		def _token_estimate_structured(text: str) -> int:
		return max(1, len(text) // 4) if text else 0


		def _msg_has_tool_result(msg: dict) -> bool:
		@@ -435,0 +430,0 @@ """B-09: Detect historical tool_result blocks in messages."""

+16

-0

src/superlocalmemory/optimize/config/__init__.py

		@@ -27,2 +27,18 @@ """Module-level accessor for OptimizeConfig (INTERFACE-CONTRACT §2).

		def get_shared_store() -> "ConfigStore":
		"""Return the process-wide ConfigStore singleton, creating it on first use.

		This is the SINGLE instance the daemon, the /api/optimize routes, the proxy
		hook-reload callback, and the hot-reload watchdog must all share so that a UI
		config change reaches the live proxy (fixes W-02 hot-reload + W-05 fresh-store
		-per-request). The daemon calls this at startup, registers a change callback,
		and starts the watchdog.
		"""
		global _store
		if _store is None:
		from superlocalmemory.optimize.config.store import ConfigStore
		_store = ConfigStore()
		return _store


		def _set_config_store(store: "ConfigStore") -> None:
		@@ -29,0 +45,0 @@ global _store

+1

-6

src/superlocalmemory/optimize/config/defaults.py

		@@ -26,11 +26,6 @@ """Default OptimizeConfig singleton — import-safe."""
		semantic_centroid_min_similarity=0.85,
		compress_enabled=True,
		compress_enabled=False,
		compress_mode="safe",
		compress_code=True,
		compress_json=True,
		compress_prose=False,
		compress_ccr=True,
		compress_align=True,
		compress_protect_recent=4,
		compress_llmlingua_allow_download=False,
		ttl_seconds=86400,
		@@ -37,0 +32,0 @@ ttl=TTLConfig(),

+2

-19

src/superlocalmemory/optimize/config/schema.py

		@@ -92,11 +92,6 @@ """Typed schema for optimize.json.
		# Compress
		compress_enabled: bool = True
		compress_enabled: bool = False
		compress_mode: str = "safe"
		compress_code: bool = True
		compress_json: bool = True
		compress_prose: bool = False
		compress_ccr: bool = True
		compress_align: bool = True
		compress_protect_recent: int = 4
		compress_llmlingua_allow_download: bool = False

		@@ -186,13 +181,6 @@ # TTL + providers + pricing
		),
		compress_enabled=bool(d.get("compress_enabled", True)),
		compress_enabled=bool(d.get("compress_enabled", False)),
		compress_mode=str(d.get("compress_mode", "safe")),
		compress_code=bool(d.get("compress_code", True)),
		compress_json=bool(d.get("compress_json", True)),
		compress_prose=bool(d.get("compress_prose", False)),
		compress_ccr=bool(d.get("compress_ccr", True)),
		compress_align=bool(d.get("compress_align", True)),
		compress_protect_recent=int(d.get("compress_protect_recent", 4)),
		compress_llmlingua_allow_download=bool(
		d.get("compress_llmlingua_allow_download", False)
		),
		ttl_seconds=int(d.get("ttl_seconds", 86400)),
		@@ -235,9 +223,4 @@ providers=providers,
		"compress_mode": self.compress_mode,
		"compress_code": self.compress_code,
		"compress_json": self.compress_json,
		"compress_prose": self.compress_prose,
		"compress_ccr": self.compress_ccr,
		"compress_align": self.compress_align,
		"compress_protect_recent": self.compress_protect_recent,
		"compress_llmlingua_allow_download": self.compress_llmlingua_allow_download,
		"ttl_seconds": self.ttl_seconds,
		@@ -244,0 +227,0 @@ "providers": {k: v.as_dict() for k, v in self.providers.items()},

+15

-1

src/superlocalmemory/optimize/config/store.py

		@@ -77,3 +77,9 @@ """ConfigStore — reads, writes, and hot-reloads optimize.json.
		def save(self, config: OptimizeConfig) -> None:
		"""Write config to optimize.json with version bump."""
		"""Write config to optimize.json with version bump.

		Fires registered change callbacks immediately after a successful write
		(outside the lock) so a UI/CLI save reaches the live proxy without
		waiting for the 2s watchdog poll. The watchdog skips this write via
		``_saved_by_self`` so callbacks fire exactly once.
		"""
		config.validate()
		@@ -97,2 +103,10 @@ with self._lock:
		self._saved_by_self = False
		callbacks = list(self._change_callbacks)
		# Fire callbacks OUTSIDE the lock — a callback may rebuild proxy hooks
		# or call back into get(); keeping them off the lock avoids contention.
		for cb in callbacks:
		try:
		cb(new_cfg)
		except Exception as exc:
		logger.warning("ConfigStore save callback error: %s", exc)

		@@ -99,0 +113,0 @@ def start_watchdog(self) -> None:

+15

-7

src/superlocalmemory/optimize/metrics/counters.py

		@@ -27,3 +27,3 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
		on_miss(): cache miss
		on_compress(bytes_original, bytes_after): compress ran
		on_compress(tokens_before, tokens_after): compress ran (word-count proxy)
		on_eviction(): entry expired/evicted
		@@ -73,10 +73,13 @@ """

		def on_compress(self, bytes_original: int, bytes_after: int) -> None:
		"""Record a compression run with byte counts."""
		def on_compress(self, tokens_before: int, tokens_after: int) -> None:
		"""Record a compression run. Arguments are word-count proxy estimates from _token_estimate().

		M-03: consistent naming — these are token estimates, not byte counts.
		Stored in compress_bytes_original/after fields for DB schema compat; unit is word-count.
		"""
		with self._data_lock:
		self._compress_runs += 1
		self._compress_bytes_original += max(0, bytes_original)
		self._compress_bytes_after += max(0, bytes_after)
		saved_tokens = max(0, bytes_original - bytes_after)
		self._tokens_saved_compress += saved_tokens
		self._compress_bytes_original += max(0, tokens_before)
		self._compress_bytes_after += max(0, tokens_after)
		self._tokens_saved_compress += max(0, tokens_before - tokens_after)

		@@ -88,2 +91,7 @@ def on_eviction(self) -> None:

		def increment_skipped_temperature(self) -> None:
		"""C-04: record a cache skip due to non-zero temperature."""
		with self._data_lock:
		self._calls_skipped += 1

		def record_latency(self, ms: float) -> None:
		@@ -90,0 +98,0 @@ """Record latency overhead in milliseconds."""

+100

-2

src/superlocalmemory/optimize/proxy/_helpers.py

		@@ -361,2 +361,3 @@ """_helpers.py — Shared HTTP utilities used by all surface modules."""
		on_complete: "Callable[[bytes], Any] \| None" = None,
		max_accumulate: int \| None = None,
		) -> Response \| StreamingResponse:
		@@ -392,6 +393,8 @@ """Stream-forward with optional post-stream cache-store callback.
		acc: list[bytes] = []
		acc_bytes = 0
		acc_capped = False
		complete_called = False

		async def _generate() -> AsyncIterator[bytes]:
		nonlocal complete_called
		nonlocal complete_called, acc_bytes, acc_capped
		stream_error = False
		@@ -404,3 +407,13 @@ try:
		if chunk:
		acc.append(chunk)
		# Always forward to the client; only bound what we hold
		# in memory for the on_complete callback (CWE-400).
		if max_accumulate is None or acc_bytes < max_accumulate:
		acc.append(chunk)
		acc_bytes += len(chunk)
		elif not acc_capped:
		acc_capped = True
		logger.debug(
		"[%s] stream accumulator capped at %d bytes",
		request_id, max_accumulate,
		)
		yield chunk
		@@ -453,2 +466,87 @@ except httpx.RemoteProtocolError as exc:

		async def capture_passthrough_forward(
		proxy: Any,
		request: Request,
		*,
		provider: str,
		upstream_url: str,
		allowed_headers: frozenset,
		request_id: str,
		model_hint: str = "",
		sse_parser: "Callable[[bytes], bytes \| None] \| None" = None,
		is_stream: bool = False,
		) -> Response \| StreamingResponse:
		"""Shadow-capture passthrough (v3.6.10, plan §7).

		Pure passthrough to upstream + record the exchange to the capture corpus.
		NO cache, NO compression — capture mode observes only authentic traffic.
		Fail-open: a capture or forward error degrades to a normal forward/error
		response; the user's request is never blocked by capture.
		"""
		from superlocalmemory.optimize.proxy.capture import (
		extract_usage,
		record_exchange_async,
		)

		body_bytes = await request.body()
		fwd_headers = _build_forward_headers(request, allowed_headers)
		fwd_headers["content-length"] = str(len(body_bytes))

		if is_stream:
		async def _on_complete(acc: bytes) -> None:
		parsed = sse_parser(acc) if sse_parser else None
		payload = parsed if parsed is not None else acc
		itok, otok, mdl = extract_usage(provider, parsed)
		await record_exchange_async(
		provider=provider,
		model=mdl or model_hint,
		request_body=body_bytes,
		response_body=payload,
		content_type="text/event-stream",
		input_tokens=itok,
		output_tokens=otok,
		status_code=200,
		stream=True,
		)

		# Bound the in-memory accumulator (CWE-400): the corpus only keeps the
		# first 1 MB per side anyway, so cap accumulation there.
		from superlocalmemory.optimize.proxy.capture import _MAX_CAPTURE_BODY_BYTES
		return await _stream_and_cache_forward(
		proxy, request_id, fwd_headers, body_bytes, upstream_url,
		on_complete=_on_complete,
		max_accumulate=_MAX_CAPTURE_BODY_BYTES,
		)

		if proxy.http_client is None:
		return await _fail_open_forward(proxy, request, upstream_url)
		try:
		upstream_resp = await proxy.http_client.post(
		upstream_url, content=body_bytes, headers=fwd_headers,
		)
		except Exception as exc:
		logger.error("[%s] capture passthrough upstream error: %r", request_id, exc)
		return await _fail_open_forward(proxy, request, upstream_url)

		resp_bytes = upstream_resp.content
		itok, otok, mdl = extract_usage(provider, resp_bytes)
		await record_exchange_async(
		provider=provider,
		model=mdl or model_hint,
		request_body=body_bytes,
		response_body=resp_bytes,
		content_type="application/json",
		input_tokens=itok,
		output_tokens=otok,
		status_code=upstream_resp.status_code,
		stream=False,
		)
		return Response(
		content=resp_bytes,
		status_code=upstream_resp.status_code,
		media_type="application/json",
		headers=_filter_response_headers(dict(upstream_resp.headers)),
		)


		async def _safe_cache_check(hooks: HookChain, ctx: ProxyRequest) -> CachedResponse:
		@@ -455,0 +553,0 @@ try:

+12

-0

src/superlocalmemory/optimize/proxy/anthropic_surface.py

		@@ -28,3 +28,5 @@ """anthropic_surface.py — Anthropic Messages + count_tokens + models surfaces."""
		_stream_forward,
		capture_passthrough_forward,
		)
		from superlocalmemory.optimize.proxy.capture import capture_enabled
		from superlocalmemory.optimize.proxy.lifecycle import ProviderResponse, ProxyRequest
		@@ -196,2 +198,12 @@
		has_tools = _body_has_tools(body)

		# v3.6.10 shadow-capture (plan §7): pure passthrough + corpus record.
		if capture_enabled():
		return await capture_passthrough_forward(
		proxy, request, provider="anthropic", upstream_url=upstream_url,
		allowed_headers=_ANTHROPIC_FORWARD_HEADERS, request_id=request_id,
		model_hint=str(body.get("model", "")),
		sse_parser=_parse_sse_to_json, is_stream=stream,
		)

		ctx = ProxyRequest(
		@@ -198,0 +210,0 @@ provider="anthropic",

+31

-0

src/superlocalmemory/optimize/proxy/gemini_surface.py

		@@ -50,3 +50,6 @@ """gemini_surface.py — Gemini native and OpenAI-compat surfaces.
		_stream_forward,
		capture_passthrough_forward,
		)
		from superlocalmemory.optimize.proxy.capture import capture_enabled
		from superlocalmemory.optimize.proxy.openai_surface import _parse_openai_sse_to_json
		from superlocalmemory.optimize.proxy.lifecycle import ProviderResponse, ProxyRequest
		@@ -242,2 +245,20 @@

		# v3.6.10 shadow-capture (plan §7): pure passthrough + corpus record.
		if capture_enabled():
		cap_model = model_and_method.split(":", 1)[0].replace("models/", "")
		cap_url = upstream_url
		if stream:
		_allowed = {
		k: v for k, v in request.query_params.items()
		if k.lower() in _GEMINI_ALLOWED_QUERY_PARAMS
		}
		_allowed["alt"] = "sse"
		cap_url = f"{upstream_url}?{urllib.parse.urlencode(_allowed)}"
		return await capture_passthrough_forward(
		proxy, request, provider="gemini", upstream_url=cap_url,
		allowed_headers=_GEMINI_NATIVE_FORWARD_HEADERS, request_id=request_id,
		model_hint=cap_model,
		sse_parser=_parse_gemini_sse_to_json, is_stream=stream,
		)

		ctx = ProxyRequest(
		@@ -412,2 +433,12 @@ provider="gemini",

		# v3.6.10 shadow-capture (plan §7): pure passthrough + corpus record.
		if capture_enabled():
		return await capture_passthrough_forward(
		proxy, request, provider="gemini-openai-compat",
		upstream_url=upstream_url,
		allowed_headers=_GEMINI_OPENAI_COMPAT_FORWARD_HEADERS,
		request_id=request_id, model_hint=str(body.get("model", "")),
		sse_parser=_parse_openai_sse_to_json, is_stream=stream,
		)

		ctx = ProxyRequest(
		@@ -414,0 +445,0 @@ provider="gemini-openai-compat",

+12

-0

src/superlocalmemory/optimize/proxy/openai_surface.py

		@@ -36,3 +36,5 @@ """openai_surface.py — OpenAI /v1/chat/completions and /v1/embeddings.
		_stream_forward,
		capture_passthrough_forward,
		)
		from superlocalmemory.optimize.proxy.capture import capture_enabled
		from superlocalmemory.optimize.proxy.lifecycle import ProviderResponse, ProxyRequest
		@@ -320,2 +322,12 @@
		has_tools = _body_has_tools(body)

		# v3.6.10 shadow-capture (plan §7): pure passthrough + corpus record.
		if capture_enabled():
		return await capture_passthrough_forward(
		proxy, request, provider="openai", upstream_url=upstream_url,
		allowed_headers=_OPENAI_FORWARD_HEADERS, request_id=request_id,
		model_hint=str(body.get("model", "")),
		sse_parser=_parse_openai_sse_to_json, is_stream=stream,
		)

		ctx = ProxyRequest(
		@@ -322,0 +334,0 @@ provider="openai", method="POST", path="/v1/chat/completions",

+29

-0

src/superlocalmemory/optimize/proxy/server.py

		@@ -78,3 +78,21 @@ """server.py — ProxyApp and build_proxy_router()."""

		def reload_from_config(self, config: OptimizeConfig) -> None:
		"""Hot-swap cache/compress behavior when optimize.json changes (v3.6.10).

		Called by the ConfigStore change-callback (UI save → immediate; external
		file/CLI edit → within the 2s watchdog poll). Rebuilds the HookChain so
		``cache_enabled`` and ``compress_enabled`` can be toggled INDEPENDENTLY
		at runtime with no daemon restart. Note: ``proxy_enabled`` (whether the
		proxy claims /v1/* at all) is a startup decision and is NOT changed here.
		"""
		self.config = config
		self.hooks = _load_hooks(config)
		logger.info(
		"slm.optimize.proxy reloaded (config v%s): cache=%s compress=%s",
		getattr(config, "config_version", "?"),
		type(self.hooks.cache).__name__ if self.hooks.cache else "None",
		type(self.hooks.compress).__name__ if self.hooks.compress else "None",
		)


		def build_proxy_router(proxy: ProxyApp) -> APIRouter:
		@@ -136,2 +154,13 @@ """Build and return the FastAPI router for all proxy surfaces."""
		def _load_hooks(config: OptimizeConfig) -> HookChain:
		# v3.6.10 shadow-capture (plan §7): capture mode is PURE passthrough — no
		# cache, no compression — so the corpus records only authentic upstream
		# exchanges. This is defense-in-depth alongside the per-surface guard.
		from superlocalmemory.optimize.proxy.capture import capture_enabled
		if capture_enabled():
		logger.info(
		"slm.optimize.proxy: SLM_OPTIMIZE_CAPTURE on — cache/compress "
		"DISABLED, recording exchanges to optimize_capture.jsonl"
		)
		return HookChain.empty()

		cache_hook = None
		@@ -138,0 +167,0 @@ compress_hook = None

+78

-11

src/superlocalmemory/optimize/storage/db.py

		@@ -36,2 +36,3 @@ """CacheDB — wraps DatabaseManager for llmcache.db operations.
		import os
		import re
		import sqlite3
		@@ -56,2 +57,21 @@ import struct
		_DEFAULT_TENANT: str = "default"
		_TENANT_HEX64 = re.compile(r"[0-9a-f]{64}")


		def _normalize_tenant_id(tenant_id: str) -> str:
		"""Match CacheManager.build_key tenant normalization (v3.6.10 fix).

		The proxy stores entries under SHA-256(tenant) when the tenant is not
		already a 64-char hex digest. Public tenant-scoped helpers (entry_count,
		clear_tenant, entry_exists) MUST apply the same hashing or they query the
		wrong tenant — the cause of the dashboard reporting "0 entries" while the
		cache is in fact populated.
		"""
		tid = tenant_id or _DEFAULT_TENANT
		if _TENANT_HEX64.fullmatch(tid):
		return tid
		import hashlib as _hashlib
		return _hashlib.sha256(tid.encode()).hexdigest()


		_ZLIB_LEVEL: int = 6
		@@ -66,2 +86,5 @@ _AES_NONCE_BYTES: int = 12

		# C-06: persisted AES key — survives machine-id changes after first run
		_KEY_FILE: Path = Path.home() / LLMCACHE_DIRNAME / "opt-key.bin"

		_FORBIDDEN_MEMORY_TABLES: frozenset[str] = frozenset({
		@@ -80,3 +103,9 @@ "memories", "atomic_facts", "profiles", "canonical_entities",
		class MetricsSnapshot:
		"""Mirror of llmcache_metrics columns — names MUST match exactly."""
		"""Mirror of llmcache_metrics columns — names MUST match exactly.

		S-03 / M-03 note: compress_bytes_original and compress_bytes_after store
		WORD-COUNT proxy values (len(text.split())), NOT byte counts. The column
		names use "bytes" for DB schema backward compatibility. Treat these fields
		as token-count proxies, not literal byte measurements.
		"""
		id: int = 1
		@@ -93,4 +122,4 @@ hits: int = 0
		compress_runs: int = 0
		compress_bytes_original: int = 0
		compress_bytes_after: int = 0
		compress_bytes_original: int = 0 # unit: word-count proxy (see S-03 note above)
		compress_bytes_after: int = 0 # unit: word-count proxy (see S-03 note above)
		cache_size_bytes: int = 0
		@@ -217,4 +246,3 @@ cache_entry_count: int = 0
		self._salt = self._load_or_create_salt()
		machine_id = self._get_machine_id()
		self._aes_key = self._derive_aes_key(machine_id, self._salt)
		self._aes_key = self._get_or_persist_aes_key(self._salt)
		self.assert_no_memory_db_tables()
		@@ -301,2 +329,29 @@

		def _get_or_persist_aes_key(self, salt: bytes) -> bytes:
		"""C-06: Load persisted key from disk, or derive + persist on first run.

		Surviving a machine-id change: after first derivation the key is saved to
		opt-key.bin (0600). On subsequent starts the file is read directly,
		so changing the underlying machine-id string cannot invalidate existing
		cache entries.
		"""
		try:
		if _KEY_FILE.exists():
		key = _KEY_FILE.read_bytes()
		if len(key) == 32:
		return key
		except Exception as exc:
		logger.warning("CacheDB: could not read persisted AES key: %s", exc)

		# First run (or corrupted file): derive from machine-id and persist.
		machine_id = self._get_machine_id()
		key = self._derive_aes_key(machine_id, salt)
		try:
		_KEY_FILE.parent.mkdir(parents=True, exist_ok=True)
		_KEY_FILE.write_bytes(key)
		os.chmod(_KEY_FILE, 0o600)
		except Exception as exc:
		logger.warning("CacheDB: could not persist AES key (fail-open): %s", exc)
		return key

		def _derive_aes_key(self, machine_id: str, salt: bytes) -> bytes:
		@@ -636,7 +691,8 @@ kdf = PBKDF2HMAC(
		model_name = str(meta.get("model", "nomic-ai/nomic-embed-text-v1.5"))
		context_fp = str(meta.get("context_fp", "")) # C-10: persist context fingerprint
		self._db.execute(
		"INSERT OR REPLACE INTO llmcache_semantic_vectors "
		"(entry_id, tenant_id, vector_blob, vector_dim, model_name) "
		"VALUES (?, ?, ?, ?, ?)",
		(entry_id, tenant_id, vector, dim, model_name),
		"(entry_id, tenant_id, vector_blob, vector_dim, model_name, context_fp) "
		"VALUES (?, ?, ?, ?, ?, ?)",
		(entry_id, tenant_id, vector, dim, model_name, context_fp),
		)
		@@ -866,10 +922,18 @@ except sqlite3.Error as exc:

		def get_all_vectors(self, tenant_id: str) -> list[tuple[str, bytes]]:
		def get_all_vectors(self, tenant_id: str) -> list[tuple[str, bytes, str]]:
		"""Return (entry_id, vector_blob, context_fp) for all vectors in a tenant.

		C-10: context_fp is included so _lazy_warm_tenant() can restore it without
		recomputing embeddings from messages that may no longer be in scope.
		"""
		try:
		rows = self._db.execute(
		"SELECT entry_id, vector_blob FROM llmcache_semantic_vectors "
		"SELECT entry_id, vector_blob, context_fp FROM llmcache_semantic_vectors "
		"WHERE tenant_id = ?",
		(tenant_id,),
		)
		return [(dict(r)["entry_id"], dict(r)["vector_blob"]) for r in rows]
		return [
		(dict(r)["entry_id"], dict(r)["vector_blob"], dict(r).get("context_fp", ""))
		for r in rows
		]
		except sqlite3.Error as exc:
		@@ -931,2 +995,3 @@ logger.warning("CacheDB.get_all_vectors failed: %s", exc)
		def entry_exists(self, cache_key: str, tenant_id: str = _DEFAULT_TENANT) -> bool:
		tenant_id = _normalize_tenant_id(tenant_id)
		try:
		@@ -942,2 +1007,3 @@ rows = self._db.execute(
		def clear_tenant(self, tenant_id: str) -> int:
		tenant_id = _normalize_tenant_id(tenant_id)
		try:
		@@ -969,2 +1035,3 @@ with self._db.transaction():
		def entry_count(self, tenant_id: str = _DEFAULT_TENANT) -> int:
		tenant_id = _normalize_tenant_id(tenant_id)
		try:
		@@ -971,0 +1038,0 @@ rows = self._db.execute(

+11

-0

src/superlocalmemory/optimize/storage/schema.py

		@@ -56,2 +56,3 @@ """DDL for llmcache.db — the Optimize module's dedicated SQLite database.
		model_name TEXT NOT NULL DEFAULT 'nomic-ai/nomic-embed-text-v1.5',
		context_fp TEXT NOT NULL DEFAULT '',
		created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now'))
		@@ -131,2 +132,3 @@ )
		Also seeds the single llmcache_metrics row (id=1) via INSERT OR IGNORE.
		C-10: adds context_fp column to existing DBs via ALTER TABLE migration.
		"""
		@@ -136,2 +138,11 @@ for stmt in _DDL_STATEMENTS:
		conn.execute("INSERT OR IGNORE INTO llmcache_metrics(id) VALUES (1)")
		# C-10 migration: add context_fp column if missing (existing installs pre-v3.6.10)
		existing_cols = {
		row[1]
		for row in conn.execute("PRAGMA table_info(llmcache_semantic_vectors)")
		}
		if "context_fp" not in existing_cols:
		conn.execute(
		"ALTER TABLE llmcache_semantic_vectors ADD COLUMN context_fp TEXT NOT NULL DEFAULT ''"
		)
		row = conn.execute(
		@@ -138,0 +149,0 @@ "SELECT 1 FROM llmcache_schema_version WHERE version = ?",

+6

-8

src/superlocalmemory/server/routes/optimize.py

		@@ -35,5 +35,3 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
		compress_mode: Literal['safe', 'aggressive'] \| None = None
		compress_code: bool \| None = None
		compress_prose: bool \| None = None
		compress_ccr: bool \| None = None

		@@ -45,4 +43,4 @@
		try:
		from superlocalmemory.optimize.config.store import ConfigStore
		cfg = ConfigStore().get()
		from superlocalmemory.optimize.config import get_shared_store
		cfg = get_shared_store().get()
		return cfg.as_dict()
		@@ -59,4 +57,4 @@ except Exception as exc:
		import dataclasses
		from superlocalmemory.optimize.config.store import ConfigStore
		store = ConfigStore()
		from superlocalmemory.optimize.config import get_shared_store
		store = get_shared_store()
		cfg = store.get()
		@@ -86,7 +84,7 @@ updates: dict[str, Any] = {}
		try:
		from superlocalmemory.optimize.config.store import ConfigStore
		from superlocalmemory.optimize.config import get_shared_store
		from superlocalmemory.optimize.metrics.counters import get_metrics
		from superlocalmemory.optimize.storage.db import CacheDB

		cfg = ConfigStore().get()
		cfg = get_shared_store().get()
		collector = get_metrics()
		@@ -93,0 +91,0 @@ db = CacheDB()

+3

-1

src/superlocalmemory/ui/js/auto-settings.js

		@@ -327,6 +327,8 @@ // SuperLocalMemory V3 — Auto-Capture/Recall Settings
		try {
		var testBody = {provider: provider, model: model};
		if (apiKey) testBody.api_key = apiKey;
		var resp = await fetch('/api/v3/provider/test', {
		method: 'POST',
		headers: {'Content-Type': 'application/json'},
		body: JSON.stringify({provider: provider, model: model, api_key: apiKey})
		body: JSON.stringify(testBody)
		});
		@@ -333,0 +335,0 @@ var data = await resp.json();

+3

-0

src/superlocalmemory/ui/js/ng-shell.js

		@@ -341,2 +341,5 @@ // Neural Glass Shell — Dashboard V2 "Neural Glass"
		switch(tabId) {
		case 'brain-pane':
		if (typeof loadBrain === 'function') loadBrain();
		break;
		case 'graph-pane':
		@@ -343,0 +346,0 @@ if (typeof loadGraph === 'function') loadGraph();

+9

-9

src/superlocalmemory/ui/js/optimize.js

		@@ -26,2 +26,3 @@ // SuperLocalMemory V3 — Optimize Tab
		_setToggle('opt-enabled', cfg.enabled);
		_setToggle('opt-proxy-enabled', cfg.proxy_enabled);
		_setToggle('opt-cache-enabled', cfg.cache_enabled);
		@@ -31,6 +32,3 @@ _setToggle('opt-semantic-enabled', cfg.semantic_enabled);
		_setSelect('opt-compress-mode', cfg.compress_mode);
		_setToggle('opt-compress-code', cfg.compress_code);
		_setToggle('opt-compress-prose', cfg.compress_prose);
		_setToggle('opt-compress-ccr', cfg.compress_ccr);
		_setToggle('opt-compress-align', cfg.compress_align);
		var verEl = document.getElementById('opt-config-version');
		@@ -75,10 +73,8 @@ if (verEl) verEl.textContent = cfg.config_version \|\| '-';
		var fieldMap = {
		'opt-enabled': 'enabled',
		'opt-cache-enabled': 'cache_enabled',
		'opt-enabled': 'enabled',
		'opt-proxy-enabled': 'proxy_enabled',
		'opt-cache-enabled': 'cache_enabled',
		'opt-semantic-enabled': 'semantic_enabled',
		'opt-compress-enabled': 'compress_enabled',
		'opt-compress-code': 'compress_code',
		'opt-compress-prose': 'compress_prose',
		'opt-compress-ccr': 'compress_ccr',
		'opt-compress-align': 'compress_align'
		'opt-compress-prose': 'compress_prose'
		};
		@@ -115,2 +111,6 @@
		_putConfig(body);
		if (id === 'opt-proxy-enabled' \|\| id === 'opt-enabled') {
		var notice = document.getElementById('opt-restart-notice');
		if (notice) notice.classList.remove('d-none');
		}
		}
		@@ -117,0 +117,0 @@ });

-311

src/superlocalmemory/optimize/compress/extractive_code.py

		# compress/extractive_code.py
		# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
		# Licensed under AGPL-3.0-or-later
		#
		# AST structural patterns adapted from:
		# headroom/compression/handlers/code_handler.py:66-138 (Apache-2.0)
		# Specifically: _STRUCTURAL_NODE_TYPES per-language dict, CodeLanguage enum
		# headroom/compression/handlers/code_handler.py:141-150 — _SIGNATURE_PATTERNS regex fallback
		# Attribution: See ATTRIBUTION.md.

		"""CodeCompressor — AST-aware extractive code compressor.

		Supported languages: Python, JavaScript, Go, Rust, Java, C++.
		Path A (tree-sitter): used if tree-sitter-language-pack installed.
		Path B (regex fallback): used otherwise. Tests pass on both paths.
		"""

		from __future__ import annotations

		import logging
		import re
		from dataclasses import dataclass
		from enum import Enum

		logger = logging.getLogger("slm.optimize.compress.code")

		_BODY_STUB_BY_LANG: dict[str, str] = {
		"python": " # [slm: body compressed — retrieve with ccr_id={ccr_id}]",
		"javascript": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
		"go": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
		"rust": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
		"java": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
		"cpp": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
		}
		_BODY_STUB_NO_CCR_BY_LANG: dict[str, str] = {
		"python": " # [slm: body compressed]",
		"javascript": " // [slm: body compressed]",
		"go": " // [slm: body compressed]",
		"rust": " // [slm: body compressed]",
		"java": " // [slm: body compressed]",
		"cpp": " // [slm: body compressed]",
		}

		_MIN_BODY_LINES: int = 4


		class CodeLanguage(Enum):
		PYTHON = "python"
		JAVASCRIPT = "javascript"
		GO = "go"
		RUST = "rust"
		JAVA = "java"
		CPP = "cpp"


		@dataclass(frozen=True)
		class CodeSpan:
		start_line: int
		end_line: int
		role: str # "import" \| "signature" \| "body" \| "class_header" \| "decorator"
		is_structural: bool


		# ── Signature patterns for regex path (Path B) ────────────────────────────────

		_SIGNATURE_PATTERNS: dict[str, list[str]] = {
		"python": [
		r"^\s(async\s+)?def\s+\w+\s$[^)]$\s(->\s*[^:]+)?:",
		r"^\sclass\s+\w+($[^)]$)?:",
		r"^\s*import\s+",
		r"^\s*from\s+\w+\s+import",
		r"^\s*@\w+",
		],
		"javascript": [
		r"^\s(async\s+)?function\s+\w+\s$[^)]*$",
		r"^\s*class\s+\w+(\s+extends\s+\w+)?",
		r"^\s*import\s+",
		r"^\sconst\s+\w+\s=\s*(async\s+)?\(",
		r"^\s*export\s+(default\s+\|const\s+\|class\s+\|function\s+)",
		],
		"go": [
		r"^\sfunc\s+($\w+\s+\?\w+$\s)?\w+\s\(",
		r"^\s*import\s+",
		r"^\s*package\s+",
		r"^\stype\s+\w+\s+(struct\|interface)\s\{",
		],
		"rust": [
		r"^\s*(pub\s+)?(async\s+)?fn\s+\w+",
		r"^\s*use\s+",
		r"^\s*impl\s+",
		r"^\s*struct\s+",
		r"^\s*enum\s+",
		r"^\s*trait\s+",
		],
		"java": [
		r"^\s(public\|private\|protected)\s+(static\s+)?\w+\s+\w+\s\(",
		r"^\s*import\s+",
		r"^\s(public\|private\|protected)?\sclass\s+\w+",
		],
		"cpp": [
		r"^\s\w[\w\s\&:<,>]\s+\w+\s$[^)]$\s(const\s*)?\{",
		r"^\s*#include\s+",
		r"^\s*(class\|struct)\s+\w+",
		],
		}


		class CodeCompressor:
		"""AST-aware extractive code compressor. Thread-safe (no mutable state)."""

		def compress(self, code: str, language: str, ccr_id: str = "") -> str:
		try:
		lang = CodeLanguage(language)
		except ValueError:
		logger.warning("CodeCompressor: unknown language %r — passthrough", language)
		return code

		try:
		if _tree_sitter_available():
		return self._compress_with_tree_sitter(code, lang, ccr_id)
		else:
		return self._compress_with_regex(code, lang, ccr_id)
		except Exception as exc:
		logger.warning("CodeCompressor.compress failed — passthrough: %s", exc)
		return code

		def _compress_with_tree_sitter(self, code: str, lang: CodeLanguage, ccr_id: str) -> str:
		from tree_sitter_language_pack import get_parser # type: ignore[import]
		parser = get_parser(lang.value)
		tree = parser.parse(code.encode())
		lines = code.split("\n")
		spans = _collect_spans_tree_sitter(tree.root_node, lang)
		return _apply_spans(lines, spans, ccr_id, lang.value)

		def _compress_with_regex(self, code: str, lang: CodeLanguage, ccr_id: str) -> str:
		lines = code.split("\n")
		spans = _collect_spans_regex(lines, lang)
		return _apply_spans(lines, spans, ccr_id, lang.value)


		# ── Span collection ───────────────────────────────────────────────────────────

		def _collect_spans_tree_sitter(root_node: object, lang: CodeLanguage) -> list[CodeSpan]:
		_STRUCTURAL_TYPES: dict[str, set[str]] = {
		"python": {"import_statement", "import_from_statement", "function_definition",
		"class_definition", "decorated_definition", "type_alias_statement"},
		"javascript": {"import_statement", "export_statement", "function_declaration",
		"class_declaration", "method_definition", "arrow_function"},
		"go": {"import_declaration", "function_declaration", "method_declaration",
		"type_declaration", "interface_type"},
		"rust": {"use_declaration", "function_item", "impl_item", "struct_item",
		"enum_item", "trait_item"},
		"java": {"import_declaration", "class_declaration", "method_declaration",
		"interface_declaration", "annotation"},
		"cpp": {"function_definition", "preproc_include", "class_specifier"},
		}
		structural = _STRUCTURAL_TYPES.get(lang.value, set())
		spans: list[CodeSpan] = []

		def _walk(node: object, depth: int = 0) -> None:
		start_row: int = getattr(node, "start_point", (0, 0))[0]
		end_row: int = getattr(node, "end_point", (0, 0))[0]
		node_type: str = getattr(node, "type", "")
		is_struct = node_type in structural

		if is_struct and node_type in {
		"function_definition", "method_definition", "function_declaration",
		"function_item", "method_declaration",
		}:
		body_node = _find_child(node, "block") or _find_child(node, "body")
		if body_node is not None:
		body_start = getattr(body_node, "start_point", (0, 0))[0]
		sig_end = body_start - 1
		if sig_end > start_row:
		spans.append(CodeSpan(start_row, sig_end, "signature", True))
		spans.append(CodeSpan(body_start, end_row, "body", False))
		else:
		spans.append(CodeSpan(start_row, end_row, "signature", True))
		elif is_struct and node_type in {"decorated_definition"}:
		def_node = _find_child(node, "function_definition") or _find_child(node, "class_definition")
		if def_node is not None:
		def_start = getattr(def_node, "start_point", (0, 0))[0]
		spans.append(CodeSpan(start_row, def_start - 1, "decorator", True))
		_walk(def_node, depth + 1)
		else:
		spans.append(CodeSpan(start_row, end_row, node_type, True))
		elif is_struct:
		spans.append(CodeSpan(start_row, end_row, "signature", True))
		else:
		children = getattr(node, "children", None)
		if children is not None:
		has_named = False
		for child in children:
		if getattr(child, "is_named", False):
		has_named = True
		_walk(child, depth + 1)
		if not has_named:
		spans.append(CodeSpan(start_row, end_row, "body", False))

		_walk(root_node)
		return spans


		def _find_child(node: object, child_type: str) -> object \| None:
		children = getattr(node, "children", None)
		if children is None:
		return None
		for child in children:
		if hasattr(child, "type") and child.type == child_type:
		return child
		return None


		def _collect_spans_regex(lines: list[str], lang: CodeLanguage) -> list[CodeSpan]:
		patterns = _SIGNATURE_PATTERNS.get(lang.value, [])
		spans: list[CodeSpan] = []
		i = 0
		n = len(lines)

		while i < n:
		line = lines[i]
		matched = False
		for pat in patterns:
		if re.search(pat, line):
		body_start = i + 1
		body_end = body_start
		while body_end < n and (
		lines[body_end].startswith((" ", "\t", "", "#", "//", "/", "/", "{"))
		or re.match(r"^\s*$", lines[body_end])
		):
		if re.match(r"^\s($\|#\|//\|/\\|\/\|\\|}\|\)\|\])", lines[body_end]):
		body_end += 1
		else:
		break
		body_len = body_end - body_start
		if body_len >= _MIN_BODY_LINES and _looks_like_body(lines, body_start, body_end):
		spans.append(CodeSpan(i, i, "signature", True))
		spans.append(CodeSpan(body_start, body_end, "body", False))
		i = body_end
		matched = True
		break
		else:
		body_start = body_end
		# else: pattern didn't match
		if not matched:
		i += 1
		return spans


		def _looks_like_body(lines: list[str], start: int, end: int) -> bool:
		body_lines = [l for l in lines[start:end] if l.strip() and not l.strip().startswith(("#", "//"))]
		return len(body_lines) >= _MIN_BODY_LINES


		def _apply_spans(
		lines: list[str],
		spans: list[CodeSpan],
		ccr_id: str,
		lang: str = "python",
		) -> str:
		if not spans:
		return "\n".join(lines)

		stub_tmpl = _BODY_STUB_BY_LANG.get(lang, _BODY_STUB_BY_LANG["python"])
		stub_no_ccr = _BODY_STUB_NO_CCR_BY_LANG.get(lang, _BODY_STUB_NO_CCR_BY_LANG["python"])
		stub = stub_tmpl.format(ccr_id=ccr_id) if ccr_id else stub_no_ccr

		sorted_spans = sorted(spans, key=lambda s: s.start_line)
		start_set = set()
		deduped: list[CodeSpan] = []
		for s in sorted_spans:
		if s.start_line not in start_set:
		deduped.append(s)
		start_set.add(s.start_line)

		span_by_start: dict[int, CodeSpan] = {s.start_line: s for s in deduped}

		result_lines: list[str] = []
		i = 0
		n = len(lines)

		while i < n:
		span = span_by_start.get(i)
		if span is not None and not span.is_structural:
		end = min(span.end_line, n - 1)
		body_len = end - i + 1
		if body_len >= _MIN_BODY_LINES:
		result_lines.append(stub)
		i = end + 1
		else:
		for j in range(i, end + 1):
		result_lines.append(lines[j])
		i = end + 1
		elif span is not None and span.is_structural:
		end = min(span.end_line, n - 1)
		for j in range(i, end + 1):
		result_lines.append(lines[j])
		i = end + 1
		else:
		result_lines.append(lines[i])
		i += 1

		return "\n".join(result_lines)


		def _tree_sitter_available() -> bool:
		try:
		import tree_sitter_language_pack # noqa: F401
		return True
		except ImportError:
		return False

-72

src/superlocalmemory/optimize/compress/extractive_json.py

		# compress/extractive_json.py
		# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
		# Licensed under AGPL-3.0-or-later
		#
		# Structural masking pattern adapted from:
		# headroom/compression/handlers/json_handler.py (Apache-2.0, Headroom contributors)
		# Specifically: JSONStructureHandler._extract_mask(), JSONToken, JSONTokenType
		# Attribution: See ATTRIBUTION.md.
		#
		# HARD RULE: This compressor MUST NEVER prune or reorder JSON keys.
		# Keys pruned = structured semantics corrupted non-recoverably. Not configurable.

		"""JSONCompressor — lossless-ish extractive JSON compressor."""

		from __future__ import annotations

		import json
		import logging
		from typing import Any

		logger = logging.getLogger("slm.optimize.compress.json")

		VALUE_TRUNCATE_CHARS: int = 120
		VALUE_TRUNCATE_SUFFIX: str = "\u2026" # ellipsis
		MAX_ARRAY_ITEMS_SHOWN: int = 5
		ARRAY_REMAINDER_KEY: str = "__slm_omitted__"
		MIN_VALUE_LEN_TO_TRUNCATE: int = 40


		class JSONCompressor:
		"""Lossless-ish JSON compressor. Thread-safe (no mutable state)."""

		def compress(self, parsed: Any) -> str:
		try:
		masked = self._mask(parsed, depth=0)
		return json.dumps(masked, ensure_ascii=False, separators=(",", ":"))
		except Exception as exc:
		logger.warning("JSONCompressor.compress failed — returning original: %s", exc)
		return json.dumps(parsed, ensure_ascii=False, separators=(",", ":"))

		def _mask(self, obj: Any, depth: int) -> Any:
		if isinstance(obj, dict):
		return {k: self._mask(v, depth + 1) for k, v in obj.items()}
		if isinstance(obj, list):
		return self._mask_array(obj, depth)
		if isinstance(obj, str):
		return self._mask_string(obj)
		return obj

		def _mask_array(self, arr: list, depth: int) -> list:
		if len(arr) <= MAX_ARRAY_ITEMS_SHOWN:
		return [self._mask(item, depth + 1) for item in arr]
		shown = [self._mask(item, depth + 1) for item in arr[:MAX_ARRAY_ITEMS_SHOWN]]
		collision = any(
		isinstance(item, dict) and ARRAY_REMAINDER_KEY in item
		for item in arr
		)
		if collision:
		logger.warning(
		"JSONCompressor: input contains reserved key %r — skipping array sentinel",
		ARRAY_REMAINDER_KEY,
		)
		else:
		shown.append({ARRAY_REMAINDER_KEY: len(arr) - MAX_ARRAY_ITEMS_SHOWN})
		return shown

		def _mask_string(self, s: str) -> str:
		if len(s) < MIN_VALUE_LEN_TO_TRUNCATE:
		return s
		if len(s) <= VALUE_TRUNCATE_CHARS:
		return s
		return s[:VALUE_TRUNCATE_CHARS] + VALUE_TRUNCATE_SUFFIX

CHANGELOG.md

Sorry, the diff of this file is too big to display

src/superlocalmemory/server/unified_daemon.py

Sorry, the diff of this file is too big to display

src/superlocalmemory/ui/index.html

Sorry, the diff of this file is too big to display

		@@ -36,3 +36,5 @@ <p align="center">

		> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved), SHRINKS prompts 60-95% (compress: extractive + LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.
		> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved on a hit), SHRINKS prose prompts (compress: lossless-by-default, opt-in LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.
		>
		> v3.6.10: cache and compression are now independent runtime switches (cache-only, compress-only, both, or neither — toggle live from the dashboard, no restart). Compression was rebuilt to be lossless by default (the old string/array/code truncation is gone); aggressive mode adds LLMLingua-2 for prose only — never code, numbers, structured data, or the current turn.

		@@ -43,4 +45,4 @@ ### The Three Levers
		\|-------\|-----------\|:------:\|:---------------:\|
		\| Cache \| Skip repeat calls — exact-match SQLite lookup, vCache-gated semantic (opt-in) \| 100% on a hit (input + output) \| Cache ON, Semantic OFF \|
		\| Compress \| Shrink prompts — extractive JSON/code (lossless) + LLMLingua-2 prose (opt-in) \| 60–95% on a miss (input only) \| Safe mode ON, Aggressive OFF \|
		\| Cache \| Skip repeat calls — exact-match SQLite lookup (zero false hits), vCache-gated semantic (opt-in) \| 100% on a hit (input + output) \| Cache ON, Semantic OFF \|
		\| Compress \| Shrink prompts — safe = lossless normalization; aggressive = LLMLingua-2 prose only (opt-in) \| Safe: small + lossless · Aggressive: large on prose \| Safe mode, Aggressive OFF \|
		\| Align \| Stabilize prefix — maximize provider prefix-cache discounts \| Lossless extra \| ON when compression is ON \|
		@@ -50,2 +52,4 @@

		> Independent at runtime: enable caching only, compression only, both, or neither — from the dashboard Optimize tab, applied live (no restart). Each AI client can also get its own memory identity over HTTP MCP via `http://127.0.0.1:8765/mcp/{agent_id}`.

		### Quick Start
		@@ -66,3 +70,3 @@
		\| `slm cache status\\|clear\\|invalidate\\|ttl\\|semantic` \| Cache sub-control — exact + semantic tiers \|
		\| `slm compress status\\|mode\\|code\\|prose\\|ccr\\|align` \| Compression control — per-channel toggles \|
		\| `slm compress status\\|mode\\|prose` \| Compression control — safe (lossless) / aggressive (LLMLingua-2 prose) \|
		\| `slm proxy [--port] [--provider]` \| Start the interception proxy (port 8765) \|
		@@ -69,0 +73,0 @@ \| `slm wrap <agent>` \| Proxy-activate an agent — one command to start saving \|

		Metadata-Version: 2.4
		Name: superlocalmemory
		Version: 3.6.9
		Version: 3.6.10
		Summary: Information-geometric agent memory with mathematical guarantees
		@@ -61,2 +61,3 @@ Author-email: Varun Pratap Bhardwaj <admin@superlocalmemory.com>
		Requires-Dist: sqlite-vec==0.1.9
		Requires-Dist: llmlingua==0.2.2
		Provides-Extra: search
		@@ -129,3 +130,5 @@ Requires-Dist: sentence-transformers==5.3.0; extra == "search"

		> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved), SHRINKS prompts 60-95% (compress: extractive + LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.
		> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved on a hit), SHRINKS prose prompts (compress: lossless-by-default, opt-in LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.
		>
		> v3.6.10: cache and compression are now independent runtime switches (cache-only, compress-only, both, or neither — toggle live from the dashboard, no restart). Compression was rebuilt to be lossless by default (the old string/array/code truncation is gone); aggressive mode adds LLMLingua-2 for prose only — never code, numbers, structured data, or the current turn.

		@@ -136,4 +139,4 @@ ### The Three Levers
		\|-------\|-----------\|:------:\|:---------------:\|
		\| Cache \| Skip repeat calls — exact-match SQLite lookup, vCache-gated semantic (opt-in) \| 100% on a hit (input + output) \| Cache ON, Semantic OFF \|
		\| Compress \| Shrink prompts — extractive JSON/code (lossless) + LLMLingua-2 prose (opt-in) \| 60–95% on a miss (input only) \| Safe mode ON, Aggressive OFF \|
		\| Cache \| Skip repeat calls — exact-match SQLite lookup (zero false hits), vCache-gated semantic (opt-in) \| 100% on a hit (input + output) \| Cache ON, Semantic OFF \|
		\| Compress \| Shrink prompts — safe = lossless normalization; aggressive = LLMLingua-2 prose only (opt-in) \| Safe: small + lossless · Aggressive: large on prose \| Safe mode, Aggressive OFF \|
		\| Align \| Stabilize prefix — maximize provider prefix-cache discounts \| Lossless extra \| ON when compression is ON \|
		@@ -143,2 +146,4 @@

		> Independent at runtime: enable caching only, compression only, both, or neither — from the dashboard Optimize tab, applied live (no restart). Each AI client can also get its own memory identity over HTTP MCP via `http://127.0.0.1:8765/mcp/{agent_id}`.

		### Quick Start
		@@ -159,3 +164,3 @@
		\| `slm cache status\\|clear\\|invalidate\\|ttl\\|semantic` \| Cache sub-control — exact + semantic tiers \|
		\| `slm compress status\\|mode\\|code\\|prose\\|ccr\\|align` \| Compression control — per-channel toggles \|
		\| `slm compress status\\|mode\\|prose` \| Compression control — safe (lossless) / aggressive (LLMLingua-2 prose) \|
		\| `slm proxy [--port] [--provider]` \| Start the interception proxy (port 8765) \|
		@@ -162,0 +167,0 @@ \| `slm wrap <agent>` \| Proxy-activate an agent — one command to start saving \|

superlocalmemory - npm Package Compare versions

New alerts

Fixed alerts

Improved metrics