Big News: Socket raises $60M Series C at a $1B valuation to secure software supply chains for AI-driven development.Announcement
Sign In

superlocalmemory

Package Overview
Dependencies
Maintainers
1
Versions
176
Alerts
File Explorer

Advanced tools

Socket logo

Install Socket

Detect and block malicious and high-risk dependencies

Install

superlocalmemory - npm Package Compare versions

Comparing version
3.6.9
to
3.6.10
+111
src/superlocalmemory/mcp/agent_context.py
"""Per-HTTP-request agent ID resolution — ContextVar home.
Kept in a standalone module so both tools_core and tools_active can import
it without creating circular dependencies (server.py → tools_core → here,
and unified_daemon.py → here independently).
Priority chain (HTTP-first, stdio-fallback):
1. ContextVar set by _AgentIDExtractorASGI middleware from /mcp/{agent_id} URL path.
2. SLM_AGENT_ID environment variable (stdio transport legacy).
3. Hard-coded "mcp_client" sentinel.
"""
from __future__ import annotations
import contextvars
import os
import re
_current_agent_id: contextvars.ContextVar[str] = contextvars.ContextVar(
"slm_agent_id", default="mcp_client"
)
# Agent ids arrive from an untrusted URL path segment. They are ATTRIBUTION
# metadata, never an authenticated principal — but they reach loggers, the
# agent registry, and SQL-bound attribution columns, so we hard-restrict the
# charset at the single extraction chokepoint. This neutralises log-injection
# (CRLF / ANSI), oversized ids, and any path-ish characters in one place.
_AGENT_ID_SANITIZE = re.compile(r"[^A-Za-z0-9._-]")
_AGENT_ID_MAX_LEN = 64
def sanitize_agent_id(raw: str) -> str:
"""Coerce an untrusted agent-id segment to a safe, bounded token."""
return _AGENT_ID_SANITIZE.sub("_", raw)[:_AGENT_ID_MAX_LEN]
def get_current_agent_id(env_fallback: bool = True) -> str:
"""Return the agent_id for the current asyncio task.
For HTTP transport the ASGI wrapper sets the ContextVar from the URL path
before the request reaches any MCP tool, so this returns the URL-derived id.
For stdio transport the ContextVar holds its default ("mcp_client") and we
fall through to the SLM_AGENT_ID env var instead.
"""
ctx_id = _current_agent_id.get()
if ctx_id != "mcp_client":
return ctx_id
if env_fallback:
return os.environ.get("SLM_AGENT_ID", "mcp_client")
return "mcp_client"
class AgentIDExtractorASGI:
"""ASGI wrapper that maps ``/mcp/{agent_id}`` → the agent-id ContextVar.
Mounted at ``/mcp`` in unified_daemon. IMPORTANT: Starlette's ``Mount``
(≥0.35 / 1.x) does NOT strip the mount prefix from ``scope["path"]`` — it
records the prefix in ``scope["root_path"]`` and leaves ``path`` as the full
request path (e.g. ``/mcp/claude`` with ``root_path == "/mcp"``). So we
compute the mount-relative sub-path ourselves as ``path[len(root_path):]``.
Flow for ``POST /mcp/claude``:
sub-path ``/claude`` → agent id ``claude`` → set ContextVar → rewrite the
scope path to ``{root_path}/`` so the inner FastMCP app (Starlette, route
``/``) sees the same mount-relative ``/`` it sees for a bare ``/mcp/``.
Backward compatible: bare ``/mcp/`` has sub-path ``/`` → no agent segment →
the request passes through untouched and the ContextVar keeps its
``"mcp_client"`` default.
Per-request isolation is guaranteed by ContextVar + ``reset(token)`` in a
``finally``, so concurrent HTTP sessions never see each other's agent id.
"""
__slots__ = ("_app",)
def __init__(self, inner) -> None:
self._app = inner
async def __call__(self, scope, receive, send):
if scope.get("type") == "http":
root_path: str = scope.get("root_path", "")
full_path: str = scope.get("path", "/")
# Mount-relative sub-path (what comes AFTER /mcp). When a root_path
# is present the request path MUST start with it (Starlette Mount
# guarantees this); if it somehow does not, treat it as no-agent and
# pass through untouched rather than mis-parsing the full path.
if root_path:
if not full_path.startswith(root_path):
await self._app(scope, receive, send)
return
subpath = full_path[len(root_path):]
else:
subpath = full_path
first = subpath.lstrip("/").split("/")[0]
if first:
first = sanitize_agent_id(first)
token = _current_agent_id.set(first)
# Rewrite the path so the inner app sees the bare mount root,
# exactly as it would for a no-agent /mcp/ request.
new_full = (root_path + "/") if root_path else "/"
new_scope = {
**scope,
"path": new_full,
"raw_path": new_full.encode(),
}
try:
await self._app(new_scope, receive, send)
finally:
_current_agent_id.reset(token)
return
await self._app(scope, receive, send)
"""capture.py — Lossless shadow-capture of real proxy traffic (v3.6.10, plan §7).
Purpose: build a dogfood corpus of real {request, response, model, tokens,
content_type} pairs so the cache + compression benchmark (benchmarks/optimize/)
can be replayed against authentic traffic instead of only synthetic prompts.
Activation: set ``SLM_OPTIMIZE_CAPTURE=1`` in the daemon's environment. When on:
* the proxy runs in PURE PASSTHROUGH — cache + compression hooks are disabled
at load time (see server._load_hooks), so capture never observes a mutated
request or a cache hit; every line is a genuine upstream exchange.
* each completed exchange is appended as one JSON line to
``~/.superlocalmemory/optimize_capture.jsonl`` (0600, gitignored).
ISOLATION GUARANTEE: this module writes ONLY to optimize_capture.jsonl. It never
opens memory.db, llmcache.db, or any SLM memory store. (Plan §9 hard rule.)
FAIL-OPEN: a capture failure (disk full, permission, encode error) is logged and
swallowed — it MUST NOT break the proxied request the user is waiting on.
"""
from __future__ import annotations
import asyncio
import json
import logging
import os
import threading
from pathlib import Path
from typing import Any
logger = logging.getLogger("slm.optimize.proxy.capture")
_CAPTURE_DIRNAME = ".superlocalmemory"
_CAPTURE_FILENAME = "optimize_capture.jsonl"
_CAPTURE_ENV = "SLM_OPTIMIZE_CAPTURE"
_TRUTHY = frozenset({"1", "true", "yes", "on"})
# Cap a single captured body so a pathological 10 MB request can't bloat the
# corpus line beyond what the replay harness will read back. Bodies above this
# are recorded truncated with a marker (capture is for benchmarking, not audit).
_MAX_CAPTURE_BODY_BYTES = 1 * 1024 * 1024 # 1 MB per side
# Providers whose response bodies follow the OpenAI usage schema
# ({"usage": {"prompt_tokens", "completion_tokens"}, "model"}).
_OPENAI_FORMAT_PROVIDERS = frozenset({"openai", "gemini-openai-compat"})
def capture_enabled() -> bool:
"""True iff ``SLM_OPTIMIZE_CAPTURE`` is set to a truthy value."""
return os.environ.get(_CAPTURE_ENV, "").strip().lower() in _TRUTHY
def _capture_path() -> Path:
return Path.home() / _CAPTURE_DIRNAME / _CAPTURE_FILENAME
class ShadowCapture:
"""Thread-safe append-only JSONL writer for proxy exchanges (singleton)."""
_instance: "ShadowCapture | None" = None
_instance_lock = threading.Lock()
def __init__(self, path: Path | None = None) -> None:
self._path = path or _capture_path()
self._write_lock = threading.Lock()
self._count = 0
@classmethod
def get_instance(cls) -> "ShadowCapture":
# Double-checked locking: cheap fast-path after first construction.
if cls._instance is None:
with cls._instance_lock:
if cls._instance is None:
cls._instance = cls()
return cls._instance
@classmethod
def reset_instance(cls) -> None:
"""Test hook — drop the singleton so a fresh path can be injected."""
with cls._instance_lock:
cls._instance = None
@property
def path(self) -> Path:
return self._path
@property
def count(self) -> int:
return self._count
def record(self, entry: dict[str, Any]) -> bool:
"""Append one capture entry as a JSON line. Returns True on success.
Fail-open: any error is logged and False is returned; never raised.
Security: opens with a single ``os.open`` carrying ``O_CREAT |
O_APPEND | O_NOFOLLOW`` and mode ``0o600`` on EVERY write. O_NOFOLLOW
refuses a symlink pre-placed at the path (symlink-append attack), and
the unconditional 0600-on-create removes the stat/exists TOCTOU that
could otherwise drop the file to the process umask.
"""
try:
line = json.dumps(entry, ensure_ascii=False, separators=(",", ":"))
except (TypeError, ValueError) as exc:
logger.warning("capture: entry not JSON-serialisable, dropped: %r", exc)
return False
try:
with self._write_lock:
self._path.parent.mkdir(parents=True, exist_ok=True)
flags = os.O_CREAT | os.O_WRONLY | os.O_APPEND | getattr(os, "O_NOFOLLOW", 0)
fd = os.open(self._path, flags, 0o600)
with os.fdopen(fd, "a", encoding="utf-8") as fh:
fh.write(line + "\n")
self._count += 1
return True
except OSError as exc:
# PermissionError / symlink-refusal (ELOOP) are security-relevant —
# surface the errno but still fail open so the request is never blocked.
logger.warning("capture: write failed (fail-open): %r", exc)
return False
def _truncate(raw: bytes) -> tuple[str, bool]:
"""Decode bytes for storage; truncate beyond the per-side cap."""
truncated = len(raw) > _MAX_CAPTURE_BODY_BYTES
head = raw[:_MAX_CAPTURE_BODY_BYTES] if truncated else raw
return head.decode("utf-8", errors="replace"), truncated
def build_entry(
*,
provider: str,
model: str,
request_body: bytes,
response_body: bytes,
content_type: str,
input_tokens: int,
output_tokens: int,
status_code: int,
stream: bool,
) -> dict[str, Any]:
"""Construct a capture entry dict from a completed exchange.
No timestamp is stamped here (Date.now is intentionally avoided in some
runtimes); the replay harness keys on content, not time, and the file's own
line order preserves arrival sequence.
"""
req_str, req_trunc = _truncate(request_body)
resp_str, resp_trunc = _truncate(response_body)
return {
"provider": provider,
"model": model,
"content_type": content_type,
"stream": stream,
"status_code": status_code,
"input_tokens": int(input_tokens),
"output_tokens": int(output_tokens),
"request": req_str,
"response": resp_str,
"request_truncated": req_trunc,
"response_truncated": resp_trunc,
}
def extract_usage(provider: str, body: bytes | None) -> tuple[int, int, str]:
"""Best-effort (input_tokens, output_tokens, model) from a provider JSON body.
Works on the normalised JSON the SSE parsers emit AND on non-streaming
upstream JSON. Returns (0, 0, "") when the body is missing/unparseable —
capture must never fail because usage couldn't be read.
"""
if not body:
return 0, 0, ""
try:
data = json.loads(body)
except (json.JSONDecodeError, ValueError, TypeError):
return 0, 0, ""
if not isinstance(data, dict):
return 0, 0, ""
if provider == "gemini":
usage = data.get("usageMetadata") or {}
return (
int(usage.get("promptTokenCount", 0) or 0),
int(usage.get("candidatesTokenCount", 0) or 0),
str(data.get("modelVersion", "") or ""),
)
usage = data.get("usage") or {}
if provider == "anthropic":
return (
int(usage.get("input_tokens", 0) or 0),
int(usage.get("output_tokens", 0) or 0),
str(data.get("model", "") or ""),
)
# OpenAI-format providers (explicit allowlist so a future provider variant
# is not silently parsed with the wrong schema — it warns + returns zeros).
if provider not in _OPENAI_FORMAT_PROVIDERS:
logger.warning(
"capture.extract_usage: unknown provider %r — recording zero tokens", provider
)
return 0, 0, str(data.get("model", "") or "")
return (
int(usage.get("prompt_tokens", 0) or 0),
int(usage.get("completion_tokens", 0) or 0),
str(data.get("model", "") or ""),
)
def record_exchange(
*,
provider: str,
model: str,
request_body: bytes,
response_body: bytes,
content_type: str = "application/json",
input_tokens: int = 0,
output_tokens: int = 0,
status_code: int = 200,
stream: bool = False,
) -> bool:
"""Build + append a capture entry. Fail-open. Returns True on success."""
entry = build_entry(
provider=provider,
model=model,
request_body=request_body,
response_body=response_body,
content_type=content_type,
input_tokens=input_tokens,
output_tokens=output_tokens,
status_code=status_code,
stream=stream,
)
return ShadowCapture.get_instance().record(entry)
async def record_exchange_async(**kwargs: Any) -> bool:
"""Async wrapper for ``record_exchange`` that offloads the synchronous file
write to a worker thread so it never blocks the proxy event loop — relevant
when many streaming responses complete in the same loop iteration.
"""
return await asyncio.to_thread(lambda: record_exchange(**kwargs))
+1
-1
{
"name": "superlocalmemory",
"version": "3.6.9",
"version": "3.6.10",
"description": "Information-geometric agent memory with mathematical guarantees. 4-channel retrieval, Fisher-Rao similarity, zero-LLM mode, EU AI Act compliant. Works with Claude, Cursor, Windsurf, and 17+ AI tools.",

@@ -5,0 +5,0 @@ "keywords": [

[project]
name = "superlocalmemory"
version = "3.6.9"
version = "3.6.10"
description = "Information-geometric agent memory with mathematical guarantees"

@@ -72,2 +72,7 @@ readme = "README.md"

"sqlite-vec==0.1.9",
# v3.6.10: LLMLingua-2 prose compression (aggressive mode, opt-in at runtime).
# Hard dependency so `pip install superlocalmemory` ships compression-ready;
# the ~560MB model downloads on first setup/warmup (fail-open). Verified the
# pin does NOT disturb the transformers/torch/numpy pins above.
"llmlingua==0.2.2",
]

@@ -74,0 +79,0 @@

@@ -36,3 +36,5 @@ <p align="center">

> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved), SHRINKS prompts 60-95% (compress: extractive + LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. **Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.**
> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved on a hit), SHRINKS prose prompts (compress: lossless-by-default, opt-in LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. **Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.**
>
> **v3.6.10:** cache and compression are now **independent runtime switches** (cache-only, compress-only, both, or neither — toggle live from the dashboard, no restart). Compression was rebuilt to be **lossless by default** (the old string/array/code truncation is gone); aggressive mode adds LLMLingua-2 for **prose only** — never code, numbers, structured data, or the current turn.

@@ -43,4 +45,4 @@ ### The Three Levers

|-------|-----------|:------:|:---------------:|
| **Cache** | Skip repeat calls — exact-match SQLite lookup, vCache-gated semantic (opt-in) | **100% on a hit** (input + output) | Cache ON, Semantic OFF |
| **Compress** | Shrink prompts — extractive JSON/code (lossless) + LLMLingua-2 prose (opt-in) | **60–95% on a miss** (input only) | Safe mode ON, Aggressive OFF |
| **Cache** | Skip repeat calls — exact-match SQLite lookup (zero false hits), vCache-gated semantic (opt-in) | **100% on a hit** (input + output) | Cache ON, Semantic OFF |
| **Compress** | Shrink prompts — **safe = lossless** normalization; **aggressive = LLMLingua-2 prose only** (opt-in) | Safe: small + lossless · Aggressive: large on prose | Safe mode, Aggressive OFF |
| **Align** | Stabilize prefix — maximize provider prefix-cache discounts | **Lossless extra** | ON when compression is ON |

@@ -50,2 +52,4 @@

> **Independent at runtime:** enable caching only, compression only, both, or neither — from the dashboard Optimize tab, applied live (no restart). Each AI client can also get its own memory identity over HTTP MCP via `http://127.0.0.1:8765/mcp/{agent_id}`.
### Quick Start

@@ -66,3 +70,3 @@

| `slm cache status\|clear\|invalidate\|ttl\|semantic` | Cache sub-control — exact + semantic tiers |
| `slm compress status\|mode\|code\|prose\|ccr\|align` | Compression control — per-channel toggles |
| `slm compress status\|mode\|prose` | Compression control — safe (lossless) / aggressive (LLMLingua-2 prose) |
| `slm proxy [--port] [--provider]` | Start the interception proxy (port 8765) |

@@ -69,0 +73,0 @@ | `slm wrap <agent>` | Proxy-activate an agent — one command to start saving |

Metadata-Version: 2.4
Name: superlocalmemory
Version: 3.6.9
Version: 3.6.10
Summary: Information-geometric agent memory with mathematical guarantees

@@ -61,2 +61,3 @@ Author-email: Varun Pratap Bhardwaj <admin@superlocalmemory.com>

Requires-Dist: sqlite-vec==0.1.9
Requires-Dist: llmlingua==0.2.2
Provides-Extra: search

@@ -129,3 +130,5 @@ Requires-Dist: sentence-transformers==5.3.0; extra == "search"

> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved), SHRINKS prompts 60-95% (compress: extractive + LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. **Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.**
> V3.6 is the only local-first layer that SKIPS repeat LLM calls (cache: 100% saved on a hit), SHRINKS prose prompts (compress: lossless-by-default, opt-in LLMLingua-2), and DISCOUNTS prefix costs (align: native KV-cache) — and remembers everything — in one install. **Your first cache hit pays for the install time. Hours of coding on repeat, minimal API cost.**
>
> **v3.6.10:** cache and compression are now **independent runtime switches** (cache-only, compress-only, both, or neither — toggle live from the dashboard, no restart). Compression was rebuilt to be **lossless by default** (the old string/array/code truncation is gone); aggressive mode adds LLMLingua-2 for **prose only** — never code, numbers, structured data, or the current turn.

@@ -136,4 +139,4 @@ ### The Three Levers

|-------|-----------|:------:|:---------------:|
| **Cache** | Skip repeat calls — exact-match SQLite lookup, vCache-gated semantic (opt-in) | **100% on a hit** (input + output) | Cache ON, Semantic OFF |
| **Compress** | Shrink prompts — extractive JSON/code (lossless) + LLMLingua-2 prose (opt-in) | **60–95% on a miss** (input only) | Safe mode ON, Aggressive OFF |
| **Cache** | Skip repeat calls — exact-match SQLite lookup (zero false hits), vCache-gated semantic (opt-in) | **100% on a hit** (input + output) | Cache ON, Semantic OFF |
| **Compress** | Shrink prompts — **safe = lossless** normalization; **aggressive = LLMLingua-2 prose only** (opt-in) | Safe: small + lossless · Aggressive: large on prose | Safe mode, Aggressive OFF |
| **Align** | Stabilize prefix — maximize provider prefix-cache discounts | **Lossless extra** | ON when compression is ON |

@@ -143,2 +146,4 @@

> **Independent at runtime:** enable caching only, compression only, both, or neither — from the dashboard Optimize tab, applied live (no restart). Each AI client can also get its own memory identity over HTTP MCP via `http://127.0.0.1:8765/mcp/{agent_id}`.
### Quick Start

@@ -159,3 +164,3 @@

| `slm cache status\|clear\|invalidate\|ttl\|semantic` | Cache sub-control — exact + semantic tiers |
| `slm compress status\|mode\|code\|prose\|ccr\|align` | Compression control — per-channel toggles |
| `slm compress status\|mode\|prose` | Compression control — safe (lossless) / aggressive (LLMLingua-2 prose) |
| `slm proxy [--port] [--provider]` | Start the interception proxy (port 8765) |

@@ -162,0 +167,0 @@ | `slm wrap <agent>` | Proxy-activate an agent — one command to start saving |

@@ -32,2 +32,3 @@ httpx==0.28.1

sqlite-vec==0.1.9
llmlingua==0.2.2

@@ -34,0 +35,0 @@ [dev]

@@ -274,2 +274,3 @@ AUTHORS.md

src/superlocalmemory/mcp/_stdin_guard.py
src/superlocalmemory/mcp/agent_context.py
src/superlocalmemory/mcp/resources.py

@@ -313,4 +314,2 @@ src/superlocalmemory/mcp/server.py

src/superlocalmemory/optimize/compress/ccr.py
src/superlocalmemory/optimize/compress/extractive_code.py
src/superlocalmemory/optimize/compress/extractive_json.py
src/superlocalmemory/optimize/compress/prose_llmlingua.py

@@ -330,2 +329,3 @@ src/superlocalmemory/optimize/compress/router.py

src/superlocalmemory/optimize/proxy/anthropic_surface.py
src/superlocalmemory/optimize/proxy/capture.py
src/superlocalmemory/optimize/proxy/gemini_surface.py

@@ -332,0 +332,0 @@ src/superlocalmemory/optimize/proxy/lifecycle.py

@@ -35,3 +35,3 @@ """SuperLocalMemory — information-geometric agent memory.

__version__ = "3.6.9"
__version__ = "3.6.10"

@@ -38,0 +38,0 @@ _REQUIRED_VERSIONS = {

@@ -5,3 +5,3 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar

"""Handlers for ``slm compress status|mode|code|prose|ccr``."""
"""Handlers for ``slm compress status|mode|prose``."""

@@ -23,7 +23,2 @@ from __future__ import annotations

def _get_cache_db():
from superlocalmemory.optimize.storage.db import CacheDB
return CacheDB()
def _write_config(**fields) -> None:

@@ -49,7 +44,8 @@ """5-step immutable config-write."""

"status": cmd_compress_status,
"mode": cmd_compress_mode,
"code": cmd_compress_code,
"prose": cmd_compress_prose,
"ccr": cmd_compress_ccr,
"align": cmd_compress_align,
"mode": cmd_compress_mode,
"prose": cmd_compress_prose,
# removed in v3.6.10: code, ccr, align (extractive compressors removed)
"code": _cmd_compress_removed("code"),
"ccr": _cmd_compress_removed("ccr"),
"align": _cmd_compress_removed("align"),
}

@@ -60,6 +56,18 @@ handler = _dispatch.get(sub or "")

else:
print("Usage: slm compress status|mode|code|prose|ccr [options]")
print("Usage: slm compress status|mode|prose [options]")
sys.exit(0)
def _cmd_compress_removed(name: str):
def handler(args: Namespace) -> None:
print(
f"slm compress {name}: removed in SLM v3.6.10. "
f"Extractive {name} compression has been replaced by "
f"Layer 1 (lossless whitespace) + Layer 2 (LLMLingua-2 prose). "
f"Use 'slm compress prose on' to enable prose compression."
)
sys.exit(0)
return handler
def cmd_compress_status(args: Namespace) -> None:

@@ -75,5 +83,4 @@ """Print compression status."""

"compress_mode": cfg.compress_mode,
"compress_code": cfg.compress_code,
"compress_prose": cfg.compress_prose,
"compress_ccr": cfg.compress_ccr,
"compress_protect_recent": cfg.compress_protect_recent,
}

@@ -84,9 +91,8 @@ print(json.dumps(data, indent=2))

print("Compression status:")
print(f" Enabled: {'yes' if cfg.compress_enabled else 'no'}")
print(f" Mode: {cfg.compress_mode}")
print(f" Code: {'ON' if cfg.compress_code else 'OFF'}"
" (extractive JSON/code — lossless structure)")
print(f" Prose: {'ON' if cfg.compress_prose else 'OFF'}")
print(f" CCR: {'ON' if cfg.compress_ccr else 'OFF'}"
" (reversible context retrieval)")
print(f" Enabled: {'yes' if cfg.compress_enabled else 'no'}")
print(f" Mode: {cfg.compress_mode}")
print(f" Prose (Layer 2): {'ON' if cfg.compress_prose else 'OFF'}"
" (LLMLingua-2, aggressive mode only)")
print(f" Protect recent: {cfg.compress_protect_recent} user turns")
print(" Layer 1 (lossless whitespace normalization) is always ON when enabled.")

@@ -112,19 +118,4 @@

def cmd_compress_code(args: Namespace) -> None:
"""Enable or disable code/JSON compression."""
use_json = getattr(args, "json", False)
value = getattr(args, "code_value", "on")
_write_config(compress_code=(value == "on"))
if use_json:
print(json.dumps({"status": "ok", "compress_code": value == "on"}))
return
print(f"Code compression: {'ENABLED' if value == 'on' else 'DISABLED'}.")
print("Daemon hot-reload: active within 2s.")
def cmd_compress_prose(args: Namespace) -> None:
"""Enable or disable prose compression."""
"""Enable or disable prose compression (Layer 2, LLMLingua-2)."""
use_json = getattr(args, "json", False)

@@ -151,37 +142,8 @@ value = getattr(args, "prose_value", "off")

print(f"Prose compression: {'ENABLED' if value == 'on' else 'DISABLED'}.")
print(f"Prose compression (LLMLingua-2): {'ENABLED' if value == 'on' else 'DISABLED'}.")
if value == "on":
print(" Requires: compress_mode=aggressive and llmlingua package installed.")
print(" Run: slm compress mode aggressive (if not already set)")
if value == "on" and "compress_enabled" in fields:
print(" (also enabled global compress)")
print("Daemon hot-reload: active within 2s.")
def cmd_compress_align(args: Namespace) -> None:
"""Enable or disable alignment compression."""
use_json = getattr(args, "json", False)
value = getattr(args, "align_value", "on")
_write_config(compress_align=(value == "on"))
if use_json:
print(json.dumps({"status": "ok", "compress_align": value == "on"}))
return
print(f"Alignment compression: {'ENABLED' if value == 'on' else 'DISABLED'}.")
print("Daemon hot-reload: active within 2s.")
def cmd_compress_ccr(args: Namespace) -> None:
"""Enable or disable CCR (Compressed Context Retrieval)."""
use_json = getattr(args, "json", False)
value = getattr(args, "ccr_value", "off")
_write_config(compress_ccr=(value == "on"))
if use_json:
print(json.dumps({"status": "ok", "compress_ccr": value == "on"}))
return
print(f"CCR (Compressed Context Retrieval): {'ENABLED' if value == 'on' else 'DISABLED'}.")
if value == "on":
print("Originals stored in llmcache.db for reversible retrieval.")
print("Daemon hot-reload: active within 2s.")

@@ -100,5 +100,3 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar

f" (mode: {cfg.compress_mode},"
f" code: {'ON' if cfg.compress_code else 'OFF'},"
f" prose: {'ON' if cfg.compress_prose else 'OFF'},"
f" CCR: {'ON' if cfg.compress_ccr else 'OFF'})")
f" prose/L2: {'ON' if cfg.compress_prose else 'OFF'})")
proxy_status = f"running on :{OPTIMIZE_DEFAULT_PORT}" if proxy_running else "not running"

@@ -105,0 +103,0 @@ print(f" Proxy: {proxy_status}")

@@ -35,2 +35,4 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar

_RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
# v3.6.10: compulsory LLMLingua-2 prose compression model (~560MB, aggressive mode).
_COMPRESSOR_MODEL = "microsoft/llmlingua-2-xlm-roberta-large-meetingbank"

@@ -182,2 +184,45 @@

def _download_compressor(model_name: str) -> bool:
"""Download the LLMLingua-2 prose compression model (v3.6.10).
Mirrors _download_reranker: a subprocess forces the HF download with visible
progress. Fail-open — a network hiccup must NOT break setup; the model also
lazy-downloads on first use in prose_llmlingua.py.
"""
print(f"\n Downloading compression model: {model_name}")
print(f" (LLMLingua-2 prose compressor, ~560MB — aggressive mode only)\n")
script = (
"from llmlingua import PromptCompressor; "
f"PromptCompressor(model_name='{model_name}', use_llmlingua2=True, "
"device_map='cpu'); "
"print('OK')"
)
try:
result = subprocess.run(
[sys.executable, "-c", script],
timeout=900, # 560MB on a slow link can exceed 5 min
capture_output=False,
text=True,
env={
**os.environ,
"CUDA_VISIBLE_DEVICES": "",
"TOKENIZERS_PARALLELISM": "false",
"TORCH_DEVICE": "cpu",
},
)
if result.returncode == 0:
print(f" ✓ Compression model ready")
return True
print(f" ✗ Compression model download failed (will lazy-download on first use)")
return False
except ImportError:
print(f" ⚠ llmlingua not installed — compression model will download on first use")
return False
except Exception as exc:
print(f" ✗ Compression model error: {exc}")
return False
# ---------------------------------------------------------------------------

@@ -398,2 +443,6 @@ # Verification

print()
print("─── Step 4c/10: Download Compression Model (LLMLingua-2) ───")
_download_compressor(_COMPRESSOR_MODEL)
# -- Step 5: Daemon Configuration (v3.4.3) --

@@ -400,0 +449,0 @@ print()

@@ -112,11 +112,10 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar

Each MCP client (Claude Code, Codex, Gemini CLI, Kimi, etc.) can set
the ``SLM_AGENT_ID`` env var in its MCP server config so that memories,
observations, and registry entries are tagged with the actual source
agent — not the legacy ``"mcp_client"`` default.
v3.4.39+: enables proper per-agent attribution in ``session_init``,
``observe``, and event emissions.
Priority chain (v3.6.10+):
1. ContextVar set by HTTP URL path (/mcp/{agent_id}) — HTTP transport.
2. SLM_AGENT_ID env var — stdio transport per-process identity.
3. Provided default (legacy "mcp_client").
"""
return os.environ.get("SLM_AGENT_ID", default)
from superlocalmemory.mcp.agent_context import get_current_agent_id
resolved = get_current_agent_id(env_fallback=True)
return resolved if resolved != "mcp_client" else default

@@ -123,0 +122,0 @@

@@ -113,2 +113,6 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar

"""
# v3.6.10: resolve "mcp_client" sentinel → URL path (HTTP) or env var (stdio)
if agent_id == "mcp_client":
from superlocalmemory.mcp.agent_context import get_current_agent_id
agent_id = get_current_agent_id()
meta = {

@@ -175,2 +179,6 @@ "project": project,

"""
# v3.6.10: resolve "mcp_client" sentinel → URL path (HTTP) or env var (stdio)
if agent_id == "mcp_client":
from superlocalmemory.mcp.agent_context import get_current_agent_id
agent_id = get_current_agent_id()
import asyncio

@@ -490,2 +498,6 @@ try:

"""
# v3.6.10: resolve "mcp_client" sentinel → URL path (HTTP) or env var (stdio)
if agent_id == "mcp_client":
from superlocalmemory.mcp.agent_context import get_current_agent_id
agent_id = get_current_agent_id()
try:

@@ -525,2 +537,6 @@ from superlocalmemory.core.worker_pool import WorkerPool

"""
# v3.6.10: resolve "mcp_client" sentinel → URL path (HTTP) or env var (stdio)
if agent_id == "mcp_client":
from superlocalmemory.mcp.agent_context import get_current_agent_id
agent_id = get_current_agent_id()
try:

@@ -527,0 +543,0 @@ if not content or not content.strip():

@@ -96,2 +96,3 @@ # optimize/cache/boundary_store.py

epsilon_grid: tuple[float, ...] = (0.01, 0.02, 0.05, 0.10),
return_threshold: float = 1.0,
) -> float:

@@ -101,11 +102,14 @@ """Compute τ̂ — the vCache exploration probability (Eq. 11).

Args:
query_sim: Cosine similarity s(x) ∈ [0, 1] for the incoming query.
delta: δ — user-defined maximum error rate.
Theorem 4.1 guarantee: Pr(correct) ≥ 1 - δ.
epsilon_grid: ε values for the Eq. 11 min sweep.
Distinct from δ; controls CI conservativeness.
query_sim: Cosine similarity s(x) ∈ [0, 1] for the incoming query.
delta: δ — user-defined maximum error rate.
Theorem 4.1 guarantee: Pr(correct) ≥ 1 - δ.
epsilon_grid: ε values for the Eq. 11 min sweep.
Distinct from δ; controls CI conservativeness.
return_threshold: Semantic return threshold from config (semantic_return_threshold).
C-03 fix: during cold start, exploit directly when
query_sim >= return_threshold instead of always exploring.
Returns:
τ̂ ∈ [0.0, 1.0]. Lower = more exploitation.
Cold start (n < 3): returns 1.0 (always explore).
Cold start (n < 3): 0.0 if query_sim >= return_threshold, else 1.0.

@@ -122,3 +126,8 @@ Eq. 11 derivation (from the paper):

if n < 3:
return 1.0 # cold start — always explore
# ARCH-02 note: this function serves dual purpose — (a) warm-phase vCache
# Eq. 11 tau computation and (b) cold-start similarity gate. The cold-start
# branch (n < 3) is intentionally simple: if the query is already above the
# return threshold, serve it (tau=0.0 → exploit); otherwise explore (tau=1.0).
# C-03: honor return_threshold — avoids 100% miss until 3 samples are accumulated.
return 0.0 if query_sim >= return_threshold else 1.0

@@ -150,3 +159,8 @@ # Step 1: Fisher-information SE

def should_explore(self, query_sim: float, delta: float = 0.05) -> bool:
def should_explore(
self,
query_sim: float,
delta: float = 0.05,
return_threshold: float = 1.0,
) -> bool:
"""Return True (explore = LLM call) or False (exploit = serve cache).

@@ -156,3 +170,3 @@

"""
tau = self.compute_tau(query_sim, delta=delta)
tau = self.compute_tau(query_sim, delta=delta, return_threshold=return_threshold)
return _RNG.random() <= tau

@@ -159,0 +173,0 @@

@@ -6,2 +6,3 @@ """exact.py — SQLite-backed exact-match get/set, cacheable-response guard."""

import json
import logging
import time

@@ -12,2 +13,3 @@ from typing import Any

logger = logging.getLogger(__name__)

@@ -25,2 +27,3 @@ _NON_CACHEABLE_FINISH_REASONS: frozenset[str] = frozenset({

if finish in _NON_CACHEABLE_FINISH_REASONS:
logger.debug("exact: skip cache (finish_reason=%r)", finish)
return False

@@ -31,8 +34,11 @@ choices = response.get("choices") or []

if fr in _NON_CACHEABLE_FINISH_REASONS:
logger.debug("exact: skip cache (choice.finish_reason=%r)", fr)
return False
msg = choice.get("message") or {}
if msg.get("tool_calls"):
logger.debug("exact: skip cache (choice.message.tool_calls present)")
return False
for block in response.get("content") or []:
if isinstance(block, dict) and block.get("type") == "tool_use":
logger.debug("exact: skip cache (tool_use content block)")
return False

@@ -53,5 +59,2 @@ return True

return None
if row.ttl_expires is not None and row.ttl_expires < time.time():
self._db.delete(key, tenant_id)
return None
return json.loads(row.value.decode("utf-8"))

@@ -82,3 +85,3 @@

ttl_expires=expires_at,
tags=[],
tags=tags,
)

@@ -85,0 +88,0 @@ return True

@@ -7,2 +7,3 @@ """key_builder.py — deterministic SHA-256 cache key, tenant-scoped."""

import json
import logging
import re

@@ -12,2 +13,4 @@ from dataclasses import dataclass, field

logger = logging.getLogger(__name__)
DETERMINISTIC_PARAMS: frozenset[str] = frozenset({

@@ -67,2 +70,12 @@ "max_tokens", "stop", "stop_sequences", "top_p", "top_k",

if temperature != 0 and not self._config.allow_nonzero_temperature_cache:
logger.debug(
"cache skip: temperature=%.2f allow_nonzero=%s",
temperature,
self._config.allow_nonzero_temperature_cache,
)
try:
from superlocalmemory.optimize.metrics.counters import MetricsCollector
MetricsCollector.get_instance().increment_skipped_temperature()
except Exception:
pass
return None

@@ -69,0 +82,0 @@

@@ -5,2 +5,4 @@ """manager.py — CacheManager orchestrator (singleton, fail-open, stampede-shielded)."""

import hashlib as _hashlib
import json as _json_mod
import logging

@@ -12,2 +14,5 @@ import threading

# C-08: precomputed hash for the common "default" tenant — avoids sha256 on every request
_DEFAULT_TENANT_HASH: str = _hashlib.sha256(b"default").hexdigest()
from superlocalmemory.optimize.cache.exact import ExactCache

@@ -235,2 +240,22 @@ from superlocalmemory.optimize.cache.invalidation import InvalidationEngine

self._metrics.exact_misses += 1
# C-01: semantic fallback on exact miss (only when explicitly enabled)
if self._semantic.is_enabled():
try:
sem_result = self._semantic.lookup(req, tenant_id, None)
if sem_result is not None:
self._metrics.semantic_hits += 1
if isinstance(sem_result, CachedResponse):
return sem_result
# VCacheSemantic may return a response dict — wrap it
return CachedResponse(
hit=True,
data=_json_mod.dumps(sem_result, ensure_ascii=False).encode(),
cache_key=key,
ttl_seconds=0,
)
except Exception as exc:
logger.warning("SemanticTier.lookup raised (fail-open): %s", exc)
self._metrics.semantic_misses += 1
# Return miss WITH the key so callers can use it for cache storage.

@@ -264,2 +289,9 @@ return CachedResponse(hit=False, data=None, cache_key=key, ttl_seconds=0)

# C-02: index in semantic tier after exact write (fail-open)
if self._semantic.is_enabled():
try:
self._semantic.index_entry(req, tenant_id, None, response_dict)
except Exception as exc:
logger.warning("SemanticTier.index_entry raised (fail-open): %s", exc)
# ---- CacheHook protocol implementation (INTERFACE-CONTRACT §3) ----

@@ -276,3 +308,3 @@

try:
result = self.get(req, tenant_id="default")
result = self.get(req, tenant_id=_DEFAULT_TENANT_HASH)
if result is not None and not result.hit:

@@ -288,3 +320,3 @@ MetricsCollector.get_instance().on_miss()

try:
self.set(req, resp, tenant_id="default")
self.set(req, resp, tenant_id=_DEFAULT_TENANT_HASH)
except Exception as exc:

@@ -295,11 +327,41 @@ logger.warning("CacheManager.store raised (fail-open): %s", exc)

"""CacheHook.on_hit() — forward token savings to MetricsCollector.
Contract §7: cache skip saves BOTH input+output tokens (whole call avoided).
tokens_saved = input tokens saved (from request).
Output tokens estimated from response body size (~4 bytes per token).
M-01: compute input tokens from request body when caller passes 0.
M-02: parse real output tokens from cached response usage field.
All counts are estimates for display — not billing-accurate.
"""
import json as _json
# M-01: estimate input tokens from message content
if tokens_saved == 0 and isinstance(req, ProxyRequest):
try:
body = req.body or {}
total_chars = 0
for m in (body.get("messages") or []):
c = m.get("content", "")
if isinstance(c, str):
total_chars += len(c)
elif isinstance(c, list):
for blk in c:
if isinstance(blk, dict):
total_chars += len(blk.get("text", "") or "")
total_chars += len(body.get("system", "") or "")
tokens_saved = max(0, total_chars // 4)
except Exception:
pass
# M-02: parse real output tokens from stored response
output_tokens = 0
if resp:
# Rough estimate: 4 bytes per token for English text
output_tokens = len(resp) // 4
try:
data = _json.loads(resp)
usage = data.get("usage") or {}
output_tokens = (
usage.get("output_tokens")
or usage.get("completion_tokens")
or 0
)
except Exception:
output_tokens = len(resp) // 4 # fallback byte-estimate
MetricsCollector.get_instance().on_hit(

@@ -306,0 +368,0 @@ tokens_saved_input=tokens_saved,

@@ -297,3 +297,4 @@ # optimize/cache/semantic.py

delta = float(getattr(cfg, "semantic_error_target", _DEFAULT_ERROR_TARGET))
if record.should_explore(best_score, delta=delta):
sem_return_threshold = float(getattr(cfg, "semantic_return_threshold", _DEFAULT_RETURN_THRESHOLD))
if record.should_explore(best_score, delta=delta, return_threshold=sem_return_threshold):
logger.debug(

@@ -367,7 +368,7 @@ "VCacheSemantic: explore (score=%.4f entry=%s t_hat=%.4f)",

entries: list[tuple[str, str, np.ndarray]] = []
for entry_id, blob in rows:
for entry_id, blob, ctx_fp in rows: # C-10: unpack persisted context_fp
try:
v = np.frombuffer(blob, dtype=np.float32).copy()
if v.shape[0] == _EMBED_DIM:
entries.append((entry_id, "", v))
entries.append((entry_id, ctx_fp, v))
except Exception:

@@ -427,3 +428,3 @@ continue

# Store vector in DB
# Store vector in DB — C-10: persist context_fp alongside the vector
vec_bytes = vec.tobytes()

@@ -434,3 +435,7 @@ self._db.vec_add(

vector=vec_bytes,
meta={"model": "nomic-ai/nomic-embed-text-v1.5", "dim": _EMBED_DIM},
meta={
"model": "nomic-ai/nomic-embed-text-v1.5",
"dim": _EMBED_DIM,
"context_fp": context_fp,
},
)

@@ -437,0 +442,0 @@

@@ -35,12 +35,6 @@ # compress/prose_llmlingua.py

self,
model_name: str = _MODEL_BERT,
model_name: str = _MODEL_XLM,
device_map: str = "cpu",
rate: float = _DEFAULT_RATE,
) -> None:
import os
if os.environ.get("SLM_DISABLE_HF_DOWNLOAD", "0") == "1":
raise ImportError(
"LLMLingua-2 model download blocked: SLM_DISABLE_HF_DOWNLOAD=1. "
"Set compress_llmlingua_allow_download=true in optimize.json."
)
try:

@@ -47,0 +41,0 @@ from llmlingua import PromptCompressor # type: ignore[import]

@@ -29,3 +29,2 @@ # compress/router.py

_MIN_CHARS_FOR_COMPRESSION: int = 500
_MIN_RATIO_STRUCTURED: float = 0.60

@@ -54,4 +53,2 @@

def __init__(self) -> None:
self._json_compressor: "JSONCompressor | None" = None
self._code_compressor: "CodeCompressor | None" = None
self._llmlingua_compressor: "LLMLinguaCompressor | None" = None

@@ -108,8 +105,13 @@ self._ccr_store: "CCRStore | None" = None

if tokens_after >= tokens_before:
return req # no improvement
# S-01/Stage-9 fix: build new_bytes BEFORE the improvement guard.
# Layer 1 normalization saves characters (not word-count tokens), so
# tokens_after == tokens_before for "normalize" strategy. Checking byte
# length of the serialized body correctly detects Layer 1 savings.
body["messages"] = new_messages
new_bytes = json.dumps(body, ensure_ascii=False, separators=(",", ":")).encode()
bytes_saved = len(req.body_bytes) - len(new_bytes)
if bytes_saved <= 0 and tokens_after >= tokens_before:
return req # neither bytes nor tokens improved
# CONTRACT §3: fire on_compress metrics callback

@@ -166,3 +168,8 @@ lossy = strategy == "llmlingua2_prose"

protect_indices = set(range(max(0, len(messages) - protect_recent), len(messages)))
# K-05: protect last N *user* turns, not last N messages of any role
user_indices = [i for i, m in enumerate(messages) if m.get("role") == "user"]
protect_indices: set[int] = set(user_indices[-protect_recent:]) if protect_recent > 0 else set()
# Always protect the very last message (current turn, any role)
if messages:
protect_indices.add(len(messages) - 1)

@@ -255,92 +262,73 @@ for idx, msg in enumerate(messages):

# JSON detection
if len(text) < _MIN_CHARS_FOR_COMPRESSION:
return text, tokens_before, tokens_before, "none"
# K-01/K-02/K-03: NEVER compress structured content (JSON or code)
stripped = text.strip()
if stripped.startswith(("{", "[")):
# PERF-02: for large content, structural bracket-match avoids O(n) json.loads().
# Conservative: matching outer brackets → treat as JSON and skip compression.
# K-01 mandate is safety-first: false-positive (non-JSON treated as JSON) is
# safe; false-negative (JSON compressed) would be a correctness violation.
_last = stripped[-1] if stripped else ""
if len(stripped) > 8192 and (
(stripped[0] == "{" and _last == "}") or (stripped[0] == "[" and _last == "]")
):
return text, tokens_before, tokens_before, "none" # large JSON → passthrough
try:
parsed = json.loads(stripped)
compressor = self._get_json_compressor()
compressed = compressor.compress(parsed)
tokens_after = _token_estimate_structured(compressed)
tokens_before_adj = _token_estimate_structured(text)
ratio = tokens_after / tokens_before_adj if tokens_before_adj else 1.0
if ratio < _MIN_RATIO_STRUCTURED:
# B-03: store-before-compress
ccr_id = self._ccr_store_original(text.encode(), model, tenant_id)
if ccr_id:
try:
obj = json.loads(compressed)
if isinstance(obj, dict):
obj["__slm_ccr__"] = ccr_id
compressed = json.dumps(obj, ensure_ascii=False, separators=(",", ":"))
elif isinstance(obj, list):
# RB-02: list-root embedding
wrapper = {"__slm_ccr__": ccr_id, "__slm_data__": obj}
compressed = json.dumps(wrapper, ensure_ascii=False, separators=(",", ":"))
except Exception:
pass
self._ccr_update_compressed(ccr_id, compressed.encode())
logger.debug("[%s] JSON compressed %.2f ratio ccr_id=%s", request_id, ratio, ccr_id)
return compressed, tokens_before_adj, tokens_after, "extractive_json"
return text, tokens_before, tokens_before, "none"
except (json.JSONDecodeError, Exception):
pass
json.loads(stripped)
return text, tokens_before, tokens_before, "none" # valid JSON → passthrough
except json.JSONDecodeError:
pass # not valid JSON — treat as prose
except Exception as exc:
logger.warning("compress: unexpected error probing JSON content: %s", exc)
# Code detection
lang = _detect_language(text)
if lang is not None:
compressor = self._get_code_compressor()
# RB-03: compress first, compute ratio, then store CCR only if beneficial
compressed_probe = compressor.compress(text, language=lang, ccr_id="")
tokens_after = _token_estimate(compressed_probe)
ratio = tokens_after / tokens_before if tokens_before else 1.0
if ratio < _MIN_RATIO_STRUCTURED:
# B-03: store-before-compress
ccr_id = self._ccr_store_original(text.encode(), model, tenant_id)
compressed = compressor.compress(text, language=lang, ccr_id=ccr_id)
if ccr_id:
self._ccr_update_compressed(ccr_id, compressed.encode())
logger.debug("[%s] Code compressed lang=%s ratio=%.2f ccr_id=%s",
request_id, lang, ratio, ccr_id)
return compressed, tokens_before, tokens_after, "extractive_code"
return text, tokens_before, tokens_before, "none"
if _detect_language(text) is not None:
return text, tokens_before, tokens_before, "none" # code → passthrough
# Prose: only if aggressive mode AND compress_prose is enabled
# (Phase 3 — opt-in prose tier; off by default; gated by config
# field compress_prose added in LLD-04 INTERFACE-CONTRACT v2.)
# Layer 1 — lossless whitespace normalization (always-on, safe)
normalized = self._normalize_whitespace(text)
tokens_after_l1 = _token_estimate(normalized)
# Layer 2 — LLMLingua-2 prose compression (aggressive + opt-in only)
cfg = self._get_config()
prose_enabled = bool(getattr(cfg, "compress_prose", False))
if aggressive and prose_enabled:
if aggressive and prose_enabled: # pragma: no cover — LLMLingua optional dep
compressor = self._get_llmlingua_compressor()
if compressor is not None:
# B-03: store-before-compress
# B-03: store original BEFORE lossy compression
ccr_id = self._ccr_store_original(text.encode(), model, tenant_id)
compressed = compressor.compress(text)
compressed = compressor.compress(normalized)
if ccr_id:
self._ccr_update_compressed(ccr_id, compressed.encode())
tokens_after = _token_estimate(compressed)
logger.info(
"[%s] LLMLingua-2 prose compressed rate=%.2f ccr_id=%s (LOSSY)",
request_id,
tokens_after / tokens_before if tokens_before else 1.0,
ccr_id,
)
return compressed, tokens_before, tokens_after, "llmlingua2_prose"
tokens_after_l2 = _token_estimate(compressed)
if tokens_after_l2 < tokens_before:
logger.info(
"[%s] LLMLingua-2 prose compressed rate=%.2f ccr_id=%s (LOSSY)",
request_id,
tokens_after_l2 / tokens_before if tokens_before else 1.0,
ccr_id,
)
return compressed, tokens_before, tokens_after_l2, "llmlingua2_prose"
# S-01 fix: compare character length, not word count.
# _token_estimate() is word-count — whitespace normalization saves characters/bytes
# but never removes words, so token counts are identical before and after Layer 1.
# Character comparison correctly detects when normalization reduced the body size.
if len(normalized) < len(text):
return normalized, tokens_before, tokens_after_l1, "normalize"
return text, tokens_before, tokens_before, "none"
@staticmethod
def _normalize_whitespace(text: str) -> str:
"""Layer 1 lossless: collapse excess blank lines, strip trailing spaces per line."""
import re
text = re.sub(r"\n{3,}", "\n\n", text)
lines = [line.rstrip() for line in text.split("\n")]
return "\n".join(lines)
# ── Lazy loaders ─────────────────────────────────────────────────────
def _get_json_compressor(self) -> "JSONCompressor":
if self._json_compressor is None:
from superlocalmemory.optimize.compress.extractive_json import JSONCompressor
self._json_compressor = JSONCompressor()
return self._json_compressor
def _get_code_compressor(self) -> "CodeCompressor":
if self._code_compressor is None:
from superlocalmemory.optimize.compress.extractive_code import CodeCompressor
self._code_compressor = CodeCompressor()
return self._code_compressor
def _get_llmlingua_compressor(self) -> "LLMLinguaCompressor | None":
def _get_llmlingua_compressor(self) -> "LLMLinguaCompressor | None": # pragma: no cover
if self._llmlingua_compressor is None:

@@ -404,8 +392,11 @@ try:

tokens_before=tb, tokens_after=ta,
lossy=strat == "llmlingua2_prose",
)
except Exception as exc:
logger.debug("compress_text failed (non-fatal): %s", exc)
t = _token_estimate(text)
return CompressTextResult(
compressed_text=text, strategy="none",
tokens_before=len(text.split()), tokens_after=len(text.split()),
tokens_before=t, tokens_after=t,
lossy=False,
)

@@ -416,6 +407,14 @@

class CompressTextResult:
"""Result of a compress_text() call.
UX-02 note: lossy=True only when strategy="llmlingua2_prose" (Layer 2).
In the default install (LLMLingua optional dep not installed), lossy is
always False — install `llmlingua>=0.2.0` and set compress_prose=True +
compress_mode="aggressive" to activate lossy compression.
"""
compressed_text: str
strategy: str # "extractive_json" | "extractive_code" | "llmlingua2_prose" | "none"
strategy: str # "normalize" | "llmlingua2_prose" | "none"
tokens_before: int
tokens_after: int
lossy: bool = False # K-10: True only for llmlingua2_prose (Layer 2)

@@ -429,6 +428,2 @@

def _token_estimate_structured(text: str) -> int:
return max(1, len(text) // 4) if text else 0
def _msg_has_tool_result(msg: dict) -> bool:

@@ -435,0 +430,0 @@ """B-09: Detect historical tool_result blocks in messages."""

@@ -27,2 +27,18 @@ """Module-level accessor for OptimizeConfig (INTERFACE-CONTRACT §2).

def get_shared_store() -> "ConfigStore":
"""Return the process-wide ConfigStore singleton, creating it on first use.
This is the SINGLE instance the daemon, the /api/optimize routes, the proxy
hook-reload callback, and the hot-reload watchdog must all share so that a UI
config change reaches the live proxy (fixes W-02 hot-reload + W-05 fresh-store
-per-request). The daemon calls this at startup, registers a change callback,
and starts the watchdog.
"""
global _store
if _store is None:
from superlocalmemory.optimize.config.store import ConfigStore
_store = ConfigStore()
return _store
def _set_config_store(store: "ConfigStore") -> None:

@@ -29,0 +45,0 @@ global _store

@@ -26,11 +26,6 @@ """Default OptimizeConfig singleton — import-safe."""

semantic_centroid_min_similarity=0.85,
compress_enabled=True,
compress_enabled=False,
compress_mode="safe",
compress_code=True,
compress_json=True,
compress_prose=False,
compress_ccr=True,
compress_align=True,
compress_protect_recent=4,
compress_llmlingua_allow_download=False,
ttl_seconds=86400,

@@ -37,0 +32,0 @@ ttl=TTLConfig(),

@@ -92,11 +92,6 @@ """Typed schema for optimize.json.

# Compress
compress_enabled: bool = True
compress_enabled: bool = False
compress_mode: str = "safe"
compress_code: bool = True
compress_json: bool = True
compress_prose: bool = False
compress_ccr: bool = True
compress_align: bool = True
compress_protect_recent: int = 4
compress_llmlingua_allow_download: bool = False

@@ -186,13 +181,6 @@ # TTL + providers + pricing

),
compress_enabled=bool(d.get("compress_enabled", True)),
compress_enabled=bool(d.get("compress_enabled", False)),
compress_mode=str(d.get("compress_mode", "safe")),
compress_code=bool(d.get("compress_code", True)),
compress_json=bool(d.get("compress_json", True)),
compress_prose=bool(d.get("compress_prose", False)),
compress_ccr=bool(d.get("compress_ccr", True)),
compress_align=bool(d.get("compress_align", True)),
compress_protect_recent=int(d.get("compress_protect_recent", 4)),
compress_llmlingua_allow_download=bool(
d.get("compress_llmlingua_allow_download", False)
),
ttl_seconds=int(d.get("ttl_seconds", 86400)),

@@ -235,9 +223,4 @@ providers=providers,

"compress_mode": self.compress_mode,
"compress_code": self.compress_code,
"compress_json": self.compress_json,
"compress_prose": self.compress_prose,
"compress_ccr": self.compress_ccr,
"compress_align": self.compress_align,
"compress_protect_recent": self.compress_protect_recent,
"compress_llmlingua_allow_download": self.compress_llmlingua_allow_download,
"ttl_seconds": self.ttl_seconds,

@@ -244,0 +227,0 @@ "providers": {k: v.as_dict() for k, v in self.providers.items()},

@@ -77,3 +77,9 @@ """ConfigStore — reads, writes, and hot-reloads optimize.json.

def save(self, config: OptimizeConfig) -> None:
"""Write config to optimize.json with version bump."""
"""Write config to optimize.json with version bump.
Fires registered change callbacks immediately after a successful write
(outside the lock) so a UI/CLI save reaches the live proxy without
waiting for the 2s watchdog poll. The watchdog skips this write via
``_saved_by_self`` so callbacks fire exactly once.
"""
config.validate()

@@ -97,2 +103,10 @@ with self._lock:

self._saved_by_self = False
callbacks = list(self._change_callbacks)
# Fire callbacks OUTSIDE the lock — a callback may rebuild proxy hooks
# or call back into get(); keeping them off the lock avoids contention.
for cb in callbacks:
try:
cb(new_cfg)
except Exception as exc:
logger.warning("ConfigStore save callback error: %s", exc)

@@ -99,0 +113,0 @@ def start_watchdog(self) -> None:

@@ -27,3 +27,3 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar

on_miss(): cache miss
on_compress(bytes_original, bytes_after): compress ran
on_compress(tokens_before, tokens_after): compress ran (word-count proxy)
on_eviction(): entry expired/evicted

@@ -73,10 +73,13 @@ """

def on_compress(self, bytes_original: int, bytes_after: int) -> None:
"""Record a compression run with byte counts."""
def on_compress(self, tokens_before: int, tokens_after: int) -> None:
"""Record a compression run. Arguments are word-count proxy estimates from _token_estimate().
M-03: consistent naming — these are token estimates, not byte counts.
Stored in compress_bytes_original/after fields for DB schema compat; unit is word-count.
"""
with self._data_lock:
self._compress_runs += 1
self._compress_bytes_original += max(0, bytes_original)
self._compress_bytes_after += max(0, bytes_after)
saved_tokens = max(0, bytes_original - bytes_after)
self._tokens_saved_compress += saved_tokens
self._compress_bytes_original += max(0, tokens_before)
self._compress_bytes_after += max(0, tokens_after)
self._tokens_saved_compress += max(0, tokens_before - tokens_after)

@@ -88,2 +91,7 @@ def on_eviction(self) -> None:

def increment_skipped_temperature(self) -> None:
"""C-04: record a cache skip due to non-zero temperature."""
with self._data_lock:
self._calls_skipped += 1
def record_latency(self, ms: float) -> None:

@@ -90,0 +98,0 @@ """Record latency overhead in milliseconds."""

@@ -361,2 +361,3 @@ """_helpers.py — Shared HTTP utilities used by all surface modules."""

on_complete: "Callable[[bytes], Any] | None" = None,
max_accumulate: int | None = None,
) -> Response | StreamingResponse:

@@ -392,6 +393,8 @@ """Stream-forward with optional post-stream cache-store callback.

acc: list[bytes] = []
acc_bytes = 0
acc_capped = False
complete_called = False
async def _generate() -> AsyncIterator[bytes]:
nonlocal complete_called
nonlocal complete_called, acc_bytes, acc_capped
stream_error = False

@@ -404,3 +407,13 @@ try:

if chunk:
acc.append(chunk)
# Always forward to the client; only bound what we hold
# in memory for the on_complete callback (CWE-400).
if max_accumulate is None or acc_bytes < max_accumulate:
acc.append(chunk)
acc_bytes += len(chunk)
elif not acc_capped:
acc_capped = True
logger.debug(
"[%s] stream accumulator capped at %d bytes",
request_id, max_accumulate,
)
yield chunk

@@ -453,2 +466,87 @@ except httpx.RemoteProtocolError as exc:

async def capture_passthrough_forward(
proxy: Any,
request: Request,
*,
provider: str,
upstream_url: str,
allowed_headers: frozenset,
request_id: str,
model_hint: str = "",
sse_parser: "Callable[[bytes], bytes | None] | None" = None,
is_stream: bool = False,
) -> Response | StreamingResponse:
"""Shadow-capture passthrough (v3.6.10, plan §7).
Pure passthrough to upstream + record the exchange to the capture corpus.
NO cache, NO compression — capture mode observes only authentic traffic.
Fail-open: a capture or forward error degrades to a normal forward/error
response; the user's request is never blocked by capture.
"""
from superlocalmemory.optimize.proxy.capture import (
extract_usage,
record_exchange_async,
)
body_bytes = await request.body()
fwd_headers = _build_forward_headers(request, allowed_headers)
fwd_headers["content-length"] = str(len(body_bytes))
if is_stream:
async def _on_complete(acc: bytes) -> None:
parsed = sse_parser(acc) if sse_parser else None
payload = parsed if parsed is not None else acc
itok, otok, mdl = extract_usage(provider, parsed)
await record_exchange_async(
provider=provider,
model=mdl or model_hint,
request_body=body_bytes,
response_body=payload,
content_type="text/event-stream",
input_tokens=itok,
output_tokens=otok,
status_code=200,
stream=True,
)
# Bound the in-memory accumulator (CWE-400): the corpus only keeps the
# first 1 MB per side anyway, so cap accumulation there.
from superlocalmemory.optimize.proxy.capture import _MAX_CAPTURE_BODY_BYTES
return await _stream_and_cache_forward(
proxy, request_id, fwd_headers, body_bytes, upstream_url,
on_complete=_on_complete,
max_accumulate=_MAX_CAPTURE_BODY_BYTES,
)
if proxy.http_client is None:
return await _fail_open_forward(proxy, request, upstream_url)
try:
upstream_resp = await proxy.http_client.post(
upstream_url, content=body_bytes, headers=fwd_headers,
)
except Exception as exc:
logger.error("[%s] capture passthrough upstream error: %r", request_id, exc)
return await _fail_open_forward(proxy, request, upstream_url)
resp_bytes = upstream_resp.content
itok, otok, mdl = extract_usage(provider, resp_bytes)
await record_exchange_async(
provider=provider,
model=mdl or model_hint,
request_body=body_bytes,
response_body=resp_bytes,
content_type="application/json",
input_tokens=itok,
output_tokens=otok,
status_code=upstream_resp.status_code,
stream=False,
)
return Response(
content=resp_bytes,
status_code=upstream_resp.status_code,
media_type="application/json",
headers=_filter_response_headers(dict(upstream_resp.headers)),
)
async def _safe_cache_check(hooks: HookChain, ctx: ProxyRequest) -> CachedResponse:

@@ -455,0 +553,0 @@ try:

@@ -28,3 +28,5 @@ """anthropic_surface.py — Anthropic Messages + count_tokens + models surfaces."""

_stream_forward,
capture_passthrough_forward,
)
from superlocalmemory.optimize.proxy.capture import capture_enabled
from superlocalmemory.optimize.proxy.lifecycle import ProviderResponse, ProxyRequest

@@ -196,2 +198,12 @@

has_tools = _body_has_tools(body)
# v3.6.10 shadow-capture (plan §7): pure passthrough + corpus record.
if capture_enabled():
return await capture_passthrough_forward(
proxy, request, provider="anthropic", upstream_url=upstream_url,
allowed_headers=_ANTHROPIC_FORWARD_HEADERS, request_id=request_id,
model_hint=str(body.get("model", "")),
sse_parser=_parse_sse_to_json, is_stream=stream,
)
ctx = ProxyRequest(

@@ -198,0 +210,0 @@ provider="anthropic",

@@ -50,3 +50,6 @@ """gemini_surface.py — Gemini native and OpenAI-compat surfaces.

_stream_forward,
capture_passthrough_forward,
)
from superlocalmemory.optimize.proxy.capture import capture_enabled
from superlocalmemory.optimize.proxy.openai_surface import _parse_openai_sse_to_json
from superlocalmemory.optimize.proxy.lifecycle import ProviderResponse, ProxyRequest

@@ -242,2 +245,20 @@

# v3.6.10 shadow-capture (plan §7): pure passthrough + corpus record.
if capture_enabled():
cap_model = model_and_method.split(":", 1)[0].replace("models/", "")
cap_url = upstream_url
if stream:
_allowed = {
k: v for k, v in request.query_params.items()
if k.lower() in _GEMINI_ALLOWED_QUERY_PARAMS
}
_allowed["alt"] = "sse"
cap_url = f"{upstream_url}?{urllib.parse.urlencode(_allowed)}"
return await capture_passthrough_forward(
proxy, request, provider="gemini", upstream_url=cap_url,
allowed_headers=_GEMINI_NATIVE_FORWARD_HEADERS, request_id=request_id,
model_hint=cap_model,
sse_parser=_parse_gemini_sse_to_json, is_stream=stream,
)
ctx = ProxyRequest(

@@ -412,2 +433,12 @@ provider="gemini",

# v3.6.10 shadow-capture (plan §7): pure passthrough + corpus record.
if capture_enabled():
return await capture_passthrough_forward(
proxy, request, provider="gemini-openai-compat",
upstream_url=upstream_url,
allowed_headers=_GEMINI_OPENAI_COMPAT_FORWARD_HEADERS,
request_id=request_id, model_hint=str(body.get("model", "")),
sse_parser=_parse_openai_sse_to_json, is_stream=stream,
)
ctx = ProxyRequest(

@@ -414,0 +445,0 @@ provider="gemini-openai-compat",

@@ -36,3 +36,5 @@ """openai_surface.py — OpenAI /v1/chat/completions and /v1/embeddings.

_stream_forward,
capture_passthrough_forward,
)
from superlocalmemory.optimize.proxy.capture import capture_enabled
from superlocalmemory.optimize.proxy.lifecycle import ProviderResponse, ProxyRequest

@@ -320,2 +322,12 @@

has_tools = _body_has_tools(body)
# v3.6.10 shadow-capture (plan §7): pure passthrough + corpus record.
if capture_enabled():
return await capture_passthrough_forward(
proxy, request, provider="openai", upstream_url=upstream_url,
allowed_headers=_OPENAI_FORWARD_HEADERS, request_id=request_id,
model_hint=str(body.get("model", "")),
sse_parser=_parse_openai_sse_to_json, is_stream=stream,
)
ctx = ProxyRequest(

@@ -322,0 +334,0 @@ provider="openai", method="POST", path="/v1/chat/completions",

@@ -78,3 +78,21 @@ """server.py — ProxyApp and build_proxy_router()."""

def reload_from_config(self, config: OptimizeConfig) -> None:
"""Hot-swap cache/compress behavior when optimize.json changes (v3.6.10).
Called by the ConfigStore change-callback (UI save → immediate; external
file/CLI edit → within the 2s watchdog poll). Rebuilds the HookChain so
``cache_enabled`` and ``compress_enabled`` can be toggled INDEPENDENTLY
at runtime with no daemon restart. Note: ``proxy_enabled`` (whether the
proxy claims /v1/* at all) is a startup decision and is NOT changed here.
"""
self.config = config
self.hooks = _load_hooks(config)
logger.info(
"slm.optimize.proxy reloaded (config v%s): cache=%s compress=%s",
getattr(config, "config_version", "?"),
type(self.hooks.cache).__name__ if self.hooks.cache else "None",
type(self.hooks.compress).__name__ if self.hooks.compress else "None",
)
def build_proxy_router(proxy: ProxyApp) -> APIRouter:

@@ -136,2 +154,13 @@ """Build and return the FastAPI router for all proxy surfaces."""

def _load_hooks(config: OptimizeConfig) -> HookChain:
# v3.6.10 shadow-capture (plan §7): capture mode is PURE passthrough — no
# cache, no compression — so the corpus records only authentic upstream
# exchanges. This is defense-in-depth alongside the per-surface guard.
from superlocalmemory.optimize.proxy.capture import capture_enabled
if capture_enabled():
logger.info(
"slm.optimize.proxy: SLM_OPTIMIZE_CAPTURE on — cache/compress "
"DISABLED, recording exchanges to optimize_capture.jsonl"
)
return HookChain.empty()
cache_hook = None

@@ -138,0 +167,0 @@ compress_hook = None

@@ -36,2 +36,3 @@ """CacheDB — wraps DatabaseManager for llmcache.db operations.

import os
import re
import sqlite3

@@ -56,2 +57,21 @@ import struct

_DEFAULT_TENANT: str = "default"
_TENANT_HEX64 = re.compile(r"[0-9a-f]{64}")
def _normalize_tenant_id(tenant_id: str) -> str:
"""Match CacheManager.build_key tenant normalization (v3.6.10 fix).
The proxy stores entries under SHA-256(tenant) when the tenant is not
already a 64-char hex digest. Public tenant-scoped helpers (entry_count,
clear_tenant, entry_exists) MUST apply the same hashing or they query the
wrong tenant — the cause of the dashboard reporting "0 entries" while the
cache is in fact populated.
"""
tid = tenant_id or _DEFAULT_TENANT
if _TENANT_HEX64.fullmatch(tid):
return tid
import hashlib as _hashlib
return _hashlib.sha256(tid.encode()).hexdigest()
_ZLIB_LEVEL: int = 6

@@ -66,2 +86,5 @@ _AES_NONCE_BYTES: int = 12

# C-06: persisted AES key — survives machine-id changes after first run
_KEY_FILE: Path = Path.home() / LLMCACHE_DIRNAME / "opt-key.bin"
_FORBIDDEN_MEMORY_TABLES: frozenset[str] = frozenset({

@@ -80,3 +103,9 @@ "memories", "atomic_facts", "profiles", "canonical_entities",

class MetricsSnapshot:
"""Mirror of llmcache_metrics columns — names MUST match exactly."""
"""Mirror of llmcache_metrics columns — names MUST match exactly.
S-03 / M-03 note: compress_bytes_original and compress_bytes_after store
WORD-COUNT proxy values (len(text.split())), NOT byte counts. The column
names use "bytes" for DB schema backward compatibility. Treat these fields
as token-count proxies, not literal byte measurements.
"""
id: int = 1

@@ -93,4 +122,4 @@ hits: int = 0

compress_runs: int = 0
compress_bytes_original: int = 0
compress_bytes_after: int = 0
compress_bytes_original: int = 0 # unit: word-count proxy (see S-03 note above)
compress_bytes_after: int = 0 # unit: word-count proxy (see S-03 note above)
cache_size_bytes: int = 0

@@ -217,4 +246,3 @@ cache_entry_count: int = 0

self._salt = self._load_or_create_salt()
machine_id = self._get_machine_id()
self._aes_key = self._derive_aes_key(machine_id, self._salt)
self._aes_key = self._get_or_persist_aes_key(self._salt)
self.assert_no_memory_db_tables()

@@ -301,2 +329,29 @@

def _get_or_persist_aes_key(self, salt: bytes) -> bytes:
"""C-06: Load persisted key from disk, or derive + persist on first run.
Surviving a machine-id change: after first derivation the key is saved to
opt-key.bin (0600). On subsequent starts the file is read directly,
so changing the underlying machine-id string cannot invalidate existing
cache entries.
"""
try:
if _KEY_FILE.exists():
key = _KEY_FILE.read_bytes()
if len(key) == 32:
return key
except Exception as exc:
logger.warning("CacheDB: could not read persisted AES key: %s", exc)
# First run (or corrupted file): derive from machine-id and persist.
machine_id = self._get_machine_id()
key = self._derive_aes_key(machine_id, salt)
try:
_KEY_FILE.parent.mkdir(parents=True, exist_ok=True)
_KEY_FILE.write_bytes(key)
os.chmod(_KEY_FILE, 0o600)
except Exception as exc:
logger.warning("CacheDB: could not persist AES key (fail-open): %s", exc)
return key
def _derive_aes_key(self, machine_id: str, salt: bytes) -> bytes:

@@ -636,7 +691,8 @@ kdf = PBKDF2HMAC(

model_name = str(meta.get("model", "nomic-ai/nomic-embed-text-v1.5"))
context_fp = str(meta.get("context_fp", "")) # C-10: persist context fingerprint
self._db.execute(
"INSERT OR REPLACE INTO llmcache_semantic_vectors "
"(entry_id, tenant_id, vector_blob, vector_dim, model_name) "
"VALUES (?, ?, ?, ?, ?)",
(entry_id, tenant_id, vector, dim, model_name),
"(entry_id, tenant_id, vector_blob, vector_dim, model_name, context_fp) "
"VALUES (?, ?, ?, ?, ?, ?)",
(entry_id, tenant_id, vector, dim, model_name, context_fp),
)

@@ -866,10 +922,18 @@ except sqlite3.Error as exc:

def get_all_vectors(self, tenant_id: str) -> list[tuple[str, bytes]]:
def get_all_vectors(self, tenant_id: str) -> list[tuple[str, bytes, str]]:
"""Return (entry_id, vector_blob, context_fp) for all vectors in a tenant.
C-10: context_fp is included so _lazy_warm_tenant() can restore it without
recomputing embeddings from messages that may no longer be in scope.
"""
try:
rows = self._db.execute(
"SELECT entry_id, vector_blob FROM llmcache_semantic_vectors "
"SELECT entry_id, vector_blob, context_fp FROM llmcache_semantic_vectors "
"WHERE tenant_id = ?",
(tenant_id,),
)
return [(dict(r)["entry_id"], dict(r)["vector_blob"]) for r in rows]
return [
(dict(r)["entry_id"], dict(r)["vector_blob"], dict(r).get("context_fp", ""))
for r in rows
]
except sqlite3.Error as exc:

@@ -931,2 +995,3 @@ logger.warning("CacheDB.get_all_vectors failed: %s", exc)

def entry_exists(self, cache_key: str, tenant_id: str = _DEFAULT_TENANT) -> bool:
tenant_id = _normalize_tenant_id(tenant_id)
try:

@@ -942,2 +1007,3 @@ rows = self._db.execute(

def clear_tenant(self, tenant_id: str) -> int:
tenant_id = _normalize_tenant_id(tenant_id)
try:

@@ -969,2 +1035,3 @@ with self._db.transaction():

def entry_count(self, tenant_id: str = _DEFAULT_TENANT) -> int:
tenant_id = _normalize_tenant_id(tenant_id)
try:

@@ -971,0 +1038,0 @@ rows = self._db.execute(

@@ -56,2 +56,3 @@ """DDL for llmcache.db — the Optimize module's dedicated SQLite database.

model_name TEXT NOT NULL DEFAULT 'nomic-ai/nomic-embed-text-v1.5',
context_fp TEXT NOT NULL DEFAULT '',
created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%fZ', 'now'))

@@ -131,2 +132,3 @@ )

Also seeds the single llmcache_metrics row (id=1) via INSERT OR IGNORE.
C-10: adds context_fp column to existing DBs via ALTER TABLE migration.
"""

@@ -136,2 +138,11 @@ for stmt in _DDL_STATEMENTS:

conn.execute("INSERT OR IGNORE INTO llmcache_metrics(id) VALUES (1)")
# C-10 migration: add context_fp column if missing (existing installs pre-v3.6.10)
existing_cols = {
row[1]
for row in conn.execute("PRAGMA table_info(llmcache_semantic_vectors)")
}
if "context_fp" not in existing_cols:
conn.execute(
"ALTER TABLE llmcache_semantic_vectors ADD COLUMN context_fp TEXT NOT NULL DEFAULT ''"
)
row = conn.execute(

@@ -138,0 +149,0 @@ "SELECT 1 FROM llmcache_schema_version WHERE version = ?",

@@ -35,5 +35,3 @@ # Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar

compress_mode: Literal['safe', 'aggressive'] | None = None
compress_code: bool | None = None
compress_prose: bool | None = None
compress_ccr: bool | None = None

@@ -45,4 +43,4 @@

try:
from superlocalmemory.optimize.config.store import ConfigStore
cfg = ConfigStore().get()
from superlocalmemory.optimize.config import get_shared_store
cfg = get_shared_store().get()
return cfg.as_dict()

@@ -59,4 +57,4 @@ except Exception as exc:

import dataclasses
from superlocalmemory.optimize.config.store import ConfigStore
store = ConfigStore()
from superlocalmemory.optimize.config import get_shared_store
store = get_shared_store()
cfg = store.get()

@@ -86,7 +84,7 @@ updates: dict[str, Any] = {}

try:
from superlocalmemory.optimize.config.store import ConfigStore
from superlocalmemory.optimize.config import get_shared_store
from superlocalmemory.optimize.metrics.counters import get_metrics
from superlocalmemory.optimize.storage.db import CacheDB
cfg = ConfigStore().get()
cfg = get_shared_store().get()
collector = get_metrics()

@@ -93,0 +91,0 @@ db = CacheDB()

@@ -327,6 +327,8 @@ // SuperLocalMemory V3 — Auto-Capture/Recall Settings

try {
var testBody = {provider: provider, model: model};
if (apiKey) testBody.api_key = apiKey;
var resp = await fetch('/api/v3/provider/test', {
method: 'POST',
headers: {'Content-Type': 'application/json'},
body: JSON.stringify({provider: provider, model: model, api_key: apiKey})
body: JSON.stringify(testBody)
});

@@ -333,0 +335,0 @@ var data = await resp.json();

@@ -341,2 +341,5 @@ // Neural Glass Shell — Dashboard V2 "Neural Glass"

switch(tabId) {
case 'brain-pane':
if (typeof loadBrain === 'function') loadBrain();
break;
case 'graph-pane':

@@ -343,0 +346,0 @@ if (typeof loadGraph === 'function') loadGraph();

@@ -26,2 +26,3 @@ // SuperLocalMemory V3 — Optimize Tab

_setToggle('opt-enabled', cfg.enabled);
_setToggle('opt-proxy-enabled', cfg.proxy_enabled);
_setToggle('opt-cache-enabled', cfg.cache_enabled);

@@ -31,6 +32,3 @@ _setToggle('opt-semantic-enabled', cfg.semantic_enabled);

_setSelect('opt-compress-mode', cfg.compress_mode);
_setToggle('opt-compress-code', cfg.compress_code);
_setToggle('opt-compress-prose', cfg.compress_prose);
_setToggle('opt-compress-ccr', cfg.compress_ccr);
_setToggle('opt-compress-align', cfg.compress_align);
var verEl = document.getElementById('opt-config-version');

@@ -75,10 +73,8 @@ if (verEl) verEl.textContent = cfg.config_version || '-';

var fieldMap = {
'opt-enabled': 'enabled',
'opt-cache-enabled': 'cache_enabled',
'opt-enabled': 'enabled',
'opt-proxy-enabled': 'proxy_enabled',
'opt-cache-enabled': 'cache_enabled',
'opt-semantic-enabled': 'semantic_enabled',
'opt-compress-enabled': 'compress_enabled',
'opt-compress-code': 'compress_code',
'opt-compress-prose': 'compress_prose',
'opt-compress-ccr': 'compress_ccr',
'opt-compress-align': 'compress_align'
'opt-compress-prose': 'compress_prose'
};

@@ -115,2 +111,6 @@

_putConfig(body);
if (id === 'opt-proxy-enabled' || id === 'opt-enabled') {
var notice = document.getElementById('opt-restart-notice');
if (notice) notice.classList.remove('d-none');
}
}

@@ -117,0 +117,0 @@ });

# compress/extractive_code.py
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
# Licensed under AGPL-3.0-or-later
#
# AST structural patterns adapted from:
# headroom/compression/handlers/code_handler.py:66-138 (Apache-2.0)
# Specifically: _STRUCTURAL_NODE_TYPES per-language dict, CodeLanguage enum
# headroom/compression/handlers/code_handler.py:141-150 — _SIGNATURE_PATTERNS regex fallback
# Attribution: See ATTRIBUTION.md.
"""CodeCompressor — AST-aware extractive code compressor.
Supported languages: Python, JavaScript, Go, Rust, Java, C++.
Path A (tree-sitter): used if tree-sitter-language-pack installed.
Path B (regex fallback): used otherwise. Tests pass on both paths.
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass
from enum import Enum
logger = logging.getLogger("slm.optimize.compress.code")
_BODY_STUB_BY_LANG: dict[str, str] = {
"python": " # [slm: body compressed — retrieve with ccr_id={ccr_id}]",
"javascript": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
"go": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
"rust": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
"java": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
"cpp": " // [slm: body compressed — retrieve with ccr_id={ccr_id}]",
}
_BODY_STUB_NO_CCR_BY_LANG: dict[str, str] = {
"python": " # [slm: body compressed]",
"javascript": " // [slm: body compressed]",
"go": " // [slm: body compressed]",
"rust": " // [slm: body compressed]",
"java": " // [slm: body compressed]",
"cpp": " // [slm: body compressed]",
}
_MIN_BODY_LINES: int = 4
class CodeLanguage(Enum):
PYTHON = "python"
JAVASCRIPT = "javascript"
GO = "go"
RUST = "rust"
JAVA = "java"
CPP = "cpp"
@dataclass(frozen=True)
class CodeSpan:
start_line: int
end_line: int
role: str # "import" | "signature" | "body" | "class_header" | "decorator"
is_structural: bool
# ── Signature patterns for regex path (Path B) ────────────────────────────────
_SIGNATURE_PATTERNS: dict[str, list[str]] = {
"python": [
r"^\s*(async\s+)?def\s+\w+\s*\([^)]*\)\s*(->\s*[^:]+)?:",
r"^\s*class\s+\w+(\([^)]*\))?:",
r"^\s*import\s+",
r"^\s*from\s+\w+\s+import",
r"^\s*@\w+",
],
"javascript": [
r"^\s*(async\s+)?function\s+\w+\s*\([^)]*\)",
r"^\s*class\s+\w+(\s+extends\s+\w+)?",
r"^\s*import\s+",
r"^\s*const\s+\w+\s*=\s*(async\s+)?\(",
r"^\s*export\s+(default\s+|const\s+|class\s+|function\s+)",
],
"go": [
r"^\s*func\s+(\(\w+\s+\*?\w+\)\s*)?\w+\s*\(",
r"^\s*import\s+",
r"^\s*package\s+",
r"^\s*type\s+\w+\s+(struct|interface)\s*\{",
],
"rust": [
r"^\s*(pub\s+)?(async\s+)?fn\s+\w+",
r"^\s*use\s+",
r"^\s*impl\s+",
r"^\s*struct\s+",
r"^\s*enum\s+",
r"^\s*trait\s+",
],
"java": [
r"^\s*(public|private|protected)\s+(static\s+)?\w+\s+\w+\s*\(",
r"^\s*import\s+",
r"^\s*(public|private|protected)?\s*class\s+\w+",
],
"cpp": [
r"^\s*\w[\w\s\*&:<,>]*\s+\w+\s*\([^)]*\)\s*(const\s*)?\{",
r"^\s*#include\s+",
r"^\s*(class|struct)\s+\w+",
],
}
class CodeCompressor:
"""AST-aware extractive code compressor. Thread-safe (no mutable state)."""
def compress(self, code: str, language: str, ccr_id: str = "") -> str:
try:
lang = CodeLanguage(language)
except ValueError:
logger.warning("CodeCompressor: unknown language %r — passthrough", language)
return code
try:
if _tree_sitter_available():
return self._compress_with_tree_sitter(code, lang, ccr_id)
else:
return self._compress_with_regex(code, lang, ccr_id)
except Exception as exc:
logger.warning("CodeCompressor.compress failed — passthrough: %s", exc)
return code
def _compress_with_tree_sitter(self, code: str, lang: CodeLanguage, ccr_id: str) -> str:
from tree_sitter_language_pack import get_parser # type: ignore[import]
parser = get_parser(lang.value)
tree = parser.parse(code.encode())
lines = code.split("\n")
spans = _collect_spans_tree_sitter(tree.root_node, lang)
return _apply_spans(lines, spans, ccr_id, lang.value)
def _compress_with_regex(self, code: str, lang: CodeLanguage, ccr_id: str) -> str:
lines = code.split("\n")
spans = _collect_spans_regex(lines, lang)
return _apply_spans(lines, spans, ccr_id, lang.value)
# ── Span collection ───────────────────────────────────────────────────────────
def _collect_spans_tree_sitter(root_node: object, lang: CodeLanguage) -> list[CodeSpan]:
_STRUCTURAL_TYPES: dict[str, set[str]] = {
"python": {"import_statement", "import_from_statement", "function_definition",
"class_definition", "decorated_definition", "type_alias_statement"},
"javascript": {"import_statement", "export_statement", "function_declaration",
"class_declaration", "method_definition", "arrow_function"},
"go": {"import_declaration", "function_declaration", "method_declaration",
"type_declaration", "interface_type"},
"rust": {"use_declaration", "function_item", "impl_item", "struct_item",
"enum_item", "trait_item"},
"java": {"import_declaration", "class_declaration", "method_declaration",
"interface_declaration", "annotation"},
"cpp": {"function_definition", "preproc_include", "class_specifier"},
}
structural = _STRUCTURAL_TYPES.get(lang.value, set())
spans: list[CodeSpan] = []
def _walk(node: object, depth: int = 0) -> None:
start_row: int = getattr(node, "start_point", (0, 0))[0]
end_row: int = getattr(node, "end_point", (0, 0))[0]
node_type: str = getattr(node, "type", "")
is_struct = node_type in structural
if is_struct and node_type in {
"function_definition", "method_definition", "function_declaration",
"function_item", "method_declaration",
}:
body_node = _find_child(node, "block") or _find_child(node, "body")
if body_node is not None:
body_start = getattr(body_node, "start_point", (0, 0))[0]
sig_end = body_start - 1
if sig_end > start_row:
spans.append(CodeSpan(start_row, sig_end, "signature", True))
spans.append(CodeSpan(body_start, end_row, "body", False))
else:
spans.append(CodeSpan(start_row, end_row, "signature", True))
elif is_struct and node_type in {"decorated_definition"}:
def_node = _find_child(node, "function_definition") or _find_child(node, "class_definition")
if def_node is not None:
def_start = getattr(def_node, "start_point", (0, 0))[0]
spans.append(CodeSpan(start_row, def_start - 1, "decorator", True))
_walk(def_node, depth + 1)
else:
spans.append(CodeSpan(start_row, end_row, node_type, True))
elif is_struct:
spans.append(CodeSpan(start_row, end_row, "signature", True))
else:
children = getattr(node, "children", None)
if children is not None:
has_named = False
for child in children:
if getattr(child, "is_named", False):
has_named = True
_walk(child, depth + 1)
if not has_named:
spans.append(CodeSpan(start_row, end_row, "body", False))
_walk(root_node)
return spans
def _find_child(node: object, child_type: str) -> object | None:
children = getattr(node, "children", None)
if children is None:
return None
for child in children:
if hasattr(child, "type") and child.type == child_type:
return child
return None
def _collect_spans_regex(lines: list[str], lang: CodeLanguage) -> list[CodeSpan]:
patterns = _SIGNATURE_PATTERNS.get(lang.value, [])
spans: list[CodeSpan] = []
i = 0
n = len(lines)
while i < n:
line = lines[i]
matched = False
for pat in patterns:
if re.search(pat, line):
body_start = i + 1
body_end = body_start
while body_end < n and (
lines[body_end].startswith((" ", "\t", "", "#", "//", "/*", "*/", "{"))
or re.match(r"^\s*$", lines[body_end])
):
if re.match(r"^\s*($|#|//|/\*|\*/|\*|}|\)|\])", lines[body_end]):
body_end += 1
else:
break
body_len = body_end - body_start
if body_len >= _MIN_BODY_LINES and _looks_like_body(lines, body_start, body_end):
spans.append(CodeSpan(i, i, "signature", True))
spans.append(CodeSpan(body_start, body_end, "body", False))
i = body_end
matched = True
break
else:
body_start = body_end
# else: pattern didn't match
if not matched:
i += 1
return spans
def _looks_like_body(lines: list[str], start: int, end: int) -> bool:
body_lines = [l for l in lines[start:end] if l.strip() and not l.strip().startswith(("#", "//"))]
return len(body_lines) >= _MIN_BODY_LINES
def _apply_spans(
lines: list[str],
spans: list[CodeSpan],
ccr_id: str,
lang: str = "python",
) -> str:
if not spans:
return "\n".join(lines)
stub_tmpl = _BODY_STUB_BY_LANG.get(lang, _BODY_STUB_BY_LANG["python"])
stub_no_ccr = _BODY_STUB_NO_CCR_BY_LANG.get(lang, _BODY_STUB_NO_CCR_BY_LANG["python"])
stub = stub_tmpl.format(ccr_id=ccr_id) if ccr_id else stub_no_ccr
sorted_spans = sorted(spans, key=lambda s: s.start_line)
start_set = set()
deduped: list[CodeSpan] = []
for s in sorted_spans:
if s.start_line not in start_set:
deduped.append(s)
start_set.add(s.start_line)
span_by_start: dict[int, CodeSpan] = {s.start_line: s for s in deduped}
result_lines: list[str] = []
i = 0
n = len(lines)
while i < n:
span = span_by_start.get(i)
if span is not None and not span.is_structural:
end = min(span.end_line, n - 1)
body_len = end - i + 1
if body_len >= _MIN_BODY_LINES:
result_lines.append(stub)
i = end + 1
else:
for j in range(i, end + 1):
result_lines.append(lines[j])
i = end + 1
elif span is not None and span.is_structural:
end = min(span.end_line, n - 1)
for j in range(i, end + 1):
result_lines.append(lines[j])
i = end + 1
else:
result_lines.append(lines[i])
i += 1
return "\n".join(result_lines)
def _tree_sitter_available() -> bool:
try:
import tree_sitter_language_pack # noqa: F401
return True
except ImportError:
return False
# compress/extractive_json.py
# Copyright (c) 2026 Varun Pratap Bhardwaj / Qualixar
# Licensed under AGPL-3.0-or-later
#
# Structural masking pattern adapted from:
# headroom/compression/handlers/json_handler.py (Apache-2.0, Headroom contributors)
# Specifically: JSONStructureHandler._extract_mask(), JSONToken, JSONTokenType
# Attribution: See ATTRIBUTION.md.
#
# HARD RULE: This compressor MUST NEVER prune or reorder JSON keys.
# Keys pruned = structured semantics corrupted non-recoverably. Not configurable.
"""JSONCompressor — lossless-ish extractive JSON compressor."""
from __future__ import annotations
import json
import logging
from typing import Any
logger = logging.getLogger("slm.optimize.compress.json")
VALUE_TRUNCATE_CHARS: int = 120
VALUE_TRUNCATE_SUFFIX: str = "\u2026" # ellipsis
MAX_ARRAY_ITEMS_SHOWN: int = 5
ARRAY_REMAINDER_KEY: str = "__slm_omitted__"
MIN_VALUE_LEN_TO_TRUNCATE: int = 40
class JSONCompressor:
"""Lossless-ish JSON compressor. Thread-safe (no mutable state)."""
def compress(self, parsed: Any) -> str:
try:
masked = self._mask(parsed, depth=0)
return json.dumps(masked, ensure_ascii=False, separators=(",", ":"))
except Exception as exc:
logger.warning("JSONCompressor.compress failed — returning original: %s", exc)
return json.dumps(parsed, ensure_ascii=False, separators=(",", ":"))
def _mask(self, obj: Any, depth: int) -> Any:
if isinstance(obj, dict):
return {k: self._mask(v, depth + 1) for k, v in obj.items()}
if isinstance(obj, list):
return self._mask_array(obj, depth)
if isinstance(obj, str):
return self._mask_string(obj)
return obj
def _mask_array(self, arr: list, depth: int) -> list:
if len(arr) <= MAX_ARRAY_ITEMS_SHOWN:
return [self._mask(item, depth + 1) for item in arr]
shown = [self._mask(item, depth + 1) for item in arr[:MAX_ARRAY_ITEMS_SHOWN]]
collision = any(
isinstance(item, dict) and ARRAY_REMAINDER_KEY in item
for item in arr
)
if collision:
logger.warning(
"JSONCompressor: input contains reserved key %r — skipping array sentinel",
ARRAY_REMAINDER_KEY,
)
else:
shown.append({ARRAY_REMAINDER_KEY: len(arr) - MAX_ARRAY_ITEMS_SHOWN})
return shown
def _mask_string(self, s: str) -> str:
if len(s) < MIN_VALUE_LEN_TO_TRUNCATE:
return s
if len(s) <= VALUE_TRUNCATE_CHARS:
return s
return s[:VALUE_TRUNCATE_CHARS] + VALUE_TRUNCATE_SUFFIX

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display

Sorry, the diff of this file is too big to display